1 /*
2 * Copyright © 2014-2015 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "nir/nir_to_tgsi.h"
27 #include "pipe/p_screen.h"
28 #include "pipe/p_state.h"
29 #include "tgsi/tgsi_dump.h"
30 #include "tgsi/tgsi_from_mesa.h"
31 #include "tgsi/tgsi_info.h"
32 #include "tgsi/tgsi_ureg.h"
33 #include "util/debug.h"
34
35 struct ntt_compile {
36 nir_shader *s;
37 nir_function_impl *impl;
38 struct pipe_screen *screen;
39 struct ureg_program *ureg;
40
41 bool needs_texcoord_semantic;
42 bool any_reg_as_address;
43 bool native_integers;
44
45 int next_addr_reg;
46 bool addr_declared[2];
47 struct ureg_dst addr_reg[2];
48
49 unsigned loop_label;
50
51 /* if condition set up at the end of a block, for ntt_emit_if(). */
52 struct ureg_src if_cond;
53
54 /* TGSI temps for our NIR SSA and register values. */
55 struct ureg_dst *reg_temp;
56 struct ureg_dst *ssa_temp;
57
58 nir_instr_liveness *liveness;
59
60 /* Mappings from driver_location to TGSI input/output number.
61 *
62 * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
63 * their numbers assigned incrementally, unlike inputs or constants.
64 */
65 struct ureg_src *input_index_map;
66 uint64_t centroid_inputs;
67
68 struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
69 };
70
71 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
72
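/* Expands a writemask over 64-bit components into the pairs of 32-bit
 * channels that back them: .x becomes .xy, .y becomes .zw.
 */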
73 static unsigned
74 ntt_64bit_write_mask(unsigned write_mask)
75 {
76 return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
77 }
78
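/* Immediate holding double-precision 1.0 in both 64-bit channels
 * (low word 0x00000000, high word 0x3ff00000).
 */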
79 static struct ureg_src
80 ntt_64bit_1f(struct ntt_compile *c)
81 {
82 return ureg_imm4u(c->ureg,
83 0x00000000, 0x3ff00000,
84 0x00000000, 0x3ff00000);
85 }
86
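/* Geometry/tessellation per-vertex inputs are arrays indexed by vertex, so
 * use the element type when counting attribute slots.
 */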
87 static const struct glsl_type *
88 ntt_shader_input_type(struct ntt_compile *c,
89 struct nir_variable *var)
90 {
91 switch (c->s->info.stage) {
92 case MESA_SHADER_GEOMETRY:
93 case MESA_SHADER_TESS_EVAL:
94 case MESA_SHADER_TESS_CTRL:
95 if (glsl_type_is_array(var->type))
96 return glsl_get_array_element(var->type);
97 else
98 return var->type;
99 default:
100 return var->type;
101 }
102 }
103
104 static void
105 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
106 unsigned *semantic_name, unsigned *semantic_index)
107 {
108 /* We want to use most of tgsi_get_gl_varying_semantic(), but the
109 * !texcoord shifting has already been applied, so avoid that.
110 */
111 if (!c->needs_texcoord_semantic &&
112 (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
113 *semantic_name = TGSI_SEMANTIC_GENERIC;
114 *semantic_index = location - VARYING_SLOT_VAR0;
115 return;
116 }
117
118 tgsi_get_gl_varying_semantic(location, true,
119 semantic_name, semantic_index);
120 }
121
122 /* TGSI varying declarations have a component usage mask associated (used by
123 * r600 and svga).
124 */
125 static uint32_t
126 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
127 bool is_64)
128 {
129 uint32_t usage_mask =
130 u_bit_consecutive(start_component, num_components);
131
132 if (is_64) {
133 if (start_component >= 2)
134 usage_mask >>= 2;
135
136 uint32_t tgsi_usage_mask = 0;
137
138 if (usage_mask & TGSI_WRITEMASK_X)
139 tgsi_usage_mask |= TGSI_WRITEMASK_XY;
140 if (usage_mask & TGSI_WRITEMASK_Y)
141 tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
142
143 return tgsi_usage_mask;
144 } else {
145 return usage_mask;
146 }
147 }
148
149 /* TGSI varying declarations have a component usage mask associated (used by
150 * r600 and svga).
151 */
152 static uint32_t
153 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
154 {
155 const struct glsl_type *type_without_array =
156 glsl_without_array(var->type);
157 unsigned num_components = glsl_get_vector_elements(type_without_array);
158 if (num_components == 0) /* structs */
159 num_components = 4;
160
161 return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
162 glsl_type_is_64bit(type_without_array));
163 }
164
165 static void
166 ntt_setup_inputs(struct ntt_compile *c)
167 {
168 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
169 return;
170
171 unsigned num_inputs = 0;
172 int num_input_arrays = 0;
173
174 nir_foreach_shader_in_variable(var, c->s) {
175 const struct glsl_type *type = ntt_shader_input_type(c, var);
176 unsigned array_len =
177 glsl_count_attribute_slots(type, false);
178
179 num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
180 }
181
182 c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
183
184 nir_foreach_shader_in_variable(var, c->s) {
185 const struct glsl_type *type = ntt_shader_input_type(c, var);
186 unsigned array_len =
187 glsl_count_attribute_slots(type, false);
188
189 unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
190 unsigned sample_loc;
191 struct ureg_src decl;
192
193 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
194 interpolation =
195 tgsi_get_interp_mode(var->data.interpolation,
196 var->data.location == VARYING_SLOT_COL0 ||
197 var->data.location == VARYING_SLOT_COL1);
198
199 if (var->data.location == VARYING_SLOT_POS)
200 interpolation = TGSI_INTERPOLATE_LINEAR;
201 }
202
203 unsigned semantic_name, semantic_index;
204 ntt_get_gl_varying_semantic(c, var->data.location,
205 &semantic_name, &semantic_index);
206
207 if (var->data.sample) {
208 sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
209 } else if (var->data.centroid) {
210 sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
211 c->centroid_inputs |= (BITSET_MASK(array_len) <<
212 var->data.driver_location);
213 } else {
214 sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
215 }
216
217 unsigned array_id = 0;
218 if (glsl_type_is_array(type))
219 array_id = ++num_input_arrays;
220
221 uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
222
223 decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg,
224 semantic_name,
225 semantic_index,
226 interpolation,
227 0,
228 sample_loc,
229 var->data.driver_location,
230 usage_mask,
231 array_id, array_len);
232
233 if (semantic_name == TGSI_SEMANTIC_FACE) {
234 struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
235 /* NIR is ~0 front and 0 back, while TGSI is +1 front */
236 ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
237 decl = ureg_src(temp);
238 }
239
240 for (unsigned i = 0; i < array_len; i++) {
241 c->input_index_map[var->data.driver_location + i] = decl;
242 c->input_index_map[var->data.driver_location + i].Index += i;
243 }
244 }
245 }
246
247 static void
248 ntt_setup_uniforms(struct ntt_compile *c)
249 {
250 struct pipe_screen *screen = c->screen;
251 bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);
252
253 nir_foreach_uniform_variable(var, c->s) {
254 if (glsl_type_is_image(var->type)) {
255 c->images[var->data.binding] = ureg_DECL_image(c->ureg,
256 var->data.binding,
257 TGSI_TEXTURE_2D,
258 var->data.image.format,
259 !var->data.read_only,
260 false);
261 } else {
262 unsigned size;
263 if (packed) {
264 size = DIV_ROUND_UP(glsl_count_dword_slots(var->type,
265 var->data.bindless), 4);
266 } else {
267 size = glsl_count_vec4_slots(var->type, false, var->data.bindless);
268 }
269
270 for (unsigned i = 0; i < size; i++)
271 ureg_DECL_constant(c->ureg, var->data.driver_location + i);
272 }
273 }
274
275 nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
276 ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location + 1);
277 }
278
279 for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) {
280 if (c->s->info.textures_used & (1 << i))
281 ureg_DECL_sampler(c->ureg, i);
282 }
283 }
284
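/* Declares a TGSI temporary for each NIR register: non-array registers get a
 * writemask matching their component count (doubled channels for 64-bit),
 * arrays become array temporaries.
 */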
285 static void
286 ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
287 {
288 foreach_list_typed(nir_register, nir_reg, node, list) {
289 struct ureg_dst decl;
290 if (nir_reg->num_array_elems == 0) {
291 uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
292 if (nir_reg->bit_size == 64) {
293 if (nir_reg->num_components > 2) {
294 fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
295 nir_reg->num_components, nir_reg->index);
296 }
297
298 write_mask = ntt_64bit_write_mask(write_mask);
299 }
300
301 decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
302 } else {
303 decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
304 true);
305 }
306 c->reg_temp[nir_reg->index] = decl;
307 }
308 }
309
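/* Builds a TGSI immediate for a NIR load_const; 64-bit values are stored as
 * low/high 32-bit pairs.
 */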
310 static struct ureg_src
311 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
312 {
313 uint32_t values[4];
314 int num_components = instr->def.num_components;
315
316 if (instr->def.bit_size == 32) {
317 for (int i = 0; i < num_components; i++)
318 values[i] = instr->value[i].u32;
319 } else {
320 assert(num_components <= 2);
321 for (int i = 0; i < num_components; i++) {
322 values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
323 values[i * 2 + 1] = instr->value[i].u64 >> 32;
324 }
325 num_components *= 2;
326 }
327
328 return ureg_DECL_immediate_uint(c->ureg, values, num_components);
329 }
330
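/* Returns a source usable for indirect addressing.  On any_reg_as_address
 * drivers the value is used directly; otherwise it is copied into one of the
 * two TGSI ADDR registers with UARL and released with ntt_put_reladdr().
 */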
331 static struct ureg_src
332 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
333 {
334 if (c->any_reg_as_address) {
335 /* Make sure we're getting the refcounting right even on any_reg
336 * drivers.
337 */
338 c->next_addr_reg++;
339
340 return ureg_scalar(addr, 0);
341 }
342
343 assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));
344
345 if (!c->addr_declared[c->next_addr_reg]) {
346 c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
347 TGSI_WRITEMASK_X);
348 c->addr_declared[c->next_addr_reg] = true;
349 }
350
351 ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
352 return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
353 }
354
355 static void
356 ntt_put_reladdr(struct ntt_compile *c)
357 {
358 c->next_addr_reg--;
359 assert(c->next_addr_reg >= 0);
360 }
361
362 static void
363 ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
364 {
365 if (c->any_reg_as_address)
366 return;
367
368 if (dst.Indirect)
369 ntt_put_reladdr(c);
370 if (dst.DimIndirect)
371 ntt_put_reladdr(c);
372 }
373
374 static struct ureg_src
375 ntt_get_src(struct ntt_compile *c, nir_src src)
376 {
377 if (src.is_ssa) {
378 if (src.ssa->parent_instr->type == nir_instr_type_load_const)
379 return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
380
381 return ureg_src(c->ssa_temp[src.ssa->index]);
382 } else {
383 nir_register *reg = src.reg.reg;
384 struct ureg_dst reg_temp = c->reg_temp[reg->index];
385 reg_temp.Index += src.reg.base_offset;
386
387 if (src.reg.indirect) {
388 struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
389 return ureg_src_indirect(ureg_src(reg_temp),
390 ntt_reladdr(c, offset));
391 } else {
392 return ureg_src(reg_temp);
393 }
394 }
395 }
396
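/* Fetches an ALU source, applying the NIR swizzle and abs/negate modifiers.
 * 64-bit sources occupy two 32-bit channels per component, so the swizzle is
 * expanded to channel pairs.
 */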
397 static struct ureg_src
398 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
399 {
400 nir_alu_src src = instr->src[i];
401 struct ureg_src usrc = ntt_get_src(c, src.src);
402
403 if (nir_src_bit_size(src.src) == 64) {
404 int chan0 = 0, chan1 = 1;
405 if (nir_op_infos[instr->op].input_sizes[i] == 0) {
406 chan0 = ffs(instr->dest.write_mask) - 1;
407 chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
408 if (chan1 == -1)
409 chan1 = chan0;
410 }
411 usrc = ureg_swizzle(usrc,
412 src.swizzle[chan0] * 2,
413 src.swizzle[chan0] * 2 + 1,
414 src.swizzle[chan1] * 2,
415 src.swizzle[chan1] * 2 + 1);
416 } else {
417 usrc = ureg_swizzle(usrc,
418 src.swizzle[0],
419 src.swizzle[1],
420 src.swizzle[2],
421 src.swizzle[3]);
422 }
423
424 if (src.abs)
425 usrc = ureg_abs(usrc);
426 if (src.negate)
427 usrc = ureg_negate(usrc);
428
429 return usrc;
430 }
431
432 static struct ureg_dst *
433 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
434 {
435 struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
436
437 uint32_t writemask = BITSET_MASK(ssa->num_components);
438 if (ssa->bit_size == 64)
439 writemask = ntt_64bit_write_mask(writemask);
440
441 c->ssa_temp[ssa->index] = ureg_writemask(temp, writemask);
442
443 return &c->ssa_temp[ssa->index];
444 }
445
446 static struct ureg_dst *
447 ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
448 {
449 if (dest->is_ssa)
450 return ntt_get_ssa_def_decl(c, &dest->ssa);
451 else
452 return &c->reg_temp[dest->reg.reg->index];
453 }
454
455 static struct ureg_dst
456 ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
457 {
458 struct ureg_dst dst = *ntt_get_dest_decl(c, dest);
459
460 if (!dest->is_ssa) {
461 dst.Index += dest->reg.base_offset;
462
463 if (dest->reg.indirect) {
464 struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
465 dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
466 }
467 }
468
469 return dst;
470 }
471
472 /* For an SSA dest being populated by a constant src, replace the storage with
473 * a copy of the ureg_src.
474 */
475 static void
476 ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
477 {
478 if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect &&
479 src.SwizzleX == TGSI_SWIZZLE_X &&
480 (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) &&
481 (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) &&
482 (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) {
483 switch (src.File) {
484 case TGSI_FILE_IMMEDIATE:
485 case TGSI_FILE_INPUT:
486 case TGSI_FILE_CONSTANT:
487 case TGSI_FILE_SYSTEM_VALUE:
488 c->ssa_temp[def->index] = ureg_dst(src);
489 return;
490 }
491 }
492
493 ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src);
494 }
495
496 static void
497 ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
498 {
499 if (dest->is_ssa)
500 ntt_store_def(c, &dest->ssa, src);
501 else {
502 struct ureg_dst dst = ntt_get_dest(c, dest);
503 ureg_MOV(c->ureg, dst, src);
504 }
505 }
506
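/* Scalarizes an op: emits the TGSI opcode once per enabled destination
 * channel, reading the matching channel of the source(s).
 */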
507 static void
508 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
509 struct ureg_dst dst,
510 struct ureg_src src0,
511 struct ureg_src src1)
512 {
513 unsigned i;
514 int num_src;
515
516 /* POW is the only 2-operand scalar op. */
517 if (tgsi_op == TGSI_OPCODE_POW) {
518 num_src = 2;
519 } else {
520 num_src = 1;
521 src1 = src0;
522 }
523
524 for (i = 0; i < 4; i++) {
525 if (dst.WriteMask & (1 << i)) {
526 struct ureg_dst this_dst = dst;
527 struct ureg_src srcs[2] = {
528 ureg_scalar(src0, i),
529 ureg_scalar(src1, i),
530 };
531 this_dst.WriteMask = (1 << i);
532
533 ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
534 }
535 }
536 }
537
538 static void
539 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
540 {
541 struct ureg_src src[4];
542 struct ureg_dst dst;
543 unsigned i;
544 int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
545 int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
546 int num_srcs = nir_op_infos[instr->op].num_inputs;
547
548 assert(num_srcs <= ARRAY_SIZE(src));
549 for (i = 0; i < num_srcs; i++)
550 src[i] = ntt_get_alu_src(c, instr, i);
551 dst = ntt_get_dest(c, &instr->dest.dest);
552
553 if (instr->dest.saturate)
554 dst.Saturate = true;
555
556 if (dst_64)
557 dst.WriteMask = ntt_64bit_write_mask(instr->dest.write_mask);
558 else
559 dst.WriteMask = instr->dest.write_mask;
560
561 static enum tgsi_opcode op_map[][2] = {
562 [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
563
564 /* fabs/fneg 32-bit are special-cased below. */
565 [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
566 [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
567
568 [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
569 [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
570 [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
571 [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
572 [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
573 [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
574 [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
575 [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
576 [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
577 [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
578
579 [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
580 [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
581 [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
582
583 /* The conversions will have one combination of src and dst bitsize. */
584 [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
585 [nir_op_f2f64] = { TGSI_OPCODE_F2D },
586 [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
587
588 [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
589 [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
590 [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
591 [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
592 [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
593 [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
594 [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
595 [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
596
597 [nir_op_slt] = { TGSI_OPCODE_SLT },
598 [nir_op_sge] = { TGSI_OPCODE_SGE },
599 [nir_op_seq] = { TGSI_OPCODE_SEQ },
600 [nir_op_sne] = { TGSI_OPCODE_SNE },
601
602 [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
603 [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
604 [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
605 [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
606
607 [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
608 [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
609 [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
610 [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
611
612 [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
613 [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
614
615 [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
616 [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
617 [nir_op_fsign] = { TGSI_OPCODE_SSG },
618 [nir_op_isign] = { TGSI_OPCODE_ISSG },
619 [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
620 [nir_op_fddx] = { TGSI_OPCODE_DDX },
621 [nir_op_fddy] = { TGSI_OPCODE_DDY },
622 [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
623 [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
624 [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
625 [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
626 [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
627 [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
628 [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
629 [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
630 [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
631 [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
632 [nir_op_bit_count] = { TGSI_OPCODE_POPC },
633 [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
634 [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
635 [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
636 [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
637 [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
638 [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
639 [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
640 [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
641 [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
642 [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
643 [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
644 [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
645 [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
646 [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
647
648 /* These bitwise ops don't care about 32 vs 64 types, so they have the
649 * same TGSI op.
650 */
651 [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
652 [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
653 [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
654 [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
655
656 [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
657 [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
658 [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
659 [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
660 [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
661 [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
662 [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
663 [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
664 };
665
666 /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
667 * of .xy. Store to a temp and move it to the real dst.
668 */
669 bool tgsi_64bit_compare = src_64 && !dst_64 &&
670 (num_srcs == 2 ||
671 nir_op_infos[instr->op].output_type == nir_type_bool32) &&
672 (dst.WriteMask != TGSI_WRITEMASK_X);
673
674 /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
675 * channels and will need to get fixed up.
676 */
677 bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
678 num_srcs == 1 && !tgsi_64bit_compare &&
679 (dst.WriteMask & ~TGSI_WRITEMASK_XY));
680
681 struct ureg_dst real_dst = ureg_dst_undef();
682 if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
683 real_dst = dst;
684 dst = ureg_DECL_temporary(c->ureg);
685 }
686
687 bool table_op64 = src_64;
688 if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
689 /* The normal path for NIR to TGSI ALU op translation */
690 ureg_insn(c->ureg, op_map[instr->op][table_op64],
691 &dst, 1, src, num_srcs, false);
692 } else {
693 /* Special cases for NIR to TGSI ALU op translation. */
694
695 /* TODO: Use something like the ntt_store() path for the MOV calls so we
696 * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
697 */
698
699 switch (instr->op) {
700 case nir_op_u2u64:
701 ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
702 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
703 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
704 ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
705 break;
706
707 case nir_op_i2i32:
708 case nir_op_u2u32:
709 assert(src_64);
710 ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
711 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
712 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
713 break;
714
715 case nir_op_fabs:
716 ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
717 break;
718
719 case nir_op_fsat:
720 if (dst_64) {
721 ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
722 ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
723 } else {
724 ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
725 }
726 break;
727
728 case nir_op_fneg:
729 ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
730 break;
731
732 /* NOTE: TGSI 32-bit math ops have the old "one source channel
733 * replicated to all dst channels" behavior, while 64 is normal mapping
734 * of src channels to dst.
735 */
736 case nir_op_frcp:
737 assert(!dst_64);
738 ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
739 break;
740
741 case nir_op_frsq:
742 assert(!dst_64);
743 ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
744 break;
745
746 case nir_op_fsqrt:
747 assert(!dst_64);
748 ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
749 break;
750
751 case nir_op_fexp2:
752 assert(!dst_64);
753 ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
754 break;
755
756 case nir_op_flog2:
757 assert(!dst_64);
758 ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
759 break;
760
761 case nir_op_b2f32:
762 ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
763 break;
764
765 case nir_op_b2f64:
766 ureg_AND(c->ureg, dst,
767 ureg_swizzle(src[0],
768 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
769 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
770 ntt_64bit_1f(c));
771 break;
772
773 case nir_op_f2b32:
774 if (src_64)
775 ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
776 else
777 ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
778 break;
779
780 case nir_op_i2b32:
781 if (src_64) {
782 ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
783 } else
784 ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
785 break;
786
787 case nir_op_b2i32:
788 ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
789 break;
790
791 case nir_op_b2i64:
792 ureg_AND(c->ureg, dst,
793 ureg_swizzle(src[0],
794 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
795 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
796 ureg_imm4u(c->ureg, 1, 0, 1, 0));
797 break;
798
799 case nir_op_fsin:
800 ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
801 break;
802
803 case nir_op_fcos:
804 ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
805 break;
806
807 case nir_op_fsub:
808 assert(!dst_64);
809 ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
810 break;
811
812 case nir_op_isub:
813 assert(!dst_64);
814 ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
815 break;
816
817 /* XXX: carry */
818
819 case nir_op_fmod:
820 unreachable("should be handled by .lower_fmod = true");
821 break;
822
823 case nir_op_fpow:
824 ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
825 break;
826
827 case nir_op_flrp:
828 ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
829 break;
830
831 case nir_op_pack_64_2x32_split:
832 ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
833 ureg_swizzle(src[0],
834 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
835 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
836 ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
837 ureg_swizzle(src[1],
838 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
839 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
840 break;
841
842 case nir_op_unpack_64_2x32_split_x:
843 ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
844 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
845 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
846 break;
847
848 case nir_op_unpack_64_2x32_split_y:
849 ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
850 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
851 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
852 break;
853
854 case nir_op_b32csel:
855 if (nir_src_bit_size(instr->src[1].src) == 64) {
856 ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
857 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
858 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
859 src[1], src[2]);
860 } else {
861 ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
862 }
863 break;
864
865 case nir_op_fcsel:
866 /* NIR is src0 != 0 ? src1 : src2.
867 * TGSI is src0 < 0 ? src1 : src2.
868 *
869 * However, fcsel so far as I can find only appears on
870 * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op.
871 */
872 ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[2], src[1]);
873 break;
874
875 /* It would be nice if we could get this left as scalar in NIR, since
876 * the TGSI op is scalar.
877 */
878 case nir_op_frexp_sig:
879 case nir_op_frexp_exp: {
880 assert(src_64);
881 struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
882
883 for (int chan = 0; chan < 2; chan++) {
884 int wm = 1 << chan;
885
886 if (!(instr->dest.write_mask & wm))
887 continue;
888
889 struct ureg_dst dsts[2] = { temp, temp };
890 if (instr->op == nir_op_frexp_sig) {
891 dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
892 } else {
893 dsts[1] = ureg_writemask(dst, wm);
894 }
895
896 struct ureg_src chan_src = ureg_swizzle(src[0],
897 chan * 2, chan * 2 + 1,
898 chan * 2, chan * 2 + 1);
899
900 ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
901 dsts, 2,
902 &chan_src, 1, false);
903 }
904
905 ureg_release_temporary(c->ureg, temp);
906 break;
907 }
908
909 case nir_op_ldexp:
910 assert(dst_64); /* 32bit handled in table. */
911 ureg_DLDEXP(c->ureg, dst, src[0],
912 ureg_swizzle(src[1],
913 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
914 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
915 break;
916
917 case nir_op_vec4:
918 case nir_op_vec3:
919 case nir_op_vec2:
920 unreachable("covered by nir_lower_vec_to_movs()");
921
922 default:
923 fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
924 unreachable("Unknown NIR opcode");
925 }
926 }
927
928 /* 64-bit op fixup movs */
929 if (!ureg_dst_is_undef(real_dst)) {
930 if (tgsi_64bit_compare) {
931 ureg_MOV(c->ureg, real_dst,
932 ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
933 } else {
934 assert(tgsi_64bit_downconvert);
935 uint8_t swizzle[] = {0, 0, 0, 0};
936 uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
937 if (second_bit)
938 swizzle[ffs(second_bit) - 1] = 1;
939 ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
940 swizzle[0],
941 swizzle[1],
942 swizzle[2],
943 swizzle[3]));
944 }
945 ureg_release_temporary(c->ureg, dst);
946 }
947 }
948
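/* Applies a NIR offset source to a ureg src: constant offsets fold into the
 * register index, anything else becomes ADDR-relative indirect addressing.
 */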
949 static struct ureg_src
950 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
951 nir_src src)
952 {
953 if (nir_src_is_const(src)) {
954 usrc.Index += nir_src_as_uint(src);
955 return usrc;
956 } else {
957 return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
958 }
959 }
960
961 static struct ureg_dst
962 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
963 nir_src src)
964 {
965 if (nir_src_is_const(src)) {
966 dst.Index += nir_src_as_uint(src);
967 return dst;
968 } else {
969 return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
970 }
971 }
972
973 static struct ureg_src
974 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
975 nir_src src)
976 {
977 if (nir_src_is_const(src)) {
978 return ureg_src_dimension(usrc, nir_src_as_uint(src));
979 } else {
980 return ureg_src_dimension_indirect(usrc,
981 ntt_reladdr(c, ntt_get_src(c, src)),
982 1);
983 }
984 }
985
986 static void
987 ntt_emit_load_uniform(struct ntt_compile *c, nir_intrinsic_instr *instr)
988 {
989 struct ureg_src src =
990 ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_CONSTANT,
991 nir_intrinsic_base(instr)),
992 instr->src[0]);
993 ntt_store(c, &instr->dest, src);
994 }
995
996 /* Some load operations in NIR will have a fractional offset that we need to
997 * swizzle down before storing to the result register.
998 */
999 static struct ureg_src
1000 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1001 {
1002 return ureg_swizzle(src,
1003 frac,
1004 frac + MIN2(num_components - 1, 1),
1005 frac + MIN2(num_components - 1, 2),
1006 frac + MIN2(num_components - 1, 3));
1007 }
1008
1009 /* PIPE_CAP_LOAD_CONSTBUF */
1010 static void
1011 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1012 {
1013 /* XXX: Emit a TGSI_OPCODE_LOAD instr. */
1014 }
1015
1016 /* !PIPE_CAP_LOAD_CONSTBUF */
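/* Loads from a UBO as CONST[block][vec4]: the block index goes in the 2D
 * dimension (offset by 1 past the default constant buffer) and the vec4
 * offset goes in the register index.
 */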
1017 static void
1018 ntt_emit_load_ubo_vec4(struct ntt_compile *c, nir_intrinsic_instr *instr)
1019 {
1020 int bit_size = nir_dest_bit_size(instr->dest);
1021 assert(bit_size == 32 || instr->num_components <= 2);
1022
1023 struct ureg_src src;
1024 if (nir_src_is_const(instr->src[1])) {
1025 src = ureg_src_register(TGSI_FILE_CONSTANT,
1026 nir_src_as_uint(instr->src[1]));
1027 } else {
1028 src = ureg_src_indirect(ureg_src_register(TGSI_FILE_CONSTANT, 0),
1029 ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
1030 }
1031
1032 int start_component = nir_intrinsic_component(instr);
1033 if (bit_size == 64)
1034 start_component *= 2;
1035
1036 src = ntt_shift_by_frac(src, start_component,
1037 instr->num_components * bit_size / 32);
1038
1039 if (nir_src_is_const(instr->src[0])) {
1040 src = ureg_src_dimension(src, nir_src_as_uint(instr->src[0]) + 1);
1041 } else {
1042 struct ureg_src block_index = ntt_get_src(c, instr->src[0]);
1043
1044 src = ureg_src_dimension_indirect(src, ntt_reladdr(c, block_index), 1);
1045 }
1046
1047 ntt_store(c, &instr->dest, src);
1048 }
1049
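/* Translates NIR's gl_access_qualifier bits to the TGSI_MEMORY_* qualifier
 * bits used on memory instructions.
 */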
1050 static unsigned
1051 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1052 {
1053 enum gl_access_qualifier access = nir_intrinsic_access(instr);
1054 unsigned qualifier = 0;
1055
1056 if (access & ACCESS_COHERENT)
1057 qualifier |= TGSI_MEMORY_COHERENT;
1058 if (access & ACCESS_VOLATILE)
1059 qualifier |= TGSI_MEMORY_VOLATILE;
1060 if (access & ACCESS_RESTRICT)
1061 qualifier |= TGSI_MEMORY_RESTRICT;
1062
1063 return qualifier;
1064 }
1065
1066 static void
1067 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1068 nir_variable_mode mode)
1069 {
1070 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1071 instr->intrinsic == nir_intrinsic_store_shared);
1072 bool is_load = (instr->intrinsic == nir_intrinsic_load_ssbo ||
1073 instr->intrinsic == nir_intrinsic_load_shared);
1074 unsigned opcode;
1075 struct ureg_src src[4];
1076 int num_src = 0;
1077 int nir_src;
1078
1079 struct ureg_src memory;
1080 switch (mode) {
1081 case nir_var_mem_ssbo:
1082 /* XXX: TGSI should have BUFFER declarations for the SSBOs. Needed for
1083 * r600, nv50, llvmpipe.
1084 */
1085 memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1086 instr->src[is_store ? 1 : 0]);
1087 nir_src = 1;
1088 break;
1089 case nir_var_mem_shared:
1090 memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1091 nir_src = 0;
1092 break;
1093 default:
1094 unreachable("unknown memory type");
1095 }
1096
1097 if (is_store) {
1098 src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
1099 src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1100 } else {
1101 src[num_src++] = memory;
1102 if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1103 src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
1104 if (!is_load)
1105 src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
1106 }
1107 }
1108
1109
1110 switch (instr->intrinsic) {
1111 case nir_intrinsic_ssbo_atomic_add:
1112 case nir_intrinsic_shared_atomic_add:
1113 opcode = TGSI_OPCODE_ATOMUADD;
1114 break;
1115 case nir_intrinsic_ssbo_atomic_fadd:
1116 case nir_intrinsic_shared_atomic_fadd:
1117 opcode = TGSI_OPCODE_ATOMFADD;
1118 break;
1119 case nir_intrinsic_ssbo_atomic_imin:
1120 case nir_intrinsic_shared_atomic_imin:
1121 opcode = TGSI_OPCODE_ATOMIMIN;
1122 break;
1123 case nir_intrinsic_ssbo_atomic_imax:
1124 case nir_intrinsic_shared_atomic_imax:
1125 opcode = TGSI_OPCODE_ATOMIMAX;
1126 break;
1127 case nir_intrinsic_ssbo_atomic_umin:
1128 case nir_intrinsic_shared_atomic_umin:
1129 opcode = TGSI_OPCODE_ATOMUMIN;
1130 break;
1131 case nir_intrinsic_ssbo_atomic_umax:
1132 case nir_intrinsic_shared_atomic_umax:
1133 opcode = TGSI_OPCODE_ATOMUMAX;
1134 break;
1135 case nir_intrinsic_ssbo_atomic_and:
1136 case nir_intrinsic_shared_atomic_and:
1137 opcode = TGSI_OPCODE_ATOMAND;
1138 break;
1139 case nir_intrinsic_ssbo_atomic_or:
1140 case nir_intrinsic_shared_atomic_or:
1141 opcode = TGSI_OPCODE_ATOMOR;
1142 break;
1143 case nir_intrinsic_ssbo_atomic_xor:
1144 case nir_intrinsic_shared_atomic_xor:
1145 opcode = TGSI_OPCODE_ATOMXOR;
1146 break;
1147 case nir_intrinsic_ssbo_atomic_exchange:
1148 case nir_intrinsic_shared_atomic_exchange:
1149 opcode = TGSI_OPCODE_ATOMXCHG;
1150 break;
1151 case nir_intrinsic_ssbo_atomic_comp_swap:
1152 case nir_intrinsic_shared_atomic_comp_swap:
1153 opcode = TGSI_OPCODE_ATOMCAS;
1154 src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
1155 break;
1156 case nir_intrinsic_load_ssbo:
1157 case nir_intrinsic_load_shared:
1158 opcode = TGSI_OPCODE_LOAD;
1159 break;
1160 case nir_intrinsic_store_ssbo:
1161 case nir_intrinsic_store_shared:
1162 opcode = TGSI_OPCODE_STORE;
1163 break;
1164 case nir_intrinsic_get_ssbo_size:
1165 opcode = TGSI_OPCODE_RESQ;
1166 break;
1167 default:
1168 unreachable("unknown memory op");
1169 }
1170
1171 unsigned qualifier = 0;
1172 if (mode == nir_var_mem_ssbo &&
1173 instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1174 qualifier = ntt_get_access_qualifier(instr);
1175 }
1176
1177 struct ureg_dst dst;
1178 if (is_store) {
1179 dst = ureg_dst(memory);
1180
1181 unsigned write_mask = nir_intrinsic_write_mask(instr);
1182 if (nir_src_bit_size(instr->src[0]) == 64)
1183 write_mask = ntt_64bit_write_mask(write_mask);
1184 dst = ureg_writemask(dst, write_mask);
1185 } else {
1186 dst = ntt_get_dest(c, &instr->dest);
1187 }
1188
1189 ureg_memory_insn(c->ureg, opcode,
1190 &dst, 1,
1191 src, num_src,
1192 qualifier,
1193 TGSI_TEXTURE_BUFFER,
1194 0 /* format: unused */);
1195 }
1196
1197 static enum tgsi_texture_type
1198 tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array)
1199 {
1200 switch (dim) {
1201 case GLSL_SAMPLER_DIM_1D:
1202 return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
1203 case GLSL_SAMPLER_DIM_2D:
1204 return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
1205 case GLSL_SAMPLER_DIM_3D:
1206 return TGSI_TEXTURE_3D;
1207 case GLSL_SAMPLER_DIM_CUBE:
1208 return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
1209 case GLSL_SAMPLER_DIM_RECT:
1210 return TGSI_TEXTURE_RECT;
1211 case GLSL_SAMPLER_DIM_BUF:
1212 return TGSI_TEXTURE_BUFFER;
1213 default:
1214 unreachable("unknown sampler dim");
1215 }
1216 }
1217
1218 static void
1219 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
1220 {
1221 unsigned op;
1222 struct ureg_src srcs[3];
1223 int num_src = 0;
1224
1225 enum tgsi_texture_type target =
1226 tgsi_target_from_sampler_dim(nir_intrinsic_image_dim(instr),
1227 nir_intrinsic_image_array(instr));
1228
1229 struct ureg_src resource =
1230 ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
1231 instr->src[0]);
1232
1233 struct ureg_dst dst;
1234 if (instr->intrinsic == nir_intrinsic_image_store) {
1235 dst = ureg_dst(resource);
1236 } else {
1237 srcs[num_src++] = resource;
1238 dst = ntt_get_dest(c, &instr->dest);
1239 }
1240
1241 if (instr->intrinsic != nir_intrinsic_image_size) {
1242 srcs[num_src++] = ntt_get_src(c, instr->src[1]); /* coord */
1243 /* XXX: src[2] sample index to coord.z (2d) or coord.w (2darray) */
1244 if (instr->intrinsic != nir_intrinsic_image_load) {
1245 srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
1246 if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
1247 srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
1248 }
1249 }
1250
1251 switch (instr->intrinsic) {
1252 case nir_intrinsic_image_load:
1253 op = TGSI_OPCODE_LOAD;
1254 break;
1255 case nir_intrinsic_image_store:
1256 op = TGSI_OPCODE_STORE;
1257 break;
1258 case nir_intrinsic_image_size:
1259 op = TGSI_OPCODE_RESQ;
1260 break;
1261 case nir_intrinsic_image_atomic_add:
1262 op = TGSI_OPCODE_ATOMUADD;
1263 break;
1264 case nir_intrinsic_image_atomic_fadd:
1265 op = TGSI_OPCODE_ATOMFADD;
1266 break;
1267 case nir_intrinsic_image_atomic_imin:
1268 op = TGSI_OPCODE_ATOMIMIN;
1269 break;
1270 case nir_intrinsic_image_atomic_umin:
1271 op = TGSI_OPCODE_ATOMUMIN;
1272 break;
1273 case nir_intrinsic_image_atomic_imax:
1274 op = TGSI_OPCODE_ATOMIMAX;
1275 break;
1276 case nir_intrinsic_image_atomic_umax:
1277 op = TGSI_OPCODE_ATOMUMAX;
1278 break;
1279 case nir_intrinsic_image_atomic_and:
1280 op = TGSI_OPCODE_ATOMAND;
1281 break;
1282 case nir_intrinsic_image_atomic_or:
1283 op = TGSI_OPCODE_ATOMOR;
1284 break;
1285 case nir_intrinsic_image_atomic_xor:
1286 op = TGSI_OPCODE_ATOMXOR;
1287 break;
1288 case nir_intrinsic_image_atomic_exchange:
1289 op = TGSI_OPCODE_ATOMXCHG;
1290 break;
1291 case nir_intrinsic_image_atomic_comp_swap:
1292 op = TGSI_OPCODE_ATOMCAS;
1293 break;
1294 default:
1295 unreachable("bad op");
1296 }
1297
1298 ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
1299 ntt_get_access_qualifier(instr),
1300 target,
1301 nir_intrinsic_format(instr));
1302 }
1303
1304 static void
1305 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
1306 {
1307 uint32_t frac = nir_intrinsic_component(instr);
1308 uint32_t num_components = instr->num_components;
1309 unsigned base = nir_intrinsic_base(instr);
1310 struct ureg_src input;
1311 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1312 bool is_64 = nir_dest_bit_size(instr->dest) == 64;
1313
1314 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1315 input = ureg_DECL_vs_input(c->ureg, base);
1316 for (int i = 1; i < semantics.num_slots; i++)
1317 ureg_DECL_vs_input(c->ureg, base + i);
1318 } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
1319 unsigned semantic_name, semantic_index;
1320 ntt_get_gl_varying_semantic(c, semantics.location,
1321 &semantic_name, &semantic_index);
1322
1323 /* XXX: ArrayID is used in r600 gs inputs */
1324 uint32_t array_id = 0;
1325
1326 input = ureg_DECL_input_layout(c->ureg,
1327 semantic_name,
1328 semantic_index,
1329 base,
1330 ntt_tgsi_usage_mask(frac,
1331 instr->num_components,
1332 is_64),
1333 array_id,
1334 semantics.num_slots);
1335 } else {
1336 input = c->input_index_map[base];
1337 }
1338
1339 if (is_64)
1340 num_components *= 2;
1341
1342 input = ntt_shift_by_frac(input, frac, num_components);
1343
1344 switch (instr->intrinsic) {
1345 case nir_intrinsic_load_input:
1346 input = ntt_ureg_src_indirect(c, input, instr->src[0]);
1347 ntt_store(c, &instr->dest, input);
1348 break;
1349
1350 case nir_intrinsic_load_per_vertex_input:
1351 input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1352 input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
1353 ntt_store(c, &instr->dest, input);
1354 break;
1355
1356 case nir_intrinsic_load_interpolated_input: {
1357 input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1358
1359 nir_intrinsic_instr *bary_instr =
1360 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1361
1362 switch (bary_instr->intrinsic) {
1363 case nir_intrinsic_load_barycentric_pixel:
1364 ntt_store(c, &instr->dest, input);
1365 break;
1366
1367 case nir_intrinsic_load_barycentric_centroid:
1368 /* If the input was declared centroid, then there's no need to
1369 * emit the extra TGSI interp instruction, we can just read the
1370 * input.
1371 */
1372 if (c->centroid_inputs & (1 << nir_intrinsic_base(instr))) {
1373 ntt_store(c, &instr->dest, input);
1374 } else {
1375 ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
1376 input);
1377 }
1378 break;
1379
1380 case nir_intrinsic_load_barycentric_at_sample:
1381 ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
1382 ureg_imm1u(c->ureg,
1383 nir_src_as_uint(bary_instr->src[0])));
1384 break;
1385
1386 case nir_intrinsic_load_barycentric_at_offset:
1387 /* We stored the offset in the fake "bary" dest. */
1388 ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
1389 ntt_get_src(c, instr->src[0]));
1390 break;
1391
1392 default:
1393 unreachable("bad barycentric interp intrinsic\n");
1394 }
1395 break;
1396 }
1397
1398 default:
1399 unreachable("bad load input intrinsic\n");
1400 }
1401 }
1402
1403 static void
1404 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1405 {
1406 /* TODO: When making an SSA def's storage, we should check if it's only
1407 * used as the source of a store_output and point it at our
1408 * TGSI_FILE_OUTPUT instead of generating the extra MOV here.
1409 */
1410 uint32_t base = nir_intrinsic_base(instr);
1411 struct ureg_src src = ntt_get_src(c, instr->src[0]);
1412 bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
1413 struct ureg_dst out;
1414 nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1415 uint32_t frac = nir_intrinsic_component(instr);
1416
1417 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1418 if (semantics.location == FRAG_RESULT_COLOR)
1419 ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
1420
1421 unsigned semantic_name, semantic_index;
1422 tgsi_get_gl_frag_result_semantic(semantics.location,
1423 &semantic_name, &semantic_index);
1424 semantic_index += semantics.dual_source_blend_index;
1425
1426 out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
1427
1428 switch (semantics.location) {
1429 case FRAG_RESULT_DEPTH:
1430 frac = 2; /* z writes go to the .z channel in TGSI */
1431 break;
1432 case FRAG_RESULT_STENCIL:
1433 frac = 1;
1434 break;
1435 default:
1436 break;
1437 }
1438 } else {
1439 unsigned semantic_name, semantic_index;
1440
1441 ntt_get_gl_varying_semantic(c, semantics.location,
1442 &semantic_name, &semantic_index);
1443
1444 uint32_t usage_mask = ntt_tgsi_usage_mask(frac,
1445 instr->num_components,
1446 is_64);
1447 uint32_t gs_streams = semantics.gs_streams;
1448 for (int i = 0; i < 4; i++) {
1449 if (!(usage_mask & (1 << i)))
1450 gs_streams &= ~(0x3 << 2 * i);
1451 }
1452
1453 /* XXX: array_id is used in svga tess. */
1454 unsigned array_id = 0;
1455
1456 /* This bit is lost in the i/o semantics, but it's unused in in-tree
1457 * drivers.
1458 */
1459 bool invariant = false;
1460
1461 out = ureg_DECL_output_layout(c->ureg,
1462 semantic_name, semantic_index,
1463 gs_streams,
1464 base,
1465 usage_mask,
1466 array_id,
1467 semantics.num_slots,
1468 invariant);
1469 }
1470
1471 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1472
1473 unsigned write_mask = nir_intrinsic_write_mask(instr);
1474
1475 if (is_64) {
1476 write_mask = ntt_64bit_write_mask(write_mask);
1477 if (frac >= 2)
1478 write_mask = write_mask << 2;
1479 } else {
1480 write_mask = write_mask << frac;
1481 }
1482
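/* Build a swizzle that shifts the value up from component 0 of the source to
 * the output's starting component.
 */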
1483 uint8_t swizzle[4] = { 0, 0, 0, 0 };
1484 for (int i = frac; i <= 4; i++) {
1485 if (write_mask & (1 << i))
1486 swizzle[i] = i - frac;
1487 }
1488
1489 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1490 out = ureg_writemask(out, write_mask);
1491
1492 ureg_MOV(c->ureg, out, src);
1493 ntt_reladdr_dst_put(c, out);
1494 }
1495
1496 static void
1497 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
1498 {
1499 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1500 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1501 ntt_store(c, &instr->dest, ureg_DECL_system_value(c->ureg, semantic, 0));
1502 }
1503
1504 static void
1505 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
1506 {
1507 switch (instr->intrinsic) {
1508 case nir_intrinsic_load_uniform:
1509 ntt_emit_load_uniform(c, instr);
1510 break;
1511
1512 case nir_intrinsic_load_ubo:
1513 ntt_emit_load_ubo(c, instr);
1514 break;
1515
1516 case nir_intrinsic_load_ubo_vec4:
1517 ntt_emit_load_ubo_vec4(c, instr);
1518 break;
1519
1520 /* Vertex */
1521 case nir_intrinsic_load_vertex_id:
1522 case nir_intrinsic_load_vertex_id_zero_base:
1523 case nir_intrinsic_load_base_vertex:
1524 case nir_intrinsic_load_base_instance:
1525 case nir_intrinsic_load_instance_id:
1526 case nir_intrinsic_load_draw_id:
1527 case nir_intrinsic_load_invocation_id:
1528 case nir_intrinsic_load_frag_coord:
1529 case nir_intrinsic_load_point_coord:
1530 case nir_intrinsic_load_front_face:
1531 case nir_intrinsic_load_sample_id:
1532 case nir_intrinsic_load_sample_mask_in:
1533 case nir_intrinsic_load_helper_invocation:
1534 case nir_intrinsic_load_tess_coord:
1535 case nir_intrinsic_load_patch_vertices_in:
1536 case nir_intrinsic_load_primitive_id:
1537 case nir_intrinsic_load_tess_level_outer:
1538 case nir_intrinsic_load_tess_level_inner:
1539 case nir_intrinsic_load_local_invocation_id:
1540 case nir_intrinsic_load_work_group_id:
1541 case nir_intrinsic_load_num_work_groups:
1542 case nir_intrinsic_load_local_group_size:
1543 case nir_intrinsic_load_subgroup_size:
1544 case nir_intrinsic_load_subgroup_invocation:
1545 case nir_intrinsic_load_subgroup_eq_mask:
1546 case nir_intrinsic_load_subgroup_ge_mask:
1547 case nir_intrinsic_load_subgroup_gt_mask:
1548 case nir_intrinsic_load_subgroup_lt_mask:
1549 ntt_emit_load_sysval(c, instr);
1550 break;
1551
1552 case nir_intrinsic_load_input:
1553 case nir_intrinsic_load_per_vertex_input:
1554 case nir_intrinsic_load_interpolated_input:
1555 ntt_emit_load_input(c, instr);
1556 break;
1557
1558 case nir_intrinsic_store_output:
1559 ntt_emit_store_output(c, instr);
1560 break;
1561
1562 case nir_intrinsic_discard:
1563 ureg_KILL(c->ureg);
1564 break;
1565
1566 case nir_intrinsic_discard_if: {
1567 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
1568
1569 if (c->native_integers) {
1570 struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
1571 ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
1572 ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
1573 ureg_release_temporary(c->ureg, temp);
1574 } else {
1575 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1576 ureg_KILL_IF(c->ureg, ureg_negate(cond));
1577 }
1578 break;
1579 }
1580
1581 case nir_intrinsic_load_ssbo:
1582 case nir_intrinsic_store_ssbo:
1583 case nir_intrinsic_ssbo_atomic_add:
1584 case nir_intrinsic_ssbo_atomic_fadd:
1585 case nir_intrinsic_ssbo_atomic_imin:
1586 case nir_intrinsic_ssbo_atomic_imax:
1587 case nir_intrinsic_ssbo_atomic_umin:
1588 case nir_intrinsic_ssbo_atomic_umax:
1589 case nir_intrinsic_ssbo_atomic_and:
1590 case nir_intrinsic_ssbo_atomic_or:
1591 case nir_intrinsic_ssbo_atomic_xor:
1592 case nir_intrinsic_ssbo_atomic_exchange:
1593 case nir_intrinsic_ssbo_atomic_comp_swap:
1594 case nir_intrinsic_get_ssbo_size:
1595 ntt_emit_mem(c, instr, nir_var_mem_ssbo);
1596 break;
1597
1598 case nir_intrinsic_load_shared:
1599 case nir_intrinsic_store_shared:
1600 case nir_intrinsic_shared_atomic_add:
1601 case nir_intrinsic_shared_atomic_fadd:
1602 case nir_intrinsic_shared_atomic_imin:
1603 case nir_intrinsic_shared_atomic_imax:
1604 case nir_intrinsic_shared_atomic_umin:
1605 case nir_intrinsic_shared_atomic_umax:
1606 case nir_intrinsic_shared_atomic_and:
1607 case nir_intrinsic_shared_atomic_or:
1608 case nir_intrinsic_shared_atomic_xor:
1609 case nir_intrinsic_shared_atomic_exchange:
1610 case nir_intrinsic_shared_atomic_comp_swap:
1611 ntt_emit_mem(c, instr, nir_var_mem_shared);
1612 break;
1613
1614 case nir_intrinsic_image_load:
1615 case nir_intrinsic_image_store:
1616 case nir_intrinsic_image_size:
1617 case nir_intrinsic_image_atomic_add:
1618 case nir_intrinsic_image_atomic_fadd:
1619 case nir_intrinsic_image_atomic_imin:
1620 case nir_intrinsic_image_atomic_umin:
1621 case nir_intrinsic_image_atomic_imax:
1622 case nir_intrinsic_image_atomic_umax:
1623 case nir_intrinsic_image_atomic_and:
1624 case nir_intrinsic_image_atomic_or:
1625 case nir_intrinsic_image_atomic_xor:
1626 case nir_intrinsic_image_atomic_exchange:
1627 case nir_intrinsic_image_atomic_comp_swap:
1628 ntt_emit_image_load_store(c, instr);
1629 break;
1630
1631 case nir_intrinsic_control_barrier:
1632 ureg_BARRIER(c->ureg);
1633 break;
1634
1635 case nir_intrinsic_memory_barrier:
1636 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
1637 TGSI_MEMBAR_SHADER_BUFFER |
1638 TGSI_MEMBAR_ATOMIC_BUFFER |
1639 TGSI_MEMBAR_SHADER_IMAGE |
1640 TGSI_MEMBAR_SHARED));
1641 break;
1642
1643 case nir_intrinsic_memory_barrier_atomic_counter:
1644 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
1645 break;
1646
1647 case nir_intrinsic_memory_barrier_buffer:
1648 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
1649 break;
1650
1651 case nir_intrinsic_memory_barrier_image:
1652 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
1653 break;
1654
1655 case nir_intrinsic_memory_barrier_shared:
1656 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
1657 break;
1658
1659 case nir_intrinsic_group_memory_barrier:
1660 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
1661 TGSI_MEMBAR_SHADER_BUFFER |
1662 TGSI_MEMBAR_ATOMIC_BUFFER |
1663 TGSI_MEMBAR_SHADER_IMAGE |
1664 TGSI_MEMBAR_SHARED |
1665 TGSI_MEMBAR_THREAD_GROUP));
1666 break;
1667
1668 case nir_intrinsic_end_primitive:
1669 ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
1670 break;
1671
1672 case nir_intrinsic_emit_vertex:
1673 ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
1674 break;
1675
1676 /* In TGSI we don't actually generate the barycentric coords, and emit
1677 * interp intrinsics later. However, we do need to store the _at_offset
1678 * argument so that we can use it at that point.
1679 */
1680 case nir_intrinsic_load_barycentric_pixel:
1681 case nir_intrinsic_load_barycentric_centroid:
1682 case nir_intrinsic_load_barycentric_at_sample:
1683 break;
1684
1685 case nir_intrinsic_load_barycentric_at_offset:
1686 ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
1687 break;
1688
1689 default:
1690 fprintf(stderr, "Unknown intrinsic: ");
1691 nir_print_instr(&instr->instr, stderr);
1692 fprintf(stderr, "\n");
1693 break;
1694 }
1695 }
1696
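/* State for packing a texture instruction's NIR sources (coord, comparator,
 * bias, LOD, ...) into up to four vec4 TGSI operands: i is the current
 * operand and chan the next free channel within it.
 */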
1697 struct ntt_tex_operand_state {
1698 struct ureg_src srcs[4];
1699 unsigned i;
1700 unsigned chan;
1701 bool is_temp[4];
1702 };
1703
1704 static void
1705 ntt_push_tex_arg(struct ntt_compile *c,
1706 nir_tex_instr *instr,
1707 nir_tex_src_type tex_src_type,
1708 struct ntt_tex_operand_state *s)
1709 {
1710 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
1711 if (tex_src < 0)
1712 return;
1713
1714 struct ureg_src src = ntt_get_src(c, instr->src[tex_src].src);
1715 int num_components = nir_tex_instr_src_size(instr, tex_src);
1716
1717 /* Find which src in the tex args we'll fit in. */
1718 if (s->chan + num_components > 4) {
1719 s->chan = 0;
1720 s->i++;
1721 }
1722
1723 /* Would need to fix up swizzling up to the writemask channel here. */
1724 assert(num_components == 1 || s->chan == 0);
1725 if (num_components == 1)
1726 src = ureg_scalar(src, 0);
1727
1728 if (ureg_src_is_undef(s->srcs[s->i])) {
1729 /* First emit of a tex operand's components, no need for a mov. */
1730 s->srcs[s->i] = src;
1731 } else {
1732 /* Otherwise, we need to have a temporary for all the components that go
1733 * in this operand.
1734 */
1735 if (!s->is_temp[s->i]) {
1736 struct ureg_src prev_src = s->srcs[s->i];
1737 s->srcs[s->i] = ureg_src(ureg_DECL_temporary(c->ureg));
1738 s->is_temp[s->i] = true;
1739
1740 ureg_MOV(c->ureg,
1741 ureg_writemask(ureg_dst(s->srcs[s->i]),
1742 BITFIELD_MASK(s->chan)), prev_src);
1743 }
1744
1745 ureg_MOV(c->ureg,
1746 ureg_writemask(ureg_dst(s->srcs[s->i]),
1747 BITFIELD_RANGE(s->chan, num_components)),
1748 src);
1749 }
1750
1751 s->chan += num_components;
1752 }
1753
1754 static void
1755 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
1756 {
1757 struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
1758 unsigned target;
1759 unsigned tex_opcode;
1760
1761 struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
1762 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
1763 if (sampler_src >= 0) {
1764 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
1765 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
1766 }
1767
1768 switch (instr->op) {
1769 case nir_texop_tex:
1770 tex_opcode = TGSI_OPCODE_TEX;
1771 break;
1772 case nir_texop_txf:
1773 case nir_texop_txf_ms:
1774 /* XXX: Support txf_lz */
1775 tex_opcode = TGSI_OPCODE_TXF;
1776 break;
1777 case nir_texop_txl:
1778 tex_opcode = TGSI_OPCODE_TXL;
1779 break;
1780 case nir_texop_txb:
1781 tex_opcode = TGSI_OPCODE_TXB;
1782 break;
1783 case nir_texop_txd:
1784 tex_opcode = TGSI_OPCODE_TXD;
1785 break;
1786 case nir_texop_txs:
1787 tex_opcode = TGSI_OPCODE_TXQ;
1788 break;
1789 case nir_texop_tg4:
1790 tex_opcode = TGSI_OPCODE_TG4;
1791 break;
1792 case nir_texop_query_levels:
1793 tex_opcode = TGSI_OPCODE_TXQ;
1794 break;
1795 case nir_texop_lod:
1796 tex_opcode = TGSI_OPCODE_LODQ;
1797 break;
1798 case nir_texop_texture_samples:
1799 tex_opcode = TGSI_OPCODE_TXQS;
1800 break;
1801 default:
1802 unreachable("unsupported tex op");
1803 }
1804
1805 struct ntt_tex_operand_state s = { .i = 0 };
1806 ntt_push_tex_arg(c, instr, nir_tex_src_coord, &s);
1807 /* We always have at least two slots for the coordinate, even on 1D. */
1808 s.chan = MAX2(s.chan, 2);
1809
1810 ntt_push_tex_arg(c, instr, nir_tex_src_comparator, &s);
1811 s.chan = MAX2(s.chan, 3);
1812
1813 ntt_push_tex_arg(c, instr, nir_tex_src_bias, &s);
1814 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
1815
1816 /* End of packed src setup, everything that follows gets its own operand. */
1817 if (s.chan)
1818 s.i++;
1819
1820 switch (instr->sampler_dim) {
1821 case GLSL_SAMPLER_DIM_1D:
1822 if (instr->is_array) {
1823 if (instr->is_shadow) {
1824 target = TGSI_TEXTURE_SHADOW1D_ARRAY;
1825 } else {
1826 target = TGSI_TEXTURE_1D_ARRAY;
1827 }
1828 } else {
1829 if (instr->is_shadow) {
1830 target = TGSI_TEXTURE_SHADOW1D;
1831 } else {
1832 target = TGSI_TEXTURE_1D;
1833 }
1834 }
1835 break;
1836 case GLSL_SAMPLER_DIM_2D:
1837 case GLSL_SAMPLER_DIM_EXTERNAL:
1838 if (instr->is_array) {
1839 if (instr->is_shadow) {
1840 target = TGSI_TEXTURE_SHADOW2D_ARRAY;
1841 } else {
1842 target = TGSI_TEXTURE_2D_ARRAY;
1843 }
1844 } else {
1845 if (instr->is_shadow) {
1846 target = TGSI_TEXTURE_SHADOW2D;
1847 } else {
1848 target = TGSI_TEXTURE_2D;
1849 }
1850 }
1851 break;
1852 case GLSL_SAMPLER_DIM_MS:
1853 if (instr->is_array) {
1854 target = TGSI_TEXTURE_2D_ARRAY_MSAA;
1855 } else {
1856 target = TGSI_TEXTURE_2D_MSAA;
1857 }
1858 break;
1859 case GLSL_SAMPLER_DIM_3D:
1860 assert(!instr->is_shadow);
1861 target = TGSI_TEXTURE_3D;
1862 break;
1863 case GLSL_SAMPLER_DIM_RECT:
1864 if (instr->is_shadow) {
1865 target = TGSI_TEXTURE_SHADOWRECT;
1866 } else {
1867 target = TGSI_TEXTURE_RECT;
1868 }
1869 break;
1870 case GLSL_SAMPLER_DIM_CUBE:
1871 if (instr->is_array) {
1872 if (instr->is_shadow) {
1873 target = TGSI_TEXTURE_SHADOWCUBE_ARRAY;
1874 } else {
1875 target = TGSI_TEXTURE_CUBE_ARRAY;
1876 }
1877 } else {
1878 if (instr->is_shadow) {
1879 target = TGSI_TEXTURE_SHADOWCUBE;
1880 } else {
1881 target = TGSI_TEXTURE_CUBE;
1882 }
1883 }
1884 break;
1885 case GLSL_SAMPLER_DIM_BUF:
1886 target = TGSI_TEXTURE_BUFFER;
1887 break;
1888 default:
1889 fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim);
1890 abort();
1891 }
1892
1893 if (s.i > 1) {
1894 if (tex_opcode == TGSI_OPCODE_TEX)
1895 tex_opcode = TGSI_OPCODE_TEX2;
1896 if (tex_opcode == TGSI_OPCODE_TXB)
1897 tex_opcode = TGSI_OPCODE_TXB2;
1898 if (tex_opcode == TGSI_OPCODE_TXL)
1899 tex_opcode = TGSI_OPCODE_TXL2;
1900 }
1901
1902 if (instr->op == nir_texop_txd) {
1903 /* Derivs appear in their own src args */
1904 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
1905 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1906 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
1907 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
1908 }
1909
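/* For gathers, the component to fetch goes either in the sampler swizzle
 * (when the driver exposes PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE) or as an
 * extra immediate operand.
 */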
1910 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
1911 if (c->screen->get_param(c->screen,
1912 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
1913 sampler = ureg_scalar(sampler, instr->component);
1914 s.srcs[s.i++] = ureg_src_undef();
1915 } else {
1916 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
1917 }
1918 }
1919
1920 s.srcs[s.i++] = sampler;
1921
1922 enum tgsi_return_type tex_type;
1923 switch (instr->dest_type) {
1924 case nir_type_float:
1925 tex_type = TGSI_RETURN_TYPE_FLOAT;
1926 break;
1927 case nir_type_int:
1928 tex_type = TGSI_RETURN_TYPE_SINT;
1929 break;
1930 case nir_type_uint:
1931 tex_type = TGSI_RETURN_TYPE_UINT;
1932 break;
1933 default:
1934 unreachable("unknown texture type");
1935 }
1936
1937 struct tgsi_texture_offset tex_offsets[4];
1938 unsigned num_tex_offsets = 0;
1939 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
1940 if (tex_offset_src >= 0) {
1941 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
1942
1943 tex_offsets[0].File = offset.File;
1944 tex_offsets[0].Index = offset.Index;
1945 tex_offsets[0].SwizzleX = offset.SwizzleX;
1946 tex_offsets[0].SwizzleY = offset.SwizzleY;
1947 tex_offsets[0].SwizzleZ = offset.SwizzleZ;
1948 tex_offsets[0].Padding = 0;
1949
1950 num_tex_offsets = 1;
1951 }
1952
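/* TXQ returns the mip level count in .w, so for query_levels we query into a
 * temp and then copy just that channel to the destination.
 */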
1953 struct ureg_dst tex_dst;
1954 if (instr->op == nir_texop_query_levels)
1955 tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
1956 else
1957 tex_dst = dst;
1958
1959 ureg_tex_insn(c->ureg, tex_opcode,
1960 &tex_dst, 1,
1961 target,
1962 tex_type,
1963 tex_offsets, num_tex_offsets,
1964 s.srcs, s.i);
1965
1966 if (instr->op == nir_texop_query_levels) {
1967 ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
1968 ureg_release_temporary(c->ureg, tex_dst);
1969 }
1970
1971 for (int i = 0; i < s.i; i++) {
1972 if (s.is_temp[i])
1973 ureg_release_temporary(c->ureg, ureg_dst(s.srcs[i]));
1974 }
1975 }
1976
1977 static void
1978 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
1979 {
1980 switch (jump->type) {
1981 case nir_jump_break:
1982 ureg_BRK(c->ureg);
1983 break;
1984
1985 case nir_jump_continue:
1986 ureg_CONT(c->ureg);
1987 break;
1988
1989 default:
1990 fprintf(stderr, "Unknown jump instruction: ");
1991 nir_print_instr(&jump->instr, stderr);
1992 fprintf(stderr, "\n");
1993 abort();
1994 }
1995 }
1996
1997 static void
1998 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
1999 {
2000 /* Nothing to do but make sure that we have some storage to deref. */
2001 (void)ntt_get_ssa_def_decl(c, &instr->def);
2002 }
2003
2004 static void
2005 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2006 {
2007 /* There is no addr reg in use before we start emitting an instr. */
2008 c->next_addr_reg = 0;
2009
2010 switch (instr->type) {
2011 case nir_instr_type_deref:
2012 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2013 break;
2014
2015 case nir_instr_type_alu:
2016 ntt_emit_alu(c, nir_instr_as_alu(instr));
2017 break;
2018
2019 case nir_instr_type_intrinsic:
2020 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2021 break;
2022
2023 case nir_instr_type_load_const:
2024 /* Nothing to do here, as load consts are done directly from
2025 * ntt_get_src() (since many constant NIR srcs will often get folded
2026 * directly into a register file index instead of as a TGSI src).
2027 */
2028 break;
2029
2030 case nir_instr_type_tex:
2031 ntt_emit_texture(c, nir_instr_as_tex(instr));
2032 break;
2033
2034 case nir_instr_type_jump:
2035 ntt_emit_jump(c, nir_instr_as_jump(instr));
2036 break;
2037
2038 case nir_instr_type_ssa_undef:
2039 ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2040 break;
2041
2042 default:
2043 fprintf(stderr, "Unknown NIR instr type: ");
2044 nir_print_instr(instr, stderr);
2045 fprintf(stderr, "\n");
2046 abort();
2047 }
2048 }
2049
2050 static void
2051 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2052 {
2053 unsigned label;
2054 ureg_UIF(c->ureg, c->if_cond, &label);
2055 ntt_emit_cf_list(c, &if_stmt->then_list);
2056
2057 if (!exec_list_is_empty(&if_stmt->else_list)) {
2058 ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2059 ureg_ELSE(c->ureg, &label);
2060 ntt_emit_cf_list(c, &if_stmt->else_list);
2061 }
2062
2063 ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2064 ureg_ENDIF(c->ureg);
2065 }
2066
2067 static void
2068 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2069 {
2070 unsigned last_loop_label = c->loop_label;
2071
2072 unsigned begin_label;
2073 ureg_BGNLOOP(c->ureg, &begin_label);
2074 ntt_emit_cf_list(c, &loop->body);
2075
2076 /* XXX: Need to set cont/break labels for svga, nv30, nv50.
2077 *
2078 * ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2079 */
2080 unsigned end_label;
2081 ureg_ENDLOOP(c->ureg, &end_label);
2082
2083 c->loop_label = last_loop_label;
2084 }
2085
2086 static void
2087 ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
2088 {
2089 /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
2090 if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
2091 return;
2092
2093 ureg_release_temporary(c->ureg, c->ssa_temp[index]);
2094 memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
2095 }
2096
2097 /* Releases any temporaries for SSA defs with a live interval ending at this
2098 * instruction.
2099 */
2100 static bool
2101 ntt_src_live_interval_end_cb(nir_src *src, void *state)
2102 {
2103 struct ntt_compile *c = state;
2104
2105 if (src->is_ssa) {
2106 nir_ssa_def *def = src->ssa;
2107
2108 if (c->liveness->defs[def->index].end == src->parent_instr->index)
2109 ntt_free_ssa_temp_by_index(c, def->index);
2110 }
2111
2112 return true;
2113 }
2114
2115 static void
2116 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2117 {
2118 nir_foreach_instr(instr, block) {
2119 ntt_emit_instr(c, instr);
2120
2121 nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
2122 }
2123
2124 /* Set up the if condition for ntt_emit_if(), which we have to do before
2125 * freeing up the temps (the "if" is treated as inside the block for liveness
2126 * purposes, despite not being an instruction)
2127 */
2128 nir_if *nif = nir_block_get_following_if(block);
2129 if (nif)
2130 c->if_cond = ntt_get_src(c, nif->condition);
2131
2132 /* Free up any SSA temps that are unused at the end of the block. */
2133 unsigned index;
2134 BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
2135 unsigned def_end_ip = c->liveness->defs[index].end;
2136 if (def_end_ip == block->end_ip)
2137 ntt_free_ssa_temp_by_index(c, index);
2138 }
2139 }
2140
2141 static void
2142 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2143 {
2144 /* There is no addr reg in use before we start emitting any part of a CF
2145 * node (such as an if condition)
2146 */
2147 c->next_addr_reg = 0;
2148
2149 foreach_list_typed(nir_cf_node, node, node, list) {
2150 switch (node->type) {
2151 case nir_cf_node_block:
2152 ntt_emit_block(c, nir_cf_node_as_block(node));
2153 break;
2154
2155 case nir_cf_node_if:
2156 ntt_emit_if(c, nir_cf_node_as_if(node));
2157 break;
2158
2159 case nir_cf_node_loop:
2160 ntt_emit_loop(c, nir_cf_node_as_loop(node));
2161 break;
2162
2163 default:
2164 unreachable("unknown CF type");
2165 }
2166 }
2167 }
2168
2169 static void
2170 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
2171 {
2172 /* reindex values so the numbers are reasonably small despite
2173 * optimization having deleted most of them.
2174 */
2175 nir_index_ssa_defs(impl);
2176 nir_index_local_regs(impl);
2177
2178 nir_index_instrs(impl);
2179
2180 c->impl = impl;
2181 c->liveness = nir_live_ssa_defs_per_instr(impl);
2182
2183 c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc);
2184 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
2185
2186 ntt_setup_registers(c, &impl->registers);
2187 ntt_emit_cf_list(c, &impl->body);
2188
2189 ralloc_free(c->liveness);
2190 c->liveness = NULL;
2191 }
2192
2193 static int
2194 type_size(const struct glsl_type *type, bool bindless)
2195 {
2196 return glsl_count_attribute_slots(type, false);
2197 }
2198
2199 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
2200 * can handle for 64-bit values in TGSI.
2201 */
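/* (A 64-bit component occupies two 32-bit channels, and 64-bit values are
 * lowered to at most vec2 per slot, so a merged op can't exceed two
 * components in total.)
 */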
2202 static bool
2203 ntt_should_vectorize_instr(const nir_instr *in_a, const nir_instr *in_b,
2204 void *data)
2205 {
2206 if (in_a->type != nir_instr_type_alu)
2207 return false;
2208
2209 nir_alu_instr *a = nir_instr_as_alu(in_a);
2210 nir_alu_instr *b = nir_instr_as_alu(in_b);
2211
2212 unsigned a_num_components = a->dest.dest.ssa.num_components;
2213 unsigned b_num_components = b->dest.dest.ssa.num_components;
2214
2215 int src_bit_size = nir_src_bit_size(a->src[0].src);
2216 int dst_bit_size = nir_dest_bit_size(a->dest.dest);
2217
2218 if (src_bit_size == 64 || dst_bit_size == 64) {
2219 if (a_num_components + b_num_components > 2)
2220 return false;
2221 }
2222
2223 return true;
2224 }
2225
2226 static bool
2227 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
2228 unsigned num_components, unsigned high_offset,
2229 nir_intrinsic_instr *low, nir_intrinsic_instr *high)
2230 {
2231 if (bit_size != 32)
2232 return false;
2233
2234 /* Our offset alignment should always be at least 4 bytes */
2235 if (align < 4)
2236 return false;
2237
2238 /* No wrapping off the end of a TGSI reg. We could do a bit better by
2239 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
2240 * need this restriction.
2241 */
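/* For example, with only 4-byte alignment the combined value could start as
 * late as the .w channel, so anything wider than one component might wrap.
 */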
2242 unsigned worst_start_component = align == 4 ? 3 : align / 4;
2243 if (worst_start_component + num_components > 4)
2244 return false;
2245
2246 return true;
2247 }
2248
2249 static nir_variable_mode
2250 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2251 {
2252 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2253 unsigned indirect_mask = 0;
2254
2255 if (!screen->get_shader_param(screen, pipe_stage,
2256 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2257 indirect_mask |= nir_var_shader_in;
2258 }
2259
2260 if (!screen->get_shader_param(screen, pipe_stage,
2261 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2262 indirect_mask |= nir_var_shader_out;
2263 }
2264
2265 if (!screen->get_shader_param(screen, pipe_stage,
2266 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2267 indirect_mask |= nir_var_function_temp;
2268 }
2269
2270 return indirect_mask;
2271 }
2272
2273 static void
2274 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
2275 {
2276 bool progress;
2277 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2278 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2279 unsigned control_flow_depth =
2280 screen->get_shader_param(screen, pipe_stage,
2281 PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
2282 do {
2283 progress = false;
2284
2285 NIR_PASS_V(s, nir_lower_vars_to_ssa);
2286
2287 NIR_PASS(progress, s, nir_copy_prop);
2288 NIR_PASS(progress, s, nir_opt_algebraic);
2289 NIR_PASS(progress, s, nir_opt_remove_phis);
2290 NIR_PASS(progress, s, nir_opt_conditional_discard);
2291 NIR_PASS(progress, s, nir_opt_dce);
2292 NIR_PASS(progress, s, nir_opt_dead_cf);
2293 NIR_PASS(progress, s, nir_opt_cse);
2294 NIR_PASS(progress, s, nir_opt_find_array_copies);
2295 NIR_PASS(progress, s, nir_opt_if, true);
2296 NIR_PASS(progress, s, nir_opt_peephole_select,
2297 control_flow_depth == 0 ? ~0 : 8, true, true);
2298 NIR_PASS(progress, s, nir_opt_algebraic);
2299 NIR_PASS(progress, s, nir_opt_constant_folding);
2300 NIR_PASS(progress, s, nir_opt_load_store_vectorize, nir_var_mem_ubo,
2301 ntt_should_vectorize_io, 0);
2302 NIR_PASS(progress, s, nir_opt_shrink_vectors);
2303 NIR_PASS(progress, s, nir_opt_trivial_continues);
2304 NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
2305 NIR_PASS(progress, s, nir_opt_undef);
2306 NIR_PASS(progress, s, nir_opt_loop_unroll, no_indirects_mask);
2307
2308 } while (progress);
2309 }
2310
2311 /* Scalarizes all 64-bit ALU ops. Note that we only actually need to
2312 * scalarize vec3/vec4s; we should probably fix that.
2313 */
2314 static bool
2315 scalarize_64bit(const nir_instr *instr, const void *data)
2316 {
2317 const nir_alu_instr *alu = nir_instr_as_alu(instr);
2318
2319 return (nir_dest_bit_size(alu->dest.dest) == 64 ||
2320 nir_src_bit_size(alu->src[0].src) == 64);
2321 }
2322
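/* Splits a 64-bit load/store intrinsic wider than a vec2 into two
 * instructions: a vec2 half and the remainder, with the second half's
 * base/offset/IO semantics bumped to the next slot.  The results are then
 * re-vec'd (or the stored value split) so the rest of the shader is
 * unaffected.
 */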
2323 static bool
2324 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
2325 {
2326 b->cursor = nir_after_instr(&instr->instr);
2327
2328 switch (instr->intrinsic) {
2329 case nir_intrinsic_load_uniform:
2330 case nir_intrinsic_load_ubo:
2331 case nir_intrinsic_load_ubo_vec4:
2332 case nir_intrinsic_load_ssbo:
2333 case nir_intrinsic_load_input:
2334 case nir_intrinsic_load_interpolated_input:
2335 case nir_intrinsic_load_per_vertex_input:
2336 case nir_intrinsic_store_output:
2337 case nir_intrinsic_store_ssbo:
2338 break;
2339 default:
2340 return false;
2341 }
2342
2343 if (instr->num_components <= 2)
2344 return false;
2345
2346 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
2347 if (has_dest) {
2348 if (nir_dest_bit_size(instr->dest) != 64)
2349 return false;
2350 } else {
2351 if (nir_src_bit_size(instr->src[0]) != 64)
2352 return false;
2353 }
2354
2355 nir_intrinsic_instr *first =
2356 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2357 nir_intrinsic_instr *second =
2358 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2359
2360 switch (instr->intrinsic) {
2361 case nir_intrinsic_load_uniform:
2362 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2363 break;
2364
2365 case nir_intrinsic_load_ubo:
2366 case nir_intrinsic_load_ubo_vec4:
2367 case nir_intrinsic_load_ssbo:
2368 case nir_intrinsic_store_ssbo:
2369 break;
2370
2371 default: {
2372 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
2373 semantics.location++;
2374 semantics.num_slots--;
2375 nir_intrinsic_set_io_semantics(second, semantics);
2376
2377 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2378 break;
2379 }
2380 }
2381
2382 first->num_components = 2;
2383 second->num_components -= 2;
2384 if (has_dest) {
2385 first->dest.ssa.num_components = 2;
2386 second->dest.ssa.num_components -= 2;
2387 }
2388
2389 nir_builder_instr_insert(b, &first->instr);
2390 nir_builder_instr_insert(b, &second->instr);
2391
2392 if (has_dest) {
2393 /* Merge the two loads' results back into a vector. */
2394 nir_ssa_def *channels[4] = {
2395 nir_channel(b, &first->dest.ssa, 0),
2396 nir_channel(b, &first->dest.ssa, 1),
2397 nir_channel(b, &second->dest.ssa, 0),
2398 second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
2399 };
2400 nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
2401 nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(new));
2402 } else {
2403 /* Split the src value across the two stores. */
2404 b->cursor = nir_before_instr(&instr->instr);
2405
2406 nir_ssa_def *src0 = instr->src[0].ssa;
2407 nir_ssa_def *channels[4] = { 0 };
2408 for (int i = 0; i < instr->num_components; i++)
2409 channels[i] = nir_channel(b, src0, i);
2410
2411 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
2412 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
2413
2414 nir_instr_rewrite_src(&first->instr, &first->src[0],
2415 nir_src_for_ssa(nir_vec(b, channels, 2)));
2416 nir_instr_rewrite_src(&second->instr, &second->src[0],
2417 nir_src_for_ssa(nir_vec(b, &channels[2],
2418 second->num_components)));
2419 }
2420
2421 int offset_src = -1;
2422 uint32_t offset_amount = 16;
2423
2424 switch (instr->intrinsic) {
2425 case nir_intrinsic_load_ssbo:
2426 case nir_intrinsic_load_ubo:
2427 offset_src = 1;
2428 break;
2429 case nir_intrinsic_load_ubo_vec4:
2430 offset_src = 1;
2431 offset_amount = 1;
2432 break;
2433 case nir_intrinsic_store_ssbo:
2434 offset_src = 2;
2435 break;
2436 default:
2437 break;
2438 }
2439 if (offset_src != -1) {
2440 b->cursor = nir_before_instr(&second->instr);
2441 nir_ssa_def *second_offset =
2442 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
2443 nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
2444 nir_src_for_ssa(second_offset));
2445 }
2446
2447 /* DCE stores we generated with no writemask (nothing else does this
2448 * currently).
2449 */
2450 if (!has_dest) {
2451 if (nir_intrinsic_write_mask(first) == 0)
2452 nir_instr_remove(&first->instr);
2453 if (nir_intrinsic_write_mask(second) == 0)
2454 nir_instr_remove(&second->instr);
2455 }
2456
2457 nir_instr_remove(&instr->instr);
2458
2459 return true;
2460 }
2461
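/* Splits a 64-bit load_const wider than a vec2 into two smaller load_consts
 * and re-vecs the result.
 */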
2462 static bool
2463 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
2464 {
2465 int num_components = instr->def.num_components;
2466
2467 if (instr->def.bit_size != 64 || num_components <= 2)
2468 return false;
2469
2470 b->cursor = nir_before_instr(&instr->instr);
2471
2472 nir_load_const_instr *first =
2473 nir_load_const_instr_create(b->shader, 2, 64);
2474 nir_load_const_instr *second =
2475 nir_load_const_instr_create(b->shader, num_components - 2, 64);
2476
2477 first->value[0] = instr->value[0];
2478 first->value[1] = instr->value[1];
2479 second->value[0] = instr->value[2];
2480 if (num_components == 4)
2481 second->value[1] = instr->value[3];
2482
2483 nir_builder_instr_insert(b, &first->instr);
2484 nir_builder_instr_insert(b, &second->instr);
2485
2486 nir_ssa_def *channels[4] = {
2487 nir_channel(b, &first->def, 0),
2488 nir_channel(b, &first->def, 1),
2489 nir_channel(b, &second->def, 0),
2490 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
2491 };
2492 nir_ssa_def *new = nir_vec(b, channels, num_components);
2493 nir_ssa_def_rewrite_uses(&instr->def, nir_src_for_ssa(new));
2494 nir_instr_remove(&instr->instr);
2495
2496 return true;
2497 }
2498
2499 static bool
2500 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
2501 void *data)
2502 {
2503 switch (instr->type) {
2504 case nir_instr_type_load_const:
2505 return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
2506
2507 case nir_instr_type_intrinsic:
2508 return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
2509 default:
2510 return false;
2511 }
2512 }
2513
2514 static bool
2515 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
2516 {
2517 return nir_shader_instructions_pass(s,
2518 nir_to_tgsi_lower_64bit_to_vec2_instr,
2519 nir_metadata_block_index |
2520 nir_metadata_dominance,
2521 NULL);
2522 }
2523
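/* Checks that the driver's compiler options requested the NIR lowerings this
 * backend relies on (see nir_to_tgsi_compiler_options below).
 */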
2524 static void
2525 ntt_sanity_check_driver_options(struct nir_shader *s)
2526 {
2527 UNUSED const struct nir_shader_compiler_options *options = s->options;
2528
2529 assert(options->lower_extract_byte);
2530 assert(options->lower_extract_word);
2531 assert(options->lower_fdph);
2532 assert(options->lower_flrp64);
2533 assert(options->lower_fmod);
2534 assert(options->lower_rotate);
2535 assert(options->lower_vector_cmp);
2536 }
2537
2538 const void *
2539 nir_to_tgsi(struct nir_shader *s,
2540 struct pipe_screen *screen)
2541 {
2542 struct ntt_compile *c;
2543 const void *tgsi_tokens;
2544 bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
2545 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2546 bool native_integers = screen->get_shader_param(screen,
2547 pipe_shader_type_from_mesa(s->info.stage),
2548 PIPE_SHADER_CAP_INTEGERS);
2549
2550 ntt_sanity_check_driver_options(s);
2551
2552 NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2553 type_size, (nir_lower_io_options)0);
2554 NIR_PASS_V(s, nir_lower_regs_to_ssa);
2555
2556 const nir_lower_tex_options lower_tex_options = {
2557 /* XXX: We could skip lowering of TXP for TEX with <=3 coord_components.
2558 */
2559 .lower_txp = ~0,
2560 };
2561 NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
2562
2563 /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
2564 * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
2565 * duplication logic we just make it so that we only see vec2s.
2566 */
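/* For example, a 64-bit vec4 UBO load gets split into two vec2 loads of
 * adjacent slots by nir_to_tgsi_lower_64bit_intrinsic().
 */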
2567 NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
2568 NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
2569
2570 if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
2571 NIR_PASS_V(s, nir_lower_ubo_vec4);
2572
2573 ntt_optimize_nir(s, screen);
2574
2575 NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
2576
2577 bool progress;
2578 do {
2579 progress = false;
2580 NIR_PASS(progress, s, nir_opt_algebraic_late);
2581 if (progress) {
2582 NIR_PASS_V(s, nir_copy_prop);
2583 NIR_PASS_V(s, nir_opt_dce);
2584 NIR_PASS_V(s, nir_opt_cse);
2585 }
2586 } while (progress);
2587
2588 if (screen->get_shader_param(screen,
2589 pipe_shader_type_from_mesa(s->info.stage),
2590 PIPE_SHADER_CAP_INTEGERS)) {
2591 NIR_PASS_V(s, nir_lower_bool_to_int32);
2592 } else {
2593 NIR_PASS_V(s, nir_lower_int_to_float);
2594 NIR_PASS_V(s, nir_lower_bool_to_float);
2595 }
2596
2597 NIR_PASS_V(s, nir_lower_to_source_mods,
2598 nir_lower_float_source_mods |
2599 nir_lower_int_source_mods); /* no doubles */
2600 NIR_PASS_V(s, nir_convert_from_ssa, true);
2601 NIR_PASS_V(s, nir_lower_vec_to_movs);
2602
2603 /* locals_to_regs will leave dead derefs that are good to clean up. */
2604 NIR_PASS_V(s, nir_lower_locals_to_regs);
2605 NIR_PASS_V(s, nir_opt_dce);
2606
2607 if (debug) {
2608 fprintf(stderr, "NIR before translation to TGSI:\n");
2609 nir_print_shader(s, stderr);
2610 }
2611
2612 c = rzalloc(NULL, struct ntt_compile);
2613 c->screen = screen;
2614
2615 c->needs_texcoord_semantic =
2616 screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
2617 c->any_reg_as_address =
2618 screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
2619
2620 c->s = s;
2621 c->native_integers = native_integers;
2622 c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
2623 ureg_setup_shader_info(c->ureg, &s->info);
2624
2625 ntt_setup_inputs(c);
2626 ntt_setup_uniforms(c);
2627
2628 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2629 /* The draw module's polygon stipple layer doesn't respect the chosen
2630 * coordinate mode, so leave it as unspecified unless we're actually
2631 * reading the position in the shader already. See
2632 * gl-2.1-polygon-stipple-fs on softpipe.
2633 */
2634 if ((s->info.inputs_read & VARYING_BIT_POS) ||
2635 BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
2636 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
2637 s->info.fs.origin_upper_left ?
2638 TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
2639 TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
2640
2641 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
2642 s->info.fs.pixel_center_integer ?
2643 TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
2644 TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
2645 }
2646 }
2647 /* Emit the main function */
2648 nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
2649 ntt_emit_impl(c, impl);
2650 ureg_END(c->ureg);
2651
2652 tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
2653
2654 if (debug) {
2655 fprintf(stderr, "TGSI after translation from NIR:\n");
2656 tgsi_dump(tgsi_tokens, 0);
2657 }
2658
2659 ureg_destroy(c->ureg);
2660
2661 ralloc_free(c);
2662
2663 return tgsi_tokens;
2664 }
2665
2666 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
2667 .fuse_ffma32 = true,
2668 .fuse_ffma64 = true,
2669 .lower_extract_byte = true,
2670 .lower_extract_word = true,
2671 .lower_fdph = true,
2672 .lower_flrp64 = true,
2673 .lower_fmod = true,
2674 .lower_rotate = true,
2675 .lower_sub = true,
2676 .lower_vector_cmp = true,
2677 .use_interpolated_input_intrinsics = true,
2678 };
2679
2680 /* Returns default compiler options for drivers with only nir-to-tgsi-based
2681 * NIR support.
2682 */
2683 const void *
2684 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
2685 enum pipe_shader_ir ir,
2686 unsigned shader)
2687 {
2688 assert(ir == PIPE_SHADER_IR_NIR);
2689 return &nir_to_tgsi_compiler_options;
2690 }
2691