1 /*
2 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3 * Copyright (C) 2020 Collabora Ltd.
4 * Copyright © 2016 Broadcom
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26 #include "main/glheader.h"
27 #include "compiler/nir_types.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "util/u_debug.h"
30 #include "util/fast_idiv_by_const.h"
31 #include "agx_compile.h"
32 #include "agx_compiler.h"
33 #include "agx_builder.h"
34
35 static const struct debug_named_value agx_debug_options[] = {
36 {"msgs", AGX_DBG_MSGS, "Print debug messages"},
37 {"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"},
38 {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},
39 {"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"},
40 {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},
41 {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
42 DEBUG_NAMED_VALUE_END
43 };
44
45 DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
46
47 int agx_debug = 0;
48
49 #define DBG(fmt, ...) \
50 do { if (agx_debug & AGX_DBG_MSGS) \
51 fprintf(stderr, "%s:%d: "fmt, \
52 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
53
54 /* Builds a 64-bit hash table key for an index */
55 static uint64_t
56 agx_index_to_key(agx_index idx)
57 {
58 STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
59
60 uint64_t key = 0;
61 memcpy(&key, &idx, sizeof(idx));
62 return key;
63 }
64
65 /*
66 * Extract a single channel out of a vector source. We split vectors with
67 * p_split so we can use the split components directly, without emitting a
68 * machine instruction. This has advantages for RA, as the split can usually be
69 * optimized away.
70 */
71 static agx_index
72 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
73 {
74 agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
75 agx_index_to_key(vec));
76
77 assert(components != NULL && "missing agx_emit_combine_to");
78
79 return components[channel];
80 }
81
82 static void
83 agx_cache_combine(agx_builder *b, agx_index dst,
84 agx_index s0, agx_index s1, agx_index s2, agx_index s3)
85 {
86 /* Lifetime of a hash table entry has to be at least as long as the table */
87 agx_index *channels = ralloc_array(b->shader, agx_index, 4);
88
89 channels[0] = s0;
90 channels[1] = s1;
91 channels[2] = s2;
92 channels[3] = s3;
93
94 _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
95 channels);
96 }
97
98 /*
99 * Combine multiple scalars into a vector destination. This corresponds to
100 * p_combine, lowered to moves (a shuffle in general) after register allocation.
101 *
102 * To optimize vector extractions, we record the individual channels.
103 */
104 static agx_instr *
105 agx_emit_combine_to(agx_builder *b, agx_index dst,
106 agx_index s0, agx_index s1, agx_index s2, agx_index s3)
107 {
108 agx_cache_combine(b, dst, s0, s1, s2, s3);
109 return agx_p_combine_to(b, dst, s0, s1, s2, s3);
110 }
111
112 static void
113 agx_block_add_successor(agx_block *block, agx_block *successor)
114 {
115 assert(block != NULL && successor != NULL);
116
117 /* Cull impossible edges */
118 if (block->unconditional_jumps)
119 return;
120
121 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
122 if (block->successors[i]) {
123 if (block->successors[i] == successor)
124 return;
125 else
126 continue;
127 }
128
129 block->successors[i] = successor;
130 util_dynarray_append(&successor->predecessors, agx_block *, block);
131 return;
132 }
133
134 unreachable("Too many successors");
135 }
136
137 /*
138 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
139 * split pseudo-instruction.
140 *
141 * Pre-condition: dests is filled with agx_null().
142 */
143 static void
144 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
145 {
146 /* Setup the destinations */
147 for (unsigned i = 0; i < n; ++i) {
148 dests[i] = agx_temp(b->shader, vec.size);
149 }
150
151 /* Emit the split */
152 agx_p_split_to(b, dests[0], dests[1], dests[2], dests[3], vec);
153 }
154
155 static void
156 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
157 {
158 agx_index dests[4] = { agx_null(), agx_null(), agx_null(), agx_null() };
159 agx_emit_split(b, dests, vec, n);
160 agx_cache_combine(b, vec, dests[0], dests[1], dests[2], dests[3]);
161 }
162
163 static void
164 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
165 {
166 /* Ensure we've been scalarized and bit size lowered */
167 unsigned bit_size = instr->def.bit_size;
168 assert(instr->def.num_components == 1);
169 assert(bit_size == 1 || bit_size == 16 || bit_size == 32);
170
171 /* Emit move, later passes can inline/push if useful */
172 agx_mov_imm_to(b,
173 agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
174 nir_const_value_as_uint(instr->value[0], bit_size));
175 }
176
177 /* Emit code dividing P by Q */
178 static agx_index
179 agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
180 {
181 /* P / 1 = P */
182 if (Q == 1) {
183 return P;
184 }
185
186 /* P / UINT32_MAX = 0, unless P = UINT32_MAX, in which case it's one */
187 if (Q == UINT32_MAX) {
188 agx_index max = agx_mov_imm(b, 32, UINT32_MAX);
189 agx_index one = agx_mov_imm(b, 32, 1);
190 return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ);
191 }
192
193 /* P / 2^N = P >> N */
194 if (util_is_power_of_two_or_zero(Q)) {
195 return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q)));
196 }
197
198 /* Fall back on multiplication by a magic number */
199 struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32);
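/* Summarizing the sequence emitted below (a sketch; the exact rounding
 * behaviour is defined by util_compute_fast_udiv_info):
 *
 *    n = (uint32_t) ((((uint64_t) ((P >> pre_shift) + increment) *
 *                      multiplier) >> 32) >> post_shift);
 */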
200 agx_index preshift = agx_mov_imm(b, 32, info.pre_shift);
201 agx_index increment = agx_mov_imm(b, 32, info.increment);
202 agx_index postshift = agx_mov_imm(b, 32, info.post_shift);
203 agx_index multiplier = agx_mov_imm(b, 32, info.multiplier);
204 agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64);
205 agx_index n = P;
206
207 if (info.pre_shift != 0) n = agx_ushr(b, n, preshift);
208 if (info.increment != 0) n = agx_iadd(b, n, increment, 0);
209
210 /* 64-bit multiplication: zero-extend the 32-bit x 32-bit product and take the top word */
211 agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0);
212 n = agx_temp(b->shader, AGX_SIZE_32);
213 agx_p_extract_to(b, n, multiplied, 1);
214
215 if (info.post_shift != 0) n = agx_ushr(b, n, postshift);
216
217 return n;
218 }
219
220 /* AGX appears to lack support for vertex attributes. Lower to global loads. */
221 static void
222 agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
223 {
224 nir_src *offset_src = nir_get_io_offset_src(instr);
225 assert(nir_src_is_const(*offset_src) && "no attribute indirects");
226 unsigned index = nir_intrinsic_base(instr) +
227 nir_src_as_uint(*offset_src);
228
229 struct agx_shader_key *key = b->shader->key;
230 struct agx_attribute attrib = key->vs.attributes[index];
231
232 /* address = base + (stride * vertex_id) + src_offset */
233 unsigned buf = attrib.buf;
234 unsigned stride = key->vs.vbuf_strides[buf];
235 unsigned shift = agx_format_shift(attrib.format);
236
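/* Dividing the stride by the format's element size suggests the device load
 * below scales its index by the size of attrib.format, so the index math here
 * is done in elements rather than bytes (inferred from the code, not from
 * documentation).
 */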
237 agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
238 agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
239
240 agx_index vertex_id = agx_register(10, AGX_SIZE_32);
241 agx_index instance_id = agx_register(12, AGX_SIZE_32);
242
243 /* A nonzero divisor requires dividing the instance ID. A zero divisor
244 * specifies per-vertex data. */
245 agx_index element_id = (attrib.divisor == 0) ? vertex_id :
246 agx_udiv_const(b, instance_id, attrib.divisor);
247
248 agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
249
250 /* Each VBO base is a 64-bit address (4 x 16-bit sysval units); look up the base address as a sysval */
251 unsigned num_vbos = key->vs.num_vbufs;
252 unsigned base_length = (num_vbos * 4);
253 agx_index base = agx_indexed_sysval(b->shader,
254 AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);
255
256 /* Load the data */
257 assert(instr->num_components <= 4);
258
259 unsigned actual_comps = (attrib.nr_comps_minus_1 + 1);
260 agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
261 agx_device_load_to(b, vec, base, offset, attrib.format,
262 BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
263 agx_wait(b, 0);
264
265 agx_emit_split(b, dests, vec, actual_comps);
266
267 agx_index one = agx_mov_imm(b, 32, fui(1.0));
268 agx_index zero = agx_mov_imm(b, 32, 0);
269 agx_index default_value[4] = { zero, zero, zero, one };
270
271 for (unsigned i = actual_comps; i < instr->num_components; ++i)
272 dests[i] = default_value[i];
273 }
274
275 static void
276 agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
277 {
278 unsigned components = instr->num_components;
279 assert(components >= 1 && components <= 4);
280
281 nir_src *offset = nir_get_io_offset_src(instr);
282 assert(nir_src_is_const(*offset) && "no indirects");
283 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
284 imm_index += nir_src_as_uint(*offset);
285
286 assert(nir_dest_bit_size(instr->dest) == 32 && "no 16-bit flat shading");
287
288 for (unsigned i = 0; i < components; ++i) {
289 /* A vec3 is loaded for each vertex; it is unknown what the first 2 channels are for */
290 agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
291 dests[i] = agx_p_extract(b, values, 2);
292 }
293 }
294
295 static void
296 agx_emit_load_vary(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
297 {
298 ASSERTED unsigned components = instr->num_components;
299 ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
300
301 assert(components >= 1 && components <= 4);
302 assert(parent);
303
304 /* TODO: Interpolation modes */
305 assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);
306
307 nir_src *offset = nir_get_io_offset_src(instr);
308 assert(nir_src_is_const(*offset) && "no indirects");
309 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
310 imm_index += nir_src_as_uint(*offset) * 4;
311
312 agx_index vec = agx_vec_for_intr(b->shader, instr);
313 agx_ld_vary_to(b, vec, agx_immediate(imm_index), components, true);
314 agx_emit_split(b, dests, vec, components);
315 }
316
317 static agx_instr *
318 agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
319 {
320 nir_src *offset = nir_get_io_offset_src(instr);
321 assert(nir_src_is_const(*offset) && "todo: indirects");
322 unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
323 imm_index += nir_intrinsic_component(instr);
324 imm_index += nir_src_as_uint(*offset);
325
326 /* nir_lower_io_to_scalar */
327 assert(nir_intrinsic_write_mask(instr) == 0x1);
328
329 return agx_st_vary(b,
330 agx_immediate(imm_index),
331 agx_src_index(&instr->src[0]));
332 }
333
334 static agx_instr *
335 agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
336 {
337 const nir_variable *var =
338 nir_find_variable_with_driver_location(b->shader->nir,
339 nir_var_shader_out, nir_intrinsic_base(instr));
340 assert(var);
341
342 unsigned loc = var->data.location;
343 assert(var->data.index == 0 && "todo: dual-source blending");
344 assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
345 unsigned rt = (loc - FRAG_RESULT_DATA0);
346
347 /* TODO: Reverse-engineer interactions with MRT */
348 if (b->shader->nir->info.internal) {
349 /* clear */
350 } else if (b->shader->did_writeout) {
351 agx_writeout(b, 0x0004);
352 } else {
353 agx_writeout(b, 0xC200);
354 agx_writeout(b, 0x000C);
355 }
356
357 if (b->shader->nir->info.fs.uses_discard) {
358 /* If the shader uses discard, the sample mask must be written by the
359 * shader on all execution paths. If we've reached the end of the shader,
360 * we are therefore still active and need to write a full sample mask.
361 * TODO: interactions with MSAA and gl_SampleMask writes
362 */
363 agx_sample_mask(b, agx_immediate(1));
364 }
365
366 b->shader->did_writeout = true;
367 return agx_st_tile(b, agx_src_index(&instr->src[0]),
368 b->shader->key->fs.tib_formats[rt]);
369 }
370
371 static void
372 agx_emit_load_tile(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
373 {
374 const nir_variable *var =
375 nir_find_variable_with_driver_location(b->shader->nir,
376 nir_var_shader_out, nir_intrinsic_base(instr));
377 assert(var);
378
379 unsigned loc = var->data.location;
380 assert(var->data.index == 0 && "todo: dual-source blending");
381 assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
382 unsigned rt = (loc - FRAG_RESULT_DATA0);
383
384 /* TODO: Reverse-engineer interactions with MRT */
385 agx_writeout(b, 0xC200);
386 agx_writeout(b, 0x0008);
387 b->shader->did_writeout = true;
388 b->shader->out->reads_tib = true;
389
390 agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
391 agx_ld_tile_to(b, vec, b->shader->key->fs.tib_formats[rt]);
392 agx_emit_split(b, dests, vec, 4);
393 }
394
395 static enum agx_format
396 agx_format_for_bits(unsigned bits)
397 {
398 switch (bits) {
399 case 8: return AGX_FORMAT_I8;
400 case 16: return AGX_FORMAT_I16;
401 case 32: return AGX_FORMAT_I32;
402 default: unreachable("Invalid bit size for load/store");
403 }
404 }
405
406 static agx_instr *
407 agx_emit_load_ubo(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
408 {
409 bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
410 nir_src *offset = nir_get_io_offset_src(instr);
411
412 if (!kernel_input && !nir_src_is_const(instr->src[0]))
413 unreachable("todo: indirect UBO access");
414
415 /* The UBO block index comes from the first source (kernel inputs always use block 0) */
416 uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);
417
418 /* Each UBO has a 64-bit = 4 x 16-bit address */
419 unsigned num_ubos = b->shader->nir->info.num_ubos;
420 unsigned base_length = (num_ubos * 4);
421 unsigned index = block * 4; /* 16 bit units */
422
423 /* Lookup the base address (TODO: indirection) */
424 agx_index base = agx_indexed_sysval(b->shader,
425 AGX_PUSH_UBO_BASES, AGX_SIZE_64,
426 index, base_length);
427
428 /* Load the data */
429 assert(instr->num_components <= 4);
430
431 agx_device_load_to(b, dst, base, agx_src_index(offset),
432 agx_format_for_bits(nir_dest_bit_size(instr->dest)),
433 BITFIELD_MASK(instr->num_components), 0);
434 agx_wait(b, 0);
435 agx_emit_cached_split(b, dst, instr->num_components);
436
437 return NULL;
438 }
439
440 static void
441 agx_emit_load_frag_coord(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
442 {
443 /* xy */
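/* The thread position in the grid is the integer pixel coordinate; convert it
 * to float and add 0.5 to land on the pixel centre, per gl_FragCoord
 * semantics. z and w are loaded from fixed varying slots below.
 */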
444 for (unsigned i = 0; i < 2; ++i) {
445 dests[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
446 agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
447 AGX_ROUND_RTE), agx_immediate_f(0.5f));
448 }
449
450 dests[2] = agx_ld_vary(b, agx_immediate(1), 1, false); /* z */
451 dests[3] = agx_ld_vary(b, agx_immediate(0), 1, false); /* w */
452 }
453
454 static agx_instr *
455 agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
456 {
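/* Sysval offsets and lengths appear to be in 16-bit units (as with the UBO
 * and VBO bases above), so each 32-bit component is 2 units and the full RGBA
 * blend colour is 4 * 2 units.
 */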
457 agx_index val = agx_indexed_sysval(b->shader,
458 AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);
459
460 return agx_mov_to(b, dst, val);
461 }
462
463 /*
464 * Demoting a helper invocation is logically equivalent to zeroing the sample
465 * mask. Metal implements discard as such.
466 *
467 * XXX: Actually, Metal's "discard" is a demote, and what is implemented here
468 * is a demote. There might be a better way to implement this to get correct
469 * helper invocation semantics. For now, I'm kicking the can down the road.
470 */
471 static agx_instr *
472 agx_emit_discard(agx_builder *b, nir_intrinsic_instr *instr)
473 {
474 agx_writeout(b, 0xC200);
475 agx_writeout(b, 0x0001);
476 b->shader->did_writeout = true;
477
478 b->shader->out->writes_sample_mask = true;
479 return agx_sample_mask(b, agx_immediate(0));
480 }
481
482 static agx_instr *
483 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
484 {
485 agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
486 agx_dest_index(&instr->dest) : agx_null();
487 gl_shader_stage stage = b->shader->stage;
488 agx_index dests[4] = { agx_null() };
489
490 switch (instr->intrinsic) {
491 case nir_intrinsic_load_barycentric_pixel:
492 case nir_intrinsic_load_barycentric_centroid:
493 case nir_intrinsic_load_barycentric_sample:
494 case nir_intrinsic_load_barycentric_at_sample:
495 case nir_intrinsic_load_barycentric_at_offset:
496 /* handled later via load_vary */
497 return NULL;
498 case nir_intrinsic_load_interpolated_input:
499 assert(stage == MESA_SHADER_FRAGMENT);
500 agx_emit_load_vary(b, dests, instr);
501 break;
502
503 case nir_intrinsic_load_input:
504 if (stage == MESA_SHADER_FRAGMENT)
505 agx_emit_load_vary_flat(b, dests, instr);
506 else if (stage == MESA_SHADER_VERTEX)
507 agx_emit_load_attr(b, dests, instr);
508 else
509 unreachable("Unsupported shader stage");
510
511 break;
512
513 case nir_intrinsic_store_output:
514 if (stage == MESA_SHADER_FRAGMENT)
515 return agx_emit_fragment_out(b, instr);
516 else if (stage == MESA_SHADER_VERTEX)
517 return agx_emit_store_vary(b, instr);
518 else
519 unreachable("Unsupported shader stage");
520
521 case nir_intrinsic_load_output:
522 assert(stage == MESA_SHADER_FRAGMENT);
523 agx_emit_load_tile(b, dests, instr);
524 break;
525
526 case nir_intrinsic_load_ubo:
527 case nir_intrinsic_load_kernel_input:
528 return agx_emit_load_ubo(b, dst, instr);
529
530 case nir_intrinsic_load_frag_coord:
531 agx_emit_load_frag_coord(b, dests, instr);
532 break;
533
534 case nir_intrinsic_discard:
535 return agx_emit_discard(b, instr);
536
537 case nir_intrinsic_load_back_face_agx:
538 return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
539
540 case nir_intrinsic_load_vertex_id:
541 return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));
542
543 case nir_intrinsic_load_instance_id:
544 return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));
545
546 case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
547 case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
548 case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
549 case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);
550
551 default:
552 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
553 unreachable("Unhandled intrinsic");
554 }
555
556 /* If we got here, there is a vector destination for the intrinsic composed
557 * of separate scalars. Its components are specified separately in the dests
558 * array. We need to combine them so the vector destination itself is valid.
559 * If only individual components are accessed, this combine will be dead code
560 * eliminated.
561 */
562 return agx_emit_combine_to(b, dst, dests[0], dests[1], dests[2], dests[3]);
563 }
564
565 static agx_index
566 agx_alu_src_index(agx_builder *b, nir_alu_src src)
567 {
568 /* Check well-formedness of the input NIR */
569 ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
570 unsigned comps = nir_src_num_components(src.src);
571 unsigned channel = src.swizzle[0];
572
573 assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
574 assert(!(src.negate || src.abs));
575 assert(channel < comps);
576
577 agx_index idx = agx_src_index(&src.src);
578
579 /* We only deal with scalars, extract a single scalar if needed */
580 if (comps > 1)
581 return agx_emit_extract(b, idx, channel);
582 else
583 return idx;
584 }
585
586 static agx_instr *
587 agx_emit_alu_bool(agx_builder *b, nir_op op,
588 agx_index dst, agx_index s0, agx_index s1, agx_index s2)
589 {
590 /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
591 * This will give the optimizer flexibility. */
592 agx_index f = agx_immediate(0);
593 agx_index t = agx_immediate(0x1);
594
595 switch (op) {
596 case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
597 case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
598 case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
599 case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
600
601 case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
602 case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
603 case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
604 case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
605 case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
606 case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
607
608 case nir_op_mov: return agx_mov_to(b, dst, s0);
609 case nir_op_iand: return agx_and_to(b, dst, s0, s1);
610 case nir_op_ior: return agx_or_to(b, dst, s0, s1);
611 case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
612 case nir_op_inot: return agx_xor_to(b, dst, s0, t);
613
614 case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
615 case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
616 case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
617
618 case nir_op_bcsel:
619 return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
620
621 default:
622 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
623 unreachable("Unhandled boolean ALU instruction");
624 }
625 }
626
627 static agx_instr *
628 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
629 {
630 unsigned srcs = nir_op_infos[instr->op].num_inputs;
631 unsigned sz = nir_dest_bit_size(instr->dest.dest);
632 unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
633 ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
634
635 assert(comps == 1 || nir_op_is_vec(instr->op));
636 assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
637
638 agx_index dst = agx_dest_index(&instr->dest.dest);
639 agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
640 agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
641 agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
642 agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
643
644 /* 1-bit bools are a bit special, only handle with select ops */
645 if (sz == 1)
646 return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
647
648 #define UNOP(nop, aop) \
649 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
650 #define BINOP(nop, aop) \
651 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
652 #define TRIOP(nop, aop) \
653 case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
654
655 switch (instr->op) {
656 BINOP(fadd, fadd);
657 BINOP(fmul, fmul);
658 TRIOP(ffma, fma);
659
660 UNOP(f2f16, fmov);
661 UNOP(f2f32, fmov);
662 UNOP(fround_even, roundeven);
663 UNOP(ftrunc, trunc);
664 UNOP(ffloor, floor);
665 UNOP(fceil, ceil);
666 UNOP(frcp, rcp);
667 UNOP(frsq, rsqrt);
668 UNOP(flog2, log2);
669 UNOP(fexp2, exp2);
670
671 UNOP(fddx, dfdx);
672 UNOP(fddx_coarse, dfdx);
673 UNOP(fddx_fine, dfdx);
674
675 UNOP(fddy, dfdy);
676 UNOP(fddy_coarse, dfdy);
677 UNOP(fddy_fine, dfdy);
678
679 UNOP(mov, mov);
680 UNOP(u2u16, mov);
681 UNOP(u2u32, mov);
682 UNOP(inot, not);
683 BINOP(iand, and);
684 BINOP(ior, or);
685 BINOP(ixor, xor);
686
687 case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
688 case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
689 case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
690 case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
691
692 case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
693 case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
694 case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
695 case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
696 case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
697 case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
698
699 case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
700 case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
701 case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
702 case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
703
704 case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
705 case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
706 case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
707
708 case nir_op_bcsel:
709 return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
710
711 case nir_op_b2i32:
712 case nir_op_b2i16:
713 return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
714
715 case nir_op_b2f16:
716 case nir_op_b2f32:
717 {
718 /* At this point, boolean is just zero/nonzero, so compare with zero */
719 agx_index one = (sz == 16) ?
720 agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
721 agx_mov_imm(b, 32, fui(1.0));
722
723 agx_index zero = agx_zero();
724
725 return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
726 }
727
728 case nir_op_i2i32:
729 {
730 if (s0.size != AGX_SIZE_16)
731 unreachable("todo: more conversions");
732
733 return agx_iadd_to(b, dst, s0, agx_zero(), 0);
734 }
735
736 case nir_op_i2i16:
737 {
738 if (s0.size != AGX_SIZE_32)
739 unreachable("todo: more conversions");
740
741 return agx_iadd_to(b, dst, s0, agx_zero(), 0);
742 }
743
744 case nir_op_iadd_sat:
745 {
746 agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
747 I->saturate = true;
748 return I;
749 }
750
751 case nir_op_isub_sat:
752 {
753 agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
754 I->saturate = true;
755 return I;
756 }
757
758 case nir_op_uadd_sat:
759 {
760 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
761 I->saturate = true;
762 return I;
763 }
764
765 case nir_op_usub_sat:
766 {
767 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
768 I->saturate = true;
769 return I;
770 }
771
772 case nir_op_fsat:
773 {
774 agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
775 I->saturate = true;
776 return I;
777 }
778
779 case nir_op_fsin_agx:
780 {
781 agx_index fixup = agx_sin_pt_1(b, s0);
782 agx_index sinc = agx_sin_pt_2(b, fixup);
783 return agx_fmul_to(b, dst, sinc, fixup);
784 }
785
786 case nir_op_f2i16:
787 return agx_convert_to(b, dst,
788 agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
789
790 case nir_op_f2i32:
791 return agx_convert_to(b, dst,
792 agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
793
794 case nir_op_f2u16:
795 return agx_convert_to(b, dst,
796 agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
797
798 case nir_op_f2u32:
799 return agx_convert_to(b, dst,
800 agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
801
802 case nir_op_u2f16:
803 case nir_op_u2f32:
804 {
805 if (src_sz == 64)
806 unreachable("64-bit conversions unimplemented");
807
808 enum agx_convert mode =
809 (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
810 (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
811 AGX_CONVERT_U8_TO_F;
812
813 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
814 }
815
816 case nir_op_i2f16:
817 case nir_op_i2f32:
818 {
819 if (src_sz == 64)
820 unreachable("64-bit conversions unimplemented");
821
822 enum agx_convert mode =
823 (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
824 (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
825 AGX_CONVERT_S8_TO_F;
826
827 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
828 }
829
830 case nir_op_vec2:
831 case nir_op_vec3:
832 case nir_op_vec4:
833 return agx_emit_combine_to(b, dst, s0, s1, s2, s3);
834
835 case nir_op_vec8:
836 case nir_op_vec16:
837 unreachable("should've been lowered");
838
839 default:
840 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
841 unreachable("Unhandled ALU instruction");
842 }
843 }
844
845 static enum agx_dim
846 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
847 {
848 switch (dim) {
849 case GLSL_SAMPLER_DIM_1D:
850 case GLSL_SAMPLER_DIM_BUF:
851 return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;
852
853 case GLSL_SAMPLER_DIM_2D:
854 case GLSL_SAMPLER_DIM_RECT:
855 case GLSL_SAMPLER_DIM_EXTERNAL:
856 return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;
857
858 case GLSL_SAMPLER_DIM_MS:
859 assert(!array && "multisampled arrays unsupported");
860 return AGX_DIM_TEX_2D_MS;
861
862 case GLSL_SAMPLER_DIM_3D:
863 assert(!array && "3D arrays unsupported");
864 return AGX_DIM_TEX_3D;
865
866 case GLSL_SAMPLER_DIM_CUBE:
867 return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;
868
869 default:
870 unreachable("Invalid sampler dim\n");
871 }
872 }
873
874 static enum agx_lod_mode
875 agx_lod_mode_for_nir(nir_texop op)
876 {
877 switch (op) {
878 case nir_texop_tex: return AGX_LOD_MODE_AUTO_LOD;
879 case nir_texop_txb: return AGX_LOD_MODE_AUTO_LOD_BIAS;
880 case nir_texop_txl: return AGX_LOD_MODE_LOD_MIN;
881 default: unreachable("Unhandled texture op");
882 }
883 }
884
885 static void
886 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
887 {
888 switch (instr->op) {
889 case nir_texop_tex:
890 case nir_texop_txl:
891 case nir_texop_txb:
892 break;
893 default:
894 unreachable("Unhandled texture op");
895 }
896
897 agx_index coords = agx_null(),
898 texture = agx_immediate(instr->texture_index),
899 sampler = agx_immediate(instr->sampler_index),
900 lod = agx_immediate(0),
901 offset = agx_null();
902
903 for (unsigned i = 0; i < instr->num_srcs; ++i) {
904 agx_index index = agx_src_index(&instr->src[i].src);
905
906 switch (instr->src[i].src_type) {
907 case nir_tex_src_coord:
908 coords = index;
909
910 /* Array textures are indexed by a floating-point in NIR, but by an
911 * integer in AGX. Convert the array index from float-to-int for array
912 * textures. The array index is the last source in NIR. The conversion
913 * is according to the rule from 8.9 ("Texture Functions") of the GLSL
914 * ES 3.20 specification:
915 *
916 * max(0, min(d - 1, floor(layer + 0.5))) =
917 * max(0, min(d - 1, f32_to_u32(layer + 0.5))) =
918 * min(d - 1, f32_to_u32(layer + 0.5))
919 */
920 if (instr->is_array) {
921 unsigned nr = nir_src_num_components(instr->src[i].src);
922 agx_index channels[4] = {};
923
924 for (unsigned i = 0; i < nr; ++i)
925 channels[i] = agx_emit_extract(b, index, i);
926
927 agx_index layer = agx_fadd(b, channels[nr - 1],
928 agx_immediate_f(0.5f));
929
930 agx_index d1 = agx_indexed_sysval(b->shader,
931 AGX_PUSH_ARRAY_SIZE_MINUS_1, AGX_SIZE_16,
932 instr->texture_index, 1);
933
934 layer = agx_convert(b, agx_immediate(AGX_CONVERT_F_TO_U32), layer,
935 AGX_ROUND_RTZ);
936
937 agx_index layer16 = agx_temp(b->shader, AGX_SIZE_16);
938 agx_mov_to(b, layer16, layer);
939
940 layer = agx_icmpsel(b, layer16, d1, layer16, d1, AGX_ICOND_ULT);
941
942 agx_index layer32 = agx_temp(b->shader, AGX_SIZE_32);
943 agx_mov_to(b, layer32, layer);
944
945 channels[nr - 1] = layer32;
946 coords = agx_p_combine(b, channels[0], channels[1], channels[2], channels[3]);
947 } else {
948 coords = index;
949 }
950
951 break;
952
953 case nir_tex_src_lod:
954 case nir_tex_src_bias:
955 lod = index;
956 break;
957
958 case nir_tex_src_ms_index:
959 case nir_tex_src_offset:
960 case nir_tex_src_comparator:
961 case nir_tex_src_texture_offset:
962 case nir_tex_src_sampler_offset:
963 default:
964 unreachable("todo");
965 }
966 }
967
968 agx_index dst = agx_dest_index(&instr->dest);
969 agx_texture_sample_to(b, dst, coords, lod, texture, sampler, offset,
970 agx_tex_dim(instr->sampler_dim, instr->is_array),
971 agx_lod_mode_for_nir(instr->op),
972 0xF, /* TODO: wrmask */
973 0);
974
975 agx_wait(b, 0);
976 agx_emit_cached_split(b, dst, 4);
977 }
978
979 /*
980 * Mark the logical end of the current block by emitting a p_logical_end marker.
981 * Note if an unconditional jump is emitted (for instance, to break out of a
982 * loop from inside an if), the block has already reached its logical end so we
983 * don't re-emit p_logical_end. The validator checks this, and correct register
984 * allocation depends on it.
985 */
986 static void
987 agx_emit_logical_end(agx_builder *b)
988 {
989 if (!b->shader->current_block->unconditional_jumps)
990 agx_p_logical_end(b);
991 }
992
993 /* NIR loops are treated as a pair of AGX loops:
994 *
995 * do {
996 * do {
997 * ...
998 * } while (0);
999 * } while (cond);
1000 *
1001 * By manipulating the nesting counter (r0l), we may break out of nested loops,
1002 * so under the model, both break and continue may be implemented as breaks,
1003 * where break breaks out of the outer loop (2 layers) and continue breaks out
1004 * of the inner loop (1 layer).
1005 *
1006 * After manipulating the nesting counter directly, pop_exec #0 must be used to
1007 * flush the update to the execution mask.
1008 */
1009
1010 static void
1011 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
1012 {
1013 agx_context *ctx = b->shader;
1014 assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);
1015
1016 /* Break out of either one or two loops */
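/* loop_nesting counts the if levels currently open inside the innermost loop
 * (incremented in emit_if), so the nestings value written below is the total
 * number of exec levels this jump must escape.
 */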
1017 unsigned nestings = b->shader->loop_nesting;
1018
1019 if (instr->type == nir_jump_continue) {
1020 nestings += 1;
1021 agx_block_add_successor(ctx->current_block, ctx->continue_block);
1022 } else if (instr->type == nir_jump_break) {
1023 nestings += 2;
1024 agx_block_add_successor(ctx->current_block, ctx->break_block);
1025 }
1026
1027 /* Update the counter and flush */
1028 agx_index r0l = agx_register(0, false);
1029 agx_mov_to(b, r0l, agx_immediate(nestings));
1030
1031 /* Jumps must come at the end of a block */
1032 agx_emit_logical_end(b);
1033 agx_pop_exec(b, 0);
1034
1035 ctx->current_block->unconditional_jumps = true;
1036 }
1037
1038 static void
1039 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
1040 {
1041 agx_instr *I = agx_phi_to(b, agx_dest_index(&instr->dest));
1042
1043 /* Deferred */
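/* Sources are filled in by agx_emit_phi_deferred once all blocks have been
 * emitted and predecessor indices are known.
 */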
1044 I->phi = instr;
1045 }
1046
1047 /* Look up the AGX block corresponding to a given NIR block. Used when
1048 * translating phi nodes after emitting all blocks.
1049 */
1050 static agx_block *
1051 agx_from_nir_block(agx_context *ctx, nir_block *block)
1052 {
1053 return ctx->indexed_nir_blocks[block->index];
1054 }
1055
1056 static void
1057 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
1058 {
1059 nir_phi_instr *phi = I->phi;
1060
1061 /* Guaranteed by lower_phis_to_scalar */
1062 assert(phi->dest.ssa.num_components == 1);
1063
1064 I->nr_srcs = exec_list_length(&phi->srcs);
1065 I->src = rzalloc_array(I, agx_index, I->nr_srcs);
1066
1067 nir_foreach_phi_src(src, phi) {
1068 agx_block *pred = agx_from_nir_block(ctx, src->pred);
1069 unsigned i = agx_predecessor_index(block, pred);
1070 assert(i < I->nr_srcs);
1071
1072 I->src[i] = agx_src_index(&src->src);
1073 }
1074 }
1075
1076 static void
1077 agx_emit_phis_deferred(agx_context *ctx)
1078 {
1079 agx_foreach_block(ctx, block) {
1080 agx_foreach_instr_in_block(block, I) {
1081 if (I->op == AGX_OPCODE_PHI)
1082 agx_emit_phi_deferred(ctx, block, I);
1083 }
1084 }
1085 }
1086
1087 static void
1088 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
1089 {
1090 switch (instr->type) {
1091 case nir_instr_type_load_const:
1092 agx_emit_load_const(b, nir_instr_as_load_const(instr));
1093 break;
1094
1095 case nir_instr_type_intrinsic:
1096 agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
1097 break;
1098
1099 case nir_instr_type_alu:
1100 agx_emit_alu(b, nir_instr_as_alu(instr));
1101 break;
1102
1103 case nir_instr_type_tex:
1104 agx_emit_tex(b, nir_instr_as_tex(instr));
1105 break;
1106
1107 case nir_instr_type_jump:
1108 agx_emit_jump(b, nir_instr_as_jump(instr));
1109 break;
1110
1111 case nir_instr_type_phi:
1112 agx_emit_phi(b, nir_instr_as_phi(instr));
1113 break;
1114
1115 default:
1116 unreachable("should've been lowered");
1117 }
1118 }
1119
1120 static agx_block *
1121 agx_create_block(agx_context *ctx)
1122 {
1123 agx_block *blk = rzalloc(ctx, agx_block);
1124
1125 util_dynarray_init(&blk->predecessors, blk);
1126
1127 return blk;
1128 }
1129
1130 static agx_block *
1131 emit_block(agx_context *ctx, nir_block *block)
1132 {
1133 if (ctx->after_block) {
1134 ctx->current_block = ctx->after_block;
1135 ctx->after_block = NULL;
1136 } else {
1137 ctx->current_block = agx_create_block(ctx);
1138 }
1139
1140 agx_block *blk = ctx->current_block;
1141 list_addtail(&blk->link, &ctx->blocks);
1142 list_inithead(&blk->instructions);
1143
1144 ctx->indexed_nir_blocks[block->index] = blk;
1145
1146 agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
1147
1148 nir_foreach_instr(instr, block) {
1149 agx_emit_instr(&_b, instr);
1150 }
1151
1152 return blk;
1153 }
1154
1155 static agx_block *
1156 emit_cf_list(agx_context *ctx, struct exec_list *list);
1157
1158 /* Emit if-else as
1159 *
1160 * if_icmp cond != 0
1161 * ...
1162 * else_icmp cond == 0
1163 * ...
1164 * pop_exec
1165 *
1166 * If the else is empty, we can omit the else_icmp. This happens elsewhere, as
1167 * an empty else block can become nonempty after RA due to phi lowering. This is
1168 * not usually optimal, but it's a start.
1169 */
1170
1171 static void
1172 emit_if(agx_context *ctx, nir_if *nif)
1173 {
1174 agx_block *first_block = ctx->current_block;
1175 agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
1176 agx_index cond = agx_src_index(&nif->condition);
1177
1178 agx_emit_logical_end(&_b);
1179 agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
1180 ctx->loop_nesting++;
1181
1182 /* Emit the two subblocks. */
1183 agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
1184 agx_block *end_then = ctx->current_block;
1185
1186 _b.cursor = agx_after_block(ctx->current_block);
1187 agx_emit_logical_end(&_b);
1188 agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
1189
1190 agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
1191 agx_block *end_else = ctx->current_block;
1192
1193 ctx->after_block = agx_create_block(ctx);
1194
1195 agx_block_add_successor(first_block, if_block);
1196 agx_block_add_successor(first_block, else_block);
1197 agx_block_add_successor(end_then, ctx->after_block);
1198 agx_block_add_successor(end_else, ctx->after_block);
1199
1200 _b.cursor = agx_after_block(ctx->current_block);
1201 agx_emit_logical_end(&_b);
1202 agx_pop_exec(&_b, 1);
1203 ctx->loop_nesting--;
1204 }
1205
1206 static void
1207 emit_loop(agx_context *ctx, nir_loop *nloop)
1208 {
1209 /* We only track nesting within the innermost loop, so push and reset */
1210 unsigned pushed_nesting = ctx->loop_nesting;
1211 ctx->loop_nesting = 0;
1212
1213 agx_block *popped_break = ctx->break_block;
1214 agx_block *popped_continue = ctx->continue_block;
1215
1216 ctx->break_block = agx_create_block(ctx);
1217 ctx->continue_block = agx_create_block(ctx);
1218
1219 /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
1220 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
1221 agx_emit_logical_end(&_b);
1222 agx_push_exec(&_b, 2);
1223
1224 /* Fallthrough to body */
1225 agx_block_add_successor(ctx->current_block, ctx->continue_block);
1226
1227 /* Emit the body */
1228 ctx->after_block = ctx->continue_block;
1229 agx_block *start_block = emit_cf_list(ctx, &nloop->body);
1230
1231 /* Fix up the nesting counter via an always true while_icmp, and branch back
1232 * to start of loop if any lanes are active */
1233 _b.cursor = agx_after_block(ctx->current_block);
1234 agx_emit_logical_end(&_b);
1235 agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
1236 agx_jmp_exec_any(&_b, start_block);
1237 agx_pop_exec(&_b, 2);
1238 agx_block_add_successor(ctx->current_block, ctx->continue_block);
1239
1240 /* Pop off */
1241 ctx->after_block = ctx->break_block;
1242 ctx->break_block = popped_break;
1243 ctx->continue_block = popped_continue;
1244
1245 /* Update shader-db stats */
1246 ++ctx->loop_count;
1247
1248 /* All nested control flow must have finished */
1249 assert(ctx->loop_nesting == 0);
1250
1251 /* Restore loop nesting (we might be inside an if inside an outer loop) */
1252 ctx->loop_nesting = pushed_nesting;
1253 }
1254
1255 /* Before the first control flow structure, the nesting counter (r0l) needs to
1256 * be zeroed for correct operation. This only happens at most once, since by
1257 * definition this occurs at the end of the first block, which dominates the
1258 * rest of the program. */
1259
1260 static void
1261 emit_first_cf(agx_context *ctx)
1262 {
1263 if (ctx->any_cf)
1264 return;
1265
1266 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
1267 agx_index r0l = agx_register(0, false);
1268
1269 agx_mov_to(&_b, r0l, agx_immediate(0));
1270 ctx->any_cf = true;
1271 }
1272
1273 static agx_block *
1274 emit_cf_list(agx_context *ctx, struct exec_list *list)
1275 {
1276 agx_block *start_block = NULL;
1277
1278 foreach_list_typed(nir_cf_node, node, node, list) {
1279 switch (node->type) {
1280 case nir_cf_node_block: {
1281 agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
1282
1283 if (!start_block)
1284 start_block = block;
1285
1286 break;
1287 }
1288
1289 case nir_cf_node_if:
1290 emit_first_cf(ctx);
1291 emit_if(ctx, nir_cf_node_as_if(node));
1292 break;
1293
1294 case nir_cf_node_loop:
1295 emit_first_cf(ctx);
1296 emit_loop(ctx, nir_cf_node_as_loop(node));
1297 break;
1298
1299 default:
1300 unreachable("Unknown control flow");
1301 }
1302 }
1303
1304 return start_block;
1305 }
1306
1307 static void
1308 agx_set_st_vary_final(agx_context *ctx)
1309 {
1310 agx_foreach_instr_global_rev(ctx, I) {
1311 if (I->op == AGX_OPCODE_ST_VARY) {
1312 I->last = true;
1313 return;
1314 }
1315 }
1316 }
1317
1318 static void
1319 agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
1320 {
1321 unsigned nr_ins = 0, max_reg = 0;
1322
1323 agx_foreach_instr_global(ctx, I) {
1324 /* Count instructions */
1325 nr_ins++;
1326
1327 /* Count registers */
1328 agx_foreach_dest(I, d) {
1329 if (I->dest[d].type == AGX_INDEX_REGISTER) {
1330 max_reg = MAX2(max_reg,
1331 I->dest[d].value + agx_write_registers(I, d) - 1);
1332 }
1333 }
1334 }
1335
1336 /* TODO: Pipe through occupancy */
1337 unsigned nr_threads = 1;
1338
1339 fprintf(stderr, "%s - %s shader: %u inst, %u bytes, %u halfregs, %u threads, "
1340 "%u loops, %u:%u spills:fills\n",
1341 ctx->nir->info.label ?: "",
1342 gl_shader_stage_name(ctx->stage),
1343 nr_ins, size, max_reg, nr_threads, ctx->loop_count,
1344 ctx->spills, ctx->fills);
1345 }
1346
1347 static int
1348 glsl_type_size(const struct glsl_type *type, bool bindless)
1349 {
1350 return glsl_count_attribute_slots(type, false);
1351 }
1352
1353 static bool
1354 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
1355 {
1356 if (instr->type != nir_instr_type_alu)
1357 return false;
1358
1359 nir_alu_instr *alu = nir_instr_as_alu(instr);
1360 return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
1361 }
1362
1363 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
1364 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
1365 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
1366 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
1367 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
1368 * need to change units from radians to quadrants modulo turns. Cosine is
1369 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
1370 */
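/* Worked check (illustrative): for x = pi/2, turns = x / tau = 0.25,
 * quadrants = fract(0.25) * 4 = 1.0, and fsin_agx(1.0) = sin(tau / 4) = 1,
 * as expected.
 */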
1371
1372 static nir_ssa_def *
1373 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
1374 {
1375 nir_alu_instr *alu = nir_instr_as_alu(instr);
1376 nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
1377 nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
1378
1379 if (alu->op == nir_op_fcos)
1380 turns = nir_fadd_imm(b, turns, 0.25f);
1381
1382 nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
1383 return nir_fsin_agx(b, quadrants);
1384 }
1385
1386 static bool
1387 agx_lower_sincos(nir_shader *shader)
1388 {
1389 return nir_shader_lower_instructions(shader,
1390 agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
1391 }
1392
1393 static bool
1394 agx_lower_front_face(struct nir_builder *b,
1395 nir_instr *instr, UNUSED void *data)
1396 {
1397 if (instr->type != nir_instr_type_intrinsic)
1398 return false;
1399
1400 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1401 if (intr->intrinsic != nir_intrinsic_load_front_face)
1402 return false;
1403
1404 assert(intr->dest.is_ssa);
1405 nir_ssa_def *def = &intr->dest.ssa;
1406 assert(def->bit_size == 1);
1407
1408 b->cursor = nir_before_instr(&intr->instr);
1409 nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
1410 return true;
1411 }
1412
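/* Device loads take their offset in units of the loaded element size (see
 * agx_emit_load_ubo, which passes the offset through with an element format),
 * so convert NIR's byte offsets to element offsets. Accesses are assumed to
 * be naturally aligned, hence the pass name.
 */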
1413 static bool
1414 agx_lower_aligned_offsets(struct nir_builder *b,
1415 nir_instr *instr, UNUSED void *data)
1416 {
1417 if (instr->type != nir_instr_type_intrinsic)
1418 return false;
1419
1420 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1421 if (intr->intrinsic != nir_intrinsic_load_ubo)
1422 return false;
1423
1424 b->cursor = nir_before_instr(&intr->instr);
1425
1426 unsigned bytes = nir_dest_bit_size(intr->dest) / 8;
1427 assert(util_is_power_of_two_or_zero(bytes) && bytes != 0);
1428
1429 nir_src *offset = &intr->src[1];
1430
1431 unsigned shift = util_logbase2(bytes);
1432
1433 nir_ssa_def *old = nir_ssa_for_src(b, *offset, 1);
1434 nir_ssa_def *new = nir_ishr_imm(b, old, shift);
1435
1436 nir_instr_rewrite_src_ssa(instr, offset, new);
1437 return true;
1438 }
1439
1440 static void
1441 agx_optimize_nir(nir_shader *nir)
1442 {
1443 bool progress;
1444
1445 nir_lower_idiv_options idiv_options = {
1446 .imprecise_32bit_lowering = true,
1447 .allow_fp16 = true,
1448 };
1449
1450 NIR_PASS_V(nir, nir_lower_regs_to_ssa);
1451 NIR_PASS_V(nir, nir_lower_int64);
1452 NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
1453 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1454 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1455 NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
1456 NIR_PASS_V(nir, agx_lower_sincos);
1457 NIR_PASS_V(nir, nir_shader_instructions_pass,
1458 agx_lower_front_face,
1459 nir_metadata_block_index | nir_metadata_dominance, NULL);
1460
1461 do {
1462 progress = false;
1463
1464 NIR_PASS(progress, nir, nir_lower_var_copies);
1465 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
1466
1467 NIR_PASS(progress, nir, nir_copy_prop);
1468 NIR_PASS(progress, nir, nir_opt_remove_phis);
1469 NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
1470 NIR_PASS(progress, nir, nir_opt_dce);
1471 NIR_PASS(progress, nir, nir_opt_dead_cf);
1472 NIR_PASS(progress, nir, nir_opt_cse);
1473 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
1474 NIR_PASS(progress, nir, nir_opt_algebraic);
1475 NIR_PASS(progress, nir, nir_opt_constant_folding);
1476
1477 NIR_PASS(progress, nir, nir_opt_undef);
1478 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
1479
1480 NIR_PASS(progress, nir, nir_opt_loop_unroll);
1481 } while (progress);
1482
1483 NIR_PASS_V(nir, nir_opt_algebraic_late);
1484 NIR_PASS_V(nir, nir_opt_constant_folding);
1485 NIR_PASS_V(nir, nir_copy_prop);
1486 NIR_PASS_V(nir, nir_opt_dce);
1487 NIR_PASS_V(nir, nir_opt_cse);
1488 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1489 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1490
1491 /* Cleanup optimizations */
1492 nir_move_options move_all =
1493 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
1494 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;
1495
1496 NIR_PASS_V(nir, nir_opt_sink, move_all);
1497 NIR_PASS_V(nir, nir_opt_move, move_all);
1498 NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
1499 }
1500
1501 /* ABI: position first, then user, then psiz */
1502 static void
1503 agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
1504 unsigned *remap)
1505 {
1506 unsigned base = 0;
1507
1508 nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
1509 if (pos) {
1510 assert(pos->data.driver_location < AGX_MAX_VARYINGS);
1511 remap[pos->data.driver_location] = base;
1512 base += 4;
1513 }
1514
1515 nir_foreach_shader_out_variable(var, nir) {
1516 unsigned loc = var->data.location;
1517
1518 if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
1519 continue;
1520 }
1521
1522 assert(var->data.driver_location < AGX_MAX_VARYINGS);
1523 remap[var->data.driver_location] = base;
1524 base += 4;
1525 }
1526
1527 nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
1528 if (psiz) {
1529 assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
1530 remap[psiz->data.driver_location] = base;
1531 base += 1;
1532 }
1533
1534 varyings->nr_slots = base;
1535 }
1536
1537 static void
1538 agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
1539 unsigned *remap)
1540 {
1541 struct agx_varying_packed *packed = varyings->packed;
1542 unsigned base = 0;
1543
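/* Slots 0 and 1 are reserved for gl_FragCoord.w and .z respectively;
 * agx_emit_load_frag_coord reads them back at those fixed ld_vary indices.
 */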
1544 agx_pack(packed, VARYING, cfg) {
1545 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
1546 cfg.components = 1;
1547 cfg.triangle_slot = cfg.point_slot = base;
1548 }
1549
1550 base++;
1551 packed++;
1552
1553 agx_pack(packed, VARYING, cfg) {
1554 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
1555 cfg.components = 1;
1556 cfg.triangle_slot = cfg.point_slot = base;
1557 }
1558
1559 base++;
1560 packed++;
1561
1562 unsigned comps[MAX_VARYING] = { 0 };
1563
1564 nir_foreach_shader_in_variable(var, nir) {
1565 unsigned loc = var->data.driver_location;
1566 const struct glsl_type *column =
1567 glsl_without_array_or_matrix(var->type);
1568 unsigned chan = glsl_get_components(column);
1569
1570 /* If we have a fractional location added, we need to increase the size
1571 * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
1572 * We could do better but this is an edge case as it is, normally
1573 * packed varyings will be aligned.
1574 */
1575 chan += var->data.location_frac;
1576 comps[loc] = MAX2(comps[loc], chan);
1577 }
1578
1579 nir_foreach_shader_in_variable(var, nir) {
1580 unsigned loc = var->data.driver_location;
1581 unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
1582 unsigned channels = comps[loc];
1583
1584 assert(var->data.driver_location <= AGX_MAX_VARYINGS);
1585 remap[var->data.driver_location] = base;
1586
1587 for (int c = 0; c < sz; ++c) {
1588 agx_pack(packed, VARYING, cfg) {
1589 cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
1590 AGX_VARYING_TYPE_POINT_COORDINATES :
1591 (var->data.interpolation == INTERP_MODE_FLAT) ?
1592 AGX_VARYING_TYPE_FLAT_LAST :
1593 AGX_VARYING_TYPE_SMOOTH;
1594
1595 cfg.components = channels;
1596 cfg.triangle_slot = cfg.point_slot = base;
1597 }
1598
1599 base += channels;
1600 packed++;
1601 }
1602 }
1603
1604 varyings->nr_descs = (packed - varyings->packed);
1605 varyings->nr_slots = base;
1606 }
1607
1608 /*
1609 * Build a bit mask of varyings (by location) that are flatshaded. This
1610 * information is needed by lower_mediump_io.
1611 */
1612 static uint64_t
1613 agx_flat_varying_mask(nir_shader *nir)
1614 {
1615 uint64_t mask = 0;
1616
1617 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
1618
1619 nir_foreach_shader_in_variable(var, nir) {
1620 if (var->data.interpolation == INTERP_MODE_FLAT)
1621 mask |= BITFIELD64_BIT(var->data.location);
1622 }
1623
1624 return mask;
1625 }
1626
1627 void
1628 agx_compile_shader_nir(nir_shader *nir,
1629 struct agx_shader_key *key,
1630 struct util_dynarray *binary,
1631 struct agx_shader_info *out)
1632 {
1633 agx_debug = debug_get_option_agx_debug();
1634
1635 agx_context *ctx = rzalloc(NULL, agx_context);
1636 ctx->nir = nir;
1637 ctx->out = out;
1638 ctx->key = key;
1639 ctx->stage = nir->info.stage;
1640 list_inithead(&ctx->blocks);
1641
1642 if (ctx->stage == MESA_SHADER_VERTEX) {
1643 out->writes_psiz = nir->info.outputs_written &
1644 BITFIELD_BIT(VARYING_SLOT_PSIZ);
1645 }
1646
1647 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1648
1649 /* Lower large arrays to scratch and small arrays to csel */
1650 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
1651 glsl_get_natural_size_align_bytes);
1652 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
1653
1654 if (ctx->stage == MESA_SHADER_VERTEX) {
1655 /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
1656 if (!key->vs.clip_halfz)
1657 NIR_PASS_V(nir, nir_lower_clip_halfz);
1658 }
1659
1660 NIR_PASS_V(nir, nir_split_var_copies);
1661 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
1662 NIR_PASS_V(nir, nir_lower_var_copies);
1663 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
1664 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1665 glsl_type_size, 0);
1666 if (ctx->stage == MESA_SHADER_FRAGMENT) {
1667 /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
1668 * exception, interpolate flat shaded at fp32. This works around a
1669 * hardware limitation. The resulting code (with an extra f2f16 at the end
1670 * if needed) matches what Metal produces.
1671 */
1672 NIR_PASS_V(nir, nir_lower_mediump_io,
1673 nir_var_shader_in | nir_var_shader_out,
1674 ~agx_flat_varying_mask(nir), false);
1675 }
1676 NIR_PASS_V(nir, nir_shader_instructions_pass,
1677 agx_lower_aligned_offsets,
1678 nir_metadata_block_index | nir_metadata_dominance, NULL);
1679
1680 NIR_PASS_V(nir, nir_lower_ssbo);
1681
1682 /* Varying output is scalar, other I/O is vector */
1683 if (ctx->stage == MESA_SHADER_VERTEX) {
1684 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
1685 }
1686
1687 nir_lower_tex_options lower_tex_options = {
1688 .lower_txs_lod = true,
1689 .lower_txp = ~0,
1690 .lower_invalid_implicit_lod = true,
1691 };
1692
1693 nir_tex_src_type_constraints tex_constraints = {
1694 [nir_tex_src_lod] = { true, 16 },
1695 [nir_tex_src_bias] = { true, 16 },
1696 };
1697
1698 NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
1699 NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
1700
1701 agx_optimize_nir(nir);
1702
1703 /* Implement conditional discard with real control flow like Metal */
1704 NIR_PASS_V(nir, nir_lower_discard_if);
1705
1706 /* Must be last since NIR passes can remap driver_location freely */
1707 if (ctx->stage == MESA_SHADER_VERTEX) {
1708 agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
1709 } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
1710 agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
1711 }
1712
1713 bool skip_internal = nir->info.internal;
1714 skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);
1715
1716 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
1717 nir_print_shader(nir, stdout);
1718 }
1719
1720 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
1721
1722 nir_foreach_function(func, nir) {
1723 if (!func->impl)
1724 continue;
1725
1726 nir_index_blocks(func->impl);
1727
1728 ctx->indexed_nir_blocks =
1729 rzalloc_array(ctx, agx_block *, func->impl->num_blocks);
1730
1731 ctx->alloc += func->impl->ssa_alloc;
1732 emit_cf_list(ctx, &func->impl->body);
1733 agx_emit_phis_deferred(ctx);
1734 break; /* TODO: Multi-function shaders */
1735 }
1736
1737 /* Terminate the shader after the exit block */
1738 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
1739 agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
1740 agx_stop(&_b);
1741
1742 /* Also add traps to match the blob; unsure what purpose they serve */
1743 for (unsigned i = 0; i < 8; ++i)
1744 agx_trap(&_b);
1745
1746 /* Index blocks now that we're done emitting so the order is consistent */
1747 agx_foreach_block(ctx, block)
1748 block->index = ctx->num_blocks++;
1749
1750 agx_validate(ctx, "IR translation");
1751
1752 if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1753 agx_print_shader(ctx, stdout);
1754
1755 agx_optimizer(ctx);
1756 agx_dce(ctx);
1757 agx_validate(ctx, "Optimization");
1758
1759 if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1760 agx_print_shader(ctx, stdout);
1761
1762 agx_ra(ctx);
1763
1764 if (ctx->stage == MESA_SHADER_VERTEX)
1765 agx_set_st_vary_final(ctx);
1766
1767 if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
1768 agx_print_shader(ctx, stdout);
1769
1770 agx_lower_pseudo(ctx);
1771
1772 agx_pack_binary(ctx, binary);
1773
1774 if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
1775 agx_print_stats(ctx, binary->size, stderr);
1776
1777 ralloc_free(ctx);
1778 }
1779