1 /*
2 * Copyright 2021 Alyssa Rosenzweig
3 * Copyright 2019-2020 Collabora, Ltd.
4 * Copyright 2014-2017 Broadcom
5 * Copyright 2010 Red Hat Inc.
6 * SPDX-License-Identifier: MIT
7 */
8 #include "agx_state.h"
9 #include <errno.h>
10 #include <stdio.h>
11 #include "asahi/compiler/agx_compile.h"
12 #include "asahi/compiler/agx_nir.h"
13 #include "asahi/genxml/agx_pack.h"
14 #include "asahi/layout/layout.h"
15 #include "asahi/lib/agx_abi.h"
16 #include "asahi/lib/agx_helpers.h"
17 #include "asahi/lib/agx_ppp.h"
18 #include "asahi/lib/agx_usc.h"
19 #include "asahi/libagx/compression.h"
20 #include "asahi/libagx/query.h"
21 #include "asahi/libagx/tessellator.h"
22 #include "compiler/nir/nir.h"
23 #include "compiler/nir/nir_serialize.h"
24 #include "compiler/shader_enums.h"
25 #include "gallium/auxiliary/nir/pipe_nir.h"
26 #include "gallium/auxiliary/nir/tgsi_to_nir.h"
27 #include "gallium/auxiliary/tgsi/tgsi_from_mesa.h"
28 #include "gallium/auxiliary/util/u_draw.h"
29 #include "gallium/auxiliary/util/u_framebuffer.h"
30 #include "gallium/auxiliary/util/u_helpers.h"
31 #include "gallium/auxiliary/util/u_prim_restart.h"
32 #include "gallium/auxiliary/util/u_viewport.h"
33 #include "pipe/p_context.h"
34 #include "pipe/p_defines.h"
35 #include "pipe/p_screen.h"
36 #include "pipe/p_state.h"
37 #include "util/bitscan.h"
38 #include "util/bitset.h"
39 #include "util/blend.h"
40 #include "util/blob.h"
41 #include "util/compiler.h"
42 #include "util/format/u_format.h"
43 #include "util/format/u_formats.h"
44 #include "util/half_float.h"
45 #include "util/hash_table.h"
46 #include "util/macros.h"
47 #include "util/ralloc.h"
48 #include "util/u_inlines.h"
49 #include "util/u_math.h"
50 #include "util/u_memory.h"
51 #include "util/u_prim.h"
52 #include "util/u_transfer.h"
53 #include "util/u_upload_mgr.h"
54 #include "agx_bg_eot.h"
55 #include "agx_bo.h"
56 #include "agx_device.h"
57 #include "agx_disk_cache.h"
58 #include "agx_linker.h"
59 #include "agx_nir.h"
60 #include "agx_nir_lower_gs.h"
61 #include "agx_nir_lower_vbo.h"
62 #include "agx_tilebuffer.h"
63 #include "libagx.h"
64 #include "libagx_dgc.h"
65 #include "libagx_shaders.h"
66 #include "nir_builder.h"
67 #include "nir_builder_opcodes.h"
68 #include "nir_intrinsics.h"
69 #include "nir_intrinsics_indices.h"
70 #include "nir_xfb_info.h"
71 #include "pool.h"
72
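/* Compressed resources can only be read through views whose formats are
 * compatible with the compressed layout. Decompress when they are not.
 */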
73 void
74 agx_legalize_compression(struct agx_context *ctx, struct agx_resource *rsrc,
75 enum pipe_format format)
76 {
77 if (!ail_is_view_compatible(&rsrc->layout, format)) {
78 agx_decompress(ctx, rsrc, "Incompatible formats");
79 }
80 }
81
82 static void
83 agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader,
84 unsigned start_slot, unsigned count,
85 unsigned unbind_num_trailing_slots,
86 const struct pipe_image_view *iviews)
87 {
88 struct agx_context *ctx = agx_context(pctx);
89 ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
90
91 /* Unbind start_slot...start_slot+count+unbind_num_trailing_slots */
92 if (!iviews) {
93 for (int i = start_slot;
94 i < start_slot + count + unbind_num_trailing_slots; i++) {
95 pipe_resource_reference(&ctx->stage[shader].images[i].resource, NULL);
96 }
97
98 ctx->stage[shader].image_mask &=
99 ~(BITFIELD64_MASK(count + unbind_num_trailing_slots) << start_slot);
100 return;
101 }
102
103 /* Images writeable with pixel granularity are incompatible with
104 * compression. Decompress if necessary.
105 *
106 * Driver-internal images are used by the compute blitter and are exempt
107 * from these transitions, as it only uses compressed images when safe.
108 *
109 * We do this upfront because agx_decompress and agx_legalize_compression can
110 * call set_shader_images internally.
111 */
112 for (int i = 0; i < count; i++) {
113 const struct pipe_image_view *image = &iviews[i];
114 struct agx_resource *rsrc = agx_resource(image->resource);
115
116 if (rsrc && !(image->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL)) {
117 if (!rsrc->layout.writeable_image &&
118 (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)) {
119
120 agx_decompress(ctx, rsrc, "Shader image");
121 }
122
123 /* Readable images may be compressed but are still subject to format
124 * reinterpretation rules.
125 */
126 agx_legalize_compression(ctx, rsrc, image->format);
127
128 if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)
129 assert(rsrc->layout.writeable_image);
130 }
131 }
132
133 /* Bind start_slot...start_slot+count */
134 for (int i = 0; i < count; i++) {
135 const struct pipe_image_view *image = &iviews[i];
136
137 if (!image->resource) {
138 util_copy_image_view(&ctx->stage[shader].images[start_slot + i], NULL);
139 ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + i);
140 } else {
141 util_copy_image_view(&ctx->stage[shader].images[start_slot + i],
142 image);
143 ctx->stage[shader].image_mask |= BITFIELD_BIT(start_slot + i);
144 }
145 }
146
147 /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */
148 for (int i = 0; i < unbind_num_trailing_slots; i++) {
149 ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + count + i);
150 util_copy_image_view(&ctx->stage[shader].images[start_slot + count + i],
151 NULL);
152 }
153 }
154
155 static void
156 agx_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader,
157 unsigned start, unsigned count,
158 const struct pipe_shader_buffer *buffers,
159 unsigned writable_bitmask)
160 {
161 struct agx_context *ctx = agx_context(pctx);
162
163 util_set_shader_buffers_mask(ctx->stage[shader].ssbo,
164 &ctx->stage[shader].ssbo_mask, buffers, start,
165 count);
166
167 ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SSBO;
168 ctx->stage[shader].ssbo_writable_mask &= ~(BITFIELD_MASK(count) << start);
169 ctx->stage[shader].ssbo_writable_mask |= writable_bitmask << start;
170 }
171
172 static void
173 agx_set_blend_color(struct pipe_context *pctx,
174 const struct pipe_blend_color *state)
175 {
176 struct agx_context *ctx = agx_context(pctx);
177
178 if (state)
179 memcpy(&ctx->blend_color, state, sizeof(*state));
180
181 ctx->dirty |= AGX_DIRTY_BLEND_COLOR;
182 }
183
184 static void
185 agx_set_patch_vertices(struct pipe_context *pctx, unsigned char n)
186 {
187 struct agx_context *ctx = agx_context(pctx);
188 ctx->patch_vertices = n;
189 }
190
191 static void
192 agx_set_tess_state(struct pipe_context *pctx,
193 const float default_outer_level[4],
194 const float default_inner_level[2])
195 {
196 struct agx_context *ctx = agx_context(pctx);
197
198 memcpy(ctx->default_outer_level, default_outer_level, 4 * sizeof(float));
199 memcpy(ctx->default_inner_level, default_inner_level, 2 * sizeof(float));
200 }
201
202 static void *
203 agx_create_blend_state(struct pipe_context *ctx,
204 const struct pipe_blend_state *state)
205 {
206 struct agx_blend *so = CALLOC_STRUCT(agx_blend);
207 struct agx_blend_key *key = &so->key;
208
209 key->alpha_to_coverage = state->alpha_to_coverage;
210 key->alpha_to_one = state->alpha_to_one;
211
212 key->logicop_func =
213 state->logicop_enable ? state->logicop_func : PIPE_LOGICOP_COPY;
214
215 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
216 unsigned rti = state->independent_blend_enable ? i : 0;
217 struct pipe_rt_blend_state rt = state->rt[rti];
218
219 if (state->logicop_enable || !rt.blend_enable) {
220 /* No blending, but we get the colour mask below */
221 key->rt[i] = (struct agx_blend_rt_key){
222 .rgb_func = PIPE_BLEND_ADD,
223 .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
224 .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
225
226 .alpha_func = PIPE_BLEND_ADD,
227 .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
228 .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
229 };
230 } else {
231 key->rt[i].rgb_func = rt.rgb_func;
232 key->rt[i].rgb_src_factor = rt.rgb_src_factor;
233 key->rt[i].rgb_dst_factor = rt.rgb_dst_factor;
234
235 key->rt[i].alpha_func = rt.alpha_func;
236 key->rt[i].alpha_src_factor = rt.alpha_src_factor;
237 key->rt[i].alpha_dst_factor = rt.alpha_dst_factor;
238 }
239
240 key->rt[i].colormask = rt.colormask;
241
242 if (rt.colormask)
243 so->store |= (PIPE_CLEAR_COLOR0 << i);
244 }
245
246 return so;
247 }
248
249 static void
250 agx_bind_blend_state(struct pipe_context *pctx, void *cso)
251 {
252 struct agx_context *ctx = agx_context(pctx);
253 ctx->blend = cso;
254 ctx->dirty |= AGX_DIRTY_BLEND;
255 }
256
257 static const enum agx_stencil_op agx_stencil_ops[PIPE_STENCIL_OP_INVERT + 1] = {
258 [PIPE_STENCIL_OP_KEEP] = AGX_STENCIL_OP_KEEP,
259 [PIPE_STENCIL_OP_ZERO] = AGX_STENCIL_OP_ZERO,
260 [PIPE_STENCIL_OP_REPLACE] = AGX_STENCIL_OP_REPLACE,
261 [PIPE_STENCIL_OP_INCR] = AGX_STENCIL_OP_INCR_SAT,
262 [PIPE_STENCIL_OP_DECR] = AGX_STENCIL_OP_DECR_SAT,
263 [PIPE_STENCIL_OP_INCR_WRAP] = AGX_STENCIL_OP_INCR_WRAP,
264 [PIPE_STENCIL_OP_DECR_WRAP] = AGX_STENCIL_OP_DECR_WRAP,
265 [PIPE_STENCIL_OP_INVERT] = AGX_STENCIL_OP_INVERT,
266 };
267
268 static void
269 agx_pack_stencil(struct agx_fragment_stencil_packed *out,
270 struct pipe_stencil_state st)
271 {
272 if (st.enabled) {
273 agx_pack(out, FRAGMENT_STENCIL, cfg) {
274 cfg.compare = (enum agx_zs_func)st.func;
275 cfg.write_mask = st.writemask;
276 cfg.read_mask = st.valuemask;
277
278 cfg.depth_pass = agx_stencil_ops[st.zpass_op];
279 cfg.depth_fail = agx_stencil_ops[st.zfail_op];
280 cfg.stencil_fail = agx_stencil_ops[st.fail_op];
281 }
282 } else {
283 agx_pack(out, FRAGMENT_STENCIL, cfg) {
284 cfg.compare = AGX_ZS_FUNC_ALWAYS;
285 cfg.write_mask = 0xFF;
286 cfg.read_mask = 0xFF;
287
288 cfg.depth_pass = AGX_STENCIL_OP_KEEP;
289 cfg.depth_fail = AGX_STENCIL_OP_KEEP;
290 cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
291 }
292 }
293 }
294
295 static void *
296 agx_create_zsa_state(struct pipe_context *ctx,
297 const struct pipe_depth_stencil_alpha_state *state)
298 {
299 struct agx_zsa *so = CALLOC_STRUCT(agx_zsa);
300 assert(!state->depth_bounds_test && "todo");
301
302 so->base = *state;
303
304 /* Handle the enable flag */
305 enum pipe_compare_func depth_func =
306 state->depth_enabled ? state->depth_func : PIPE_FUNC_ALWAYS;
307
308 /* Z func can otherwise be used as-is */
309 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NEVER == AGX_ZS_FUNC_NEVER);
310 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LESS == AGX_ZS_FUNC_LESS);
311 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_EQUAL == AGX_ZS_FUNC_EQUAL);
312 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LEQUAL == AGX_ZS_FUNC_LEQUAL);
313 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GREATER == AGX_ZS_FUNC_GREATER);
314 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NOTEQUAL == AGX_ZS_FUNC_NOT_EQUAL);
315 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GEQUAL == AGX_ZS_FUNC_GEQUAL);
316 STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_ALWAYS == AGX_ZS_FUNC_ALWAYS);
317
318 agx_pack(&so->depth, FRAGMENT_FACE, cfg) {
319 cfg.depth_function = (enum agx_zs_func)depth_func;
320 cfg.disable_depth_write = !state->depth_writemask;
321 }
322
323 agx_pack_stencil(&so->front_stencil, state->stencil[0]);
324
325 if (state->stencil[1].enabled) {
326 agx_pack_stencil(&so->back_stencil, state->stencil[1]);
327 } else {
328 /* One sided stencil */
329 so->back_stencil = so->front_stencil;
330 }
331
332 if (depth_func != PIPE_FUNC_NEVER && depth_func != PIPE_FUNC_ALWAYS)
333 so->load |= PIPE_CLEAR_DEPTH;
334
335 if (state->depth_writemask) {
336 so->load |= PIPE_CLEAR_DEPTH;
337 so->store |= PIPE_CLEAR_DEPTH;
338 }
339
340 if (state->stencil[0].enabled) {
341 so->load |= PIPE_CLEAR_STENCIL; /* TODO: Optimize */
342 so->store |= PIPE_CLEAR_STENCIL;
343 }
344
345 return so;
346 }
347
348 static void
349 agx_bind_zsa_state(struct pipe_context *pctx, void *cso)
350 {
351 struct agx_context *ctx = agx_context(pctx);
352 ctx->zs = cso;
353 ctx->dirty |= AGX_DIRTY_ZS;
354 }
355
356 static enum agx_polygon_mode
357 agx_translate_polygon_mode(unsigned mode)
358 {
359 switch (mode) {
360 case PIPE_POLYGON_MODE_FILL:
361 return AGX_POLYGON_MODE_FILL;
362 case PIPE_POLYGON_MODE_POINT:
363 return AGX_POLYGON_MODE_POINT;
364 case PIPE_POLYGON_MODE_LINE:
365 return AGX_POLYGON_MODE_LINE;
366 default:
367 unreachable("Unsupported polygon mode");
368 }
369 }
370
371 static void *
372 agx_create_rs_state(struct pipe_context *ctx,
373 const struct pipe_rasterizer_state *cso)
374 {
375 struct agx_rasterizer *so = CALLOC_STRUCT(agx_rasterizer);
376 so->base = *cso;
377
378 agx_pack(so->cull, CULL, cfg) {
379 cfg.cull_front = cso->cull_face & PIPE_FACE_FRONT;
380 cfg.cull_back = cso->cull_face & PIPE_FACE_BACK;
381 cfg.depth_clip = cso->depth_clip_near;
382 cfg.depth_clamp = !cso->depth_clip_near;
383 cfg.flat_shading_vertex =
384 cso->flatshade_first ? AGX_PPP_VERTEX_0 : AGX_PPP_VERTEX_2;
385 cfg.rasterizer_discard = cso->rasterizer_discard;
386 };
387
388 /* Two-sided polygon mode doesn't seem to work on G13. Apple's OpenGL
389 * implementation lowers to multiple draws with culling. Warn.
390 */
391 if (unlikely(cso->fill_front != cso->fill_back)) {
392 agx_msg("Warning: Two-sided fill modes are unsupported, "
393 "rendering may be incorrect.\n");
394 }
395
396 so->polygon_mode = agx_translate_polygon_mode(cso->fill_front);
397 so->line_width = agx_pack_line_width(cso->line_width);
398 so->depth_bias = util_get_offset(cso, cso->fill_front);
399
400 return so;
401 }
402
403 static void
404 agx_bind_rasterizer_state(struct pipe_context *pctx, void *cso)
405 {
406 struct agx_context *ctx = agx_context(pctx);
407 struct agx_rasterizer *so = cso;
408
409 bool base_cso_changed = (cso == NULL) || (ctx->rast == NULL);
410
411 /* Check if scissor or depth bias state has changed, since scissor/depth bias
412 * enable is part of the rasterizer state but everything else needed for
413 * scissors and depth bias is part of the scissor/depth bias arrays */
414 bool scissor_zbias_changed = base_cso_changed ||
415 (ctx->rast->base.scissor != so->base.scissor) ||
416 (ctx->rast->depth_bias != so->depth_bias);
417
418 ctx->dirty |= AGX_DIRTY_RS;
419
420 if (scissor_zbias_changed)
421 ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
422
423 if (base_cso_changed ||
424 (ctx->rast->base.sprite_coord_mode != so->base.sprite_coord_mode))
425 ctx->dirty |= AGX_DIRTY_SPRITE_COORD_MODE;
426
427 ctx->rast = so;
428 }
429
430 static bool
431 has_edgeflags(struct agx_context *ctx, enum mesa_prim mode)
432 {
433 return ctx->stage[PIPE_SHADER_VERTEX].shader->info.has_edgeflags &&
434 mode == MESA_PRIM_TRIANGLES &&
435 (ctx->rast->base.fill_front != PIPE_POLYGON_MODE_FILL);
436 }
437
438 static enum agx_wrap
439 agx_wrap_from_pipe(enum pipe_tex_wrap in)
440 {
441 switch (in) {
442 case PIPE_TEX_WRAP_REPEAT:
443 return AGX_WRAP_REPEAT;
444 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445 return AGX_WRAP_CLAMP_TO_EDGE;
446 case PIPE_TEX_WRAP_MIRROR_REPEAT:
447 return AGX_WRAP_MIRRORED_REPEAT;
448 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
449 return AGX_WRAP_CLAMP_TO_BORDER;
450 case PIPE_TEX_WRAP_CLAMP:
451 return AGX_WRAP_CLAMP_GL;
452 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
453 return AGX_WRAP_MIRRORED_CLAMP_TO_EDGE;
454 default:
455 unreachable("Invalid wrap mode");
456 }
457 }
458
459 static enum agx_mip_filter
460 agx_mip_filter_from_pipe(enum pipe_tex_mipfilter in)
461 {
462 switch (in) {
463 case PIPE_TEX_MIPFILTER_NEAREST:
464 return AGX_MIP_FILTER_NEAREST;
465 case PIPE_TEX_MIPFILTER_LINEAR:
466 return AGX_MIP_FILTER_LINEAR;
467 case PIPE_TEX_MIPFILTER_NONE:
468 return AGX_MIP_FILTER_NONE;
469 }
470
471 unreachable("Invalid mip filter");
472 }
473
474 static const enum agx_compare_func agx_compare_funcs[PIPE_FUNC_ALWAYS + 1] = {
475 [PIPE_FUNC_NEVER] = AGX_COMPARE_FUNC_NEVER,
476 [PIPE_FUNC_LESS] = AGX_COMPARE_FUNC_LESS,
477 [PIPE_FUNC_EQUAL] = AGX_COMPARE_FUNC_EQUAL,
478 [PIPE_FUNC_LEQUAL] = AGX_COMPARE_FUNC_LEQUAL,
479 [PIPE_FUNC_GREATER] = AGX_COMPARE_FUNC_GREATER,
480 [PIPE_FUNC_NOTEQUAL] = AGX_COMPARE_FUNC_NOT_EQUAL,
481 [PIPE_FUNC_GEQUAL] = AGX_COMPARE_FUNC_GEQUAL,
482 [PIPE_FUNC_ALWAYS] = AGX_COMPARE_FUNC_ALWAYS,
483 };
484
485 static const enum agx_filter agx_filters[] = {
486 [PIPE_TEX_FILTER_LINEAR] = AGX_FILTER_LINEAR,
487 [PIPE_TEX_FILTER_NEAREST] = AGX_FILTER_NEAREST,
488 };
489
490 static enum pipe_format
491 fixup_border_zs(enum pipe_format orig, union pipe_color_union *c)
492 {
493 switch (orig) {
494 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
495 case PIPE_FORMAT_Z24X8_UNORM:
496 /* Z24 is internally promoted to Z32F via transfer_helper. These formats
497 * are normalized so should get clamped, but Z32F does not get clamped, so
498 * we clamp here.
499 */
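/* For example, a border depth of 1.5 is clamped to 1.0 here, matching what
 * the original normalized Z24 format would have returned.
 */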
500 c->f[0] = SATURATE(c->f[0]);
501 return PIPE_FORMAT_Z32_FLOAT;
502
503 case PIPE_FORMAT_X24S8_UINT:
504 case PIPE_FORMAT_X32_S8X24_UINT:
505 /* Separate stencil is internally promoted */
506 return PIPE_FORMAT_S8_UINT;
507
508 default:
509 return orig;
510 }
511 }
512
513 static void *
514 agx_create_sampler_state(struct pipe_context *pctx,
515 const struct pipe_sampler_state *state)
516 {
517 struct agx_sampler_state *so = CALLOC_STRUCT(agx_sampler_state);
518 so->base = *state;
519
520 /* We report a max texture LOD bias of 16, so clamp appropriately */
521 float lod_bias = CLAMP(state->lod_bias, -16.0, 16.0);
522 so->lod_bias_as_fp16 = _mesa_float_to_half(lod_bias);
523
524 agx_pack(&so->desc, SAMPLER, cfg) {
525 cfg.minimum_lod = state->min_lod;
526 cfg.maximum_lod = state->max_lod;
527 cfg.maximum_anisotropy =
528 util_next_power_of_two(MAX2(state->max_anisotropy, 1));
529 cfg.magnify = agx_filters[state->mag_img_filter];
530 cfg.minify = agx_filters[state->min_img_filter];
531 cfg.mip_filter = agx_mip_filter_from_pipe(state->min_mip_filter);
532 cfg.wrap_s = agx_wrap_from_pipe(state->wrap_s);
533 cfg.wrap_t = agx_wrap_from_pipe(state->wrap_t);
534 cfg.wrap_r = agx_wrap_from_pipe(state->wrap_r);
535 cfg.pixel_coordinates = state->unnormalized_coords;
536 cfg.compare_func = agx_compare_funcs[state->compare_func];
537 cfg.compare_enable = state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE;
538 cfg.seamful_cube_maps = !state->seamless_cube_map;
539
540 if (state->border_color_format != PIPE_FORMAT_NONE) {
541 /* TODO: Optimize to use compact descriptors for black/white borders */
542 so->uses_custom_border = true;
543 cfg.border_colour = AGX_BORDER_COLOUR_CUSTOM;
544 }
545 }
546
547 memcpy(&so->desc_without_custom_border, &so->desc, sizeof(so->desc));
548
549 if (so->uses_custom_border) {
550 union pipe_color_union border = state->border_color;
551 enum pipe_format format =
552 fixup_border_zs(state->border_color_format, &border);
553
554 agx_pack_border(&so->border, border.ui, format);
555
556 /* Neutralize the bindless-safe descriptor. XXX: This is a hack. */
557 so->desc_without_custom_border.opaque[1] &= ~(1u << 23);
558 }
559
560 return so;
561 }
562
563 static void
564 agx_delete_sampler_state(struct pipe_context *ctx, void *state)
565 {
566 struct agx_sampler_state *so = state;
567 FREE(so);
568 }
569
570 static void
571 agx_bind_sampler_states(struct pipe_context *pctx, enum pipe_shader_type shader,
572 unsigned start, unsigned count, void **states)
573 {
574 struct agx_context *ctx = agx_context(pctx);
575
576 ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SAMPLER;
577
578 for (unsigned i = 0; i < count; i++) {
579 unsigned p = start + i;
580 ctx->stage[shader].samplers[p] = states ? states[i] : NULL;
581 if (ctx->stage[shader].samplers[p])
582 ctx->stage[shader].valid_samplers |= BITFIELD_BIT(p);
583 else
584 ctx->stage[shader].valid_samplers &= ~BITFIELD_BIT(p);
585 }
586
587 ctx->stage[shader].sampler_count =
588 util_last_bit(ctx->stage[shader].valid_samplers);
589
590 /* Recalculate whether we need custom borders */
591 ctx->stage[shader].custom_borders = false;
592
593 u_foreach_bit(i, ctx->stage[shader].valid_samplers) {
594 if (ctx->stage[shader].samplers[i]->uses_custom_border)
595 ctx->stage[shader].custom_borders = true;
596 }
597 }
598
599 static enum agx_texture_dimension
600 agx_translate_tex_dim(enum pipe_texture_target dim, unsigned samples)
601 {
602 assert(samples >= 1);
603
604 switch (dim) {
605 case PIPE_BUFFER:
606 case PIPE_TEXTURE_1D:
607 /* Lowered to 2D */
608 assert(samples == 1);
609 return AGX_TEXTURE_DIMENSION_2D;
610
611 case PIPE_TEXTURE_RECT:
612 case PIPE_TEXTURE_2D:
613 return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED
614 : AGX_TEXTURE_DIMENSION_2D;
615
616 case PIPE_TEXTURE_1D_ARRAY:
617 assert(samples == 1);
618 /* Lowered to 2D */
619 FALLTHROUGH;
620 case PIPE_TEXTURE_2D_ARRAY:
621 return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED
622 : AGX_TEXTURE_DIMENSION_2D_ARRAY;
623
624 case PIPE_TEXTURE_3D:
625 assert(samples == 1);
626 return AGX_TEXTURE_DIMENSION_3D;
627
628 case PIPE_TEXTURE_CUBE:
629 assert(samples == 1);
630 return AGX_TEXTURE_DIMENSION_CUBE;
631
632 case PIPE_TEXTURE_CUBE_ARRAY:
633 assert(samples == 1);
634 return AGX_TEXTURE_DIMENSION_CUBE_ARRAY;
635
636 default:
637 unreachable("Unsupported texture dimension");
638 }
639 }
640
641 static bool
642 target_is_cube(enum pipe_texture_target target)
643 {
644 return target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY;
645 }
646
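/* Pack a hardware texture descriptor for a sampler view of the resource,
 * using the given format override. Buffer textures are laid out as wide 2D
 * textures to raise the size limit, and compressed layouts also get their
 * acceleration (metadata) buffer address packed.
 */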
647 static void
648 agx_pack_texture(void *out, struct agx_resource *rsrc,
649 enum pipe_format format /* override */,
650 const struct pipe_sampler_view *state)
651 {
652 const struct util_format_description *desc = util_format_description(format);
653
654 assert(ail_is_valid_pixel_format(format));
655
656 uint8_t format_swizzle[4] = {
657 desc->swizzle[0],
658 desc->swizzle[1],
659 desc->swizzle[2],
660 desc->swizzle[3],
661 };
662
663 if (util_format_is_depth_or_stencil(format)) {
664 assert(!util_format_is_depth_and_stencil(format) &&
665 "separate stencil always used");
666
667 /* Broadcast depth and stencil */
668 format_swizzle[0] = 0;
669 format_swizzle[1] = 0;
670 format_swizzle[2] = 0;
671 format_swizzle[3] = 0;
672 }
673
674 /* We only have a single swizzle for the user swizzle and the format fixup,
675 * so compose them now. */
676 uint8_t out_swizzle[4];
677 uint8_t view_swizzle[4] = {state->swizzle_r, state->swizzle_g,
678 state->swizzle_b, state->swizzle_a};
679
680 util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle);
681
682 unsigned first_layer =
683 (state->target == PIPE_BUFFER) ? 0 : state->u.tex.first_layer;
684
685 /* Pack the descriptor into GPU memory */
686 agx_pack(out, TEXTURE, cfg) {
687 cfg.dimension = agx_translate_tex_dim(state->target,
688 util_res_sample_count(&rsrc->base));
689 cfg.layout = agx_translate_layout(rsrc->layout.tiling);
690 cfg.channels = ail_pixel_format[format].channels;
691 cfg.type = ail_pixel_format[format].type;
692 cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]);
693 cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]);
694 cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]);
695 cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]);
696
697 if (state->target == PIPE_BUFFER) {
698 unsigned size_el =
699 agx_texture_buffer_size_el(format, state->u.buf.size);
700
701 /* Use a 2D texture to increase the maximum size */
702 cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
703 cfg.height = DIV_ROUND_UP(size_el, cfg.width);
704 cfg.first_level = cfg.last_level = 0;
705 cfg.buffer_size_sw = size_el;
706 cfg.buffer_offset_sw = 0;
707 } else {
708 cfg.width = rsrc->base.width0;
709 cfg.height = rsrc->base.height0;
710 cfg.first_level = state->u.tex.first_level;
711 cfg.last_level = state->u.tex.last_level;
712 }
713
714 cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
715 cfg.unk_mipmapped = rsrc->mipmapped;
716 cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3;
717
718 if (ail_is_compressed(&rsrc->layout)) {
719 cfg.compressed_1 = true;
720 cfg.extended = true;
721 }
722
723 cfg.address = agx_map_texture_gpu(rsrc, first_layer);
724
725 if (state->target == PIPE_BUFFER)
726 cfg.address += state->u.buf.offset;
727
728 if (ail_is_compressed(&rsrc->layout)) {
729 cfg.acceleration_buffer =
730 agx_map_texture_gpu(rsrc, 0) + rsrc->layout.metadata_offset_B +
731 (first_layer * rsrc->layout.compression_layer_stride_B);
732 }
733
734 if (state->target == PIPE_TEXTURE_3D) {
735 cfg.depth = rsrc->base.depth0;
736 } else if (state->target == PIPE_BUFFER) {
737 cfg.depth = 1;
738 } else {
739 unsigned layers =
740 state->u.tex.last_layer - state->u.tex.first_layer + 1;
741
742 if (target_is_cube(state->target))
743 layers /= 6;
744
745 if (rsrc->layout.tiling == AIL_TILING_LINEAR &&
746 (state->target == PIPE_TEXTURE_1D_ARRAY ||
747 state->target == PIPE_TEXTURE_2D_ARRAY)) {
748
749 cfg.depth_linear = layers;
750 cfg.layer_stride_linear = (rsrc->layout.layer_stride_B - 0x80);
751 cfg.extended = true;
752 } else {
753 assert((rsrc->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
754 cfg.depth = layers;
755 }
756 }
757
758 if (rsrc->base.nr_samples > 1)
759 cfg.samples = agx_translate_sample_count(rsrc->base.nr_samples);
760
761 if (state->target == PIPE_BUFFER) {
762 cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16;
763 } else if (rsrc->layout.tiling == AIL_TILING_LINEAR) {
764 cfg.stride = ail_get_linear_stride_B(&rsrc->layout, 0) - 16;
765 } else {
766 assert(rsrc->layout.tiling == AIL_TILING_TWIDDLED ||
767 rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED);
768
769 cfg.page_aligned_layers = rsrc->layout.page_aligned_layers;
770 }
771 }
772 }
773
774 static struct pipe_sampler_view *
775 agx_create_sampler_view(struct pipe_context *pctx,
776 struct pipe_resource *orig_texture,
777 const struct pipe_sampler_view *state)
778 {
779 struct agx_resource *rsrc = agx_resource(orig_texture);
780 struct agx_sampler_view *so = CALLOC_STRUCT(agx_sampler_view);
781
782 if (!so)
783 return NULL;
784
785 struct pipe_resource *texture = orig_texture;
786 enum pipe_format format = state->format;
787
788 const struct util_format_description *desc = util_format_description(format);
789
790 /* Separate stencil always used on G13, so we need to fix up for Z32S8 */
791 if (util_format_has_stencil(desc) && rsrc->separate_stencil) {
792 if (util_format_has_depth(desc)) {
793 /* Reinterpret as the depth-only part */
794 format = util_format_get_depth_only(format);
795 } else {
796 /* Use the stencil-only part */
797 rsrc = rsrc->separate_stencil;
798 texture = &rsrc->base;
799 format = texture->format;
800 }
801 }
802
803 agx_legalize_compression(agx_context(pctx), rsrc, format);
804
805 /* Save off the resource that we actually use, with the stencil fixed up */
806 so->rsrc = rsrc;
807 so->format = format;
808
809 so->base = *state;
810 so->base.texture = NULL;
811 pipe_resource_reference(&so->base.texture, orig_texture);
812 pipe_reference_init(&so->base.reference, 1);
813 so->base.context = pctx;
814 return &so->base;
815 }
816
817 static void
818 agx_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader,
819 unsigned start, unsigned count,
820 unsigned unbind_num_trailing_slots, bool take_ownership,
821 struct pipe_sampler_view **views)
822 {
823 struct agx_context *ctx = agx_context(pctx);
824 unsigned new_nr = 0;
825 unsigned i;
826
827 assert(start == 0);
828
829 if (!views)
830 count = 0;
831
832 for (i = 0; i < count; ++i) {
833 if (take_ownership) {
834 pipe_sampler_view_reference(
835 (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
836 ctx->stage[shader].textures[i] = (struct agx_sampler_view *)views[i];
837 } else {
838 pipe_sampler_view_reference(
839 (struct pipe_sampler_view **)&ctx->stage[shader].textures[i],
840 views[i]);
841 }
842 }
843
844 for (; i < count + unbind_num_trailing_slots; i++) {
845 pipe_sampler_view_reference(
846 (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
847 }
848
849 for (unsigned t = 0; t < MAX2(ctx->stage[shader].texture_count, count);
850 ++t) {
851 if (ctx->stage[shader].textures[t])
852 new_nr = t + 1;
853 }
854
855 ctx->stage[shader].texture_count = new_nr;
856 ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
857 }
858
859 static void
860 agx_sampler_view_destroy(struct pipe_context *ctx,
861 struct pipe_sampler_view *pview)
862 {
863 struct agx_sampler_view *view = (struct agx_sampler_view *)pview;
864 pipe_resource_reference(&view->base.texture, NULL);
865 FREE(view);
866 }
867
868 static struct pipe_surface *
869 agx_create_surface(struct pipe_context *ctx, struct pipe_resource *texture,
870 const struct pipe_surface *surf_tmpl)
871 {
872 agx_legalize_compression(agx_context(ctx), agx_resource(texture),
873 surf_tmpl->format);
874
875 struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
876
877 if (!surface)
878 return NULL;
879
880 unsigned level = surf_tmpl->u.tex.level;
881
882 pipe_reference_init(&surface->reference, 1);
883 pipe_resource_reference(&surface->texture, texture);
884
885 assert(texture->target != PIPE_BUFFER && "buffers are not renderable");
886
887 surface->context = ctx;
888 surface->format = surf_tmpl->format;
889 surface->nr_samples = surf_tmpl->nr_samples;
890 surface->width = u_minify(texture->width0, level);
891 surface->height = u_minify(texture->height0, level);
892 surface->texture = texture;
893 surface->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
894 surface->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
895 surface->u.tex.level = level;
896
897 return surface;
898 }
899
900 static void
901 agx_set_clip_state(struct pipe_context *ctx,
902 const struct pipe_clip_state *state)
903 {
904 }
905
906 static void
907 agx_set_polygon_stipple(struct pipe_context *pctx,
908 const struct pipe_poly_stipple *state)
909 {
910 struct agx_context *ctx = agx_context(pctx);
911
912 memcpy(ctx->poly_stipple, state->stipple, sizeof(ctx->poly_stipple));
913 ctx->dirty |= AGX_DIRTY_POLY_STIPPLE;
914 }
915
916 static void
917 agx_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
918 {
919 struct agx_context *ctx = agx_context(pipe);
920
921 /* Optimization: At most MSAA 4x supported, so normalize to avoid pointless
922 * dirtying when switching between e.g. 0xFFFF and 0xFFFFFFFF masks.
923 */
924 unsigned new_mask = sample_mask & BITFIELD_MASK(4);
925
926 if (ctx->sample_mask != new_mask) {
927 ctx->sample_mask = new_mask;
928 ctx->dirty |= AGX_DIRTY_SAMPLE_MASK;
929 }
930 }
931
932 static void
933 agx_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
934 unsigned num_scissors,
935 const struct pipe_scissor_state *scissor)
936 {
937 struct agx_context *ctx = agx_context(pctx);
938
939 STATIC_ASSERT(sizeof(ctx->scissor[0]) == sizeof(*scissor));
940 assert(start_slot + num_scissors <= AGX_MAX_VIEWPORTS);
941
942 memcpy(&ctx->scissor[start_slot], scissor, sizeof(*scissor) * num_scissors);
943 ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
944 }
945
946 static void
947 agx_set_stencil_ref(struct pipe_context *pctx,
948 const struct pipe_stencil_ref state)
949 {
950 struct agx_context *ctx = agx_context(pctx);
951 ctx->stencil_ref = state;
952 ctx->dirty |= AGX_DIRTY_STENCIL_REF;
953 }
954
955 static void
956 agx_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
957 unsigned num_viewports,
958 const struct pipe_viewport_state *vp)
959 {
960 struct agx_context *ctx = agx_context(pctx);
961
962 STATIC_ASSERT(sizeof(ctx->viewport[0]) == sizeof(*vp));
963 assert(start_slot + num_viewports <= AGX_MAX_VIEWPORTS);
964
965 memcpy(&ctx->viewport[start_slot], vp, sizeof(*vp) * num_viewports);
966 ctx->dirty |= AGX_DIRTY_VIEWPORT;
967 }
968
969 static void
970 agx_get_scissor_extents(const struct pipe_viewport_state *vp,
971 const struct pipe_scissor_state *ss,
972 const struct pipe_framebuffer_state *fb, unsigned *minx,
973 unsigned *miny, unsigned *maxx, unsigned *maxy)
974 {
975 float trans_x = vp->translate[0], trans_y = vp->translate[1];
976 float abs_scale_x = fabsf(vp->scale[0]), abs_scale_y = fabsf(vp->scale[1]);
977
978 /* Calculate the extent of the viewport. Note if a particular dimension of
979 * the viewport is an odd number of pixels, both the translate and the scale
980 * will have a fractional part of 0.5, so adding and subtracting them yields
981 * an integer. Therefore we don't need to round explicitly */
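/* For example, a 5 pixel wide viewport starting at x = 0 has translate 2.5
 * and scale 2.5, so the bounds 2.5 - 2.5 = 0 and 2.5 + 2.5 = 5 are exact.
 */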
982 *minx = CLAMP((int)(trans_x - abs_scale_x), 0, fb->width);
983 *miny = CLAMP((int)(trans_y - abs_scale_y), 0, fb->height);
984 *maxx = CLAMP((int)(trans_x + abs_scale_x), 0, fb->width);
985 *maxy = CLAMP((int)(trans_y + abs_scale_y), 0, fb->height);
986
987 if (ss) {
988 *minx = MAX2(ss->minx, *minx);
989 *miny = MAX2(ss->miny, *miny);
990 *maxx = MIN2(ss->maxx, *maxx);
991 *maxy = MIN2(ss->maxy, *maxy);
992 }
993 }
994
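/* Emit the viewport, region clip, and depth bias/scissor index state as a
 * single PPP update, appending the scissor descriptors to the batch's
 * linearly allocated scissor array.
 */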
995 static void
996 agx_upload_viewport_scissor(struct agx_pool *pool, struct agx_batch *batch,
997 uint8_t **out, const struct pipe_viewport_state *vp,
998 const struct pipe_scissor_state *ss,
999 bool clip_halfz, bool multi_viewport)
1000 {
1001 /* Number of viewports/scissors isn't precisely determinable in Gallium, so
1002 * just key off whether we can write to anything other than viewport 0. This
1003 * could be tuned in the future.
1004 */
1005 unsigned count = multi_viewport ? AGX_MAX_VIEWPORTS : 1;
1006
1007 /* Allocate scissor descriptors */
1008 unsigned index = batch->scissor.size / AGX_SCISSOR_LENGTH;
1009 struct agx_scissor_packed *scissors =
1010 util_dynarray_grow_bytes(&batch->scissor, count, AGX_SCISSOR_LENGTH);
1011
1012 unsigned minx[AGX_MAX_VIEWPORTS], miny[AGX_MAX_VIEWPORTS];
1013 unsigned maxx[AGX_MAX_VIEWPORTS], maxy[AGX_MAX_VIEWPORTS];
1014
1015 /* Upload each scissor */
1016 for (unsigned i = 0; i < count; ++i) {
1017 agx_get_scissor_extents(&vp[i], ss ? &ss[i] : NULL, &batch->key, &minx[i],
1018 &miny[i], &maxx[i], &maxy[i]);
1019
1020 float minz, maxz;
1021 util_viewport_zmin_zmax(vp, clip_halfz, &minz, &maxz);
1022
1023 agx_pack(scissors + i, SCISSOR, cfg) {
1024 cfg.min_x = minx[i];
1025 cfg.min_y = miny[i];
1026 cfg.min_z = minz;
1027 cfg.max_x = maxx[i];
1028 cfg.max_y = maxy[i];
1029 cfg.max_z = maxz;
1030 }
1031 }
1032
1033 /* Upload state */
1034 struct AGX_PPP_HEADER present = {
1035 .depth_bias_scissor = true,
1036 .region_clip = true,
1037 .viewport = true,
1038 .viewport_count = count,
1039 };
1040
1041 size_t size = agx_ppp_update_size(&present);
1042 struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
1043 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
1044
1045 agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
1046 cfg.scissor = index;
1047
1048 /* Use the current depth bias, we allocate linearly */
1049 unsigned count = batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
1050 cfg.depth_bias = count ? count - 1 : 0;
1051 };
1052
1053 for (unsigned i = 0; i < count; ++i) {
1054 agx_ppp_push(&ppp, REGION_CLIP, cfg) {
1055 cfg.enable = true;
1056 cfg.min_x = minx[i] / 32;
1057 cfg.min_y = miny[i] / 32;
1058 cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
1059 cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
1060 }
1061 }
1062
1063 agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
1064 ;
1065
1066 /* Upload viewports */
1067 for (unsigned i = 0; i < count; ++i) {
1068 agx_ppp_push(&ppp, VIEWPORT, cfg) {
1069 cfg.translate_x = vp[i].translate[0];
1070 cfg.translate_y = vp[i].translate[1];
1071 cfg.translate_z = vp[i].translate[2];
1072 cfg.scale_x = vp[i].scale[0];
1073 cfg.scale_y = vp[i].scale[1];
1074 cfg.scale_z = vp[i].scale[2];
1075
1076 if (!clip_halfz) {
1077 cfg.translate_z -= cfg.scale_z;
1078 cfg.scale_z *= 2;
1079 }
1080 }
1081 }
1082
1083 agx_ppp_fini(out, &ppp);
1084 }
1085
1086 static void
1087 agx_upload_depth_bias(struct agx_batch *batch,
1088 const struct pipe_rasterizer_state *rast)
1089 {
1090 void *ptr =
1091 util_dynarray_grow_bytes(&batch->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
1092
1093 agx_pack(ptr, DEPTH_BIAS, cfg) {
1094 cfg.depth_bias = rast->offset_units * 2.0f;
1095 cfg.slope_scale = rast->offset_scale;
1096 cfg.clamp = rast->offset_clamp;
1097 }
1098 }
1099
1100 /* A framebuffer state can be reused across batches, so it doesn't make sense
1101 * to add surfaces to the BO list here. Instead we add them when flushing.
1102 */
1103
1104 static void
1105 agx_set_framebuffer_state(struct pipe_context *pctx,
1106 const struct pipe_framebuffer_state *state)
1107 {
1108 struct agx_context *ctx = agx_context(pctx);
1109
1110 if (!state)
1111 return;
1112
1113 util_copy_framebuffer_state(&ctx->framebuffer, state);
1114 ctx->batch = NULL;
1115 agx_dirty_all(ctx);
1116 }
1117
1118 /*
1119 * To write out render targets, each render target surface is bound as a
1120 * writable shader image, written with the end-of-tile program. This helper
1121 * constructs the internal pipe_image_view used.
1122 */
1123 static struct pipe_image_view
1124 image_view_for_surface(struct pipe_surface *surf)
1125 {
1126 return (struct pipe_image_view){
1127 .resource = surf->texture,
1128 .format = surf->format,
1129 .access = PIPE_IMAGE_ACCESS_READ_WRITE,
1130 .shader_access = PIPE_IMAGE_ACCESS_READ_WRITE,
1131 .u.tex.single_layer_view =
1132 surf->u.tex.first_layer == surf->u.tex.last_layer,
1133 .u.tex.first_layer = surf->u.tex.first_layer,
1134 .u.tex.last_layer = surf->u.tex.last_layer,
1135 .u.tex.level = surf->u.tex.level,
1136 };
1137 }
1138
1139 /* Similarly, to read render targets, surfaces are bound as textures */
1140 static struct pipe_sampler_view
1141 sampler_view_for_surface(struct pipe_surface *surf)
1142 {
1143 bool layered = surf->u.tex.last_layer > surf->u.tex.first_layer;
1144
1145 return (struct pipe_sampler_view){
1146 /* To reduce shader variants, we always use a 2D texture. For reloads of
1147 * arrays and cube maps, we map a single layer as a 2D image.
1148 */
1149 .target = layered ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D,
1150 .swizzle_r = PIPE_SWIZZLE_X,
1151 .swizzle_g = PIPE_SWIZZLE_Y,
1152 .swizzle_b = PIPE_SWIZZLE_Z,
1153 .swizzle_a = PIPE_SWIZZLE_W,
1154 .u.tex =
1155 {
1156 .first_layer = surf->u.tex.first_layer,
1157 .last_layer = surf->u.tex.last_layer,
1158 .first_level = surf->u.tex.level,
1159 .last_level = surf->u.tex.level,
1160 },
1161 };
1162 }
1163
1164 static bool
1165 target_is_array(enum pipe_texture_target target)
1166 {
1167 switch (target) {
1168 case PIPE_TEXTURE_3D:
1169 case PIPE_TEXTURE_CUBE:
1170 case PIPE_TEXTURE_1D_ARRAY:
1171 case PIPE_TEXTURE_2D_ARRAY:
1172 case PIPE_TEXTURE_CUBE_ARRAY:
1173 return true;
1174 default:
1175 return false;
1176 }
1177 }
1178
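/* Pack a PBE descriptor for writing the given image view. Buffer images and
 * (non-block) multisampled images are addressed as wide linear 2D images,
 * and when the descriptor is not architecturally extended, a software
 * sideband is packed for image atomics.
 */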
1179 static void
1180 agx_batch_upload_pbe(struct agx_batch *batch, struct agx_pbe_packed *out,
1181 struct pipe_image_view *view, bool block_access,
1182 bool arrays_as_2d, bool force_2d_array, bool emrt)
1183 {
1184 struct agx_resource *tex = agx_resource(view->resource);
1185 const struct util_format_description *desc =
1186 util_format_description(view->format);
1187 enum pipe_texture_target target = tex->base.target;
1188 bool is_buffer = (target == PIPE_BUFFER);
1189
1190 if (!is_buffer && view->u.tex.single_layer_view)
1191 target = PIPE_TEXTURE_2D;
1192
1193 arrays_as_2d |= (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
1194
1195 /* To reduce shader variants, spilled layered render targets are accessed as
1196 * 2D Arrays regardless of the actual target, so force in that case.
1197 *
1198 * Likewise, cubes are accessed as arrays for consistency with NIR.
1199 */
1200 if ((arrays_as_2d && target_is_array(target)) || target_is_cube(target) ||
1201 force_2d_array)
1202 target = PIPE_TEXTURE_2D_ARRAY;
1203
1204 unsigned level = is_buffer ? 0 : view->u.tex.level;
1205 unsigned layer = is_buffer ? 0 : view->u.tex.first_layer;
1206
1207 agx_pack(out, PBE, cfg) {
1208 cfg.dimension =
1209 agx_translate_tex_dim(target, util_res_sample_count(&tex->base));
1210 cfg.layout = agx_translate_layout(tex->layout.tiling);
1211 cfg.channels = ail_pixel_format[view->format].channels;
1212 cfg.type = ail_pixel_format[view->format].type;
1213 cfg.srgb = util_format_is_srgb(view->format);
1214
1215 assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
1216
1217 for (unsigned i = 0; i < desc->nr_channels; ++i) {
1218 if (desc->swizzle[i] == 0)
1219 cfg.swizzle_r = i;
1220 else if (desc->swizzle[i] == 1)
1221 cfg.swizzle_g = i;
1222 else if (desc->swizzle[i] == 2)
1223 cfg.swizzle_b = i;
1224 else if (desc->swizzle[i] == 3)
1225 cfg.swizzle_a = i;
1226 }
1227
1228 cfg.buffer = agx_map_texture_gpu(tex, layer);
1229 cfg.unk_mipmapped = tex->mipmapped;
1230
1231 if (is_buffer) {
1232 unsigned size_el =
1233 agx_texture_buffer_size_el(view->format, view->u.buf.size);
1234
1235 /* Buffers uniquely have offsets (in bytes, not texels) */
1236 cfg.buffer += view->u.buf.offset;
1237
1238 /* Use a 2D texture to increase the maximum size */
1239 cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1240 cfg.height = DIV_ROUND_UP(size_el, cfg.width);
1241 cfg.level = 0;
1242 cfg.stride = (cfg.width * util_format_get_blocksize(view->format)) - 4;
1243 cfg.layers = 1;
1244 cfg.levels = 1;
1245 } else if (util_res_sample_count(&tex->base) > 1 && !block_access) {
1246 /* Multisampled images are bound like buffer textures, with
1247 * addressing arithmetic to determine the texel to write.
1248 *
1249 * Note that the end-of-tile program uses real multisample images with
1250 * image_write_block instructions.
1251 */
1252 unsigned blocksize_B = util_format_get_blocksize(view->format);
1253 unsigned size_px =
1254 (tex->layout.size_B - tex->layout.layer_stride_B * layer) /
1255 blocksize_B;
1256
1257 cfg.dimension = AGX_TEXTURE_DIMENSION_2D;
1258 cfg.layout = AGX_LAYOUT_LINEAR;
1259 cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1260 cfg.height = DIV_ROUND_UP(size_px, cfg.width);
1261 cfg.stride = (cfg.width * blocksize_B) - 4;
1262 cfg.layers = 1;
1263 cfg.levels = 1;
1264
1265 cfg.buffer += tex->layout.level_offsets_B[level];
1266 cfg.level = 0;
1267 } else {
1268 cfg.width = view->resource->width0;
1269 cfg.height = view->resource->height0;
1270 cfg.level = level;
1271
1272 unsigned layers = view->u.tex.last_layer - layer + 1;
1273
1274 if (tex->layout.tiling == AIL_TILING_LINEAR &&
1275 (target == PIPE_TEXTURE_1D_ARRAY ||
1276 target == PIPE_TEXTURE_2D_ARRAY)) {
1277
1278 cfg.depth_linear = layers;
1279 cfg.layer_stride_linear = (tex->layout.layer_stride_B - 0x80);
1280 cfg.extended = true;
1281 } else {
1282 assert((tex->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
1283 cfg.layers = layers;
1284 }
1285
1286 if (tex->layout.tiling == AIL_TILING_LINEAR) {
1287 cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
1288 cfg.levels = 1;
1289 } else {
1290 cfg.page_aligned_layers = tex->layout.page_aligned_layers;
1291 cfg.levels = tex->base.last_level + 1;
1292 }
1293
1294 if (tex->base.nr_samples > 1)
1295 cfg.samples = agx_translate_sample_count(tex->base.nr_samples);
1296 }
1297
1298 if (ail_is_compressed(&tex->layout) && !emrt) {
1299 cfg.compressed_1 = true;
1300 cfg.extended = true;
1301
1302 cfg.acceleration_buffer =
1303 agx_map_texture_gpu(tex, 0) + tex->layout.metadata_offset_B +
1304 (layer * tex->layout.compression_layer_stride_B);
1305 }
1306
1307 /* When the descriptor isn't extended architecturally, we can use the last
1308 * 8 bytes as a sideband. We use it to provide metadata for image atomics.
1309 */
1310 if (!cfg.extended && (tex->layout.writeable_image || emrt) &&
1311 tex->base.target != PIPE_BUFFER) {
1312
1313 if (util_res_sample_count(&tex->base) > 1) {
1314 cfg.aligned_width_msaa_sw =
1315 align(u_minify(view->resource->width0, level),
1316 tex->layout.tilesize_el[level].width_el);
1317 } else {
1318 cfg.level_offset_sw =
1319 ail_get_level_offset_B(&tex->layout, cfg.level);
1320 }
1321
1322 cfg.sample_count_log2_sw = util_logbase2(tex->base.nr_samples);
1323
1324 if (tex->layout.tiling == AIL_TILING_TWIDDLED || emrt) {
1325 struct ail_tile tile_size = tex->layout.tilesize_el[level];
1326 cfg.tile_width_sw = tile_size.width_el;
1327 cfg.tile_height_sw = tile_size.height_el;
1328
1329 cfg.layer_stride_sw = tex->layout.layer_stride_B;
1330 }
1331 }
1332 };
1333 }
1334
1335 /* Likewise constant buffers, textures, and samplers are handled in a common
1336 * per-draw path, with dirty tracking to reduce the costs involved.
1337 */
1338
1339 static void
1340 agx_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader,
1341 uint index, bool take_ownership,
1342 const struct pipe_constant_buffer *cb)
1343 {
1344 struct agx_context *ctx = agx_context(pctx);
1345 struct agx_stage *s = &ctx->stage[shader];
1346 struct pipe_constant_buffer *constants = &s->cb[index];
1347
1348 util_copy_constant_buffer(&s->cb[index], cb, take_ownership);
1349
1350 /* Upload user buffer immediately */
1351 if (constants->user_buffer && !constants->buffer) {
1352 u_upload_data(ctx->base.const_uploader, 0, constants->buffer_size, 64,
1353 constants->user_buffer, &constants->buffer_offset,
1354 &constants->buffer);
1355 }
1356
1357 unsigned mask = (1 << index);
1358
1359 if (cb)
1360 s->cb_mask |= mask;
1361 else
1362 s->cb_mask &= ~mask;
1363
1364 ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_CONST;
1365 }
1366
1367 static void
1368 agx_surface_destroy(struct pipe_context *ctx, struct pipe_surface *surface)
1369 {
1370 pipe_resource_reference(&surface->texture, NULL);
1371 FREE(surface);
1372 }
1373
1374 static void
1375 agx_delete_state(struct pipe_context *ctx, void *state)
1376 {
1377 FREE(state);
1378 }
1379
1380 /* BOs added to the batch in the uniform upload path */
1381
1382 static void
1383 agx_set_vertex_buffers(struct pipe_context *pctx, unsigned count,
1384 const struct pipe_vertex_buffer *buffers)
1385 {
1386 struct agx_context *ctx = agx_context(pctx);
1387
1388 util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers,
1389 count, true);
1390
1391 ctx->dirty |= AGX_DIRTY_VERTEX;
1392 }
1393
1394 static void *
1395 agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
1396 const struct pipe_vertex_element *state)
1397 {
1398 assert(count <= AGX_MAX_ATTRIBS);
1399
1400 struct agx_vertex_elements *so = calloc(1, sizeof(*so));
1401
1402 for (unsigned i = 0; i < count; ++i) {
1403 const struct pipe_vertex_element ve = state[i];
1404
1405 const struct util_format_description *desc =
1406 util_format_description(ve.src_format);
1407 unsigned chan_size = desc->channel[0].size / 8;
1408 assert((ve.src_offset & (chan_size - 1)) == 0);
1409
1410 so->buffers[i] = ve.vertex_buffer_index;
1411 so->src_offsets[i] = ve.src_offset;
1412
1413 so->key[i] = (struct agx_velem_key){
1414 .stride = ve.src_stride,
1415 .format = ve.src_format,
1416 .divisor = ve.instance_divisor,
1417 .instanced = ve.instance_divisor > 0,
1418 };
1419 }
1420
1421 return so;
1422 }
1423
1424 static void
1425 agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
1426 {
1427 struct agx_context *ctx = agx_context(pctx);
1428 ctx->attributes = cso;
1429 ctx->dirty |= AGX_DIRTY_VERTEX;
1430 }
1431
1432 DERIVE_HASH_TABLE(asahi_vs_shader_key);
1433 DERIVE_HASH_TABLE(asahi_gs_shader_key);
1434 DERIVE_HASH_TABLE(asahi_fs_shader_key);
1435 DERIVE_HASH_TABLE(agx_fast_link_key);
1436
1437 /* No compute variants */
1438 static uint32_t
1439 asahi_cs_shader_key_hash(const void *key)
1440 {
1441 return 0;
1442 }
1443
1444 static bool
1445 asahi_cs_shader_key_equal(const void *a, const void *b)
1446 {
1447 return true;
1448 }
1449
1450 /* Dynamic lowered I/O version of nir_lower_clip_halfz */
1451 static bool
1452 agx_nir_lower_clip_m1_1(nir_builder *b, nir_intrinsic_instr *intr,
1453 UNUSED void *data)
1454 {
1455 if (intr->intrinsic != nir_intrinsic_store_output)
1456 return false;
1457 if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_POS)
1458 return false;
1459
1460 assert(nir_intrinsic_component(intr) == 0 && "not yet scalarized");
1461 b->cursor = nir_before_instr(&intr->instr);
1462
1463 nir_def *pos = intr->src[0].ssa;
1464 nir_def *z = nir_channel(b, pos, 2);
1465 nir_def *w = nir_channel(b, pos, 3);
1466 nir_def *c = nir_load_clip_z_coeff_agx(b);
1467
1468 /* Lerp. If c = 0, reduces to z. If c = 1/2, reduces to (z + w)/2 */
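/* Expanded: new_z = -z*c + (w*c + z) = z*(1 - c) + w*c, so c = 1/2 remaps
 * clip-space z from [-w, w] to [0, w] while c = 0 leaves it unchanged.
 */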
1469 nir_def *new_z = nir_ffma(b, nir_fneg(b, z), c, nir_ffma(b, w, c, z));
1470 nir_src_rewrite(&intr->src[0], nir_vector_insert_imm(b, pos, new_z, 2));
1471 return true;
1472 }
1473
1474 /*
1475 * To implement point sprites, we'll replace TEX0...7 with point coordinate
1476 * reads as required. However, the .zw needs to read back 0.0/1.0. This pass
1477 * fixes up TEX loads of Z and W according to a uniform passed in a sideband,
1478 * eliminating shader variants.
1479 */
1480 static bool
1481 agx_nir_lower_point_sprite_zw(nir_builder *b, nir_intrinsic_instr *intr,
1482 UNUSED void *data)
1483 {
1484 if (intr->intrinsic != nir_intrinsic_load_input &&
1485 intr->intrinsic != nir_intrinsic_load_interpolated_input)
1486 return false;
1487
1488 gl_varying_slot loc = nir_intrinsic_io_semantics(intr).location;
1489 if (!(loc >= VARYING_SLOT_TEX0 && loc <= VARYING_SLOT_TEX7))
1490 return false;
1491
1492 b->cursor = nir_after_instr(&intr->instr);
1493 unsigned component = nir_intrinsic_component(intr);
1494
1495 nir_def *mask = nir_load_tex_sprite_mask_agx(b);
1496 nir_def *location = nir_iadd_imm(b, nir_get_io_offset_src(intr)->ssa,
1497 loc - VARYING_SLOT_TEX0);
1498 nir_def *bit = nir_ishl(b, nir_imm_intN_t(b, 1, 16), location);
1499 nir_def *replace = nir_i2b(b, nir_iand(b, mask, bit));
1500
1501 nir_def *vec = nir_pad_vec4(b, &intr->def);
1502 nir_def *chans[4] = {NULL, NULL, nir_imm_floatN_t(b, 0.0, vec->bit_size),
1503 nir_imm_floatN_t(b, 1.0, vec->bit_size)};
1504
1505 for (unsigned i = 0; i < 4; ++i) {
1506 nir_def *chan = nir_channel_or_undef(b, vec, i - component);
1507 chans[i] = chans[i] ? nir_bcsel(b, replace, chans[i], chan) : chan;
1508 }
1509
1510 nir_def *new_vec = nir_vec(b, &chans[component], intr->def.num_components);
1511 nir_def_rewrite_uses_after(&intr->def, new_vec, new_vec->parent_instr);
1512 return true;
1513 }
1514
1515 /*
1516 * Compile a NIR shader. The only lowering left at this point is sysvals. The
1517 * shader key should have already been applied. agx_compile_variant may call
1518 * this multiple times if there are auxiliary shaders.
1519 */
1520 static struct agx_compiled_shader *
1521 agx_compile_nir(struct agx_device *dev, nir_shader *nir,
1522 struct util_debug_callback *debug, enum pipe_shader_type stage,
1523 bool internal_kernel, bool terminal, bool secondary,
1524 unsigned cf_base, BITSET_WORD *attrib_components_read)
1525 {
1526 struct agx_compiled_shader *compiled = CALLOC_STRUCT(agx_compiled_shader);
1527 compiled->stage = stage;
1528 if (attrib_components_read)
1529 BITSET_COPY(compiled->attrib_components_read, attrib_components_read);
1530
1531 struct agx_shader_key key = {
1532 .dev = agx_gather_device_key(dev),
1533 .libagx = dev->libagx,
1534 .has_scratch = !secondary,
1535 .promote_constants = true,
1536 .no_stop = !terminal,
1537 .secondary = secondary,
1538 };
1539
1540 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1541 NIR_PASS(_, nir, agx_nir_lower_interpolation);
1542 }
1543
1544 /* We always use dynamic sample shading in the GL driver. Indicate that. */
1545 if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1546 nir->info.fs.uses_sample_shading)
1547 key.fs.inside_sample_loop = true;
1548
1549 if (internal_kernel) {
1550 key.reserved_preamble = 8;
1551 } else if (!secondary) {
1552 NIR_PASS(_, nir, agx_nir_lower_sysvals, stage, true);
1553 NIR_PASS(_, nir, agx_nir_layout_uniforms, compiled,
1554 &key.reserved_preamble);
1555 }
1556
1557 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1558 key.fs.cf_base = cf_base;
1559 }
1560
1561 agx_compile_shader_nir(nir, &key, debug, &compiled->b);
1562
1563 if (compiled->b.info.binary_size && !secondary) {
1564 compiled->bo = agx_bo_create(dev, compiled->b.info.binary_size, 0,
1565 AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
1566
1567 memcpy(agx_bo_map(compiled->bo), compiled->b.binary,
1568 compiled->b.info.binary_size);
1569 }
1570
1571 return compiled;
1572 }
1573
1574 static struct agx_compiled_shader *
1575 agx_build_meta_shader_internal(struct agx_context *ctx,
1576 meta_shader_builder_t builder, void *data,
1577 size_t data_size, bool prolog, bool epilog,
1578 unsigned cf_base, bool internal_kernel);
1579
1580 /* Does not take ownership of key. Clones if necessary. */
1581 static struct agx_compiled_shader *
1582 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
1583 struct agx_uncompiled_shader *so,
1584 struct util_debug_callback *debug,
1585 union asahi_shader_key *key_)
1586 {
1587 struct blob_reader reader;
1588 blob_reader_init(&reader, so->serialized_nir.data, so->serialized_nir.size);
1589 nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
1590
1591 /* Auxiliary programs */
1592 enum mesa_prim gs_out_prim = MESA_PRIM_MAX;
1593 uint64_t outputs = 0;
1594 struct agx_fs_epilog_link_info epilog_key = {false};
1595 unsigned gs_out_count_words = 0;
1596 nir_shader *gs_count = NULL;
1597 nir_shader *gs_copy = NULL;
1598 nir_shader *pre_gs = NULL;
1599 BITSET_DECLARE(attrib_components_read, VERT_ATTRIB_MAX * 4) = {0};
1600
1601 /* This can happen at inopportune times and cause jank, so log it */
1602 perf_debug(dev, "Compiling %s shader variant #%u",
1603 _mesa_shader_stage_to_abbrev(so->type),
1604 _mesa_hash_table_num_entries(so->variants));
1605
1606 struct agx_unlinked_uvs_layout uvs = {0};
1607 bool translucent = false;
1608
1609 if (nir->info.stage == MESA_SHADER_VERTEX) {
1610 struct asahi_vs_shader_key *key = &key_->vs;
1611
1612 if (nir->info.vs.tes_agx) {
1613 NIR_PASS(_, nir, agx_nir_lower_tes, dev->libagx, key->hw);
1614 } else {
1615 NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog,
1616 attrib_components_read);
1617 }
1618
1619 if (key->hw) {
1620 NIR_PASS(_, nir, agx_nir_lower_point_size, true);
1621 NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1622 nir_metadata_control_flow, NULL);
1623
1624 NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1625 NULL);
1626 NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
1627 NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
1628 } else {
1629 NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx);
1630
1631 /* Turn into a compute shader now that we're free of vertexisms */
1632 nir->info.stage = MESA_SHADER_COMPUTE;
1633 memset(&nir->info.cs, 0, sizeof(nir->info.cs));
1634 nir->xfb_info = NULL;
1635 outputs = nir->info.outputs_written;
1636 }
1637 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1638 NIR_PASS_V(nir, agx_nir_lower_tcs, dev->libagx);
1639 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1640 struct asahi_gs_shader_key *key = &key_->gs;
1641
1642 NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, key->rasterizer_discard,
1643 &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words);
1644 } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1645 struct asahi_fs_shader_key *key = &key_->fs;
1646
1647 /* Discards must be lowered before lowering MSAA so discards are handled */
1648 NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
1649 NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, &epilog_key);
1650
1651 if (nir->info.fs.uses_fbfetch_output) {
1652 struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
1653 key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples,
1654 true);
1655
1656 if (dev->debug & AGX_DBG_SMALLTILE)
1657 tib.tile_size = (struct agx_tile_size){16, 16};
1658
1659 /* XXX: don't replicate this all over the driver */
1660 unsigned rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1661 (2 * BITSET_LAST_BIT(nir->info.images_used));
1662 unsigned rt_spill = rt_spill_base;
1663 NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, NULL, &rt_spill, NULL,
1664 &translucent);
1665 }
1666
1667 if (nir->info.fs.uses_sample_shading) {
1668 /* Ensure the sample ID is preserved in register */
1669 nir_builder b =
1670 nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
1671 nir_export_agx(
1672 &b,
1673 nir_load_exported_agx(&b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK),
1674 .base = AGX_ABI_FOUT_SAMPLE_MASK);
1675
1676 NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
1677 }
1678
1679 NIR_PASS(_, nir, agx_nir_lower_sample_mask);
1680 NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
1681 }
1682
1683 NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);
1684
1685 struct agx_compiled_shader *compiled = agx_compile_nir(
1686 dev, nir, debug, so->type, false, so->type != PIPE_SHADER_FRAGMENT, false,
1687 0, attrib_components_read);
1688
1689 if (so->type == PIPE_SHADER_FRAGMENT) {
1690 /* XXX: don't replicate this all over the driver */
1691 epilog_key.rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1692 (2 * BITSET_LAST_BIT(nir->info.images_used));
1693
1694 compiled->epilog_key = epilog_key;
1695 compiled->b.info.reads_tib |= translucent;
1696 }
1697
1698 compiled->so = so;
1699 compiled->uvs = uvs;
1700
1701 /* Compile auxiliary programs */
1702 if (gs_count) {
1703 compiled->gs_count = agx_compile_nir(dev, gs_count, debug, so->type,
1704 false, true, false, 0, NULL);
1705 compiled->gs_count->so = so;
1706 }
1707
1708 if (pre_gs) {
1709 compiled->pre_gs = agx_compile_nir(
1710 dev, pre_gs, debug, PIPE_SHADER_COMPUTE, false, true, false, 0, NULL);
1711 }
1712
1713 if (gs_copy) {
1714 /* Replace the point size write if present, but do not insert a write:
1715 * the GS rast program writes point size iff we have points.
1716 */
1717 NIR_PASS(_, gs_copy, agx_nir_lower_point_size, false);
1718
1719 NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1720 nir_metadata_control_flow, NULL);
1721
1722 NIR_PASS(_, gs_copy, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1723 NULL);
1724 NIR_PASS(_, gs_copy, agx_nir_lower_cull_distance_vs);
1725
1726 struct agx_unlinked_uvs_layout uvs = {0};
1727 NIR_PASS(_, gs_copy, agx_nir_lower_uvs, &uvs);
1728
1729 compiled->gs_copy =
1730 agx_compile_nir(dev, gs_copy, debug, PIPE_SHADER_GEOMETRY, false, true,
1731 false, 0, NULL);
1732 compiled->gs_copy->so = so;
1733 compiled->gs_copy->stage = so->type;
1734 compiled->gs_copy->uvs = uvs;
1735 }
1736
1737 compiled->gs_output_mode = gs_out_prim;
1738 compiled->gs_count_words = gs_out_count_words;
1739 compiled->b.info.outputs = outputs;
1740
1741 ralloc_free(nir);
1742 ralloc_free(pre_gs);
1743 ralloc_free(gs_count);
1744 return compiled;
1745 }
1746
1747 static struct agx_compiled_shader *
1748 agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
1749 struct agx_uncompiled_shader *so,
1750 struct util_debug_callback *debug,
1751 union asahi_shader_key *key)
1752 {
1753 struct agx_compiled_shader *compiled =
1754 agx_disk_cache_retrieve(screen, so, key);
1755
1756 if (!compiled) {
1757 compiled = agx_compile_variant(&screen->dev, pctx, so, debug, key);
1758 agx_disk_cache_store(screen->disk_cache, so, key, compiled);
1759 }
1760
1761 /* key may be destroyed after we return, so clone it before using it as a
1762 * hash table key. The clone is logically owned by the hash table.
1763 */
1764 union asahi_shader_key *cloned_key =
1765 rzalloc(so->variants, union asahi_shader_key);
1766
1767 if (so->type == PIPE_SHADER_FRAGMENT) {
1768 memcpy(cloned_key, key, sizeof(struct asahi_fs_shader_key));
1769 } else if (so->type == PIPE_SHADER_VERTEX ||
1770 so->type == PIPE_SHADER_TESS_EVAL) {
1771 memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
1772 } else if (so->type == PIPE_SHADER_GEOMETRY) {
1773 memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
1774 } else {
1775 assert(gl_shader_stage_is_compute(so->type) ||
1776 so->type == PIPE_SHADER_TESS_CTRL);
1777 /* No key */
1778 }
1779
1780 _mesa_hash_table_insert(so->variants, cloned_key, compiled);
1781
1782 return compiled;
1783 }
1784
1785 static int
1786 glsl_type_size(const struct glsl_type *type, bool bindless)
1787 {
1788 return glsl_count_attribute_slots(type, false);
1789 }
1790
1791 static bool
1792 should_lower_robustness(const nir_intrinsic_instr *intr, const void *data)
1793 {
1794 const bool *gl_robust = data;
1795
1796 switch (intr->intrinsic) {
1797 /* The texture/PBE hardware is robust, but our buffer image implementation
1798 * is not. Lower robustness only for buffer images.
1799 */
1800 case nir_intrinsic_image_load:
1801 case nir_intrinsic_image_store:
1802 return nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF;
1803
1804 /* Image atomics are lowered to raw memory access */
1805 case nir_intrinsic_image_atomic:
1806 case nir_intrinsic_image_atomic_swap:
1807 return true;
1808
1809 /* UBOs/SSBOs are lowered to raw pointers */
1810 case nir_intrinsic_load_ubo:
1811 case nir_intrinsic_load_ssbo:
1812 case nir_intrinsic_store_ssbo:
1813 case nir_intrinsic_ssbo_atomic:
1814 case nir_intrinsic_ssbo_atomic_swap:
1815 return *gl_robust;
1816
1817 default:
1818 return false;
1819 }
1820 }
1821
1822 static void
1823 agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
1824 nir_shader *nir, bool support_lod_bias, bool robust)
1825 {
1826 if (nir->info.stage == MESA_SHADER_KERNEL)
1827 nir->info.stage = MESA_SHADER_COMPUTE;
1828
1829 blob_init(&so->early_serialized_nir);
1830 nir_serialize(&so->early_serialized_nir, nir, true);
1831
1832 /* We need to lower robustness before bindings, since robustness lowering
1833 * affects the bindings used.
1834 */
1835 NIR_PASS(_, nir, nir_lower_robust_access, should_lower_robustness, &robust);
1836
1837 /* Similarly, we need to do early texture lowering before bindings */
1838 NIR_PASS(_, nir, agx_nir_lower_texture_early, support_lod_bias);
1839
1840 /* We need to lower binding tables before calling agx_preprocess_nir, since
1841 * that does texture lowering that needs to know the binding model.
1842 */
1843 NIR_PASS(_, nir, agx_nir_lower_bindings, &so->uses_bindless_samplers);
1844
1845 /* We need to do some I/O lowering before lowering textures */
1846 so->info.nr_bindful_textures = BITSET_LAST_BIT(nir->info.textures_used);
1847 so->info.nr_bindful_images = BITSET_LAST_BIT(nir->info.images_used);
1848
1849 NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1850 glsl_type_size,
1851 nir_lower_io_lower_64bit_to_32 |
1852 nir_lower_io_use_interpolated_input_intrinsics);
1853
1854 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1855 struct agx_interp_info interp = agx_gather_interp_info(nir);
1856
1857 /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
1858 * exception, interpolate flat-shaded varyings at fp32. This works around a
1859 * hardware limitation. The resulting code (with an extra f2f16 at the end
1860 * if needed) matches what Metal produces.
1861 */
1862 if (likely(!(dev->debug & AGX_DBG_NO16))) {
1863 uint64_t texcoord = agx_gather_texcoords(nir);
1864
1865 NIR_PASS(_, nir, nir_lower_mediump_io,
1866 nir_var_shader_in | nir_var_shader_out,
1867 ~(interp.flat | texcoord), false);
1868 }
1869
1870 so->info.inputs_flat_shaded = interp.flat;
1871 so->info.inputs_linear_shaded = interp.linear;
1872 so->info.uses_fbfetch = nir->info.fs.uses_fbfetch_output;
1873 } else if (nir->info.stage == MESA_SHADER_VERTEX ||
1874 nir->info.stage == MESA_SHADER_TESS_EVAL) {
1875 so->info.has_edgeflags = nir->info.outputs_written & VARYING_BIT_EDGE;
1876 so->info.cull_distance_size = nir->info.cull_distance_array_size;
1877 }
1878
1879 /* Shrink and vectorize SSBOs before lowering them, since it is harder to
1880 * optimize the lowered code.
1881 */
1882 NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
1883 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
1884 NIR_PASS(_, nir, agx_nir_cleanup_amul);
1885 NIR_PASS(_, nir, nir_opt_constant_folding);
1886 NIR_PASS(_, nir, nir_copy_prop);
1887 NIR_PASS(_, nir, nir_opt_cse);
1888 NIR_PASS(_, nir, nir_opt_dce);
1889 NIR_PASS(_, nir, nir_opt_shrink_vectors, true);
1890 NIR_PASS(_, nir, nir_copy_prop);
1891
1892 NIR_PASS(
1893 _, nir, nir_opt_load_store_vectorize,
1894 &(const nir_load_store_vectorize_options){
1895 .modes = nir_var_mem_global | nir_var_mem_constant | nir_var_mem_ssbo,
1896 .callback = agx_mem_vectorize_cb,
1897 });
1898
1899 NIR_PASS(_, nir, agx_nir_lower_texture);
1900 NIR_PASS(_, nir, nir_lower_ssbo, NULL);
1901
1902 agx_preprocess_nir(nir, dev->libagx);
1903
1904 if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1905 (nir->info.inputs_read & VARYING_BITS_TEX_ANY)) {
1906
1907 NIR_PASS(_, nir, nir_shader_intrinsics_pass,
1908 agx_nir_lower_point_sprite_zw, nir_metadata_control_flow, NULL);
1909 }
1910
1911 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1912 NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, true);
1913 }
1914
1915 so->type = pipe_shader_type_from_mesa(nir->info.stage);
1916
1917 if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1918 nir->info.stage = MESA_SHADER_VERTEX;
1919 nir->info.vs.tes_agx = true;
1920 }
1921
1922 blob_init(&so->serialized_nir);
1923 nir_serialize(&so->serialized_nir, nir, true);
1924 _mesa_sha1_compute(so->serialized_nir.data, so->serialized_nir.size,
1925 so->nir_sha1);
1926
1927 so->has_xfb_info = (nir->xfb_info != NULL);
1928
1929 static_assert(
1930 ARRAY_SIZE(so->xfb_strides) == ARRAY_SIZE(nir->info.xfb_stride),
1931 "known target count");
1932
1933 if (so->has_xfb_info) {
1934 struct nir_xfb_info *xfb = nir->xfb_info;
1935
1936 for (unsigned i = 0; i < ARRAY_SIZE(so->xfb_strides); ++i) {
1937 so->xfb_strides[i] = xfb->buffers[i].stride;
1938 }
1939 }
1940 }
1941
1942 static void *
1943 agx_create_shader_state(struct pipe_context *pctx,
1944 const struct pipe_shader_state *cso)
1945 {
1946 struct agx_context *ctx = agx_context(pctx);
1947 struct agx_uncompiled_shader *so =
1948 rzalloc(NULL, struct agx_uncompiled_shader);
1949 struct agx_device *dev = agx_device(pctx->screen);
1950
1951 if (!so)
1952 return NULL;
1953
1954 so->base = *cso;
1955
1956 nir_shader *nir = cso->type == PIPE_SHADER_IR_NIR
1957 ? cso->ir.nir
1958 : tgsi_to_nir(cso->tokens, pctx->screen, false);
1959
1960 if (nir->info.stage == MESA_SHADER_VERTEX ||
1961 nir->info.stage == MESA_SHADER_TESS_EVAL) {
1962 so->variants = asahi_vs_shader_key_table_create(so);
1963 so->linked_shaders = agx_fast_link_key_table_create(so);
1964 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1965 so->variants = asahi_gs_shader_key_table_create(so);
1966 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1967 /* No variants */
1968 so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
1969 asahi_cs_shader_key_equal);
1970 } else {
1971 so->variants = asahi_fs_shader_key_table_create(so);
1972 so->linked_shaders = agx_fast_link_key_table_create(so);
1973 }
1974
1975 if (nir->info.stage == MESA_SHADER_TESS_EVAL ||
1976 nir->info.stage == MESA_SHADER_TESS_CTRL) {
1977
1978 so->tess.ccw = nir->info.tess.ccw;
1979 so->tess.point_mode = nir->info.tess.point_mode;
1980 so->tess.spacing = nir->info.tess.spacing;
1981 so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
1982 so->tess.primitive = nir->info.tess._primitive_mode;
1983 so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
1984 so->tess.nr_patch_outputs =
1985 util_last_bit(nir->info.patch_outputs_written);
1986 if (nir->info.stage == MESA_SHADER_TESS_CTRL)
1987 so->tess.output_stride = agx_tcs_output_stride(nir);
1988 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1989 so->gs_mode = nir->info.gs.output_primitive;
1990 }
1991
1992 agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
1993 gl_shader_stage next_stage = nir->info.next_stage;
1994
1995 /* We're done with the NIR, throw it away */
1996 ralloc_free(nir);
1997 nir = NULL;
1998
1999 /* Precompile shaders that have a small key. For shader-db, precompile a
2000 * shader with a default key. This could be improved but hopefully this is
2001 * acceptable for now.
2002 */
2003 if ((so->type == PIPE_SHADER_TESS_CTRL) ||
2004 (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) {
2005 union asahi_shader_key key = {0};
2006 agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2007 &key);
2008 } else if (so->type == PIPE_SHADER_VERTEX) {
2009 union asahi_shader_key key = {
2010 .vs.hw = next_stage == MESA_SHADER_FRAGMENT,
2011 };
2012 agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2013 &key);
2014
2015 if (!next_stage) {
2016 key.vs.hw = true;
2017 agx_get_shader_variant(agx_screen(pctx->screen), pctx, so,
2018 &pctx->debug, &key);
2019 }
2020 } else if (dev->debug & AGX_DBG_PRECOMPILE) {
2021 union asahi_shader_key key = {0};
2022
2023 switch (so->type) {
2024 case PIPE_SHADER_GEOMETRY:
2025 break;
2026
2027 case PIPE_SHADER_TESS_EVAL:
2028 /* TODO: Tessellation shaders with shader-db */
2029 return so;
2030
2031 case PIPE_SHADER_FRAGMENT:
2032 key.fs.nr_samples = 1;
2033 break;
2034 default:
2035 unreachable("Unknown shader stage in shader-db precompile");
2036 }
2037
2038 agx_compile_variant(dev, pctx, so, &pctx->debug, &key);
2039 }
2040
2041 return so;
2042 }
2043
2044 static void *
2045 agx_create_compute_state(struct pipe_context *pctx,
2046 const struct pipe_compute_state *cso)
2047 {
2048 struct agx_context *ctx = agx_context(pctx);
2049 struct agx_device *dev = agx_device(pctx->screen);
2050 struct agx_uncompiled_shader *so =
2051 rzalloc(NULL, struct agx_uncompiled_shader);
2052
2053 if (!so)
2054 return NULL;
2055
2056 so->variants = _mesa_hash_table_create(so, asahi_cs_shader_key_hash,
2057 asahi_cs_shader_key_equal);
2058
2059 union asahi_shader_key key = {0};
2060
2061 assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
2062 nir_shader *nir = (void *)cso->prog;
2063
2064 agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
2065 agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2066 &key);
2067
2068 /* We're done with the NIR, throw it away */
2069 ralloc_free(nir);
2070 return so;
2071 }
2072
2073 static void
2074 agx_get_compute_state_info(struct pipe_context *pctx, void *cso,
2075 struct pipe_compute_state_object_info *info)
2076 {
2077 union asahi_shader_key key = {0};
2078 struct agx_compiled_shader *so = agx_get_shader_variant(
2079 agx_screen(pctx->screen), pctx, cso, &pctx->debug, &key);
2080
2081 info->max_threads =
2082 agx_occupancy_for_register_count(so->b.info.nr_gprs).max_threads;
2083 info->private_memory = 0;
2084 info->preferred_simd_size = 32;
2085 info->simd_sizes = 32;
2086 }
2087
2088 /* Does not take ownership of key. Clones if necessary. */
2089 static bool
2090 agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
2091 enum pipe_shader_type stage, union asahi_shader_key *key)
2092 {
2093 struct agx_uncompiled_shader *so = ctx->stage[stage].shader;
2094 assert(so != NULL);
2095
2096 struct hash_entry *he = _mesa_hash_table_search(so->variants, key);
2097
2098 if (he) {
2099 if ((*out) == he->data)
2100 return false;
2101
2102 *out = he->data;
2103 return true;
2104 }
2105
2106 struct agx_screen *screen = agx_screen(ctx->base.screen);
2107 *out = agx_get_shader_variant(screen, &ctx->base, so, &ctx->base.debug, key);
2108 return true;
2109 }
2110
2111 static enum mesa_prim
2112 rast_prim(enum mesa_prim mode, unsigned fill_mode)
2113 {
2114 if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
2115 if (fill_mode == PIPE_POLYGON_MODE_POINT)
2116 return MESA_PRIM_POINTS;
2117 else if (fill_mode == PIPE_POLYGON_MODE_LINE)
2118 return MESA_PRIM_LINES;
2119 }
2120
2121 return mode;
2122 }
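/* Example: rast_prim(MESA_PRIM_TRIANGLE_STRIP, PIPE_POLYGON_MODE_LINE)
 * returns MESA_PRIM_LINES, since polygon mode demotes filled triangles to
 * their outlines. Point and line primitives pass through unchanged.
 */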
2123
2124 static bool
2125 lower_fs_prolog_abi(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2126 {
2127 if (intr->intrinsic == nir_intrinsic_load_polygon_stipple_agx) {
2128 b->cursor = nir_instr_remove(&intr->instr);
2129
2130 nir_def *root = nir_load_preamble(b, 1, 64, .base = 12);
2131 off_t stipple_offs = offsetof(struct agx_draw_uniforms, polygon_stipple);
2132 nir_def *stipple_ptr_ptr = nir_iadd_imm(b, root, stipple_offs);
2133 nir_def *base = nir_load_global_constant(b, stipple_ptr_ptr, 4, 1, 64);
2134
2135 nir_def *row = intr->src[0].ssa;
2136 nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));
2137
2138 nir_def *pattern = nir_load_global_constant(b, addr, 4, 1, 32);
2139 nir_def_rewrite_uses(&intr->def, pattern);
2140 return true;
2141 } else if (intr->intrinsic == nir_intrinsic_load_stat_query_address_agx) {
2142 b->cursor = nir_instr_remove(&intr->instr);
2143
2144 /* ABI: root descriptor address in u6_u7 */
2145 nir_def *root = nir_load_preamble(b, 1, intr->def.bit_size, .base = 12);
2146
2147 off_t offs = offsetof(struct agx_draw_uniforms,
2148 pipeline_statistics[nir_intrinsic_base(intr)]);
2149
2150 nir_def *ptr = nir_iadd_imm(b, root, offs);
2151 nir_def *load = nir_load_global_constant(b, ptr, 4, 1, 64);
2152 nir_def_rewrite_uses(&intr->def, load);
2153 return true;
2154 } else {
2155 return false;
2156 }
2157 }
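/* Both cases above locate the draw uniforms through the root descriptor
 * pointer kept in the preamble: the stipple path dereferences it to find the
 * pattern array and then loads the requested row, while the statistics path
 * loads the query address stored at the corresponding offset.
 */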
2158
2159 static void
2160 build_fs_prolog(nir_builder *b, const void *key)
2161 {
2162 agx_nir_fs_prolog(b, key);
2163
2164 NIR_PASS(_, b->shader, nir_shader_intrinsics_pass, lower_fs_prolog_abi,
2165 nir_metadata_control_flow, NULL);
2166 }
2167
2168 static struct agx_linked_shader *
2169 asahi_fast_link(struct agx_context *ctx, struct agx_uncompiled_shader *so,
2170 struct agx_fast_link_key *key)
2171 {
2172 /* Try the cache */
2173 struct hash_entry *ent = _mesa_hash_table_search(so->linked_shaders, key);
2174 if (ent)
2175 return ent->data;
2176
2177 struct agx_compiled_shader *prolog = NULL, *epilog = NULL;
2178
2179 /* Build the prolog/epilog now */
2180 if (so->type == MESA_SHADER_FRAGMENT) {
2181 prolog = agx_build_meta_shader_internal(
2182 ctx, build_fs_prolog, &key->prolog.fs, sizeof(key->prolog.fs), true,
2183 false, key->prolog.fs.cf_base, false);
2184
2185 epilog = agx_build_meta_shader_internal(
2186 ctx, agx_nir_fs_epilog, &key->epilog.fs, sizeof(key->epilog.fs), false,
2187 true, 0, false);
2188
2189 } else if (so->type == MESA_SHADER_TESS_EVAL) {
2190 /* No prolog/epilog needed */
2191 } else {
2192 assert(so->type == MESA_SHADER_VERTEX);
2193
2194 prolog = agx_build_meta_shader_internal(
2195 ctx, agx_nir_vs_prolog, &key->prolog.vs, sizeof(key->prolog.vs), true,
2196 false, 0, false);
2197 }
2198
2199 /* Fast-link it all together */
2200 struct agx_device *dev = agx_device(ctx->base.screen);
2201
2202 struct agx_linked_shader *linked =
2203 rzalloc(so->linked_shaders, struct agx_linked_shader);
2204 agx_fast_link(linked, dev, so->type == PIPE_SHADER_FRAGMENT, &key->main->b,
2205 &prolog->b, &epilog->b, key->nr_samples_shaded);
2206
2207 /* Cache the fast linked program */
2208 union asahi_shader_key *cloned_key =
2209 ralloc_memdup(so->linked_shaders, key, sizeof(*key));
2210 _mesa_hash_table_insert(so->linked_shaders, cloned_key, linked);
2211 return linked;
2212 }
2213
2214 static bool
2215 agx_update_vs(struct agx_batch *batch, unsigned index_size_B)
2216 {
2217 struct agx_context *ctx = batch->ctx;
2218
2219 /* Only proceed if the shader or anything the key depends on changes
2220 *
2221 * vb_mask, attributes, vertex_buffers: VERTEX
2222 */
2223 if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB)) ||
2224 ctx->stage[PIPE_SHADER_TESS_EVAL].dirty ||
2225 ctx->stage[PIPE_SHADER_GEOMETRY].dirty ||
2226 ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
2227 ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess))
2228 return false;
2229
2230 struct asahi_vs_shader_key key = {
2231 .hw = !((ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess) ||
2232 ctx->stage[PIPE_SHADER_GEOMETRY].shader),
2233 };
2234
2235 agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
2236 (union asahi_shader_key *)&key);
2237
2238 struct agx_device *dev = agx_device(ctx->base.screen);
2239 struct agx_fast_link_key link_key = {
2240 .prolog.vs.hw = key.hw,
2241 .prolog.vs.sw_index_size_B = key.hw ? 0 : index_size_B,
2242
2243 .prolog.vs.robustness.level =
2244 ctx->robust ? AGX_ROBUSTNESS_GL : AGX_ROBUSTNESS_DISABLED,
2245
2246 .prolog.vs.robustness.soft_fault = agx_has_soft_fault(dev),
2247 .main = ctx->vs,
2248 };
2249
2250 STATIC_ASSERT(sizeof(link_key.prolog.vs.component_mask) ==
2251 sizeof(ctx->vs->attrib_components_read));
2252 BITSET_COPY(link_key.prolog.vs.component_mask,
2253 ctx->vs->attrib_components_read);
2254
2255 memcpy(link_key.prolog.vs.attribs, &ctx->attributes->key,
2256 sizeof(link_key.prolog.vs.attribs));
2257
2258 void *old = ctx->linked.vs;
2259
2260 ctx->linked.vs =
2261 asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_VERTEX].shader, &link_key);
2262
2263 agx_batch_add_bo(batch, ctx->vs->bo);
2264 if (ctx->linked.vs)
2265 agx_batch_add_bo(batch, ctx->linked.vs->bo);
2266
2267 return old != ctx->linked.vs;
2268 }
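/* agx_update_vs returns true only when the fast-linked vertex shader
 * actually changed, so callers can skip re-emitting VS state otherwise. The
 * link key folds in the attribute layout and robustness settings, which is
 * why AGX_DIRTY_VERTEX is part of the dirty check above.
 */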
2269
2270 static bool
2271 agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
2272 {
2273 assert(info->mode == MESA_PRIM_PATCHES);
2274
2275 ctx->tcs = _mesa_hash_table_next_entry(
2276 ctx->stage[PIPE_SHADER_TESS_CTRL].shader->variants, NULL)
2277 ->data;
2278 return true;
2279 }
2280
2281 static bool
2282 agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
2283 const struct pipe_draw_indirect_info *indirect)
2284 {
2285 /* Only proceed if there is a geometry shader. Due to input assembly
2286 * dependence, we don't bother with dirty tracking right now.
2287 */
2288 if (!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
2289 ctx->gs = NULL;
2290 return false;
2291 }
2292
2293 /* Transform feedback always happens via the geometry shader, so look there
2294 * to get the XFB strides.
2295 */
2296 struct agx_uncompiled_shader *gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
2297
2298 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2299 struct agx_streamout_target *tgt =
2300 agx_so_target(ctx->streamout.targets[i]);
2301
2302 if (tgt != NULL)
2303 tgt->stride = gs->xfb_strides[i];
2304 }
2305
2306 struct asahi_gs_shader_key key = {
2307 .rasterizer_discard = ctx->rast->base.rasterizer_discard,
2308 };
2309
2310 return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
2311 (union asahi_shader_key *)&key);
2312 }
2313
2314 static enum pipe_blendfactor
2315 optimize_blend_factor_w_1(enum pipe_blendfactor f)
2316 {
2317 if (f == PIPE_BLENDFACTOR_SRC_ALPHA)
2318 return PIPE_BLENDFACTOR_ONE;
2319 else if (f == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
2320 return PIPE_BLENDFACTOR_ZERO;
2321 else
2322 return f;
2323 }
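/* When the fragment shader is known to write alpha = 1.0 for render target 0
 * (loc0_w_1 in the epilog link info), SRC_ALPHA degenerates to ONE and
 * INV_SRC_ALPHA to ZERO, so standard alpha blending reduces to a plain
 * overwrite of the destination and the epilog can skip the blend math.
 */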
2324
2325 static bool
2326 agx_update_fs(struct agx_batch *batch)
2327 {
2328 struct agx_context *ctx = batch->ctx;
2329
2330 /* Only proceed if the shader or anything the key depends on changes
2331 *
2332 * batch->key: implicitly dirties everything, no explicit check
2333 * rast: RS
2334 * blend: BLEND
2335 * sample_mask: SAMPLE_MASK
2336 * reduced_prim: PRIM
2337 */
2338 if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG | AGX_DIRTY_RS |
2339 AGX_DIRTY_BLEND | AGX_DIRTY_SAMPLE_MASK |
2340 AGX_DIRTY_PRIM | AGX_DIRTY_QUERY)))
2341 return false;
2342
2343 struct agx_device *dev = agx_device(ctx->base.screen);
2344 unsigned nr_samples = util_framebuffer_get_num_samples(&batch->key);
2345
2346 /* Get main shader */
2347 struct asahi_fs_shader_key key = {0};
2348
2349 if (ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.uses_fbfetch) {
2350 key.nr_samples = nr_samples;
2351
2352 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
2353 struct pipe_surface *surf = batch->key.cbufs[i];
2354
2355 key.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2356 }
2357 }
2358
2359 agx_update_shader(ctx, &ctx->fs, PIPE_SHADER_FRAGMENT,
2360 (union asahi_shader_key *)&key);
2361
2362 /* Fast link with prolog/epilog */
2363 bool msaa = ctx->rast->base.multisample;
2364 unsigned sample_mask = ctx->sample_mask & BITFIELD_MASK(nr_samples);
2365
2366 struct agx_fast_link_key link_key = {
2367 .prolog.fs.statistics =
2368 ctx->pipeline_statistics[PIPE_STAT_QUERY_PS_INVOCATIONS],
2369
2370 .prolog.fs.cull_distance_size =
2371 ctx->stage[MESA_SHADER_VERTEX].shader->info.cull_distance_size,
2372
2373 .prolog.fs.polygon_stipple =
2374 ctx->rast->base.poly_stipple_enable &&
2375 rast_prim(batch->reduced_prim, ctx->rast->base.fill_front) ==
2376 MESA_PRIM_TRIANGLES,
2377
2378 .prolog.fs.api_sample_mask =
2379 (msaa && nr_samples > 1 && sample_mask != BITFIELD_MASK(nr_samples))
2380 ? sample_mask
2381 : 0xff,
2382
2383 .epilog.fs.nr_samples = nr_samples,
2384 .epilog.fs.link = ctx->fs->epilog_key,
2385 .epilog.fs.force_small_tile = dev->debug & AGX_DBG_SMALLTILE,
2386
2387 .main = ctx->fs,
2388 .nr_samples_shaded = ctx->fs->epilog_key.sample_shading ? nr_samples : 0,
2389 };
2390
2391 for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
2392 struct pipe_surface *surf = batch->key.cbufs[i];
2393
2394 link_key.epilog.fs.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2395 link_key.epilog.fs.remap[i] =
2396 link_key.epilog.fs.link.broadcast_rt0 ? 0 : i;
2397 }
2398
2399 memcpy(&link_key.epilog.fs.blend, &ctx->blend->key,
2400 sizeof(link_key.epilog.fs.blend));
2401
2402 /* Normalize */
2403 if (!agx_tilebuffer_spills(&batch->tilebuffer_layout))
2404 link_key.epilog.fs.link.rt_spill_base = 0;
2405
2406 /* Try to disable blending to get rid of some fsats */
2407 if (link_key.epilog.fs.link.loc0_w_1) {
2408 struct agx_blend_rt_key *k = &link_key.epilog.fs.blend.rt[0];
2409
2410 k->rgb_src_factor = optimize_blend_factor_w_1(k->rgb_src_factor);
2411 k->rgb_dst_factor = optimize_blend_factor_w_1(k->rgb_dst_factor);
2412
2413 k->alpha_src_factor = optimize_blend_factor_w_1(k->alpha_src_factor);
2414 k->alpha_dst_factor = optimize_blend_factor_w_1(k->alpha_dst_factor);
2415 }
2416
2417 link_key.epilog.fs.blend.alpha_to_coverage &= msaa;
2418
2419 /* The main shader must not run tests if the epilog will */
2420 bool epilog_discards = link_key.epilog.fs.blend.alpha_to_coverage;
2421 batch->uniforms.no_epilog_discard = !epilog_discards ? ~0 : 0;
2422
2423 bool prolog_discards = (link_key.prolog.fs.api_sample_mask != 0xff ||
2424 link_key.prolog.fs.cull_distance_size ||
2425 link_key.prolog.fs.polygon_stipple);
2426
2427 /* The prolog runs tests if neither the main shader nor epilog will */
2428 link_key.prolog.fs.run_zs_tests = !ctx->fs->b.info.writes_sample_mask &&
2429 !epilog_discards && prolog_discards;
2430
2431 if (link_key.prolog.fs.cull_distance_size)
2432 link_key.prolog.fs.cf_base = ctx->fs->b.info.varyings.fs.nr_cf;
2433
2434 void *old = ctx->linked.fs;
2435
2436 ctx->linked.fs =
2437 asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_FRAGMENT].shader, &link_key);
2438
2439 if (ctx->fs->bo)
2440 agx_batch_add_bo(batch, ctx->fs->bo);
2441
2442 agx_batch_add_bo(batch, ctx->linked.fs->bo);
2443
2444 return old != ctx->linked.fs;
2445 }
2446
2447 static void
2448 agx_bind_shader_state(struct pipe_context *pctx, void *cso,
2449 enum pipe_shader_type stage)
2450 {
2451 struct agx_context *ctx = agx_context(pctx);
2452
2453 if (stage == PIPE_SHADER_VERTEX)
2454 ctx->dirty |= AGX_DIRTY_VS_PROG;
2455 else if (stage == PIPE_SHADER_FRAGMENT)
2456 ctx->dirty |= AGX_DIRTY_FS_PROG;
2457 else
2458 ctx->stage[stage].dirty = ~0;
2459
2460 ctx->stage[stage].shader = cso;
2461 }
2462
2463 static void
2464 agx_bind_vs_state(struct pipe_context *pctx, void *cso)
2465 {
2466 agx_bind_shader_state(pctx, cso, PIPE_SHADER_VERTEX);
2467 }
2468
2469 static void
2470 agx_bind_fs_state(struct pipe_context *pctx, void *cso)
2471 {
2472 agx_bind_shader_state(pctx, cso, PIPE_SHADER_FRAGMENT);
2473 }
2474
2475 static void
2476 agx_bind_gs_state(struct pipe_context *pctx, void *cso)
2477 {
2478 agx_bind_shader_state(pctx, cso, PIPE_SHADER_GEOMETRY);
2479 }
2480
2481 static void
2482 agx_bind_tcs_state(struct pipe_context *pctx, void *cso)
2483 {
2484 agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_CTRL);
2485 }
2486
2487 static void
2488 agx_bind_tes_state(struct pipe_context *pctx, void *cso)
2489 {
2490 agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_EVAL);
2491 }
2492
2493 static void
2494 agx_bind_cs_state(struct pipe_context *pctx, void *cso)
2495 {
2496 agx_bind_shader_state(pctx, cso, PIPE_SHADER_COMPUTE);
2497 }
2498
2499 /* Forward declare because of the recursion hit with geometry shaders */
2500 static void agx_delete_uncompiled_shader(struct agx_device *dev,
2501 struct agx_uncompiled_shader *so);
2502
2503 static void
2504 agx_delete_compiled_shader(struct agx_device *dev,
2505 struct agx_compiled_shader *so)
2506 {
2507 if (so->gs_count)
2508 agx_delete_compiled_shader(dev, so->gs_count);
2509
2510 if (so->pre_gs)
2511 agx_delete_compiled_shader(dev, so->pre_gs);
2512
2513 if (so->gs_copy)
2514 agx_delete_compiled_shader(dev, so->gs_copy);
2515
2516 free(so->b.binary);
2517 agx_bo_unreference(dev, so->bo);
2518 FREE(so);
2519 }
2520
2521 static void
2522 agx_delete_uncompiled_shader(struct agx_device *dev,
2523 struct agx_uncompiled_shader *so)
2524 {
2525 hash_table_foreach(so->variants, ent) {
2526 agx_delete_compiled_shader(dev, ent->data);
2527 }
2528
2529 _mesa_hash_table_destroy(so->variants, NULL);
2530
2531 if (so->linked_shaders) {
2532 hash_table_foreach(so->linked_shaders, ent) {
2533 struct agx_linked_shader *link = ent->data;
2534 agx_bo_unreference(dev, link->bo);
2535 }
2536
2537 _mesa_hash_table_destroy(so->linked_shaders, NULL);
2538 }
2539
2540 blob_finish(&so->serialized_nir);
2541 blob_finish(&so->early_serialized_nir);
2542
2543 for (unsigned i = 0; i < MESA_PRIM_COUNT; ++i) {
2544 for (unsigned j = 0; j < 3; ++j) {
2545 for (unsigned k = 0; k < 2; ++k) {
2546 if (so->passthrough_progs[i][j][k])
2547 agx_delete_uncompiled_shader(dev,
2548 so->passthrough_progs[i][j][k]);
2549 }
2550 }
2551 }
2552
2553 for (unsigned i = 0; i < ARRAY_SIZE(so->passthrough_tcs); ++i) {
2554 if (so->passthrough_tcs[i])
2555 agx_delete_uncompiled_shader(dev, so->passthrough_tcs[i]);
2556 }
2557
2558 ralloc_free(so);
2559 }
2560
2561 static void
2562 agx_delete_shader_state(struct pipe_context *ctx, void *cso)
2563 {
2564 struct agx_device *dev = agx_device(ctx->screen);
2565 agx_delete_uncompiled_shader(dev, cso);
2566 }
2567
2568 struct agx_generic_meta_key {
2569 meta_shader_builder_t builder;
2570 size_t key_size;
2571 uint8_t key[];
2572 };
2573
2574 static uint32_t
2575 meta_key_hash(const void *key_)
2576 {
2577 const struct agx_generic_meta_key *key = key_;
2578
2579 return _mesa_hash_data(key,
2580 sizeof(struct agx_generic_meta_key) + key->key_size);
2581 }
2582
2583 static bool
2584 meta_key_equal(const void *a_, const void *b_)
2585 {
2586 const struct agx_generic_meta_key *a = a_;
2587 const struct agx_generic_meta_key *b = b_;
2588
2589 return a->builder == b->builder && a->key_size == b->key_size &&
2590 memcmp(a->key, b->key, a->key_size) == 0;
2591 }
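/* Meta shader keys are variable length: the hash and comparison above cover
 * the fixed header plus key_size trailing bytes, so two keys only match when
 * they name the same builder and carry byte-identical builder data.
 */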
2592
2593 void
2594 agx_init_meta_shaders(struct agx_context *ctx)
2595 {
2596 ctx->generic_meta =
2597 _mesa_hash_table_create(ctx, meta_key_hash, meta_key_equal);
2598 }
2599
2600 static void
2601 agx_destroy_compute_blitter(struct pipe_context *ctx, struct asahi_blitter *bl)
2602 {
2603 hash_table_foreach(bl->blit_cs, ent) {
2604 ctx->delete_compute_state(ctx, ent->data);
2605 }
2606
2607 ctx->delete_sampler_state(ctx, bl->sampler[0]);
2608 ctx->delete_sampler_state(ctx, bl->sampler[1]);
2609
2610 _mesa_hash_table_destroy(bl->blit_cs, NULL);
2611 }
2612
2613 void
2614 agx_destroy_meta_shaders(struct agx_context *ctx)
2615 {
2616 struct agx_device *dev = agx_device(ctx->base.screen);
2617 hash_table_foreach(ctx->generic_meta, ent) {
2618 agx_delete_compiled_shader(dev, ent->data);
2619 }
2620
2621 agx_destroy_compute_blitter(&ctx->base, &ctx->compute_blitter);
2622 _mesa_hash_table_destroy(ctx->generic_meta, NULL);
2623 }
2624
2625 static struct agx_compiled_shader *
2626 agx_build_meta_shader_internal(struct agx_context *ctx,
2627 meta_shader_builder_t builder, void *data,
2628 size_t data_size, bool prolog, bool epilog,
2629 unsigned cf_base, bool internal_kernel)
2630 {
2631 /* Build the meta shader key */
2632 size_t total_key_size = sizeof(struct agx_generic_meta_key) + data_size;
2633 struct agx_generic_meta_key *key = alloca(total_key_size);
2634
2635 *key = (struct agx_generic_meta_key){
2636 .builder = builder,
2637 .key_size = data_size,
2638 };
2639
2640 if (data_size)
2641 memcpy(key->key, data, data_size);
2642
2643 /* Try to get the cached shader */
2644 struct hash_entry *ent = _mesa_hash_table_search(ctx->generic_meta, key);
2645 if (ent)
2646 return ent->data;
2647
2648 /* Otherwise, compile the shader fresh */
2649 nir_builder b = nir_builder_init_simple_shader(
2650 MESA_SHADER_COMPUTE, &agx_nir_options, "AGX meta shader");
2651
2652 builder(&b, data);
2653
2654 struct agx_device *dev = agx_device(ctx->base.screen);
2655 if (!prolog) {
2656 /* We need to link libagx and assign shared memory before preprocessing,
2657 * matching what the driver would otherwise produce.
2658 */
2659 agx_link_libagx(b.shader, dev->libagx);
2660
2661 NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types,
2662 nir_var_mem_shared, glsl_get_cl_type_size_align);
2663
2664 NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared,
2665 nir_address_format_62bit_generic);
2666
2667 agx_preprocess_nir(b.shader, NULL);
2668 NIR_PASS(_, b.shader, agx_nir_lower_texture);
2669 NIR_PASS(_, b.shader, agx_nir_lower_multisampled_image_store);
2670 }
2671
2672 struct agx_compiled_shader *shader = agx_compile_nir(
2673 dev, b.shader, NULL, PIPE_SHADER_COMPUTE, internal_kernel,
2674 !prolog && !(b.shader->info.stage == MESA_SHADER_FRAGMENT &&
2675 b.shader->info.fs.uses_sample_shading),
2676 prolog || epilog, cf_base, NULL);
2677
2678 ralloc_free(b.shader);
2679
2680 /* ...and cache it before we return. The key is on the stack right now, so
2681 * clone it before using it as a hash table key. The clone is logically owned
2682 * by the hash table.
2683 */
2684 void *cloned_key = rzalloc_size(ctx->generic_meta, total_key_size);
2685 memcpy(cloned_key, key, total_key_size);
2686
2687 _mesa_hash_table_insert(ctx->generic_meta, cloned_key, shader);
2688 return shader;
2689 }
2690
2691 struct agx_compiled_shader *
2692 agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder,
2693 void *data, size_t data_size)
2694 {
2695 return agx_build_meta_shader_internal(ctx, builder, data, data_size, false,
2696 false, 0, false);
2697 }
2698
2699 static unsigned
2700 sampler_count(struct agx_context *ctx, enum pipe_shader_type stage)
2701 {
2702 /* We reserve sampler #0 for txf, so add 1 to the API count */
2703 return ctx->stage[stage].sampler_count + 1;
2704 }
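/* Example: a stage binding 4 API samplers reports 5 descriptors here, since
 * descriptor #0 is always the internal txf sampler and API sampler i lands
 * in descriptor i + 1 (see agx_upload_samplers).
 */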
2705
2706 static inline enum agx_sampler_states
2707 translate_sampler_state_count(struct agx_context *ctx,
2708 enum pipe_shader_type stage)
2709 {
2710 /* Clamp to the binding table maximum; anything larger will be bindless */
2711 return agx_translate_sampler_state_count(MIN2(sampler_count(ctx, stage), 16),
2712 ctx->stage[stage].custom_borders);
2713 }
2714
2715 static uint32_t
2716 agx_nr_tex_descriptors_without_spilled_rts(const struct agx_compiled_shader *cs)
2717 {
2718 if (!cs || !cs->so)
2719 return 0;
2720
2721 /* 2 descriptors per image, 1 descriptor per texture */
2722 return cs->so->info.nr_bindful_textures +
2723 (2 * cs->so->info.nr_bindful_images);
2724 }
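/* Example: a shader using 3 bindful textures and 2 bindful images needs
 * 3 + (2 * 2) = 7 descriptors before accounting for spilled render targets,
 * which agx_nr_tex_descriptors adds below.
 */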
2725
2726 static uint32_t
2727 agx_nr_tex_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2728 {
2729 unsigned n = agx_nr_tex_descriptors_without_spilled_rts(cs);
2730
2731 /* We add on texture/PBE descriptors for spilled render targets */
2732 bool spilled_rt = cs->stage == PIPE_SHADER_FRAGMENT &&
2733 agx_tilebuffer_spills(&batch->tilebuffer_layout);
2734 if (spilled_rt)
2735 n += (batch->key.nr_cbufs * 2);
2736
2737 return n;
2738 }
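/* Continuing the example above: if the tilebuffer spills and the batch has 4
 * colour attachments, 4 * 2 = 8 extra texture/PBE descriptors are appended
 * after the shader's own descriptors (see agx_upload_spilled_rt_descriptors).
 */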
2739
2740 /*
2741 * For spilled render targets, upload a texture/PBE pair for each surface to
2742 * allow loading/storing to the render target from the shader.
2743 */
2744 static void
2745 agx_upload_spilled_rt_descriptors(struct agx_texture_packed *out,
2746 struct agx_batch *batch)
2747 {
2748 for (unsigned rt = 0; rt < batch->key.nr_cbufs; ++rt) {
2749 struct agx_texture_packed *texture = out + (2 * rt);
2750 struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2751
2752 struct pipe_surface *surf = batch->key.cbufs[rt];
2753 if (!surf)
2754 continue;
2755
2756 struct agx_resource *rsrc = agx_resource(surf->texture);
2757 struct pipe_image_view view = image_view_for_surface(surf);
2758 struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
2759 sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2760
2761 agx_pack_texture(texture, rsrc, surf->format, &sampler_view);
2762 agx_batch_upload_pbe(batch, pbe, &view, false, false, true, true);
2763 }
2764 }
2765
2766 static void
2767 agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs,
2768 enum pipe_shader_type stage)
2769 {
2770 struct agx_context *ctx = batch->ctx;
2771
2772 /* This can occur for meta shaders */
2773 if (!cs->so) {
2774 batch->texture_count[stage] = 0;
2775 batch->stage_uniforms[stage].texture_base = 0;
2776 return;
2777 }
2778
2779 unsigned nr_textures = cs->so->info.nr_bindful_textures;
2780
2781 unsigned nr_active_textures = ctx->stage[stage].texture_count;
2782 unsigned nr_tex_descriptors = agx_nr_tex_descriptors(batch, cs);
2783 unsigned nr_images = cs->so->info.nr_bindful_images;
2784
2785 struct agx_ptr T_tex = agx_pool_alloc_aligned(
2786 &batch->pool, AGX_TEXTURE_LENGTH * nr_tex_descriptors, 64);
2787
2788 struct agx_texture_packed *textures = T_tex.cpu;
2789
2790 for (unsigned i = 0; i < MIN2(nr_textures, nr_active_textures); ++i) {
2791 struct agx_sampler_view *tex = ctx->stage[stage].textures[i];
2792
2793 if (tex == NULL) {
2794 agx_set_null_texture(&textures[i], T_tex.gpu);
2795 continue;
2796 }
2797
2798 struct agx_resource *rsrc = tex->rsrc;
2799 agx_batch_reads(batch, tex->rsrc);
2800
2801 /* Re-emit state because the layout might have changed from under us.
2802 * TODO: optimize this somehow?
2803 */
2804 agx_pack_texture(&tex->desc, rsrc, tex->format, &tex->base);
2805
2806 textures[i] = tex->desc;
2807 }
2808
2809 for (unsigned i = nr_active_textures; i < nr_textures; ++i)
2810 agx_set_null_texture(&textures[i], T_tex.gpu);
2811
2812 for (unsigned i = 0; i < nr_images; ++i) {
2813 /* Image descriptors come in pairs after the textures */
2814 struct agx_texture_packed *texture =
2815 ((struct agx_texture_packed *)T_tex.cpu) + nr_textures + (2 * i);
2816
2817 struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2818
2819 if (!(ctx->stage[stage].image_mask & BITFIELD_BIT(i))) {
2820 agx_set_null_texture(texture, T_tex.gpu);
2821 agx_set_null_pbe(pbe, agx_pool_alloc_aligned(&batch->pool, 1, 64).gpu);
2822 continue;
2823 }
2824
2825 struct pipe_image_view *view = &ctx->stage[stage].images[i];
2826 agx_batch_track_image(batch, view);
2827
2828 struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view);
2829
2830 /* For the texture descriptor, lower cubes to 2D arrays. This matches the
2831 * transform done in the compiler. Also, force 2D arrays for internal
2832 * blitter images, which helps reduce shader variants.
2833 */
2834 bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
2835
2836 if (target_is_cube(sampler_view.target) ||
2837 (sampler_view.target == PIPE_TEXTURE_3D && internal))
2838 sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2839
2840 agx_pack_texture(texture, agx_resource(view->resource), view->format,
2841 &sampler_view);
2842 agx_batch_upload_pbe(batch, pbe, view, false, false, false, false);
2843 }
2844
2845 if (stage == PIPE_SHADER_FRAGMENT &&
2846 agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
2847
2848 struct agx_texture_packed *out =
2849 ((struct agx_texture_packed *)T_tex.cpu) +
2850 agx_nr_tex_descriptors_without_spilled_rts(cs);
2851
2852 agx_upload_spilled_rt_descriptors(out, batch);
2853 }
2854
2855 batch->texture_count[stage] = nr_tex_descriptors;
2856 batch->stage_uniforms[stage].texture_base = T_tex.gpu;
2857 }
2858
2859 uint16_t
2860 agx_sampler_heap_add(struct agx_device *dev, struct agx_sampler_heap *heap,
2861 struct agx_sampler_packed *sampler)
2862 {
2863 /* Allocate (maximally sized) BO if we haven't already */
2864 if (!heap->bo) {
2865 heap->bo = agx_bo_create(dev, AGX_SAMPLER_HEAP_SIZE * AGX_SAMPLER_LENGTH,
2866 0, AGX_BO_WRITEBACK, "Sampler heap");
2867
2868 assert(heap->count == 0);
2869 }
2870
2871 /* TODO search */
2872
2873 /* Precondition: there is room in the heap */
2874 assert(heap->count < AGX_SAMPLER_HEAP_SIZE);
2875 struct agx_sampler_packed *samplers = agx_bo_map(heap->bo);
2876 memcpy(samplers + heap->count, sampler, sizeof(*sampler));
2877
2878 return heap->count++;
2879 }
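/* The returned index is the sampler's slot within the heap. The heap is
 * currently append-only: identical samplers are not deduplicated (see the
 * TODO above), so the AGX_SAMPLER_HEAP_SIZE assert bounds how many samplers
 * can be added per heap.
 */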
2880
2881 static void
2882 agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
2883 enum pipe_shader_type stage)
2884 {
2885 struct agx_context *ctx = batch->ctx;
2886
2887 unsigned nr_samplers = sampler_count(ctx, stage);
2888 bool custom_borders = ctx->stage[stage].custom_borders;
2889
2890 size_t sampler_length =
2891 AGX_SAMPLER_LENGTH + (custom_borders ? AGX_BORDER_LENGTH : 0);
2892
2893 struct agx_ptr T =
2894 agx_pool_alloc_aligned(&batch->pool, sampler_length * nr_samplers, 64);
2895
2896 /* Sampler #0 is reserved for txf */
2897 agx_pack_txf_sampler(T.cpu);
2898
2899 /* Remaining samplers are API samplers */
2900 uint8_t *out_sampler = (uint8_t *)T.cpu + sampler_length;
2901 for (unsigned i = 0; i < ctx->stage[stage].sampler_count; ++i) {
2902 struct agx_sampler_state *sampler = ctx->stage[stage].samplers[i];
2903 struct agx_sampler_packed *out = (struct agx_sampler_packed *)out_sampler;
2904
2905 if (sampler) {
2906 *out = sampler->desc;
2907
2908 if (custom_borders) {
2909 STATIC_ASSERT(sizeof(sampler->border) == AGX_BORDER_LENGTH);
2910
2911 memcpy(out_sampler + AGX_SAMPLER_LENGTH, &sampler->border,
2912 AGX_BORDER_LENGTH);
2913 } else {
2914 assert(!sampler->uses_custom_border && "invalid combination");
2915 }
2916 } else {
2917 memset(out, 0, sampler_length);
2918 }
2919
2920 out_sampler += sampler_length;
2921 }
2922
2923 batch->sampler_count[stage] = nr_samplers;
2924 batch->samplers[stage] = T.gpu;
2925 }
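/* Each record written above is AGX_SAMPLER_LENGTH bytes, or
 * AGX_SAMPLER_LENGTH + AGX_BORDER_LENGTH when the stage uses custom border
 * colours; the reserved txf sampler in slot 0 uses the same stride.
 */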
2926
2927 static void
2928 agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2929 {
2930 struct agx_context *ctx = batch->ctx;
2931 if (!cs)
2932 return;
2933
2934 enum pipe_shader_type stage = cs->stage;
2935 if (!ctx->stage[stage].dirty)
2936 return;
2937
2938 if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_CONST)
2939 agx_set_cbuf_uniforms(batch, stage);
2940
2941 if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SSBO)
2942 agx_set_ssbo_uniforms(batch, stage);
2943
2944 if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE)
2945 agx_upload_textures(batch, cs, stage);
2946
2947 if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2948 agx_set_sampler_uniforms(batch, stage);
2949
2950 if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2951 agx_upload_samplers(batch, cs, stage);
2952
2953 struct agx_stage_uniforms *unif = &batch->stage_uniforms[stage];
2954
2955 batch->uniforms.tables[AGX_SYSVAL_STAGE(stage)] =
2956 agx_pool_upload_aligned(&batch->pool, unif, sizeof(*unif), 16);
2957 }
2958
2959 static uint32_t
2960 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
2961 struct agx_linked_shader *linked,
2962 enum pipe_shader_type phys_stage,
2963 unsigned variable_shared_mem, size_t max_subgroups)
2964 {
2965 struct agx_context *ctx = batch->ctx;
2966 struct agx_device *dev = agx_device(ctx->base.screen);
2967 unsigned constant_push_ranges = DIV_ROUND_UP(cs->b.info.rodata.size_16, 64);
2968
2969 size_t usc_size =
2970 agx_usc_size(constant_push_ranges + cs->push_range_count + 2);
2971
2972 struct agx_ptr t =
2973 agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
2974
2975 struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
2976
2977 enum pipe_shader_type stage = cs->stage;
2978
2979 if (batch->texture_count[stage]) {
2980 agx_usc_pack(&b, TEXTURE, cfg) {
2981 cfg.start = 0;
2982 cfg.count =
2983 MIN2(batch->texture_count[stage], AGX_NUM_TEXTURE_STATE_REGS);
2984 cfg.buffer = batch->stage_uniforms[stage].texture_base;
2985 }
2986 }
2987
2988 if (batch->sampler_count[stage]) {
2989 agx_usc_pack(&b, SAMPLER, cfg) {
2990 cfg.start = 0;
2991 cfg.count = batch->sampler_count[stage];
2992 cfg.buffer = batch->samplers[stage];
2993 }
2994 }
2995
2996 for (unsigned i = 0; i < cs->push_range_count; ++i) {
2997 unsigned table = cs->push[i].table;
2998 uint64_t table_ptr = batch->uniforms.tables[table];
2999
3000 /* Params may be omitted if the VS prolog does not read them, but the
3001 * reservation is always there in the API shader just in case.
3002 */
3003 if (table == AGX_SYSVAL_TABLE_PARAMS && !table_ptr)
3004 continue;
3005
3006 assert(table_ptr);
3007
3008 agx_usc_uniform(&b, cs->push[i].uniform, cs->push[i].length,
3009 table_ptr + cs->push[i].offset);
3010 }
3011
3012 if (cs->bo) {
3013 agx_usc_immediates(&b, &cs->b.info.rodata, cs->bo->va->addr);
3014 }
3015
3016 uint32_t max_scratch_size =
3017 MAX2(cs->b.info.scratch_size, cs->b.info.preamble_scratch_size);
3018
3019 if (max_scratch_size > 0) {
3020 unsigned preamble_size = (cs->b.info.preamble_scratch_size > 0) ? 1 : 0;
3021
3022 switch (phys_stage) {
3023 case PIPE_SHADER_FRAGMENT:
3024 agx_scratch_alloc(&ctx->scratch_fs, max_scratch_size, max_subgroups);
3025 batch->fs_scratch = true;
3026 batch->fs_preamble_scratch =
3027 MAX2(batch->fs_preamble_scratch, preamble_size);
3028 break;
3029 case PIPE_SHADER_VERTEX:
3030 agx_scratch_alloc(&ctx->scratch_vs, max_scratch_size, max_subgroups);
3031 batch->vs_scratch = true;
3032 batch->vs_preamble_scratch =
3033 MAX2(batch->vs_preamble_scratch, preamble_size);
3034 break;
3035 default:
3036 agx_scratch_alloc(&ctx->scratch_cs, max_scratch_size, max_subgroups);
3037 batch->cs_scratch = true;
3038 batch->cs_preamble_scratch =
3039 MAX2(batch->cs_preamble_scratch, preamble_size);
3040 break;
3041 }
3042 }
3043
3044 if (stage == PIPE_SHADER_FRAGMENT) {
3045 agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
3046 } else {
3047 agx_usc_shared_non_fragment(&b, &cs->b.info, variable_shared_mem);
3048 }
3049
3050 if (linked) {
3051 agx_usc_push_packed(&b, SHADER, linked->shader);
3052 agx_usc_push_packed(&b, REGISTERS, linked->regs);
3053
3054 if (stage == PIPE_SHADER_FRAGMENT)
3055 agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, linked->fragment_props);
3056 } else {
3057 agx_usc_pack(&b, SHADER, cfg) {
3058 cfg.code =
3059 agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.main_offset);
3060 cfg.unk_2 = 3;
3061 }
3062
3063 agx_usc_pack(&b, REGISTERS, cfg) {
3064 cfg.register_count = cs->b.info.nr_gprs;
3065 cfg.spill_size = cs->b.info.scratch_size
3066 ? agx_scratch_get_bucket(cs->b.info.scratch_size)
3067 : 0;
3068 }
3069 }
3070
3071 if (cs->b.info.has_preamble) {
3072 agx_usc_pack(&b, PRESHADER, cfg) {
3073 cfg.code =
3074 agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.preamble_offset);
3075 }
3076 } else {
3077 agx_usc_pack(&b, NO_PRESHADER, cfg)
3078 ;
3079 }
3080
3081 return agx_usc_addr(dev, t.gpu);
3082 }
3083
3084 static void
3085 agx_launch_internal(struct agx_batch *batch, struct agx_grid grid,
3086 struct agx_workgroup wg,
3087 struct agx_cdm_launch_word_0_packed launch,
3088 enum pipe_shader_type stage, uint32_t usc)
3089 {
3090 struct agx_context *ctx = batch->ctx;
3091 struct agx_device *dev = agx_device(ctx->base.screen);
3092
3093 /* TODO: Ensure space if we allow multiple kernels in a batch */
3094 uint32_t *out = (uint32_t *)batch->cdm.current;
3095
3096 out = agx_cdm_launch(out, dev->chip, grid, wg, launch, usc);
3097 out = agx_cdm_barrier(out, dev->chip);
3098
3099 batch->cdm.current = (void *)out;
3100 assert(batch->cdm.current <= batch->cdm.end &&
3101 "Failed to reserve sufficient space in encoder");
3102 }
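/* agx_launch_internal writes the launch and barrier control words directly
 * into the batch's CDM stream; callers are responsible for having reserved
 * enough encoder space, which the assert above checks after the fact.
 */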
3103
3104 void
3105 agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
3106 enum agx_barrier barrier, enum libagx_program program,
3107 void *args, size_t arg_size)
3108 {
3109 struct agx_device *dev = agx_device(batch->ctx->base.screen);
3110 struct agx_precompiled_shader *cs =
3111 agx_get_precompiled(&batch->ctx->bg_eot, program);
3112
3113 struct agx_ptr t =
3114 agx_pool_alloc_aligned(&batch->pipeline_pool, agx_usc_size(15), 64);
3115
3116 uint64_t uploaded_data =
3117 agx_pool_upload_aligned(&batch->pool, args, arg_size, 4);
3118
3119 uint32_t usc = agx_usc_addr(dev, t.gpu);
3120 agx_usc_words_precomp(t.cpu, &cs->b, uploaded_data, arg_size);
3121
3122 agx_batch_add_bo(batch, cs->bo);
3123 agx_launch_internal(batch, grid, cs->b.workgroup, cs->b.launch,
3124 PIPE_SHADER_COMPUTE, usc);
3125 }
3126
3127 struct asahi_bg_eot
3128 agx_build_bg_eot(struct agx_batch *batch, bool store, bool partial_render)
3129 {
3130 struct agx_context *ctx = batch->ctx;
3131
3132 /* Construct the key */
3133 struct agx_bg_eot_key key = {.tib = batch->tilebuffer_layout};
3134
3135 bool needs_textures_for_spilled_rts =
3136 agx_tilebuffer_spills(&batch->tilebuffer_layout) && !partial_render &&
3137 !store;
3138
3139 for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3140 struct pipe_surface *surf = batch->key.cbufs[rt];
3141
3142 if (surf == NULL)
3143 continue;
3144
3145 if (store) {
3146 /* TODO: Suppress stores to discarded render targets */
3147 key.op[rt] = AGX_EOT_STORE;
3148 } else if (batch->tilebuffer_layout.spilled[rt] && partial_render) {
3149 /* Partial render programs exist only to store/load the tilebuffer to
3150 * main memory. When render targets are already spilled to main memory,
3151 * there's nothing to do.
3152 */
3153 key.op[rt] = AGX_BG_EOT_NONE;
3154 } else {
3155 bool valid = (batch->load & (PIPE_CLEAR_COLOR0 << rt));
3156 bool clear = (batch->clear & (PIPE_CLEAR_COLOR0 << rt));
3157 bool load = valid && !clear;
3158
3159 /* Don't read back spilled render targets, they're already in memory */
3160 load &= !batch->tilebuffer_layout.spilled[rt];
3161
3162 /* The background program used for partial renders must always load
3163 * whatever was stored in the mid-frame end-of-tile program.
3164 */
3165 load |= partial_render;
3166
3167 key.op[rt] = load ? AGX_BG_LOAD
3168 : clear ? AGX_BG_CLEAR
3169 : AGX_BG_EOT_NONE;
3170 }
3171 }
3172
3173 /* Begin building the pipeline */
3174 size_t usc_size = agx_usc_size(3 + PIPE_MAX_COLOR_BUFS);
3175 struct agx_ptr t =
3176 agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
3177 struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
3178
3179 bool needs_sampler = false;
3180 unsigned uniforms = 0;
3181 unsigned nr_tex = 0;
3182
3183 for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3184 if (key.op[rt] == AGX_BG_LOAD) {
3185 /* Each reloaded render target is textured */
3186 needs_sampler = true;
3187
3188 /* Will be uploaded later, this would be clobbered */
3189 if (needs_textures_for_spilled_rts)
3190 continue;
3191
3192 struct agx_ptr texture =
3193 agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
3194 struct pipe_surface *surf = batch->key.cbufs[rt];
3195 assert(surf != NULL && "cannot load nonexistent attachment");
3196
3197 struct agx_resource *rsrc = agx_resource(surf->texture);
3198 struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
3199
3200 agx_pack_texture(texture.cpu, rsrc, surf->format, &sampler_view);
3201
3202 agx_usc_pack(&b, TEXTURE, cfg) {
3203 /* Shifted to match eMRT indexing, could be optimized */
3204 cfg.start = rt * 2;
3205 cfg.count = 1;
3206 cfg.buffer = texture.gpu;
3207 }
3208
3209 nr_tex = (rt * 2) + 1;
3210 } else if (key.op[rt] == AGX_BG_CLEAR) {
3211 assert(batch->uploaded_clear_color[rt] && "set when cleared");
3212 agx_usc_uniform(&b, 4 + (8 * rt), 8, batch->uploaded_clear_color[rt]);
3213 uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
3214 } else if (key.op[rt] == AGX_EOT_STORE) {
3215 struct pipe_image_view view =
3216 image_view_for_surface(batch->key.cbufs[rt]);
3217 struct agx_ptr pbe =
3218 agx_pool_alloc_aligned(&batch->pool, AGX_PBE_LENGTH, 256);
3219
3220 /* The tilebuffer is already in sRGB space if needed. Do not convert */
3221 view.format = util_format_linear(view.format);
3222
3223 bool no_compress = batch->feedback & (PIPE_CLEAR_COLOR0 << rt);
3224 agx_batch_upload_pbe(batch, pbe.cpu, &view, true, true, false,
3225 no_compress);
3226
3227 agx_usc_pack(&b, TEXTURE, cfg) {
3228 cfg.start = rt;
3229 cfg.count = 1;
3230 cfg.buffer = pbe.gpu;
3231 }
3232
3233 nr_tex = rt + 1;
3234 }
3235 }
3236
3237 if (needs_textures_for_spilled_rts) {
3238 /* Upload texture/PBE descriptors for each render target so we can clear
3239 * spilled render targets.
3240 */
3241 struct agx_ptr descs = agx_pool_alloc_aligned(
3242 &batch->pool, AGX_TEXTURE_LENGTH * 2 * batch->key.nr_cbufs, 64);
3243 agx_upload_spilled_rt_descriptors(descs.cpu, batch);
3244
3245 agx_usc_pack(&b, TEXTURE, cfg) {
3246 cfg.start = 0;
3247 cfg.count = 2 * batch->key.nr_cbufs;
3248 cfg.buffer = descs.gpu;
3249 }
3250
3251 nr_tex = MAX2(nr_tex, 2 * batch->key.nr_cbufs);
3252
3253 /* Bind the base as u0_u1 for bindless access */
3254 agx_usc_uniform(&b, 0, 4,
3255 agx_pool_upload_aligned(&batch->pool, &descs.gpu, 8, 8));
3256 uniforms = MAX2(uniforms, 4);
3257 }
3258
3259 /* All render targets share a sampler */
3260 if (needs_sampler) {
3261 struct agx_ptr sampler =
3262 agx_pool_alloc_aligned(&batch->pool, AGX_SAMPLER_LENGTH, 64);
3263
3264 agx_pack(sampler.cpu, SAMPLER, cfg) {
3265 cfg.minimum_lod = 0.0f;
3266 cfg.maximum_lod = INFINITY;
3267 cfg.magnify = AGX_FILTER_LINEAR;
3268 cfg.minify = AGX_FILTER_NEAREST;
3269 cfg.mip_filter = AGX_MIP_FILTER_NONE;
3270 cfg.wrap_s = AGX_WRAP_CLAMP_TO_EDGE;
3271 cfg.wrap_t = AGX_WRAP_CLAMP_TO_EDGE;
3272 cfg.wrap_r = AGX_WRAP_CLAMP_TO_EDGE;
3273 cfg.pixel_coordinates = true;
3274 cfg.compare_func = AGX_COMPARE_FUNC_ALWAYS;
3275 }
3276
3277 agx_usc_pack(&b, SAMPLER, cfg) {
3278 cfg.start = 0;
3279 cfg.count = 1;
3280 cfg.buffer = sampler.gpu;
3281 }
3282 }
3283
3284 agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
3285
3286 /* Get the shader */
3287 key.reserved_preamble = uniforms;
3288 struct agx_device *dev = agx_device(ctx->base.screen);
3289 struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&ctx->bg_eot, &key);
3290 agx_batch_add_bo(batch, shader->bo);
3291 assert(shader->info.rodata.size_16 == 0);
3292
3293 agx_usc_pack(&b, SHADER, cfg) {
3294 cfg.code = agx_usc_addr(dev, shader->ptr + shader->info.main_offset);
3295 cfg.unk_2 = 0;
3296 }
3297
3298 agx_usc_pack(&b, REGISTERS, cfg)
3299 cfg.register_count = shader->info.nr_gprs;
3300
3301 if (shader->info.has_preamble) {
3302 agx_usc_pack(&b, PRESHADER, cfg) {
3303 cfg.code =
3304 agx_usc_addr(dev, shader->ptr + shader->info.preamble_offset);
3305 }
3306 } else {
3307 agx_usc_pack(&b, NO_PRESHADER, cfg)
3308 ;
3309 }
3310
3311 struct asahi_bg_eot ret = {.usc = t.gpu};
3312
3313 agx_pack(&ret.counts, COUNTS, cfg) {
3314 cfg.uniform_register_count = shader->info.push_count;
3315 cfg.preshader_register_count = shader->info.nr_preamble_gprs;
3316 cfg.texture_state_register_count = nr_tex;
3317 cfg.sampler_state_register_count =
3318 agx_translate_sampler_state_count(needs_sampler ? 1 : 0, false);
3319
3320 if (!store)
3321 cfg.unknown = 0xFFFF;
3322 }
3323
3324 return ret;
3325 }
3326
3327 /*
3328 * Return the standard sample positions, packed into a 32-bit word with fixed
3329 * point nibbles for each x/y component of the (at most 4) samples. This is
3330 * suitable for programming the PPP_MULTISAMPLECTL control register.
3331 */
3332 static uint32_t
3333 agx_default_sample_positions(unsigned nr_samples)
3334 {
3335 switch (nr_samples) {
3336 case 1:
3337 return 0x88;
3338 case 2:
3339 return 0x44cc;
3340 case 4:
3341 return 0xeaa26e26;
3342 default:
3343 unreachable("Invalid sample count");
3344 }
3345 }
3346
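/*
 * One-time per-batch initialization: emit barriers and baseline PPP state,
 * pick the tilebuffer layout for the framebuffer, decompress spilled render
 * targets, and record resource writes for the depth/stencil and colour
 * attachments.
 */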
3347 void
3348 agx_batch_init_state(struct agx_batch *batch)
3349 {
3350 if (batch->initialized)
3351 return;
3352
3353 if (agx_batch_is_compute(batch)) {
3354 batch->initialized = true;
3355
3356 struct agx_context *ctx = batch->ctx;
3357 struct agx_device *dev = agx_device(ctx->base.screen);
3358 uint8_t *out = batch->cdm.current;
3359
3360 /* See below */
3361 agx_push(out, CDM_BARRIER, cfg) {
3362 cfg.usc_cache_inval = true;
3363 cfg.unk_5 = true;
3364 cfg.unk_6 = true;
3365 cfg.unk_8 = true;
3366 // cfg.unk_11 = true;
3367 // cfg.unk_20 = true;
3368 if (dev->params.num_clusters_total > 1) {
3369 // cfg.unk_24 = true;
3370 if (dev->params.gpu_generation == 13) {
3371 cfg.unk_4 = true;
3372 // cfg.unk_26 = true;
3373 }
3374 }
3375 }
3376
3377 return;
3378 }
3379
3380 /* Emit state on the batch that we don't change and so don't dirty track */
3381 uint8_t *out = batch->vdm.current;
3382
3383 /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
3384 * with another that caused stale data to be cached and the CPU wrote to it
3385 * in the meantime.
3386 */
3387 agx_push(out, VDM_BARRIER, cfg) {
3388 cfg.usc_cache_inval = true;
3389 }
3390
3391 struct AGX_PPP_HEADER present = {
3392 .w_clamp = true,
3393 .occlusion_query_2 = true,
3394 .output_unknown = true,
3395 .varying_word_2 = true,
3396 .viewport_count = 1, /* irrelevant */
3397 };
3398
3399 size_t size = agx_ppp_update_size(&present);
3400 struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3401 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
3402
3403 /* clang-format off */
3404 agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
3405 agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
3406 agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
3407 agx_ppp_push(&ppp, VARYING_2, cfg);
3408 /* clang-format on */
3409
3410 agx_ppp_fini(&out, &ppp);
3411 batch->vdm.current = out;
3412
3413 /* Mark it as initialized now, since agx_batch_writes() will check this. */
3414 batch->initialized = true;
3415
3416 /* Choose a tilebuffer layout given the framebuffer key */
3417 enum pipe_format formats[PIPE_MAX_COLOR_BUFS] = {0};
3418 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3419 struct pipe_surface *surf = batch->key.cbufs[i];
3420 if (surf)
3421 formats[i] = surf->format;
3422 }
3423
3424 batch->tilebuffer_layout = agx_build_tilebuffer_layout(
3425 formats, batch->key.nr_cbufs,
3426 util_framebuffer_get_num_samples(&batch->key),
3427 util_framebuffer_get_num_layers(&batch->key) > 1);
3428
3429 if (agx_device(batch->ctx->base.screen)->debug & AGX_DBG_SMALLTILE)
3430 batch->tilebuffer_layout.tile_size = (struct agx_tile_size){16, 16};
3431
3432 /* If the layout spilled render targets, we need to decompress those render
3433 * targets to ensure we can write to them.
3434 */
3435 if (agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
3436 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3437 if (!batch->tilebuffer_layout.spilled[i])
3438 continue;
3439
3440 struct pipe_surface *surf = batch->key.cbufs[i];
3441 if (!surf)
3442 continue;
3443
3444 struct agx_resource *rsrc = agx_resource(surf->texture);
3445 struct ail_layout *layout = &rsrc->layout;
3446 unsigned level = surf->u.tex.level;
3447
3448 if (!ail_is_level_compressed(layout, level))
3449 continue;
3450
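/* Note: the `true ||` below forces the in-place decompression path for now */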
3451 if (true || (rsrc->base.bind & PIPE_BIND_SHARED)) {
3452 agx_decompress_inplace(batch, surf, "Render target spilled");
3453 } else {
3454 agx_decompress(batch->ctx, rsrc, "Render target spilled");
3455 }
3456 }
3457 }
3458
3459 if (batch->key.zsbuf) {
3460 unsigned level = batch->key.zsbuf->u.tex.level;
3461 struct agx_resource *rsrc = agx_resource(batch->key.zsbuf->texture);
3462
3463 agx_batch_writes(batch, rsrc, level);
3464
3465 if (rsrc->separate_stencil)
3466 agx_batch_writes(batch, rsrc->separate_stencil, level);
3467 }
3468
3469 for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3470 if (batch->key.cbufs[i]) {
3471 struct agx_resource *rsrc = agx_resource(batch->key.cbufs[i]->texture);
3472 unsigned level = batch->key.cbufs[i]->u.tex.level;
3473
3474 if (agx_resource_valid(rsrc, level))
3475 batch->load |= PIPE_CLEAR_COLOR0 << i;
3476
3477 agx_batch_writes(batch, rsrc, level);
3478 assert(agx_resource_valid(rsrc, level));
3479 }
3480 }
3481
3482 /* Set up standard sample positions */
3483 batch->uniforms.ppp_multisamplectl =
3484 agx_default_sample_positions(batch->tilebuffer_layout.nr_samples);
3485 }
3486
3487 static enum agx_object_type
3488 agx_point_object_type(struct agx_rasterizer *rast)
3489 {
3490 return (rast->base.sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT)
3491 ? AGX_OBJECT_TYPE_POINT_SPRITE_UV01
3492 : AGX_OBJECT_TYPE_POINT_SPRITE_UV10;
3493 }
3494
3495 #define MAX_PPP_UPDATES 2
3496 #define IS_DIRTY(ST) !!(ctx->dirty & AGX_DIRTY_##ST)
3497
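/*
 * Encode dirty graphics state for a draw: VDM vertex shader words, viewport/
 * scissor/depth bias uploads, and a PPP update covering fragment, varying,
 * cull, and occlusion query state. Returns the advanced VDM write pointer.
 */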
3498 static uint8_t *
3499 agx_encode_state(struct agx_batch *batch, uint8_t *out)
3500 {
3501 struct agx_context *ctx = batch->ctx;
3502 struct agx_device *dev = agx_device(ctx->base.screen);
3503
3504 /* If nothing is dirty, encode nothing */
3505 if (!ctx->dirty)
3506 return out;
3507
3508 struct agx_rasterizer *rast = ctx->rast;
3509 unsigned ppp_updates = 0;
3510
3511 struct agx_compiled_shader *vs = ctx->vs;
3512 if (ctx->gs)
3513 vs = ctx->gs->gs_copy;
3514
3515 bool varyings_dirty = false;
3516
3517 if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) ||
3518 IS_DIRTY(PRIM)) {
3519
3520 unsigned bindings = ctx->linked.fs->cf.nr_bindings;
3521 if (bindings) {
3522 size_t linkage_size =
3523 AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
3524
3525 struct agx_ptr t =
3526 agx_pool_alloc_aligned(&batch->pipeline_pool, linkage_size, 16);
3527
3528 agx_link_varyings_vs_fs(t.cpu, &batch->linked_varyings,
3529 vs->uvs.user_size, &ctx->linked.fs->cf,
3530 ctx->rast->base.flatshade_first ? 0 : 2,
3531 (batch->reduced_prim == MESA_PRIM_POINTS)
3532 ? ctx->rast->base.sprite_coord_enable
3533 : 0,
3534 &batch->generate_primitive_id);
3535
3536 batch->varyings = agx_usc_addr(dev, t.gpu);
3537 } else {
3538 batch->varyings = 0;
3539 }
3540
3541 varyings_dirty = true;
3542 ppp_updates++;
3543 }
3544
3545 if (IS_DIRTY(VS) || varyings_dirty) {
3546 agx_push(out, VDM_STATE, cfg) {
3547 cfg.vertex_shader_word_0_present = true;
3548 cfg.vertex_shader_word_1_present = true;
3549 cfg.vertex_outputs_present = true;
3550 cfg.vertex_unknown_present = true;
3551 }
3552
3553 agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_0, cfg) {
3554 cfg.uniform_register_count = vs->b.info.push_count;
3555 cfg.preshader_register_count = vs->b.info.nr_preamble_gprs;
3556 cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, vs);
3557 cfg.sampler_state_register_count =
3558 translate_sampler_state_count(ctx, vs->stage);
3559 }
3560
3561 agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3562 cfg.pipeline =
3563 agx_build_pipeline(batch, vs, ctx->gs ? NULL : ctx->linked.vs,
3564 PIPE_SHADER_VERTEX, 0, 0);
3565 }
3566
3567 agx_push_packed(out, vs->uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3568
3569 agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3570 cfg.flat_shading_control = ctx->rast->base.flatshade_first
3571 ? AGX_VDM_VERTEX_0
3572 : AGX_VDM_VERTEX_2;
3573 cfg.unknown_4 = cfg.unknown_5 = ctx->rast->base.rasterizer_discard;
3574
3575 cfg.generate_primitive_id = batch->generate_primitive_id;
3576 }
3577
3578 /* Pad up to a multiple of 8 bytes */
3579 memset(out, 0, 4);
3580 out += 4;
3581 }
3582
3583 struct agx_pool *pool = &batch->pool;
3584
3585 if ((ctx->dirty & AGX_DIRTY_RS) && ctx->rast->depth_bias) {
3586 agx_upload_depth_bias(batch, &ctx->rast->base);
3587 ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
3588 }
3589
3590 if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS |
3591 AGX_DIRTY_RS | AGX_DIRTY_VS)) {
3592
3593 agx_upload_viewport_scissor(pool, batch, &out, ctx->viewport,
3594 ctx->rast->base.scissor ? ctx->scissor : NULL,
3595 ctx->rast->base.clip_halfz,
3596 vs->b.info.nonzero_viewport);
3597 }
3598
3599 bool is_points = batch->reduced_prim == MESA_PRIM_POINTS;
3600 bool is_lines = batch->reduced_prim == MESA_PRIM_LINES;
3601
3602 bool object_type_dirty =
3603 IS_DIRTY(PRIM) || (is_points && IS_DIRTY(SPRITE_COORD_MODE));
3604
3605 bool fragment_face_dirty =
3606 IS_DIRTY(ZS) || IS_DIRTY(STENCIL_REF) || IS_DIRTY(RS);
3607
3608 enum agx_object_type object_type = is_points ? agx_point_object_type(rast)
3609 : is_lines ? AGX_OBJECT_TYPE_LINE
3610 : AGX_OBJECT_TYPE_TRIANGLE;
3611
3612 struct AGX_PPP_HEADER dirty = {
3613 .fragment_control =
3614 IS_DIRTY(ZS) || IS_DIRTY(RS) || IS_DIRTY(PRIM) || IS_DIRTY(QUERY),
3615 .fragment_control_2 = IS_DIRTY(FS_PROG) || IS_DIRTY(RS),
3616 .fragment_front_face = fragment_face_dirty,
3617 .fragment_front_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3618 .fragment_front_stencil = IS_DIRTY(ZS),
3619 .fragment_back_face = fragment_face_dirty,
3620 .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3621 .fragment_back_stencil = IS_DIRTY(ZS),
3622 .output_select = varyings_dirty,
3623 .varying_counts_32 = varyings_dirty,
3624 .varying_counts_16 = varyings_dirty,
3625 /* Also dirty with tess but agx_draw_patches dirties RS for that */
3626 .cull = IS_DIRTY(RS),
3627 .cull_2 = varyings_dirty,
3628 .fragment_shader =
3629 IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
3630 .occlusion_query = IS_DIRTY(QUERY),
3631 .output_size = IS_DIRTY(VS_PROG),
3632 .viewport_count = 1, /* irrelevant */
3633 };
3634
3635 size_t size = agx_ppp_update_size(&dirty);
3636 struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3637 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
3638
3639 if (dirty.fragment_control) {
3640 agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
3641 if (ctx->active_queries && ctx->occlusion_query) {
3642 if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3643 cfg.visibility_mode = AGX_VISIBILITY_MODE_COUNTING;
3644 else
3645 cfg.visibility_mode = AGX_VISIBILITY_MODE_BOOLEAN;
3646 }
3647
3648 cfg.stencil_test_enable = ctx->zs->base.stencil[0].enabled;
3649 cfg.two_sided_stencil = ctx->zs->base.stencil[1].enabled;
3650 cfg.depth_bias_enable =
3651 rast->depth_bias && object_type == AGX_OBJECT_TYPE_TRIANGLE;
3652
3653 /* Always enable scissoring so we may scissor to the viewport (TODO:
3654 * optimize this out if the viewport is the default and the app does
3655 * not use the scissor test)
3656 */
3657 cfg.scissor_enable = true;
3658
3659 /* This avoids broken derivatives along primitive edges */
3660 cfg.disable_tri_merging = is_lines || is_points;
3661 }
3662 }
3663
3664 if (dirty.fragment_control_2) {
3665 /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the
3666 * main fragment control word and has to be combined into the secondary
3667 * word for reliable behaviour.
3668 */
3669 agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
3670 ctx->linked.fs->fragment_control) {
3671 cfg.tag_write_disable = rast->base.rasterizer_discard;
3672 }
3673 }
3674
3675 if (dirty.fragment_front_face) {
3676 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3677 cfg.stencil_reference = ctx->stencil_ref.ref_value[0];
3678 cfg.line_width = rast->line_width;
3679 cfg.polygon_mode = rast->polygon_mode;
3680 }
3681 }
3682
3683 if (dirty.fragment_front_face_2)
3684 agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3685
3686 if (dirty.fragment_front_stencil) {
3687 agx_ppp_push_packed(&ppp, ctx->zs->front_stencil.opaque,
3688 FRAGMENT_STENCIL);
3689 }
3690
3691 if (dirty.fragment_back_face) {
3692 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3693 bool twosided = ctx->zs->base.stencil[1].enabled;
3694 cfg.stencil_reference = ctx->stencil_ref.ref_value[twosided ? 1 : 0];
3695 cfg.line_width = rast->line_width;
3696 cfg.polygon_mode = rast->polygon_mode;
3697 }
3698 }
3699
3700 if (dirty.fragment_back_face_2)
3701 agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3702
3703 if (dirty.fragment_back_stencil)
3704 agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL);
3705
3706 assert(dirty.varying_counts_32 == dirty.varying_counts_16);
3707 assert(dirty.varying_counts_32 == dirty.output_select);
3708
3709 if (dirty.output_select) {
3710 agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &vs->uvs.osel,
3711 &ctx->linked.fs->osel);
3712
3713 agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_32,
3714 VARYING_COUNTS);
3715
3716 agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_16,
3717 VARYING_COUNTS);
3718 }
3719
3720 if (dirty.cull) {
3721 agx_ppp_push_merged(&ppp, CULL, cfg, ctx->rast->cull) {
3722 cfg.front_face_ccw = ctx->rast->base.front_ccw;
3723
3724 if (ctx->in_tess && !ctx->gs) {
3725 /* Yes, OpenGL is backwards. Deal with it. */
3726 cfg.front_face_ccw ^=
3727 !ctx->stage[MESA_SHADER_TESS_EVAL].shader->tess.ccw;
3728 }
3729 }
3730 }
3731
3732 if (dirty.cull_2) {
3733 agx_ppp_push(&ppp, CULL_2, cfg) {
3734 cfg.needs_primitive_id = batch->generate_primitive_id;
3735 cfg.clamp_w = true;
3736 }
3737 }
3738
3739 if (dirty.fragment_shader) {
3740 unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
3741
3742 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_0, cfg) {
3743 cfg.uniform_register_count = ctx->fs->b.info.push_count;
3744 cfg.preshader_register_count = ctx->fs->b.info.nr_preamble_gprs;
3745 cfg.texture_state_register_count =
3746 agx_nr_tex_descriptors(batch, ctx->fs);
3747 cfg.sampler_state_register_count =
3748 translate_sampler_state_count(ctx, PIPE_SHADER_FRAGMENT);
3749 cfg.cf_binding_count = ctx->linked.fs->cf.nr_bindings;
3750 }
3751
3752 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
3753 cfg.pipeline = agx_build_pipeline(batch, ctx->fs, ctx->linked.fs,
3754 PIPE_SHADER_FRAGMENT, 0, 0);
3755 }
3756
3757 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
3758 cfg.cf_bindings = batch->varyings;
3759 }
3760
3761 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) {
3762 /* XXX: This is wrong */
3763 cfg.unknown = frag_tex_count >= 4;
3764 }
3765 }
3766
3767 if (dirty.occlusion_query) {
3768 agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
3769 if (ctx->active_queries && ctx->occlusion_query) {
3770 cfg.index = agx_get_oq_index(batch, ctx->occlusion_query);
3771 }
3772 }
3773 }
3774
3775 if (dirty.output_size) {
3776 agx_ppp_push(&ppp, OUTPUT_SIZE, cfg)
3777 cfg.count = vs->uvs.size;
3778 }
3779
3780 agx_ppp_fini(&out, &ppp);
3781 ppp_updates++;
3782
3783 assert(ppp_updates <= MAX_PPP_UPDATES);
3784 return out;
3785 }
3786
3787 static enum agx_primitive
3788 agx_primitive_for_pipe(enum mesa_prim mode)
3789 {
3790 switch (mode) {
3791 case MESA_PRIM_POINTS:
3792 return AGX_PRIMITIVE_POINTS;
3793 case MESA_PRIM_LINES:
3794 return AGX_PRIMITIVE_LINES;
3795 case MESA_PRIM_LINE_STRIP:
3796 return AGX_PRIMITIVE_LINE_STRIP;
3797 case MESA_PRIM_LINE_LOOP:
3798 return AGX_PRIMITIVE_LINE_LOOP;
3799 case MESA_PRIM_TRIANGLES:
3800 return AGX_PRIMITIVE_TRIANGLES;
3801 case MESA_PRIM_TRIANGLE_STRIP:
3802 return AGX_PRIMITIVE_TRIANGLE_STRIP;
3803 case MESA_PRIM_TRIANGLE_FAN:
3804 return AGX_PRIMITIVE_TRIANGLE_FAN;
3805 case MESA_PRIM_QUADS:
3806 return AGX_PRIMITIVE_QUADS;
3807 case MESA_PRIM_QUAD_STRIP:
3808 return AGX_PRIMITIVE_QUAD_STRIP;
3809 default:
3810 unreachable("todo: other primitive types");
3811 }
3812 }
3813
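/* Return the GPU address of an index buffer resource, recording the read on
 * the batch and reporting the extent in bytes (padded to 4 bytes).
 */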
3814 static uint64_t
3815 agx_index_buffer_rsrc_ptr(struct agx_batch *batch,
3816 const struct pipe_draw_info *info, size_t *extent)
3817 {
3818 assert(!info->has_user_indices && "cannot use user pointers with indirect");
3819
3820 struct agx_resource *rsrc = agx_resource(info->index.resource);
3821 agx_batch_reads(batch, rsrc);
3822
3823 *extent = ALIGN_POT(rsrc->layout.size_B, 4);
3824 return rsrc->bo->va->addr;
3825 }
3826
3827 static uint64_t
3828 agx_index_buffer_direct_ptr(struct agx_batch *batch,
3829 const struct pipe_draw_start_count_bias *draw,
3830 const struct pipe_draw_info *info, size_t *extent)
3831 {
3832 off_t offset = draw->start * info->index_size;
3833 uint32_t max_extent = draw->count * info->index_size;
3834
3835 if (!info->has_user_indices) {
3836 uint64_t base = agx_index_buffer_rsrc_ptr(batch, info, extent);
3837
3838 *extent = ALIGN_POT(MIN2(*extent - offset, max_extent), 4);
3839 return base + offset;
3840 } else {
3841 *extent = ALIGN_POT(max_extent, 4);
3842
3843 return agx_pool_upload_aligned(&batch->pool,
3844 ((uint8_t *)info->index.user) + offset,
3845 draw->count * info->index_size, 64);
3846 }
3847 }
3848
3849 static uint64_t
3850 agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
3851 const struct pipe_draw_start_count_bias *draw,
3852 size_t *extent)
3853 {
3854 if (draw)
3855 return agx_index_buffer_direct_ptr(batch, draw, info, extent);
3856 else
3857 return agx_index_buffer_rsrc_ptr(batch, info, extent);
3858 }
3859
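/* Ensure the VDM/CDM control stream has room for `space` bytes, chaining to a
 * freshly allocated buffer with a stream link if it does not.
 */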
3860 static void
3861 agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc,
3862 size_t space)
3863 {
3864 bool vdm = enc == &batch->vdm;
3865 assert(vdm || (enc == &batch->cdm));
3866
3867 size_t link_length =
3868 vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;
3869
3870 /* Assert that we have space for a link tag */
3871 assert((enc->current + link_length) <= enc->end && "Encoder overflowed");
3872
3873 /* Always leave room for a link tag, in case we run out of space later,
3874 * plus padding because VDM apparently overreads?
3875 *
3876 * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
3877 */
3878 space += link_length + 0x800;
3879
3880 /* If there is room in the command buffer, we're done */
3881 if (likely((enc->end - enc->current) >= space))
3882 return;
3883
3884 /* Otherwise, we need to allocate a new command buffer. We use memory owned
3885 * by the batch to simplify lifetime management for the BO.
3886 */
3887 size_t size = 65536;
3888 struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 256);
3889
3890 /* Jump from the old command buffer to the new command buffer */
3891 agx_cs_jump((uint32_t *)enc->current, T.gpu, vdm);
3892
3893 /* Swap out the command buffer */
3894 enc->current = T.cpu;
3895 enc->end = enc->current + size;
3896 }
3897
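/* Update input assembly and vertex shader pipeline statistics on the GPU by
 * launching a libagx kernel, with a slower variant when primitive restart is
 * enabled.
 */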
3898 static void
3899 agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
3900 uint64_t draw, uint64_t ib, uint64_t ib_range_el)
3901 {
3902 struct agx_context *ctx = batch->ctx;
3903 struct agx_device *dev = agx_device(ctx->base.screen);
3904
3905 if (!batch->cdm.bo) {
3906 batch->cdm = agx_encoder_allocate(batch, dev);
3907 }
3908
3909 uint64_t ia_vertices = agx_get_query_address(
3910 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]);
3911
3912 uint64_t ia_primitives = agx_get_query_address(
3913 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES]);
3914
3915 uint64_t vs_invocations = agx_get_query_address(
3916 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]);
3917
3918 uint64_t c_prims = agx_get_query_address(
3919 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]);
3920
3921 uint64_t c_invs = agx_get_query_address(
3922 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]);
3923
3924 /* With a geometry shader, clipper counters are written by the pre-GS kernel
3925 * since they depend on the output of the geometry shader. Without a geometry
3926 * shader, they are written along with IA.
3927 *
3928 * TODO: Broken tessellation interaction, but nobody cares.
3929 */
3930 if (ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
3931 c_prims = 0;
3932 c_invs = 0;
3933 }
3934
3935 if (info->primitive_restart) {
3936 perf_debug(dev, "Input assembly counters with primitive restart");
3937
3938 libagx_increment_ia_restart(
3939 batch, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_primitives,
3940 vs_invocations, c_prims, c_invs, draw, ib, ib_range_el,
3941 info->restart_index, info->index_size, info->mode);
3942 } else {
3943 perf_debug(dev, "Input assembly counters");
3944
3945 libagx_increment_ia(batch, agx_1d(1), AGX_BARRIER_ALL, ia_vertices,
3946 ia_primitives, vs_invocations, c_prims, c_invs, draw,
3947 info->mode);
3948 }
3949 }
3950
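/* Lazily allocate the context-wide geometry heap and upload the geometry
 * state descriptor for this batch, caching the GPU pointer on the batch.
 */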
3951 static uint64_t
3952 agx_batch_geometry_state(struct agx_batch *batch)
3953 {
3954 struct agx_context *ctx = batch->ctx;
3955
3956 if (!batch->geometry_state) {
3957 uint32_t size = 128 * 1024 * 1024;
3958
3959 if (!ctx->heap) {
3960 ctx->heap = pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL,
3961 PIPE_USAGE_DEFAULT, size);
3962 }
3963
3964 struct agx_geometry_state state = {
3965 .heap = agx_resource(ctx->heap)->bo->va->addr,
3966 .heap_size = size,
3967 };
3968
3969 agx_batch_writes(batch, agx_resource(ctx->heap), 0);
3970
3971 batch->geometry_state =
3972 agx_pool_upload_aligned(&batch->pool, &state, sizeof(state), 8);
3973 }
3974
3975 return batch->geometry_state;
3976 }
3977
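/*
 * Upload the geometry parameter buffer for a draw: input assembly state,
 * transform feedback targets and query addresses, and either CPU-computed
 * grids and buffers (direct draws) or placeholders filled in on the GPU
 * (indirect draws).
 */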
3978 static uint64_t
3979 agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
3980 size_t index_buffer_size_B,
3981 const struct pipe_draw_info *info,
3982 const struct pipe_draw_start_count_bias *draw,
3983 const struct pipe_draw_indirect_info *indirect)
3984 {
3985 struct agx_ia_state ia = {
3986 .index_buffer = input_index_buffer,
3987 .index_buffer_range_el = index_buffer_size_B / info->index_size,
3988 .verts_per_instance = draw ? draw->count : 0,
3989 };
3990
3991 batch->uniforms.input_assembly =
3992 agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
3993
3994 struct agx_geometry_params params = {
3995 .state = agx_batch_geometry_state(batch),
3996 .indirect_desc = batch->geom_indirect,
3997 .flat_outputs =
3998 batch->ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
3999 .input_topology = info->mode,
4000 };
4001
4002 for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->streamout.targets); ++i) {
4003 struct agx_streamout_target *so =
4004 agx_so_target(batch->ctx->streamout.targets[i]);
4005 struct agx_resource *rsrc = so ? agx_resource(so->offset) : NULL;
4006
4007 uint32_t size;
4008 params.xfb_base_original[i] = agx_batch_get_so_address(batch, i, &size);
4009 params.xfb_size[i] = size;
4010
4011 if (rsrc) {
4012 params.xfb_offs_ptrs[i] = rsrc->bo->va->addr;
4013 agx_batch_writes(batch, rsrc, 0);
4014 batch->incoherent_writes = true;
4015 } else {
4016 params.xfb_offs_ptrs[i] = 0;
4017 }
4018 }
4019
4020 for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->prims_generated); ++i) {
4021 params.prims_generated_counter[i] =
4022 agx_get_query_address(batch, batch->ctx->prims_generated[i]);
4023 }
4024
4025 for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_prims_generated); ++i) {
4026 params.xfb_prims_generated_counter[i] =
4027 agx_get_query_address(batch, batch->ctx->tf_prims_generated[i]);
4028 }
4029
4030 if (batch->ctx->active_queries && batch->ctx->streamout.num_targets > 0) {
4031 for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_overflow); ++i) {
4032 params.xfb_overflow[i] =
4033 agx_get_query_address(batch, batch->ctx->tf_overflow[i]);
4034 }
4035
4036 params.xfb_any_overflow =
4037 agx_get_query_address(batch, batch->ctx->tf_any_overflow);
4038 }
4039
4040 /* Calculate input primitive count for direct draws, and allocate the vertex
4041 * & count buffers. GPU calculates and allocates for indirect draws.
4042 */
4043 batch->uniforms.vertex_outputs = batch->ctx->vs->b.info.outputs;
4044 params.input_mask = batch->uniforms.vertex_outputs;
4045 params.count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
4046
4047 if (indirect) {
4048 batch->uniforms.vertex_output_buffer_ptr =
4049 agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4050
4051 params.vs_grid[2] = params.gs_grid[2] = 1;
4052 } else {
4053 params.vs_grid[0] = draw->count;
4054 params.gs_grid[0] =
4055 u_decomposed_prims_for_vertices(info->mode, draw->count);
4056
4057 params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
4058
4059 params.input_primitives = params.gs_grid[0] * info->instance_count;
4060
4061 unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
4062 batch->uniforms.vertex_outputs);
4063 unsigned size = params.input_primitives * params.count_buffer_stride;
4064
4065 if (size) {
4066 params.count_buffer =
4067 agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
4068 }
4069
4070 if (vb_size) {
4071 uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4072 batch->uniforms.vertex_output_buffer_ptr =
4073 agx_pool_upload(&batch->pool, &addr, 8);
4074
4075 params.input_buffer = addr;
4076 }
4077 }
4078
4079 return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params),
4080 8, &batch->geom_params_bo);
4081 }
4082
4083 static uint64_t
4084 agx_indirect_buffer_ptr(struct agx_batch *batch,
4085 const struct pipe_draw_indirect_info *indirect)
4086 {
4087 assert(indirect->buffer && "drawauto already handled");
4088
4089 struct agx_resource *rsrc = agx_resource(indirect->buffer);
4090 agx_batch_reads(batch, rsrc);
4091 return rsrc->bo->va->addr + indirect->offset;
4092 }
4093
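/*
 * Run the pre-rasterization part of a geometry shader pipeline as compute:
 * the vertex shader, an optional count shader followed by a prefix sum, the
 * pre-GS kernel, and finally the geometry shader itself.
 */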
4094 static void
4095 agx_launch_gs_prerast(struct agx_batch *batch,
4096 const struct pipe_draw_info *info,
4097 const struct pipe_draw_start_count_bias *draws,
4098 const struct pipe_draw_indirect_info *indirect)
4099 {
4100 struct agx_context *ctx = batch->ctx;
4101 struct agx_device *dev = agx_device(ctx->base.screen);
4102 struct agx_compiled_shader *gs = ctx->gs;
4103
4104 if (ctx->stage[PIPE_SHADER_GEOMETRY].shader->is_xfb_passthrough)
4105 perf_debug(dev, "Transform feedbck");
4106 else
4107 perf_debug(dev, "Geometry shader");
4108
4109 /* This is a graphics batch, so it may not have had a CDM encoder allocated
4110 * yet. Allocate that so we can start enqueueing compute work.
4111 */
4112 if (!batch->cdm.bo) {
4113 batch->cdm = agx_encoder_allocate(batch, dev);
4114 }
4115
4116 agx_ensure_cmdbuf_has_space(
4117 batch, &batch->cdm,
4118 8 * (AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
4119 AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
4120 AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
4121 AGX_CDM_BARRIER_LENGTH));
4122
4123 assert(!info->primitive_restart && "should have been lowered");
4124
4125 uint64_t gp = batch->uniforms.geometry_params;
4126 struct agx_grid grid_vs, grid_gs;
4127 struct agx_workgroup wg;
4128
4129 /* Setup grids */
4130 if (indirect) {
4131 uint64_t ib = 0;
4132 size_t ib_extent = 0;
4133
4134 if (info->index_size) {
4135 ib = agx_index_buffer_ptr(batch, info, indirect ? NULL : draws,
4136 &ib_extent);
4137 }
4138
4139 struct libagx_gs_setup_indirect_args gsi = {
4140 .index_buffer = ib,
4141 .index_buffer_range_el = ib_extent / info->index_size,
4142 .draw = agx_indirect_buffer_ptr(batch, indirect),
4143 .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr,
4144 .ia = batch->uniforms.input_assembly,
4145 .p = batch->uniforms.geometry_params,
4146 .vs_outputs = batch->uniforms.vertex_outputs,
4147 .index_size_B = info->index_size,
4148 .prim = info->mode,
4149 };
4150
4151 libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
4152
4153 wg = agx_workgroup(1, 1, 1);
4154 grid_vs =
4155 agx_grid_indirect(gp + offsetof(struct agx_geometry_params, vs_grid));
4156
4157 grid_gs =
4158 agx_grid_indirect(gp + offsetof(struct agx_geometry_params, gs_grid));
4159 } else {
4160 wg = agx_workgroup(64, 1, 1);
4161 grid_vs = agx_3d(draws->count, info->instance_count, 1);
4162
4163 grid_gs =
4164 agx_3d(u_decomposed_prims_for_vertices(info->mode, draws->count),
4165 info->instance_count, 1);
4166 }
4167
4168 /* Launch the vertex shader first */
4169 agx_launch(batch, grid_vs, wg, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0);
4170
4171 /* If there is a count shader, launch it and prefix sum the results. */
4172 if (gs->gs_count) {
4173 perf_debug(dev, "Geometry shader count");
4174 agx_launch(batch, grid_gs, wg, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY,
4175 0);
4176
4177 libagx_prefix_sum_geom(batch, agx_1d(1024 * gs->gs_count_words),
4178 AGX_BARRIER_ALL, gp);
4179 }
4180
4181 /* Pre-GS shader */
4182 agx_launch(batch, agx_1d(1), agx_workgroup(1, 1, 1), gs->pre_gs, NULL,
4183 PIPE_SHADER_COMPUTE, 0);
4184
4185 /* Pre-rast geometry shader */
4186 agx_launch(batch, grid_gs, wg, gs, NULL, PIPE_SHADER_GEOMETRY, 0);
4187 }
4188
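/*
 * Handle primitive restart for GS/XFB draws by unrolling the index buffer on
 * the GPU into restart-free indirect draws, then re-issuing the draw.
 */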
4189 static void
4190 agx_draw_without_restart(struct agx_batch *batch,
4191 const struct pipe_draw_info *info,
4192 unsigned drawid_offset,
4193 const struct pipe_draw_indirect_info *indirect,
4194 const struct pipe_draw_start_count_bias *draw)
4195 {
4196 struct agx_context *ctx = batch->ctx;
4197 struct agx_device *dev = agx_device(ctx->base.screen);
4198
4199 perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
4200
4201 agx_batch_init_state(batch);
4202
4203 size_t ib_extent = 0;
4204 uint64_t ib;
4205
4206 /* The rest of this function handles only the general case of indirect
4207 * multidraws, so synthesize an indexed indirect draw now if we need one for
4208 * a direct draw (necessarily only one). This unifies the code paths.
4209 */
4210 struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
4211
4212 if (!indirect) {
4213 /* The pointer already adds in the offset, so set it to 0 in the desc */
4214 ib = agx_index_buffer_direct_ptr(batch, draw, info, &ib_extent);
4215
4216 uint32_t desc[5] = {draw->count, info->instance_count, 0,
4217 draw->index_bias, info->start_instance};
4218
4219 u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
4220 &indirect_synthesized.offset, &indirect_synthesized.buffer);
4221
4222 indirect = &indirect_synthesized;
4223 } else {
4224 /* Does not add in offset, the unroll kernel uses the desc's offset */
4225 ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
4226 }
4227
4228 /* Next, we unroll the index buffer used by the indirect draw */
4229 if (!batch->cdm.bo)
4230 batch->cdm = agx_encoder_allocate(batch, dev);
4231
4232 /* Allocate output indirect draw descriptors. This is exact. */
4233 struct agx_resource out_draws_rsrc = {0};
4234 struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
4235 &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
4236 &out_draws_rsrc.bo);
4237
4238 struct libagx_unroll_restart_args unroll = {
4239 .heap = agx_batch_geometry_state(batch),
4240 .index_buffer = ib,
4241 .out_draw = out_draws.gpu,
4242 .restart_index = info->restart_index,
4243 .index_buffer_size_el = ib_extent / info->index_size,
4244 .flatshade_first = batch->ctx->rast->base.flatshade_first,
4245 .in_draw = agx_indirect_buffer_ptr(batch, indirect),
4246 };
4247
4248 /* Unroll the index buffer for each draw */
4249 libagx_unroll_restart_struct(
4250 batch, agx_1d(1024 * indirect->draw_count), AGX_BARRIER_ALL, unroll,
4251 util_logbase2(info->index_size), libagx_compact_prim(info->mode));
4252
4253 /* Now draw the results without restart */
4254 struct pipe_draw_info new_info = {
4255 .mode = u_decomposed_prim(info->mode),
4256 .index_size = info->index_size,
4257 .index.resource = ctx->heap,
4258 .increment_draw_id = info->increment_draw_id,
4259 .index_bias_varies = info->index_bias_varies,
4260 };
4261
4262 struct pipe_draw_indirect_info new_indirect = *indirect;
4263 new_indirect.buffer = &out_draws_rsrc.base;
4264 new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->va->addr;
4265 new_indirect.stride = 5 * sizeof(uint32_t);
4266
4267 ctx->active_draw_without_restart = true;
4268 ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, NULL,
4269 1);
4270 ctx->active_draw_without_restart = false;
4271 }
4272
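/*
 * Decide whether a draw needs a synthesized passthrough geometry shader:
 * adjacency primitives, flat-shaded provoking-first triangle fans, indirect
 * primitive queries, edge flags, or transform feedback.
 */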
4273 static bool
4274 agx_needs_passthrough_gs(struct agx_context *ctx,
4275 const struct pipe_draw_info *info,
4276 const struct pipe_draw_indirect_info *indirect,
4277 bool *xfb_only)
4278 {
4279 /* If there is already a geometry shader in the pipeline, we do not need to
4280 * apply a passthrough GS of our own.
4281 */
4282 if (ctx->stage[PIPE_SHADER_GEOMETRY].shader)
4283 return false;
4284
4285 /* Rendering adjacency requires a GS, add a passthrough since we don't have
4286 * one.
4287 */
4288 if (mesa_prim_has_adjacency(info->mode)) {
4289 perf_debug_ctx(ctx, "Using passthrough GS due to adjacency primitives");
4290 return true;
4291 }
4292
4293 /* TODO: Handle fans properly, we need to plumb a sysval. */
4294 if (info->mode == MESA_PRIM_TRIANGLE_FAN &&
4295 ctx->rast->base.flatshade_first &&
4296 ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded) {
4297
4298 perf_debug_ctx(ctx, "Using passthrough GS due to first tri fans");
4299 return true;
4300 }
4301
4302 /* TODO: this is really sloppy, we should add a VDM kernel for this. */
4303 if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries &&
4304 ctx->prims_generated[0]) {
4305 perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query");
4306 return true;
4307 }
4308
4309 /* Edge flags are emulated with a geometry shader */
4310 if (has_edgeflags(ctx, info->mode)) {
4311 perf_debug_ctx(ctx, "Using passthrough GS due to edge flags");
4312 return true;
4313 }
4314
4315 /* Transform feedback is layered on geometry shaders, so if transform
4316 * feedback is used, we need a GS.
4317 */
4318 struct agx_uncompiled_shader *last_vtx =
4319 ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4320 ?: ctx->stage[PIPE_SHADER_VERTEX].shader;
4321
4322 if (last_vtx->has_xfb_info && ctx->streamout.num_targets) {
4323 *xfb_only = true;
4324 return true;
4325 }
4326
4327 /* Otherwise, we don't need one */
4328 return false;
4329 }
4330
4331 static enum mesa_prim
4332 agx_tess_output_prim(struct agx_uncompiled_shader *tcs,
4333 struct agx_uncompiled_shader *tes)
4334 {
4335 if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) {
4336 return MESA_PRIM_POINTS;
4337 } else if (TESS_PRIMITIVE_ISOLINES ==
4338 MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) {
4339 return MESA_PRIM_LINES;
4340 } else {
4341 return MESA_PRIM_TRIANGLES;
4342 }
4343 }
4344
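/*
 * Get (or build and cache) a passthrough geometry shader for the previous
 * vertex-pipeline stage, deserializing its NIR and wrapping it with
 * nir_create_passthrough_gs.
 */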
4345 static struct agx_uncompiled_shader *
4346 agx_get_passthrough_gs(struct agx_context *ctx,
4347 struct agx_uncompiled_shader *prev_cso,
4348 enum mesa_prim mode, bool xfb_passthrough)
4349 {
4350 bool edgeflags = has_edgeflags(ctx, mode);
4351
4352 if (mode == MESA_PRIM_PATCHES) {
4353 mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader,
4354 ctx->stage[MESA_SHADER_TESS_EVAL].shader);
4355 }
4356
4357 /* Only handle the polygon mode when edge flags are in use, because
4358 * nir_passthrough_gs doesn't handle transform feedback + polygon mode
4359 * properly. Technically this can break edge flags + transform feedback
4360 * but that's firmly in "doctor, it hurts when I do this" territory, and
4361 * I'm not sure that's even possible to hit. TODO: Reevaluate.
4362 */
4363 unsigned poly_mode =
4364 edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL;
4365
4366 if (prev_cso->passthrough_progs[mode][poly_mode][edgeflags])
4367 return prev_cso->passthrough_progs[mode][poly_mode][edgeflags];
4368
4369 struct blob_reader reader;
4370 blob_reader_init(&reader, prev_cso->early_serialized_nir.data,
4371 prev_cso->early_serialized_nir.size);
4372 nir_shader *prev = nir_deserialize(NULL, &agx_nir_options, &reader);
4373
4374 nir_shader *gs = nir_create_passthrough_gs(
4375 &agx_nir_options, prev, mode, rast_prim(mode, poly_mode), edgeflags,
4376 false /* force line strip out */, false);
4377
4378 ralloc_free(prev);
4379
4380 struct agx_uncompiled_shader *cso = pipe_shader_from_nir(&ctx->base, gs);
4381 cso->is_xfb_passthrough = xfb_passthrough;
4382 prev_cso->passthrough_progs[mode][poly_mode][edgeflags] = cso;
4383 return cso;
4384 }
4385
4386 static void
4387 agx_apply_passthrough_gs(struct agx_context *ctx,
4388 const struct pipe_draw_info *info,
4389 unsigned drawid_offset,
4390 const struct pipe_draw_indirect_info *indirect,
4391 const struct pipe_draw_start_count_bias *draws,
4392 unsigned num_draws, bool xfb_passthrough)
4393 {
4394 enum pipe_shader_type prev_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4395 ? PIPE_SHADER_TESS_EVAL
4396 : PIPE_SHADER_VERTEX;
4397 struct agx_uncompiled_shader *prev_cso = ctx->stage[prev_stage].shader;
4398
4399 assert(ctx->stage[PIPE_SHADER_GEOMETRY].shader == NULL);
4400
4401 /* Draw with passthrough */
4402 ctx->base.bind_gs_state(
4403 &ctx->base,
4404 agx_get_passthrough_gs(ctx, prev_cso, info->mode, xfb_passthrough));
4405 ctx->base.draw_vbo(&ctx->base, info, drawid_offset, indirect, draws,
4406 num_draws);
4407 ctx->base.bind_gs_state(&ctx->base, NULL);
4408 }
4409
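/* Lower an indirect multidraw into draw_count single indirect draws. */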
4410 static void
4411 util_draw_multi_unroll_indirect(struct pipe_context *pctx,
4412 const struct pipe_draw_info *info,
4413 const struct pipe_draw_indirect_info *indirect,
4414 const struct pipe_draw_start_count_bias *draws)
4415 {
4416 for (unsigned i = 0; i < indirect->draw_count; ++i) {
4417 const struct pipe_draw_indirect_info subindirect = {
4418 .buffer = indirect->buffer,
4419 .count_from_stream_output = indirect->count_from_stream_output,
4420 .offset = indirect->offset + (i * indirect->stride),
4421 .draw_count = 1,
4422 };
4423
4424 pctx->draw_vbo(pctx, info, i, &subindirect, draws, 1);
4425 }
4426 }
4427
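/* Lower an indirect multidraw by uploading draw_count as an indirect
 * draw-count buffer and issuing a single draw.
 */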
4428 static void
4429 util_draw_multi_upload_indirect(struct pipe_context *pctx,
4430 const struct pipe_draw_info *info,
4431 const struct pipe_draw_indirect_info *indirect,
4432 const struct pipe_draw_start_count_bias *draws)
4433 {
4434 struct pipe_draw_indirect_info indirect_ = *indirect;
4435 u_upload_data(pctx->const_uploader, 0, 4, 4, &indirect->draw_count,
4436 &indirect_.indirect_draw_count_offset,
4437 &indirect_.indirect_draw_count);
4438
4439 pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1);
4440 }
4441
4442 static void
4443 agx_upload_draw_params(struct agx_batch *batch,
4444 const struct pipe_draw_indirect_info *indirect,
4445 const struct pipe_draw_start_count_bias *draws,
4446 const struct pipe_draw_info *info)
4447 {
4448 if (indirect) {
4449 uint64_t address = agx_indirect_buffer_ptr(batch, indirect);
4450
4451 /* To implement draw parameters, we use the last 2 words of the
4452 * indirect draw descriptor. Offset by 3 words for indexed draw (5
4453 * total) and 2 words for non-indexed (4 total). See the layouts of
4454 * indexed vs non-indexed draw descriptors.
4455 *
4456 * This gives us a consistent layout
4457 *
4458 * uint32_t first_vertex;
4459 * uint32_t base_instance;
4460 *
4461 * and we can implement load_first_vertex & load_base_instance without
4462 * checking for indexing.
4463 */
4464 uint32_t offset = info->index_size ? 3 : 2;
4465 batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
4466 } else {
4467 /* Upload just those two words. */
4468 uint32_t params[2] = {
4469 info->index_size ? draws->index_bias : draws->start,
4470 info->start_instance,
4471 };
4472
4473 batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
4474 agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
4475 }
4476 }
4477
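/*
 * Draw MESA_PRIM_PATCHES: run the VS and TCS as compute, tessellate with
 * libagx (count, prefix sum, then emit with counts), and finally draw the
 * generated indices with the TES bound as the vertex shader via an indirect
 * draw.
 */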
4478 static void
4479 agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
4480 unsigned drawid_offset,
4481 const struct pipe_draw_indirect_info *indirect,
4482 const struct pipe_draw_start_count_bias *draws,
4483 unsigned num_draws)
4484 {
4485 struct agx_device *dev = agx_device(ctx->base.screen);
4486 perf_debug(dev, "Tessellation");
4487
4488 struct agx_uncompiled_shader *tcs = ctx->stage[MESA_SHADER_TESS_CTRL].shader;
4489 struct agx_uncompiled_shader *tes = ctx->stage[MESA_SHADER_TESS_EVAL].shader;
4490
4491 assert(tes != NULL && "required with patches");
4492
4493 unsigned patch_vertices = ctx->patch_vertices;
4494
4495 /* OpenGL allows omitting the tcs, fill in a passthrough program if needed.
4496 * In principle, we could optimize this case, but I don't think it matters.
4497 */
4498 bool unbind_tcs_when_done = false;
4499 if (!tcs) {
4500 struct agx_uncompiled_shader *vs = ctx->stage[MESA_SHADER_VERTEX].shader;
4501
4502 assert(patch_vertices >= 1 &&
4503 patch_vertices <= ARRAY_SIZE(vs->passthrough_tcs));
4504
4505 if (!vs->passthrough_tcs[patch_vertices - 1]) {
4506 struct blob_reader reader;
4507 blob_reader_init(&reader, vs->early_serialized_nir.data,
4508 vs->early_serialized_nir.size);
4509 nir_shader *vs_nir = nir_deserialize(NULL, &agx_nir_options, &reader);
4510 nir_shader *nir = nir_create_passthrough_tcs(&agx_nir_options, vs_nir,
4511 patch_vertices);
4512 ralloc_free(vs_nir);
4513
4514 /* Lower the tess level sysvals and gather info, since mesa/st won't do
4515 * either for us.
4516 */
4517 NIR_PASS(_, nir, nir_lower_system_values);
4518
4519 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4520
4521 vs->passthrough_tcs[patch_vertices - 1] =
4522 pipe_shader_from_nir(&ctx->base, nir);
4523 }
4524
4525 tcs = vs->passthrough_tcs[patch_vertices - 1];
4526 ctx->base.bind_tcs_state(&ctx->base, tcs);
4527 unbind_tcs_when_done = true;
4528 }
4529
4530 enum tess_primitive_mode mode =
4531 MAX2(tcs->tess.primitive, tes->tess.primitive);
4532 enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
4533
4534 enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
4535 ? PIPE_TESS_SPACING_EQUAL
4536 : spacing == TESS_SPACING_FRACTIONAL_ODD
4537 ? PIPE_TESS_SPACING_FRACTIONAL_ODD
4538 : PIPE_TESS_SPACING_FRACTIONAL_EVEN;
4539
4540 bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
4541 enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
4542
4543 enum libagx_tess_partitioning partitioning =
4544 (enum libagx_tess_partitioning)pspacing;
4545
4546 struct agx_bo *draw_bo = NULL;
4547 size_t draw_stride = 5 * sizeof(uint32_t);
4548
4549 struct agx_batch *batch = agx_get_batch(ctx);
4550 agx_batch_init_state(batch);
4551
4552 if (!batch->cdm.bo) {
4553 batch->cdm = agx_encoder_allocate(batch, dev);
4554 }
4555
4556 uint64_t ib = 0;
4557 size_t ib_extent = 0;
4558
4559 if (info->index_size)
4560 ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
4561
4562 struct agx_ia_state ia = {
4563 .index_buffer = ib,
4564 .index_buffer_range_el = ib_extent,
4565 .verts_per_instance = draws ? draws->count : 0,
4566 };
4567
4568 batch->uniforms.input_assembly =
4569 agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
4570
4571 agx_upload_draw_params(batch, indirect, draws, info);
4572
4573 /* Setup parameters */
4574 uint64_t geom_state = agx_batch_geometry_state(batch);
4575 assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
4576
4577 struct libagx_tess_args args = {
4578 .heap = geom_state,
4579 .tcs_stride_el = tcs->tess.output_stride / 4,
4580 .statistic = agx_get_query_address(
4581 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]),
4582 .input_patch_size = patch_vertices,
4583 .output_patch_size = tcs->tess.output_patch_size,
4584 .tcs_patch_constants = tcs->tess.nr_patch_outputs,
4585 .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
4586 .patch_coord_buffer = agx_resource(ctx->heap)->bo->va->addr,
4587 .partitioning = partitioning,
4588 .points_mode = point_mode,
4589 };
4590
4591 if (!point_mode && tes->tess.primitive != TESS_PRIMITIVE_ISOLINES) {
4592 args.ccw = !tes->tess.ccw;
4593 }
4594
4595 memcpy(&args.tess_level_outer_default, ctx->default_outer_level,
4596 sizeof(ctx->default_outer_level));
4597 memcpy(&args.tess_level_inner_default, ctx->default_inner_level,
4598 sizeof(ctx->default_inner_level));
4599
4600 struct agx_grid vs_grid, tcs_grid, tess_grid;
4601
4602 agx_upload_vbos(batch);
4603 agx_update_vs(batch, info->index_size);
4604 agx_update_tcs(ctx, info);
4605 /* XXX */
4606 ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
4607 ctx->stage[PIPE_SHADER_TESS_EVAL].dirty = ~0;
4608 agx_update_descriptors(batch, ctx->vs);
4609 agx_update_descriptors(batch, ctx->tcs);
4610
4611 batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs;
4612
4613 if (indirect == NULL) {
4614 unsigned in_patches = draws->count / patch_vertices;
4615 if (in_patches == 0)
4616 return;
4617
4618 /* TCS invocation counter increments once per-patch */
4619 agx_query_increment_cpu(
4620 ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
4621 in_patches);
4622
4623 unsigned unrolled_patches = in_patches * info->instance_count;
4624
4625 uint32_t alloc = 0;
4626 uint32_t tcs_out_offs = alloc;
4627 alloc += unrolled_patches * tcs->tess.output_stride;
4628
4629 uint32_t patch_coord_offs = alloc;
4630 alloc += unrolled_patches * 4;
4631
4632 uint32_t count_offs = alloc;
4633 alloc += unrolled_patches * sizeof(uint32_t);
4634
4635 uint32_t draw_offs = alloc;
4636 alloc += draw_stride;
4637
4638 struct agx_ptr blob =
4639 agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo);
4640
4641 args.tcs_buffer = blob.gpu + tcs_out_offs;
4642 args.patches_per_instance = in_patches;
4643 args.coord_allocs = blob.gpu + patch_coord_offs;
4644 args.nr_patches = unrolled_patches;
4645 args.out_draws = blob.gpu + draw_offs;
4646 args.counts = blob.gpu + count_offs;
4647
4648 unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
4649 batch->uniforms.vertex_outputs);
4650 uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4651 batch->uniforms.vertex_output_buffer_ptr =
4652 agx_pool_upload(&batch->pool, &addr, 8);
4653
4654 vs_grid = agx_3d(draws->count, info->instance_count, 1);
4655 tcs_grid = agx_3d(in_patches * tcs->tess.output_patch_size,
4656 info->instance_count, 1);
4657
4658 tess_grid = agx_1d(unrolled_patches);
4659 } else if (indirect) {
4660 args.out_draws =
4661 agx_pool_alloc_aligned_with_bo(&batch->pool, draw_stride, 4, &draw_bo)
4662 .gpu;
4663 }
4664
4665 uint64_t state =
4666 agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4);
4667
4668 if (indirect) {
4669 uint32_t grid_stride = sizeof(uint32_t) * 6;
4670
4671 uint64_t vertex_out_ptr = agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4672 uint64_t indirect_ptr = agx_indirect_buffer_ptr(batch, indirect);
4673
4674 uint64_t tcs_statistic = agx_get_query_address(
4675 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
4676
4677 /* Allocate 3x indirect global+local grids for VS/TCS/tess */
4678 uint64_t grids =
4679 agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
4680
4681 libagx_tess_setup_indirect(
4682 batch, agx_1d(1), AGX_BARRIER_ALL, state, grids, 0 /* XXX: IA */,
4683 indirect_ptr, vertex_out_ptr, 0, 0, 0 /* XXX: Index buffer */,
4684 ctx->vs->b.info.outputs, tcs_statistic);
4685
4686 batch->uniforms.vertex_output_buffer_ptr = vertex_out_ptr;
4687
4688 vs_grid = agx_grid_indirect_local(grids + 0 * grid_stride);
4689 tcs_grid = agx_grid_indirect_local(grids + 1 * grid_stride);
4690 tess_grid = agx_grid_indirect_local(grids + 2 * grid_stride);
4691 }
4692
4693 batch->uniforms.tess_params = state;
4694
4695 agx_launch(batch, vs_grid, agx_workgroup(64, 1, 1), ctx->vs, ctx->linked.vs,
4696 PIPE_SHADER_VERTEX, 0);
4697
4698 agx_launch(batch, tcs_grid, agx_workgroup(tcs->tess.output_patch_size, 1, 1),
4699 ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0);
4700
4701 batch->uniforms.vertex_output_buffer_ptr = 0;
4702
4703 /* Generate counts, then prefix sum them, then finally tessellate. */
4704 libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
4705 LIBAGX_TESS_MODE_COUNT, state);
4706 libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state);
4707 libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
4708 LIBAGX_TESS_MODE_WITH_COUNTS, state);
4709
4710 /* Face culling state needs to be specialized for tess */
4711 ctx->dirty |= AGX_DIRTY_RS;
4712
4713 /* Run TES as VS */
4714 void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
4715 void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
4716 ctx->base.bind_vs_state(&ctx->base, tes_cso);
4717 ctx->in_tess = true;
4718
4719 struct pipe_draw_info draw_info = {
4720 .mode = out_prim,
4721 .index_size = 4,
4722 .index.resource = ctx->heap,
4723 .instance_count = 1,
4724 };
4725
4726 /* Wrap the pool allocation in a fake resource for meta-Gallium use */
4727 struct agx_resource indirect_rsrc = {.bo = draw_bo};
4728
4729 struct pipe_draw_indirect_info copy_indirect = {
4730 .buffer = &indirect_rsrc.base,
4731 .offset = args.out_draws - draw_bo->va->addr,
4732 .stride = draw_stride,
4733 .draw_count = 1,
4734 };
4735
4736 ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1);
4737
4738 /* Restore vertex state */
4739 ctx->base.bind_vs_state(&ctx->base, vs_cso);
4740 ctx->in_tess = false;
4741
4742 if (unbind_tcs_when_done) {
4743 ctx->base.bind_tcs_state(&ctx->base, NULL);
4744 }
4745 }
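
/*
 * Illustrative sketch only (nothing in the driver uses it): the direct
 * tessellation path above packs four regions into a single pool allocation
 * by bumping a running offset. The helper below repeats that offset
 * arithmetic in isolation; the struct and function names are hypothetical.
 */
struct example_tess_alloc_layout {
   uint32_t tcs_out_offs;
   uint32_t patch_coord_offs;
   uint32_t count_offs;
   uint32_t draw_offs;
   uint32_t total_size;
};

static inline struct example_tess_alloc_layout
example_tess_layout_offsets(uint32_t unrolled_patches,
                            uint32_t tcs_output_stride, uint32_t draw_stride)
{
   struct example_tess_alloc_layout l = {0};
   uint32_t alloc = 0;

   /* Per-patch TCS output storage */
   l.tcs_out_offs = alloc;
   alloc += unrolled_patches * tcs_output_stride;

   /* One 32-bit patch coordinate allocation word per patch */
   l.patch_coord_offs = alloc;
   alloc += unrolled_patches * 4;

   /* One 32-bit tessellated count per patch */
   l.count_offs = alloc;
   alloc += unrolled_patches * sizeof(uint32_t);

   /* Indirect draw descriptor consumed when rasterizing the TES output */
   l.draw_offs = alloc;
   alloc += draw_stride;

   l.total_size = alloc;
   return l;
}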
4746
4747 /*
4748 * From the ARB_texture_barrier spec:
4749 *
4750 * Specifically, the values of rendered fragments are undefined if any
4751 * shader stage fetches texels and the same texels are written via fragment
4752 * shader outputs, even if the reads and writes are not in the same Draw
4753 * call, unless any of the following exceptions apply:
4754 *
4755 * - The reads and writes are from/to disjoint sets of texels (after
4756 * accounting for texture filtering rules).
4757 *
4758 * - There is only a single read and write of each texel, and the read is in
4759 * the fragment shader invocation that writes the same texel (e.g. using
4760 * "texelFetch2D(sampler, ivec2(gl_FragCoord.xy), 0);").
4761 *
4762 * - If a texel has been written, then in order to safely read the result
4763 * a texel fetch must be in a subsequent Draw separated by the command
4764 *
4765 * void TextureBarrier(void);
4766 *
4767 * TextureBarrier() will guarantee that writes have completed and caches
4768 * have been invalidated before subsequent Draws are executed."
4769 *
4770 * The wording is subtle, but we are not required to flush implicitly for
4771 * feedback loops, even though we're a tiler. What we are required to do is
4772 * decompress framebuffers involved in feedback loops, because otherwise
4773 * the hardware will race itself with exception #1, where we have a disjoint
4774 * group of texels that intersects a compressed tile being written out.
4775 */
4776 static void
4777 agx_legalize_feedback_loops(struct agx_context *ctx)
4778 {
4779 /* Trust that u_blitter knows what it's doing */
4780 if (ctx->blitter->running)
4781 return;
4782
4783 for (unsigned stage = 0; stage < ARRAY_SIZE(ctx->stage); ++stage) {
4784 if (!(ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE))
4785 continue;
4786
4787 for (unsigned i = 0; i < ctx->stage[stage].texture_count; ++i) {
4788 if (!ctx->stage[stage].textures[i])
4789 continue;
4790
4791 struct agx_resource *rsrc = ctx->stage[stage].textures[i]->rsrc;
4792
4793 for (unsigned cb = 0; cb < ctx->framebuffer.nr_cbufs; ++cb) {
4794 if (ctx->framebuffer.cbufs[cb] &&
4795 agx_resource(ctx->framebuffer.cbufs[cb]->texture) == rsrc) {
4796
4797 if (rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED) {
4798 /* Decompress if we can and shadow if we can't. */
4799 if (rsrc->base.bind & PIPE_BIND_SHARED) {
4800 struct agx_batch *batch = agx_get_batch(ctx);
4801
4802 /* If we already did in-place decompression for this one */
4803 if (batch->feedback & (PIPE_CLEAR_COLOR0 << i))
4804 continue;
4805
4806 /* Use our current context batch. If it already touched
4807 * this buffer, that will have been flushed above.
4808 */
4809 agx_decompress_inplace(batch, ctx->framebuffer.cbufs[cb],
4810 "Texture feedback loop");
4811
4812 /* Mark it as a feedback cbuf, so it will be written
4813 * uncompressed despite having a compressed layout.
4814 */
4815 batch->feedback |= PIPE_CLEAR_COLOR0 << i;
4816 } else {
4817 agx_decompress(ctx, rsrc, "Texture feedback loop");
4818 }
4819 }
4820
4821 /* Not required by the spec, just for debug */
4822 if (agx_device(ctx->base.screen)->debug & AGX_DBG_FEEDBACK)
4823 agx_flush_writer(ctx, rsrc, "Feedback loop");
4824 }
4825 }
4826 }
4827 }
4828 }
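
/*
 * Minimal sketch of the detection above, for illustration only: a feedback
 * loop exists when a resource bound for sampling is simultaneously bound as
 * a color attachment. The helper name is hypothetical and nothing in the
 * driver calls it.
 */
static inline bool
example_is_feedback_loop(const struct pipe_framebuffer_state *fb,
                         const struct pipe_resource *sampled)
{
   for (unsigned cb = 0; cb < fb->nr_cbufs; ++cb) {
      if (fb->cbufs[cb] && fb->cbufs[cb]->texture == sampled)
         return true;
   }

   return false;
}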
4829
4830 static void
4831 agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
4832 unsigned drawid_offset,
4833 const struct pipe_draw_indirect_info *indirect,
4834 const struct pipe_draw_start_count_bias *draws, unsigned num_draws)
4835 {
4836 struct agx_context *ctx = agx_context(pctx);
4837
4838 if (unlikely(!agx_render_condition_check(ctx)))
4839 return;
4840
4841 if (num_draws > 1) {
4842 util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
4843 return;
4844 }
4845
4846 if (indirect && indirect->draw_count > 1 && !indirect->indirect_draw_count) {
4847 assert(drawid_offset == 0);
4848 assert(num_draws == 1);
4849
4850 util_draw_multi_unroll_indirect(pctx, info, indirect, draws);
4851 return;
4852 }
4853
4854 if (indirect && indirect->count_from_stream_output) {
4855 agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect);
4856 return;
4857 }
4858
4859 /* TODO: stop cheating */
4860 if (indirect && indirect->indirect_draw_count) {
4861 perf_debug_ctx(ctx, "multi-draw indirect");
4862 util_draw_indirect(pctx, info, drawid_offset, indirect);
4863 return;
4864 }
4865
4866 /* TODO: stop cheating.
4867 *
4868 * libagx supports this, just needs test coverage and gallium side wiring.
4869 */
4870 if (indirect && info->mode == MESA_PRIM_PATCHES && info->index_size) {
4871 perf_debug_ctx(ctx, "indexed indirect with tess");
4872 util_draw_indirect(pctx, info, drawid_offset, indirect);
4873 return;
4874 }
4875
4876 bool xfb_passthrough = false;
4877 if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
4878 agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
4879 num_draws, xfb_passthrough);
4880 return;
4881 }
4882
4883 if (info->mode == MESA_PRIM_PATCHES) {
4884 agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
4885 return;
4886 }
4887
4888 agx_legalize_feedback_loops(ctx);
4889
4890 /* Only the rasterization stream counts */
4891 if (ctx->active_queries && ctx->prims_generated[0] &&
4892 !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
4893
4894 assert(!indirect && "we force a passthrough GS for this");
4895 agx_primitives_update_direct(ctx, info, draws);
4896 }
4897
4898 struct agx_batch *batch = agx_get_batch(ctx);
4899 uint64_t ib = 0;
4900 size_t ib_extent = 0;
4901
4902 if (info->index_size) {
4903 ib =
4904 agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
4905 }
4906
4907 if (ctx->active_queries && !ctx->active_draw_without_restart &&
4908 (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
4909 ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
4910 ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS] ||
4911 ((ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
4912 ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) &&
4913 !ctx->stage[PIPE_SHADER_GEOMETRY].shader))) {
4914
4915 uint64_t ptr;
4916 if (indirect) {
4917 ptr = agx_indirect_buffer_ptr(batch, indirect);
4918 } else {
4919 uint32_t desc[] = {draws->count, info->instance_count, 0};
4920 ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc));
4921 }
4922
4923 agx_ia_update(batch, info, ptr, ib,
4924 info->index_size ? ib_extent / info->index_size : 1);
4925 }
4926
4927 if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
4928 info->index_size) {
4929
4930 agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
4931 return;
4932 }
4933
4934 agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
4935
4936 #ifndef NDEBUG
4937 if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
4938 agx_dirty_all(ctx);
4939 #endif
4940
4941 agx_batch_init_state(batch);
4942
4943 /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
4944 * agx_update_vs/agx_update_fs, which specialize based on primitive.
4945 */
4946 enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
4947 if (reduced_prim != batch->reduced_prim)
4948 ctx->dirty |= AGX_DIRTY_PRIM;
4949 batch->reduced_prim = reduced_prim;
4950
4951 /* Update shaders first so we can use them after */
4952 if (agx_update_vs(batch, info->index_size)) {
4953 ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
4954 ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0;
4955 } else if (ctx->stage[PIPE_SHADER_VERTEX].dirty ||
4956 (ctx->dirty & AGX_DIRTY_VERTEX))
4957 ctx->dirty |= AGX_DIRTY_VS;
4958
4959 /* This is subtle. But agx_update_vs will be true at least once per batch. */
4960 assert(agx_batch_uses_bo(batch, ctx->vs->bo));
4961 assert(!ctx->linked.vs || agx_batch_uses_bo(batch, ctx->linked.vs->bo));
4962
4963 agx_update_gs(ctx, info, indirect);
4964
4965 if (ctx->gs) {
4966 batch->geom_indirect = agx_pool_alloc_aligned_with_bo(
4967 &batch->pool, 64, 4, &batch->geom_indirect_bo)
4968 .gpu;
4969
4970 batch->uniforms.geometry_params =
4971 agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
4972
4973 agx_batch_add_bo(batch, ctx->gs->bo);
4974 agx_batch_add_bo(batch, ctx->gs->gs_copy->bo);
4975 }
4976
4977 if (ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG)) {
4978 struct agx_compiled_shader *vs = ctx->vs;
4979 if (ctx->gs)
4980 vs = ctx->gs->gs_copy;
4981
4982 agx_assign_uvs(
4983 &batch->linked_varyings, &vs->uvs,
4984 ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
4985 ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded);
4986
4987 for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
4988 batch->uniforms.uvs_index[i] = batch->linked_varyings.slots[i];
4989 }
4990 }
4991
4992 /* Set draw ID */
4993 if (ctx->vs->b.info.uses_draw_id) {
4994 batch->uniforms.draw_id = drawid_offset;
4995
4996 ctx->dirty |= AGX_DIRTY_VS;
4997 }
4998
4999 if (agx_update_fs(batch)) {
5000 ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
5001 ctx->stage[PIPE_SHADER_FRAGMENT].dirty = ~0;
5002 } else if ((ctx->stage[PIPE_SHADER_FRAGMENT].dirty) ||
5003 (ctx->dirty & (AGX_DIRTY_BLEND_COLOR | AGX_DIRTY_SAMPLE_MASK))) {
5004 ctx->dirty |= AGX_DIRTY_FS;
5005 }
5006
5007 /* This is subtle. But agx_update_fs will be true at least once per batch. */
5008 assert(!ctx->fs->bo || agx_batch_uses_bo(batch, ctx->fs->bo));
5009 assert(agx_batch_uses_bo(batch, ctx->linked.fs->bo));
5010
5011 if (ctx->linked.vs->uses_base_param || ctx->gs) {
5012 agx_upload_draw_params(batch, indirect, draws, info);
5013
5014 batch->uniforms.is_indexed_draw = (info->index_size > 0);
5015 ctx->dirty |= AGX_DIRTY_VS;
5016 }
5017
5018 agx_update_descriptors(batch, ctx->vs);
5019 agx_update_descriptors(batch, ctx->gs);
5020 agx_update_descriptors(batch, ctx->fs);
5021
5022 if (IS_DIRTY(VS) || IS_DIRTY(FS) || ctx->gs || IS_DIRTY(VERTEX) ||
5023 IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
5024 IS_DIRTY(RS) || IS_DIRTY(PRIM) || ctx->in_tess) {
5025
5026 if (IS_DIRTY(VERTEX)) {
5027 agx_upload_vbos(batch);
5028 }
5029
5030 if (IS_DIRTY(BLEND_COLOR)) {
5031 memcpy(batch->uniforms.blend_constant, &ctx->blend_color,
5032 sizeof(ctx->blend_color));
5033 }
5034
5035 if (IS_DIRTY(RS)) {
5036 struct pipe_rasterizer_state *rs = &ctx->rast->base;
5037
5038 batch->uniforms.fixed_point_size =
5039 rs->point_size_per_vertex ? 0.0 : rs->point_size;
5040
5041 /* TODO: tri fans */
5042 batch->uniforms.provoking_vertex = !rs->flatshade_first ? 2 : 0;
5043 }
5044
5045 if (IS_DIRTY(QUERY)) {
5046 for (unsigned i = 0; i < ARRAY_SIZE(ctx->pipeline_statistics); ++i) {
5047 struct agx_query *query = ctx->pipeline_statistics[i];
5048 batch->uniforms.pipeline_statistics[i] =
5049 agx_get_query_address(batch, query);
5050 }
5051 }
5052
5053 if (IS_DIRTY(POLY_STIPPLE)) {
5054 STATIC_ASSERT(sizeof(ctx->poly_stipple) == 32 * 4);
5055
5056 batch->uniforms.polygon_stipple = agx_pool_upload_aligned(
5057 &batch->pool, ctx->poly_stipple, sizeof(ctx->poly_stipple), 4);
5058 }
5059
5060 agx_upload_uniforms(batch);
5061 }
5062
5063 struct pipe_draw_info info_gs;
5064 struct pipe_draw_indirect_info indirect_gs;
5065
5066 /* Wrap the pool allocation in a fake resource for meta-Gallium use */
5067 struct agx_resource indirect_rsrc = {.bo = batch->geom_indirect_bo};
5068
5069 if (ctx->gs) {
5070 /* Launch the pre-rasterization parts of the geometry shader */
5071 agx_launch_gs_prerast(batch, info, draws, indirect);
5072
5073 if (ctx->rast->base.rasterizer_discard)
5074 return;
5075
5076 /* Setup to rasterize the GS results */
5077 info_gs = (struct pipe_draw_info){
5078 .mode = ctx->gs->gs_output_mode,
5079 .index_size = 4,
5080 .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS,
5081 .restart_index = ~0,
5082 .index.resource = ctx->heap,
5083 .instance_count = 1,
5084 };
5085
5086 indirect_gs = (struct pipe_draw_indirect_info){
5087 .draw_count = 1,
5088 .buffer = &indirect_rsrc.base,
5089 .offset = batch->geom_indirect - indirect_rsrc.bo->va->addr,
5090 };
5091
5092 info = &info_gs;
5093 indirect = &indirect_gs;
5094
5095 /* TODO: Deduplicate? */
5096 batch->reduced_prim = u_reduced_prim(info->mode);
5097 ctx->dirty |= AGX_DIRTY_PRIM;
5098
5099 if (info_gs.index_size) {
5100 ib = agx_resource(ctx->heap)->bo->va->addr;
5101 ib_extent = agx_resource(ctx->heap)->bo->size;
5102 } else {
5103 ib = 0;
5104 ib_extent = 0;
5105 }
5106
5107 /* We need to reemit geometry descriptors since the txf sampler may change
5108 * between the GS prepass and the GS rast program.
5109 */
5110 agx_update_descriptors(batch, ctx->gs->gs_copy);
5111 }
5112
5113 assert((!indirect || !indirect->indirect_draw_count) && "multidraw handled");
5114
5115 /* Update batch masks based on current state */
5116 if (ctx->dirty & AGX_DIRTY_BLEND) {
5117 /* TODO: Any point to tracking load? */
5118 batch->draw |= ctx->blend->store;
5119 batch->resolve |= ctx->blend->store;
5120 }
5121
5122 if (ctx->dirty & AGX_DIRTY_ZS) {
5123 batch->load |= ctx->zs->load;
5124 batch->draw |= ctx->zs->store;
5125 batch->resolve |= ctx->zs->store;
5126 }
5127
5128 /* When we approach the end of a command buffer, cycle it out for a new one.
5129 * We only need to do this once per draw as long as we conservatively
5130 * estimate the maximum bytes of VDM commands that this draw will emit.
5131 */
5132 agx_ensure_cmdbuf_has_space(
5133 batch, &batch->vdm,
5134 (AGX_VDM_STATE_LENGTH * 2) + (AGX_PPP_STATE_LENGTH * MAX_PPP_UPDATES) +
5135 AGX_VDM_STATE_RESTART_INDEX_LENGTH +
5136 AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH +
5137 AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH +
5138 AGX_VDM_STATE_VERTEX_OUTPUTS_LENGTH +
5139 AGX_VDM_STATE_VERTEX_UNKNOWN_LENGTH + 4 /* padding */ +
5140 AGX_INDEX_LIST_LENGTH + AGX_INDEX_LIST_BUFFER_LO_LENGTH +
5141 AGX_INDEX_LIST_COUNT_LENGTH + AGX_INDEX_LIST_INSTANCES_LENGTH +
5142 AGX_INDEX_LIST_START_LENGTH + AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
5143
5144 uint8_t *out = agx_encode_state(batch, batch->vdm.current);
5145
5146 if (info->index_size && info->primitive_restart) {
5147 agx_push(out, VDM_STATE, cfg)
5148 cfg.restart_index_present = true;
5149
5150 agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
5151 cfg.value = info->restart_index;
5152 }
5153
5154 struct agx_draw draw = {0};
5155 if (info->index_size) {
5156 draw.index_size = agx_translate_index_size(info->index_size);
5157 draw.index_buffer = ib;
5158 draw.index_buffer_range_B = ib_extent;
5159 draw.restart = info->primitive_restart;
5160 draw.indexed = true;
5161 } else {
5162 draw.start = draws->start;
5163 }
5164
5165 if (indirect) {
5166 draw.b = agx_grid_indirect(agx_indirect_buffer_ptr(batch, indirect));
5167 } else {
5168 draw.b = agx_3d(draws->count, info->instance_count, 1);
5169 if (info->index_size)
5170 draw.index_bias = draws->index_bias;
5171 }
5172
5173 out = (void *)agx_vdm_draw((uint32_t *)out, 0 /* ignored for now */, draw,
5174 agx_primitive_for_pipe(info->mode));
5175
5176 batch->vdm.current = out;
5177 assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
5178 "Failed to reserve sufficient space in encoder");
5179 agx_dirty_reset_graphics(ctx);
5180
5181 assert(batch == agx_get_batch(ctx) && "batch should not change under us");
5182
5183 batch->draws++;
5184
5185 /* The scissor/zbias arrays are indexed with 16-bit integers, imposing a
5186 * maximum of UINT16_MAX descriptors. Flush if the next draw would overflow.
5187 */
5188 if (unlikely(
5189 (((batch->scissor.size / AGX_SCISSOR_LENGTH) + AGX_MAX_VIEWPORTS) >
5190 UINT16_MAX) ||
5191 (batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH) >= UINT16_MAX)) {
5192 agx_flush_batch_for_reason(ctx, batch, "Scissor/depth bias overflow");
5193 } else if (unlikely(batch->draws > 100000)) {
5194 /* Mostly so drawoverhead doesn't OOM */
5195 agx_flush_batch_for_reason(ctx, batch, "Absurd number of draws");
5196 } else if (unlikely(batch->sampler_heap.count >
5197 (AGX_SAMPLER_HEAP_SIZE - (PIPE_MAX_SAMPLERS * 6)))) {
5198 agx_flush_batch_for_reason(ctx, batch, "Sampler heap overflow");
5199 }
5200 }
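
/*
 * Sketch of the flush heuristics at the end of the draw path, illustration
 * only: scissor and depth bias descriptors are addressed with 16-bit
 * indices, so the batch must be flushed before either array could grow past
 * UINT16_MAX entries on the next draw. The helper name and parameters are
 * hypothetical.
 */
static inline bool
example_descriptor_arrays_would_overflow(size_t scissor_bytes,
                                         size_t depth_bias_bytes)
{
   size_t scissors = scissor_bytes / AGX_SCISSOR_LENGTH;
   size_t depth_biases = depth_bias_bytes / AGX_DEPTH_BIAS_LENGTH;

   /* The next draw may add up to AGX_MAX_VIEWPORTS scissor entries */
   return (scissors + AGX_MAX_VIEWPORTS) > UINT16_MAX ||
          depth_biases >= UINT16_MAX;
}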
5201
5202 static void
5203 agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
5204 {
5205 struct agx_context *ctx = agx_context(pipe);
5206
5207 /* Framebuffer fetch is coherent, so barriers are a no-op. */
5208 if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER)
5209 return;
5210
5211 agx_flush_all(ctx, "Texture barrier");
5212 }
5213
5214 void
5215 agx_launch(struct agx_batch *batch, struct agx_grid grid,
5216 struct agx_workgroup wg, struct agx_compiled_shader *cs,
5217 struct agx_linked_shader *linked, enum pipe_shader_type stage,
5218 unsigned variable_shared_mem)
5219 {
5220 struct agx_context *ctx = batch->ctx;
5221
5222 /* To implement load_num_workgroups, the number of workgroups needs to be
5223 * available in GPU memory. This is either the indirect buffer, or just a
5224 * buffer we upload ourselves if not indirect.
5225 */
5226 if (grid.mode == AGX_CDM_MODE_DIRECT) {
5227 uint32_t groups[3] = {
5228 grid.count[0] / wg.x,
5229 grid.count[1] / wg.y,
5230 grid.count[2] / wg.z,
5231 };
5232
5233 batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] =
5234 agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4);
5235 } else {
5236 batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid.ptr;
5237 }
5238
5239 util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
5240 if (!*res)
5241 continue;
5242
5243 struct agx_resource *buffer = agx_resource(*res);
5244 agx_batch_writes(batch, buffer, 0);
5245 batch->incoherent_writes = true;
5246 }
5247
5248 agx_update_descriptors(batch, cs);
5249 agx_upload_uniforms(batch);
5250
5251 // TODO: This is broken.
5252 size_t subgroups_per_core = 0;
5253 #if 0
5254 if (!info->indirect) {
5255 size_t subgroups_per_workgroup =
5256 DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 32);
5257 subgroups_per_core =
5258 local_workgroups *
5259 DIV_ROUND_UP(info->grid[0] * info->grid[1] * info->grid[2],
5260 ctx->scratch_cs.num_cores);
5261 }
5262 #endif
5263
5264 uint32_t usc = agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE,
5265 variable_shared_mem, subgroups_per_core);
5266
5267 if (cs)
5268 agx_batch_add_bo(batch, cs->bo);
5269
5270 struct agx_cdm_launch_word_0_packed launch;
5271 agx_pack(&launch, CDM_LAUNCH_WORD_0, cfg) {
5272 cfg.uniform_register_count = cs->b.info.push_count;
5273 cfg.preshader_register_count = cs->b.info.nr_preamble_gprs;
5274 cfg.texture_state_register_count =
5275 cs ? agx_nr_tex_descriptors(batch, cs) : 0;
5276 cfg.sampler_state_register_count =
5277 translate_sampler_state_count(ctx, stage);
5278 }
5279
5280 agx_launch_internal(batch, grid, wg, launch, stage, usc);
5281 }
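
/*
 * Sketch of the direct-dispatch arithmetic above, illustration only: the
 * grid is expressed in threads, so the workgroup counts exposed through
 * load_num_workgroups are the per-axis thread counts divided by the
 * workgroup size. The helper name is hypothetical.
 */
static inline void
example_workgroup_counts(const uint32_t threads[3], struct agx_workgroup wg,
                         uint32_t groups[3])
{
   groups[0] = threads[0] / wg.x;
   groups[1] = threads[1] / wg.y;
   groups[2] = threads[2] / wg.z;
}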
5282
5283 static void
5284 agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
5285 {
5286 struct agx_context *ctx = agx_context(pipe);
5287 if (unlikely(!ctx->compute_blitter.active &&
5288 !agx_render_condition_check(ctx)))
5289 return;
5290
5291 struct agx_batch *batch = agx_get_compute_batch(ctx);
5292
5293 uint64_t indirect = 0;
5294 if (info->indirect) {
5295 struct agx_resource *rsrc = agx_resource(info->indirect);
5296 agx_batch_reads(batch, rsrc);
5297 indirect = rsrc->bo->va->addr + info->indirect_offset;
5298 }
5299
5300 /* Increment the pipeline stats query.
5301 *
5302 * TODO: Can we use the hardware counter for this?
5303 */
5304 struct agx_query *statistic =
5305 ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS];
5306
5307 struct agx_workgroup wg =
5308 agx_workgroup(info->block[0], info->block[1], info->block[2]);
5309
5310 if (statistic) {
5311 if (indirect) {
5312 uint64_t addr = agx_get_query_address(batch, statistic);
5313
5314 libagx_increment_cs_invocations(batch, agx_1d(1), AGX_BARRIER_ALL,
5315 indirect, addr,
5316 agx_workgroup_threads(wg));
5317 } else {
5318 agx_query_increment_cpu(ctx, statistic,
5319 agx_workgroup_threads(wg) * info->grid[0] *
5320 info->grid[1] * info->grid[2]);
5321 }
5322 }
5323
5324 agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
5325
5326 agx_batch_init_state(batch);
5327
5328 struct agx_uncompiled_shader *uncompiled =
5329 ctx->stage[PIPE_SHADER_COMPUTE].shader;
5330
5331 /* There is exactly one variant; get it */
5332 struct agx_compiled_shader *cs =
5333 _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;
5334
5335 struct agx_grid grid;
5336 if (indirect) {
5337 grid = agx_grid_indirect(indirect);
5338 } else {
5339 grid = agx_3d(0, 0, 0);
5340
5341 for (unsigned d = 0; d < 3; ++d) {
5342 grid.count[d] = ((info->grid[d] - 1) * info->block[d]) +
5343 (info->last_block[d] ?: info->block[d]);
5344 }
5345 }
5346
5347 agx_launch(batch, grid, wg, cs, NULL, PIPE_SHADER_COMPUTE,
5348 info->variable_shared_mem);
5349
5350 /* TODO: Dirty tracking? */
5351 agx_dirty_all(ctx);
5352
5353 batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = 0;
5354
5355 /* If the next dispatch might overflow, flush now. TODO: If this is ever hit
5356 * in practice, we can use CDM stream links.
5357 */
5358 size_t dispatch_upper_bound =
5359 AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
5360 AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
5361 AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
5362 AGX_CDM_BARRIER_LENGTH;
5363
5364 if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end)
5365 agx_flush_batch_for_reason(ctx, batch, "CDM overfull");
5366 }
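
/*
 * Sketch of the grid sizing above, illustration only: Gallium describes a
 * dispatch as full workgroups plus an optional partial last workgroup per
 * axis, while the hardware grid is sized in threads. The helper name is
 * hypothetical.
 */
static inline uint32_t
example_grid_threads_for_axis(uint32_t groups, uint32_t block,
                              uint32_t last_block)
{
   /* All but the last workgroup are full size; the last may be partial */
   return ((groups - 1) * block) + (last_block ? last_block : block);
}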
5367
5368 static void
5369 agx_set_global_binding(struct pipe_context *pipe, unsigned first,
5370 unsigned count, struct pipe_resource **resources,
5371 uint32_t **handles)
5372 {
5373 struct agx_context *ctx = agx_context(pipe);
5374 unsigned old_size =
5375 util_dynarray_num_elements(&ctx->global_buffers, *resources);
5376
5377 if (old_size < first + count) {
5378 /* we are screwed no matter what */
5379 if (!util_dynarray_grow(&ctx->global_buffers, *resources,
5380 (first + count) - old_size))
5381 unreachable("out of memory");
5382
5383 for (unsigned i = old_size; i < first + count; i++)
5384 *util_dynarray_element(&ctx->global_buffers, struct pipe_resource *,
5385 i) = NULL;
5386 }
5387
5388 for (unsigned i = 0; i < count; ++i) {
5389 struct pipe_resource **res = util_dynarray_element(
5390 &ctx->global_buffers, struct pipe_resource *, first + i);
5391 if (resources && resources[i]) {
5392 pipe_resource_reference(res, resources[i]);
5393
5394 /* The handle points to uint32_t, but space is allocated for 64
5395 * bits. We need to respect the offset passed in. This interface
5396 * is so bad.
5397 */
5398 uint64_t addr = 0;
5399 struct agx_resource *rsrc = agx_resource(resources[i]);
5400
5401 memcpy(&addr, handles[i], sizeof(addr));
5402 addr += rsrc->bo->va->addr;
5403 memcpy(handles[i], &addr, sizeof(addr));
5404 } else {
5405 pipe_resource_reference(res, NULL);
5406 }
5407 }
5408 }
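
/*
 * Sketch of the handle patching above, illustration only: the Gallium
 * interface hands over a uint32_t pointer, but 64 bits of storage sit
 * behind it, holding a buffer offset on input and expected to hold an
 * absolute GPU address on output. The helper name is hypothetical.
 */
static inline void
example_patch_global_handle(uint32_t *handle, uint64_t base_gpu_address)
{
   uint64_t addr = 0;

   /* Unaligned-safe read of the 64-bit offset stored behind the handle */
   memcpy(&addr, handle, sizeof(addr));
   addr += base_gpu_address;
   memcpy(handle, &addr, sizeof(addr));
}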
5409
5410 void agx_init_state_functions(struct pipe_context *ctx);
5411
5412 void
5413 agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
5414 const char *reason)
5415 {
5416 struct agx_context *ctx = batch->ctx;
5417 struct agx_device *dev = agx_device(ctx->base.screen);
5418 struct agx_resource *rsrc = agx_resource(surf->texture);
5419 struct ail_layout *layout = &rsrc->layout;
5420 unsigned level = surf->u.tex.level;
5421
5422 perf_debug(dev, "Decompressing in-place due to: %s", reason);
5423
5424 if (!batch->cdm.bo)
5425 batch->cdm = agx_encoder_allocate(batch, dev);
5426
5427 struct agx_ptr images = agx_pool_alloc_aligned(
5428 &batch->pool, sizeof(struct libagx_decompress_images), 64);
5429 struct libagx_decompress_images *img = images.cpu;
5430
5431 struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
5432 sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
5433 struct pipe_image_view view = image_view_for_surface(surf);
5434 agx_pack_texture(&img->compressed, rsrc, surf->format, &sampler_view);
5435 agx_batch_upload_pbe(batch, &img->uncompressed, &view, false, true, true,
5436 true);
5437
5438 struct agx_grid grid =
5439 agx_3d(ail_metadata_width_tl(layout, level) * 32,
5440 ail_metadata_height_tl(layout, level),
5441 surf->u.tex.last_layer - surf->u.tex.first_layer + 1);
5442
5443 libagx_decompress(batch, grid, AGX_BARRIER_ALL, layout,
5444 surf->u.tex.first_layer, level,
5445 agx_map_texture_gpu(rsrc, 0), images.gpu);
5446 }
5447
5448 void
5449 agx_init_state_functions(struct pipe_context *ctx)
5450 {
5451 ctx->create_blend_state = agx_create_blend_state;
5452 ctx->create_depth_stencil_alpha_state = agx_create_zsa_state;
5453 ctx->create_fs_state = agx_create_shader_state;
5454 ctx->create_rasterizer_state = agx_create_rs_state;
5455 ctx->create_sampler_state = agx_create_sampler_state;
5456 ctx->create_sampler_view = agx_create_sampler_view;
5457 ctx->create_surface = agx_create_surface;
5458 ctx->create_vertex_elements_state = agx_create_vertex_elements;
5459 ctx->create_vs_state = agx_create_shader_state;
5460 ctx->create_gs_state = agx_create_shader_state;
5461 ctx->create_tcs_state = agx_create_shader_state;
5462 ctx->create_tes_state = agx_create_shader_state;
5463 ctx->create_compute_state = agx_create_compute_state;
5464 ctx->bind_blend_state = agx_bind_blend_state;
5465 ctx->bind_depth_stencil_alpha_state = agx_bind_zsa_state;
5466 ctx->bind_sampler_states = agx_bind_sampler_states;
5467 ctx->bind_fs_state = agx_bind_fs_state;
5468 ctx->bind_rasterizer_state = agx_bind_rasterizer_state;
5469 ctx->bind_vertex_elements_state = agx_bind_vertex_elements_state;
5470 ctx->bind_vs_state = agx_bind_vs_state;
5471 ctx->bind_gs_state = agx_bind_gs_state;
5472 ctx->bind_tcs_state = agx_bind_tcs_state;
5473 ctx->bind_tes_state = agx_bind_tes_state;
5474 ctx->bind_compute_state = agx_bind_cs_state;
5475 ctx->delete_blend_state = agx_delete_state;
5476 ctx->delete_depth_stencil_alpha_state = agx_delete_state;
5477 ctx->delete_fs_state = agx_delete_shader_state;
5478 ctx->delete_compute_state = agx_delete_shader_state;
5479 ctx->delete_rasterizer_state = agx_delete_state;
5480 ctx->delete_sampler_state = agx_delete_sampler_state;
5481 ctx->delete_vertex_elements_state = agx_delete_state;
5482 ctx->delete_vs_state = agx_delete_shader_state;
5483 ctx->delete_gs_state = agx_delete_shader_state;
5484 ctx->delete_tcs_state = agx_delete_shader_state;
5485 ctx->delete_tes_state = agx_delete_shader_state;
5486 ctx->set_blend_color = agx_set_blend_color;
5487 ctx->set_clip_state = agx_set_clip_state;
5488 ctx->set_constant_buffer = agx_set_constant_buffer;
5489 ctx->set_shader_buffers = agx_set_shader_buffers;
5490 ctx->set_shader_images = agx_set_shader_images;
5491 ctx->set_sampler_views = agx_set_sampler_views;
5492 ctx->set_framebuffer_state = agx_set_framebuffer_state;
5493 ctx->set_polygon_stipple = agx_set_polygon_stipple;
5494 ctx->set_patch_vertices = agx_set_patch_vertices;
5495 ctx->set_sample_mask = agx_set_sample_mask;
5496 ctx->set_scissor_states = agx_set_scissor_states;
5497 ctx->set_stencil_ref = agx_set_stencil_ref;
5498 ctx->set_vertex_buffers = agx_set_vertex_buffers;
5499 ctx->set_viewport_states = agx_set_viewport_states;
5500 ctx->sampler_view_destroy = agx_sampler_view_destroy;
5501 ctx->surface_destroy = agx_surface_destroy;
5502 ctx->draw_vbo = agx_draw_vbo;
5503 ctx->launch_grid = agx_launch_grid;
5504 ctx->set_global_binding = agx_set_global_binding;
5505 ctx->texture_barrier = agx_texture_barrier;
5506 ctx->get_compute_state_info = agx_get_compute_state_info;
5507 ctx->set_tess_state = agx_set_tess_state;
5508 }
5509