• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2019-2020 Collabora, Ltd.
4  * Copyright 2014-2017 Broadcom
5  * Copyright 2010 Red Hat Inc.
6  * SPDX-License-Identifier: MIT
7  */
8 #include "agx_state.h"
9 #include <errno.h>
10 #include <stdio.h>
11 #include "asahi/compiler/agx_compile.h"
12 #include "asahi/compiler/agx_nir.h"
13 #include "asahi/genxml/agx_pack.h"
14 #include "asahi/layout/layout.h"
15 #include "asahi/lib/agx_abi.h"
16 #include "asahi/lib/agx_helpers.h"
17 #include "asahi/lib/agx_ppp.h"
18 #include "asahi/lib/agx_usc.h"
19 #include "asahi/libagx/compression.h"
20 #include "asahi/libagx/query.h"
21 #include "asahi/libagx/tessellator.h"
22 #include "compiler/nir/nir.h"
23 #include "compiler/nir/nir_serialize.h"
24 #include "compiler/shader_enums.h"
25 #include "gallium/auxiliary/nir/pipe_nir.h"
26 #include "gallium/auxiliary/nir/tgsi_to_nir.h"
27 #include "gallium/auxiliary/tgsi/tgsi_from_mesa.h"
28 #include "gallium/auxiliary/util/u_draw.h"
29 #include "gallium/auxiliary/util/u_framebuffer.h"
30 #include "gallium/auxiliary/util/u_helpers.h"
31 #include "gallium/auxiliary/util/u_prim_restart.h"
32 #include "gallium/auxiliary/util/u_viewport.h"
33 #include "pipe/p_context.h"
34 #include "pipe/p_defines.h"
35 #include "pipe/p_screen.h"
36 #include "pipe/p_state.h"
37 #include "util/bitscan.h"
38 #include "util/bitset.h"
39 #include "util/blend.h"
40 #include "util/blob.h"
41 #include "util/compiler.h"
42 #include "util/format/u_format.h"
43 #include "util/format/u_formats.h"
44 #include "util/half_float.h"
45 #include "util/hash_table.h"
46 #include "util/macros.h"
47 #include "util/ralloc.h"
48 #include "util/u_inlines.h"
49 #include "util/u_math.h"
50 #include "util/u_memory.h"
51 #include "util/u_prim.h"
52 #include "util/u_transfer.h"
53 #include "util/u_upload_mgr.h"
54 #include "agx_bg_eot.h"
55 #include "agx_bo.h"
56 #include "agx_device.h"
57 #include "agx_disk_cache.h"
58 #include "agx_linker.h"
59 #include "agx_nir.h"
60 #include "agx_nir_lower_gs.h"
61 #include "agx_nir_lower_vbo.h"
62 #include "agx_tilebuffer.h"
63 #include "libagx.h"
64 #include "libagx_dgc.h"
65 #include "libagx_shaders.h"
66 #include "nir_builder.h"
67 #include "nir_builder_opcodes.h"
68 #include "nir_intrinsics.h"
69 #include "nir_intrinsics_indices.h"
70 #include "nir_xfb_info.h"
71 #include "pool.h"
72 
73 void
agx_legalize_compression(struct agx_context * ctx,struct agx_resource * rsrc,enum pipe_format format)74 agx_legalize_compression(struct agx_context *ctx, struct agx_resource *rsrc,
75                          enum pipe_format format)
76 {
77    if (!ail_is_view_compatible(&rsrc->layout, format)) {
78       agx_decompress(ctx, rsrc, "Incompatible formats");
79    }
80 }
81 
82 static void
agx_set_shader_images(struct pipe_context * pctx,enum pipe_shader_type shader,unsigned start_slot,unsigned count,unsigned unbind_num_trailing_slots,const struct pipe_image_view * iviews)83 agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader,
84                       unsigned start_slot, unsigned count,
85                       unsigned unbind_num_trailing_slots,
86                       const struct pipe_image_view *iviews)
87 {
88    struct agx_context *ctx = agx_context(pctx);
89    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
90 
91    /* Unbind start_slot...start_slot+count */
92    if (!iviews) {
93       for (int i = start_slot;
94            i < start_slot + count + unbind_num_trailing_slots; i++) {
95          pipe_resource_reference(&ctx->stage[shader].images[i].resource, NULL);
96       }
97 
98       ctx->stage[shader].image_mask &=
99          ~BITFIELD64_MASK(count + unbind_num_trailing_slots) << start_slot;
100       return;
101    }
102 
103    /* Images writeable with pixel granularity are incompatible with
104     * compression. Decompress if necessary.
105     *
106     * Driver-internal images are used by the compute blitter and are exempt
107     * from these transitions, as it only uses compressed images when safe.
108     *
109     * We do this upfront because agx_decompress and agx_legalize_compression can
110     * call set_shader_images internall.
111     */
112    for (int i = 0; i < count; i++) {
113       const struct pipe_image_view *image = &iviews[i];
114       struct agx_resource *rsrc = agx_resource(image->resource);
115 
116       if (rsrc && !(image->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL)) {
117          if (!rsrc->layout.writeable_image &&
118              (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)) {
119 
120             agx_decompress(ctx, rsrc, "Shader image");
121          }
122 
123          /* Readable images may be compressed but are still subject to format
124           * reinterpretation rules.
125           */
126          agx_legalize_compression(ctx, rsrc, image->format);
127 
128          if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)
129             assert(rsrc->layout.writeable_image);
130       }
131    }
132 
133    /* Bind start_slot...start_slot+count */
134    for (int i = 0; i < count; i++) {
135       const struct pipe_image_view *image = &iviews[i];
136 
137       if (!image->resource) {
138          util_copy_image_view(&ctx->stage[shader].images[start_slot + i], NULL);
139          ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + i);
140       } else {
141          util_copy_image_view(&ctx->stage[shader].images[start_slot + i],
142                               image);
143          ctx->stage[shader].image_mask |= BITFIELD_BIT(start_slot + i);
144       }
145    }
146 
147    /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */
148    for (int i = 0; i < unbind_num_trailing_slots; i++) {
149       ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + count + i);
150       util_copy_image_view(&ctx->stage[shader].images[start_slot + count + i],
151                            NULL);
152    }
153 }
154 
155 static void
agx_set_shader_buffers(struct pipe_context * pctx,enum pipe_shader_type shader,unsigned start,unsigned count,const struct pipe_shader_buffer * buffers,unsigned writable_bitmask)156 agx_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader,
157                        unsigned start, unsigned count,
158                        const struct pipe_shader_buffer *buffers,
159                        unsigned writable_bitmask)
160 {
161    struct agx_context *ctx = agx_context(pctx);
162 
163    util_set_shader_buffers_mask(ctx->stage[shader].ssbo,
164                                 &ctx->stage[shader].ssbo_mask, buffers, start,
165                                 count);
166 
167    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SSBO;
168    ctx->stage[shader].ssbo_writable_mask &= ~(BITFIELD_MASK(count) << start);
169    ctx->stage[shader].ssbo_writable_mask |= writable_bitmask << start;
170 }
171 
172 static void
agx_set_blend_color(struct pipe_context * pctx,const struct pipe_blend_color * state)173 agx_set_blend_color(struct pipe_context *pctx,
174                     const struct pipe_blend_color *state)
175 {
176    struct agx_context *ctx = agx_context(pctx);
177 
178    if (state)
179       memcpy(&ctx->blend_color, state, sizeof(*state));
180 
181    ctx->dirty |= AGX_DIRTY_BLEND_COLOR;
182 }
183 
184 static void
agx_set_patch_vertices(struct pipe_context * pctx,unsigned char n)185 agx_set_patch_vertices(struct pipe_context *pctx, unsigned char n)
186 {
187    struct agx_context *ctx = agx_context(pctx);
188    ctx->patch_vertices = n;
189 }
190 
191 static void
agx_set_tess_state(struct pipe_context * pctx,const float default_outer_level[4],const float default_inner_level[2])192 agx_set_tess_state(struct pipe_context *pctx,
193                    const float default_outer_level[4],
194                    const float default_inner_level[2])
195 {
196    struct agx_context *ctx = agx_context(pctx);
197 
198    memcpy(ctx->default_outer_level, default_outer_level, 4 * sizeof(float));
199    memcpy(ctx->default_inner_level, default_inner_level, 2 * sizeof(float));
200 }
201 
202 static void *
agx_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)203 agx_create_blend_state(struct pipe_context *ctx,
204                        const struct pipe_blend_state *state)
205 {
206    struct agx_blend *so = CALLOC_STRUCT(agx_blend);
207    struct agx_blend_key *key = &so->key;
208 
209    key->alpha_to_coverage = state->alpha_to_coverage;
210    key->alpha_to_one = state->alpha_to_one;
211 
212    key->logicop_func =
213       state->logicop_enable ? state->logicop_func : PIPE_LOGICOP_COPY;
214 
215    for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
216       unsigned rti = state->independent_blend_enable ? i : 0;
217       struct pipe_rt_blend_state rt = state->rt[rti];
218 
219       if (state->logicop_enable || !rt.blend_enable) {
220          /* No blending, but we get the colour mask below */
221          key->rt[i] = (struct agx_blend_rt_key){
222             .rgb_func = PIPE_BLEND_ADD,
223             .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
224             .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
225 
226             .alpha_func = PIPE_BLEND_ADD,
227             .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
228             .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
229          };
230       } else {
231          key->rt[i].rgb_func = rt.rgb_func;
232          key->rt[i].rgb_src_factor = rt.rgb_src_factor;
233          key->rt[i].rgb_dst_factor = rt.rgb_dst_factor;
234 
235          key->rt[i].alpha_func = rt.alpha_func;
236          key->rt[i].alpha_src_factor = rt.alpha_src_factor;
237          key->rt[i].alpha_dst_factor = rt.alpha_dst_factor;
238       }
239 
240       key->rt[i].colormask = rt.colormask;
241 
242       if (rt.colormask)
243          so->store |= (PIPE_CLEAR_COLOR0 << i);
244    }
245 
246    return so;
247 }
248 
249 static void
agx_bind_blend_state(struct pipe_context * pctx,void * cso)250 agx_bind_blend_state(struct pipe_context *pctx, void *cso)
251 {
252    struct agx_context *ctx = agx_context(pctx);
253    ctx->blend = cso;
254    ctx->dirty |= AGX_DIRTY_BLEND;
255 }
256 
257 static const enum agx_stencil_op agx_stencil_ops[PIPE_STENCIL_OP_INVERT + 1] = {
258    [PIPE_STENCIL_OP_KEEP] = AGX_STENCIL_OP_KEEP,
259    [PIPE_STENCIL_OP_ZERO] = AGX_STENCIL_OP_ZERO,
260    [PIPE_STENCIL_OP_REPLACE] = AGX_STENCIL_OP_REPLACE,
261    [PIPE_STENCIL_OP_INCR] = AGX_STENCIL_OP_INCR_SAT,
262    [PIPE_STENCIL_OP_DECR] = AGX_STENCIL_OP_DECR_SAT,
263    [PIPE_STENCIL_OP_INCR_WRAP] = AGX_STENCIL_OP_INCR_WRAP,
264    [PIPE_STENCIL_OP_DECR_WRAP] = AGX_STENCIL_OP_DECR_WRAP,
265    [PIPE_STENCIL_OP_INVERT] = AGX_STENCIL_OP_INVERT,
266 };
267 
268 static void
agx_pack_stencil(struct agx_fragment_stencil_packed * out,struct pipe_stencil_state st)269 agx_pack_stencil(struct agx_fragment_stencil_packed *out,
270                  struct pipe_stencil_state st)
271 {
272    if (st.enabled) {
273       agx_pack(out, FRAGMENT_STENCIL, cfg) {
274          cfg.compare = (enum agx_zs_func)st.func;
275          cfg.write_mask = st.writemask;
276          cfg.read_mask = st.valuemask;
277 
278          cfg.depth_pass = agx_stencil_ops[st.zpass_op];
279          cfg.depth_fail = agx_stencil_ops[st.zfail_op];
280          cfg.stencil_fail = agx_stencil_ops[st.fail_op];
281       }
282    } else {
283       agx_pack(out, FRAGMENT_STENCIL, cfg) {
284          cfg.compare = AGX_ZS_FUNC_ALWAYS;
285          cfg.write_mask = 0xFF;
286          cfg.read_mask = 0xFF;
287 
288          cfg.depth_pass = AGX_STENCIL_OP_KEEP;
289          cfg.depth_fail = AGX_STENCIL_OP_KEEP;
290          cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
291       }
292    }
293 }
294 
295 static void *
agx_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)296 agx_create_zsa_state(struct pipe_context *ctx,
297                      const struct pipe_depth_stencil_alpha_state *state)
298 {
299    struct agx_zsa *so = CALLOC_STRUCT(agx_zsa);
300    assert(!state->depth_bounds_test && "todo");
301 
302    so->base = *state;
303 
304    /* Handle the enable flag */
305    enum pipe_compare_func depth_func =
306       state->depth_enabled ? state->depth_func : PIPE_FUNC_ALWAYS;
307 
308    /* Z func can otherwise be used as-is */
309    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NEVER == AGX_ZS_FUNC_NEVER);
310    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LESS == AGX_ZS_FUNC_LESS);
311    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_EQUAL == AGX_ZS_FUNC_EQUAL);
312    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LEQUAL == AGX_ZS_FUNC_LEQUAL);
313    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GREATER == AGX_ZS_FUNC_GREATER);
314    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NOTEQUAL == AGX_ZS_FUNC_NOT_EQUAL);
315    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GEQUAL == AGX_ZS_FUNC_GEQUAL);
316    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_ALWAYS == AGX_ZS_FUNC_ALWAYS);
317 
318    agx_pack(&so->depth, FRAGMENT_FACE, cfg) {
319       cfg.depth_function = (enum agx_zs_func)depth_func;
320       cfg.disable_depth_write = !state->depth_writemask;
321    }
322 
323    agx_pack_stencil(&so->front_stencil, state->stencil[0]);
324 
325    if (state->stencil[1].enabled) {
326       agx_pack_stencil(&so->back_stencil, state->stencil[1]);
327    } else {
328       /* One sided stencil */
329       so->back_stencil = so->front_stencil;
330    }
331 
332    if (depth_func != PIPE_FUNC_NEVER && depth_func != PIPE_FUNC_ALWAYS)
333       so->load |= PIPE_CLEAR_DEPTH;
334 
335    if (state->depth_writemask) {
336       so->load |= PIPE_CLEAR_DEPTH;
337       so->store |= PIPE_CLEAR_DEPTH;
338    }
339 
340    if (state->stencil[0].enabled) {
341       so->load |= PIPE_CLEAR_STENCIL; /* TODO: Optimize */
342       so->store |= PIPE_CLEAR_STENCIL;
343    }
344 
345    return so;
346 }
347 
348 static void
agx_bind_zsa_state(struct pipe_context * pctx,void * cso)349 agx_bind_zsa_state(struct pipe_context *pctx, void *cso)
350 {
351    struct agx_context *ctx = agx_context(pctx);
352    ctx->zs = cso;
353    ctx->dirty |= AGX_DIRTY_ZS;
354 }
355 
356 static enum agx_polygon_mode
agx_translate_polygon_mode(unsigned mode)357 agx_translate_polygon_mode(unsigned mode)
358 {
359    switch (mode) {
360    case PIPE_POLYGON_MODE_FILL:
361       return AGX_POLYGON_MODE_FILL;
362    case PIPE_POLYGON_MODE_POINT:
363       return AGX_POLYGON_MODE_POINT;
364    case PIPE_POLYGON_MODE_LINE:
365       return AGX_POLYGON_MODE_LINE;
366    default:
367       unreachable("Unsupported polygon mode");
368    }
369 }
370 
371 static void *
agx_create_rs_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * cso)372 agx_create_rs_state(struct pipe_context *ctx,
373                     const struct pipe_rasterizer_state *cso)
374 {
375    struct agx_rasterizer *so = CALLOC_STRUCT(agx_rasterizer);
376    so->base = *cso;
377 
378    agx_pack(so->cull, CULL, cfg) {
379       cfg.cull_front = cso->cull_face & PIPE_FACE_FRONT;
380       cfg.cull_back = cso->cull_face & PIPE_FACE_BACK;
381       cfg.depth_clip = cso->depth_clip_near;
382       cfg.depth_clamp = !cso->depth_clip_near;
383       cfg.flat_shading_vertex =
384          cso->flatshade_first ? AGX_PPP_VERTEX_0 : AGX_PPP_VERTEX_2;
385       cfg.rasterizer_discard = cso->rasterizer_discard;
386    };
387 
388    /* Two-sided polygon mode doesn't seem to work on G13. Apple's OpenGL
389     * implementation lowers to multiple draws with culling. Warn.
390     */
391    if (unlikely(cso->fill_front != cso->fill_back)) {
392       agx_msg("Warning: Two-sided fill modes are unsupported, "
393               "rendering may be incorrect.\n");
394    }
395 
396    so->polygon_mode = agx_translate_polygon_mode(cso->fill_front);
397    so->line_width = agx_pack_line_width(cso->line_width);
398    so->depth_bias = util_get_offset(cso, cso->fill_front);
399 
400    return so;
401 }
402 
403 static void
agx_bind_rasterizer_state(struct pipe_context * pctx,void * cso)404 agx_bind_rasterizer_state(struct pipe_context *pctx, void *cso)
405 {
406    struct agx_context *ctx = agx_context(pctx);
407    struct agx_rasterizer *so = cso;
408 
409    bool base_cso_changed = (cso == NULL) || (ctx->rast == NULL);
410 
411    /* Check if scissor or depth bias state has changed, since scissor/depth bias
412     * enable is part of the rasterizer state but everything else needed for
413     * scissors and depth bias is part of the scissor/depth bias arrays */
414    bool scissor_zbias_changed = base_cso_changed ||
415                                 (ctx->rast->base.scissor != so->base.scissor) ||
416                                 (ctx->rast->depth_bias != so->depth_bias);
417 
418    ctx->dirty |= AGX_DIRTY_RS;
419 
420    if (scissor_zbias_changed)
421       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
422 
423    if (base_cso_changed ||
424        (ctx->rast->base.sprite_coord_mode != so->base.sprite_coord_mode))
425       ctx->dirty |= AGX_DIRTY_SPRITE_COORD_MODE;
426 
427    ctx->rast = so;
428 }
429 
430 static bool
has_edgeflags(struct agx_context * ctx,enum mesa_prim mode)431 has_edgeflags(struct agx_context *ctx, enum mesa_prim mode)
432 {
433    return ctx->stage[PIPE_SHADER_VERTEX].shader->info.has_edgeflags &&
434           mode == MESA_PRIM_TRIANGLES &&
435           (ctx->rast->base.fill_front != PIPE_POLYGON_MODE_FILL);
436 }
437 
438 static enum agx_wrap
agx_wrap_from_pipe(enum pipe_tex_wrap in)439 agx_wrap_from_pipe(enum pipe_tex_wrap in)
440 {
441    switch (in) {
442    case PIPE_TEX_WRAP_REPEAT:
443       return AGX_WRAP_REPEAT;
444    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445       return AGX_WRAP_CLAMP_TO_EDGE;
446    case PIPE_TEX_WRAP_MIRROR_REPEAT:
447       return AGX_WRAP_MIRRORED_REPEAT;
448    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
449       return AGX_WRAP_CLAMP_TO_BORDER;
450    case PIPE_TEX_WRAP_CLAMP:
451       return AGX_WRAP_CLAMP_GL;
452    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
453       return AGX_WRAP_MIRRORED_CLAMP_TO_EDGE;
454    default:
455       unreachable("Invalid wrap mode");
456    }
457 }
458 
459 static enum agx_mip_filter
agx_mip_filter_from_pipe(enum pipe_tex_mipfilter in)460 agx_mip_filter_from_pipe(enum pipe_tex_mipfilter in)
461 {
462    switch (in) {
463    case PIPE_TEX_MIPFILTER_NEAREST:
464       return AGX_MIP_FILTER_NEAREST;
465    case PIPE_TEX_MIPFILTER_LINEAR:
466       return AGX_MIP_FILTER_LINEAR;
467    case PIPE_TEX_MIPFILTER_NONE:
468       return AGX_MIP_FILTER_NONE;
469    }
470 
471    unreachable("Invalid mip filter");
472 }
473 
474 static const enum agx_compare_func agx_compare_funcs[PIPE_FUNC_ALWAYS + 1] = {
475    [PIPE_FUNC_NEVER] = AGX_COMPARE_FUNC_NEVER,
476    [PIPE_FUNC_LESS] = AGX_COMPARE_FUNC_LESS,
477    [PIPE_FUNC_EQUAL] = AGX_COMPARE_FUNC_EQUAL,
478    [PIPE_FUNC_LEQUAL] = AGX_COMPARE_FUNC_LEQUAL,
479    [PIPE_FUNC_GREATER] = AGX_COMPARE_FUNC_GREATER,
480    [PIPE_FUNC_NOTEQUAL] = AGX_COMPARE_FUNC_NOT_EQUAL,
481    [PIPE_FUNC_GEQUAL] = AGX_COMPARE_FUNC_GEQUAL,
482    [PIPE_FUNC_ALWAYS] = AGX_COMPARE_FUNC_ALWAYS,
483 };
484 
485 static const enum agx_filter agx_filters[] = {
486    [PIPE_TEX_FILTER_LINEAR] = AGX_FILTER_LINEAR,
487    [PIPE_TEX_FILTER_NEAREST] = AGX_FILTER_NEAREST,
488 };
489 
490 static enum pipe_format
fixup_border_zs(enum pipe_format orig,union pipe_color_union * c)491 fixup_border_zs(enum pipe_format orig, union pipe_color_union *c)
492 {
493    switch (orig) {
494    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
495    case PIPE_FORMAT_Z24X8_UNORM:
496       /* Z24 is internally promoted to Z32F via transfer_helper. These formats
497        * are normalized so should get clamped, but Z32F does not get clamped, so
498        * we clamp here.
499        */
500       c->f[0] = SATURATE(c->f[0]);
501       return PIPE_FORMAT_Z32_FLOAT;
502 
503    case PIPE_FORMAT_X24S8_UINT:
504    case PIPE_FORMAT_X32_S8X24_UINT:
505       /* Separate stencil is internally promoted */
506       return PIPE_FORMAT_S8_UINT;
507 
508    default:
509       return orig;
510    }
511 }
512 
513 static void *
agx_create_sampler_state(struct pipe_context * pctx,const struct pipe_sampler_state * state)514 agx_create_sampler_state(struct pipe_context *pctx,
515                          const struct pipe_sampler_state *state)
516 {
517    struct agx_sampler_state *so = CALLOC_STRUCT(agx_sampler_state);
518    so->base = *state;
519 
520    /* We report a max texture LOD bias of 16, so clamp appropriately */
521    float lod_bias = CLAMP(state->lod_bias, -16.0, 16.0);
522    so->lod_bias_as_fp16 = _mesa_float_to_half(lod_bias);
523 
524    agx_pack(&so->desc, SAMPLER, cfg) {
525       cfg.minimum_lod = state->min_lod;
526       cfg.maximum_lod = state->max_lod;
527       cfg.maximum_anisotropy =
528          util_next_power_of_two(MAX2(state->max_anisotropy, 1));
529       cfg.magnify = agx_filters[state->mag_img_filter];
530       cfg.minify = agx_filters[state->min_img_filter];
531       cfg.mip_filter = agx_mip_filter_from_pipe(state->min_mip_filter);
532       cfg.wrap_s = agx_wrap_from_pipe(state->wrap_s);
533       cfg.wrap_t = agx_wrap_from_pipe(state->wrap_t);
534       cfg.wrap_r = agx_wrap_from_pipe(state->wrap_r);
535       cfg.pixel_coordinates = state->unnormalized_coords;
536       cfg.compare_func = agx_compare_funcs[state->compare_func];
537       cfg.compare_enable = state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE;
538       cfg.seamful_cube_maps = !state->seamless_cube_map;
539 
540       if (state->border_color_format != PIPE_FORMAT_NONE) {
541          /* TODO: Optimize to use compact descriptors for black/white borders */
542          so->uses_custom_border = true;
543          cfg.border_colour = AGX_BORDER_COLOUR_CUSTOM;
544       }
545    }
546 
547    memcpy(&so->desc_without_custom_border, &so->desc, sizeof(so->desc));
548 
549    if (so->uses_custom_border) {
550       union pipe_color_union border = state->border_color;
551       enum pipe_format format =
552          fixup_border_zs(state->border_color_format, &border);
553 
554       agx_pack_border(&so->border, border.ui, format);
555 
556       /* Neutralize the bindless-safe descriptor. XXX: This is a hack. */
557       so->desc_without_custom_border.opaque[1] &= ~(1u << 23);
558    }
559 
560    return so;
561 }
562 
563 static void
agx_delete_sampler_state(struct pipe_context * ctx,void * state)564 agx_delete_sampler_state(struct pipe_context *ctx, void *state)
565 {
566    struct agx_sampler_state *so = state;
567    FREE(so);
568 }
569 
570 static void
agx_bind_sampler_states(struct pipe_context * pctx,enum pipe_shader_type shader,unsigned start,unsigned count,void ** states)571 agx_bind_sampler_states(struct pipe_context *pctx, enum pipe_shader_type shader,
572                         unsigned start, unsigned count, void **states)
573 {
574    struct agx_context *ctx = agx_context(pctx);
575 
576    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SAMPLER;
577 
578    for (unsigned i = 0; i < count; i++) {
579       unsigned p = start + i;
580       ctx->stage[shader].samplers[p] = states ? states[i] : NULL;
581       if (ctx->stage[shader].samplers[p])
582          ctx->stage[shader].valid_samplers |= BITFIELD_BIT(p);
583       else
584          ctx->stage[shader].valid_samplers &= ~BITFIELD_BIT(p);
585    }
586 
587    ctx->stage[shader].sampler_count =
588       util_last_bit(ctx->stage[shader].valid_samplers);
589 
590    /* Recalculate whether we need custom borders */
591    ctx->stage[shader].custom_borders = false;
592 
593    u_foreach_bit(i, ctx->stage[shader].valid_samplers) {
594       if (ctx->stage[shader].samplers[i]->uses_custom_border)
595          ctx->stage[shader].custom_borders = true;
596    }
597 }
598 
599 static enum agx_texture_dimension
agx_translate_tex_dim(enum pipe_texture_target dim,unsigned samples)600 agx_translate_tex_dim(enum pipe_texture_target dim, unsigned samples)
601 {
602    assert(samples >= 1);
603 
604    switch (dim) {
605    case PIPE_BUFFER:
606    case PIPE_TEXTURE_1D:
607       /* Lowered to 2D */
608       assert(samples == 1);
609       return AGX_TEXTURE_DIMENSION_2D;
610 
611    case PIPE_TEXTURE_RECT:
612    case PIPE_TEXTURE_2D:
613       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED
614                          : AGX_TEXTURE_DIMENSION_2D;
615 
616    case PIPE_TEXTURE_1D_ARRAY:
617       assert(samples == 1);
618       /* Lowered to 2D */
619       FALLTHROUGH;
620    case PIPE_TEXTURE_2D_ARRAY:
621       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED
622                          : AGX_TEXTURE_DIMENSION_2D_ARRAY;
623 
624    case PIPE_TEXTURE_3D:
625       assert(samples == 1);
626       return AGX_TEXTURE_DIMENSION_3D;
627 
628    case PIPE_TEXTURE_CUBE:
629       assert(samples == 1);
630       return AGX_TEXTURE_DIMENSION_CUBE;
631 
632    case PIPE_TEXTURE_CUBE_ARRAY:
633       assert(samples == 1);
634       return AGX_TEXTURE_DIMENSION_CUBE_ARRAY;
635 
636    default:
637       unreachable("Unsupported texture dimension");
638    }
639 }
640 
641 static bool
target_is_cube(enum pipe_texture_target target)642 target_is_cube(enum pipe_texture_target target)
643 {
644    return target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY;
645 }
646 
647 static void
agx_pack_texture(void * out,struct agx_resource * rsrc,enum pipe_format format,const struct pipe_sampler_view * state)648 agx_pack_texture(void *out, struct agx_resource *rsrc,
649                  enum pipe_format format /* override */,
650                  const struct pipe_sampler_view *state)
651 {
652    const struct util_format_description *desc = util_format_description(format);
653 
654    assert(ail_is_valid_pixel_format(format));
655 
656    uint8_t format_swizzle[4] = {
657       desc->swizzle[0],
658       desc->swizzle[1],
659       desc->swizzle[2],
660       desc->swizzle[3],
661    };
662 
663    if (util_format_is_depth_or_stencil(format)) {
664       assert(!util_format_is_depth_and_stencil(format) &&
665              "separate stencil always used");
666 
667       /* Broadcast depth and stencil */
668       format_swizzle[0] = 0;
669       format_swizzle[1] = 0;
670       format_swizzle[2] = 0;
671       format_swizzle[3] = 0;
672    }
673 
674    /* We only have a single swizzle for the user swizzle and the format fixup,
675     * so compose them now. */
676    uint8_t out_swizzle[4];
677    uint8_t view_swizzle[4] = {state->swizzle_r, state->swizzle_g,
678                               state->swizzle_b, state->swizzle_a};
679 
680    util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle);
681 
682    unsigned first_layer =
683       (state->target == PIPE_BUFFER) ? 0 : state->u.tex.first_layer;
684 
685    /* Pack the descriptor into GPU memory */
686    agx_pack(out, TEXTURE, cfg) {
687       cfg.dimension = agx_translate_tex_dim(state->target,
688                                             util_res_sample_count(&rsrc->base));
689       cfg.layout = agx_translate_layout(rsrc->layout.tiling);
690       cfg.channels = ail_pixel_format[format].channels;
691       cfg.type = ail_pixel_format[format].type;
692       cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]);
693       cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]);
694       cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]);
695       cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]);
696 
697       if (state->target == PIPE_BUFFER) {
698          unsigned size_el =
699             agx_texture_buffer_size_el(format, state->u.buf.size);
700 
701          /* Use a 2D texture to increase the maximum size */
702          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
703          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
704          cfg.first_level = cfg.last_level = 0;
705          cfg.buffer_size_sw = size_el;
706          cfg.buffer_offset_sw = 0;
707       } else {
708          cfg.width = rsrc->base.width0;
709          cfg.height = rsrc->base.height0;
710          cfg.first_level = state->u.tex.first_level;
711          cfg.last_level = state->u.tex.last_level;
712       }
713 
714       cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
715       cfg.unk_mipmapped = rsrc->mipmapped;
716       cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3;
717 
718       if (ail_is_compressed(&rsrc->layout)) {
719          cfg.compressed_1 = true;
720          cfg.extended = true;
721       }
722 
723       cfg.address = agx_map_texture_gpu(rsrc, first_layer);
724 
725       if (state->target == PIPE_BUFFER)
726          cfg.address += state->u.buf.offset;
727 
728       if (ail_is_compressed(&rsrc->layout)) {
729          cfg.acceleration_buffer =
730             agx_map_texture_gpu(rsrc, 0) + rsrc->layout.metadata_offset_B +
731             (first_layer * rsrc->layout.compression_layer_stride_B);
732       }
733 
734       if (state->target == PIPE_TEXTURE_3D) {
735          cfg.depth = rsrc->base.depth0;
736       } else if (state->target == PIPE_BUFFER) {
737          cfg.depth = 1;
738       } else {
739          unsigned layers =
740             state->u.tex.last_layer - state->u.tex.first_layer + 1;
741 
742          if (target_is_cube(state->target))
743             layers /= 6;
744 
745          if (rsrc->layout.tiling == AIL_TILING_LINEAR &&
746              (state->target == PIPE_TEXTURE_1D_ARRAY ||
747               state->target == PIPE_TEXTURE_2D_ARRAY)) {
748 
749             cfg.depth_linear = layers;
750             cfg.layer_stride_linear = (rsrc->layout.layer_stride_B - 0x80);
751             cfg.extended = true;
752          } else {
753             assert((rsrc->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
754             cfg.depth = layers;
755          }
756       }
757 
758       if (rsrc->base.nr_samples > 1)
759          cfg.samples = agx_translate_sample_count(rsrc->base.nr_samples);
760 
761       if (state->target == PIPE_BUFFER) {
762          cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16;
763       } else if (rsrc->layout.tiling == AIL_TILING_LINEAR) {
764          cfg.stride = ail_get_linear_stride_B(&rsrc->layout, 0) - 16;
765       } else {
766          assert(rsrc->layout.tiling == AIL_TILING_TWIDDLED ||
767                 rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED);
768 
769          cfg.page_aligned_layers = rsrc->layout.page_aligned_layers;
770       }
771    }
772 }
773 
774 static struct pipe_sampler_view *
agx_create_sampler_view(struct pipe_context * pctx,struct pipe_resource * orig_texture,const struct pipe_sampler_view * state)775 agx_create_sampler_view(struct pipe_context *pctx,
776                         struct pipe_resource *orig_texture,
777                         const struct pipe_sampler_view *state)
778 {
779    struct agx_resource *rsrc = agx_resource(orig_texture);
780    struct agx_sampler_view *so = CALLOC_STRUCT(agx_sampler_view);
781 
782    if (!so)
783       return NULL;
784 
785    struct pipe_resource *texture = orig_texture;
786    enum pipe_format format = state->format;
787 
788    const struct util_format_description *desc = util_format_description(format);
789 
790    /* Separate stencil always used on G13, so we need to fix up for Z32S8 */
791    if (util_format_has_stencil(desc) && rsrc->separate_stencil) {
792       if (util_format_has_depth(desc)) {
793          /* Reinterpret as the depth-only part */
794          format = util_format_get_depth_only(format);
795       } else {
796          /* Use the stencil-only-part */
797          rsrc = rsrc->separate_stencil;
798          texture = &rsrc->base;
799          format = texture->format;
800       }
801    }
802 
803    agx_legalize_compression(agx_context(pctx), rsrc, format);
804 
805    /* Save off the resource that we actually use, with the stencil fixed up */
806    so->rsrc = rsrc;
807    so->format = format;
808 
809    so->base = *state;
810    so->base.texture = NULL;
811    pipe_resource_reference(&so->base.texture, orig_texture);
812    pipe_reference_init(&so->base.reference, 1);
813    so->base.context = pctx;
814    return &so->base;
815 }
816 
817 static void
agx_set_sampler_views(struct pipe_context * pctx,enum pipe_shader_type shader,unsigned start,unsigned count,unsigned unbind_num_trailing_slots,bool take_ownership,struct pipe_sampler_view ** views)818 agx_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader,
819                       unsigned start, unsigned count,
820                       unsigned unbind_num_trailing_slots, bool take_ownership,
821                       struct pipe_sampler_view **views)
822 {
823    struct agx_context *ctx = agx_context(pctx);
824    unsigned new_nr = 0;
825    unsigned i;
826 
827    assert(start == 0);
828 
829    if (!views)
830       count = 0;
831 
832    for (i = 0; i < count; ++i) {
833       if (take_ownership) {
834          pipe_sampler_view_reference(
835             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
836          ctx->stage[shader].textures[i] = (struct agx_sampler_view *)views[i];
837       } else {
838          pipe_sampler_view_reference(
839             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i],
840             views[i]);
841       }
842    }
843 
844    for (; i < count + unbind_num_trailing_slots; i++) {
845       pipe_sampler_view_reference(
846          (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
847    }
848 
849    for (unsigned t = 0; t < MAX2(ctx->stage[shader].texture_count, count);
850         ++t) {
851       if (ctx->stage[shader].textures[t])
852          new_nr = t + 1;
853    }
854 
855    ctx->stage[shader].texture_count = new_nr;
856    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
857 }
858 
859 static void
agx_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * pview)860 agx_sampler_view_destroy(struct pipe_context *ctx,
861                          struct pipe_sampler_view *pview)
862 {
863    struct agx_sampler_view *view = (struct agx_sampler_view *)pview;
864    pipe_resource_reference(&view->base.texture, NULL);
865    FREE(view);
866 }
867 
868 static struct pipe_surface *
agx_create_surface(struct pipe_context * ctx,struct pipe_resource * texture,const struct pipe_surface * surf_tmpl)869 agx_create_surface(struct pipe_context *ctx, struct pipe_resource *texture,
870                    const struct pipe_surface *surf_tmpl)
871 {
872    agx_legalize_compression(agx_context(ctx), agx_resource(texture),
873                             surf_tmpl->format);
874 
875    struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
876 
877    if (!surface)
878       return NULL;
879 
880    unsigned level = surf_tmpl->u.tex.level;
881 
882    pipe_reference_init(&surface->reference, 1);
883    pipe_resource_reference(&surface->texture, texture);
884 
885    assert(texture->target != PIPE_BUFFER && "buffers are not renderable");
886 
887    surface->context = ctx;
888    surface->format = surf_tmpl->format;
889    surface->nr_samples = surf_tmpl->nr_samples;
890    surface->width = u_minify(texture->width0, level);
891    surface->height = u_minify(texture->height0, level);
892    surface->texture = texture;
893    surface->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
894    surface->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
895    surface->u.tex.level = level;
896 
897    return surface;
898 }
899 
900 static void
agx_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)901 agx_set_clip_state(struct pipe_context *ctx,
902                    const struct pipe_clip_state *state)
903 {
904 }
905 
906 static void
agx_set_polygon_stipple(struct pipe_context * pctx,const struct pipe_poly_stipple * state)907 agx_set_polygon_stipple(struct pipe_context *pctx,
908                         const struct pipe_poly_stipple *state)
909 {
910    struct agx_context *ctx = agx_context(pctx);
911 
912    memcpy(ctx->poly_stipple, state->stipple, sizeof(ctx->poly_stipple));
913    ctx->dirty |= AGX_DIRTY_POLY_STIPPLE;
914 }
915 
916 static void
agx_set_sample_mask(struct pipe_context * pipe,unsigned sample_mask)917 agx_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
918 {
919    struct agx_context *ctx = agx_context(pipe);
920 
921    /* Optimization: At most MSAA 4x supported, so normalize to avoid pointless
922     * dirtying switching between e.g. 0xFFFF and 0xFFFFFFFF masks.
923     */
924    unsigned new_mask = sample_mask & BITFIELD_MASK(4);
925 
926    if (ctx->sample_mask != new_mask) {
927       ctx->sample_mask = new_mask;
928       ctx->dirty |= AGX_DIRTY_SAMPLE_MASK;
929    }
930 }
931 
932 static void
agx_set_scissor_states(struct pipe_context * pctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * scissor)933 agx_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
934                        unsigned num_scissors,
935                        const struct pipe_scissor_state *scissor)
936 {
937    struct agx_context *ctx = agx_context(pctx);
938 
939    STATIC_ASSERT(sizeof(ctx->scissor[0]) == sizeof(*scissor));
940    assert(start_slot + num_scissors <= AGX_MAX_VIEWPORTS);
941 
942    memcpy(&ctx->scissor[start_slot], scissor, sizeof(*scissor) * num_scissors);
943    ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
944 }
945 
946 static void
agx_set_stencil_ref(struct pipe_context * pctx,const struct pipe_stencil_ref state)947 agx_set_stencil_ref(struct pipe_context *pctx,
948                     const struct pipe_stencil_ref state)
949 {
950    struct agx_context *ctx = agx_context(pctx);
951    ctx->stencil_ref = state;
952    ctx->dirty |= AGX_DIRTY_STENCIL_REF;
953 }
954 
955 static void
agx_set_viewport_states(struct pipe_context * pctx,unsigned start_slot,unsigned num_viewports,const struct pipe_viewport_state * vp)956 agx_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
957                         unsigned num_viewports,
958                         const struct pipe_viewport_state *vp)
959 {
960    struct agx_context *ctx = agx_context(pctx);
961 
962    STATIC_ASSERT(sizeof(ctx->viewport[0]) == sizeof(*vp));
963    assert(start_slot + num_viewports <= AGX_MAX_VIEWPORTS);
964 
965    memcpy(&ctx->viewport[start_slot], vp, sizeof(*vp) * num_viewports);
966    ctx->dirty |= AGX_DIRTY_VIEWPORT;
967 }
968 
969 static void
agx_get_scissor_extents(const struct pipe_viewport_state * vp,const struct pipe_scissor_state * ss,const struct pipe_framebuffer_state * fb,unsigned * minx,unsigned * miny,unsigned * maxx,unsigned * maxy)970 agx_get_scissor_extents(const struct pipe_viewport_state *vp,
971                         const struct pipe_scissor_state *ss,
972                         const struct pipe_framebuffer_state *fb, unsigned *minx,
973                         unsigned *miny, unsigned *maxx, unsigned *maxy)
974 {
975    float trans_x = vp->translate[0], trans_y = vp->translate[1];
976    float abs_scale_x = fabsf(vp->scale[0]), abs_scale_y = fabsf(vp->scale[1]);
977 
978    /* Calculate the extent of the viewport. Note if a particular dimension of
979     * the viewport is an odd number of pixels, both the translate and the scale
980     * will have a fractional part of 0.5, so adding and subtracting them yields
981     * an integer. Therefore we don't need to round explicitly */
982    *minx = CLAMP((int)(trans_x - abs_scale_x), 0, fb->width);
983    *miny = CLAMP((int)(trans_y - abs_scale_y), 0, fb->height);
984    *maxx = CLAMP((int)(trans_x + abs_scale_x), 0, fb->width);
985    *maxy = CLAMP((int)(trans_y + abs_scale_y), 0, fb->height);
986 
987    if (ss) {
988       *minx = MAX2(ss->minx, *minx);
989       *miny = MAX2(ss->miny, *miny);
990       *maxx = MIN2(ss->maxx, *maxx);
991       *maxy = MIN2(ss->maxy, *maxy);
992    }
993 }
994 
995 static void
agx_upload_viewport_scissor(struct agx_pool * pool,struct agx_batch * batch,uint8_t ** out,const struct pipe_viewport_state * vp,const struct pipe_scissor_state * ss,bool clip_halfz,bool multi_viewport)996 agx_upload_viewport_scissor(struct agx_pool *pool, struct agx_batch *batch,
997                             uint8_t **out, const struct pipe_viewport_state *vp,
998                             const struct pipe_scissor_state *ss,
999                             bool clip_halfz, bool multi_viewport)
1000 {
1001    /* Number of viewports/scissors isn't precisely determinable in Gallium, so
1002     * just key off whether we can write to anything other than viewport 0. This
1003     * could be tuned in the future.
1004     */
1005    unsigned count = multi_viewport ? AGX_MAX_VIEWPORTS : 1;
1006 
1007    /* Allocate scissor descriptors */
1008    unsigned index = batch->scissor.size / AGX_SCISSOR_LENGTH;
1009    struct agx_scissor_packed *scissors =
1010       util_dynarray_grow_bytes(&batch->scissor, count, AGX_SCISSOR_LENGTH);
1011 
1012    unsigned minx[AGX_MAX_VIEWPORTS], miny[AGX_MAX_VIEWPORTS];
1013    unsigned maxx[AGX_MAX_VIEWPORTS], maxy[AGX_MAX_VIEWPORTS];
1014 
1015    /* Upload each scissor */
1016    for (unsigned i = 0; i < count; ++i) {
1017       agx_get_scissor_extents(&vp[i], ss ? &ss[i] : NULL, &batch->key, &minx[i],
1018                               &miny[i], &maxx[i], &maxy[i]);
1019 
1020       float minz, maxz;
1021       util_viewport_zmin_zmax(vp, clip_halfz, &minz, &maxz);
1022 
1023       agx_pack(scissors + i, SCISSOR, cfg) {
1024          cfg.min_x = minx[i];
1025          cfg.min_y = miny[i];
1026          cfg.min_z = minz;
1027          cfg.max_x = maxx[i];
1028          cfg.max_y = maxy[i];
1029          cfg.max_z = maxz;
1030       }
1031    }
1032 
1033    /* Upload state */
1034    struct AGX_PPP_HEADER present = {
1035       .depth_bias_scissor = true,
1036       .region_clip = true,
1037       .viewport = true,
1038       .viewport_count = count,
1039    };
1040 
1041    size_t size = agx_ppp_update_size(&present);
1042    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
1043    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
1044 
1045    agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
1046       cfg.scissor = index;
1047 
1048       /* Use the current depth bias, we allocate linearly */
1049       unsigned count = batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
1050       cfg.depth_bias = count ? count - 1 : 0;
1051    };
1052 
1053    for (unsigned i = 0; i < count; ++i) {
1054       agx_ppp_push(&ppp, REGION_CLIP, cfg) {
1055          cfg.enable = true;
1056          cfg.min_x = minx[i] / 32;
1057          cfg.min_y = miny[i] / 32;
1058          cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
1059          cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
1060       }
1061    }
1062 
1063    agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
1064       ;
1065 
1066    /* Upload viewports */
1067    for (unsigned i = 0; i < count; ++i) {
1068       agx_ppp_push(&ppp, VIEWPORT, cfg) {
1069          cfg.translate_x = vp[i].translate[0];
1070          cfg.translate_y = vp[i].translate[1];
1071          cfg.translate_z = vp[i].translate[2];
1072          cfg.scale_x = vp[i].scale[0];
1073          cfg.scale_y = vp[i].scale[1];
1074          cfg.scale_z = vp[i].scale[2];
1075 
1076          if (!clip_halfz) {
1077             cfg.translate_z -= cfg.scale_z;
1078             cfg.scale_z *= 2;
1079          }
1080       }
1081    }
1082 
1083    agx_ppp_fini(out, &ppp);
1084 }
1085 
1086 static void
agx_upload_depth_bias(struct agx_batch * batch,const struct pipe_rasterizer_state * rast)1087 agx_upload_depth_bias(struct agx_batch *batch,
1088                       const struct pipe_rasterizer_state *rast)
1089 {
1090    void *ptr =
1091       util_dynarray_grow_bytes(&batch->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
1092 
1093    agx_pack(ptr, DEPTH_BIAS, cfg) {
1094       cfg.depth_bias = rast->offset_units * 2.0f;
1095       cfg.slope_scale = rast->offset_scale;
1096       cfg.clamp = rast->offset_clamp;
1097    }
1098 }
1099 
1100 /* A framebuffer state can be reused across batches, so it doesn't make sense
1101  * to add surfaces to the BO list here. Instead we added them when flushing.
1102  */
1103 
1104 static void
agx_set_framebuffer_state(struct pipe_context * pctx,const struct pipe_framebuffer_state * state)1105 agx_set_framebuffer_state(struct pipe_context *pctx,
1106                           const struct pipe_framebuffer_state *state)
1107 {
1108    struct agx_context *ctx = agx_context(pctx);
1109 
1110    if (!state)
1111       return;
1112 
1113    util_copy_framebuffer_state(&ctx->framebuffer, state);
1114    ctx->batch = NULL;
1115    agx_dirty_all(ctx);
1116 }
1117 
1118 /*
1119  * To write out render targets, each render target surface is bound as a
1120  * writable shader image, written with the end-of-tile program. This helper
1121  * constructs the internal pipe_image_view used.
1122  */
1123 static struct pipe_image_view
image_view_for_surface(struct pipe_surface * surf)1124 image_view_for_surface(struct pipe_surface *surf)
1125 {
1126    return (struct pipe_image_view){
1127       .resource = surf->texture,
1128       .format = surf->format,
1129       .access = PIPE_IMAGE_ACCESS_READ_WRITE,
1130       .shader_access = PIPE_IMAGE_ACCESS_READ_WRITE,
1131       .u.tex.single_layer_view =
1132          surf->u.tex.first_layer == surf->u.tex.last_layer,
1133       .u.tex.first_layer = surf->u.tex.first_layer,
1134       .u.tex.last_layer = surf->u.tex.last_layer,
1135       .u.tex.level = surf->u.tex.level,
1136    };
1137 }
1138 
1139 /* Similarly, to read render targets, surfaces are bound as textures */
1140 static struct pipe_sampler_view
sampler_view_for_surface(struct pipe_surface * surf)1141 sampler_view_for_surface(struct pipe_surface *surf)
1142 {
1143    bool layered = surf->u.tex.last_layer > surf->u.tex.first_layer;
1144 
1145    return (struct pipe_sampler_view){
1146       /* To reduce shader variants, we always use a 2D texture. For reloads of
1147        * arrays and cube maps, we map a single layer as a 2D image.
1148        */
1149       .target = layered ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D,
1150       .swizzle_r = PIPE_SWIZZLE_X,
1151       .swizzle_g = PIPE_SWIZZLE_Y,
1152       .swizzle_b = PIPE_SWIZZLE_Z,
1153       .swizzle_a = PIPE_SWIZZLE_W,
1154       .u.tex =
1155          {
1156             .first_layer = surf->u.tex.first_layer,
1157             .last_layer = surf->u.tex.last_layer,
1158             .first_level = surf->u.tex.level,
1159             .last_level = surf->u.tex.level,
1160          },
1161    };
1162 }
1163 
1164 static bool
target_is_array(enum pipe_texture_target target)1165 target_is_array(enum pipe_texture_target target)
1166 {
1167    switch (target) {
1168    case PIPE_TEXTURE_3D:
1169    case PIPE_TEXTURE_CUBE:
1170    case PIPE_TEXTURE_1D_ARRAY:
1171    case PIPE_TEXTURE_2D_ARRAY:
1172    case PIPE_TEXTURE_CUBE_ARRAY:
1173       return true;
1174    default:
1175       return false;
1176    }
1177 }
1178 
1179 static void
agx_batch_upload_pbe(struct agx_batch * batch,struct agx_pbe_packed * out,struct pipe_image_view * view,bool block_access,bool arrays_as_2d,bool force_2d_array,bool emrt)1180 agx_batch_upload_pbe(struct agx_batch *batch, struct agx_pbe_packed *out,
1181                      struct pipe_image_view *view, bool block_access,
1182                      bool arrays_as_2d, bool force_2d_array, bool emrt)
1183 {
1184    struct agx_resource *tex = agx_resource(view->resource);
1185    const struct util_format_description *desc =
1186       util_format_description(view->format);
1187    enum pipe_texture_target target = tex->base.target;
1188    bool is_buffer = (target == PIPE_BUFFER);
1189 
1190    if (!is_buffer && view->u.tex.single_layer_view)
1191       target = PIPE_TEXTURE_2D;
1192 
1193    arrays_as_2d |= (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
1194 
1195    /* To reduce shader variants, spilled layered render targets are accessed as
1196     * 2D Arrays regardless of the actual target, so force in that case.
1197     *
1198     * Likewise, cubes are accessed as arrays for consistency with NIR.
1199     */
1200    if ((arrays_as_2d && target_is_array(target)) || target_is_cube(target) ||
1201        force_2d_array)
1202       target = PIPE_TEXTURE_2D_ARRAY;
1203 
1204    unsigned level = is_buffer ? 0 : view->u.tex.level;
1205    unsigned layer = is_buffer ? 0 : view->u.tex.first_layer;
1206 
1207    agx_pack(out, PBE, cfg) {
1208       cfg.dimension =
1209          agx_translate_tex_dim(target, util_res_sample_count(&tex->base));
1210       cfg.layout = agx_translate_layout(tex->layout.tiling);
1211       cfg.channels = ail_pixel_format[view->format].channels;
1212       cfg.type = ail_pixel_format[view->format].type;
1213       cfg.srgb = util_format_is_srgb(view->format);
1214 
1215       assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
1216 
1217       for (unsigned i = 0; i < desc->nr_channels; ++i) {
1218          if (desc->swizzle[i] == 0)
1219             cfg.swizzle_r = i;
1220          else if (desc->swizzle[i] == 1)
1221             cfg.swizzle_g = i;
1222          else if (desc->swizzle[i] == 2)
1223             cfg.swizzle_b = i;
1224          else if (desc->swizzle[i] == 3)
1225             cfg.swizzle_a = i;
1226       }
1227 
1228       cfg.buffer = agx_map_texture_gpu(tex, layer);
1229       cfg.unk_mipmapped = tex->mipmapped;
1230 
1231       if (is_buffer) {
1232          unsigned size_el =
1233             agx_texture_buffer_size_el(view->format, view->u.buf.size);
1234 
1235          /* Buffers uniquely have offsets (in bytes, not texels) */
1236          cfg.buffer += view->u.buf.offset;
1237 
1238          /* Use a 2D texture to increase the maximum size */
1239          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1240          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
1241          cfg.level = 0;
1242          cfg.stride = (cfg.width * util_format_get_blocksize(view->format)) - 4;
1243          cfg.layers = 1;
1244          cfg.levels = 1;
1245       } else if (util_res_sample_count(&tex->base) > 1 && !block_access) {
1246          /* Multisampled images are bound like buffer textures, with
1247           * addressing arithmetic to determine the texel to write.
1248           *
1249           * Note that the end-of-tile program uses real multisample images with
1250           * image_write_block instructions.
1251           */
1252          unsigned blocksize_B = util_format_get_blocksize(view->format);
1253          unsigned size_px =
1254             (tex->layout.size_B - tex->layout.layer_stride_B * layer) /
1255             blocksize_B;
1256 
1257          cfg.dimension = AGX_TEXTURE_DIMENSION_2D;
1258          cfg.layout = AGX_LAYOUT_LINEAR;
1259          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1260          cfg.height = DIV_ROUND_UP(size_px, cfg.width);
1261          cfg.stride = (cfg.width * blocksize_B) - 4;
1262          cfg.layers = 1;
1263          cfg.levels = 1;
1264 
1265          cfg.buffer += tex->layout.level_offsets_B[level];
1266          cfg.level = 0;
1267       } else {
1268          cfg.width = view->resource->width0;
1269          cfg.height = view->resource->height0;
1270          cfg.level = level;
1271 
1272          unsigned layers = view->u.tex.last_layer - layer + 1;
1273 
1274          if (tex->layout.tiling == AIL_TILING_LINEAR &&
1275              (target == PIPE_TEXTURE_1D_ARRAY ||
1276               target == PIPE_TEXTURE_2D_ARRAY)) {
1277 
1278             cfg.depth_linear = layers;
1279             cfg.layer_stride_linear = (tex->layout.layer_stride_B - 0x80);
1280             cfg.extended = true;
1281          } else {
1282             assert((tex->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
1283             cfg.layers = layers;
1284          }
1285 
1286          if (tex->layout.tiling == AIL_TILING_LINEAR) {
1287             cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
1288             cfg.levels = 1;
1289          } else {
1290             cfg.page_aligned_layers = tex->layout.page_aligned_layers;
1291             cfg.levels = tex->base.last_level + 1;
1292          }
1293 
1294          if (tex->base.nr_samples > 1)
1295             cfg.samples = agx_translate_sample_count(tex->base.nr_samples);
1296       }
1297 
1298       if (ail_is_compressed(&tex->layout) && !emrt) {
1299          cfg.compressed_1 = true;
1300          cfg.extended = true;
1301 
1302          cfg.acceleration_buffer =
1303             agx_map_texture_gpu(tex, 0) + tex->layout.metadata_offset_B +
1304             (layer * tex->layout.compression_layer_stride_B);
1305       }
1306 
1307       /* When the descriptor isn't extended architecturally, we can use the last
1308        * 8 bytes as a sideband. We use it to provide metadata for image atomics.
1309        */
1310       if (!cfg.extended && (tex->layout.writeable_image || emrt) &&
1311           tex->base.target != PIPE_BUFFER) {
1312 
1313          if (util_res_sample_count(&tex->base) > 1) {
1314             cfg.aligned_width_msaa_sw =
1315                align(u_minify(view->resource->width0, level),
1316                      tex->layout.tilesize_el[level].width_el);
1317          } else {
1318             cfg.level_offset_sw =
1319                ail_get_level_offset_B(&tex->layout, cfg.level);
1320          }
1321 
1322          cfg.sample_count_log2_sw = util_logbase2(tex->base.nr_samples);
1323 
1324          if (tex->layout.tiling == AIL_TILING_TWIDDLED || emrt) {
1325             struct ail_tile tile_size = tex->layout.tilesize_el[level];
1326             cfg.tile_width_sw = tile_size.width_el;
1327             cfg.tile_height_sw = tile_size.height_el;
1328 
1329             cfg.layer_stride_sw = tex->layout.layer_stride_B;
1330          }
1331       }
1332    };
1333 }
1334 
1335 /* Likewise constant buffers, textures, and samplers are handled in a common
1336  * per-draw path, with dirty tracking to reduce the costs involved.
1337  */
1338 
1339 static void
agx_set_constant_buffer(struct pipe_context * pctx,enum pipe_shader_type shader,uint index,bool take_ownership,const struct pipe_constant_buffer * cb)1340 agx_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader,
1341                         uint index, bool take_ownership,
1342                         const struct pipe_constant_buffer *cb)
1343 {
1344    struct agx_context *ctx = agx_context(pctx);
1345    struct agx_stage *s = &ctx->stage[shader];
1346    struct pipe_constant_buffer *constants = &s->cb[index];
1347 
1348    util_copy_constant_buffer(&s->cb[index], cb, take_ownership);
1349 
1350    /* Upload user buffer immediately */
1351    if (constants->user_buffer && !constants->buffer) {
1352       u_upload_data(ctx->base.const_uploader, 0, constants->buffer_size, 64,
1353                     constants->user_buffer, &constants->buffer_offset,
1354                     &constants->buffer);
1355    }
1356 
1357    unsigned mask = (1 << index);
1358 
1359    if (cb)
1360       s->cb_mask |= mask;
1361    else
1362       s->cb_mask &= ~mask;
1363 
1364    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_CONST;
1365 }
1366 
1367 static void
agx_surface_destroy(struct pipe_context * ctx,struct pipe_surface * surface)1368 agx_surface_destroy(struct pipe_context *ctx, struct pipe_surface *surface)
1369 {
1370    pipe_resource_reference(&surface->texture, NULL);
1371    FREE(surface);
1372 }
1373 
1374 static void
agx_delete_state(struct pipe_context * ctx,void * state)1375 agx_delete_state(struct pipe_context *ctx, void *state)
1376 {
1377    FREE(state);
1378 }
1379 
1380 /* BOs added to the batch in the uniform upload path */
1381 
1382 static void
agx_set_vertex_buffers(struct pipe_context * pctx,unsigned count,const struct pipe_vertex_buffer * buffers)1383 agx_set_vertex_buffers(struct pipe_context *pctx, unsigned count,
1384                        const struct pipe_vertex_buffer *buffers)
1385 {
1386    struct agx_context *ctx = agx_context(pctx);
1387 
1388    util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers,
1389                                 count, true);
1390 
1391    ctx->dirty |= AGX_DIRTY_VERTEX;
1392 }
1393 
1394 static void *
agx_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)1395 agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
1396                            const struct pipe_vertex_element *state)
1397 {
1398    assert(count <= AGX_MAX_ATTRIBS);
1399 
1400    struct agx_vertex_elements *so = calloc(1, sizeof(*so));
1401 
1402    for (unsigned i = 0; i < count; ++i) {
1403       const struct pipe_vertex_element ve = state[i];
1404 
1405       const struct util_format_description *desc =
1406          util_format_description(ve.src_format);
1407       unsigned chan_size = desc->channel[0].size / 8;
1408       assert((ve.src_offset & (chan_size - 1)) == 0);
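      /* For example, PIPE_FORMAT_R32G32B32_FLOAT has 32-bit channels, so
       * chan_size is 4 and src_offset must be 4-byte aligned; the mask test
       * above relies on chan_size being a power of two.
       */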
1409 
1410       so->buffers[i] = ve.vertex_buffer_index;
1411       so->src_offsets[i] = ve.src_offset;
1412 
1413       so->key[i] = (struct agx_velem_key){
1414          .stride = ve.src_stride,
1415          .format = ve.src_format,
1416          .divisor = ve.instance_divisor,
1417          .instanced = ve.instance_divisor > 0,
1418       };
1419    }
1420 
1421    return so;
1422 }
1423 
1424 static void
1425 agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
1426 {
1427    struct agx_context *ctx = agx_context(pctx);
1428    ctx->attributes = cso;
1429    ctx->dirty |= AGX_DIRTY_VERTEX;
1430 }
1431 
1432 DERIVE_HASH_TABLE(asahi_vs_shader_key);
1433 DERIVE_HASH_TABLE(asahi_gs_shader_key);
1434 DERIVE_HASH_TABLE(asahi_fs_shader_key);
1435 DERIVE_HASH_TABLE(agx_fast_link_key);
1436 
1437 /* No compute variants */
1438 static uint32_t
1439 asahi_cs_shader_key_hash(const void *key)
1440 {
1441    return 0;
1442 }
1443 
1444 static bool
1445 asahi_cs_shader_key_equal(const void *a, const void *b)
1446 {
1447    return true;
1448 }
1449 
1450 /* Lowered-I/O version of nir_lower_clip_halfz, with the coefficient supplied dynamically via a uniform */
1451 static bool
1452 agx_nir_lower_clip_m1_1(nir_builder *b, nir_intrinsic_instr *intr,
1453                         UNUSED void *data)
1454 {
1455    if (intr->intrinsic != nir_intrinsic_store_output)
1456       return false;
1457    if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_POS)
1458       return false;
1459 
1460    assert(nir_intrinsic_component(intr) == 0 && "not yet scalarized");
1461    b->cursor = nir_before_instr(&intr->instr);
1462 
1463    nir_def *pos = intr->src[0].ssa;
1464    nir_def *z = nir_channel(b, pos, 2);
1465    nir_def *w = nir_channel(b, pos, 3);
1466    nir_def *c = nir_load_clip_z_coeff_agx(b);
1467 
1468    /* Lerp. If c = 0, reduces to z. If c = 1/2, reduces to (z + w)/2 */
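   /* Expanding the ffma form below (c is the clip-z coefficient loaded above):
    *    new_z = fma(-z, c, fma(w, c, z)) = z + c*(w - z) = (1 - c)*z + c*w
    * so c = 0 leaves z untouched and c = 1/2 yields (z + w)/2.
    */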
1469    nir_def *new_z = nir_ffma(b, nir_fneg(b, z), c, nir_ffma(b, w, c, z));
1470    nir_src_rewrite(&intr->src[0], nir_vector_insert_imm(b, pos, new_z, 2));
1471    return true;
1472 }
1473 
1474 /*
1475  * To implement point sprites, we'll replace TEX0...7 with point coordinate
1476  * reads as required. However, the .zw needs to read back 0.0/1.0. This pass
1477  * fixes up TEX loads of Z and W according to a uniform passed in a sideband,
1478  * eliminating shader variants.
1479  */
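/* Illustrative example of the fixup below: with a sideband mask of 0x5, loads
 * of TEX0 and TEX2 get .z forced to 0.0 and .w to 1.0, while .xy and every
 * other TEXn slot pass through unchanged.
 */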
1480 static bool
1481 agx_nir_lower_point_sprite_zw(nir_builder *b, nir_intrinsic_instr *intr,
1482                               UNUSED void *data)
1483 {
1484    if (intr->intrinsic != nir_intrinsic_load_input &&
1485        intr->intrinsic != nir_intrinsic_load_interpolated_input)
1486       return false;
1487 
1488    gl_varying_slot loc = nir_intrinsic_io_semantics(intr).location;
1489    if (!(loc >= VARYING_SLOT_TEX0 && loc <= VARYING_SLOT_TEX7))
1490       return false;
1491 
1492    b->cursor = nir_after_instr(&intr->instr);
1493    unsigned component = nir_intrinsic_component(intr);
1494 
1495    nir_def *mask = nir_load_tex_sprite_mask_agx(b);
1496    nir_def *location = nir_iadd_imm(b, nir_get_io_offset_src(intr)->ssa,
1497                                     loc - VARYING_SLOT_TEX0);
1498    nir_def *bit = nir_ishl(b, nir_imm_intN_t(b, 1, 16), location);
1499    nir_def *replace = nir_i2b(b, nir_iand(b, mask, bit));
1500 
1501    nir_def *vec = nir_pad_vec4(b, &intr->def);
1502    nir_def *chans[4] = {NULL, NULL, nir_imm_floatN_t(b, 0.0, vec->bit_size),
1503                         nir_imm_floatN_t(b, 1.0, vec->bit_size)};
1504 
1505    for (unsigned i = 0; i < 4; ++i) {
1506       nir_def *chan = nir_channel_or_undef(b, vec, i - component);
1507       chans[i] = chans[i] ? nir_bcsel(b, replace, chans[i], chan) : chan;
1508    }
1509 
1510    nir_def *new_vec = nir_vec(b, &chans[component], intr->def.num_components);
1511    nir_def_rewrite_uses_after(&intr->def, new_vec, new_vec->parent_instr);
1512    return true;
1513 }
1514 
1515 /*
1516  * Compile a NIR shader. The only lowering left at this point is sysvals. The
1517  * shader key should have already been applied. agx_compile_variant may call
1518  * this multiple times if there are auxiliary shaders.
1519  */
1520 static struct agx_compiled_shader *
1521 agx_compile_nir(struct agx_device *dev, nir_shader *nir,
1522                 struct util_debug_callback *debug, enum pipe_shader_type stage,
1523                 bool internal_kernel, bool terminal, bool secondary,
1524                 unsigned cf_base, BITSET_WORD *attrib_components_read)
1525 {
1526    struct agx_compiled_shader *compiled = CALLOC_STRUCT(agx_compiled_shader);
1527    compiled->stage = stage;
1528    if (attrib_components_read)
1529       BITSET_COPY(compiled->attrib_components_read, attrib_components_read);
1530 
1531    struct agx_shader_key key = {
1532       .dev = agx_gather_device_key(dev),
1533       .libagx = dev->libagx,
1534       .has_scratch = !secondary,
1535       .promote_constants = true,
1536       .no_stop = !terminal,
1537       .secondary = secondary,
1538    };
1539 
1540    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1541       NIR_PASS(_, nir, agx_nir_lower_interpolation);
1542    }
1543 
1544    /* We always use dynamic sample shading in the GL driver. Indicate that. */
1545    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1546        nir->info.fs.uses_sample_shading)
1547       key.fs.inside_sample_loop = true;
1548 
1549    if (internal_kernel) {
1550       key.reserved_preamble = 8;
1551    } else if (!secondary) {
1552       NIR_PASS(_, nir, agx_nir_lower_sysvals, stage, true);
1553       NIR_PASS(_, nir, agx_nir_layout_uniforms, compiled,
1554                &key.reserved_preamble);
1555    }
1556 
1557    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1558       key.fs.cf_base = cf_base;
1559    }
1560 
1561    agx_compile_shader_nir(nir, &key, debug, &compiled->b);
1562 
1563    if (compiled->b.info.binary_size && !secondary) {
1564       compiled->bo = agx_bo_create(dev, compiled->b.info.binary_size, 0,
1565                                    AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
1566 
1567       memcpy(agx_bo_map(compiled->bo), compiled->b.binary,
1568              compiled->b.info.binary_size);
1569    }
1570 
1571    return compiled;
1572 }
1573 
1574 static struct agx_compiled_shader *
1575 agx_build_meta_shader_internal(struct agx_context *ctx,
1576                                meta_shader_builder_t builder, void *data,
1577                                size_t data_size, bool prolog, bool epilog,
1578                                unsigned cf_base, bool internal_kernel);
1579 
1580 /* Does not take ownership of key. Clones if necessary. */
1581 static struct agx_compiled_shader *
1582 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
1583                     struct agx_uncompiled_shader *so,
1584                     struct util_debug_callback *debug,
1585                     union asahi_shader_key *key_)
1586 {
1587    struct blob_reader reader;
1588    blob_reader_init(&reader, so->serialized_nir.data, so->serialized_nir.size);
1589    nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
1590 
1591    /* Auxiliary programs */
1592    enum mesa_prim gs_out_prim = MESA_PRIM_MAX;
1593    uint64_t outputs = 0;
1594    struct agx_fs_epilog_link_info epilog_key = {false};
1595    unsigned gs_out_count_words = 0;
1596    nir_shader *gs_count = NULL;
1597    nir_shader *gs_copy = NULL;
1598    nir_shader *pre_gs = NULL;
1599    BITSET_DECLARE(attrib_components_read, VERT_ATTRIB_MAX * 4) = {0};
1600 
1601    /* This can happen at inopportune times and cause jank, so log it */
1602    perf_debug(dev, "Compiling %s shader variant #%u",
1603               _mesa_shader_stage_to_abbrev(so->type),
1604               _mesa_hash_table_num_entries(so->variants));
1605 
1606    struct agx_unlinked_uvs_layout uvs = {0};
1607    bool translucent = false;
1608 
1609    if (nir->info.stage == MESA_SHADER_VERTEX) {
1610       struct asahi_vs_shader_key *key = &key_->vs;
1611 
1612       if (nir->info.vs.tes_agx) {
1613          NIR_PASS(_, nir, agx_nir_lower_tes, dev->libagx, key->hw);
1614       } else {
1615          NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog,
1616                   attrib_components_read);
1617       }
1618 
1619       if (key->hw) {
1620          NIR_PASS(_, nir, agx_nir_lower_point_size, true);
1621          NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1622                   nir_metadata_control_flow, NULL);
1623 
1624          NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1625                   NULL);
1626          NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
1627          NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
1628       } else {
1629          NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx);
1630 
1631          /* Turn into a compute shader now that we're free of vertexisms */
1632          nir->info.stage = MESA_SHADER_COMPUTE;
1633          memset(&nir->info.cs, 0, sizeof(nir->info.cs));
1634          nir->xfb_info = NULL;
1635          outputs = nir->info.outputs_written;
1636       }
1637    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1638       NIR_PASS_V(nir, agx_nir_lower_tcs, dev->libagx);
1639    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1640       struct asahi_gs_shader_key *key = &key_->gs;
1641 
1642       NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, key->rasterizer_discard,
1643                &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words);
1644    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1645       struct asahi_fs_shader_key *key = &key_->fs;
1646 
1647       /* Discards must be lowered before lowering MSAA so that discards are handled correctly */
1648       NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
1649       NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, &epilog_key);
1650 
1651       if (nir->info.fs.uses_fbfetch_output) {
1652          struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
1653             key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples,
1654             true);
1655 
1656          if (dev->debug & AGX_DBG_SMALLTILE)
1657             tib.tile_size = (struct agx_tile_size){16, 16};
1658 
1659          /* XXX: don't replicate this all over the driver */
1660          unsigned rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1661                                   (2 * BITSET_LAST_BIT(nir->info.images_used));
1662          unsigned rt_spill = rt_spill_base;
1663          NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, NULL, &rt_spill, NULL,
1664                   &translucent);
1665       }
1666 
1667       if (nir->info.fs.uses_sample_shading) {
1668          /* Ensure the sample ID is preserved in register */
1669          nir_builder b =
1670             nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
1671          nir_export_agx(
1672             &b,
1673             nir_load_exported_agx(&b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK),
1674             .base = AGX_ABI_FOUT_SAMPLE_MASK);
1675 
1676          NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
1677       }
1678 
1679       NIR_PASS(_, nir, agx_nir_lower_sample_mask);
1680       NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
1681    }
1682 
1683    NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);
1684 
1685    struct agx_compiled_shader *compiled = agx_compile_nir(
1686       dev, nir, debug, so->type, false, so->type != PIPE_SHADER_FRAGMENT, false,
1687       0, attrib_components_read);
1688 
1689    if (so->type == PIPE_SHADER_FRAGMENT) {
1690       /* XXX: don't replicate this all over the driver */
1691       epilog_key.rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1692                                  (2 * BITSET_LAST_BIT(nir->info.images_used));
1693 
1694       compiled->epilog_key = epilog_key;
1695       compiled->b.info.reads_tib |= translucent;
1696    }
1697 
1698    compiled->so = so;
1699    compiled->uvs = uvs;
1700 
1701    /* Compile auxiliary programs */
1702    if (gs_count) {
1703       compiled->gs_count = agx_compile_nir(dev, gs_count, debug, so->type,
1704                                            false, true, false, 0, NULL);
1705       compiled->gs_count->so = so;
1706    }
1707 
1708    if (pre_gs) {
1709       compiled->pre_gs = agx_compile_nir(
1710          dev, pre_gs, debug, PIPE_SHADER_COMPUTE, false, true, false, 0, NULL);
1711    }
1712 
1713    if (gs_copy) {
1714       /* Replace the point size write if present, but do not insert a write:
1715        * the GS rast program writes point size iff we have points.
1716        */
1717       NIR_PASS(_, gs_copy, agx_nir_lower_point_size, false);
1718 
1719       NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1720                nir_metadata_control_flow, NULL);
1721 
1722       NIR_PASS(_, gs_copy, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1723                NULL);
1724       NIR_PASS(_, gs_copy, agx_nir_lower_cull_distance_vs);
1725 
1726       struct agx_unlinked_uvs_layout uvs = {0};
1727       NIR_PASS(_, gs_copy, agx_nir_lower_uvs, &uvs);
1728 
1729       compiled->gs_copy =
1730          agx_compile_nir(dev, gs_copy, debug, PIPE_SHADER_GEOMETRY, false, true,
1731                          false, 0, NULL);
1732       compiled->gs_copy->so = so;
1733       compiled->gs_copy->stage = so->type;
1734       compiled->gs_copy->uvs = uvs;
1735    }
1736 
1737    compiled->gs_output_mode = gs_out_prim;
1738    compiled->gs_count_words = gs_out_count_words;
1739    compiled->b.info.outputs = outputs;
1740 
1741    ralloc_free(nir);
1742    ralloc_free(pre_gs);
1743    ralloc_free(gs_count);
1744    return compiled;
1745 }
1746 
1747 static struct agx_compiled_shader *
1748 agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
1749                        struct agx_uncompiled_shader *so,
1750                        struct util_debug_callback *debug,
1751                        union asahi_shader_key *key)
1752 {
1753    struct agx_compiled_shader *compiled =
1754       agx_disk_cache_retrieve(screen, so, key);
1755 
1756    if (!compiled) {
1757       compiled = agx_compile_variant(&screen->dev, pctx, so, debug, key);
1758       agx_disk_cache_store(screen->disk_cache, so, key, compiled);
1759    }
1760 
1761    /* key may be destroyed after we return, so clone it before using it as a
1762     * hash table key. The clone is logically owned by the hash table.
1763     */
1764    union asahi_shader_key *cloned_key =
1765       rzalloc(so->variants, union asahi_shader_key);
1766 
1767    if (so->type == PIPE_SHADER_FRAGMENT) {
1768       memcpy(cloned_key, key, sizeof(struct asahi_fs_shader_key));
1769    } else if (so->type == PIPE_SHADER_VERTEX ||
1770               so->type == PIPE_SHADER_TESS_EVAL) {
1771       memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
1772    } else if (so->type == PIPE_SHADER_GEOMETRY) {
1773       memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
1774    } else {
1775       assert(gl_shader_stage_is_compute(so->type) ||
1776              so->type == PIPE_SHADER_TESS_CTRL);
1777       /* No key */
1778    }
1779 
1780    _mesa_hash_table_insert(so->variants, cloned_key, compiled);
1781 
1782    return compiled;
1783 }
1784 
1785 static int
1786 glsl_type_size(const struct glsl_type *type, bool bindless)
1787 {
1788    return glsl_count_attribute_slots(type, false);
1789 }
1790 
1791 static bool
1792 should_lower_robustness(const nir_intrinsic_instr *intr, const void *data)
1793 {
1794    const bool *gl_robust = data;
1795 
1796    switch (intr->intrinsic) {
1797    /* The texture/PBE hardware is robust, but our buffer image implementation
1798     * is not. Lower robustness only for buffer images.
1799     */
1800    case nir_intrinsic_image_load:
1801    case nir_intrinsic_image_store:
1802       return nir_intrinsic_image_dim(intr) == GLSL_SAMPLER_DIM_BUF;
1803 
1804    /* Image atomics are lowered to raw memory access */
1805    case nir_intrinsic_image_atomic:
1806    case nir_intrinsic_image_atomic_swap:
1807       return true;
1808 
1809    /* UBOs/SSBOs are lowered to raw pointers */
1810    case nir_intrinsic_load_ubo:
1811    case nir_intrinsic_load_ssbo:
1812    case nir_intrinsic_store_ssbo:
1813    case nir_intrinsic_ssbo_atomic:
1814    case nir_intrinsic_ssbo_atomic_swap:
1815       return *gl_robust;
1816 
1817    default:
1818       return false;
1819    }
1820 }
1821 
1822 static void
1823 agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
1824                       nir_shader *nir, bool support_lod_bias, bool robust)
1825 {
1826    if (nir->info.stage == MESA_SHADER_KERNEL)
1827       nir->info.stage = MESA_SHADER_COMPUTE;
1828 
1829    blob_init(&so->early_serialized_nir);
1830    nir_serialize(&so->early_serialized_nir, nir, true);
1831 
1832    /* We need to lower robustness before bindings, since robustness lowering
1833     * affects the bindings used.
1834     */
1835    NIR_PASS(_, nir, nir_lower_robust_access, should_lower_robustness, &robust);
1836 
1837    /* Similarly, we need to do early texture lowering before bindings */
1838    NIR_PASS(_, nir, agx_nir_lower_texture_early, support_lod_bias);
1839 
1840    /* We need to lower binding tables before calling agx_preprocess_nir, since
1841     * that does texture lowering that needs to know the binding model.
1842     */
1843    NIR_PASS(_, nir, agx_nir_lower_bindings, &so->uses_bindless_samplers);
1844 
1845    /* We need to do some I/O lowering before lowering textures */
1846    so->info.nr_bindful_textures = BITSET_LAST_BIT(nir->info.textures_used);
1847    so->info.nr_bindful_images = BITSET_LAST_BIT(nir->info.images_used);
1848 
1849    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1850             glsl_type_size,
1851             nir_lower_io_lower_64bit_to_32 |
1852                nir_lower_io_use_interpolated_input_intrinsics);
1853 
1854    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1855       struct agx_interp_info interp = agx_gather_interp_info(nir);
1856 
1857       /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
1858        * exception, interpolate flat shaded at fp32. This works around a
1859        * hardware limitation. The resulting code (with an extra f2f16 at the end
1860        * if needed) matches what Metal produces.
1861        */
1862       if (likely(!(dev->debug & AGX_DBG_NO16))) {
1863          uint64_t texcoord = agx_gather_texcoords(nir);
1864 
1865          NIR_PASS(_, nir, nir_lower_mediump_io,
1866                   nir_var_shader_in | nir_var_shader_out,
1867                   ~(interp.flat | texcoord), false);
1868       }
1869 
1870       so->info.inputs_flat_shaded = interp.flat;
1871       so->info.inputs_linear_shaded = interp.linear;
1872       so->info.uses_fbfetch = nir->info.fs.uses_fbfetch_output;
1873    } else if (nir->info.stage == MESA_SHADER_VERTEX ||
1874               nir->info.stage == MESA_SHADER_TESS_EVAL) {
1875       so->info.has_edgeflags = nir->info.outputs_written & VARYING_BIT_EDGE;
1876       so->info.cull_distance_size = nir->info.cull_distance_array_size;
1877    }
1878 
1879    /* Shrink and vectorize SSBOs before lowering them, since it is harder to
1880     * optimize the lowered code.
1881     */
1882    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
1883    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
1884    NIR_PASS(_, nir, agx_nir_cleanup_amul);
1885    NIR_PASS(_, nir, nir_opt_constant_folding);
1886    NIR_PASS(_, nir, nir_copy_prop);
1887    NIR_PASS(_, nir, nir_opt_cse);
1888    NIR_PASS(_, nir, nir_opt_dce);
1889    NIR_PASS(_, nir, nir_opt_shrink_vectors, true);
1890    NIR_PASS(_, nir, nir_copy_prop);
1891 
1892    NIR_PASS(
1893       _, nir, nir_opt_load_store_vectorize,
1894       &(const nir_load_store_vectorize_options){
1895          .modes = nir_var_mem_global | nir_var_mem_constant | nir_var_mem_ssbo,
1896          .callback = agx_mem_vectorize_cb,
1897       });
1898 
1899    NIR_PASS(_, nir, agx_nir_lower_texture);
1900    NIR_PASS(_, nir, nir_lower_ssbo, NULL);
1901 
1902    agx_preprocess_nir(nir, dev->libagx);
1903 
1904    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1905        (nir->info.inputs_read & VARYING_BITS_TEX_ANY)) {
1906 
1907       NIR_PASS(_, nir, nir_shader_intrinsics_pass,
1908                agx_nir_lower_point_sprite_zw, nir_metadata_control_flow, NULL);
1909    }
1910 
1911    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1912       NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, true);
1913    }
1914 
1915    so->type = pipe_shader_type_from_mesa(nir->info.stage);
1916 
1917    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1918       nir->info.stage = MESA_SHADER_VERTEX;
1919       nir->info.vs.tes_agx = true;
1920    }
1921 
1922    blob_init(&so->serialized_nir);
1923    nir_serialize(&so->serialized_nir, nir, true);
1924    _mesa_sha1_compute(so->serialized_nir.data, so->serialized_nir.size,
1925                       so->nir_sha1);
1926 
1927    so->has_xfb_info = (nir->xfb_info != NULL);
1928 
1929    static_assert(
1930       ARRAY_SIZE(so->xfb_strides) == ARRAY_SIZE(nir->info.xfb_stride),
1931       "known target count");
1932 
1933    if (so->has_xfb_info) {
1934       struct nir_xfb_info *xfb = nir->xfb_info;
1935 
1936       for (unsigned i = 0; i < ARRAY_SIZE(so->xfb_strides); ++i) {
1937          so->xfb_strides[i] = xfb->buffers[i].stride;
1938       }
1939    }
1940 }
1941 
1942 static void *
1943 agx_create_shader_state(struct pipe_context *pctx,
1944                         const struct pipe_shader_state *cso)
1945 {
1946    struct agx_context *ctx = agx_context(pctx);
1947    struct agx_uncompiled_shader *so =
1948       rzalloc(NULL, struct agx_uncompiled_shader);
1949    struct agx_device *dev = agx_device(pctx->screen);
1950 
1951    if (!so)
1952       return NULL;
1953 
1954    so->base = *cso;
1955 
1956    nir_shader *nir = cso->type == PIPE_SHADER_IR_NIR
1957                         ? cso->ir.nir
1958                         : tgsi_to_nir(cso->tokens, pctx->screen, false);
1959 
1960    if (nir->info.stage == MESA_SHADER_VERTEX ||
1961        nir->info.stage == MESA_SHADER_TESS_EVAL) {
1962       so->variants = asahi_vs_shader_key_table_create(so);
1963       so->linked_shaders = agx_fast_link_key_table_create(so);
1964    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1965       so->variants = asahi_gs_shader_key_table_create(so);
1966    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1967       /* No variants */
1968       so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
1969                                              asahi_cs_shader_key_equal);
1970    } else {
1971       so->variants = asahi_fs_shader_key_table_create(so);
1972       so->linked_shaders = agx_fast_link_key_table_create(so);
1973    }
1974 
1975    if (nir->info.stage == MESA_SHADER_TESS_EVAL ||
1976        nir->info.stage == MESA_SHADER_TESS_CTRL) {
1977 
1978       so->tess.ccw = nir->info.tess.ccw;
1979       so->tess.point_mode = nir->info.tess.point_mode;
1980       so->tess.spacing = nir->info.tess.spacing;
1981       so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
1982       so->tess.primitive = nir->info.tess._primitive_mode;
1983       so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
1984       so->tess.nr_patch_outputs =
1985          util_last_bit(nir->info.patch_outputs_written);
1986       if (nir->info.stage == MESA_SHADER_TESS_CTRL)
1987          so->tess.output_stride = agx_tcs_output_stride(nir);
1988    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1989       so->gs_mode = nir->info.gs.output_primitive;
1990    }
1991 
1992    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
1993    gl_shader_stage next_stage = nir->info.next_stage;
1994 
1995    /* We're done with the NIR, throw it away */
1996    ralloc_free(nir);
1997    nir = NULL;
1998 
1999    /* Precompile shaders that have a small key. For shader-db, precompile a
2000     * shader with a default key. This could be improved but hopefully this is
2001     * acceptable for now.
2002     */
2003    if ((so->type == PIPE_SHADER_TESS_CTRL) ||
2004        (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) {
2005       union asahi_shader_key key = {0};
2006       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2007                              &key);
2008    } else if (so->type == PIPE_SHADER_VERTEX) {
2009       union asahi_shader_key key = {
2010          .vs.hw = next_stage == MESA_SHADER_FRAGMENT,
2011       };
2012       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2013                              &key);
2014 
2015       if (!next_stage) {
2016          key.vs.hw = true;
2017          agx_get_shader_variant(agx_screen(pctx->screen), pctx, so,
2018                                 &pctx->debug, &key);
2019       }
2020    } else if (dev->debug & AGX_DBG_PRECOMPILE) {
2021       union asahi_shader_key key = {0};
2022 
2023       switch (so->type) {
2024       case PIPE_SHADER_GEOMETRY:
2025          break;
2026 
2027       case PIPE_SHADER_TESS_EVAL:
2028          /* TODO: Tessellation shaders with shader-db */
2029          return so;
2030 
2031       case PIPE_SHADER_FRAGMENT:
2032          key.fs.nr_samples = 1;
2033          break;
2034       default:
2035          unreachable("Unknown shader stage in shader-db precompile");
2036       }
2037 
2038       agx_compile_variant(dev, pctx, so, &pctx->debug, &key);
2039    }
2040 
2041    return so;
2042 }
2043 
2044 static void *
2045 agx_create_compute_state(struct pipe_context *pctx,
2046                          const struct pipe_compute_state *cso)
2047 {
2048    struct agx_context *ctx = agx_context(pctx);
2049    struct agx_device *dev = agx_device(pctx->screen);
2050    struct agx_uncompiled_shader *so =
2051       rzalloc(NULL, struct agx_uncompiled_shader);
2052 
2053    if (!so)
2054       return NULL;
2055 
2056    so->variants = _mesa_hash_table_create(so, asahi_cs_shader_key_hash,
2057                                           asahi_cs_shader_key_equal);
2058 
2059    union asahi_shader_key key = {0};
2060 
2061    assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
2062    nir_shader *nir = (void *)cso->prog;
2063 
2064    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
2065    agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2066                           &key);
2067 
2068    /* We're done with the NIR, throw it away */
2069    ralloc_free(nir);
2070    return so;
2071 }
2072 
2073 static void
2074 agx_get_compute_state_info(struct pipe_context *pctx, void *cso,
2075                            struct pipe_compute_state_object_info *info)
2076 {
2077    union asahi_shader_key key = {0};
2078    struct agx_compiled_shader *so = agx_get_shader_variant(
2079       agx_screen(pctx->screen), pctx, cso, &pctx->debug, &key);
2080 
2081    info->max_threads =
2082       agx_occupancy_for_register_count(so->b.info.nr_gprs).max_threads;
2083    info->private_memory = 0;
2084    info->preferred_simd_size = 32;
2085    info->simd_sizes = 32;
2086 }
2087 
2088 /* Does not take ownership of key. Clones if necessary. */
2089 static bool
2090 agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
2091                   enum pipe_shader_type stage, union asahi_shader_key *key)
2092 {
2093    struct agx_uncompiled_shader *so = ctx->stage[stage].shader;
2094    assert(so != NULL);
2095 
2096    struct hash_entry *he = _mesa_hash_table_search(so->variants, key);
2097 
2098    if (he) {
2099       if ((*out) == he->data)
2100          return false;
2101 
2102       *out = he->data;
2103       return true;
2104    }
2105 
2106    struct agx_screen *screen = agx_screen(ctx->base.screen);
2107    *out = agx_get_shader_variant(screen, &ctx->base, so, &ctx->base.debug, key);
2108    return true;
2109 }
2110 
2111 static enum mesa_prim
2112 rast_prim(enum mesa_prim mode, unsigned fill_mode)
2113 {
2114    if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
2115       if (fill_mode == PIPE_POLYGON_MODE_POINT)
2116          return MESA_PRIM_POINTS;
2117       else if (fill_mode == PIPE_POLYGON_MODE_LINE)
2118          return MESA_PRIM_LINES;
2119    }
2120 
2121    return mode;
2122 }
2123 
2124 static bool
2125 lower_fs_prolog_abi(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2126 {
2127    if (intr->intrinsic == nir_intrinsic_load_polygon_stipple_agx) {
2128       b->cursor = nir_instr_remove(&intr->instr);
2129 
2130       nir_def *root = nir_load_preamble(b, 1, 64, .base = 12);
2131       off_t stipple_offs = offsetof(struct agx_draw_uniforms, polygon_stipple);
2132       nir_def *stipple_ptr_ptr = nir_iadd_imm(b, root, stipple_offs);
2133       nir_def *base = nir_load_global_constant(b, stipple_ptr_ptr, 4, 1, 64);
2134 
2135       nir_def *row = intr->src[0].ssa;
2136       nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));
2137 
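      /* The 32x32 polygon stipple is stored one 32-bit word per row, hence the
       * row * 4 byte offset into the stipple table whose address is read from
       * the draw uniforms above.
       */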
2138       nir_def *pattern = nir_load_global_constant(b, addr, 4, 1, 32);
2139       nir_def_rewrite_uses(&intr->def, pattern);
2140       return true;
2141    } else if (intr->intrinsic == nir_intrinsic_load_stat_query_address_agx) {
2142       b->cursor = nir_instr_remove(&intr->instr);
2143 
2144       /* ABI: root descriptor address in u6_u7 */
2145       nir_def *root = nir_load_preamble(b, 1, intr->def.bit_size, .base = 12);
2146 
2147       off_t offs = offsetof(struct agx_draw_uniforms,
2148                             pipeline_statistics[nir_intrinsic_base(intr)]);
2149 
2150       nir_def *ptr = nir_iadd_imm(b, root, offs);
2151       nir_def *load = nir_load_global_constant(b, ptr, 4, 1, 64);
2152       nir_def_rewrite_uses(&intr->def, load);
2153       return true;
2154    } else {
2155       return false;
2156    }
2157 }
2158 
2159 static void
2160 build_fs_prolog(nir_builder *b, const void *key)
2161 {
2162    agx_nir_fs_prolog(b, key);
2163 
2164    NIR_PASS(_, b->shader, nir_shader_intrinsics_pass, lower_fs_prolog_abi,
2165             nir_metadata_control_flow, NULL);
2166 }
2167 
2168 static struct agx_linked_shader *
2169 asahi_fast_link(struct agx_context *ctx, struct agx_uncompiled_shader *so,
2170                 struct agx_fast_link_key *key)
2171 {
2172    /* Try the cache */
2173    struct hash_entry *ent = _mesa_hash_table_search(so->linked_shaders, key);
2174    if (ent)
2175       return ent->data;
2176 
2177    struct agx_compiled_shader *prolog = NULL, *epilog = NULL;
2178 
2179    /* Build the prolog/epilog now */
2180    if (so->type == MESA_SHADER_FRAGMENT) {
2181       prolog = agx_build_meta_shader_internal(
2182          ctx, build_fs_prolog, &key->prolog.fs, sizeof(key->prolog.fs), true,
2183          false, key->prolog.fs.cf_base, false);
2184 
2185       epilog = agx_build_meta_shader_internal(
2186          ctx, agx_nir_fs_epilog, &key->epilog.fs, sizeof(key->epilog.fs), false,
2187          true, 0, false);
2188 
2189    } else if (so->type == MESA_SHADER_TESS_EVAL) {
2190       /* No prolog/epilog needed */
2191    } else {
2192       assert(so->type == MESA_SHADER_VERTEX);
2193 
2194       prolog = agx_build_meta_shader_internal(
2195          ctx, agx_nir_vs_prolog, &key->prolog.vs, sizeof(key->prolog.vs), true,
2196          false, 0, false);
2197    }
2198 
2199    /* Fast-link it all together */
2200    struct agx_device *dev = agx_device(ctx->base.screen);
2201 
2202    struct agx_linked_shader *linked =
2203       rzalloc(so->linked_shaders, struct agx_linked_shader);
2204    agx_fast_link(linked, dev, so->type == PIPE_SHADER_FRAGMENT, &key->main->b,
2205                  &prolog->b, &epilog->b, key->nr_samples_shaded);
2206 
2207    /* Cache the fast linked program */
2208    union asahi_shader_key *cloned_key =
2209       ralloc_memdup(so->linked_shaders, key, sizeof(*key));
2210    _mesa_hash_table_insert(so->linked_shaders, cloned_key, linked);
2211    return linked;
2212 }
2213 
2214 static bool
2215 agx_update_vs(struct agx_batch *batch, unsigned index_size_B)
2216 {
2217    struct agx_context *ctx = batch->ctx;
2218 
2219    /* Only proceed if the shader or anything the key depends on changes
2220     *
2221     * vb_mask, attributes, vertex_buffers: VERTEX
2222     */
2223    if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB)) ||
2224          ctx->stage[PIPE_SHADER_TESS_EVAL].dirty ||
2225          ctx->stage[PIPE_SHADER_GEOMETRY].dirty ||
2226          ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
2227          ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess))
2228       return false;
2229 
2230    struct asahi_vs_shader_key key = {
2231       .hw = !((ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess) ||
2232               ctx->stage[PIPE_SHADER_GEOMETRY].shader),
2233    };
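   /* Reading the key above: a plain VS->FS draw uses the hardware vertex
    * stage (hw = true); if a geometry shader, or a tessellation evaluation
    * shader outside of the tessellation dispatch, is bound, the VS instead
    * runs as a compute-style prelude (hw = false).
    */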
2234 
2235    agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
2236                      (union asahi_shader_key *)&key);
2237 
2238    struct agx_device *dev = agx_device(ctx->base.screen);
2239    struct agx_fast_link_key link_key = {
2240       .prolog.vs.hw = key.hw,
2241       .prolog.vs.sw_index_size_B = key.hw ? 0 : index_size_B,
2242 
2243       .prolog.vs.robustness.level =
2244          ctx->robust ? AGX_ROBUSTNESS_GL : AGX_ROBUSTNESS_DISABLED,
2245 
2246       .prolog.vs.robustness.soft_fault = agx_has_soft_fault(dev),
2247       .main = ctx->vs,
2248    };
2249 
2250    STATIC_ASSERT(sizeof(link_key.prolog.vs.component_mask) ==
2251                  sizeof(ctx->vs->attrib_components_read));
2252    BITSET_COPY(link_key.prolog.vs.component_mask,
2253                ctx->vs->attrib_components_read);
2254 
2255    memcpy(link_key.prolog.vs.attribs, &ctx->attributes->key,
2256           sizeof(link_key.prolog.vs.attribs));
2257 
2258    void *old = ctx->linked.vs;
2259 
2260    ctx->linked.vs =
2261       asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_VERTEX].shader, &link_key);
2262 
2263    agx_batch_add_bo(batch, ctx->vs->bo);
2264    if (ctx->linked.vs)
2265       agx_batch_add_bo(batch, ctx->linked.vs->bo);
2266 
2267    return old != ctx->linked.vs;
2268 }
2269 
2270 static bool
2271 agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
2272 {
2273    assert(info->mode == MESA_PRIM_PATCHES);
2274 
2275    ctx->tcs = _mesa_hash_table_next_entry(
2276                  ctx->stage[PIPE_SHADER_TESS_CTRL].shader->variants, NULL)
2277                  ->data;
2278    return true;
2279 }
2280 
2281 static bool
2282 agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
2283               const struct pipe_draw_indirect_info *indirect)
2284 {
2285    /* Only proceed if there is a geometry shader. Due to input assembly
2286     * dependence, we don't bother with dirty tracking right now.
2287     */
2288    if (!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
2289       ctx->gs = NULL;
2290       return false;
2291    }
2292 
2293    /* Transform feedback always happens via the geometry shader, so look there
2294     * to get the XFB strides.
2295     */
2296    struct agx_uncompiled_shader *gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
2297 
2298    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2299       struct agx_streamout_target *tgt =
2300          agx_so_target(ctx->streamout.targets[i]);
2301 
2302       if (tgt != NULL)
2303          tgt->stride = gs->xfb_strides[i];
2304    }
2305 
2306    struct asahi_gs_shader_key key = {
2307       .rasterizer_discard = ctx->rast->base.rasterizer_discard,
2308    };
2309 
2310    return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
2311                             (union asahi_shader_key *)&key);
2312 }
2313 
2314 static enum pipe_blendfactor
2315 optimize_blend_factor_w_1(enum pipe_blendfactor f)
2316 {
2317    if (f == PIPE_BLENDFACTOR_SRC_ALPHA)
2318       return PIPE_BLENDFACTOR_ONE;
2319    else if (f == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
2320       return PIPE_BLENDFACTOR_ZERO;
2321    else
2322       return f;
2323 }
2324 
2325 static bool
2326 agx_update_fs(struct agx_batch *batch)
2327 {
2328    struct agx_context *ctx = batch->ctx;
2329 
2330    /* Only proceed if the shader or anything the key depends on changes
2331     *
2332     * batch->key: implicitly dirties everything, no explicit check
2333     * rast: RS
2334     * blend: BLEND
2335     * sample_mask: SAMPLE_MASK
2336     * reduced_prim: PRIM
2337     */
2338    if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG | AGX_DIRTY_RS |
2339                        AGX_DIRTY_BLEND | AGX_DIRTY_SAMPLE_MASK |
2340                        AGX_DIRTY_PRIM | AGX_DIRTY_QUERY)))
2341       return false;
2342 
2343    struct agx_device *dev = agx_device(ctx->base.screen);
2344    unsigned nr_samples = util_framebuffer_get_num_samples(&batch->key);
2345 
2346    /* Get main shader */
2347    struct asahi_fs_shader_key key = {0};
2348 
2349    if (ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.uses_fbfetch) {
2350       key.nr_samples = nr_samples;
2351 
2352       for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
2353          struct pipe_surface *surf = batch->key.cbufs[i];
2354 
2355          key.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2356       }
2357    }
2358 
2359    agx_update_shader(ctx, &ctx->fs, PIPE_SHADER_FRAGMENT,
2360                      (union asahi_shader_key *)&key);
2361 
2362    /* Fast link with prolog/epilog */
2363    bool msaa = ctx->rast->base.multisample;
2364    unsigned sample_mask = ctx->sample_mask & BITFIELD_MASK(nr_samples);
2365 
2366    struct agx_fast_link_key link_key = {
2367       .prolog.fs.statistics =
2368          ctx->pipeline_statistics[PIPE_STAT_QUERY_PS_INVOCATIONS],
2369 
2370       .prolog.fs.cull_distance_size =
2371          ctx->stage[MESA_SHADER_VERTEX].shader->info.cull_distance_size,
2372 
2373       .prolog.fs.polygon_stipple =
2374          ctx->rast->base.poly_stipple_enable &&
2375          rast_prim(batch->reduced_prim, ctx->rast->base.fill_front) ==
2376             MESA_PRIM_TRIANGLES,
2377 
2378       .prolog.fs.api_sample_mask =
2379          (msaa && nr_samples > 1 && sample_mask != BITFIELD_MASK(nr_samples))
2380             ? sample_mask
2381             : 0xff,
2382 
2383       .epilog.fs.nr_samples = nr_samples,
2384       .epilog.fs.link = ctx->fs->epilog_key,
2385       .epilog.fs.force_small_tile = dev->debug & AGX_DBG_SMALLTILE,
2386 
2387       .main = ctx->fs,
2388       .nr_samples_shaded = ctx->fs->epilog_key.sample_shading ? nr_samples : 0,
2389    };
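   /* Example of the api_sample_mask selection above: at 4x MSAA with an API
    * sample mask of 0x5, the prolog sees 0x5 and kills samples 1 and 3; if the
    * mask covers all samples (or MSAA is off), 0xff is passed to mean "no API
    * mask to apply".
    */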
2390 
2391    for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
2392       struct pipe_surface *surf = batch->key.cbufs[i];
2393 
2394       link_key.epilog.fs.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2395       link_key.epilog.fs.remap[i] =
2396          link_key.epilog.fs.link.broadcast_rt0 ? 0 : i;
2397    }
2398 
2399    memcpy(&link_key.epilog.fs.blend, &ctx->blend->key,
2400           sizeof(link_key.epilog.fs.blend));
2401 
2402    /* Normalize */
2403    if (!agx_tilebuffer_spills(&batch->tilebuffer_layout))
2404       link_key.epilog.fs.link.rt_spill_base = 0;
2405 
2406    /* Try to disable blending to get rid of some fsats */
2407    if (link_key.epilog.fs.link.loc0_w_1) {
2408       struct agx_blend_rt_key *k = &link_key.epilog.fs.blend.rt[0];
2409 
2410       k->rgb_src_factor = optimize_blend_factor_w_1(k->rgb_src_factor);
2411       k->rgb_dst_factor = optimize_blend_factor_w_1(k->rgb_dst_factor);
2412 
2413       k->alpha_src_factor = optimize_blend_factor_w_1(k->alpha_src_factor);
2414       k->alpha_dst_factor = optimize_blend_factor_w_1(k->alpha_dst_factor);
2415    }
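   /* For example, if the shader statically writes alpha = 1.0 (loc0_w_1),
    * classic SRC_ALPHA/INV_SRC_ALPHA blending collapses to ONE/ZERO, i.e. a
    * plain overwrite, which lets the saturating blend math be skipped.
    */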
2416 
2417    link_key.epilog.fs.blend.alpha_to_coverage &= msaa;
2418 
2419    /* The main shader must not run tests if the epilog will */
2420    bool epilog_discards = link_key.epilog.fs.blend.alpha_to_coverage;
2421    batch->uniforms.no_epilog_discard = !epilog_discards ? ~0 : 0;
2422 
2423    bool prolog_discards = (link_key.prolog.fs.api_sample_mask != 0xff ||
2424                            link_key.prolog.fs.cull_distance_size ||
2425                            link_key.prolog.fs.polygon_stipple);
2426 
2427    /* The prolog runs tests if neither the main shader nor epilog will */
2428    link_key.prolog.fs.run_zs_tests = !ctx->fs->b.info.writes_sample_mask &&
2429                                      !epilog_discards && prolog_discards;
2430 
2431    if (link_key.prolog.fs.cull_distance_size)
2432       link_key.prolog.fs.cf_base = ctx->fs->b.info.varyings.fs.nr_cf;
2433 
2434    void *old = ctx->linked.fs;
2435 
2436    ctx->linked.fs =
2437       asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_FRAGMENT].shader, &link_key);
2438 
2439    if (ctx->fs->bo)
2440       agx_batch_add_bo(batch, ctx->fs->bo);
2441 
2442    agx_batch_add_bo(batch, ctx->linked.fs->bo);
2443 
2444    return old != ctx->linked.fs;
2445 }
2446 
2447 static void
2448 agx_bind_shader_state(struct pipe_context *pctx, void *cso,
2449                       enum pipe_shader_type stage)
2450 {
2451    struct agx_context *ctx = agx_context(pctx);
2452 
2453    if (stage == PIPE_SHADER_VERTEX)
2454       ctx->dirty |= AGX_DIRTY_VS_PROG;
2455    else if (stage == PIPE_SHADER_FRAGMENT)
2456       ctx->dirty |= AGX_DIRTY_FS_PROG;
2457    else
2458       ctx->stage[stage].dirty = ~0;
2459 
2460    ctx->stage[stage].shader = cso;
2461 }
2462 
2463 static void
2464 agx_bind_vs_state(struct pipe_context *pctx, void *cso)
2465 {
2466    agx_bind_shader_state(pctx, cso, PIPE_SHADER_VERTEX);
2467 }
2468 
2469 static void
2470 agx_bind_fs_state(struct pipe_context *pctx, void *cso)
2471 {
2472    agx_bind_shader_state(pctx, cso, PIPE_SHADER_FRAGMENT);
2473 }
2474 
2475 static void
2476 agx_bind_gs_state(struct pipe_context *pctx, void *cso)
2477 {
2478    agx_bind_shader_state(pctx, cso, PIPE_SHADER_GEOMETRY);
2479 }
2480 
2481 static void
2482 agx_bind_tcs_state(struct pipe_context *pctx, void *cso)
2483 {
2484    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_CTRL);
2485 }
2486 
2487 static void
2488 agx_bind_tes_state(struct pipe_context *pctx, void *cso)
2489 {
2490    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_EVAL);
2491 }
2492 
2493 static void
2494 agx_bind_cs_state(struct pipe_context *pctx, void *cso)
2495 {
2496    agx_bind_shader_state(pctx, cso, PIPE_SHADER_COMPUTE);
2497 }
2498 
2499 /* Forward declare because of the recursion we hit with geometry shaders */
2500 static void agx_delete_uncompiled_shader(struct agx_device *dev,
2501                                          struct agx_uncompiled_shader *so);
2502 
2503 static void
2504 agx_delete_compiled_shader(struct agx_device *dev,
2505                            struct agx_compiled_shader *so)
2506 {
2507    if (so->gs_count)
2508       agx_delete_compiled_shader(dev, so->gs_count);
2509 
2510    if (so->pre_gs)
2511       agx_delete_compiled_shader(dev, so->pre_gs);
2512 
2513    if (so->gs_copy)
2514       agx_delete_compiled_shader(dev, so->gs_copy);
2515 
2516    free(so->b.binary);
2517    agx_bo_unreference(dev, so->bo);
2518    FREE(so);
2519 }
2520 
2521 static void
2522 agx_delete_uncompiled_shader(struct agx_device *dev,
2523                              struct agx_uncompiled_shader *so)
2524 {
2525    hash_table_foreach(so->variants, ent) {
2526       agx_delete_compiled_shader(dev, ent->data);
2527    }
2528 
2529    _mesa_hash_table_destroy(so->variants, NULL);
2530 
2531    if (so->linked_shaders) {
2532       hash_table_foreach(so->linked_shaders, ent) {
2533          struct agx_linked_shader *link = ent->data;
2534          agx_bo_unreference(dev, link->bo);
2535       }
2536 
2537       _mesa_hash_table_destroy(so->linked_shaders, NULL);
2538    }
2539 
2540    blob_finish(&so->serialized_nir);
2541    blob_finish(&so->early_serialized_nir);
2542 
2543    for (unsigned i = 0; i < MESA_PRIM_COUNT; ++i) {
2544       for (unsigned j = 0; j < 3; ++j) {
2545          for (unsigned k = 0; k < 2; ++k) {
2546             if (so->passthrough_progs[i][j][k])
2547                agx_delete_uncompiled_shader(dev,
2548                                             so->passthrough_progs[i][j][k]);
2549          }
2550       }
2551    }
2552 
2553    for (unsigned i = 0; i < ARRAY_SIZE(so->passthrough_tcs); ++i) {
2554       if (so->passthrough_tcs[i])
2555          agx_delete_uncompiled_shader(dev, so->passthrough_tcs[i]);
2556    }
2557 
2558    ralloc_free(so);
2559 }
2560 
2561 static void
2562 agx_delete_shader_state(struct pipe_context *ctx, void *cso)
2563 {
2564    struct agx_device *dev = agx_device(ctx->screen);
2565    agx_delete_uncompiled_shader(dev, cso);
2566 }
2567 
2568 struct agx_generic_meta_key {
2569    meta_shader_builder_t builder;
2570    size_t key_size;
2571    uint8_t key[];
2572 };
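/* Sketch of how this key behaves: the hash and equality callbacks below cover
 * both the builder pointer and the trailing key bytes, so the same builder
 * invoked with different key data yields distinct cached shaders.
 */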
2573 
2574 static uint32_t
2575 meta_key_hash(const void *key_)
2576 {
2577    const struct agx_generic_meta_key *key = key_;
2578 
2579    return _mesa_hash_data(key,
2580                           sizeof(struct agx_generic_meta_key) + key->key_size);
2581 }
2582 
2583 static bool
2584 meta_key_equal(const void *a_, const void *b_)
2585 {
2586    const struct agx_generic_meta_key *a = a_;
2587    const struct agx_generic_meta_key *b = b_;
2588 
2589    return a->builder == b->builder && a->key_size == b->key_size &&
2590           memcmp(a->key, b->key, a->key_size) == 0;
2591 }
2592 
2593 void
2594 agx_init_meta_shaders(struct agx_context *ctx)
2595 {
2596    ctx->generic_meta =
2597       _mesa_hash_table_create(ctx, meta_key_hash, meta_key_equal);
2598 }
2599 
2600 static void
2601 agx_destroy_compute_blitter(struct pipe_context *ctx, struct asahi_blitter *bl)
2602 {
2603    hash_table_foreach(bl->blit_cs, ent) {
2604       ctx->delete_compute_state(ctx, ent->data);
2605    }
2606 
2607    ctx->delete_sampler_state(ctx, bl->sampler[0]);
2608    ctx->delete_sampler_state(ctx, bl->sampler[1]);
2609 
2610    _mesa_hash_table_destroy(bl->blit_cs, NULL);
2611 }
2612 
2613 void
2614 agx_destroy_meta_shaders(struct agx_context *ctx)
2615 {
2616    struct agx_device *dev = agx_device(ctx->base.screen);
2617    hash_table_foreach(ctx->generic_meta, ent) {
2618       agx_delete_compiled_shader(dev, ent->data);
2619    }
2620 
2621    agx_destroy_compute_blitter(&ctx->base, &ctx->compute_blitter);
2622    _mesa_hash_table_destroy(ctx->generic_meta, NULL);
2623 }
2624 
2625 static struct agx_compiled_shader *
2626 agx_build_meta_shader_internal(struct agx_context *ctx,
2627                                meta_shader_builder_t builder, void *data,
2628                                size_t data_size, bool prolog, bool epilog,
2629                                unsigned cf_base, bool internal_kernel)
2630 {
2631    /* Build the meta shader key */
2632    size_t total_key_size = sizeof(struct agx_generic_meta_key) + data_size;
2633    struct agx_generic_meta_key *key = alloca(total_key_size);
2634 
2635    *key = (struct agx_generic_meta_key){
2636       .builder = builder,
2637       .key_size = data_size,
2638    };
2639 
2640    if (data_size)
2641       memcpy(key->key, data, data_size);
2642 
2643    /* Try to get the cached shader */
2644    struct hash_entry *ent = _mesa_hash_table_search(ctx->generic_meta, key);
2645    if (ent)
2646       return ent->data;
2647 
2648    /* Otherwise, compile the shader fresh */
2649    nir_builder b = nir_builder_init_simple_shader(
2650       MESA_SHADER_COMPUTE, &agx_nir_options, "AGX meta shader");
2651 
2652    builder(&b, data);
2653 
2654    struct agx_device *dev = agx_device(ctx->base.screen);
2655    if (!prolog) {
2656       /* We need to link libagx and assign shared memory before preprocessing, matching
2657        * what the driver would otherwise produce.
2658        */
2659       agx_link_libagx(b.shader, dev->libagx);
2660 
2661       NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types,
2662                nir_var_mem_shared, glsl_get_cl_type_size_align);
2663 
2664       NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared,
2665                nir_address_format_62bit_generic);
2666 
2667       agx_preprocess_nir(b.shader, NULL);
2668       NIR_PASS(_, b.shader, agx_nir_lower_texture);
2669       NIR_PASS(_, b.shader, agx_nir_lower_multisampled_image_store);
2670    }
2671 
2672    struct agx_compiled_shader *shader = agx_compile_nir(
2673       dev, b.shader, NULL, PIPE_SHADER_COMPUTE, internal_kernel,
2674       !prolog && !(b.shader->info.stage == MESA_SHADER_FRAGMENT &&
2675                    b.shader->info.fs.uses_sample_shading),
2676       prolog || epilog, cf_base, NULL);
2677 
2678    ralloc_free(b.shader);
2679 
2680    /* ...and cache it before we return. The key is on the stack right now, so
2681     * clone it before using it as a hash table key. The clone is logically owned
2682     * by the hash table.
2683     */
2684    void *cloned_key = rzalloc_size(ctx->generic_meta, total_key_size);
2685    memcpy(cloned_key, key, total_key_size);
2686 
2687    _mesa_hash_table_insert(ctx->generic_meta, cloned_key, shader);
2688    return shader;
2689 }
2690 
2691 struct agx_compiled_shader *
2692 agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder,
2693                       void *data, size_t data_size)
2694 {
2695    return agx_build_meta_shader_internal(ctx, builder, data, data_size, false,
2696                                          false, 0, false);
2697 }
2698 
2699 static unsigned
2700 sampler_count(struct agx_context *ctx, enum pipe_shader_type stage)
2701 {
2702    /* We reserve sampler #0 for txf, so add 1 to the API count */
2703    return ctx->stage[stage].sampler_count + 1;
2704 }
2705 
2706 static inline enum agx_sampler_states
2707 translate_sampler_state_count(struct agx_context *ctx,
2708                               enum pipe_shader_type stage)
2709 {
2710    /* Clamp to the binding table maximum; anything larger will be bindless */
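   /* For example, 15 API samplers plus the reserved txf sampler still fit the
    * 16-entry table, while a 16th API sampler pushes the count to 17 and the
    * overflow goes bindless.
    */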
2711    return agx_translate_sampler_state_count(MIN2(sampler_count(ctx, stage), 16),
2712                                             ctx->stage[stage].custom_borders);
2713 }
2714 
2715 static uint32_t
2716 agx_nr_tex_descriptors_without_spilled_rts(const struct agx_compiled_shader *cs)
2717 {
2718    if (!cs || !cs->so)
2719       return 0;
2720 
2721    /* 2 descriptors per image, 1 descriptor per texture */
2722    return cs->so->info.nr_bindful_textures +
2723           (2 * cs->so->info.nr_bindful_images);
2724 }
2725 
2726 static uint32_t
2727 agx_nr_tex_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2728 {
2729    unsigned n = agx_nr_tex_descriptors_without_spilled_rts(cs);
2730 
2731    /* We add on texture/PBE descriptors for spilled render targets */
2732    bool spilled_rt = cs->stage == PIPE_SHADER_FRAGMENT &&
2733                      agx_tilebuffer_spills(&batch->tilebuffer_layout);
2734    if (spilled_rt)
2735       n += (batch->key.nr_cbufs * 2);
2736 
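   /* Worked example: 4 bindful textures and 2 bindful images give
    * 4 + 2*2 = 8 descriptors; a spilled tilebuffer with 3 colour buffers adds
    * 3*2 more texture/PBE pairs for a total of 14.
    */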
2737    return n;
2738 }
2739 
2740 /*
2741  * For spilled render targets, upload a texture/PBE pair for each surface to
2742  * allow loading/storing to the render target from the shader.
2743  */
2744 static void
2745 agx_upload_spilled_rt_descriptors(struct agx_texture_packed *out,
2746                                   struct agx_batch *batch)
2747 {
2748    for (unsigned rt = 0; rt < batch->key.nr_cbufs; ++rt) {
2749       struct agx_texture_packed *texture = out + (2 * rt);
2750       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2751 
2752       struct pipe_surface *surf = batch->key.cbufs[rt];
2753       if (!surf)
2754          continue;
2755 
2756       struct agx_resource *rsrc = agx_resource(surf->texture);
2757       struct pipe_image_view view = image_view_for_surface(surf);
2758       struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
2759       sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2760 
2761       agx_pack_texture(texture, rsrc, surf->format, &sampler_view);
2762       agx_batch_upload_pbe(batch, pbe, &view, false, false, true, true);
2763    }
2764 }
2765 
2766 static void
2767 agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs,
2768                     enum pipe_shader_type stage)
2769 {
2770    struct agx_context *ctx = batch->ctx;
2771 
2772    /* This can occur for meta shaders */
2773    if (!cs->so) {
2774       batch->texture_count[stage] = 0;
2775       batch->stage_uniforms[stage].texture_base = 0;
2776       return;
2777    }
2778 
2779    unsigned nr_textures = cs->so->info.nr_bindful_textures;
2780 
2781    unsigned nr_active_textures = ctx->stage[stage].texture_count;
2782    unsigned nr_tex_descriptors = agx_nr_tex_descriptors(batch, cs);
2783    unsigned nr_images = cs->so->info.nr_bindful_images;
2784 
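   /* Descriptor table layout: the API textures come first, then a texture/PBE
    * pair for each bindful image, then (for fragment shaders when the
    * tilebuffer spills) a texture/PBE pair per colour buffer.
    */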
2785    struct agx_ptr T_tex = agx_pool_alloc_aligned(
2786       &batch->pool, AGX_TEXTURE_LENGTH * nr_tex_descriptors, 64);
2787 
2788    struct agx_texture_packed *textures = T_tex.cpu;
2789 
2790    for (unsigned i = 0; i < MIN2(nr_textures, nr_active_textures); ++i) {
2791       struct agx_sampler_view *tex = ctx->stage[stage].textures[i];
2792 
2793       if (tex == NULL) {
2794          agx_set_null_texture(&textures[i], T_tex.gpu);
2795          continue;
2796       }
2797 
2798       struct agx_resource *rsrc = tex->rsrc;
2799       agx_batch_reads(batch, tex->rsrc);
2800 
2801       /* Re-emit state because the layout might have changed from under us.
2802        * TODO: optimize this somehow?
2803        */
2804       agx_pack_texture(&tex->desc, rsrc, tex->format, &tex->base);
2805 
2806       textures[i] = tex->desc;
2807    }
2808 
2809    for (unsigned i = nr_active_textures; i < nr_textures; ++i)
2810       agx_set_null_texture(&textures[i], T_tex.gpu);
2811 
2812    for (unsigned i = 0; i < nr_images; ++i) {
2813       /* Image descriptors come in pairs after the textures */
2814       struct agx_texture_packed *texture =
2815          ((struct agx_texture_packed *)T_tex.cpu) + nr_textures + (2 * i);
2816 
2817       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2818 
2819       if (!(ctx->stage[stage].image_mask & BITFIELD_BIT(i))) {
2820          agx_set_null_texture(texture, T_tex.gpu);
2821          agx_set_null_pbe(pbe, agx_pool_alloc_aligned(&batch->pool, 1, 64).gpu);
2822          continue;
2823       }
2824 
2825       struct pipe_image_view *view = &ctx->stage[stage].images[i];
2826       agx_batch_track_image(batch, view);
2827 
2828       struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view);
2829 
2830       /* For the texture descriptor, lower cubes to 2D arrays. This matches the
2831        * transform done in the compiler. Also, force 2D arrays for internal
2832        * blitter images, this helps reduce shader variants.
2833        */
2834       bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
2835 
2836       if (target_is_cube(sampler_view.target) ||
2837           (sampler_view.target == PIPE_TEXTURE_3D && internal))
2838          sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2839 
2840       agx_pack_texture(texture, agx_resource(view->resource), view->format,
2841                        &sampler_view);
2842       agx_batch_upload_pbe(batch, pbe, view, false, false, false, false);
2843    }
2844 
2845    if (stage == PIPE_SHADER_FRAGMENT &&
2846        agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
2847 
2848       struct agx_texture_packed *out =
2849          ((struct agx_texture_packed *)T_tex.cpu) +
2850          agx_nr_tex_descriptors_without_spilled_rts(cs);
2851 
2852       agx_upload_spilled_rt_descriptors(out, batch);
2853    }
2854 
2855    batch->texture_count[stage] = nr_tex_descriptors;
2856    batch->stage_uniforms[stage].texture_base = T_tex.gpu;
2857 }
2858 
2859 uint16_t
2860 agx_sampler_heap_add(struct agx_device *dev, struct agx_sampler_heap *heap,
2861                      struct agx_sampler_packed *sampler)
2862 {
2863    /* Allocate (maximally sized) BO if we haven't already */
2864    if (!heap->bo) {
2865       heap->bo = agx_bo_create(dev, AGX_SAMPLER_HEAP_SIZE * AGX_SAMPLER_LENGTH,
2866                                0, AGX_BO_WRITEBACK, "Sampler heap");
2867 
2868       assert(heap->count == 0);
2869    }
2870 
2871    /* TODO search */
2872 
2873    /* Precondition: there is room in the heap */
2874    assert(heap->count < AGX_SAMPLER_HEAP_SIZE);
2875    struct agx_sampler_packed *samplers = agx_bo_map(heap->bo);
2876    memcpy(samplers + heap->count, sampler, sizeof(*sampler));
2877 
2878    return heap->count++;
2879 }
2880 
2881 static void
2882 agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
2883                     enum pipe_shader_type stage)
2884 {
2885    struct agx_context *ctx = batch->ctx;
2886 
2887    unsigned nr_samplers = sampler_count(ctx, stage);
2888    bool custom_borders = ctx->stage[stage].custom_borders;
2889 
2890    size_t sampler_length =
2891       AGX_SAMPLER_LENGTH + (custom_borders ? AGX_BORDER_LENGTH : 0);
2892 
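   /* Each record is a sampler descriptor optionally followed by its custom
    * border colour, so record i starts at offset i * sampler_length.
    */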
2893    struct agx_ptr T =
2894       agx_pool_alloc_aligned(&batch->pool, sampler_length * nr_samplers, 64);
2895 
2896    /* Sampler #0 is reserved for txf */
2897    agx_pack_txf_sampler(T.cpu);
2898 
2899    /* Remaining samplers are API samplers */
2900    uint8_t *out_sampler = (uint8_t *)T.cpu + sampler_length;
2901    for (unsigned i = 0; i < ctx->stage[stage].sampler_count; ++i) {
2902       struct agx_sampler_state *sampler = ctx->stage[stage].samplers[i];
2903       struct agx_sampler_packed *out = (struct agx_sampler_packed *)out_sampler;
2904 
2905       if (sampler) {
2906          *out = sampler->desc;
2907 
2908          if (custom_borders) {
2909             STATIC_ASSERT(sizeof(sampler->border) == AGX_BORDER_LENGTH);
2910 
2911             memcpy(out_sampler + AGX_SAMPLER_LENGTH, &sampler->border,
2912                    AGX_BORDER_LENGTH);
2913          } else {
2914             assert(!sampler->uses_custom_border && "invalid combination");
2915          }
2916       } else {
2917          memset(out, 0, sampler_length);
2918       }
2919 
2920       out_sampler += sampler_length;
2921    }
2922 
2923    batch->sampler_count[stage] = nr_samplers;
2924    batch->samplers[stage] = T.gpu;
2925 }
2926 
2927 static void
2928 agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2929 {
2930    struct agx_context *ctx = batch->ctx;
2931    if (!cs)
2932       return;
2933 
2934    enum pipe_shader_type stage = cs->stage;
2935    if (!ctx->stage[stage].dirty)
2936       return;
2937 
2938    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_CONST)
2939       agx_set_cbuf_uniforms(batch, stage);
2940 
2941    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SSBO)
2942       agx_set_ssbo_uniforms(batch, stage);
2943 
2944    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE)
2945       agx_upload_textures(batch, cs, stage);
2946 
2947    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2948       agx_set_sampler_uniforms(batch, stage);
2949 
2950    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2951       agx_upload_samplers(batch, cs, stage);
2952 
2953    struct agx_stage_uniforms *unif = &batch->stage_uniforms[stage];
2954 
2955    batch->uniforms.tables[AGX_SYSVAL_STAGE(stage)] =
2956       agx_pool_upload_aligned(&batch->pool, unif, sizeof(*unif), 16);
2957 }
2958 
2959 static uint32_t
2960 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
2961                    struct agx_linked_shader *linked,
2962                    enum pipe_shader_type phys_stage,
2963                    unsigned variable_shared_mem, size_t max_subgroups)
2964 {
2965    struct agx_context *ctx = batch->ctx;
2966    struct agx_device *dev = agx_device(ctx->base.screen);
2967    unsigned constant_push_ranges = DIV_ROUND_UP(cs->b.info.rodata.size_16, 64);
2968 
2969    size_t usc_size =
2970       agx_usc_size(constant_push_ranges + cs->push_range_count + 2);
2971 
2972    struct agx_ptr t =
2973       agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
2974 
2975    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
2976 
2977    enum pipe_shader_type stage = cs->stage;
2978 
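   /* USC words are pushed in order: texture/sampler state, pushed uniform
    * ranges, rodata immediates, the shared/tilebuffer configuration, the
    * shader and its register counts, and finally the preshader (or
    * NO_PRESHADER).
    */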
2979    if (batch->texture_count[stage]) {
2980       agx_usc_pack(&b, TEXTURE, cfg) {
2981          cfg.start = 0;
2982          cfg.count =
2983             MIN2(batch->texture_count[stage], AGX_NUM_TEXTURE_STATE_REGS);
2984          cfg.buffer = batch->stage_uniforms[stage].texture_base;
2985       }
2986    }
2987 
2988    if (batch->sampler_count[stage]) {
2989       agx_usc_pack(&b, SAMPLER, cfg) {
2990          cfg.start = 0;
2991          cfg.count = batch->sampler_count[stage];
2992          cfg.buffer = batch->samplers[stage];
2993       }
2994    }
2995 
2996    for (unsigned i = 0; i < cs->push_range_count; ++i) {
2997       unsigned table = cs->push[i].table;
2998       uint64_t table_ptr = batch->uniforms.tables[table];
2999 
3000       /* Params may be omitted if the VS prolog does not read them, but the
3001        * reservation is always there in the API shader just in case.
3002        */
3003       if (table == AGX_SYSVAL_TABLE_PARAMS && !table_ptr)
3004          continue;
3005 
3006       assert(table_ptr);
3007 
3008       agx_usc_uniform(&b, cs->push[i].uniform, cs->push[i].length,
3009                       table_ptr + cs->push[i].offset);
3010    }
3011 
3012    if (cs->bo) {
3013       agx_usc_immediates(&b, &cs->b.info.rodata, cs->bo->va->addr);
3014    }
3015 
3016    uint32_t max_scratch_size =
3017       MAX2(cs->b.info.scratch_size, cs->b.info.preamble_scratch_size);
3018 
3019    if (max_scratch_size > 0) {
3020       unsigned preamble_size = (cs->b.info.preamble_scratch_size > 0) ? 1 : 0;
3021 
3022       switch (phys_stage) {
3023       case PIPE_SHADER_FRAGMENT:
3024          agx_scratch_alloc(&ctx->scratch_fs, max_scratch_size, max_subgroups);
3025          batch->fs_scratch = true;
3026          batch->fs_preamble_scratch =
3027             MAX2(batch->fs_preamble_scratch, preamble_size);
3028          break;
3029       case PIPE_SHADER_VERTEX:
3030          agx_scratch_alloc(&ctx->scratch_vs, max_scratch_size, max_subgroups);
3031          batch->vs_scratch = true;
3032          batch->vs_preamble_scratch =
3033             MAX2(batch->vs_preamble_scratch, preamble_size);
3034          break;
3035       default:
3036          agx_scratch_alloc(&ctx->scratch_cs, max_scratch_size, max_subgroups);
3037          batch->cs_scratch = true;
3038          batch->cs_preamble_scratch =
3039             MAX2(batch->cs_preamble_scratch, preamble_size);
3040          break;
3041       }
3042    }
3043 
3044    if (stage == PIPE_SHADER_FRAGMENT) {
3045       agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
3046    } else {
3047       agx_usc_shared_non_fragment(&b, &cs->b.info, variable_shared_mem);
3048    }
3049 
3050    if (linked) {
3051       agx_usc_push_packed(&b, SHADER, linked->shader);
3052       agx_usc_push_packed(&b, REGISTERS, linked->regs);
3053 
3054       if (stage == PIPE_SHADER_FRAGMENT)
3055          agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, linked->fragment_props);
3056    } else {
3057       agx_usc_pack(&b, SHADER, cfg) {
3058          cfg.code =
3059             agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.main_offset);
3060          cfg.unk_2 = 3;
3061       }
3062 
3063       agx_usc_pack(&b, REGISTERS, cfg) {
3064          cfg.register_count = cs->b.info.nr_gprs;
3065          cfg.spill_size = cs->b.info.scratch_size
3066                              ? agx_scratch_get_bucket(cs->b.info.scratch_size)
3067                              : 0;
3068       }
3069    }
3070 
3071    if (cs->b.info.has_preamble) {
3072       agx_usc_pack(&b, PRESHADER, cfg) {
3073          cfg.code =
3074             agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.preamble_offset);
3075       }
3076    } else {
3077       agx_usc_pack(&b, NO_PRESHADER, cfg)
3078          ;
3079    }
3080 
3081    return agx_usc_addr(dev, t.gpu);
3082 }
3083 
3084 static void
3085 agx_launch_internal(struct agx_batch *batch, struct agx_grid grid,
3086                     struct agx_workgroup wg,
3087                     struct agx_cdm_launch_word_0_packed launch,
3088                     enum pipe_shader_type stage, uint32_t usc)
3089 {
3090    struct agx_context *ctx = batch->ctx;
3091    struct agx_device *dev = agx_device(ctx->base.screen);
3092 
3093    /* TODO: Ensure space if we allow multiple kernels in a batch */
3094    uint32_t *out = (uint32_t *)batch->cdm.current;
3095 
3096    out = agx_cdm_launch(out, dev->chip, grid, wg, launch, usc);
3097    out = agx_cdm_barrier(out, dev->chip);
3098 
3099    batch->cdm.current = (void *)out;
3100    assert(batch->cdm.current <= batch->cdm.end &&
3101           "Failed to reserve sufficient space in encoder");
3102 }
3103 
3104 void
3105 agx_launch_precomp(struct agx_batch *batch, struct agx_grid grid,
3106                    enum agx_barrier barrier, enum libagx_program program,
3107                    void *args, size_t arg_size)
3108 {
3109    struct agx_device *dev = agx_device(batch->ctx->base.screen);
3110    struct agx_precompiled_shader *cs =
3111       agx_get_precompiled(&batch->ctx->bg_eot, program);
3112 
3113    struct agx_ptr t =
3114       agx_pool_alloc_aligned(&batch->pipeline_pool, agx_usc_size(15), 64);
3115 
3116    uint64_t uploaded_data =
3117       agx_pool_upload_aligned(&batch->pool, args, arg_size, 4);
3118 
3119    uint32_t usc = agx_usc_addr(dev, t.gpu);
3120    agx_usc_words_precomp(t.cpu, &cs->b, uploaded_data, arg_size);
3121 
3122    agx_batch_add_bo(batch, cs->bo);
3123    agx_launch_internal(batch, grid, cs->b.workgroup, cs->b.launch,
3124                        PIPE_SHADER_COMPUTE, usc);
3125 }
3126 
3127 struct asahi_bg_eot
3128 agx_build_bg_eot(struct agx_batch *batch, bool store, bool partial_render)
3129 {
3130    struct agx_context *ctx = batch->ctx;
3131 
3132    /* Construct the key */
3133    struct agx_bg_eot_key key = {.tib = batch->tilebuffer_layout};
3134 
3135    bool needs_textures_for_spilled_rts =
3136       agx_tilebuffer_spills(&batch->tilebuffer_layout) && !partial_render &&
3137       !store;
3138 
3139    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3140       struct pipe_surface *surf = batch->key.cbufs[rt];
3141 
3142       if (surf == NULL)
3143          continue;
3144 
3145       if (store) {
3146          /* TODO: Suppress stores to discarded render targets */
3147          key.op[rt] = AGX_EOT_STORE;
3148       } else if (batch->tilebuffer_layout.spilled[rt] && partial_render) {
3149          /* Partial render programs exist only to store/load the tilebuffer to
3150           * main memory. When render targets are already spilled to main memory,
3151           * there's nothing to do.
3152           */
3153          key.op[rt] = AGX_BG_EOT_NONE;
3154       } else {
3155          bool valid = (batch->load & (PIPE_CLEAR_COLOR0 << rt));
3156          bool clear = (batch->clear & (PIPE_CLEAR_COLOR0 << rt));
3157          bool load = valid && !clear;
3158 
3159          /* Don't read back spilled render targets, they're already in memory */
3160          load &= !batch->tilebuffer_layout.spilled[rt];
3161 
3162          /* The background program used for partial renders must always load
3163           * whatever was stored in the mid-frame end-of-tile program.
3164           */
3165          load |= partial_render;
3166 
3167          key.op[rt] = load    ? AGX_BG_LOAD
3168                       : clear ? AGX_BG_CLEAR
3169                               : AGX_BG_EOT_NONE;
3170       }
3171    }
3172 
3173    /* Begin building the pipeline */
3174    size_t usc_size = agx_usc_size(3 + PIPE_MAX_COLOR_BUFS);
3175    struct agx_ptr t =
3176       agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
3177    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
3178 
3179    bool needs_sampler = false;
3180    unsigned uniforms = 0;
3181    unsigned nr_tex = 0;
3182 
3183    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3184       if (key.op[rt] == AGX_BG_LOAD) {
3185          /* Each reloaded render target is textured */
3186          needs_sampler = true;
3187 
3188          /* Uploaded later with the spilled-RT descriptors; binding here would be clobbered */
3189          if (needs_textures_for_spilled_rts)
3190             continue;
3191 
3192          struct agx_ptr texture =
3193             agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
3194          struct pipe_surface *surf = batch->key.cbufs[rt];
3195          assert(surf != NULL && "cannot load nonexistent attachment");
3196 
3197          struct agx_resource *rsrc = agx_resource(surf->texture);
3198          struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
3199 
3200          agx_pack_texture(texture.cpu, rsrc, surf->format, &sampler_view);
3201 
3202          agx_usc_pack(&b, TEXTURE, cfg) {
3203             /* Shifted to match eMRT indexing, could be optimized */
3204             cfg.start = rt * 2;
3205             cfg.count = 1;
3206             cfg.buffer = texture.gpu;
3207          }
3208 
3209          nr_tex = (rt * 2) + 1;
3210       } else if (key.op[rt] == AGX_BG_CLEAR) {
3211          assert(batch->uploaded_clear_color[rt] && "set when cleared");
3212          agx_usc_uniform(&b, 4 + (8 * rt), 8, batch->uploaded_clear_color[rt]);
3213          uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
3214       } else if (key.op[rt] == AGX_EOT_STORE) {
3215          struct pipe_image_view view =
3216             image_view_for_surface(batch->key.cbufs[rt]);
3217          struct agx_ptr pbe =
3218             agx_pool_alloc_aligned(&batch->pool, AGX_PBE_LENGTH, 256);
3219 
3220          /* The tilebuffer is already in sRGB space if needed. Do not convert */
3221          view.format = util_format_linear(view.format);
3222 
3223          bool no_compress = batch->feedback & (PIPE_CLEAR_COLOR0 << rt);
3224          agx_batch_upload_pbe(batch, pbe.cpu, &view, true, true, false,
3225                               no_compress);
3226 
3227          agx_usc_pack(&b, TEXTURE, cfg) {
3228             cfg.start = rt;
3229             cfg.count = 1;
3230             cfg.buffer = pbe.gpu;
3231          }
3232 
3233          nr_tex = rt + 1;
3234       }
3235    }
3236 
3237    if (needs_textures_for_spilled_rts) {
3238       /* Upload texture/PBE descriptors for each render target so we can clear
3239        * spilled render targets.
3240        */
3241       struct agx_ptr descs = agx_pool_alloc_aligned(
3242          &batch->pool, AGX_TEXTURE_LENGTH * 2 * batch->key.nr_cbufs, 64);
3243       agx_upload_spilled_rt_descriptors(descs.cpu, batch);
3244 
3245       agx_usc_pack(&b, TEXTURE, cfg) {
3246          cfg.start = 0;
3247          cfg.count = 2 * batch->key.nr_cbufs;
3248          cfg.buffer = descs.gpu;
3249       }
3250 
3251       nr_tex = MAX2(nr_tex, 2 * batch->key.nr_cbufs);
3252 
3253       /* Bind the base as u0_u1 for bindless access */
3254       agx_usc_uniform(&b, 0, 4,
3255                       agx_pool_upload_aligned(&batch->pool, &descs.gpu, 8, 8));
3256       uniforms = MAX2(uniforms, 4);
3257    }
3258 
3259    /* All render targets share a sampler */
3260    if (needs_sampler) {
3261       struct agx_ptr sampler =
3262          agx_pool_alloc_aligned(&batch->pool, AGX_SAMPLER_LENGTH, 64);
3263 
3264       agx_pack(sampler.cpu, SAMPLER, cfg) {
3265          cfg.minimum_lod = 0.0f;
3266          cfg.maximum_lod = INFINITY;
3267          cfg.magnify = AGX_FILTER_LINEAR;
3268          cfg.minify = AGX_FILTER_NEAREST;
3269          cfg.mip_filter = AGX_MIP_FILTER_NONE;
3270          cfg.wrap_s = AGX_WRAP_CLAMP_TO_EDGE;
3271          cfg.wrap_t = AGX_WRAP_CLAMP_TO_EDGE;
3272          cfg.wrap_r = AGX_WRAP_CLAMP_TO_EDGE;
3273          cfg.pixel_coordinates = true;
3274          cfg.compare_func = AGX_COMPARE_FUNC_ALWAYS;
3275       }
3276 
3277       agx_usc_pack(&b, SAMPLER, cfg) {
3278          cfg.start = 0;
3279          cfg.count = 1;
3280          cfg.buffer = sampler.gpu;
3281       }
3282    }
3283 
3284    agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
3285 
3286    /* Get the shader */
3287    key.reserved_preamble = uniforms;
3288    struct agx_device *dev = agx_device(ctx->base.screen);
3289    struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&ctx->bg_eot, &key);
3290    agx_batch_add_bo(batch, shader->bo);
3291    assert(shader->info.rodata.size_16 == 0);
3292 
3293    agx_usc_pack(&b, SHADER, cfg) {
3294       cfg.code = agx_usc_addr(dev, shader->ptr + shader->info.main_offset);
3295       cfg.unk_2 = 0;
3296    }
3297 
3298    agx_usc_pack(&b, REGISTERS, cfg)
3299       cfg.register_count = shader->info.nr_gprs;
3300 
3301    if (shader->info.has_preamble) {
3302       agx_usc_pack(&b, PRESHADER, cfg) {
3303          cfg.code =
3304             agx_usc_addr(dev, shader->ptr + shader->info.preamble_offset);
3305       }
3306    } else {
3307       agx_usc_pack(&b, NO_PRESHADER, cfg)
3308          ;
3309    }
3310 
3311    struct asahi_bg_eot ret = {.usc = t.gpu};
3312 
3313    agx_pack(&ret.counts, COUNTS, cfg) {
3314       cfg.uniform_register_count = shader->info.push_count;
3315       cfg.preshader_register_count = shader->info.nr_preamble_gprs;
3316       cfg.texture_state_register_count = nr_tex;
3317       cfg.sampler_state_register_count =
3318          agx_translate_sampler_state_count(needs_sampler ? 1 : 0, false);
3319 
3320       if (!store)
3321          cfg.unknown = 0xFFFF;
3322    }
3323 
3324    return ret;
3325 }
3326 
3327 /*
3328  * Return the standard sample positions, packed into a 32-bit word with fixed
3329  * point nibbles for each x/y component of the (at most 4) samples. This is
3330  * suitable for programming the PPP_MULTISAMPLECTL control register.
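 *
 * As a sanity check on the packing: the single-sample value 0x88 below is two
 * 4-bit fixed point nibbles of 0x8, i.e. x = y = 8/16 = 0.5, the pixel centre.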
3331  */
3332 static uint32_t
3333 agx_default_sample_positions(unsigned nr_samples)
3334 {
3335    switch (nr_samples) {
3336    case 1:
3337       return 0x88;
3338    case 2:
3339       return 0x44cc;
3340    case 4:
3341       return 0xeaa26e26;
3342    default:
3343       unreachable("Invalid sample count");
3344    }
3345 }
3346 
3347 void
3348 agx_batch_init_state(struct agx_batch *batch)
3349 {
3350    if (batch->initialized)
3351       return;
3352 
3353    if (agx_batch_is_compute(batch)) {
3354       batch->initialized = true;
3355 
3356       struct agx_context *ctx = batch->ctx;
3357       struct agx_device *dev = agx_device(ctx->base.screen);
3358       uint8_t *out = batch->cdm.current;
3359 
3360       /* See below */
3361       agx_push(out, CDM_BARRIER, cfg) {
3362          cfg.usc_cache_inval = true;
3363          cfg.unk_5 = true;
3364          cfg.unk_6 = true;
3365          cfg.unk_8 = true;
3366          // cfg.unk_11 = true;
3367          // cfg.unk_20 = true;
3368          if (dev->params.num_clusters_total > 1) {
3369             // cfg.unk_24 = true;
3370             if (dev->params.gpu_generation == 13) {
3371                cfg.unk_4 = true;
3372                // cfg.unk_26 = true;
3373             }
3374          }
3375       }
3376 
3377       return;
3378    }
3379 
3380    /* Emit state that never changes over the batch and so isn't dirty-tracked */
3381    uint8_t *out = batch->vdm.current;
3382 
3383    /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
3384     * with another that caused stale data to be cached and the CPU wrote to it
3385     * in the meantime.
3386     */
3387    agx_push(out, VDM_BARRIER, cfg) {
3388       cfg.usc_cache_inval = true;
3389    }
3390 
3391    struct AGX_PPP_HEADER present = {
3392       .w_clamp = true,
3393       .occlusion_query_2 = true,
3394       .output_unknown = true,
3395       .varying_word_2 = true,
3396       .viewport_count = 1, /* irrelevant */
3397    };
3398 
3399    size_t size = agx_ppp_update_size(&present);
3400    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3401    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
3402 
3403    /* clang-format off */
3404    agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
3405    agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
3406    agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
3407    agx_ppp_push(&ppp, VARYING_2, cfg);
3408    /* clang-format on */
3409 
3410    agx_ppp_fini(&out, &ppp);
3411    batch->vdm.current = out;
3412 
3413    /* Mark it as initialized now, since agx_batch_writes() will check this. */
3414    batch->initialized = true;
3415 
3416    /* Choose a tilebuffer layout given the framebuffer key */
3417    enum pipe_format formats[PIPE_MAX_COLOR_BUFS] = {0};
3418    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3419       struct pipe_surface *surf = batch->key.cbufs[i];
3420       if (surf)
3421          formats[i] = surf->format;
3422    }
3423 
3424    batch->tilebuffer_layout = agx_build_tilebuffer_layout(
3425       formats, batch->key.nr_cbufs,
3426       util_framebuffer_get_num_samples(&batch->key),
3427       util_framebuffer_get_num_layers(&batch->key) > 1);
3428 
3429    if (agx_device(batch->ctx->base.screen)->debug & AGX_DBG_SMALLTILE)
3430       batch->tilebuffer_layout.tile_size = (struct agx_tile_size){16, 16};
3431 
3432    /* If the layout spilled render targets, we need to decompress those render
3433     * targets to ensure we can write to them.
3434     */
3435    if (agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
3436       for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3437          if (!batch->tilebuffer_layout.spilled[i])
3438             continue;
3439 
3440          struct pipe_surface *surf = batch->key.cbufs[i];
3441          if (!surf)
3442             continue;
3443 
3444          struct agx_resource *rsrc = agx_resource(surf->texture);
3445          struct ail_layout *layout = &rsrc->layout;
3446          unsigned level = surf->u.tex.level;
3447 
3448          if (!ail_is_level_compressed(layout, level))
3449             continue;
3450 
3451          if (true || (rsrc->base.bind & PIPE_BIND_SHARED)) {
3452             agx_decompress_inplace(batch, surf, "Render target spilled");
3453          } else {
3454             agx_decompress(batch->ctx, rsrc, "Render target spilled");
3455          }
3456       }
3457    }
3458 
3459    if (batch->key.zsbuf) {
3460       unsigned level = batch->key.zsbuf->u.tex.level;
3461       struct agx_resource *rsrc = agx_resource(batch->key.zsbuf->texture);
3462 
3463       agx_batch_writes(batch, rsrc, level);
3464 
3465       if (rsrc->separate_stencil)
3466          agx_batch_writes(batch, rsrc->separate_stencil, level);
3467    }
3468 
3469    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3470       if (batch->key.cbufs[i]) {
3471          struct agx_resource *rsrc = agx_resource(batch->key.cbufs[i]->texture);
3472          unsigned level = batch->key.cbufs[i]->u.tex.level;
3473 
3474          if (agx_resource_valid(rsrc, level))
3475             batch->load |= PIPE_CLEAR_COLOR0 << i;
3476 
3477          agx_batch_writes(batch, rsrc, level);
3478          assert(agx_resource_valid(rsrc, level));
3479       }
3480    }
3481 
3482    /* Set up standard sample positions */
3483    batch->uniforms.ppp_multisamplectl =
3484       agx_default_sample_positions(batch->tilebuffer_layout.nr_samples);
3485 }
3486 
3487 static enum agx_object_type
3488 agx_point_object_type(struct agx_rasterizer *rast)
3489 {
3490    return (rast->base.sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT)
3491              ? AGX_OBJECT_TYPE_POINT_SPRITE_UV01
3492              : AGX_OBJECT_TYPE_POINT_SPRITE_UV10;
3493 }
3494 
3495 #define MAX_PPP_UPDATES 2
3496 #define IS_DIRTY(ST)    !!(ctx->dirty & AGX_DIRTY_##ST)
3497 
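/*
 * Encode dirty rendering state into the VDM and PPP control streams. At most
 * MAX_PPP_UPDATES state blocks are counted per call: the varying linkage
 * rebuild and the main dirty-state PPP packet, which the assert at the end
 * checks.
 */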
3498 static uint8_t *
3499 agx_encode_state(struct agx_batch *batch, uint8_t *out)
3500 {
3501    struct agx_context *ctx = batch->ctx;
3502    struct agx_device *dev = agx_device(ctx->base.screen);
3503 
3504    /* If nothing is dirty, encode nothing */
3505    if (!ctx->dirty)
3506       return out;
3507 
3508    struct agx_rasterizer *rast = ctx->rast;
3509    unsigned ppp_updates = 0;
3510 
3511    struct agx_compiled_shader *vs = ctx->vs;
3512    if (ctx->gs)
3513       vs = ctx->gs->gs_copy;
3514 
3515    bool varyings_dirty = false;
3516 
3517    if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) ||
3518        IS_DIRTY(PRIM)) {
3519 
3520       unsigned bindings = ctx->linked.fs->cf.nr_bindings;
3521       if (bindings) {
3522          size_t linkage_size =
3523             AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
3524 
3525          struct agx_ptr t =
3526             agx_pool_alloc_aligned(&batch->pipeline_pool, linkage_size, 16);
3527 
3528          agx_link_varyings_vs_fs(t.cpu, &batch->linked_varyings,
3529                                  vs->uvs.user_size, &ctx->linked.fs->cf,
3530                                  ctx->rast->base.flatshade_first ? 0 : 2,
3531                                  (batch->reduced_prim == MESA_PRIM_POINTS)
3532                                     ? ctx->rast->base.sprite_coord_enable
3533                                     : 0,
3534                                  &batch->generate_primitive_id);
3535 
3536          batch->varyings = agx_usc_addr(dev, t.gpu);
3537       } else {
3538          batch->varyings = 0;
3539       }
3540 
3541       varyings_dirty = true;
3542       ppp_updates++;
3543    }
3544 
3545    if (IS_DIRTY(VS) || varyings_dirty) {
3546       agx_push(out, VDM_STATE, cfg) {
3547          cfg.vertex_shader_word_0_present = true;
3548          cfg.vertex_shader_word_1_present = true;
3549          cfg.vertex_outputs_present = true;
3550          cfg.vertex_unknown_present = true;
3551       }
3552 
3553       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_0, cfg) {
3554          cfg.uniform_register_count = vs->b.info.push_count;
3555          cfg.preshader_register_count = vs->b.info.nr_preamble_gprs;
3556          cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, vs);
3557          cfg.sampler_state_register_count =
3558             translate_sampler_state_count(ctx, vs->stage);
3559       }
3560 
3561       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3562          cfg.pipeline =
3563             agx_build_pipeline(batch, vs, ctx->gs ? NULL : ctx->linked.vs,
3564                                PIPE_SHADER_VERTEX, 0, 0);
3565       }
3566 
3567       agx_push_packed(out, vs->uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3568 
3569       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3570          cfg.flat_shading_control = ctx->rast->base.flatshade_first
3571                                        ? AGX_VDM_VERTEX_0
3572                                        : AGX_VDM_VERTEX_2;
3573          cfg.unknown_4 = cfg.unknown_5 = ctx->rast->base.rasterizer_discard;
3574 
3575          cfg.generate_primitive_id = batch->generate_primitive_id;
3576       }
3577 
3578       /* Pad up to a multiple of 8 bytes */
3579       memset(out, 0, 4);
3580       out += 4;
3581    }
3582 
3583    struct agx_pool *pool = &batch->pool;
3584 
3585    if ((ctx->dirty & AGX_DIRTY_RS) && ctx->rast->depth_bias) {
3586       agx_upload_depth_bias(batch, &ctx->rast->base);
3587       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
3588    }
3589 
3590    if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS |
3591                      AGX_DIRTY_RS | AGX_DIRTY_VS)) {
3592 
3593       agx_upload_viewport_scissor(pool, batch, &out, ctx->viewport,
3594                                   ctx->rast->base.scissor ? ctx->scissor : NULL,
3595                                   ctx->rast->base.clip_halfz,
3596                                   vs->b.info.nonzero_viewport);
3597    }
3598 
3599    bool is_points = batch->reduced_prim == MESA_PRIM_POINTS;
3600    bool is_lines = batch->reduced_prim == MESA_PRIM_LINES;
3601 
3602    bool object_type_dirty =
3603       IS_DIRTY(PRIM) || (is_points && IS_DIRTY(SPRITE_COORD_MODE));
3604 
3605    bool fragment_face_dirty =
3606       IS_DIRTY(ZS) || IS_DIRTY(STENCIL_REF) || IS_DIRTY(RS);
3607 
3608    enum agx_object_type object_type = is_points  ? agx_point_object_type(rast)
3609                                       : is_lines ? AGX_OBJECT_TYPE_LINE
3610                                                  : AGX_OBJECT_TYPE_TRIANGLE;
3611 
3612    struct AGX_PPP_HEADER dirty = {
3613       .fragment_control =
3614          IS_DIRTY(ZS) || IS_DIRTY(RS) || IS_DIRTY(PRIM) || IS_DIRTY(QUERY),
3615       .fragment_control_2 = IS_DIRTY(FS_PROG) || IS_DIRTY(RS),
3616       .fragment_front_face = fragment_face_dirty,
3617       .fragment_front_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3618       .fragment_front_stencil = IS_DIRTY(ZS),
3619       .fragment_back_face = fragment_face_dirty,
3620       .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3621       .fragment_back_stencil = IS_DIRTY(ZS),
3622       .output_select = varyings_dirty,
3623       .varying_counts_32 = varyings_dirty,
3624       .varying_counts_16 = varyings_dirty,
3625       /* Also dirty with tess but agx_draw_patches dirties RS for that */
3626       .cull = IS_DIRTY(RS),
3627       .cull_2 = varyings_dirty,
3628       .fragment_shader =
3629          IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
3630       .occlusion_query = IS_DIRTY(QUERY),
3631       .output_size = IS_DIRTY(VS_PROG),
3632       .viewport_count = 1, /* irrelevant */
3633    };
3634 
3635    size_t size = agx_ppp_update_size(&dirty);
3636    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3637    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
3638 
3639    if (dirty.fragment_control) {
3640       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
3641          if (ctx->active_queries && ctx->occlusion_query) {
3642             if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3643                cfg.visibility_mode = AGX_VISIBILITY_MODE_COUNTING;
3644             else
3645                cfg.visibility_mode = AGX_VISIBILITY_MODE_BOOLEAN;
3646          }
3647 
3648          cfg.stencil_test_enable = ctx->zs->base.stencil[0].enabled;
3649          cfg.two_sided_stencil = ctx->zs->base.stencil[1].enabled;
3650          cfg.depth_bias_enable =
3651             rast->depth_bias && object_type == AGX_OBJECT_TYPE_TRIANGLE;
3652 
3653          /* Always enable scissoring so we may scissor to the viewport (TODO:
3654           * optimize this out if the viewport is the default and the app does
3655           * not use the scissor test)
3656           */
3657          cfg.scissor_enable = true;
3658 
3659          /* This avoids broken derivatives along primitive edges */
3660          cfg.disable_tri_merging = is_lines || is_points;
3661       }
3662    }
3663 
3664    if (dirty.fragment_control_2) {
3665       /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the
3666        * main fragment control word and has to be combined into the secondary
3667        * word for reliable behaviour.
3668        */
3669       agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
3670                           ctx->linked.fs->fragment_control) {
3671          cfg.tag_write_disable = rast->base.rasterizer_discard;
3672       }
3673    }
3674 
3675    if (dirty.fragment_front_face) {
3676       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3677          cfg.stencil_reference = ctx->stencil_ref.ref_value[0];
3678          cfg.line_width = rast->line_width;
3679          cfg.polygon_mode = rast->polygon_mode;
3680       }
3681    }
3682 
3683    if (dirty.fragment_front_face_2)
3684       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3685 
3686    if (dirty.fragment_front_stencil) {
3687       agx_ppp_push_packed(&ppp, ctx->zs->front_stencil.opaque,
3688                           FRAGMENT_STENCIL);
3689    }
3690 
3691    if (dirty.fragment_back_face) {
3692       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3693          bool twosided = ctx->zs->base.stencil[1].enabled;
3694          cfg.stencil_reference = ctx->stencil_ref.ref_value[twosided ? 1 : 0];
3695          cfg.line_width = rast->line_width;
3696          cfg.polygon_mode = rast->polygon_mode;
3697       }
3698    }
3699 
3700    if (dirty.fragment_back_face_2)
3701       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3702 
3703    if (dirty.fragment_back_stencil)
3704       agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL);
3705 
3706    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
3707    assert(dirty.varying_counts_32 == dirty.output_select);
3708 
3709    if (dirty.output_select) {
3710       agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &vs->uvs.osel,
3711                                 &ctx->linked.fs->osel);
3712 
3713       agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_32,
3714                           VARYING_COUNTS);
3715 
3716       agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_16,
3717                           VARYING_COUNTS);
3718    }
3719 
3720    if (dirty.cull) {
3721       agx_ppp_push_merged(&ppp, CULL, cfg, ctx->rast->cull) {
3722          cfg.front_face_ccw = ctx->rast->base.front_ccw;
3723 
3724          if (ctx->in_tess && !ctx->gs) {
3725             /* Yes, OpenGL is backwards. Deal with it. */
3726             cfg.front_face_ccw ^=
3727                !ctx->stage[MESA_SHADER_TESS_EVAL].shader->tess.ccw;
3728          }
3729       }
3730    }
3731 
3732    if (dirty.cull_2) {
3733       agx_ppp_push(&ppp, CULL_2, cfg) {
3734          cfg.needs_primitive_id = batch->generate_primitive_id;
3735          cfg.clamp_w = true;
3736       }
3737    }
3738 
3739    if (dirty.fragment_shader) {
3740       unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
3741 
3742       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_0, cfg) {
3743          cfg.uniform_register_count = ctx->fs->b.info.push_count;
3744          cfg.preshader_register_count = ctx->fs->b.info.nr_preamble_gprs;
3745          cfg.texture_state_register_count =
3746             agx_nr_tex_descriptors(batch, ctx->fs);
3747          cfg.sampler_state_register_count =
3748             translate_sampler_state_count(ctx, PIPE_SHADER_FRAGMENT);
3749          cfg.cf_binding_count = ctx->linked.fs->cf.nr_bindings;
3750       }
3751 
3752       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
3753          cfg.pipeline = agx_build_pipeline(batch, ctx->fs, ctx->linked.fs,
3754                                            PIPE_SHADER_FRAGMENT, 0, 0);
3755       }
3756 
3757       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
3758          cfg.cf_bindings = batch->varyings;
3759       }
3760 
3761       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) {
3762          /* XXX: This is wrong */
3763          cfg.unknown = frag_tex_count >= 4;
3764       }
3765    }
3766 
3767    if (dirty.occlusion_query) {
3768       agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
3769          if (ctx->active_queries && ctx->occlusion_query) {
3770             cfg.index = agx_get_oq_index(batch, ctx->occlusion_query);
3771          }
3772       }
3773    }
3774 
3775    if (dirty.output_size) {
3776       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg)
3777          cfg.count = vs->uvs.size;
3778    }
3779 
3780    agx_ppp_fini(&out, &ppp);
3781    ppp_updates++;
3782 
3783    assert(ppp_updates <= MAX_PPP_UPDATES);
3784    return out;
3785 }
3786 
3787 static enum agx_primitive
3788 agx_primitive_for_pipe(enum mesa_prim mode)
3789 {
3790    switch (mode) {
3791    case MESA_PRIM_POINTS:
3792       return AGX_PRIMITIVE_POINTS;
3793    case MESA_PRIM_LINES:
3794       return AGX_PRIMITIVE_LINES;
3795    case MESA_PRIM_LINE_STRIP:
3796       return AGX_PRIMITIVE_LINE_STRIP;
3797    case MESA_PRIM_LINE_LOOP:
3798       return AGX_PRIMITIVE_LINE_LOOP;
3799    case MESA_PRIM_TRIANGLES:
3800       return AGX_PRIMITIVE_TRIANGLES;
3801    case MESA_PRIM_TRIANGLE_STRIP:
3802       return AGX_PRIMITIVE_TRIANGLE_STRIP;
3803    case MESA_PRIM_TRIANGLE_FAN:
3804       return AGX_PRIMITIVE_TRIANGLE_FAN;
3805    case MESA_PRIM_QUADS:
3806       return AGX_PRIMITIVE_QUADS;
3807    case MESA_PRIM_QUAD_STRIP:
3808       return AGX_PRIMITIVE_QUAD_STRIP;
3809    default:
3810       unreachable("todo: other primitive types");
3811    }
3812 }
3813 
3814 static uint64_t
3815 agx_index_buffer_rsrc_ptr(struct agx_batch *batch,
3816                           const struct pipe_draw_info *info, size_t *extent)
3817 {
3818    assert(!info->has_user_indices && "cannot use user pointers with indirect");
3819 
3820    struct agx_resource *rsrc = agx_resource(info->index.resource);
3821    agx_batch_reads(batch, rsrc);
3822 
3823    *extent = ALIGN_POT(rsrc->layout.size_B, 4);
3824    return rsrc->bo->va->addr;
3825 }
3826 
3827 static uint64_t
3828 agx_index_buffer_direct_ptr(struct agx_batch *batch,
3829                             const struct pipe_draw_start_count_bias *draw,
3830                             const struct pipe_draw_info *info, size_t *extent)
3831 {
3832    off_t offset = draw->start * info->index_size;
3833    uint32_t max_extent = draw->count * info->index_size;
3834 
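   /* For buffer-backed indices, clamp the range to both the resource size
    * (minus the start offset) and the draw's own count; user index arrays are
    * uploaded and use the draw's count directly. Both paths keep the extent
    * 4-byte aligned.
    */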
3835    if (!info->has_user_indices) {
3836       uint64_t base = agx_index_buffer_rsrc_ptr(batch, info, extent);
3837 
3838       *extent = ALIGN_POT(MIN2(*extent - offset, max_extent), 4);
3839       return base + offset;
3840    } else {
3841       *extent = ALIGN_POT(max_extent, 4);
3842 
3843       return agx_pool_upload_aligned(&batch->pool,
3844                                      ((uint8_t *)info->index.user) + offset,
3845                                      draw->count * info->index_size, 64);
3846    }
3847 }
3848 
3849 static uint64_t
3850 agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
3851                      const struct pipe_draw_start_count_bias *draw,
3852                      size_t *extent)
3853 {
3854    if (draw)
3855       return agx_index_buffer_direct_ptr(batch, draw, info, extent);
3856    else
3857       return agx_index_buffer_rsrc_ptr(batch, info, extent);
3858 }
3859 
3860 static void
3861 agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc,
3862                             size_t space)
3863 {
3864    bool vdm = enc == &batch->vdm;
3865    assert(vdm || (enc == &batch->cdm));
3866 
3867    size_t link_length =
3868       vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;
3869 
3870    /* Assert that we have space for a link tag */
3871    assert((enc->current + link_length) <= enc->end && "Encoder overflowed");
3872 
3873    /* Always leave room for a link tag, in case we run out of space later,
3874     * plus padding because VDM apparently overreads?
3875     *
3876     * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
3877     */
3878    space += link_length + 0x800;
3879 
3880    /* If there is room in the command buffer, we're done */
3881    if (likely((enc->end - enc->current) >= space))
3882       return;
3883 
3884    /* Otherwise, we need to allocate a new command buffer. We use memory owned
3885     * by the batch to simplify lifetime management for the BO.
3886     */
3887    size_t size = 65536;
3888    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 256);
3889 
3890    /* Jump from the old command buffer to the new command buffer */
3891    agx_cs_jump((uint32_t *)enc->current, T.gpu, vdm);
3892 
3893    /* Swap out the command buffer */
3894    enc->current = T.cpu;
3895    enc->end = enc->current + size;
3896 }
3897 
3898 static void
3899 agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
3900               uint64_t draw, uint64_t ib, uint64_t ib_range_el)
3901 {
3902    struct agx_context *ctx = batch->ctx;
3903    struct agx_device *dev = agx_device(ctx->base.screen);
3904 
3905    if (!batch->cdm.bo) {
3906       batch->cdm = agx_encoder_allocate(batch, dev);
3907    }
3908 
3909    uint64_t ia_vertices = agx_get_query_address(
3910       batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]);
3911 
3912    uint64_t ia_primitives = agx_get_query_address(
3913       batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES]);
3914 
3915    uint64_t vs_invocations = agx_get_query_address(
3916       batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]);
3917 
3918    uint64_t c_prims = agx_get_query_address(
3919       batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES]);
3920 
3921    uint64_t c_invs = agx_get_query_address(
3922       batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]);
3923 
3924    /* With a geometry shader, clipper counters are written by the pre-GS kernel
3925     * since they depend on the output of the geometry shader. Without a geometry
3926     * shader, they are written along with IA.
3927     *
3928     * TODO: Broken tessellation interaction, but nobody cares.
3929     */
3930    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
3931       c_prims = 0;
3932       c_invs = 0;
3933    }
3934 
3935    if (info->primitive_restart) {
3936       perf_debug(dev, "Input assembly counters with primitive restart");
3937 
3938       libagx_increment_ia_restart(
3939          batch, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_primitives,
3940          vs_invocations, c_prims, c_invs, draw, ib, ib_range_el,
3941          info->restart_index, info->index_size, info->mode);
3942    } else {
3943       perf_debug(dev, "Input assembly counters");
3944 
3945       libagx_increment_ia(batch, agx_1d(1), AGX_BARRIER_ALL, ia_vertices,
3946                           ia_primitives, vs_invocations, c_prims, c_invs, draw,
3947                           info->mode);
3948    }
3949 }
3950 
3951 static uint64_t
3952 agx_batch_geometry_state(struct agx_batch *batch)
3953 {
3954    struct agx_context *ctx = batch->ctx;
3955 
3956    if (!batch->geometry_state) {
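      /* Lazily create a context-wide 128 MiB heap the first time geometry
       * state is needed; later batches in the context reuse the same buffer.
       */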
3957       uint32_t size = 128 * 1024 * 1024;
3958 
3959       if (!ctx->heap) {
3960          ctx->heap = pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL,
3961                                         PIPE_USAGE_DEFAULT, size);
3962       }
3963 
3964       struct agx_geometry_state state = {
3965          .heap = agx_resource(ctx->heap)->bo->va->addr,
3966          .heap_size = size,
3967       };
3968 
3969       agx_batch_writes(batch, agx_resource(ctx->heap), 0);
3970 
3971       batch->geometry_state =
3972          agx_pool_upload_aligned(&batch->pool, &state, sizeof(state), 8);
3973    }
3974 
3975    return batch->geometry_state;
3976 }
3977 
3978 static uint64_t
3979 agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
3980                           size_t index_buffer_size_B,
3981                           const struct pipe_draw_info *info,
3982                           const struct pipe_draw_start_count_bias *draw,
3983                           const struct pipe_draw_indirect_info *indirect)
3984 {
3985    struct agx_ia_state ia = {
3986       .index_buffer = input_index_buffer,
3987       .index_buffer_range_el = index_buffer_size_B / info->index_size,
3988       .verts_per_instance = draw ? draw->count : 0,
3989    };
3990 
3991    batch->uniforms.input_assembly =
3992       agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
3993 
3994    struct agx_geometry_params params = {
3995       .state = agx_batch_geometry_state(batch),
3996       .indirect_desc = batch->geom_indirect,
3997       .flat_outputs =
3998          batch->ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
3999       .input_topology = info->mode,
4000    };
4001 
4002    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->streamout.targets); ++i) {
4003       struct agx_streamout_target *so =
4004          agx_so_target(batch->ctx->streamout.targets[i]);
4005       struct agx_resource *rsrc = so ? agx_resource(so->offset) : NULL;
4006 
4007       uint32_t size;
4008       params.xfb_base_original[i] = agx_batch_get_so_address(batch, i, &size);
4009       params.xfb_size[i] = size;
4010 
4011       if (rsrc) {
4012          params.xfb_offs_ptrs[i] = rsrc->bo->va->addr;
4013          agx_batch_writes(batch, rsrc, 0);
4014          batch->incoherent_writes = true;
4015       } else {
4016          params.xfb_offs_ptrs[i] = 0;
4017       }
4018    }
4019 
4020    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->prims_generated); ++i) {
4021       params.prims_generated_counter[i] =
4022          agx_get_query_address(batch, batch->ctx->prims_generated[i]);
4023    }
4024 
4025    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_prims_generated); ++i) {
4026       params.xfb_prims_generated_counter[i] =
4027          agx_get_query_address(batch, batch->ctx->tf_prims_generated[i]);
4028    }
4029 
4030    if (batch->ctx->active_queries && batch->ctx->streamout.num_targets > 0) {
4031       for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_overflow); ++i) {
4032          params.xfb_overflow[i] =
4033             agx_get_query_address(batch, batch->ctx->tf_overflow[i]);
4034       }
4035 
4036       params.xfb_any_overflow =
4037          agx_get_query_address(batch, batch->ctx->tf_any_overflow);
4038    }
4039 
4040    /* Calculate input primitive count for direct draws, and allocate the vertex
4041     * & count buffers. GPU calculates and allocates for indirect draws.
4042     */
4043    batch->uniforms.vertex_outputs = batch->ctx->vs->b.info.outputs;
4044    params.input_mask = batch->uniforms.vertex_outputs;
4045    params.count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
4046 
4047    if (indirect) {
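      /* Reserve an 8-byte slot for the vertex output buffer address; as noted
       * above, for indirect draws the GPU calculates the sizes and fills this
       * pointer in at draw time.
       */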
4048       batch->uniforms.vertex_output_buffer_ptr =
4049          agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4050 
4051       params.vs_grid[2] = params.gs_grid[2] = 1;
4052    } else {
4053       params.vs_grid[0] = draw->count;
4054       params.gs_grid[0] =
4055          u_decomposed_prims_for_vertices(info->mode, draw->count);
4056 
4057       params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
4058 
4059       params.input_primitives = params.gs_grid[0] * info->instance_count;
4060 
4061       unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
4062                                             batch->uniforms.vertex_outputs);
4063       unsigned size = params.input_primitives * params.count_buffer_stride;
4064 
4065       if (size) {
4066          params.count_buffer =
4067             agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
4068       }
4069 
4070       if (vb_size) {
4071          uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4072          batch->uniforms.vertex_output_buffer_ptr =
4073             agx_pool_upload(&batch->pool, &addr, 8);
4074 
4075          params.input_buffer = addr;
4076       }
4077    }
4078 
4079    return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params),
4080                                           8, &batch->geom_params_bo);
4081 }
4082 
4083 static uint64_t
4084 agx_indirect_buffer_ptr(struct agx_batch *batch,
4085                         const struct pipe_draw_indirect_info *indirect)
4086 {
4087    assert(indirect->buffer && "drawauto already handled");
4088 
4089    struct agx_resource *rsrc = agx_resource(indirect->buffer);
4090    agx_batch_reads(batch, rsrc);
4091    return rsrc->bo->va->addr + indirect->offset;
4092 }
4093 
4094 static void
4095 agx_launch_gs_prerast(struct agx_batch *batch,
4096                       const struct pipe_draw_info *info,
4097                       const struct pipe_draw_start_count_bias *draws,
4098                       const struct pipe_draw_indirect_info *indirect)
4099 {
4100    struct agx_context *ctx = batch->ctx;
4101    struct agx_device *dev = agx_device(ctx->base.screen);
4102    struct agx_compiled_shader *gs = ctx->gs;
4103 
4104    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader->is_xfb_passthrough)
4105       perf_debug(dev, "Transform feedback");
4106    else
4107       perf_debug(dev, "Geometry shader");
4108 
4109    /* This is a graphics batch, so it may not have had a CDM encoder allocated
4110     * yet. Allocate that so we can start enqueueing compute work.
4111     */
4112    if (!batch->cdm.bo) {
4113       batch->cdm = agx_encoder_allocate(batch, dev);
4114    }
4115 
4116    agx_ensure_cmdbuf_has_space(
4117       batch, &batch->cdm,
4118       8 * (AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
4119            AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
4120            AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
4121            AGX_CDM_BARRIER_LENGTH));
4122 
4123    assert(!info->primitive_restart && "should have been lowered");
4124 
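   /* The pre-rasterization GS pipeline runs entirely as compute: the vertex
    * shader runs first, an optional count shader plus a prefix sum over its
    * results follows, then a single-lane pre-GS kernel, and finally the
    * pre-rast geometry shader itself.
    */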
4125    uint64_t gp = batch->uniforms.geometry_params;
4126    struct agx_grid grid_vs, grid_gs;
4127    struct agx_workgroup wg;
4128 
4129    /* Setup grids */
4130    if (indirect) {
4131       uint64_t ib = 0;
4132       size_t ib_extent = 0;
4133 
4134       if (info->index_size) {
4135          ib = agx_index_buffer_ptr(batch, info, indirect ? NULL : draws,
4136                                    &ib_extent);
4137       }
4138 
4139       struct libagx_gs_setup_indirect_args gsi = {
4140          .index_buffer = ib,
4141          .index_buffer_range_el = ib_extent / info->index_size,
4142          .draw = agx_indirect_buffer_ptr(batch, indirect),
4143          .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr,
4144          .ia = batch->uniforms.input_assembly,
4145          .p = batch->uniforms.geometry_params,
4146          .vs_outputs = batch->uniforms.vertex_outputs,
4147          .index_size_B = info->index_size,
4148          .prim = info->mode,
4149       };
4150 
4151       libagx_gs_setup_indirect_struct(batch, agx_1d(1), AGX_BARRIER_ALL, gsi);
4152 
4153       wg = agx_workgroup(1, 1, 1);
4154       grid_vs =
4155          agx_grid_indirect(gp + offsetof(struct agx_geometry_params, vs_grid));
4156 
4157       grid_gs =
4158          agx_grid_indirect(gp + offsetof(struct agx_geometry_params, gs_grid));
4159    } else {
4160       wg = agx_workgroup(64, 1, 1);
4161       grid_vs = agx_3d(draws->count, info->instance_count, 1);
4162 
4163       grid_gs =
4164          agx_3d(u_decomposed_prims_for_vertices(info->mode, draws->count),
4165                 info->instance_count, 1);
4166    }
4167 
4168    /* Launch the vertex shader first */
4169    agx_launch(batch, grid_vs, wg, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0);
4170 
4171    /* If there is a count shader, launch it and prefix sum the results. */
4172    if (gs->gs_count) {
4173       perf_debug(dev, "Geometry shader count");
4174       agx_launch(batch, grid_gs, wg, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY,
4175                  0);
4176 
4177       libagx_prefix_sum_geom(batch, agx_1d(1024 * gs->gs_count_words),
4178                              AGX_BARRIER_ALL, gp);
4179    }
4180 
4181    /* Pre-GS shader */
4182    agx_launch(batch, agx_1d(1), agx_workgroup(1, 1, 1), gs->pre_gs, NULL,
4183               PIPE_SHADER_COMPUTE, 0);
4184 
4185    /* Pre-rast geometry shader */
4186    agx_launch(batch, grid_gs, wg, gs, NULL, PIPE_SHADER_GEOMETRY, 0);
4187 }
4188 
4189 static void
4190 agx_draw_without_restart(struct agx_batch *batch,
4191                          const struct pipe_draw_info *info,
4192                          unsigned drawid_offset,
4193                          const struct pipe_draw_indirect_info *indirect,
4194                          const struct pipe_draw_start_count_bias *draw)
4195 {
4196    struct agx_context *ctx = batch->ctx;
4197    struct agx_device *dev = agx_device(ctx->base.screen);
4198 
4199    perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
4200 
4201    agx_batch_init_state(batch);
4202 
4203    size_t ib_extent = 0;
4204    uint64_t ib;
4205 
4206    /* The rest of this function handles only the general case of indirect
4207     * multidraws, so synthesize an indexed indirect draw now if we need one for
4208     * a direct draw (necessarily only one). This unifies the code paths.
4209     */
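   /* For reference, the five words synthesized below follow the standard
    * indexed indirect draw layout:
    *
    *    { count, instance_count, first_index, base_vertex, base_instance }
    *
    * first_index is left at 0 because the index buffer address below already
    * folds in the start offset.
    */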
4210    struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
4211 
4212    if (!indirect) {
4213       /* The pointer below already adds in the offset, so set it to 0 in the desc */
4214       ib = agx_index_buffer_direct_ptr(batch, draw, info, &ib_extent);
4215 
4216       uint32_t desc[5] = {draw->count, info->instance_count, 0,
4217                           draw->index_bias, info->start_instance};
4218 
4219       u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
4220                     &indirect_synthesized.offset, &indirect_synthesized.buffer);
4221 
4222       indirect = &indirect_synthesized;
4223    } else {
4224       /* Does not add in the offset; the unroll kernel uses the desc's offset */
4225       ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
4226    }
4227 
4228    /* Next, we unroll the index buffer used by the indirect draw */
4229    if (!batch->cdm.bo)
4230       batch->cdm = agx_encoder_allocate(batch, dev);
4231 
4232    /* Allocate output indirect draw descriptors. This is exact. */
4233    struct agx_resource out_draws_rsrc = {0};
4234    struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
4235       &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
4236       &out_draws_rsrc.bo);
4237 
4238    struct libagx_unroll_restart_args unroll = {
4239       .heap = agx_batch_geometry_state(batch),
4240       .index_buffer = ib,
4241       .out_draw = out_draws.gpu,
4242       .restart_index = info->restart_index,
4243       .index_buffer_size_el = ib_extent / info->index_size,
4244       .flatshade_first = batch->ctx->rast->base.flatshade_first,
4245       .in_draw = agx_indirect_buffer_ptr(batch, indirect),
4246    };
4247 
4248    /* Unroll the index buffer for each draw */
4249    libagx_unroll_restart_struct(
4250       batch, agx_1d(1024 * indirect->draw_count), AGX_BARRIER_ALL, unroll,
4251       util_logbase2(info->index_size), libagx_compact_prim(info->mode));
4252 
4253    /* Now draw the results without restart */
4254    struct pipe_draw_info new_info = {
4255       .mode = u_decomposed_prim(info->mode),
4256       .index_size = info->index_size,
4257       .index.resource = ctx->heap,
4258       .increment_draw_id = info->increment_draw_id,
4259       .index_bias_varies = info->index_bias_varies,
4260    };
4261 
4262    struct pipe_draw_indirect_info new_indirect = *indirect;
4263    new_indirect.buffer = &out_draws_rsrc.base;
4264    new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->va->addr;
4265    new_indirect.stride = 5 * sizeof(uint32_t);
4266 
4267    ctx->active_draw_without_restart = true;
4268    ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, NULL,
4269                       1);
4270    ctx->active_draw_without_restart = false;
4271 }
4272 
4273 static bool
4274 agx_needs_passthrough_gs(struct agx_context *ctx,
4275                          const struct pipe_draw_info *info,
4276                          const struct pipe_draw_indirect_info *indirect,
4277                          bool *xfb_only)
4278 {
4279    /* If there is already a geometry shader in the pipeline, we do not need to
4280     * apply a passthrough GS of our own.
4281     */
4282    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader)
4283       return false;
4284 
4285    /* Rendering adjacency requires a GS, add a passthrough since we don't have
4286     * one.
4287     */
4288    if (mesa_prim_has_adjacency(info->mode)) {
4289       perf_debug_ctx(ctx, "Using passthrough GS due to adjacency primitives");
4290       return true;
4291    }
4292 
4293    /* TODO: Handle fans properly, we need to plumb a sysval. */
4294    if (info->mode == MESA_PRIM_TRIANGLE_FAN &&
4295        ctx->rast->base.flatshade_first &&
4296        ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded) {
4297 
4298       perf_debug_ctx(ctx, "Using passthrough GS due to flatshade-first tri fans");
4299       return true;
4300    }
4301 
4302    /* TODO: this is really sloppy, we should add a VDM kernel for this. */
4303    if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries &&
4304        ctx->prims_generated[0]) {
4305       perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query");
4306       return true;
4307    }
4308 
4309    /* Edge flags are emulated with a geometry shader */
4310    if (has_edgeflags(ctx, info->mode)) {
4311       perf_debug_ctx(ctx, "Using passthrough GS due to edge flags");
4312       return true;
4313    }
4314 
4315    /* Transform feedback is layered on geometry shaders, so if transform
4316     * feedback is used, we need a GS.
4317     */
4318    struct agx_uncompiled_shader *last_vtx =
4319       ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4320          ?: ctx->stage[PIPE_SHADER_VERTEX].shader;
4321 
4322    if (last_vtx->has_xfb_info && ctx->streamout.num_targets) {
4323       *xfb_only = true;
4324       return true;
4325    }
4326 
4327    /* Otherwise, we don't need one */
4328    return false;
4329 }
4330 
4331 static enum mesa_prim
4332 agx_tess_output_prim(struct agx_uncompiled_shader *tcs,
4333                      struct agx_uncompiled_shader *tes)
4334 {
4335    if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) {
4336       return MESA_PRIM_POINTS;
4337    } else if (TESS_PRIMITIVE_ISOLINES ==
4338               MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) {
4339       return MESA_PRIM_LINES;
4340    } else {
4341       return MESA_PRIM_TRIANGLES;
4342    }
4343 }
4344 
4345 static struct agx_uncompiled_shader *
4346 agx_get_passthrough_gs(struct agx_context *ctx,
4347                        struct agx_uncompiled_shader *prev_cso,
4348                        enum mesa_prim mode, bool xfb_passthrough)
4349 {
4350    bool edgeflags = has_edgeflags(ctx, mode);
4351 
4352    if (mode == MESA_PRIM_PATCHES) {
4353       mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader,
4354                                   ctx->stage[MESA_SHADER_TESS_EVAL].shader);
4355    }
4356 
4357    /* Only handle the polygon mode when edge flags are in use, because
4358     * nir_passthrough_gs doesn't handle transform feedback + polygon mode
4359     * properly. Technically this can break edge flags + transform feedback
4360     * but that's firmly in "doctor, it hurts when I do this" territory, and
4361     * I'm not sure that's even possible to hit. TODO: Reevaluate.
4362     */
4363    unsigned poly_mode =
4364       edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL;
4365 
4366    if (prev_cso->passthrough_progs[mode][poly_mode][edgeflags])
4367       return prev_cso->passthrough_progs[mode][poly_mode][edgeflags];
4368 
4369    struct blob_reader reader;
4370    blob_reader_init(&reader, prev_cso->early_serialized_nir.data,
4371                     prev_cso->early_serialized_nir.size);
4372    nir_shader *prev = nir_deserialize(NULL, &agx_nir_options, &reader);
4373 
4374    nir_shader *gs = nir_create_passthrough_gs(
4375       &agx_nir_options, prev, mode, rast_prim(mode, poly_mode), edgeflags,
4376       false /* force line strip out */, false);
4377 
4378    ralloc_free(prev);
4379 
4380    struct agx_uncompiled_shader *cso = pipe_shader_from_nir(&ctx->base, gs);
4381    cso->is_xfb_passthrough = xfb_passthrough;
4382    prev_cso->passthrough_progs[mode][poly_mode][edgeflags] = cso;
4383    return cso;
4384 }
4385 
4386 static void
4387 agx_apply_passthrough_gs(struct agx_context *ctx,
4388                          const struct pipe_draw_info *info,
4389                          unsigned drawid_offset,
4390                          const struct pipe_draw_indirect_info *indirect,
4391                          const struct pipe_draw_start_count_bias *draws,
4392                          unsigned num_draws, bool xfb_passthrough)
4393 {
4394    enum pipe_shader_type prev_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4395                                          ? PIPE_SHADER_TESS_EVAL
4396                                          : PIPE_SHADER_VERTEX;
4397    struct agx_uncompiled_shader *prev_cso = ctx->stage[prev_stage].shader;
4398 
4399    assert(ctx->stage[PIPE_SHADER_GEOMETRY].shader == NULL);
4400 
4401    /* Draw with passthrough */
4402    ctx->base.bind_gs_state(
4403       &ctx->base,
4404       agx_get_passthrough_gs(ctx, prev_cso, info->mode, xfb_passthrough));
4405    ctx->base.draw_vbo(&ctx->base, info, drawid_offset, indirect, draws,
4406                       num_draws);
4407    ctx->base.bind_gs_state(&ctx->base, NULL);
4408 }
4409 
4410 static void
4411 util_draw_multi_unroll_indirect(struct pipe_context *pctx,
4412                                 const struct pipe_draw_info *info,
4413                                 const struct pipe_draw_indirect_info *indirect,
4414                                 const struct pipe_draw_start_count_bias *draws)
4415 {
4416    for (unsigned i = 0; i < indirect->draw_count; ++i) {
4417       const struct pipe_draw_indirect_info subindirect = {
4418          .buffer = indirect->buffer,
4419          .count_from_stream_output = indirect->count_from_stream_output,
4420          .offset = indirect->offset + (i * indirect->stride),
4421          .draw_count = 1,
4422       };
4423 
4424       pctx->draw_vbo(pctx, info, i, &subindirect, draws, 1);
4425    }
4426 }
4427 
4428 static void
4429 util_draw_multi_upload_indirect(struct pipe_context *pctx,
4430                                 const struct pipe_draw_info *info,
4431                                 const struct pipe_draw_indirect_info *indirect,
4432                                 const struct pipe_draw_start_count_bias *draws)
4433 {
4434    struct pipe_draw_indirect_info indirect_ = *indirect;
4435    u_upload_data(pctx->const_uploader, 0, 4, 4, &indirect->draw_count,
4436                  &indirect_.indirect_draw_count_offset,
4437                  &indirect_.indirect_draw_count);
4438 
4439    pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1);
4440 }
4441 
4442 static void
4443 agx_upload_draw_params(struct agx_batch *batch,
4444                        const struct pipe_draw_indirect_info *indirect,
4445                        const struct pipe_draw_start_count_bias *draws,
4446                        const struct pipe_draw_info *info)
4447 {
4448    if (indirect) {
4449       uint64_t address = agx_indirect_buffer_ptr(batch, indirect);
4450 
4451       /* To implement draw parameters, we use the last 2 words of the
4452        * indirect draw descriptor. Offset by 3 words for indexed draw (5
4453        * total) and 2 words for non-indexed (4 total).  See the layouts of
4454        * indexed vs non-indexed draw descriptors.
4455        *
4456        * This gives us a consistent layout
4457        *
4458        *    uint32_t first_vertex;
4459        *    uint32_t base_instance;
4460        *
4461        * and we can implement load_first_vertex & load_base_instance without
4462        * checking for indexing.
4463        */
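      /* For reference, the two standard layouts and the tail we point the
       * parameter table at:
       *
       *    indexed:     { count, instance_count, first_index,
       *                   base_vertex, base_instance }   -> offset 3
       *    non-indexed: { count, instance_count,
       *                   first_vertex, base_instance }  -> offset 2
       */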
4464       uint32_t offset = info->index_size ? 3 : 2;
4465       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
4466    } else {
4467       /* Upload just those two words. */
4468       uint32_t params[2] = {
4469          info->index_size ? draws->index_bias : draws->start,
4470          info->start_instance,
4471       };
4472 
4473       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
4474          agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
4475    }
4476 }
4477 
4478 static void
4479 agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
4480                  unsigned drawid_offset,
4481                  const struct pipe_draw_indirect_info *indirect,
4482                  const struct pipe_draw_start_count_bias *draws,
4483                  unsigned num_draws)
4484 {
4485    struct agx_device *dev = agx_device(ctx->base.screen);
4486    perf_debug(dev, "Tessellation");
4487 
4488    struct agx_uncompiled_shader *tcs = ctx->stage[MESA_SHADER_TESS_CTRL].shader;
4489    struct agx_uncompiled_shader *tes = ctx->stage[MESA_SHADER_TESS_EVAL].shader;
4490 
4491    assert(tes != NULL && "required with patches");
4492 
4493    unsigned patch_vertices = ctx->patch_vertices;
4494 
4495    /* OpenGL allows omitting the tcs, fill in a passthrough program if needed.
4496     * In principle, we could optimize this case, but I don't think it matters.
4497     */
4498    bool unbind_tcs_when_done = false;
4499    if (!tcs) {
4500       struct agx_uncompiled_shader *vs = ctx->stage[MESA_SHADER_VERTEX].shader;
4501 
4502       assert(patch_vertices >= 1 &&
4503              patch_vertices <= ARRAY_SIZE(vs->passthrough_tcs));
4504 
4505       if (!vs->passthrough_tcs[patch_vertices - 1]) {
4506          struct blob_reader reader;
4507          blob_reader_init(&reader, vs->early_serialized_nir.data,
4508                           vs->early_serialized_nir.size);
4509          nir_shader *vs_nir = nir_deserialize(NULL, &agx_nir_options, &reader);
4510          nir_shader *nir = nir_create_passthrough_tcs(&agx_nir_options, vs_nir,
4511                                                       patch_vertices);
4512          ralloc_free(vs_nir);
4513 
4514          /* Lower the tess level sysvals and gather info, since mesa/st won't do
4515           * either for us.
4516           */
4517          NIR_PASS(_, nir, nir_lower_system_values);
4518 
4519          nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4520 
4521          vs->passthrough_tcs[patch_vertices - 1] =
4522             pipe_shader_from_nir(&ctx->base, nir);
4523       }
4524 
4525       tcs = vs->passthrough_tcs[patch_vertices - 1];
4526       ctx->base.bind_tcs_state(&ctx->base, tcs);
4527       unbind_tcs_when_done = true;
4528    }
4529 
4530    enum tess_primitive_mode mode =
4531       MAX2(tcs->tess.primitive, tes->tess.primitive);
4532    enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
4533 
4534    enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
4535                                         ? PIPE_TESS_SPACING_EQUAL
4536                                      : spacing == TESS_SPACING_FRACTIONAL_ODD
4537                                         ? PIPE_TESS_SPACING_FRACTIONAL_ODD
4538                                         : PIPE_TESS_SPACING_FRACTIONAL_EVEN;
4539 
4540    bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
4541    enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
4542 
4543    enum libagx_tess_partitioning partitioning =
4544       (enum libagx_tess_partitioning)pspacing;
4545 
4546    struct agx_bo *draw_bo = NULL;
4547    size_t draw_stride = 5 * sizeof(uint32_t);
4548 
4549    struct agx_batch *batch = agx_get_batch(ctx);
4550    agx_batch_init_state(batch);
4551 
4552    if (!batch->cdm.bo) {
4553       batch->cdm = agx_encoder_allocate(batch, dev);
4554    }
4555 
4556    uint64_t ib = 0;
4557    size_t ib_extent = 0;
4558 
4559    if (info->index_size)
4560       ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
4561 
4562    struct agx_ia_state ia = {
4563       .index_buffer = ib,
4564       .index_buffer_range_el = ib_extent,
4565       .verts_per_instance = draws ? draws->count : 0,
4566    };
4567 
4568    batch->uniforms.input_assembly =
4569       agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
4570 
4571    agx_upload_draw_params(batch, indirect, draws, info);
4572 
4573    /* Setup parameters */
4574    uint64_t geom_state = agx_batch_geometry_state(batch);
4575    assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
4576 
4577    struct libagx_tess_args args = {
4578       .heap = geom_state,
4579       .tcs_stride_el = tcs->tess.output_stride / 4,
4580       .statistic = agx_get_query_address(
4581          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]),
4582       .input_patch_size = patch_vertices,
4583       .output_patch_size = tcs->tess.output_patch_size,
4584       .tcs_patch_constants = tcs->tess.nr_patch_outputs,
4585       .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
4586       .patch_coord_buffer = agx_resource(ctx->heap)->bo->va->addr,
4587       .partitioning = partitioning,
4588       .points_mode = point_mode,
4589    };
4590 
4591    if (!point_mode && tes->tess.primitive != TESS_PRIMITIVE_ISOLINES) {
4592       args.ccw = !tes->tess.ccw;
4593    }
4594 
4595    memcpy(&args.tess_level_outer_default, ctx->default_outer_level,
4596           sizeof(ctx->default_outer_level));
4597    memcpy(&args.tess_level_inner_default, ctx->default_inner_level,
4598           sizeof(ctx->default_inner_level));
4599 
4600    struct agx_grid vs_grid, tcs_grid, tess_grid;
4601 
4602    agx_upload_vbos(batch);
4603    agx_update_vs(batch, info->index_size);
4604    agx_update_tcs(ctx, info);
4605    /* XXX */
4606    ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
4607    ctx->stage[PIPE_SHADER_TESS_EVAL].dirty = ~0;
4608    agx_update_descriptors(batch, ctx->vs);
4609    agx_update_descriptors(batch, ctx->tcs);
4610 
4611    batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs;
4612 
4613    if (indirect == NULL) {
4614       unsigned in_patches = draws->count / patch_vertices;
4615       if (in_patches == 0)
4616          return;
4617 
4618       /* TCS invocation counter increments once per-patch */
4619       agx_query_increment_cpu(
4620          ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
4621          in_patches);
4622 
4623       unsigned unrolled_patches = in_patches * info->instance_count;
4624 
4625       uint32_t alloc = 0;
4626       uint32_t tcs_out_offs = alloc;
4627       alloc += unrolled_patches * tcs->tess.output_stride;
4628 
4629       uint32_t patch_coord_offs = alloc;
4630       alloc += unrolled_patches * 4;
4631 
4632       uint32_t count_offs = alloc;
4633       alloc += unrolled_patches * sizeof(uint32_t);
4634 
4635       uint32_t draw_offs = alloc;
4636       alloc += draw_stride;
4637 
4638       struct agx_ptr blob =
4639          agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo);
4640 
4641       args.tcs_buffer = blob.gpu + tcs_out_offs;
4642       args.patches_per_instance = in_patches;
4643       args.coord_allocs = blob.gpu + patch_coord_offs;
4644       args.nr_patches = unrolled_patches;
4645       args.out_draws = blob.gpu + draw_offs;
4646       args.counts = blob.gpu + count_offs;
4647 
4648       unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
4649                                             batch->uniforms.vertex_outputs);
4650       uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4651       batch->uniforms.vertex_output_buffer_ptr =
4652          agx_pool_upload(&batch->pool, &addr, 8);
4653 
4654       vs_grid = agx_3d(draws->count, info->instance_count, 1);
4655       tcs_grid = agx_3d(in_patches * tcs->tess.output_patch_size,
4656                         info->instance_count, 1);
4657 
4658       tess_grid = agx_1d(unrolled_patches);
4659    } else if (indirect) {
4660       args.out_draws =
4661          agx_pool_alloc_aligned_with_bo(&batch->pool, draw_stride, 4, &draw_bo)
4662             .gpu;
4663    }
4664 
4665    uint64_t state =
4666       agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4);
4667 
4668    if (indirect) {
4669       uint32_t grid_stride = sizeof(uint32_t) * 6;
4670 
4671       uint64_t vertex_out_ptr = agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4672       uint64_t indirect_ptr = agx_indirect_buffer_ptr(batch, indirect);
4673 
4674       uint64_t tcs_statistic = agx_get_query_address(
4675          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
4676 
4677       /* Allocate 3x indirect global+local grids for VS/TCS/tess */
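      /* Each record is grid_stride = 6 * sizeof(uint32_t) bytes; the
       * libagx_tess_setup_indirect launch below fills them in and
       * agx_grid_indirect_local() points each stage's dispatch at its record.
       */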
4678       uint64_t grids =
4679          agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
4680 
4681       libagx_tess_setup_indirect(
4682          batch, agx_1d(1), AGX_BARRIER_ALL, state, grids, 0 /* XXX: IA */,
4683          indirect_ptr, vertex_out_ptr, 0, 0, 0 /* XXX: Index buffer */,
4684          ctx->vs->b.info.outputs, tcs_statistic);
4685 
4686       batch->uniforms.vertex_output_buffer_ptr = vertex_out_ptr;
4687 
4688       vs_grid = agx_grid_indirect_local(grids + 0 * grid_stride);
4689       tcs_grid = agx_grid_indirect_local(grids + 1 * grid_stride);
4690       tess_grid = agx_grid_indirect_local(grids + 2 * grid_stride);
4691    }
4692 
4693    batch->uniforms.tess_params = state;
4694 
4695    agx_launch(batch, vs_grid, agx_workgroup(64, 1, 1), ctx->vs, ctx->linked.vs,
4696               PIPE_SHADER_VERTEX, 0);
4697 
4698    agx_launch(batch, tcs_grid, agx_workgroup(tcs->tess.output_patch_size, 1, 1),
4699               ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0);
4700 
4701    batch->uniforms.vertex_output_buffer_ptr = 0;
4702 
4703    /* Generate counts, then prefix sum them, then finally tessellate. */
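   /* The COUNT pass only measures how much each patch will emit, the prefix
    * sum turns those counts into allocation offsets, and the WITH_COUNTS pass
    * then writes the tessellated geometry at those offsets.
    */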
4704    libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
4705                      LIBAGX_TESS_MODE_COUNT, state);
4706    libagx_prefix_sum_tess(batch, agx_1d(1024), AGX_BARRIER_ALL, state);
4707    libagx_tessellate(batch, tess_grid, AGX_BARRIER_ALL, mode,
4708                      LIBAGX_TESS_MODE_WITH_COUNTS, state);
4709 
4710    /* Face culling state needs to be specialized for tess */
4711    ctx->dirty |= AGX_DIRTY_RS;
4712 
4713    /* Run TES as VS */
4714    void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
4715    void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
4716    ctx->base.bind_vs_state(&ctx->base, tes_cso);
4717    ctx->in_tess = true;
4718 
4719    struct pipe_draw_info draw_info = {
4720       .mode = out_prim,
4721       .index_size = 4,
4722       .index.resource = ctx->heap,
4723       .instance_count = 1,
4724    };
4725 
4726    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
4727    struct agx_resource indirect_rsrc = {.bo = draw_bo};
4728 
4729    struct pipe_draw_indirect_info copy_indirect = {
4730       .buffer = &indirect_rsrc.base,
4731       .offset = args.out_draws - draw_bo->va->addr,
4732       .stride = draw_stride,
4733       .draw_count = 1,
4734    };
4735 
4736    ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1);
4737 
4738    /* Restore vertex state */
4739    ctx->base.bind_vs_state(&ctx->base, vs_cso);
4740    ctx->in_tess = false;
4741 
4742    if (unbind_tcs_when_done) {
4743       ctx->base.bind_tcs_state(&ctx->base, NULL);
4744    }
4745 }
4746 
4747 /*
4748  * From the ARB_texture_barrier spec:
4749  *
4750  *  Specifically, the values of rendered fragments are undefined if any
4751  *  shader stage fetches texels and the same texels are written via fragment
4752  *  shader outputs, even if the reads and writes are not in the same Draw
4753  *  call, unless any of the following exceptions apply:
4754  *
4755  *  - The reads and writes are from/to disjoint sets of texels (after
4756  *    accounting for texture filtering rules).
4757  *
4758  *  - There is only a single read and write of each texel, and the read is in
4759  *    the fragment shader invocation that writes the same texel (e.g. using
4760  *    "texelFetch2D(sampler, ivec2(gl_FragCoord.xy), 0);").
4761  *
4762  *  - If a texel has been written, then in order to safely read the result
4763  *    a texel fetch must be in a subsequent Draw separated by the command
4764  *
4765  *      void TextureBarrier(void);
4766  *
4767  *    TextureBarrier() will guarantee that writes have completed and caches
4768  *    have been invalidated before subsequent Draws are executed."
4769  *
4770  * The wording is subtle, but we are not required to flush implicitly for
4771  * feedback loops, even though we're a tiler. What we are required to do is
4772  * decompress framebuffers involved in feedback loops, because otherwise
4773  * the hardware will race itself with exception #1, where we have a disjoint
4774  * group of texels that intersects a compressed tile being written out.
4775  */
4776 static void
4777 agx_legalize_feedback_loops(struct agx_context *ctx)
4778 {
4779    /* Trust that u_blitter knows what it's doing */
4780    if (ctx->blitter->running)
4781       return;
4782 
4783    for (unsigned stage = 0; stage < ARRAY_SIZE(ctx->stage); ++stage) {
4784       if (!(ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE))
4785          continue;
4786 
4787       for (unsigned i = 0; i < ctx->stage[stage].texture_count; ++i) {
4788          if (!ctx->stage[stage].textures[i])
4789             continue;
4790 
4791          struct agx_resource *rsrc = ctx->stage[stage].textures[i]->rsrc;
4792 
4793          for (unsigned cb = 0; cb < ctx->framebuffer.nr_cbufs; ++cb) {
4794             if (ctx->framebuffer.cbufs[cb] &&
4795                 agx_resource(ctx->framebuffer.cbufs[cb]->texture) == rsrc) {
4796 
4797                if (rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED) {
4798                   /* Decompress if we can and shadow if we can't. */
4799                   if (rsrc->base.bind & PIPE_BIND_SHARED) {
4800                      struct agx_batch *batch = agx_get_batch(ctx);
4801 
4802                      /* If we already did in-place decompression for this one */
4803                      if (batch->feedback & (PIPE_CLEAR_COLOR0 << i))
4804                         continue;
4805 
4806                      /* Use our current context batch. If it already touched
4807                       * this buffer, that will have been flushed above.
4808                       */
4809                      agx_decompress_inplace(batch, ctx->framebuffer.cbufs[cb],
4810                                             "Texture feedback loop");
4811 
4812                      /* Mark it as a feedback cbuf, so it will be written
4813                       * uncompressed despite having a compressed layout.
4814                       */
4815                      batch->feedback |= PIPE_CLEAR_COLOR0 << i;
4816                   } else {
4817                      agx_decompress(ctx, rsrc, "Texture feedback loop");
4818                   }
4819                }
4820 
4821                /* Not required by the spec, just for debug */
4822                if (agx_device(ctx->base.screen)->debug & AGX_DBG_FEEDBACK)
4823                   agx_flush_writer(ctx, rsrc, "Feedback loop");
4824             }
4825          }
4826       }
4827    }
4828 }
4829 
4830 static void
4831 agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
4832              unsigned drawid_offset,
4833              const struct pipe_draw_indirect_info *indirect,
4834              const struct pipe_draw_start_count_bias *draws, unsigned num_draws)
4835 {
4836    struct agx_context *ctx = agx_context(pctx);
4837 
4838    if (unlikely(!agx_render_condition_check(ctx)))
4839       return;
4840 
4841    if (num_draws > 1) {
4842       util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
4843       return;
4844    }
4845 
4846    if (indirect && indirect->draw_count > 1 && !indirect->indirect_draw_count) {
4847       assert(drawid_offset == 0);
4848       assert(num_draws == 1);
4849 
4850       util_draw_multi_unroll_indirect(pctx, info, indirect, draws);
4851       return;
4852    }
4853 
4854    if (indirect && indirect->count_from_stream_output) {
4855       agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect);
4856       return;
4857    }
4858 
4859    /* TODO: stop cheating */
4860    if (indirect && indirect->indirect_draw_count) {
4861       perf_debug_ctx(ctx, "multi-draw indirect");
4862       util_draw_indirect(pctx, info, drawid_offset, indirect);
4863       return;
4864    }
4865 
4866    /* TODO: stop cheating.
4867     *
4868     * libagx supports this, just needs test coverage and gallium side wiring.
4869     */
4870    if (indirect && info->mode == MESA_PRIM_PATCHES && info->index_size) {
4871       perf_debug_ctx(ctx, "indexed indirect with tess");
4872       util_draw_indirect(pctx, info, drawid_offset, indirect);
4873       return;
4874    }
4875 
4876    bool xfb_passthrough = false;
4877    if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
4878       agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
4879                                num_draws, xfb_passthrough);
4880       return;
4881    }
4882 
4883    if (info->mode == MESA_PRIM_PATCHES) {
4884       agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
4885       return;
4886    }
4887 
4888    agx_legalize_feedback_loops(ctx);
4889 
4890    /* Only the rasterization stream counts */
4891    if (ctx->active_queries && ctx->prims_generated[0] &&
4892        !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
4893 
4894       assert(!indirect && "we force a passthrough GS for this");
4895       agx_primitives_update_direct(ctx, info, draws);
4896    }
4897 
4898    struct agx_batch *batch = agx_get_batch(ctx);
4899    uint64_t ib = 0;
4900    size_t ib_extent = 0;
4901 
4902    if (info->index_size) {
4903       ib =
4904          agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
4905    }
4906 
4907    if (ctx->active_queries && !ctx->active_draw_without_restart &&
4908        (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
4909         ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
4910         ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS] ||
4911         ((ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
4912           ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) &&
4913          !ctx->stage[PIPE_SHADER_GEOMETRY].shader))) {
4914 
4915       uint64_t ptr;
4916       if (indirect) {
4917          ptr = agx_indirect_buffer_ptr(batch, indirect);
4918       } else {
4919          uint32_t desc[] = {draws->count, info->instance_count, 0};
4920          ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc));
4921       }
4922 
4923       agx_ia_update(batch, info, ptr, ib,
4924                     info->index_size ? ib_extent / info->index_size : 1);
4925    }
4926 
4927    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
4928        info->index_size) {
4929 
4930       agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
4931       return;
4932    }
4933 
4934    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
4935 
4936 #ifndef NDEBUG
4937    if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
4938       agx_dirty_all(ctx);
4939 #endif
4940 
4941    agx_batch_init_state(batch);
4942 
4943    /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
4944     * agx_update_vs/agx_update_fs, which specialize based on primitive.
4945     */
4946    enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
4947    if (reduced_prim != batch->reduced_prim)
4948       ctx->dirty |= AGX_DIRTY_PRIM;
4949    batch->reduced_prim = reduced_prim;
4950 
4951    /* Update shaders first so we can use them after */
4952    if (agx_update_vs(batch, info->index_size)) {
4953       ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
4954       ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0;
4955    } else if (ctx->stage[PIPE_SHADER_VERTEX].dirty ||
4956               (ctx->dirty & AGX_DIRTY_VERTEX))
4957       ctx->dirty |= AGX_DIRTY_VS;
4958 
4959    /* This is subtle. But agx_update_vs will be true at least once per batch. */
4960    assert(agx_batch_uses_bo(batch, ctx->vs->bo));
4961    assert(!ctx->linked.vs || agx_batch_uses_bo(batch, ctx->linked.vs->bo));
4962 
4963    agx_update_gs(ctx, info, indirect);
4964 
4965    if (ctx->gs) {
4966       batch->geom_indirect = agx_pool_alloc_aligned_with_bo(
4967                                 &batch->pool, 64, 4, &batch->geom_indirect_bo)
4968                                 .gpu;
4969 
4970       batch->uniforms.geometry_params =
4971          agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
4972 
4973       agx_batch_add_bo(batch, ctx->gs->bo);
4974       agx_batch_add_bo(batch, ctx->gs->gs_copy->bo);
4975    }
4976 
4977    if (ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG)) {
4978       struct agx_compiled_shader *vs = ctx->vs;
4979       if (ctx->gs)
4980          vs = ctx->gs->gs_copy;
4981 
4982       agx_assign_uvs(
4983          &batch->linked_varyings, &vs->uvs,
4984          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
4985          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded);
4986 
4987       for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
4988          batch->uniforms.uvs_index[i] = batch->linked_varyings.slots[i];
4989       }
4990    }
4991 
4992    /* Set draw ID */
4993    if (ctx->vs->b.info.uses_draw_id) {
4994       batch->uniforms.draw_id = drawid_offset;
4995 
4996       ctx->dirty |= AGX_DIRTY_VS;
4997    }
4998 
4999    if (agx_update_fs(batch)) {
5000       ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
5001       ctx->stage[PIPE_SHADER_FRAGMENT].dirty = ~0;
5002    } else if ((ctx->stage[PIPE_SHADER_FRAGMENT].dirty) ||
5003               (ctx->dirty & (AGX_DIRTY_BLEND_COLOR | AGX_DIRTY_SAMPLE_MASK))) {
5004       ctx->dirty |= AGX_DIRTY_FS;
5005    }
5006 
5007    /* This is subtle. But agx_update_fs will be true at least once per batch. */
5008    assert(!ctx->fs->bo || agx_batch_uses_bo(batch, ctx->fs->bo));
5009    assert(agx_batch_uses_bo(batch, ctx->linked.fs->bo));
5010 
5011    if (ctx->linked.vs->uses_base_param || ctx->gs) {
5012       agx_upload_draw_params(batch, indirect, draws, info);
5013 
5014       batch->uniforms.is_indexed_draw = (info->index_size > 0);
5015       ctx->dirty |= AGX_DIRTY_VS;
5016    }
5017 
5018    agx_update_descriptors(batch, ctx->vs);
5019    agx_update_descriptors(batch, ctx->gs);
5020    agx_update_descriptors(batch, ctx->fs);
5021 
5022    if (IS_DIRTY(VS) || IS_DIRTY(FS) || ctx->gs || IS_DIRTY(VERTEX) ||
5023        IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
5024        IS_DIRTY(RS) || IS_DIRTY(PRIM) || ctx->in_tess) {
5025 
5026       if (IS_DIRTY(VERTEX)) {
5027          agx_upload_vbos(batch);
5028       }
5029 
5030       if (IS_DIRTY(BLEND_COLOR)) {
5031          memcpy(batch->uniforms.blend_constant, &ctx->blend_color,
5032                 sizeof(ctx->blend_color));
5033       }
5034 
5035       if (IS_DIRTY(RS)) {
5036          struct pipe_rasterizer_state *rs = &ctx->rast->base;
5037 
5038          batch->uniforms.fixed_point_size =
5039             rs->point_size_per_vertex ? 0.0 : rs->point_size;
5040 
5041          /* TODO: tri fans */
5042          batch->uniforms.provoking_vertex = !rs->flatshade_first ? 2 : 0;
5043       }
5044 
5045       if (IS_DIRTY(QUERY)) {
5046          for (unsigned i = 0; i < ARRAY_SIZE(ctx->pipeline_statistics); ++i) {
5047             struct agx_query *query = ctx->pipeline_statistics[i];
5048             batch->uniforms.pipeline_statistics[i] =
5049                agx_get_query_address(batch, query);
5050          }
5051       }
5052 
5053       if (IS_DIRTY(POLY_STIPPLE)) {
5054          STATIC_ASSERT(sizeof(ctx->poly_stipple) == 32 * 4);
5055 
5056          batch->uniforms.polygon_stipple = agx_pool_upload_aligned(
5057             &batch->pool, ctx->poly_stipple, sizeof(ctx->poly_stipple), 4);
5058       }
5059 
5060       agx_upload_uniforms(batch);
5061    }
5062 
5063    struct pipe_draw_info info_gs;
5064    struct pipe_draw_indirect_info indirect_gs;
5065 
5066    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
5067    struct agx_resource indirect_rsrc = {.bo = batch->geom_indirect_bo};
5068 
5069    if (ctx->gs) {
5070       /* Launch the pre-rasterization parts of the geometry shader */
5071       agx_launch_gs_prerast(batch, info, draws, indirect);
5072 
5073       if (ctx->rast->base.rasterizer_discard)
5074          return;
5075 
5076       /* Setup to rasterize the GS results */
5077       info_gs = (struct pipe_draw_info){
5078          .mode = ctx->gs->gs_output_mode,
5079          .index_size = 4,
5080          .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS,
5081          .restart_index = ~0,
5082          .index.resource = ctx->heap,
5083          .instance_count = 1,
5084       };
5085 
5086       indirect_gs = (struct pipe_draw_indirect_info){
5087          .draw_count = 1,
5088          .buffer = &indirect_rsrc.base,
5089          .offset = batch->geom_indirect - indirect_rsrc.bo->va->addr,
5090       };
5091 
5092       info = &info_gs;
5093       indirect = &indirect_gs;
5094 
5095       /* TODO: Deduplicate? */
5096       batch->reduced_prim = u_reduced_prim(info->mode);
5097       ctx->dirty |= AGX_DIRTY_PRIM;
5098 
5099       if (info_gs.index_size) {
5100          ib = agx_resource(ctx->heap)->bo->va->addr;
5101          ib_extent = agx_resource(ctx->heap)->bo->size;
5102       } else {
5103          ib = 0;
5104          ib_extent = 0;
5105       }
5106 
5107       /* We need to reemit geometry descriptors since the txf sampler may change
5108        * between the GS prepass and the GS rast program.
5109        */
5110       agx_update_descriptors(batch, ctx->gs->gs_copy);
5111    }
5112 
5113    assert((!indirect || !indirect->indirect_draw_count) && "multidraw handled");
5114 
5115    /* Update batch masks based on current state */
5116    if (ctx->dirty & AGX_DIRTY_BLEND) {
5117       /* TODO: Any point to tracking load? */
5118       batch->draw |= ctx->blend->store;
5119       batch->resolve |= ctx->blend->store;
5120    }
5121 
5122    if (ctx->dirty & AGX_DIRTY_ZS) {
5123       batch->load |= ctx->zs->load;
5124       batch->draw |= ctx->zs->store;
5125       batch->resolve |= ctx->zs->store;
5126    }
5127 
5128    /* When we approach the end of a command buffer, cycle it out for a new one.
5129     * We only need to do this once per draw as long as we conservatively
5130     * estimate the maximum bytes of VDM commands that this draw will emit.
5131     */
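   /* (Each AGX_*_LENGTH term below is the byte size of the corresponding
    * packed genxml structure, so the sum is a worst-case byte count for a
    * single draw.)
    */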
5132    agx_ensure_cmdbuf_has_space(
5133       batch, &batch->vdm,
5134       (AGX_VDM_STATE_LENGTH * 2) + (AGX_PPP_STATE_LENGTH * MAX_PPP_UPDATES) +
5135          AGX_VDM_STATE_RESTART_INDEX_LENGTH +
5136          AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH +
5137          AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH +
5138          AGX_VDM_STATE_VERTEX_OUTPUTS_LENGTH +
5139          AGX_VDM_STATE_VERTEX_UNKNOWN_LENGTH + 4 /* padding */ +
5140          AGX_INDEX_LIST_LENGTH + AGX_INDEX_LIST_BUFFER_LO_LENGTH +
5141          AGX_INDEX_LIST_COUNT_LENGTH + AGX_INDEX_LIST_INSTANCES_LENGTH +
5142          AGX_INDEX_LIST_START_LENGTH + AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
5143 
5144    uint8_t *out = agx_encode_state(batch, batch->vdm.current);
5145 
5146    if (info->index_size && info->primitive_restart) {
5147       agx_push(out, VDM_STATE, cfg)
5148          cfg.restart_index_present = true;
5149 
5150       agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
5151          cfg.value = info->restart_index;
5152    }
5153 
5154    struct agx_draw draw = {0};
5155    if (info->index_size) {
5156       draw.index_size = agx_translate_index_size(info->index_size);
5157       draw.index_buffer = ib;
5158       draw.index_buffer_range_B = ib_extent;
5159       draw.restart = info->primitive_restart;
5160       draw.indexed = true;
5161    } else {
5162       draw.start = draws->start;
5163    }
5164 
5165    if (indirect) {
5166       draw.b = agx_grid_indirect(agx_indirect_buffer_ptr(batch, indirect));
5167    } else {
5168       draw.b = agx_3d(draws->count, info->instance_count, 1);
5169       if (info->index_size)
5170          draw.index_bias = draws->index_bias;
5171    }
5172 
5173    out = (void *)agx_vdm_draw((uint32_t *)out, 0 /* ignored for now */, draw,
5174                               agx_primitive_for_pipe(info->mode));
5175 
5176    batch->vdm.current = out;
5177    assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
5178           "Failed to reserve sufficient space in encoder");
5179    agx_dirty_reset_graphics(ctx);
5180 
5181    assert(batch == agx_get_batch(ctx) && "batch should not change under us");
5182 
5183    batch->draws++;
5184 
5185    /* The scissor/zbias arrays are indexed with 16-bit integers, imposing a
5186     * maximum of UINT16_MAX descriptors. Flush if the next draw would overflow.
5187     */
5188    if (unlikely(
5189           (((batch->scissor.size / AGX_SCISSOR_LENGTH) + AGX_MAX_VIEWPORTS) >
5190            UINT16_MAX) ||
5191           (batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH) >= UINT16_MAX)) {
5192       agx_flush_batch_for_reason(ctx, batch, "Scissor/depth bias overflow");
5193    } else if (unlikely(batch->draws > 100000)) {
5194       /* Mostly so drawoverhead doesn't OOM */
5195       agx_flush_batch_for_reason(ctx, batch, "Absurd number of draws");
5196    } else if (unlikely(batch->sampler_heap.count >
5197                        (AGX_SAMPLER_HEAP_SIZE - (PIPE_MAX_SAMPLERS * 6)))) {
5198       agx_flush_batch_for_reason(ctx, batch, "Sampler heap overflow");
5199    }
5200 }
5201 
5202 static void
5203 agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
5204 {
5205    struct agx_context *ctx = agx_context(pipe);
5206 
5207    /* Framebuffer fetch is coherent, so barriers are a no-op. */
5208    if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER)
5209       return;
5210 
5211    agx_flush_all(ctx, "Texture barrier");
5212 }
5213 
5214 void
5215 agx_launch(struct agx_batch *batch, struct agx_grid grid,
5216            struct agx_workgroup wg, struct agx_compiled_shader *cs,
5217            struct agx_linked_shader *linked, enum pipe_shader_type stage,
5218            unsigned variable_shared_mem)
5219 {
5220    struct agx_context *ctx = batch->ctx;
5221 
5222    /* To implement load_num_workgroups, the number of workgroups needs to be
5223     * available in GPU memory. This is either the indirect buffer, or just a
5224     * buffer we upload ourselves if not indirect.
5225     */
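   /* e.g. a direct agx_3d(256, 1, 1) grid with a 64-wide workgroup uploads
    * {4, 1, 1} here, matching what gl_NumWorkGroups should read back.
    */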
5226    if (grid.mode == AGX_CDM_MODE_DIRECT) {
5227       uint32_t groups[3] = {
5228          grid.count[0] / wg.x,
5229          grid.count[1] / wg.y,
5230          grid.count[2] / wg.z,
5231       };
5232 
5233       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] =
5234          agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4);
5235    } else {
5236       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid.ptr;
5237    }
5238 
5239    util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
5240       if (!*res)
5241          continue;
5242 
5243       struct agx_resource *buffer = agx_resource(*res);
5244       agx_batch_writes(batch, buffer, 0);
5245       batch->incoherent_writes = true;
5246    }
5247 
5248    agx_update_descriptors(batch, cs);
5249    agx_upload_uniforms(batch);
5250 
5251    // TODO: This is broken.
5252    size_t subgroups_per_core = 0;
5253 #if 0
5254    if (!info->indirect) {
5255       size_t subgroups_per_workgroup =
5256          DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 32);
5257       subgroups_per_core =
5258          local_workgroups *
5259          DIV_ROUND_UP(info->grid[0] * info->grid[1] * info->grid[2],
5260                      ctx->scratch_cs.num_cores);
5261    }
5262 #endif
5263 
5264    uint32_t usc = agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE,
5265                                      variable_shared_mem, subgroups_per_core);
5266 
5267    if (cs)
5268       agx_batch_add_bo(batch, cs->bo);
5269 
5270    struct agx_cdm_launch_word_0_packed launch;
5271    agx_pack(&launch, CDM_LAUNCH_WORD_0, cfg) {
5272       cfg.uniform_register_count = cs->b.info.push_count;
5273       cfg.preshader_register_count = cs->b.info.nr_preamble_gprs;
5274       cfg.texture_state_register_count =
5275          cs ? agx_nr_tex_descriptors(batch, cs) : 0;
5276       cfg.sampler_state_register_count =
5277          translate_sampler_state_count(ctx, stage);
5278    }
5279 
5280    agx_launch_internal(batch, grid, wg, launch, stage, usc);
5281 }
5282 
5283 static void
5284 agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
5285 {
5286    struct agx_context *ctx = agx_context(pipe);
5287    if (unlikely(!ctx->compute_blitter.active &&
5288                 !agx_render_condition_check(ctx)))
5289       return;
5290 
5291    struct agx_batch *batch = agx_get_compute_batch(ctx);
5292 
5293    uint64_t indirect = 0;
5294    if (info->indirect) {
5295       struct agx_resource *rsrc = agx_resource(info->indirect);
5296       agx_batch_reads(batch, rsrc);
5297       indirect = rsrc->bo->va->addr + info->indirect_offset;
5298    }
5299 
5300    /* Increment the pipeline stats query.
5301     *
5302     * TODO: Can we use the hardware counter for this?
5303     */
5304    struct agx_query *statistic =
5305       ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS];
5306 
5307    struct agx_workgroup wg =
5308       agx_workgroup(info->block[0], info->block[1], info->block[2]);
5309 
5310    if (statistic) {
5311       if (indirect) {
5312          uint64_t addr = agx_get_query_address(batch, statistic);
5313 
5314          libagx_increment_cs_invocations(batch, agx_1d(1), AGX_BARRIER_ALL,
5315                                          indirect, addr,
5316                                          agx_workgroup_threads(wg));
5317       } else {
5318          agx_query_increment_cpu(ctx, statistic,
5319                                  agx_workgroup_threads(wg) * info->grid[0] *
5320                                     info->grid[1] * info->grid[2]);
5321       }
5322    }
5323 
5324    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
5325 
5326    agx_batch_init_state(batch);
5327 
5328    struct agx_uncompiled_shader *uncompiled =
5329       ctx->stage[PIPE_SHADER_COMPUTE].shader;
5330 
5331    /* There is exactly one variant, get it */
5332    struct agx_compiled_shader *cs =
5333       _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;
5334 
5335    struct agx_grid grid;
5336    if (indirect) {
5337       grid = agx_grid_indirect(indirect);
5338    } else {
5339       grid = agx_3d(0, 0, 0);
5340 
5341       for (unsigned d = 0; d < 3; ++d) {
5342          grid.count[d] = ((info->grid[d] - 1) * info->block[d]) +
5343                          (info->last_block[d] ?: info->block[d]);
5344       }
5345    }
5346 
5347    agx_launch(batch, grid, wg, cs, NULL, PIPE_SHADER_COMPUTE,
5348               info->variable_shared_mem);
5349 
5350    /* TODO: Dirty tracking? */
5351    agx_dirty_all(ctx);
5352 
5353    batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = 0;
5354 
5355    /* If the next dispatch might overflow, flush now. TODO: If this is ever hit
5356     * in practice, we can use CDM stream links.
5357     */
5358    size_t dispatch_upper_bound =
5359       AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
5360       AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
5361       AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
5362       AGX_CDM_BARRIER_LENGTH;
5363 
5364    if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end)
5365       agx_flush_batch_for_reason(ctx, batch, "CDM overfull");
5366 }
5367 
5368 static void
5369 agx_set_global_binding(struct pipe_context *pipe, unsigned first,
5370                        unsigned count, struct pipe_resource **resources,
5371                        uint32_t **handles)
5372 {
5373    struct agx_context *ctx = agx_context(pipe);
5374    unsigned old_size =
5375       util_dynarray_num_elements(&ctx->global_buffers, *resources);
5376 
5377    if (old_size < first + count) {
5378       /* we are screwed no matter what */
5379       if (!util_dynarray_grow(&ctx->global_buffers, *resources,
5380                               (first + count) - old_size))
5381          unreachable("out of memory");
5382 
5383       for (unsigned i = old_size; i < first + count; i++)
5384          *util_dynarray_element(&ctx->global_buffers, struct pipe_resource *,
5385                                 i) = NULL;
5386    }
5387 
5388    for (unsigned i = 0; i < count; ++i) {
5389       struct pipe_resource **res = util_dynarray_element(
5390          &ctx->global_buffers, struct pipe_resource *, first + i);
5391       if (resources && resources[i]) {
5392          pipe_resource_reference(res, resources[i]);
5393 
5394          /* The handle points to uint32_t, but space is allocated for 64
5395           * bits. We need to respect the offset passed in. This interface
5396           * is so bad.
5397           */
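         /* In effect: *(uint64_t *)handles[i] = rsrc->bo->va->addr + the byte
          * offset the caller stored there.
          */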
5398          uint64_t addr = 0;
5399          struct agx_resource *rsrc = agx_resource(resources[i]);
5400 
5401          memcpy(&addr, handles[i], sizeof(addr));
5402          addr += rsrc->bo->va->addr;
5403          memcpy(handles[i], &addr, sizeof(addr));
5404       } else {
5405          pipe_resource_reference(res, NULL);
5406       }
5407    }
5408 }
5409 
5410 void agx_init_state_functions(struct pipe_context *ctx);
5411 
5412 void
5413 agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
5414                        const char *reason)
5415 {
5416    struct agx_context *ctx = batch->ctx;
5417    struct agx_device *dev = agx_device(ctx->base.screen);
5418    struct agx_resource *rsrc = agx_resource(surf->texture);
5419    struct ail_layout *layout = &rsrc->layout;
5420    unsigned level = surf->u.tex.level;
5421 
5422    perf_debug(dev, "Decompressing in-place due to: %s", reason);
5423 
5424    if (!batch->cdm.bo)
5425       batch->cdm = agx_encoder_allocate(batch, dev);
5426 
5427    struct agx_ptr images = agx_pool_alloc_aligned(
5428       &batch->pool, sizeof(struct libagx_decompress_images), 64);
5429    struct libagx_decompress_images *img = images.cpu;
5430 
5431    struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
5432    sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
5433    struct pipe_image_view view = image_view_for_surface(surf);
5434    agx_pack_texture(&img->compressed, rsrc, surf->format, &sampler_view);
5435    agx_batch_upload_pbe(batch, &img->uncompressed, &view, false, true, true,
5436                         true);
5437 
5438    struct agx_grid grid =
5439       agx_3d(ail_metadata_width_tl(layout, level) * 32,
5440              ail_metadata_height_tl(layout, level),
5441              surf->u.tex.last_layer - surf->u.tex.first_layer + 1);
5442 
5443    libagx_decompress(batch, grid, AGX_BARRIER_ALL, layout,
5444                      surf->u.tex.first_layer, level,
5445                      agx_map_texture_gpu(rsrc, 0), images.gpu);
5446 }
5447 
5448 void
5449 agx_init_state_functions(struct pipe_context *ctx)
5450 {
5451    ctx->create_blend_state = agx_create_blend_state;
5452    ctx->create_depth_stencil_alpha_state = agx_create_zsa_state;
5453    ctx->create_fs_state = agx_create_shader_state;
5454    ctx->create_rasterizer_state = agx_create_rs_state;
5455    ctx->create_sampler_state = agx_create_sampler_state;
5456    ctx->create_sampler_view = agx_create_sampler_view;
5457    ctx->create_surface = agx_create_surface;
5458    ctx->create_vertex_elements_state = agx_create_vertex_elements;
5459    ctx->create_vs_state = agx_create_shader_state;
5460    ctx->create_gs_state = agx_create_shader_state;
5461    ctx->create_tcs_state = agx_create_shader_state;
5462    ctx->create_tes_state = agx_create_shader_state;
5463    ctx->create_compute_state = agx_create_compute_state;
5464    ctx->bind_blend_state = agx_bind_blend_state;
5465    ctx->bind_depth_stencil_alpha_state = agx_bind_zsa_state;
5466    ctx->bind_sampler_states = agx_bind_sampler_states;
5467    ctx->bind_fs_state = agx_bind_fs_state;
5468    ctx->bind_rasterizer_state = agx_bind_rasterizer_state;
5469    ctx->bind_vertex_elements_state = agx_bind_vertex_elements_state;
5470    ctx->bind_vs_state = agx_bind_vs_state;
5471    ctx->bind_gs_state = agx_bind_gs_state;
5472    ctx->bind_tcs_state = agx_bind_tcs_state;
5473    ctx->bind_tes_state = agx_bind_tes_state;
5474    ctx->bind_compute_state = agx_bind_cs_state;
5475    ctx->delete_blend_state = agx_delete_state;
5476    ctx->delete_depth_stencil_alpha_state = agx_delete_state;
5477    ctx->delete_fs_state = agx_delete_shader_state;
5478    ctx->delete_compute_state = agx_delete_shader_state;
5479    ctx->delete_rasterizer_state = agx_delete_state;
5480    ctx->delete_sampler_state = agx_delete_sampler_state;
5481    ctx->delete_vertex_elements_state = agx_delete_state;
5482    ctx->delete_vs_state = agx_delete_shader_state;
5483    ctx->delete_gs_state = agx_delete_shader_state;
5484    ctx->delete_tcs_state = agx_delete_shader_state;
5485    ctx->delete_tes_state = agx_delete_shader_state;
5486    ctx->set_blend_color = agx_set_blend_color;
5487    ctx->set_clip_state = agx_set_clip_state;
5488    ctx->set_constant_buffer = agx_set_constant_buffer;
5489    ctx->set_shader_buffers = agx_set_shader_buffers;
5490    ctx->set_shader_images = agx_set_shader_images;
5491    ctx->set_sampler_views = agx_set_sampler_views;
5492    ctx->set_framebuffer_state = agx_set_framebuffer_state;
5493    ctx->set_polygon_stipple = agx_set_polygon_stipple;
5494    ctx->set_patch_vertices = agx_set_patch_vertices;
5495    ctx->set_sample_mask = agx_set_sample_mask;
5496    ctx->set_scissor_states = agx_set_scissor_states;
5497    ctx->set_stencil_ref = agx_set_stencil_ref;
5498    ctx->set_vertex_buffers = agx_set_vertex_buffers;
5499    ctx->set_viewport_states = agx_set_viewport_states;
5500    ctx->sampler_view_destroy = agx_sampler_view_destroy;
5501    ctx->surface_destroy = agx_surface_destroy;
5502    ctx->draw_vbo = agx_draw_vbo;
5503    ctx->launch_grid = agx_launch_grid;
5504    ctx->set_global_binding = agx_set_global_binding;
5505    ctx->texture_barrier = agx_texture_barrier;
5506    ctx->get_compute_state_info = agx_get_compute_state_info;
5507    ctx->set_tess_state = agx_set_tess_state;
5508 }
5509