1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2019-2020 Collabora, Ltd.
4  * Copyright 2014-2017 Broadcom
5  * Copyright 2010 Red Hat Inc.
6  * SPDX-License-Identifier: MIT
7  */
8 #include "agx_state.h"
9 #include <errno.h>
10 #include <stdio.h>
11 #include "asahi/compiler/agx_compile.h"
12 #include "asahi/genxml/agx_pack.h"
13 #include "asahi/layout/layout.h"
14 #include "asahi/lib/agx_formats.h"
15 #include "asahi/lib/agx_helpers.h"
16 #include "asahi/lib/agx_nir_passes.h"
17 #include "asahi/lib/agx_ppp.h"
18 #include "asahi/lib/agx_usc.h"
19 #include "compiler/nir/nir.h"
20 #include "compiler/nir/nir_serialize.h"
21 #include "compiler/shader_enums.h"
22 #include "gallium/auxiliary/nir/pipe_nir.h"
23 #include "gallium/auxiliary/nir/tgsi_to_nir.h"
24 #include "gallium/auxiliary/tgsi/tgsi_from_mesa.h"
25 #include "gallium/auxiliary/util/u_blend.h"
26 #include "gallium/auxiliary/util/u_draw.h"
27 #include "gallium/auxiliary/util/u_framebuffer.h"
28 #include "gallium/auxiliary/util/u_helpers.h"
29 #include "gallium/auxiliary/util/u_prim_restart.h"
30 #include "gallium/auxiliary/util/u_viewport.h"
31 #include "pipe/p_context.h"
32 #include "pipe/p_defines.h"
33 #include "pipe/p_screen.h"
34 #include "pipe/p_state.h"
35 #include "tessellator/p_tessellator.h"
36 #include "util/bitscan.h"
37 #include "util/bitset.h"
38 #include "util/blend.h"
39 #include "util/blob.h"
40 #include "util/compiler.h"
41 #include "util/format/u_format.h"
42 #include "util/format_srgb.h"
43 #include "util/half_float.h"
44 #include "util/hash_table.h"
45 #include "util/macros.h"
46 #include "util/ralloc.h"
47 #include "util/u_dump.h"
48 #include "util/u_inlines.h"
49 #include "util/u_math.h"
50 #include "util/u_memory.h"
51 #include "util/u_prim.h"
52 #include "util/u_resource.h"
53 #include "util/u_transfer.h"
54 #include "util/u_upload_mgr.h"
55 #include "agx_bo.h"
56 #include "agx_device.h"
57 #include "agx_disk_cache.h"
58 #include "agx_nir_lower_gs.h"
59 #include "agx_nir_lower_vbo.h"
60 #include "agx_tilebuffer.h"
61 #include "nir_builder.h"
62 #include "nir_builder_opcodes.h"
63 #include "nir_intrinsics.h"
64 #include "nir_intrinsics_indices.h"
65 #include "nir_xfb_info.h"
66 #include "pool.h"
67 
68 void
69 agx_legalize_compression(struct agx_context *ctx, struct agx_resource *rsrc,
70                          enum pipe_format format)
71 {
72    /* If the resource isn't compressed, we can reinterpret */
73    if (rsrc->layout.tiling != AIL_TILING_TWIDDLED_COMPRESSED)
74       return;
75 
76    /* The physical format */
77    enum pipe_format storage = rsrc->layout.format;
78 
79    /* If the formats are compatible, we don't have to decompress. Compatible
80     * formats have the same number/size/order of channels, but may differ in
81     * data type. For example, R32_SINT is compatible with Z32_FLOAT, but not
82     * with R16G16_SINT. This is the relation given by the "channels" part of the
83     * decomposed format.
84     *
85     * This has not been exhaustively tested and might be missing some corner
86     * cases around XR formats, but is well-motivated and seems to work.
87     */
88    if (agx_pixel_format[storage].channels == agx_pixel_format[format].channels)
89       return;
90 
91    /* Otherwise, decompress. */
92    agx_decompress(ctx, rsrc, "Incompatible formats");
93 }
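/* Worked example of the compatibility check above (descriptive note, not
 * driver code): PIPE_FORMAT_R32_SINT and PIPE_FORMAT_Z32_FLOAT both decompose
 * to a single 32-bit channel, so their agx_pixel_format[].channels entries
 * match and the compressed resource can simply be reinterpreted. Reinterpreting
 * the same resource as PIPE_FORMAT_R16G16_SINT (two 16-bit channels) fails the
 * comparison, so agx_decompress() runs first.
 */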
94 
95 static void
96 agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader,
97                       unsigned start_slot, unsigned count,
98                       unsigned unbind_num_trailing_slots,
99                       const struct pipe_image_view *iviews)
100 {
101    struct agx_context *ctx = agx_context(pctx);
102    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
103 
104    /* Unbind start_slot...start_slot+count */
105    if (!iviews) {
106       for (int i = start_slot;
107            i < start_slot + count + unbind_num_trailing_slots; i++) {
108          pipe_resource_reference(&ctx->stage[shader].images[i].resource, NULL);
109       }
110 
111       ctx->stage[shader].image_mask &=
112          ~BITFIELD64_MASK(count + unbind_num_trailing_slots) << start_slot;
113       return;
114    }
115 
116    /* Images writeable with pixel granularity are incompatible with
117     * compression. Decompress if necessary.
118     *
119     * Driver-internal images are used by the compute blitter and are exempt
120     * from these transitions, as it only uses compressed images when safe.
121     *
122     * We do this upfront because agx_decompress and agx_legalize_compression can
123     * call set_shader_images internally.
124     */
125    for (int i = 0; i < count; i++) {
126       const struct pipe_image_view *image = &iviews[i];
127       struct agx_resource *rsrc = agx_resource(image->resource);
128 
129       if (rsrc && !(image->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL)) {
130          if (!rsrc->layout.writeable_image &&
131              (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)) {
132 
133             agx_decompress(ctx, rsrc, "Shader image");
134          }
135 
136          /* Readable images may be compressed but are still subject to format
137           * reinterpretation rules.
138           */
139          agx_legalize_compression(ctx, rsrc, image->format);
140 
141          if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)
142             assert(rsrc->layout.writeable_image);
143       }
144    }
145 
146    /* Bind start_slot...start_slot+count */
147    for (int i = 0; i < count; i++) {
148       const struct pipe_image_view *image = &iviews[i];
149 
150       if (!image->resource) {
151          util_copy_image_view(&ctx->stage[shader].images[start_slot + i], NULL);
152          ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + i);
153       } else {
154          util_copy_image_view(&ctx->stage[shader].images[start_slot + i],
155                               image);
156          ctx->stage[shader].image_mask |= BITFIELD_BIT(start_slot + i);
157       }
158    }
159 
160    /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */
161    for (int i = 0; i < unbind_num_trailing_slots; i++) {
162       ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + count + i);
163       util_copy_image_view(&ctx->stage[shader].images[start_slot + count + i],
164                            NULL);
165    }
166 }
167 
168 static void
169 agx_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader,
170                        unsigned start, unsigned count,
171                        const struct pipe_shader_buffer *buffers,
172                        unsigned writable_bitmask)
173 {
174    struct agx_context *ctx = agx_context(pctx);
175 
176    util_set_shader_buffers_mask(ctx->stage[shader].ssbo,
177                                 &ctx->stage[shader].ssbo_mask, buffers, start,
178                                 count);
179 
180    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SSBO;
181    ctx->stage[shader].ssbo_writable_mask &= ~(BITFIELD_MASK(count) << start);
182    ctx->stage[shader].ssbo_writable_mask |= writable_bitmask << start;
183 }
184 
185 static void
186 agx_set_blend_color(struct pipe_context *pctx,
187                     const struct pipe_blend_color *state)
188 {
189    struct agx_context *ctx = agx_context(pctx);
190 
191    if (state)
192       memcpy(&ctx->blend_color, state, sizeof(*state));
193 
194    ctx->dirty |= AGX_DIRTY_BLEND_COLOR;
195 }
196 
197 static void
198 agx_set_patch_vertices(struct pipe_context *pctx, unsigned char n)
199 {
200    struct agx_context *ctx = agx_context(pctx);
201    ctx->patch_vertices = n;
202 }
203 
204 static void
205 agx_set_tess_state(struct pipe_context *pctx,
206                    const float default_outer_level[4],
207                    const float default_inner_level[2])
208 {
209    struct agx_context *ctx = agx_context(pctx);
210 
211    memcpy(ctx->default_outer_level, default_outer_level, 4 * sizeof(float));
212    memcpy(ctx->default_inner_level, default_inner_level, 2 * sizeof(float));
213 }
214 
215 static void *
216 agx_create_blend_state(struct pipe_context *ctx,
217                        const struct pipe_blend_state *state)
218 {
219    struct agx_blend *so = CALLOC_STRUCT(agx_blend);
220    struct agx_blend_key *key = &so->key;
221 
222    key->alpha_to_coverage = state->alpha_to_coverage;
223    key->alpha_to_one = state->alpha_to_one;
224 
225    key->logicop_func =
226       state->logicop_enable ? state->logicop_func : PIPE_LOGICOP_COPY;
227 
228    for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
229       unsigned rti = state->independent_blend_enable ? i : 0;
230       struct pipe_rt_blend_state rt = state->rt[rti];
231 
232       if (state->logicop_enable || !rt.blend_enable) {
233          /* No blending, but we get the colour mask below */
234          static const nir_lower_blend_channel replace = {
235             .func = PIPE_BLEND_ADD,
236             .src_factor = PIPE_BLENDFACTOR_ONE,
237             .dst_factor = PIPE_BLENDFACTOR_ZERO,
238          };
239 
240          key->rt[i].rgb = replace;
241          key->rt[i].alpha = replace;
242       } else {
243          key->rt[i].rgb.func = rt.rgb_func;
244          key->rt[i].rgb.src_factor = rt.rgb_src_factor;
245          key->rt[i].rgb.dst_factor = rt.rgb_dst_factor;
246 
247          key->rt[i].alpha.func = rt.alpha_func;
248          key->rt[i].alpha.src_factor = rt.alpha_src_factor;
249          key->rt[i].alpha.dst_factor = rt.alpha_dst_factor;
250       }
251 
252       key->rt[i].colormask = rt.colormask;
253 
254       if (rt.colormask)
255          so->store |= (PIPE_CLEAR_COLOR0 << i);
256    }
257 
258    return so;
259 }
260 
261 static void
262 agx_bind_blend_state(struct pipe_context *pctx, void *cso)
263 {
264    struct agx_context *ctx = agx_context(pctx);
265    ctx->blend = cso;
266    ctx->dirty |= AGX_DIRTY_BLEND;
267 }
268 
269 static const enum agx_stencil_op agx_stencil_ops[PIPE_STENCIL_OP_INVERT + 1] = {
270    [PIPE_STENCIL_OP_KEEP] = AGX_STENCIL_OP_KEEP,
271    [PIPE_STENCIL_OP_ZERO] = AGX_STENCIL_OP_ZERO,
272    [PIPE_STENCIL_OP_REPLACE] = AGX_STENCIL_OP_REPLACE,
273    [PIPE_STENCIL_OP_INCR] = AGX_STENCIL_OP_INCR_SAT,
274    [PIPE_STENCIL_OP_DECR] = AGX_STENCIL_OP_DECR_SAT,
275    [PIPE_STENCIL_OP_INCR_WRAP] = AGX_STENCIL_OP_INCR_WRAP,
276    [PIPE_STENCIL_OP_DECR_WRAP] = AGX_STENCIL_OP_DECR_WRAP,
277    [PIPE_STENCIL_OP_INVERT] = AGX_STENCIL_OP_INVERT,
278 };
279 
280 static void
281 agx_pack_stencil(struct agx_fragment_stencil_packed *out,
282                  struct pipe_stencil_state st)
283 {
284    if (st.enabled) {
285       agx_pack(out, FRAGMENT_STENCIL, cfg) {
286          cfg.compare = (enum agx_zs_func)st.func;
287          cfg.write_mask = st.writemask;
288          cfg.read_mask = st.valuemask;
289 
290          cfg.depth_pass = agx_stencil_ops[st.zpass_op];
291          cfg.depth_fail = agx_stencil_ops[st.zfail_op];
292          cfg.stencil_fail = agx_stencil_ops[st.fail_op];
293       }
294    } else {
295       agx_pack(out, FRAGMENT_STENCIL, cfg) {
296          cfg.compare = AGX_ZS_FUNC_ALWAYS;
297          cfg.write_mask = 0xFF;
298          cfg.read_mask = 0xFF;
299 
300          cfg.depth_pass = AGX_STENCIL_OP_KEEP;
301          cfg.depth_fail = AGX_STENCIL_OP_KEEP;
302          cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
303       }
304    }
305 }
306 
307 static void *
308 agx_create_zsa_state(struct pipe_context *ctx,
309                      const struct pipe_depth_stencil_alpha_state *state)
310 {
311    struct agx_zsa *so = CALLOC_STRUCT(agx_zsa);
312    assert(!state->depth_bounds_test && "todo");
313 
314    so->base = *state;
315 
316    /* Handle the enable flag */
317    enum pipe_compare_func depth_func =
318       state->depth_enabled ? state->depth_func : PIPE_FUNC_ALWAYS;
319 
320    /* Z func can otherwise be used as-is */
321    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NEVER == AGX_ZS_FUNC_NEVER);
322    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LESS == AGX_ZS_FUNC_LESS);
323    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_EQUAL == AGX_ZS_FUNC_EQUAL);
324    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LEQUAL == AGX_ZS_FUNC_LEQUAL);
325    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GREATER == AGX_ZS_FUNC_GREATER);
326    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NOTEQUAL == AGX_ZS_FUNC_NOT_EQUAL);
327    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GEQUAL == AGX_ZS_FUNC_GEQUAL);
328    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_ALWAYS == AGX_ZS_FUNC_ALWAYS);
329 
330    agx_pack(&so->depth, FRAGMENT_FACE, cfg) {
331       cfg.depth_function = (enum agx_zs_func)depth_func;
332       cfg.disable_depth_write = !state->depth_writemask;
333    }
334 
335    agx_pack_stencil(&so->front_stencil, state->stencil[0]);
336 
337    if (state->stencil[1].enabled) {
338       agx_pack_stencil(&so->back_stencil, state->stencil[1]);
339    } else {
340       /* One-sided stencil */
341       so->back_stencil = so->front_stencil;
342    }
343 
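   /* Record which depth/stencil contents this state obliges a render pass to
    * load and store: any depth test other than NEVER/ALWAYS reads the existing
    * depth, depth writes require both loading and storing it, and stencil use
    * is handled conservatively (see the TODO below).
    */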
344    if (depth_func != PIPE_FUNC_NEVER && depth_func != PIPE_FUNC_ALWAYS)
345       so->load |= PIPE_CLEAR_DEPTH;
346 
347    if (state->depth_writemask) {
348       so->load |= PIPE_CLEAR_DEPTH;
349       so->store |= PIPE_CLEAR_DEPTH;
350    }
351 
352    if (state->stencil[0].enabled) {
353       so->load |= PIPE_CLEAR_STENCIL; /* TODO: Optimize */
354       so->store |= PIPE_CLEAR_STENCIL;
355    }
356 
357    return so;
358 }
359 
360 static void
361 agx_bind_zsa_state(struct pipe_context *pctx, void *cso)
362 {
363    struct agx_context *ctx = agx_context(pctx);
364    ctx->zs = cso;
365    ctx->dirty |= AGX_DIRTY_ZS;
366 }
367 
368 static enum agx_polygon_mode
369 agx_translate_polygon_mode(unsigned mode)
370 {
371    switch (mode) {
372    case PIPE_POLYGON_MODE_FILL:
373       return AGX_POLYGON_MODE_FILL;
374    case PIPE_POLYGON_MODE_POINT:
375       return AGX_POLYGON_MODE_POINT;
376    case PIPE_POLYGON_MODE_LINE:
377       return AGX_POLYGON_MODE_LINE;
378    default:
379       unreachable("Unsupported polygon mode");
380    }
381 }
382 
383 static void *
384 agx_create_rs_state(struct pipe_context *ctx,
385                     const struct pipe_rasterizer_state *cso)
386 {
387    struct agx_rasterizer *so = CALLOC_STRUCT(agx_rasterizer);
388    so->base = *cso;
389 
390    agx_pack(so->cull, CULL, cfg) {
391       cfg.cull_front = cso->cull_face & PIPE_FACE_FRONT;
392       cfg.cull_back = cso->cull_face & PIPE_FACE_BACK;
393       cfg.front_face_ccw = cso->front_ccw;
394       cfg.depth_clip = cso->depth_clip_near;
395       cfg.depth_clamp = !cso->depth_clip_near;
396       cfg.flat_shading_vertex =
397          cso->flatshade_first ? AGX_PPP_VERTEX_0 : AGX_PPP_VERTEX_2;
398       cfg.rasterizer_discard = cso->rasterizer_discard;
399    };
400 
401    /* Two-sided polygon mode doesn't seem to work on G13. Apple's OpenGL
402     * implementation lowers to multiple draws with culling. Warn.
403     */
404    if (unlikely(cso->fill_front != cso->fill_back)) {
405       agx_msg("Warning: Two-sided fill modes are unsupported, "
406               "rendering may be incorrect.\n");
407    }
408 
409    so->polygon_mode = agx_translate_polygon_mode(cso->fill_front);
410    so->line_width = agx_pack_line_width(cso->line_width);
411 
412    return so;
413 }
414 
415 static void
416 agx_bind_rasterizer_state(struct pipe_context *pctx, void *cso)
417 {
418    struct agx_context *ctx = agx_context(pctx);
419    struct agx_rasterizer *so = cso;
420 
421    bool base_cso_changed = (cso == NULL) || (ctx->rast == NULL);
422 
423    /* Check if scissor or depth bias state has changed, since scissor/depth bias
424     * enable is part of the rasterizer state but everything else needed for
425     * scissors and depth bias is part of the scissor/depth bias arrays */
426    bool scissor_zbias_changed =
427       base_cso_changed || (ctx->rast->base.scissor != so->base.scissor) ||
428       (ctx->rast->base.offset_tri != so->base.offset_tri);
429 
430    ctx->dirty |= AGX_DIRTY_RS;
431 
432    if (scissor_zbias_changed)
433       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
434 
435    if (base_cso_changed ||
436        (ctx->rast->base.sprite_coord_mode != so->base.sprite_coord_mode))
437       ctx->dirty |= AGX_DIRTY_SPRITE_COORD_MODE;
438 
439    ctx->rast = so;
440 }
441 
442 static bool
443 has_edgeflags(struct agx_context *ctx, enum mesa_prim mode)
444 {
445    return ctx->stage[PIPE_SHADER_VERTEX].shader->info.has_edgeflags &&
446           mode == MESA_PRIM_TRIANGLES &&
447           (ctx->rast->base.fill_front != PIPE_POLYGON_MODE_FILL);
448 }
449 
450 static enum agx_wrap
451 agx_wrap_from_pipe(enum pipe_tex_wrap in)
452 {
453    switch (in) {
454    case PIPE_TEX_WRAP_REPEAT:
455       return AGX_WRAP_REPEAT;
456    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
457       return AGX_WRAP_CLAMP_TO_EDGE;
458    case PIPE_TEX_WRAP_MIRROR_REPEAT:
459       return AGX_WRAP_MIRRORED_REPEAT;
460    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
461       return AGX_WRAP_CLAMP_TO_BORDER;
462    case PIPE_TEX_WRAP_CLAMP:
463       return AGX_WRAP_CLAMP_GL;
464    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
465       return AGX_WRAP_MIRRORED_CLAMP_TO_EDGE;
466    default:
467       unreachable("Invalid wrap mode");
468    }
469 }
470 
471 static enum agx_mip_filter
472 agx_mip_filter_from_pipe(enum pipe_tex_mipfilter in)
473 {
474    switch (in) {
475    case PIPE_TEX_MIPFILTER_NEAREST:
476       return AGX_MIP_FILTER_NEAREST;
477    case PIPE_TEX_MIPFILTER_LINEAR:
478       return AGX_MIP_FILTER_LINEAR;
479    case PIPE_TEX_MIPFILTER_NONE:
480       return AGX_MIP_FILTER_NONE;
481    }
482 
483    unreachable("Invalid mip filter");
484 }
485 
486 static const enum agx_compare_func agx_compare_funcs[PIPE_FUNC_ALWAYS + 1] = {
487    [PIPE_FUNC_NEVER] = AGX_COMPARE_FUNC_NEVER,
488    [PIPE_FUNC_LESS] = AGX_COMPARE_FUNC_LESS,
489    [PIPE_FUNC_EQUAL] = AGX_COMPARE_FUNC_EQUAL,
490    [PIPE_FUNC_LEQUAL] = AGX_COMPARE_FUNC_LEQUAL,
491    [PIPE_FUNC_GREATER] = AGX_COMPARE_FUNC_GREATER,
492    [PIPE_FUNC_NOTEQUAL] = AGX_COMPARE_FUNC_NOT_EQUAL,
493    [PIPE_FUNC_GEQUAL] = AGX_COMPARE_FUNC_GEQUAL,
494    [PIPE_FUNC_ALWAYS] = AGX_COMPARE_FUNC_ALWAYS,
495 };
496 
497 static const enum agx_filter agx_filters[] = {
498    [PIPE_TEX_FILTER_LINEAR] = AGX_FILTER_LINEAR,
499    [PIPE_TEX_FILTER_NEAREST] = AGX_FILTER_NEAREST,
500 };
501 
502 static enum pipe_format
503 fixup_border_zs(enum pipe_format orig, union pipe_color_union *c)
504 {
505    switch (orig) {
506    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
507    case PIPE_FORMAT_Z24X8_UNORM:
508       /* Z24 is internally promoted to Z32F via transfer_helper. These formats
509        * are normalized so should get clamped, but Z32F does not get clamped, so
510        * we clamp here.
511        */
512       c->f[0] = SATURATE(c->f[0]);
513       return PIPE_FORMAT_Z32_FLOAT;
514 
515    case PIPE_FORMAT_X24S8_UINT:
516    case PIPE_FORMAT_X32_S8X24_UINT:
517       /* Separate stencil is internally promoted */
518       return PIPE_FORMAT_S8_UINT;
519 
520    default:
521       return orig;
522    }
523 }
524 
525 static void *
526 agx_create_sampler_state(struct pipe_context *pctx,
527                          const struct pipe_sampler_state *state)
528 {
529    struct agx_sampler_state *so = CALLOC_STRUCT(agx_sampler_state);
530    so->base = *state;
531 
532    /* We report a max texture LOD bias of 16, so clamp appropriately */
533    float lod_bias = CLAMP(state->lod_bias, -16.0, 16.0);
534    so->lod_bias_as_fp16 = _mesa_float_to_half(lod_bias);
535 
536    agx_pack(&so->desc, SAMPLER, cfg) {
537       cfg.minimum_lod = state->min_lod;
538       cfg.maximum_lod = state->max_lod;
539       cfg.maximum_anisotropy =
540          util_next_power_of_two(MAX2(state->max_anisotropy, 1));
541       cfg.magnify = agx_filters[state->mag_img_filter];
542       cfg.minify = agx_filters[state->min_img_filter];
543       cfg.mip_filter = agx_mip_filter_from_pipe(state->min_mip_filter);
544       cfg.wrap_s = agx_wrap_from_pipe(state->wrap_s);
545       cfg.wrap_t = agx_wrap_from_pipe(state->wrap_t);
546       cfg.wrap_r = agx_wrap_from_pipe(state->wrap_r);
547       cfg.pixel_coordinates = state->unnormalized_coords;
548       cfg.compare_func = agx_compare_funcs[state->compare_func];
549       cfg.compare_enable = state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE;
550       cfg.seamful_cube_maps = !state->seamless_cube_map;
551 
552       if (state->border_color_format != PIPE_FORMAT_NONE) {
553          /* TODO: Optimize to use compact descriptors for black/white borders */
554          so->uses_custom_border = true;
555          cfg.border_colour = AGX_BORDER_COLOUR_CUSTOM;
556       }
557    }
558 
559    memcpy(&so->desc_without_custom_border, &so->desc, sizeof(so->desc));
560 
561    if (so->uses_custom_border) {
562       union pipe_color_union border = state->border_color;
563       enum pipe_format format =
564          fixup_border_zs(state->border_color_format, &border);
565 
566       agx_pack_border(&so->border, border.ui, format);
567 
568       /* Neutralize the bindless-safe descriptor. XXX: This is a hack. */
569       so->desc_without_custom_border.opaque[1] &= ~(1u << 23);
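      /* Assumption: bit 23 of this opaque word corresponds to the custom
       * border colour enable packed above, so clearing it makes the
       * bindless-safe copy fall back to a built-in border instead of
       * referencing custom border data that bindless access cannot reach.
       */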
570    }
571 
572    return so;
573 }
574 
575 static void
576 agx_delete_sampler_state(struct pipe_context *ctx, void *state)
577 {
578    struct agx_sampler_state *so = state;
579    FREE(so);
580 }
581 
582 static void
583 agx_bind_sampler_states(struct pipe_context *pctx, enum pipe_shader_type shader,
584                         unsigned start, unsigned count, void **states)
585 {
586    struct agx_context *ctx = agx_context(pctx);
587 
588    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SAMPLER;
589 
590    for (unsigned i = 0; i < count; i++) {
591       unsigned p = start + i;
592       ctx->stage[shader].samplers[p] = states ? states[i] : NULL;
593       if (ctx->stage[shader].samplers[p])
594          ctx->stage[shader].valid_samplers |= BITFIELD_BIT(p);
595       else
596          ctx->stage[shader].valid_samplers &= ~BITFIELD_BIT(p);
597    }
598 
599    ctx->stage[shader].sampler_count =
600       util_last_bit(ctx->stage[shader].valid_samplers);
601 
602    /* Recalculate whether we need custom borders */
603    ctx->stage[shader].custom_borders = false;
604 
605    u_foreach_bit(i, ctx->stage[shader].valid_samplers) {
606       if (ctx->stage[shader].samplers[i]->uses_custom_border)
607          ctx->stage[shader].custom_borders = true;
608    }
609 }
610 
611 /* See agx_stage_needs_bindless_sampler for explanation */
612 static enum pipe_shader_type
613 merged_stage(struct agx_context *ctx, enum pipe_shader_type stage)
614 {
615    return stage == MESA_SHADER_TESS_CTRL ? MESA_SHADER_VERTEX : stage;
616 }
617 
618 static enum agx_texture_dimension
619 agx_translate_tex_dim(enum pipe_texture_target dim, unsigned samples)
620 {
621    assert(samples >= 1);
622 
623    switch (dim) {
624    case PIPE_BUFFER:
625    case PIPE_TEXTURE_1D:
626       /* Lowered to 2D */
627       assert(samples == 1);
628       return AGX_TEXTURE_DIMENSION_2D;
629 
630    case PIPE_TEXTURE_RECT:
631    case PIPE_TEXTURE_2D:
632       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED
633                          : AGX_TEXTURE_DIMENSION_2D;
634 
635    case PIPE_TEXTURE_1D_ARRAY:
636       assert(samples == 1);
637       /* Lowered to 2D */
638       FALLTHROUGH;
639    case PIPE_TEXTURE_2D_ARRAY:
640       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED
641                          : AGX_TEXTURE_DIMENSION_2D_ARRAY;
642 
643    case PIPE_TEXTURE_3D:
644       assert(samples == 1);
645       return AGX_TEXTURE_DIMENSION_3D;
646 
647    case PIPE_TEXTURE_CUBE:
648       assert(samples == 1);
649       return AGX_TEXTURE_DIMENSION_CUBE;
650 
651    case PIPE_TEXTURE_CUBE_ARRAY:
652       assert(samples == 1);
653       return AGX_TEXTURE_DIMENSION_CUBE_ARRAY;
654 
655    default:
656       unreachable("Unsupported texture dimension");
657    }
658 }
659 
660 static enum agx_sample_count
661 agx_translate_sample_count(unsigned samples)
662 {
663    switch (samples) {
664    case 2:
665       return AGX_SAMPLE_COUNT_2;
666    case 4:
667       return AGX_SAMPLE_COUNT_4;
668    default:
669       unreachable("Invalid sample count");
670    }
671 }
672 
673 static bool
674 target_is_cube(enum pipe_texture_target target)
675 {
676    return target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY;
677 }
678 
679 static void
680 agx_pack_texture(void *out, struct agx_resource *rsrc,
681                  enum pipe_format format /* override */,
682                  const struct pipe_sampler_view *state)
683 {
684    const struct util_format_description *desc = util_format_description(format);
685 
686    assert(agx_is_valid_pixel_format(format));
687 
688    uint8_t format_swizzle[4] = {
689       desc->swizzle[0],
690       desc->swizzle[1],
691       desc->swizzle[2],
692       desc->swizzle[3],
693    };
694 
695    if (util_format_is_depth_or_stencil(format)) {
696       assert(!util_format_is_depth_and_stencil(format) &&
697              "separate stencil always used");
698 
699       /* Broadcast depth and stencil */
700       format_swizzle[0] = 0;
701       format_swizzle[1] = 0;
702       format_swizzle[2] = 0;
703       format_swizzle[3] = 0;
704    }
705 
706    /* We only have a single swizzle for the user swizzle and the format fixup,
707     * so compose them now. */
708    uint8_t out_swizzle[4];
709    uint8_t view_swizzle[4] = {state->swizzle_r, state->swizzle_g,
710                               state->swizzle_b, state->swizzle_a};
711 
712    util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle);
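   /* For example, with the depth/stencil broadcast above, format_swizzle is
    * {X, X, X, X}; composing it with an identity view swizzle yields a
    * descriptor swizzle that replicates the single Z/S channel into all four
    * components, while constant view terms (PIPE_SWIZZLE_ZERO/ONE) pass
    * through unchanged.
    */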
713 
714    unsigned first_layer =
715       (state->target == PIPE_BUFFER) ? 0 : state->u.tex.first_layer;
716 
717    /* Pack the descriptor into GPU memory */
718    agx_pack(out, TEXTURE, cfg) {
719       cfg.dimension = agx_translate_tex_dim(state->target,
720                                             util_res_sample_count(&rsrc->base));
721       cfg.layout = agx_translate_layout(rsrc->layout.tiling);
722       cfg.channels = agx_pixel_format[format].channels;
723       cfg.type = agx_pixel_format[format].type;
724       cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]);
725       cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]);
726       cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]);
727       cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]);
728 
729       if (state->target == PIPE_BUFFER) {
730          unsigned size_el =
731             agx_texture_buffer_size_el(format, state->u.buf.size);
732 
733          /* Use a 2D texture to increase the maximum size */
734          cfg.width = 1024;
735          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
736          cfg.first_level = cfg.last_level = 0;
737 
738          /* Stash the actual size in the software-defined section for txs */
739          cfg.software_defined = size_el;
740       } else {
741          cfg.width = rsrc->base.width0;
742          cfg.height = rsrc->base.height0;
743          cfg.first_level = state->u.tex.first_level;
744          cfg.last_level = state->u.tex.last_level;
745       }
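      /* Note on the buffer case above: e.g. a 20 KiB PIPE_FORMAT_R32_FLOAT
       * buffer gives size_el = 5120, which becomes a 1024 x 5 2D texture
       * (DIV_ROUND_UP(5120, 1024) = 5); the exact element count is stashed in
       * the software-defined field so txs can report the real size rather
       * than 1024 x 5.
       */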
746 
747       cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
748       cfg.unk_mipmapped = rsrc->mipmapped;
749       cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3;
750 
751       if (ail_is_compressed(&rsrc->layout)) {
752          cfg.compressed_1 = true;
753          cfg.extended = true;
754       }
755 
756       cfg.address = agx_map_texture_gpu(rsrc, first_layer);
757 
758       if (state->target == PIPE_BUFFER)
759          cfg.address += state->u.buf.offset;
760 
761       if (ail_is_compressed(&rsrc->layout)) {
762          cfg.acceleration_buffer =
763             agx_map_texture_gpu(rsrc, 0) + rsrc->layout.metadata_offset_B +
764             (first_layer * rsrc->layout.compression_layer_stride_B);
765       }
766 
767       if (state->target == PIPE_TEXTURE_3D) {
768          cfg.depth = rsrc->base.depth0;
769       } else if (state->target == PIPE_BUFFER) {
770          cfg.depth = 1;
771       } else {
772          unsigned layers =
773             state->u.tex.last_layer - state->u.tex.first_layer + 1;
774 
775          if (target_is_cube(state->target))
776             layers /= 6;
777 
778          if (rsrc->layout.tiling == AIL_TILING_LINEAR &&
779              (state->target == PIPE_TEXTURE_1D_ARRAY ||
780               state->target == PIPE_TEXTURE_2D_ARRAY)) {
781 
782             cfg.depth_linear = layers;
783             cfg.layer_stride_linear = (rsrc->layout.layer_stride_B - 0x80);
784             cfg.extended = true;
785          } else {
786             assert((rsrc->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
787             cfg.depth = layers;
788          }
789       }
790 
791       if (rsrc->base.nr_samples > 1)
792          cfg.samples = agx_translate_sample_count(rsrc->base.nr_samples);
793 
794       if (state->target == PIPE_BUFFER) {
795          cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16;
796       } else if (rsrc->layout.tiling == AIL_TILING_LINEAR) {
797          cfg.stride = ail_get_linear_stride_B(&rsrc->layout, 0) - 16;
798       } else {
799          assert(rsrc->layout.tiling == AIL_TILING_TWIDDLED ||
800                 rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED);
801 
802          cfg.page_aligned_layers = rsrc->layout.page_aligned_layers;
803       }
804    }
805 }
806 
807 static struct pipe_sampler_view *
808 agx_create_sampler_view(struct pipe_context *pctx,
809                         struct pipe_resource *orig_texture,
810                         const struct pipe_sampler_view *state)
811 {
812    struct agx_resource *rsrc = agx_resource(orig_texture);
813    struct agx_sampler_view *so = CALLOC_STRUCT(agx_sampler_view);
814 
815    if (!so)
816       return NULL;
817 
818    struct pipe_resource *texture = orig_texture;
819    enum pipe_format format = state->format;
820 
821    const struct util_format_description *desc = util_format_description(format);
822 
823    /* Separate stencil always used on G13, so we need to fix up for Z32S8 */
824    if (util_format_has_stencil(desc) && rsrc->separate_stencil) {
825       if (util_format_has_depth(desc)) {
826          /* Reinterpret as the depth-only part */
827          format = util_format_get_depth_only(format);
828       } else {
829          /* Use the stencil-only part */
830          rsrc = rsrc->separate_stencil;
831          texture = &rsrc->base;
832          format = texture->format;
833       }
834    }
835 
836    agx_legalize_compression(agx_context(pctx), rsrc, format);
837 
838    /* Save off the resource that we actually use, with the stencil fixed up */
839    so->rsrc = rsrc;
840    so->format = format;
841 
842    so->base = *state;
843    so->base.texture = NULL;
844    pipe_resource_reference(&so->base.texture, orig_texture);
845    pipe_reference_init(&so->base.reference, 1);
846    so->base.context = pctx;
847    return &so->base;
848 }
849 
850 static void
851 agx_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader,
852                       unsigned start, unsigned count,
853                       unsigned unbind_num_trailing_slots, bool take_ownership,
854                       struct pipe_sampler_view **views)
855 {
856    struct agx_context *ctx = agx_context(pctx);
857    unsigned new_nr = 0;
858    unsigned i;
859 
860    assert(start == 0);
861 
862    if (!views)
863       count = 0;
864 
865    for (i = 0; i < count; ++i) {
866       if (take_ownership) {
867          pipe_sampler_view_reference(
868             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
869          ctx->stage[shader].textures[i] = (struct agx_sampler_view *)views[i];
870       } else {
871          pipe_sampler_view_reference(
872             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i],
873             views[i]);
874       }
875    }
876 
877    for (; i < count + unbind_num_trailing_slots; i++) {
878       pipe_sampler_view_reference(
879          (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
880    }
881 
882    for (unsigned t = 0; t < MAX2(ctx->stage[shader].texture_count, count);
883         ++t) {
884       if (ctx->stage[shader].textures[t])
885          new_nr = t + 1;
886    }
887 
888    ctx->stage[shader].texture_count = new_nr;
889    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
890 }
891 
892 static void
893 agx_sampler_view_destroy(struct pipe_context *ctx,
894                          struct pipe_sampler_view *pview)
895 {
896    struct agx_sampler_view *view = (struct agx_sampler_view *)pview;
897    pipe_resource_reference(&view->base.texture, NULL);
898    FREE(view);
899 }
900 
901 static struct pipe_surface *
902 agx_create_surface(struct pipe_context *ctx, struct pipe_resource *texture,
903                    const struct pipe_surface *surf_tmpl)
904 {
905    agx_legalize_compression(agx_context(ctx), agx_resource(texture),
906                             surf_tmpl->format);
907 
908    struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
909 
910    if (!surface)
911       return NULL;
912 
913    unsigned level = surf_tmpl->u.tex.level;
914 
915    pipe_reference_init(&surface->reference, 1);
916    pipe_resource_reference(&surface->texture, texture);
917 
918    assert(texture->target != PIPE_BUFFER && "buffers are not renderable");
919 
920    surface->context = ctx;
921    surface->format = surf_tmpl->format;
922    surface->nr_samples = surf_tmpl->nr_samples;
923    surface->width = u_minify(texture->width0, level);
924    surface->height = u_minify(texture->height0, level);
925    surface->texture = texture;
926    surface->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
927    surface->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
928    surface->u.tex.level = level;
929 
930    return surface;
931 }
932 
933 static void
934 agx_set_clip_state(struct pipe_context *ctx,
935                    const struct pipe_clip_state *state)
936 {
937 }
938 
939 static void
940 agx_set_polygon_stipple(struct pipe_context *pctx,
941                         const struct pipe_poly_stipple *state)
942 {
943    struct agx_context *ctx = agx_context(pctx);
944 
945    memcpy(ctx->poly_stipple, state->stipple, sizeof(ctx->poly_stipple));
946    ctx->dirty |= AGX_DIRTY_POLY_STIPPLE;
947 }
948 
949 static void
950 agx_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
951 {
952    struct agx_context *ctx = agx_context(pipe);
953 
954    /* Optimization: at most 4x MSAA is supported, so normalize the mask to avoid
955     * pointlessly dirtying state when switching between e.g. 0xFFFF and 0xFFFFFFFF.
956     */
957    unsigned new_mask = sample_mask & BITFIELD_MASK(4);
958 
959    if (ctx->sample_mask != new_mask) {
960       ctx->sample_mask = new_mask;
961       ctx->dirty |= AGX_DIRTY_SAMPLE_MASK;
962    }
963 }
964 
965 static void
966 agx_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
967                        unsigned num_scissors,
968                        const struct pipe_scissor_state *scissor)
969 {
970    struct agx_context *ctx = agx_context(pctx);
971 
972    STATIC_ASSERT(sizeof(ctx->scissor[0]) == sizeof(*scissor));
973    assert(start_slot + num_scissors <= AGX_MAX_VIEWPORTS);
974 
975    memcpy(&ctx->scissor[start_slot], scissor, sizeof(*scissor) * num_scissors);
976    ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
977 }
978 
979 static void
980 agx_set_stencil_ref(struct pipe_context *pctx,
981                     const struct pipe_stencil_ref state)
982 {
983    struct agx_context *ctx = agx_context(pctx);
984    ctx->stencil_ref = state;
985    ctx->dirty |= AGX_DIRTY_STENCIL_REF;
986 }
987 
988 static void
989 agx_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
990                         unsigned num_viewports,
991                         const struct pipe_viewport_state *vp)
992 {
993    struct agx_context *ctx = agx_context(pctx);
994 
995    STATIC_ASSERT(sizeof(ctx->viewport[0]) == sizeof(*vp));
996    assert(start_slot + num_viewports <= AGX_MAX_VIEWPORTS);
997 
998    memcpy(&ctx->viewport[start_slot], vp, sizeof(*vp) * num_viewports);
999    ctx->dirty |= AGX_DIRTY_VIEWPORT;
1000 }
1001 
1002 static void
1003 agx_get_scissor_extents(const struct pipe_viewport_state *vp,
1004                         const struct pipe_scissor_state *ss,
1005                         const struct pipe_framebuffer_state *fb, unsigned *minx,
1006                         unsigned *miny, unsigned *maxx, unsigned *maxy)
1007 {
1008    float trans_x = vp->translate[0], trans_y = vp->translate[1];
1009    float abs_scale_x = fabsf(vp->scale[0]), abs_scale_y = fabsf(vp->scale[1]);
1010 
1011    /* Calculate the extent of the viewport. Note if a particular dimension of
1012     * the viewport is an odd number of pixels, both the translate and the scale
1013     * will have a fractional part of 0.5, so adding and subtracting them yields
1014     * an integer. Therefore we don't need to round explicitly */
1015    *minx = CLAMP((int)(trans_x - abs_scale_x), 0, fb->width);
1016    *miny = CLAMP((int)(trans_y - abs_scale_y), 0, fb->height);
1017    *maxx = CLAMP((int)(trans_x + abs_scale_x), 0, fb->width);
1018    *maxy = CLAMP((int)(trans_y + abs_scale_y), 0, fb->height);
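   /* Worked example: a viewport covering x in [10, 111) has scale_x = 50.5 and
    * translate_x = 60.5, so minx = 60.5 - 50.5 = 10 and maxx = 60.5 + 50.5 =
    * 111, both exact integers as described in the comment above.
    */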
1019 
1020    if (ss) {
1021       *minx = MAX2(ss->minx, *minx);
1022       *miny = MAX2(ss->miny, *miny);
1023       *maxx = MIN2(ss->maxx, *maxx);
1024       *maxy = MIN2(ss->maxy, *maxy);
1025    }
1026 }
1027 
1028 static void
1029 agx_upload_viewport_scissor(struct agx_pool *pool, struct agx_batch *batch,
1030                             uint8_t **out, const struct pipe_viewport_state *vp,
1031                             const struct pipe_scissor_state *ss,
1032                             bool clip_halfz, bool multi_viewport)
1033 {
1034    /* Number of viewports/scissors isn't precisely determinable in Gallium, so
1035     * just key off whether we can write to anything other than viewport 0. This
1036     * could be tuned in the future.
1037     */
1038    unsigned count = multi_viewport ? AGX_MAX_VIEWPORTS : 1;
1039 
1040    /* Allocate scissor descriptors */
1041    unsigned index = batch->scissor.size / AGX_SCISSOR_LENGTH;
1042    struct agx_scissor_packed *scissors =
1043       util_dynarray_grow_bytes(&batch->scissor, count, AGX_SCISSOR_LENGTH);
1044 
1045    unsigned minx[AGX_MAX_VIEWPORTS], miny[AGX_MAX_VIEWPORTS];
1046    unsigned maxx[AGX_MAX_VIEWPORTS], maxy[AGX_MAX_VIEWPORTS];
1047 
1048    /* Upload each scissor */
1049    for (unsigned i = 0; i < count; ++i) {
1050       agx_get_scissor_extents(&vp[i], ss ? &ss[i] : NULL, &batch->key, &minx[i],
1051                               &miny[i], &maxx[i], &maxy[i]);
1052 
1053       float minz, maxz;
1054       util_viewport_zmin_zmax(vp, clip_halfz, &minz, &maxz);
1055 
1056       agx_pack(scissors + i, SCISSOR, cfg) {
1057          cfg.min_x = minx[i];
1058          cfg.min_y = miny[i];
1059          cfg.min_z = minz;
1060          cfg.max_x = maxx[i];
1061          cfg.max_y = maxy[i];
1062          cfg.max_z = maxz;
1063       }
1064    }
1065 
1066    /* Upload state */
1067    struct agx_ppp_update ppp =
1068       agx_new_ppp_update(pool, (struct AGX_PPP_HEADER){
1069                                   .depth_bias_scissor = true,
1070                                   .region_clip = true,
1071                                   .viewport = true,
1072                                   .viewport_count = count,
1073                                });
1074 
1075    agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
1076       cfg.scissor = index;
1077 
1078       /* Use the current depth bias; these are allocated linearly */
1079       unsigned count = batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
1080       cfg.depth_bias = count ? count - 1 : 0;
1081    };
1082 
1083    for (unsigned i = 0; i < count; ++i) {
1084       agx_ppp_push(&ppp, REGION_CLIP, cfg) {
1085          cfg.enable = true;
1086          cfg.min_x = minx[i] / 32;
1087          cfg.min_y = miny[i] / 32;
1088          cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
1089          cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
1090       }
1091    }
1092 
1093    agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
1094       ;
1095 
1096    /* Upload viewports */
1097    for (unsigned i = 0; i < count; ++i) {
1098       agx_ppp_push(&ppp, VIEWPORT, cfg) {
1099          cfg.translate_x = vp[i].translate[0];
1100          cfg.translate_y = vp[i].translate[1];
1101          cfg.translate_z = vp[i].translate[2];
1102          cfg.scale_x = vp[i].scale[0];
1103          cfg.scale_y = vp[i].scale[1];
1104          cfg.scale_z = vp[i].scale[2];
1105 
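         /* The hardware viewport appears to expect a [0, 1] clip-space Z. For
          * the [-1, 1] convention (clip_halfz = false), substituting
          * z01 = (z + 1) / 2 into (translate_z - scale_z) + (2 * scale_z) * z01
          * gives back translate_z + scale_z * z, i.e. the adjustment below
          * reproduces the original transform.
          */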
1106          if (!clip_halfz) {
1107             cfg.translate_z -= cfg.scale_z;
1108             cfg.scale_z *= 2;
1109          }
1110       }
1111    }
1112 
1113    agx_ppp_fini(out, &ppp);
1114 }
1115 
1116 static void
1117 agx_upload_depth_bias(struct agx_batch *batch,
1118                       const struct pipe_rasterizer_state *rast)
1119 {
1120    void *ptr =
1121       util_dynarray_grow_bytes(&batch->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
1122 
1123    agx_pack(ptr, DEPTH_BIAS, cfg) {
1124       cfg.depth_bias = rast->offset_units * 2.0f;
1125       cfg.slope_scale = rast->offset_scale;
1126       cfg.clamp = rast->offset_clamp;
1127    }
1128 }
1129 
1130 /* A framebuffer state can be reused across batches, so it doesn't make sense
1131  * to add surfaces to the BO list here. Instead we add them when flushing.
1132  */
1133 
1134 static void
1135 agx_set_framebuffer_state(struct pipe_context *pctx,
1136                           const struct pipe_framebuffer_state *state)
1137 {
1138    struct agx_context *ctx = agx_context(pctx);
1139 
1140    if (!state)
1141       return;
1142 
1143    util_copy_framebuffer_state(&ctx->framebuffer, state);
1144    ctx->batch = NULL;
1145    agx_dirty_all(ctx);
1146 }
1147 
1148 /*
1149  * To write out render targets, each render target surface is bound as a
1150  * writable shader image, written with the end-of-tile program. This helper
1151  * constructs the internal pipe_image_view used.
1152  */
1153 static struct pipe_image_view
1154 image_view_for_surface(struct pipe_surface *surf)
1155 {
1156    return (struct pipe_image_view){
1157       .resource = surf->texture,
1158       .format = surf->format,
1159       .access = PIPE_IMAGE_ACCESS_READ_WRITE,
1160       .shader_access = PIPE_IMAGE_ACCESS_READ_WRITE,
1161       .u.tex.single_layer_view =
1162          surf->u.tex.first_layer == surf->u.tex.last_layer,
1163       .u.tex.first_layer = surf->u.tex.first_layer,
1164       .u.tex.last_layer = surf->u.tex.last_layer,
1165       .u.tex.level = surf->u.tex.level,
1166    };
1167 }
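/* Illustrative sketch only (the real call sites live elsewhere in the driver):
 * spilling a render target might look roughly like
 *
 *    struct pipe_image_view view = image_view_for_surface(surf);
 *    struct agx_pbe_packed pbe;
 *    agx_batch_upload_pbe(batch, &pbe, &view, true, true, false);
 *
 * i.e. the view is packed as a PBE descriptor, presumably with
 * block_access = true for the end-of-tile program, since that path writes
 * whole tiles with image_write_block rather than the per-sample addressing
 * fallback in agx_batch_upload_pbe below.
 */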
1168 
1169 /* Similarly, to read render targets, surfaces are bound as textures */
1170 static struct pipe_sampler_view
1171 sampler_view_for_surface(struct pipe_surface *surf)
1172 {
1173    bool layered = surf->u.tex.last_layer > surf->u.tex.first_layer;
1174 
1175    return (struct pipe_sampler_view){
1176       /* To reduce shader variants, we always use a 2D texture. For reloads of
1177        * arrays and cube maps, we map a single layer as a 2D image.
1178        */
1179       .target = layered ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D,
1180       .swizzle_r = PIPE_SWIZZLE_X,
1181       .swizzle_g = PIPE_SWIZZLE_Y,
1182       .swizzle_b = PIPE_SWIZZLE_Z,
1183       .swizzle_a = PIPE_SWIZZLE_W,
1184       .u.tex =
1185          {
1186             .first_layer = surf->u.tex.first_layer,
1187             .last_layer = surf->u.tex.last_layer,
1188             .first_level = surf->u.tex.level,
1189             .last_level = surf->u.tex.level,
1190          },
1191    };
1192 }
1193 
1194 static void
1195 agx_pack_image_atomic_data(void *packed, struct pipe_image_view *view)
1196 {
1197    struct agx_resource *tex = agx_resource(view->resource);
1198 
1199    if (tex->base.target == PIPE_BUFFER) {
1200       agx_pack(packed, PBE_BUFFER_SOFTWARE, cfg) {
1201          cfg.base = tex->bo->ptr.gpu + view->u.buf.offset;
1202       }
1203    } else if (tex->layout.writeable_image) {
1204       unsigned level = view->u.tex.level;
1205       unsigned blocksize_B = util_format_get_blocksize(tex->layout.format);
1206 
1207       agx_pack(packed, ATOMIC_SOFTWARE, cfg) {
1208          cfg.base =
1209             tex->bo->ptr.gpu +
1210             ail_get_layer_level_B(&tex->layout, view->u.tex.first_layer, level);
1211 
1212          cfg.sample_count = MAX2(util_res_sample_count(view->resource), 1);
1213 
1214          if (tex->layout.tiling == AIL_TILING_TWIDDLED) {
1215             struct ail_tile tile_size = tex->layout.tilesize_el[level];
1216             cfg.tile_width = tile_size.width_el;
1217             cfg.tile_height = tile_size.height_el;
1218 
1219             unsigned width_el = u_minify(tex->base.width0, level);
1220             cfg.tiles_per_row = DIV_ROUND_UP(width_el, tile_size.width_el);
1221 
1222             cfg.layer_stride_pixels = DIV_ROUND_UP(
1223                tex->layout.layer_stride_B, blocksize_B * cfg.sample_count);
1224          }
1225       }
1226    }
1227 }
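/* The ATOMIC_SOFTWARE record above hands the shader everything it needs to
 * compute a texel address itself (base pointer, sample count, tile geometry
 * and layer stride for twiddled layouts), presumably because image atomics
 * are lowered to raw memory atomics rather than going through the PBE
 * hardware path.
 */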
1228 
1229 static bool
1230 target_is_array(enum pipe_texture_target target)
1231 {
1232    switch (target) {
1233    case PIPE_TEXTURE_3D:
1234    case PIPE_TEXTURE_CUBE:
1235    case PIPE_TEXTURE_1D_ARRAY:
1236    case PIPE_TEXTURE_2D_ARRAY:
1237    case PIPE_TEXTURE_CUBE_ARRAY:
1238       return true;
1239    default:
1240       return false;
1241    }
1242 }
1243 
1244 static void
1245 agx_batch_upload_pbe(struct agx_batch *batch, struct agx_pbe_packed *out,
1246                      struct pipe_image_view *view, bool block_access,
1247                      bool arrays_as_2d, bool force_2d_array)
1248 {
1249    struct agx_resource *tex = agx_resource(view->resource);
1250    const struct util_format_description *desc =
1251       util_format_description(view->format);
1252    enum pipe_texture_target target = tex->base.target;
1253    bool is_buffer = (target == PIPE_BUFFER);
1254 
1255    if (!is_buffer && view->u.tex.single_layer_view)
1256       target = PIPE_TEXTURE_2D;
1257 
1258    arrays_as_2d |= (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
1259 
1260    /* To reduce shader variants, spilled layered render targets are accessed as
1261     * 2D Arrays regardless of the actual target, so force in that case.
1262     *
1263     * Likewise, cubes are accessed as arrays for consistency with NIR.
1264     */
1265    if ((arrays_as_2d && target_is_array(target)) || target_is_cube(target) ||
1266        force_2d_array)
1267       target = PIPE_TEXTURE_2D_ARRAY;
1268 
1269    unsigned level = is_buffer ? 0 : view->u.tex.level;
1270    unsigned layer = is_buffer ? 0 : view->u.tex.first_layer;
1271 
1272    agx_pack(out, PBE, cfg) {
1273       cfg.dimension =
1274          agx_translate_tex_dim(target, util_res_sample_count(&tex->base));
1275       cfg.layout = agx_translate_layout(tex->layout.tiling);
1276       cfg.channels = agx_pixel_format[view->format].channels;
1277       cfg.type = agx_pixel_format[view->format].type;
1278       cfg.srgb = util_format_is_srgb(view->format);
1279 
1280       assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
1281 
1282       for (unsigned i = 0; i < desc->nr_channels; ++i) {
1283          if (desc->swizzle[i] == 0)
1284             cfg.swizzle_r = i;
1285          else if (desc->swizzle[i] == 1)
1286             cfg.swizzle_g = i;
1287          else if (desc->swizzle[i] == 2)
1288             cfg.swizzle_b = i;
1289          else if (desc->swizzle[i] == 3)
1290             cfg.swizzle_a = i;
1291       }
1292 
1293       cfg.buffer = agx_map_texture_gpu(tex, layer);
1294       cfg.unk_mipmapped = tex->mipmapped;
1295 
1296       if (is_buffer) {
1297          unsigned size_el =
1298             agx_texture_buffer_size_el(view->format, view->u.buf.size);
1299 
1300          /* Buffers uniquely have offsets (in bytes, not texels) */
1301          cfg.buffer += view->u.buf.offset;
1302 
1303          /* Use a 2D texture to increase the maximum size */
1304          cfg.width = 1024;
1305          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
1306          cfg.level = 0;
1307          cfg.stride = (cfg.width * util_format_get_blocksize(view->format)) - 4;
1308          cfg.layers = 1;
1309          cfg.levels = 1;
1310       } else if (util_res_sample_count(&tex->base) > 1 && !block_access) {
1311          /* Multisampled images are bound like buffer textures, with
1312           * addressing arithmetic to determine the texel to write.
1313           *
1314           * Note that the end-of-tile program uses real multisample images with
1315           * image_write_block instructions.
1316           */
1317          unsigned blocksize_B = util_format_get_blocksize(view->format);
1318          unsigned size_px =
1319             (tex->layout.size_B - tex->layout.layer_stride_B * layer) /
1320             blocksize_B;
1321 
1322          cfg.dimension = AGX_TEXTURE_DIMENSION_2D;
1323          cfg.layout = AGX_LAYOUT_LINEAR;
1324          cfg.width = 1024;
1325          cfg.height = DIV_ROUND_UP(size_px, cfg.width);
1326          cfg.stride = (cfg.width * blocksize_B) - 4;
1327          cfg.layers = 1;
1328          cfg.levels = 1;
1329 
1330          cfg.buffer += tex->layout.level_offsets_B[level];
1331          cfg.level = 0;
1332       } else {
1333          cfg.width = view->resource->width0;
1334          cfg.height = view->resource->height0;
1335          cfg.level = level;
1336 
1337          unsigned layers = view->u.tex.last_layer - layer + 1;
1338 
1339          if (tex->layout.tiling == AIL_TILING_LINEAR &&
1340              (target == PIPE_TEXTURE_1D_ARRAY ||
1341               target == PIPE_TEXTURE_2D_ARRAY)) {
1342 
1343             cfg.depth_linear = layers;
1344             cfg.layer_stride_linear = (tex->layout.layer_stride_B - 0x80);
1345             cfg.extended = true;
1346          } else {
1347             assert((tex->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
1348             cfg.layers = layers;
1349          }
1350 
1351          if (tex->layout.tiling == AIL_TILING_LINEAR) {
1352             cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
1353             cfg.levels = 1;
1354          } else {
1355             cfg.page_aligned_layers = tex->layout.page_aligned_layers;
1356             cfg.levels = tex->base.last_level + 1;
1357          }
1358 
1359          if (tex->base.nr_samples > 1)
1360             cfg.samples = agx_translate_sample_count(tex->base.nr_samples);
1361       }
1362 
1363       if (ail_is_compressed(&tex->layout)) {
1364          cfg.compressed_1 = true;
1365          cfg.extended = true;
1366 
1367          cfg.acceleration_buffer =
1368             agx_map_texture_gpu(tex, 0) + tex->layout.metadata_offset_B +
1369             (layer * tex->layout.compression_layer_stride_B);
1370       }
1371 
1372       /* When the descriptor isn't extended architecturally, we can use the last
1373        * 8 bytes as a sideband. We use it to provide metadata for image atomics.
1374        */
1375       if (!cfg.extended) {
1376          struct agx_ptr desc =
1377             agx_pool_alloc_aligned(&batch->pool, AGX_ATOMIC_SOFTWARE_LENGTH, 8);
1378 
1379          agx_pack_image_atomic_data(desc.cpu, view);
1380          cfg.software_defined = desc.gpu;
1381       }
1382    };
1383 }
1384 
1385 /* Likewise constant buffers, textures, and samplers are handled in a common
1386  * per-draw path, with dirty tracking to reduce the costs involved.
1387  */
1388 
1389 static void
1390 agx_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader,
1391                         uint index, bool take_ownership,
1392                         const struct pipe_constant_buffer *cb)
1393 {
1394    struct agx_context *ctx = agx_context(pctx);
1395    struct agx_stage *s = &ctx->stage[shader];
1396    struct pipe_constant_buffer *constants = &s->cb[index];
1397 
1398    util_copy_constant_buffer(&s->cb[index], cb, take_ownership);
1399 
1400    /* Upload user buffer immediately */
1401    if (constants->user_buffer && !constants->buffer) {
1402       u_upload_data(ctx->base.const_uploader, 0, constants->buffer_size, 64,
1403                     constants->user_buffer, &constants->buffer_offset,
1404                     &constants->buffer);
1405    }
1406 
1407    unsigned mask = (1 << index);
1408 
1409    if (cb)
1410       s->cb_mask |= mask;
1411    else
1412       s->cb_mask &= ~mask;
1413 
1414    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_CONST;
1415 }
1416 
1417 static void
1418 agx_surface_destroy(struct pipe_context *ctx, struct pipe_surface *surface)
1419 {
1420    pipe_resource_reference(&surface->texture, NULL);
1421    FREE(surface);
1422 }
1423 
1424 static void
1425 agx_delete_state(struct pipe_context *ctx, void *state)
1426 {
1427    FREE(state);
1428 }
1429 
1430 /* BOs added to the batch in the uniform upload path */
1431 
1432 static void
1433 agx_set_vertex_buffers(struct pipe_context *pctx, unsigned count,
1434                        const struct pipe_vertex_buffer *buffers)
1435 {
1436    struct agx_context *ctx = agx_context(pctx);
1437 
1438    util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers,
1439                                 count, true);
1440 
1441    ctx->dirty |= AGX_DIRTY_VERTEX;
1442 }
1443 
1444 static void *
1445 agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
1446                            const struct pipe_vertex_element *state)
1447 {
1448    assert(count <= AGX_MAX_ATTRIBS);
1449 
1450    struct agx_vertex_elements *so = calloc(1, sizeof(*so));
1451 
1452    for (unsigned i = 0; i < count; ++i) {
1453       const struct pipe_vertex_element ve = state[i];
1454 
1455       const struct util_format_description *desc =
1456          util_format_description(ve.src_format);
1457       unsigned chan_size = desc->channel[0].size / 8;
1458       assert((ve.src_offset & (chan_size - 1)) == 0);
1459 
1460       so->buffers[i] = ve.vertex_buffer_index;
1461       so->src_offsets[i] = ve.src_offset;
1462 
1463       so->key[i] = (struct agx_velem_key){
1464          .stride = ve.src_stride,
1465          .format = ve.src_format,
1466          .divisor = ve.instance_divisor,
1467       };
1468    }
1469 
1470    return so;
1471 }
1472 
1473 static void
1474 agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
1475 {
1476    struct agx_context *ctx = agx_context(pctx);
1477    ctx->attributes = cso;
1478    ctx->dirty |= AGX_DIRTY_VERTEX;
1479 }
1480 
1481 static uint32_t
1482 asahi_vs_shader_key_hash(const void *key)
1483 {
1484    return _mesa_hash_data(key, sizeof(struct asahi_vs_shader_key));
1485 }
1486 
1487 static bool
1488 asahi_vs_shader_key_equal(const void *a, const void *b)
1489 {
1490    return memcmp(a, b, sizeof(struct asahi_vs_shader_key)) == 0;
1491 }
1492 
1493 static uint32_t
1494 asahi_gs_shader_key_hash(const void *key)
1495 {
1496    return _mesa_hash_data(key, sizeof(struct asahi_gs_shader_key));
1497 }
1498 
1499 static bool
1500 asahi_gs_shader_key_equal(const void *a, const void *b)
1501 {
1502    return memcmp(a, b, sizeof(struct asahi_gs_shader_key)) == 0;
1503 }
1504 
1505 static uint32_t
1506 asahi_fs_shader_key_hash(const void *key)
1507 {
1508    return _mesa_hash_data(key, sizeof(struct asahi_fs_shader_key));
1509 }
1510 
1511 static bool
1512 asahi_fs_shader_key_equal(const void *a, const void *b)
1513 {
1514    return memcmp(a, b, sizeof(struct asahi_fs_shader_key)) == 0;
1515 }
1516 
1517 static uint32_t
1518 asahi_tcs_shader_key_hash(const void *key)
1519 {
1520    return _mesa_hash_data(key, sizeof(struct asahi_tcs_shader_key));
1521 }
1522 
1523 static bool
1524 asahi_tcs_shader_key_equal(const void *a, const void *b)
1525 {
1526    return memcmp(a, b, sizeof(struct asahi_tcs_shader_key)) == 0;
1527 }
1528 
1529 /* No compute variants */
1530 static uint32_t
1531 asahi_cs_shader_key_hash(const void *key)
1532 {
1533    return 0;
1534 }
1535 
1536 static bool
1537 asahi_cs_shader_key_equal(const void *a, const void *b)
1538 {
1539    return true;
1540 }
1541 
1542 static unsigned
1543 agx_find_linked_slot(struct agx_varyings_vs *vs, struct agx_varyings_fs *fs,
1544                      gl_varying_slot slot, unsigned offset, bool debug)
1545 {
1546    assert(offset < 4);
1547    assert(slot != VARYING_SLOT_PNTC && "point coords aren't linked");
1548 
1549    if (slot == VARYING_SLOT_POS) {
1550       if (offset == 3) {
1551          return 0; /* W */
1552       } else if (offset == 2) {
1553          assert(fs->reads_z);
1554          return 1; /* Z */
1555       } else {
1556          unreachable("gl_Position.xy are not varyings");
1557       }
1558    }
1559 
1560    unsigned vs_index = vs->slots[slot];
1561 
1562    if (!(vs_index < vs->nr_index)) {
1563       /* Varyings not written by vertex shader are undefined, be robust.
1564        *
1565        * If the layer is read but not written, its value will be ignored by the
1566        * agx_nir_predicate_layer_id lowering, so it is fine to read garbage.
1567        *
1568        * For other varyings, this is probably an app bug.
1569        */
1570       if (unlikely(debug && (slot != VARYING_SLOT_LAYER)))
1571          unreachable("Fragment shader read varying not written by vertex!");
1572 
1573       return 0;
1574    }
1575 
1576    assert(vs_index >= 4 && "gl_Position should have been the first 4 slots");
1577    assert((vs_index < vs->base_index_fp16) ==
1578              ((vs_index + offset) < vs->base_index_fp16) &&
1579           "a given varying must have a consistent type");
1580 
1581    unsigned vs_user_index = (vs_index + offset) - 4;
1582 
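   /* After the fixed slots (0 = W, 1 = Z when the fragment shader reads it),
    * user varyings follow, so offset by 1 or 2 accordingly below.
    */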
1583    if (fs->reads_z)
1584       return vs_user_index + 2;
1585    else
1586       return vs_user_index + 1;
1587 }
1588 
1589 static unsigned
1590 agx_num_general_outputs(struct agx_varyings_vs *vs)
1591 {
1592    unsigned nr_vs = vs->nr_index;
1593    bool writes_psiz = vs->slots[VARYING_SLOT_PSIZ] < nr_vs;
1594 
1595    assert(nr_vs >= 4 && "gl_Position must be written");
1596    if (writes_psiz)
1597       assert(nr_vs >= 5 && "gl_PointSize is written");
1598 
1599    return nr_vs - (writes_psiz ? 5 : 4);
1600 }
1601 
1602 static uint32_t
1603 agx_link_varyings_vs_fs(struct agx_pool *pool, struct agx_varyings_vs *vs,
1604                         struct agx_varyings_fs *fs, bool first_provoking_vertex,
1605                         uint8_t sprite_coord_enable,
1606                         bool *generate_primitive_id)
1607 {
1608    *generate_primitive_id = false;
1609 
1610    /* If there are no bindings, there's nothing to emit */
1611    if (fs->nr_bindings == 0)
1612       return 0;
1613 
1614    size_t linkage_size =
1615       AGX_CF_BINDING_HEADER_LENGTH + (fs->nr_bindings * AGX_CF_BINDING_LENGTH);
1616 
1617    void *tmp = alloca(linkage_size);
1618    struct agx_cf_binding_header_packed *header = tmp;
1619    struct agx_cf_binding_packed *bindings = (void *)(header + 1);
1620 
1621    unsigned nr_slots = agx_num_general_outputs(vs) + 1 + (fs->reads_z ? 1 : 0);
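   /* One extra slot for W, plus one for Z when the fragment shader reads it,
    * matching the fixed slots assigned in agx_find_linked_slot.
    */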
1622 
1623    agx_pack(header, CF_BINDING_HEADER, cfg) {
1624       cfg.number_of_32_bit_slots = nr_slots;
1625       cfg.number_of_coefficient_registers = fs->nr_cf;
1626    }
1627 
1628    for (unsigned i = 0; i < fs->nr_bindings; ++i) {
1629       agx_pack(bindings + i, CF_BINDING, cfg) {
1630          cfg.base_coefficient_register = fs->bindings[i].cf_base;
1631          cfg.components = fs->bindings[i].count;
1632          cfg.shade_model =
1633             agx_translate_shade_model(fs, i, first_provoking_vertex);
1634 
1635          if (util_varying_is_point_coord(fs->bindings[i].slot,
1636                                          sprite_coord_enable)) {
1637             assert(fs->bindings[i].offset == 0);
1638             cfg.source = AGX_COEFFICIENT_SOURCE_POINT_COORD;
1639          } else if (fs->bindings[i].slot == VARYING_SLOT_PRIMITIVE_ID &&
1640                     vs->slots[VARYING_SLOT_PRIMITIVE_ID] == ~0) {
1641             cfg.source = AGX_COEFFICIENT_SOURCE_PRIMITIVE_ID;
1642             *generate_primitive_id = true;
1643          } else {
1644             cfg.base_slot = agx_find_linked_slot(
1645                vs, fs, fs->bindings[i].slot, fs->bindings[i].offset,
1646                pool->dev->debug & AGX_DBG_VARYINGS);
1647 
1648             assert(cfg.base_slot + cfg.components <=
1649                       MAX2(nr_slots, cfg.components) &&
1650                    "overflow slots");
1651          }
1652 
1653          if (fs->bindings[i].slot == VARYING_SLOT_POS) {
1654             if (fs->bindings[i].offset == 2) {
1655                cfg.source = AGX_COEFFICIENT_SOURCE_FRAGCOORD_Z;
1656             } else {
1657                assert(!fs->bindings[i].perspective &&
1658                       "W must not be perspective divided");
1659             }
1660          }
1661 
1662          assert(cfg.base_coefficient_register + cfg.components <= fs->nr_cf &&
1663                 "overflowed coefficient registers");
1664       }
1665    }
1666 
1667    struct agx_ptr ptr = agx_pool_alloc_aligned(pool, (3 * linkage_size), 256);
1668    assert(ptr.gpu < (1ull << 32) && "varyings must be in low memory");
1669 
1670    /* I don't understand why the data structures are repeated thrice */
1671    for (unsigned i = 0; i < 3; ++i) {
1672       memcpy(((uint8_t *)ptr.cpu) + (i * linkage_size), (uint8_t *)tmp,
1673              linkage_size);
1674    }
1675 
1676    return ptr.gpu;
1677 }
1678 
1679 /* Dynamic lowered I/O version of nir_lower_clip_halfz */
1680 static bool
1681 agx_nir_lower_clip_m1_1(nir_builder *b, nir_intrinsic_instr *intr,
1682                         UNUSED void *data)
1683 {
1684    if (intr->intrinsic != nir_intrinsic_store_output)
1685       return false;
1686    if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_POS)
1687       return false;
1688 
1689    assert(nir_intrinsic_component(intr) == 0 && "not yet scalarized");
1690    b->cursor = nir_before_instr(&intr->instr);
1691 
1692    nir_def *pos = intr->src[0].ssa;
1693    nir_def *z = nir_channel(b, pos, 2);
1694    nir_def *w = nir_channel(b, pos, 3);
1695    nir_def *c = nir_load_clip_z_coeff_agx(b);
1696 
1697    /* Lerp. If c = 0, reduces to z. If c = 1/2, reduces to (z + w)/2 */
1698    nir_def *new_z = nir_ffma(b, nir_fneg(b, z), c, nir_ffma(b, w, c, z));
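   /* Expanding the ffma chain: -z*c + (w*c + z) = z + c*(w - z), which gives
    * z at c = 0 and (z + w)/2 at c = 1/2, matching the comment above.
    */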
1699    nir_src_rewrite(&intr->src[0], nir_vector_insert_imm(b, pos, new_z, 2));
1700    return true;
1701 }
1702 
1703 static nir_def *
1704 nir_channel_or_undef(nir_builder *b, nir_def *def, signed int channel)
1705 {
1706    if (channel >= 0 && channel < def->num_components)
1707       return nir_channel(b, def, channel);
1708    else
1709       return nir_undef(b, def->bit_size, 1);
1710 }
1711 
1712 /*
1713  * To implement point sprites, we'll replace TEX0...7 with point coordinate
1714  * reads as required. However, the .zw needs to read back 0.0/1.0. This pass
1715  * fixes up TEX loads of Z and W according to a uniform passed in a sideband,
1716  * eliminating shader variants.
1717  */
1718 static bool
1719 agx_nir_lower_point_sprite_zw(nir_builder *b, nir_intrinsic_instr *intr,
1720                               UNUSED void *data)
1721 {
1722    if (intr->intrinsic != nir_intrinsic_load_input &&
1723        intr->intrinsic != nir_intrinsic_load_interpolated_input)
1724       return false;
1725 
1726    gl_varying_slot loc = nir_intrinsic_io_semantics(intr).location;
1727    if (!(loc >= VARYING_SLOT_TEX0 && loc <= VARYING_SLOT_TEX7))
1728       return false;
1729 
1730    b->cursor = nir_after_instr(&intr->instr);
1731    unsigned component = nir_intrinsic_component(intr);
1732 
1733    nir_def *mask = nir_load_tex_sprite_mask_agx(b);
1734    nir_def *location = nir_iadd_imm(b, nir_get_io_offset_src(intr)->ssa,
1735                                     loc - VARYING_SLOT_TEX0);
1736    nir_def *bit = nir_ishl(b, nir_imm_intN_t(b, 1, 16), location);
1737    nir_def *replace = nir_i2b(b, nir_iand(b, mask, bit));
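   /* replace is true iff the sideband mask flags this TEXn slot as a point
    * sprite, in which case .zw must read back 0.0/1.0 as described above.
    */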
1738 
1739    nir_def *vec = nir_pad_vec4(b, &intr->def);
1740    nir_def *chans[4] = {NULL, NULL, nir_imm_float(b, 0.0),
1741                         nir_imm_float(b, 1.0)};
1742 
1743    for (unsigned i = 0; i < 4; ++i) {
1744       nir_def *chan = nir_channel_or_undef(b, vec, i - component);
1745       chans[i] = chans[i] ? nir_bcsel(b, replace, chans[i], chan) : chan;
1746    }
1747 
1748    nir_def *new_vec = nir_vec(b, &chans[component], intr->def.num_components);
1749    nir_def_rewrite_uses_after(&intr->def, new_vec, new_vec->parent_instr);
1750    return true;
1751 }
1752 
1753 static bool
1754 agx_nir_lower_stats_fs(nir_shader *s)
1755 {
1756    assert(s->info.stage == MESA_SHADER_FRAGMENT);
1757    nir_builder b_ =
1758       nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
1759    nir_builder *b = &b_;
1760 
1761    nir_def *samples = nir_bit_count(b, nir_load_sample_mask_in(b));
1762    unsigned query = PIPE_STAT_QUERY_PS_INVOCATIONS;
1763 
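   /* Atomically add the number of covered samples to the PS_INVOCATIONS
    * pipeline statistic.
    */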
1764    nir_def *addr = nir_load_stat_query_address_agx(b, .base = query);
1765    nir_global_atomic(b, 32, addr, samples, .atomic_op = nir_atomic_op_iadd);
1766 
1767    nir_metadata_preserve(b->impl,
1768                          nir_metadata_block_index | nir_metadata_dominance);
1769    return true;
1770 }
1771 
1772 /*
1773  * Compile a NIR shader. The only lowering left at this point is sysvals. The
1774  * shader key should have already been applied. agx_compile_variant may call
1775  * this multiple times if there are auxiliary shaders.
1776  */
1777 static struct agx_compiled_shader *
1778 agx_compile_nir(struct agx_device *dev, nir_shader *nir,
1779                 const struct agx_shader_key *base_key,
1780                 struct util_debug_callback *debug, enum pipe_shader_type stage)
1781 {
1782    struct agx_compiled_shader *compiled = CALLOC_STRUCT(agx_compiled_shader);
1783    struct util_dynarray binary;
1784    util_dynarray_init(&binary, NULL);
1785 
1786    struct agx_shader_key key = *base_key;
1787    key.needs_g13x_coherency = (dev->params.gpu_generation == 13 &&
1788                                dev->params.num_clusters_total > 1) ||
1789                               dev->params.num_dies > 1;
1790    key.libagx = dev->libagx;
1791    key.has_scratch = true;
1792 
1793    NIR_PASS(_, nir, agx_nir_lower_sysvals, stage, true);
1794    NIR_PASS(_, nir, agx_nir_layout_uniforms, compiled, &key.reserved_preamble);
1795 
1796    agx_compile_shader_nir(nir, &key, debug, &binary, &compiled->info);
1797 
1798    if (binary.size) {
1799       compiled->bo = agx_bo_create(dev, binary.size,
1800                                    AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
1801 
1802       memcpy(compiled->bo->ptr.cpu, binary.data, binary.size);
1803    }
1804 
1805    util_dynarray_fini(&binary);
1806    return compiled;
1807 }
1808 
1809 /*
1810  * Insert code into a fragment shader to lower polygon stipple. The stipple is
1811  * passed in a sideband, rather than requiring a texture binding. This is
1812  * simpler for drivers to integrate and might be more efficient.
1813  */
1814 static bool
1815 agx_nir_lower_poly_stipple(nir_shader *s)
1816 {
1817    assert(s->info.stage == MESA_SHADER_FRAGMENT);
1818 
1819    /* Insert at the beginning for performance. */
1820    nir_builder b_ =
1821       nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
1822    nir_builder *b = &b_;
1823 
1824    /* The stipple coordinate is defined as the window coordinate mod 32. It's
1825     * reversed along the X-axis to simplify the driver, hence the NOT.
1826     */
1827    nir_def *raw = nir_u2u32(b, nir_load_pixel_coord(b));
1828    nir_def *coord = nir_umod_imm(
1829       b,
1830       nir_vec2(b, nir_inot(b, nir_channel(b, raw, 0)), nir_channel(b, raw, 1)),
1831       32);
1832 
1833    /* Load the stipple pattern for the row */
1834    nir_def *pattern = nir_load_polygon_stipple_agx(b, nir_channel(b, coord, 1));
1835 
1836    /* Extract the column from the packed bitfield */
1837    nir_def *bit = nir_ubitfield_extract(b, pattern, nir_channel(b, coord, 0),
1838                                         nir_imm_int(b, 1));
1839 
1840    /* Discard fragments where the pattern is 0 */
1841    nir_discard_if(b, nir_ieq_imm(b, bit, 0));
1842    s->info.fs.uses_discard = true;
1843 
1844    nir_metadata_preserve(b->impl,
1845                          nir_metadata_dominance | nir_metadata_block_index);
1846    return true;
1847 }
1848 
1849 static bool
1850 lower_vbo(nir_shader *s, struct agx_velem_key *key)
1851 {
1852    struct agx_attribute out[AGX_MAX_VBUFS];
1853 
1854    for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
1855       out[i] = (struct agx_attribute){
1856          .divisor = key[i].divisor,
1857          .stride = key[i].stride,
1858          .format = key[i].format,
1859       };
1860    }
1861 
1862    return agx_nir_lower_vbo(s, out);
1863 }
1864 
1865 /* Does not take ownership of key. Clones if necessary. */
1866 static struct agx_compiled_shader *
1867 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
1868                     struct agx_uncompiled_shader *so,
1869                     struct util_debug_callback *debug,
1870                     union asahi_shader_key *key_,
1871                     struct agx_uncompiled_shader *linked_so)
1872 {
1873    struct blob_reader reader;
1874    blob_reader_init(&reader, so->serialized_nir.data, so->serialized_nir.size);
1875    nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
1876 
1877    /* Auxiliary programs */
1878    enum mesa_prim gs_out_prim = MESA_PRIM_MAX;
1879    uint64_t outputs = 0;
1880    unsigned gs_out_count_words = 0;
1881    nir_shader *gs_count = NULL;
1882    nir_shader *gs_copy = NULL;
1883    nir_shader *pre_gs = NULL;
1884 
1885    /* This can happen at inopportune times and cause jank, so log it */
1886    perf_debug(dev, "Compiling shader variant #%u",
1887               _mesa_hash_table_num_entries(so->variants));
1888 
1889    bool force_translucent = false;
1890 
1891    if (nir->info.stage == MESA_SHADER_VERTEX) {
1892       struct asahi_vs_shader_key *key = &key_->vs;
1893 
1894       NIR_PASS(_, nir, lower_vbo, key->attribs);
1895 
1896       if (key->next_stage == ASAHI_VS_FS) {
1897          NIR_PASS(_, nir, agx_nir_lower_point_size,
1898                   key->next.fs.fixed_point_size);
1899          NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1900                   nir_metadata_block_index | nir_metadata_dominance, NULL);
1901       } else if (key->next_stage == ASAHI_VS_GS) {
1902          NIR_PASS(_, nir, agx_nir_lower_sysvals, PIPE_SHADER_VERTEX, false);
1903          NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx,
1904                   key->next.gs.index_size_B, &outputs);
1905       }
1906    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1907       struct asahi_tcs_shader_key *key = &key_->tcs;
1908 
1909       /* TODO: Deduplicate this logic from the GS case! */
1910       struct blob_reader vs_reader;
1911       blob_reader_init(&vs_reader, linked_so->serialized_nir.data,
1912                        linked_so->serialized_nir.size);
1913       nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
1914 
1915       /* Apply the VS key to the VS before linking it in */
1916       NIR_PASS_V(vs, lower_vbo, key->attribs);
1917       NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1918       NIR_PASS_V(vs, agx_nir_lower_sysvals, PIPE_SHADER_VERTEX, false);
1919 
1920       NIR_PASS_V(nir, agx_nir_lower_tcs, vs, dev->libagx, key->index_size_B);
1921       ralloc_free(vs);
1922    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1923       struct asahi_gs_shader_key *key = &key_->gs;
1924 
1925       /* XFB occurs for GS, not VS. TODO: Check if active. */
1926       if (nir->xfb_info != NULL) {
1927          NIR_PASS(_, nir, nir_io_add_const_offset_to_base,
1928                   nir_var_shader_in | nir_var_shader_out);
1929          NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
1930       }
1931 
1932       NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1933 
1934       NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, key->rasterizer_discard,
1935                &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words);
1936    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1937       struct asahi_fs_shader_key *key = &key_->fs;
1938 
1939       struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
1940          key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples, true);
1941 
1942       if (dev->debug & AGX_DBG_SMALLTILE)
1943          tib.tile_size = (struct agx_tile_size){16, 16};
1944 
1945       nir_lower_blend_options opts = {
1946          .scalar_blend_const = true,
1947          .logicop_enable = key->blend.logicop_func != PIPE_LOGICOP_COPY,
1948          .logicop_func = key->blend.logicop_func,
1949       };
1950 
1951       static_assert(ARRAY_SIZE(opts.format) == PIPE_MAX_COLOR_BUFS,
1952                     "max RTs out of sync");
1953 
1954       for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i)
1955          opts.format[i] = key->rt_formats[i];
1956 
1957       memcpy(opts.rt, key->blend.rt, sizeof(opts.rt));
1958 
1959       /* It's more efficient to use masked stores (with
1960        * agx_nir_lower_tilebuffer) than to emulate colour masking with
1961        * nir_lower_blend.
1962        */
1963       uint8_t colormasks[PIPE_MAX_COLOR_BUFS] = {0};
1964 
1965       for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
1966          /* TODO: Flakes some dEQPs, seems to invoke UB. Revisit later.
1967           * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77
1968           * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.98
1969           */
1970          if (0 /* agx_tilebuffer_supports_mask(&tib, i) */) {
1971             colormasks[i] = key->blend.rt[i].colormask;
1972             opts.rt[i].colormask = (uint8_t)BITFIELD_MASK(4);
1973          } else {
1974             colormasks[i] = (uint8_t)BITFIELD_MASK(4);
1975          }
1976 
1977          /* If not all bound RTs are fully written to, we need to force
1978           * translucent pass type. agx_nir_lower_tilebuffer will take
1979           * care of this for its own colormasks input.
1980           */
1981          unsigned comps = util_format_get_nr_components(key->rt_formats[i]);
1982          if ((opts.rt[i].colormask & BITFIELD_MASK(comps)) !=
1983              BITFIELD_MASK(comps))
1984             force_translucent = true;
1985       }
1986 
1987       if (key->statistics) {
1988          NIR_PASS(_, nir, agx_nir_lower_stats_fs);
1989       }
1990 
1991       /* Similarly for cull distancing lowering */
1992       if (key->cull_distance_size) {
1993          NIR_PASS(_, nir, agx_nir_lower_cull_distance_fs,
1994                   key->cull_distance_size);
1995       }
1996 
1997       /* Similarly for polygon stipple */
1998       if (key->polygon_stipple) {
1999          NIR_PASS_V(nir, agx_nir_lower_poly_stipple);
2000       }
2001 
2002       /* Discards must be lowered before lowering MSAA so the MSAA lowering sees them */
2003       NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
2004 
2005       /* Alpha-to-coverage must be lowered before alpha-to-one */
2006       if (key->blend.alpha_to_coverage)
2007          NIR_PASS(_, nir, agx_nir_lower_alpha_to_coverage, tib.nr_samples);
2008 
2009       /* Alpha-to-one must be lowered before blending */
2010       if (key->blend.alpha_to_one)
2011          NIR_PASS(_, nir, agx_nir_lower_alpha_to_one);
2012 
2013       NIR_PASS(_, nir, nir_lower_blend, &opts);
2014 
2015       /* XXX: don't replicate this all over the driver */
2016       unsigned rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
2017                                (2 * BITSET_LAST_BIT(nir->info.images_used));
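      /* Bindful textures come first, then two descriptors per bindful image
       * (see agx_nr_tex_descriptors_without_spilled_rts), so spilled render
       * targets are appended after both.
       */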
2018       unsigned rt_spill = rt_spill_base;
2019       NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, colormasks, &rt_spill,
2020                &force_translucent, false);
2021 
2022       NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics);
2023       NIR_PASS(_, nir, agx_nir_lower_monolithic_msaa,
2024                &(struct agx_msaa_state){
2025                   .nr_samples = tib.nr_samples,
2026                   .api_sample_mask = key->api_sample_mask,
2027                });
2028 
2029       if (nir->info.inputs_read & VARYING_BIT_LAYER)
2030          NIR_PASS(_, nir, agx_nir_predicate_layer_id);
2031    }
2032 
2033    NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);
2034 
2035    struct agx_shader_key base_key = {0};
2036 
2037    if (nir->info.stage == MESA_SHADER_VERTEX) {
2038       struct asahi_vs_shader_key *key = &key_->vs;
2039 
2040       if (key->next_stage == ASAHI_VS_FS) {
2041          base_key.vs.outputs_flat_shaded = key_->vs.next.fs.outputs_flat_shaded;
2042 
2043          base_key.vs.outputs_linear_shaded =
2044             key_->vs.next.fs.outputs_linear_shaded;
2045       }
2046    }
2047 
2048    struct agx_compiled_shader *compiled =
2049       agx_compile_nir(dev, nir, &base_key, debug, so->type);
2050 
2051    compiled->so = so;
2052 
2053    /* reads_tib => Translucent pass type */
2054    compiled->info.reads_tib |= force_translucent;
2055 
2056    /* Could be optimized to use non-translucent pass types with the
2057     * appropriate HSR configuration, but that mechanism is not yet
2058     * understood. Warn that we're leaving perf on the table when used.
2059     */
2060    if (force_translucent)
2061       perf_debug(dev, "Translucency forced due to colour masking");
2062 
2063    /* Compile auxiliary programs */
2064    if (gs_count) {
2065       compiled->gs_count =
2066          agx_compile_nir(dev, gs_count, &base_key, debug, so->type);
2067       compiled->gs_count->so = so;
2068       compiled->gs_count->stage = so->type;
2069    }
2070 
2071    if (pre_gs) {
2072       compiled->pre_gs =
2073          agx_compile_nir(dev, pre_gs, &base_key, debug, PIPE_SHADER_COMPUTE);
2074    }
2075 
2076    if (gs_copy) {
2077       struct asahi_gs_shader_key *key = &key_->gs;
2078 
2079       /* TODO: deduplicate */
2080       NIR_PASS(_, gs_copy, agx_nir_lower_point_size, key->fixed_point_size);
2081       NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
2082                nir_metadata_block_index | nir_metadata_dominance, NULL);
2083 
2084       base_key.vs.outputs_flat_shaded = key->outputs_flat_shaded;
2085       base_key.vs.outputs_linear_shaded = key->outputs_linear_shaded;
2086 
2087       compiled->gs_copy =
2088          agx_compile_nir(dev, gs_copy, &base_key, debug, PIPE_SHADER_GEOMETRY);
2089       compiled->gs_copy->so = so;
2090       compiled->gs_copy->stage = so->type;
2091    }
2092 
2093    compiled->gs_output_mode = gs_out_prim;
2094    compiled->gs_count_words = gs_out_count_words;
2095    compiled->info.outputs = outputs;
2096    compiled->stage = so->type;
2097 
2098    ralloc_free(nir);
2099    ralloc_free(pre_gs);
2100    ralloc_free(gs_count);
2101    return compiled;
2102 }
2103 
2104 static struct agx_compiled_shader *
2105 agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
2106                        struct agx_uncompiled_shader *so,
2107                        struct util_debug_callback *debug,
2108                        union asahi_shader_key *key,
2109                        struct agx_uncompiled_shader *linked_so)
2110 {
2111    struct agx_compiled_shader *compiled =
2112       agx_disk_cache_retrieve(screen, so, key);
2113 
2114    if (!compiled) {
2115       compiled =
2116          agx_compile_variant(&screen->dev, pctx, so, debug, key, linked_so);
2117       agx_disk_cache_store(screen->disk_cache, so, key, compiled);
2118    }
2119 
2120    /* key may be destroyed after we return, so clone it before using it as a
2121     * hash table key. The clone is logically owned by the hash table.
2122     */
2123    union asahi_shader_key *cloned_key =
2124       rzalloc(so->variants, union asahi_shader_key);
2125 
2126    if (so->type == PIPE_SHADER_FRAGMENT) {
2127       memcpy(cloned_key, key, sizeof(struct asahi_fs_shader_key));
2128    } else if (so->type == PIPE_SHADER_VERTEX ||
2129               so->type == PIPE_SHADER_TESS_EVAL) {
2130       memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
2131    } else if (so->type == PIPE_SHADER_GEOMETRY) {
2132       memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
2133    } else if (so->type == PIPE_SHADER_TESS_CTRL) {
2134       memcpy(cloned_key, key, sizeof(struct asahi_tcs_shader_key));
2135    } else {
2136       assert(gl_shader_stage_is_compute(so->type));
2137       /* No key */
2138    }
2139 
2140    _mesa_hash_table_insert(so->variants, cloned_key, compiled);
2141 
2142    return compiled;
2143 }
2144 
2145 static void
2146 agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
2147                       nir_shader *nir, bool support_lod_bias, bool robust)
2148 {
2149    if (nir->info.stage == MESA_SHADER_KERNEL)
2150       nir->info.stage = MESA_SHADER_COMPUTE;
2151 
2152    blob_init(&so->early_serialized_nir);
2153    nir_serialize(&so->early_serialized_nir, nir, true);
2154 
2155    nir_lower_robust_access_options robustness = {
2156       /* Images accessed through the texture or PBE hardware are robust, so we
2157        * don't set lower_image. However, buffer images and image atomics are
2158        * lowered, so they require robustness lowering.
2159        */
2160       .lower_buffer_image = true,
2161       .lower_image_atomic = true,
2162 
2163       /* Buffer access is based on raw pointers and hence needs lowering to be
2164          robust */
2165       .lower_ubo = robust,
2166       .lower_ssbo = robust,
2167    };
2168 
2169    /* We need to lower robustness before bindings, since robustness lowering
2170     * affects the bindings used.
2171     */
2172    NIR_PASS(_, nir, nir_lower_robust_access, &robustness);
2173 
2174    /* Similarly, we need to do early texture lowering before bindings */
2175    NIR_PASS(_, nir, agx_nir_lower_texture_early, support_lod_bias);
2176 
2177    /* We need to lower binding tables before calling agx_preprocess_nir, since
2178     * that does texture lowering that needs to know the binding model.
2179     */
2180    NIR_PASS(_, nir, agx_nir_lower_bindings, &so->uses_bindless_samplers);
2181 
2182    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2183       /* Lower to the maximum number of colour buffers; excess stores get cleaned up
2184        * by tilebuffer lowering so they won't become real shader code. However,
2185        * that depends on the shader key which we don't have at this point.
2186        */
2187       NIR_PASS(_, nir, nir_lower_fragcolor, 8);
2188    }
2189 
2190    NIR_PASS(_, nir, agx_nir_lower_texture);
2191    NIR_PASS(_, nir, nir_lower_ssbo);
2192 
2193    bool allow_mediump = !(dev->debug & AGX_DBG_NO16);
2194    agx_preprocess_nir(nir, dev->libagx, allow_mediump, &so->info);
2195 
2196    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
2197        (nir->info.inputs_read & VARYING_BITS_TEX_ANY)) {
2198 
2199       NIR_PASS(_, nir, nir_shader_intrinsics_pass,
2200                agx_nir_lower_point_sprite_zw,
2201                nir_metadata_block_index | nir_metadata_dominance, NULL);
2202    }
2203 
2204    so->type = pipe_shader_type_from_mesa(nir->info.stage);
2205 
2206    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
2207       NIR_PASS(_, nir, agx_nir_lower_tes, dev->libagx);
2208    }
2209 
2210    blob_init(&so->serialized_nir);
2211    nir_serialize(&so->serialized_nir, nir, true);
2212    _mesa_sha1_compute(so->serialized_nir.data, so->serialized_nir.size,
2213                       so->nir_sha1);
2214 
2215    so->has_xfb_info = (nir->xfb_info != NULL);
2216 
2217    static_assert(
2218       ARRAY_SIZE(so->xfb_strides) == ARRAY_SIZE(nir->info.xfb_stride),
2219       "known target count");
2220 
2221    if (so->has_xfb_info) {
2222       struct nir_xfb_info *xfb = nir->xfb_info;
2223 
2224       for (unsigned i = 0; i < ARRAY_SIZE(so->xfb_strides); ++i) {
2225          so->xfb_strides[i] = xfb->buffers[i].stride;
2226       }
2227    }
2228 }
2229 
2230 static void *
2231 agx_create_shader_state(struct pipe_context *pctx,
2232                         const struct pipe_shader_state *cso)
2233 {
2234    struct agx_context *ctx = agx_context(pctx);
2235    struct agx_uncompiled_shader *so =
2236       rzalloc(NULL, struct agx_uncompiled_shader);
2237    struct agx_device *dev = agx_device(pctx->screen);
2238 
2239    if (!so)
2240       return NULL;
2241 
2242    so->base = *cso;
2243 
2244    nir_shader *nir = cso->type == PIPE_SHADER_IR_NIR
2245                         ? cso->ir.nir
2246                         : tgsi_to_nir(cso->tokens, pctx->screen, false);
2247 
2248    if (nir->info.stage == MESA_SHADER_VERTEX) {
2249       so->variants = _mesa_hash_table_create(so, asahi_vs_shader_key_hash,
2250                                              asahi_vs_shader_key_equal);
2251    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
2252       so->variants = _mesa_hash_table_create(NULL, asahi_gs_shader_key_hash,
2253                                              asahi_gs_shader_key_equal);
2254 
2255    } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
2256       /* No variants */
2257       so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
2258                                              asahi_cs_shader_key_equal);
2259    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
2260       so->variants = _mesa_hash_table_create(NULL, asahi_tcs_shader_key_hash,
2261                                              asahi_tcs_shader_key_equal);
2262    } else {
2263       so->variants = _mesa_hash_table_create(so, asahi_fs_shader_key_hash,
2264                                              asahi_fs_shader_key_equal);
2265    }
2266 
2267    if (nir->info.stage == MESA_SHADER_TESS_EVAL ||
2268        nir->info.stage == MESA_SHADER_TESS_CTRL) {
2269 
2270       so->tess.ccw = nir->info.tess.ccw;
2271       so->tess.point_mode = nir->info.tess.point_mode;
2272       so->tess.spacing = nir->info.tess.spacing;
2273       so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
2274       so->tess.primitive = nir->info.tess._primitive_mode;
2275       so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
2276       so->tess.nr_patch_outputs =
2277          util_last_bit(nir->info.patch_outputs_written);
2278       if (nir->info.stage == MESA_SHADER_TESS_CTRL)
2279          so->tess.output_stride = agx_tcs_output_stride(nir);
2280    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
2281       so->gs_mode = nir->info.gs.output_primitive;
2282    }
2283 
2284    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
2285 
2286    /* We're done with the NIR, throw it away */
2287    ralloc_free(nir);
2288    nir = NULL;
2289 
2290    /* For shader-db, precompile a shader with a default key. This could be
2291     * improved but hopefully this is acceptable for now.
2292     */
2293    if (dev->debug & AGX_DBG_PRECOMPILE) {
2294       union asahi_shader_key key = {0};
2295 
2296       switch (so->type) {
2297       case PIPE_SHADER_VERTEX: {
2298          for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
2299             key.vs.attribs[i] = (struct agx_velem_key){
2300                .stride = 16,
2301                .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
2302             };
2303          }
2304 
2305          break;
2306       }
2307 
2308       case PIPE_SHADER_GEOMETRY:
2309          break;
2310 
2311       case PIPE_SHADER_TESS_CTRL:
2312       case PIPE_SHADER_TESS_EVAL:
2313          /* TODO: Tessellation shaders with shader-db */
2314          return so;
2315 
2316       case PIPE_SHADER_FRAGMENT:
2317          key.fs.nr_samples = 1;
2318          key.fs.blend.logicop_func = PIPE_LOGICOP_COPY;
2319          for (unsigned i = 0; i < 1; ++i) {
2320             key.fs.rt_formats[i] = PIPE_FORMAT_R8G8B8A8_UNORM;
2321             key.fs.blend.rt[i].colormask = 0xF;
2322 
2323             const nir_lower_blend_channel replace = {
2324                .func = PIPE_BLEND_ADD,
2325                .src_factor = PIPE_BLENDFACTOR_ONE,
2326                .dst_factor = PIPE_BLENDFACTOR_ZERO,
2327             };
2328 
2329             key.fs.blend.rt[i].rgb = replace;
2330             key.fs.blend.rt[i].alpha = replace;
2331          }
2332          break;
2333       default:
2334          unreachable("Unknown shader stage in shader-db precompile");
2335       }
2336 
2337       agx_compile_variant(dev, pctx, so, &pctx->debug, &key, NULL);
2338    }
2339 
2340    return so;
2341 }
2342 
2343 static void *
2344 agx_create_compute_state(struct pipe_context *pctx,
2345                          const struct pipe_compute_state *cso)
2346 {
2347    struct agx_context *ctx = agx_context(pctx);
2348    struct agx_device *dev = agx_device(pctx->screen);
2349    struct agx_uncompiled_shader *so =
2350       rzalloc(NULL, struct agx_uncompiled_shader);
2351 
2352    if (!so)
2353       return NULL;
2354 
2355    so->variants = _mesa_hash_table_create(so, asahi_cs_shader_key_hash,
2356                                           asahi_cs_shader_key_equal);
2357 
2358    union asahi_shader_key key = {0};
2359 
2360    assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
2361    nir_shader *nir = (void *)cso->prog;
2362 
2363    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
2364    agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2365                           &key, NULL);
2366 
2367    /* We're done with the NIR, throw it away */
2368    ralloc_free(nir);
2369    return so;
2370 }
2371 
2372 static void
2373 agx_get_compute_state_info(struct pipe_context *pctx, void *cso,
2374                            struct pipe_compute_state_object_info *info)
2375 {
2376    union asahi_shader_key key = {0};
2377    struct agx_compiled_shader *so = agx_get_shader_variant(
2378       agx_screen(pctx->screen), pctx, cso, &pctx->debug, &key, NULL);
2379 
2380    info->max_threads =
2381       agx_occupancy_for_register_count(so->info.nr_gprs).max_threads;
2382    info->private_memory = 0;
2383    info->preferred_simd_size = 32;
2384    info->simd_sizes = 32;
2385 }
2386 
2387 /* Does not take ownership of key. Clones if necessary. */
2388 static bool
2389 agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
2390                   enum pipe_shader_type stage, union asahi_shader_key *key)
2391 {
2392    struct agx_uncompiled_shader *so = ctx->stage[stage].shader;
2393    assert(so != NULL);
2394 
2395    struct hash_entry *he = _mesa_hash_table_search(so->variants, key);
2396 
2397    if (he) {
2398       if ((*out) == he->data)
2399          return false;
2400 
2401       *out = he->data;
2402       return true;
2403    }
2404 
2405    struct agx_uncompiled_shader *linked_so = NULL;
2406    if (stage == PIPE_SHADER_TESS_CTRL || stage == PIPE_SHADER_GEOMETRY)
2407       linked_so = ctx->stage[PIPE_SHADER_VERTEX].shader;
2408 
2409    struct agx_screen *screen = agx_screen(ctx->base.screen);
2410    *out = agx_get_shader_variant(screen, &ctx->base, so, &ctx->base.debug, key,
2411                                  linked_so);
2412    return true;
2413 }
2414 
2415 static enum mesa_prim
2416 rast_prim(enum mesa_prim mode, unsigned fill_mode)
2417 {
2418    if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
2419       if (fill_mode == PIPE_POLYGON_MODE_POINT)
2420          return MESA_PRIM_POINTS;
2421       else if (fill_mode == PIPE_POLYGON_MODE_LINE)
2422          return MESA_PRIM_LINES;
2423    }
2424 
2425    return mode;
2426 }
2427 
2428 static bool
2429 agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
2430 {
2431    /* Only proceed if the shader or anything the key depends on changes
2432     *
2433     * vb_mask, attributes, vertex_buffers: VERTEX
2434     * point_size_per_vertex: RS
2435     * outputs_{flat,linear}_shaded: FS_PROG
2436     */
2437    if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB |
2438                         AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)) ||
2439          ctx->stage[PIPE_SHADER_TESS_EVAL].dirty ||
2440          ctx->stage[PIPE_SHADER_GEOMETRY].dirty ||
2441          ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
2442          ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess))
2443       return false;
2444 
2445    enum mesa_prim rasterized_prim =
2446       rast_prim(ctx->batch->reduced_prim, ctx->rast->base.fill_front);
2447 
2448    struct asahi_vs_shader_key key = {
2449       .next_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess
2450                        ? ASAHI_VS_TCS
2451                     : ctx->stage[PIPE_SHADER_GEOMETRY].shader ? ASAHI_VS_GS
2452                                                               : ASAHI_VS_FS,
2453    };
2454 
2455    if (key.next_stage == ASAHI_VS_FS) {
2456       /* If we are not rasterizing points, don't set fixed_point_size to
2457        * eliminate the useless point size write.
2458        */
2459       key.next.fs.fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
2460                                      rasterized_prim == MESA_PRIM_POINTS;
2461 
2462       key.next.fs.outputs_flat_shaded =
2463          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded;
2464       key.next.fs.outputs_linear_shaded =
2465          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded;
2466    } else if (key.next_stage == ASAHI_VS_GS) {
2467       key.next.gs.index_size_B = index_size_B;
2468    }
2469 
2470    memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
2471 
2472    return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
2473                             (union asahi_shader_key *)&key);
2474 }
2475 
2476 static bool
2477 agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
2478 {
2479    assert(info->mode == MESA_PRIM_PATCHES);
2480 
2481    /* We don't bother to dirty track yet, update! */
2482    struct asahi_tcs_shader_key key = {
2483       .index_size_B = info->index_size,
2484    };
2485 
2486    memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
2487 
2488    static_assert(sizeof(key.input_nir_sha1) ==
2489                     sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
2490                  "common size for shader sha-1");
2491 
2492    memcpy(key.input_nir_sha1, ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1,
2493           sizeof(key.input_nir_sha1));
2494 
2495    return agx_update_shader(ctx, &ctx->tcs, PIPE_SHADER_TESS_CTRL,
2496                             (union asahi_shader_key *)&key);
2497 }
2498 
2499 static bool
2500 agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
2501               const struct pipe_draw_indirect_info *indirect)
2502 {
2503    /* Only proceed if there is a geometry shader. Due to input assembly
2504     * dependence, we don't bother to dirty track right now.
2505     */
2506    if (!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
2507       ctx->gs = NULL;
2508       return false;
2509    }
2510 
2511    /* Transform feedback always happens via the geometry shader, so look there
2512     * to get the XFB strides.
2513     */
2514    struct agx_uncompiled_shader *gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
2515 
2516    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2517       struct agx_streamout_target *tgt =
2518          agx_so_target(ctx->streamout.targets[i]);
2519 
2520       if (tgt != NULL)
2521          tgt->stride = gs->xfb_strides[i];
2522    }
2523 
2524    enum mesa_prim rasterized_prim =
2525       rast_prim(gs->gs_mode, ctx->rast->base.fill_front);
2526 
2527    struct asahi_gs_shader_key key = {
2528       .rasterizer_discard = ctx->rast->base.rasterizer_discard,
2529 
2530       /* TODO: Deduplicate */
2531       .fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
2532                           rasterized_prim == MESA_PRIM_POINTS,
2533       .outputs_flat_shaded =
2534          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
2535       .outputs_linear_shaded =
2536          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
2537    };
2538 
2539    return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
2540                             (union asahi_shader_key *)&key);
2541 }
2542 
2543 static bool
2544 agx_update_fs(struct agx_batch *batch)
2545 {
2546    struct agx_context *ctx = batch->ctx;
2547 
2548    /* Only proceed if the shader or anything the key depends on changes
2549     *
2550     * batch->key: implicitly dirties everything, no explicit check
2551     * rast: RS
2552     * blend: BLEND
2553     * sample_mask: SAMPLE_MASK
2554     * reduced_prim: PRIM
2555     */
2556    if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG | AGX_DIRTY_RS |
2557                        AGX_DIRTY_BLEND | AGX_DIRTY_SAMPLE_MASK |
2558                        AGX_DIRTY_PRIM | AGX_DIRTY_QUERY)))
2559       return false;
2560 
2561    unsigned nr_samples = util_framebuffer_get_num_samples(&batch->key);
2562    bool msaa = ctx->rast->base.multisample;
2563 
2564    struct asahi_fs_shader_key key = {
2565       .statistics = ctx->pipeline_statistics[PIPE_STAT_QUERY_PS_INVOCATIONS],
2566 
2567       .cull_distance_size =
2568          ctx->stage[MESA_SHADER_VERTEX].shader->info.cull_distance_size,
2569 
2570       .polygon_stipple =
2571          ctx->rast->base.poly_stipple_enable &&
2572          rast_prim(batch->reduced_prim, ctx->rast->base.fill_front) ==
2573             MESA_PRIM_TRIANGLES,
2574 
2575       .nr_samples = nr_samples,
2576 
2577       /* Only lower sample mask if at least one sample is masked out */
2578       .api_sample_mask =
2579          msaa && (~ctx->sample_mask & BITFIELD_MASK(nr_samples)),
2580    };
2581 
2582    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
2583       struct pipe_surface *surf = batch->key.cbufs[i];
2584 
2585       key.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2586    }
2587 
2588    memcpy(&key.blend, ctx->blend, sizeof(key.blend));
2589 
2590    /* Normalize key */
2591    if (!msaa)
2592       key.blend.alpha_to_coverage = false;
2593 
2594    return agx_update_shader(ctx, &ctx->fs, PIPE_SHADER_FRAGMENT,
2595                             (union asahi_shader_key *)&key);
2596 }
2597 
2598 static void
2599 agx_bind_shader_state(struct pipe_context *pctx, void *cso,
2600                       enum pipe_shader_type stage)
2601 {
2602    struct agx_context *ctx = agx_context(pctx);
2603 
2604    if (stage == PIPE_SHADER_VERTEX)
2605       ctx->dirty |= AGX_DIRTY_VS_PROG;
2606    else if (stage == PIPE_SHADER_FRAGMENT)
2607       ctx->dirty |= AGX_DIRTY_FS_PROG;
2608    else
2609       ctx->stage[stage].dirty = ~0;
2610 
2611    ctx->stage[stage].shader = cso;
2612 }
2613 
2614 static void
2615 agx_bind_vs_state(struct pipe_context *pctx, void *cso)
2616 {
2617    agx_bind_shader_state(pctx, cso, PIPE_SHADER_VERTEX);
2618 }
2619 
2620 static void
2621 agx_bind_fs_state(struct pipe_context *pctx, void *cso)
2622 {
2623    agx_bind_shader_state(pctx, cso, PIPE_SHADER_FRAGMENT);
2624 }
2625 
2626 static void
2627 agx_bind_gs_state(struct pipe_context *pctx, void *cso)
2628 {
2629    agx_bind_shader_state(pctx, cso, PIPE_SHADER_GEOMETRY);
2630 }
2631 
2632 static void
2633 agx_bind_tcs_state(struct pipe_context *pctx, void *cso)
2634 {
2635    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_CTRL);
2636 }
2637 
2638 static void
2639 agx_bind_tes_state(struct pipe_context *pctx, void *cso)
2640 {
2641    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_EVAL);
2642 }
2643 
2644 static void
2645 agx_bind_cs_state(struct pipe_context *pctx, void *cso)
2646 {
2647    agx_bind_shader_state(pctx, cso, PIPE_SHADER_COMPUTE);
2648 }
2649 
2650 /* Forward declare because of the recursion hit with geometry shaders */
2651 static void agx_delete_uncompiled_shader(struct agx_uncompiled_shader *so);
2652 
2653 static void
2654 agx_delete_compiled_shader_internal(struct agx_compiled_shader *so)
2655 {
2656    if (so->gs_count)
2657       agx_delete_compiled_shader_internal(so->gs_count);
2658 
2659    if (so->pre_gs)
2660       agx_delete_compiled_shader_internal(so->pre_gs);
2661 
2662    if (so->gs_copy)
2663       agx_delete_compiled_shader_internal(so->gs_copy);
2664 
2665    agx_bo_unreference(so->bo);
2666    FREE(so);
2667 }
2668 
2669 static void
2670 agx_delete_compiled_shader(struct hash_entry *ent)
2671 {
2672    agx_delete_compiled_shader_internal(ent->data);
2673 }
2674 
2675 static void
2676 agx_delete_uncompiled_shader(struct agx_uncompiled_shader *so)
2677 {
2678    _mesa_hash_table_destroy(so->variants, agx_delete_compiled_shader);
2679    blob_finish(&so->serialized_nir);
2680    blob_finish(&so->early_serialized_nir);
2681 
2682    for (unsigned i = 0; i < MESA_PRIM_COUNT; ++i) {
2683       for (unsigned j = 0; j < 3; ++j) {
2684          for (unsigned k = 0; k < 2; ++k) {
2685             if (so->passthrough_progs[i][j][k])
2686                agx_delete_uncompiled_shader(so->passthrough_progs[i][j][k]);
2687          }
2688       }
2689    }
2690 
2691    for (unsigned i = 0; i < ARRAY_SIZE(so->passthrough_tcs); ++i) {
2692       if (so->passthrough_tcs[i])
2693          agx_delete_uncompiled_shader(so->passthrough_tcs[i]);
2694    }
2695 
2696    ralloc_free(so);
2697 }
2698 
2699 static void
2700 agx_delete_shader_state(struct pipe_context *ctx, void *cso)
2701 {
2702    agx_delete_uncompiled_shader(cso);
2703 }
2704 
2705 struct agx_generic_meta_key {
2706    meta_shader_builder_t builder;
2707    size_t key_size;
2708    uint8_t key[];
2709 };
2710 
2711 static uint32_t
2712 meta_key_hash(const void *key_)
2713 {
2714    const struct agx_generic_meta_key *key = key_;
2715 
2716    return _mesa_hash_data(key,
2717                           sizeof(struct agx_generic_meta_key) + key->key_size);
2718 }
2719 
2720 static bool
2721 meta_key_equal(const void *a_, const void *b_)
2722 {
2723    const struct agx_generic_meta_key *a = a_;
2724    const struct agx_generic_meta_key *b = b_;
2725 
2726    return a->builder == b->builder && a->key_size == b->key_size &&
2727           memcmp(a->key, b->key, a->key_size) == 0;
2728 }
2729 
2730 void
2731 agx_init_meta_shaders(struct agx_context *ctx)
2732 {
2733    ctx->generic_meta =
2734       _mesa_hash_table_create(ctx, meta_key_hash, meta_key_equal);
2735 }
2736 
2737 void
2738 agx_destroy_meta_shaders(struct agx_context *ctx)
2739 {
2740    _mesa_hash_table_destroy(ctx->generic_meta, agx_delete_compiled_shader);
2741 }
2742 
2743 static struct agx_compiled_shader *
2744 agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder,
2745                       void *data, size_t data_size)
2746 {
2747    /* Build the meta shader key */
2748    size_t total_key_size = sizeof(struct agx_generic_meta_key) + data_size;
2749    struct agx_generic_meta_key *key = alloca(total_key_size);
2750 
2751    *key = (struct agx_generic_meta_key){
2752       .builder = builder,
2753       .key_size = data_size,
2754    };
2755 
2756    if (data_size)
2757       memcpy(key->key, data, data_size);
2758 
2759    /* Try to get the cached shader */
2760    struct hash_entry *ent = _mesa_hash_table_search(ctx->generic_meta, key);
2761    if (ent)
2762       return ent->data;
2763 
2764    /* Otherwise, compile the shader fresh */
2765    nir_builder b = nir_builder_init_simple_shader(
2766       MESA_SHADER_COMPUTE, &agx_nir_options, "AGX meta shader");
2767 
2768    builder(&b, data);
2769 
2770    struct agx_device *dev = agx_device(ctx->base.screen);
2771    UNUSED struct agx_uncompiled_shader_info info;
2772    agx_preprocess_nir(b.shader, dev->libagx, false, &info);
2773 
2774    struct agx_shader_key base_key = {0};
2775    struct agx_compiled_shader *shader =
2776       agx_compile_nir(dev, b.shader, &base_key, NULL, PIPE_SHADER_COMPUTE);
2777 
2778    ralloc_free(b.shader);
2779 
2780    /* ...and cache it before we return. The key is on the stack right now, so
2781     * clone it before using it as a hash table key. The clone is logically owned
2782     * by the hash table.
2783     */
2784    void *cloned_key = rzalloc_size(ctx->generic_meta, total_key_size);
2785    memcpy(cloned_key, key, total_key_size);
2786 
2787    _mesa_hash_table_insert(ctx->generic_meta, cloned_key, shader);
2788    return shader;
2789 }
2790 
2791 static unsigned
2792 sampler_count(struct agx_context *ctx, struct agx_compiled_shader *cs,
2793               enum pipe_shader_type stage)
2794 {
2795    unsigned sampler_count = ctx->stage[stage].sampler_count;
2796 
2797    if (cs->info.uses_txf)
2798       sampler_count = MAX2(sampler_count, cs->info.txf_sampler + 1);
2799 
2800    return sampler_count;
2801 }
2802 
2803 static inline enum agx_sampler_states
2804 translate_sampler_state_count(struct agx_context *ctx,
2805                               struct agx_compiled_shader *cs,
2806                               enum pipe_shader_type stage)
2807 {
2808    /* Get samplers from merged stage but get txf status from cs */
2809    stage = merged_stage(ctx, stage);
2810 
2811    return agx_translate_sampler_state_count(sampler_count(ctx, cs, stage),
2812                                             ctx->stage[stage].custom_borders);
2813 }
2814 
2815 /*
2816  * Despite a null layout *and* a null flag that I only see Metal use with null
2817  * textures, AGX doesn't seem to have "real" null textures. Instead we need to
2818  * bind an arbitrary address and throw away the results to read all 0's.
2819  * Accordingly, the caller must pass some address that lives at least as long as
2820  * the texture descriptor itself.
2821  */
2822 static void
2823 agx_set_null_texture(struct agx_texture_packed *tex, uint64_t valid_address)
2824 {
2825    agx_pack(tex, TEXTURE, cfg) {
2826       cfg.layout = AGX_LAYOUT_NULL;
2827       cfg.channels = AGX_CHANNELS_R8;
2828       cfg.type = AGX_TEXTURE_TYPE_UNORM /* don't care */;
2829       cfg.swizzle_r = AGX_CHANNEL_0;
2830       cfg.swizzle_g = AGX_CHANNEL_0;
2831       cfg.swizzle_b = AGX_CHANNEL_0;
2832       cfg.swizzle_a = AGX_CHANNEL_0;
2833       cfg.address = valid_address;
2834       cfg.null = true;
2835    }
2836 }
2837 
2838 static void
2839 agx_set_null_pbe(struct agx_pbe_packed *pbe, uint64_t sink)
2840 {
2841    agx_pack(pbe, PBE, cfg) {
2842       cfg.width = 1;
2843       cfg.height = 1;
2844       cfg.levels = 1;
2845       cfg.layout = AGX_LAYOUT_NULL;
2846       cfg.channels = AGX_CHANNELS_R8;
2847       cfg.type = AGX_TEXTURE_TYPE_UNORM /* don't care */;
2848       cfg.swizzle_r = AGX_CHANNEL_R;
2849       cfg.swizzle_g = AGX_CHANNEL_R;
2850       cfg.swizzle_b = AGX_CHANNEL_R;
2851       cfg.swizzle_a = AGX_CHANNEL_R;
2852       cfg.buffer = sink;
2853    }
2854 }
2855 
2856 static uint32_t
2857 agx_nr_tex_descriptors_without_spilled_rts(const struct agx_compiled_shader *cs)
2858 {
2859    if (!cs || !cs->so)
2860       return 0;
2861 
2862    /* 2 descriptors per image, 1 descriptor per texture */
2863    return cs->so->info.nr_bindful_textures +
2864           (2 * cs->so->info.nr_bindful_images);
2865 }
2866 
2867 static uint32_t
2868 agx_nr_tex_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2869 {
2870    unsigned n = agx_nr_tex_descriptors_without_spilled_rts(cs);
2871 
2872    /* We add on texture/PBE descriptors for spilled render targets */
2873    bool spilled_rt = cs->stage == PIPE_SHADER_FRAGMENT &&
2874                      agx_tilebuffer_spills(&batch->tilebuffer_layout);
2875    if (spilled_rt)
2876       n += (batch->key.nr_cbufs * 2);
2877 
2878    return n;
2879 }
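
/*
 * Illustrative count (not taken from a real shader): with 3 bindful textures
 * and 2 bindful images the base count is 3 + (2 * 2) = 7 descriptors; if this
 * is a fragment shader and the tilebuffer spills 2 colour buffers, another
 * 2 * 2 = 4 texture/PBE pairs are appended, for 11 in total.
 */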
2880 
2881 /*
2882  * For spilled render targets, upload a texture/PBE pair for each surface to
2883  * allow loading/storing to the render target from the shader.
2884  */
2885 static void
2886 agx_upload_spilled_rt_descriptors(struct agx_texture_packed *out,
2887                                   struct agx_batch *batch)
2888 {
2889    for (unsigned rt = 0; rt < batch->key.nr_cbufs; ++rt) {
2890       struct agx_texture_packed *texture = out + (2 * rt);
2891       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2892 
2893       struct pipe_surface *surf = batch->key.cbufs[rt];
2894       if (!surf)
2895          continue;
2896 
2897       struct agx_resource *rsrc = agx_resource(surf->texture);
2898       struct pipe_image_view view = image_view_for_surface(surf);
2899       struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
2900       sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2901 
2902       agx_pack_texture(texture, rsrc, surf->format, &sampler_view);
2903       agx_batch_upload_pbe(batch, pbe, &view, false, false, true);
2904    }
2905 }
2906 
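/*
 * Upload the texture descriptor table for a stage. As built below, the layout
 * is: one descriptor per bindful texture, then a texture/PBE pair per bindful
 * image, then (for fragment shaders whose tilebuffer spills) a texture/PBE
 * pair per colour buffer.
 */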
2907 static void
2908 agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs,
2909                     enum pipe_shader_type stage)
2910 {
2911    struct agx_context *ctx = batch->ctx;
2912 
2913    /* This can occur for meta shaders */
2914    if (!cs->so) {
2915       batch->texture_count[stage] = 0;
2916       batch->stage_uniforms[stage].texture_base = 0;
2917       return;
2918    }
2919 
2920    unsigned nr_textures = cs->so->info.nr_bindful_textures;
2921 
2922    unsigned nr_active_textures = ctx->stage[stage].texture_count;
2923    unsigned nr_tex_descriptors = agx_nr_tex_descriptors(batch, cs);
2924    unsigned nr_images = cs->so->info.nr_bindful_images;
2925 
2926    struct agx_ptr T_tex = agx_pool_alloc_aligned(
2927       &batch->pool, AGX_TEXTURE_LENGTH * nr_tex_descriptors, 64);
2928 
2929    struct agx_texture_packed *textures = T_tex.cpu;
2930 
2931    for (unsigned i = 0; i < MIN2(nr_textures, nr_active_textures); ++i) {
2932       struct agx_sampler_view *tex = ctx->stage[stage].textures[i];
2933 
2934       if (tex == NULL) {
2935          agx_set_null_texture(&textures[i], T_tex.gpu);
2936          continue;
2937       }
2938 
2939       struct agx_resource *rsrc = tex->rsrc;
2940       agx_batch_reads(batch, tex->rsrc);
2941 
2942       /* Re-emit state because the layout might have changed from under us.
2943        * TODO: optimize this somehow?
2944        */
2945       agx_pack_texture(&tex->desc, rsrc, tex->format, &tex->base);
2946 
2947       textures[i] = tex->desc;
2948    }
2949 
2950    for (unsigned i = nr_active_textures; i < nr_textures; ++i)
2951       agx_set_null_texture(&textures[i], T_tex.gpu);
2952 
2953    for (unsigned i = 0; i < nr_images; ++i) {
2954       /* Image descriptors come in pairs after the textures */
2955       struct agx_texture_packed *texture =
2956          ((struct agx_texture_packed *)T_tex.cpu) + nr_textures + (2 * i);
2957 
2958       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2959 
2960       if (!(ctx->stage[stage].image_mask & BITFIELD_BIT(i))) {
2961          agx_set_null_texture(texture, T_tex.gpu);
2962          agx_set_null_pbe(pbe, agx_pool_alloc_aligned(&batch->pool, 1, 64).gpu);
2963          continue;
2964       }
2965 
2966       struct pipe_image_view *view = &ctx->stage[stage].images[i];
2967       agx_batch_track_image(batch, view);
2968 
2969       struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view);
2970 
2971       /* For the texture descriptor, lower cubes to 2D arrays. This matches the
2972        * transform done in the compiler.
2973        */
2974       if (target_is_cube(sampler_view.target))
2975          sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2976 
2977       agx_pack_texture(texture, agx_resource(view->resource), view->format,
2978                        &sampler_view);
2979       agx_batch_upload_pbe(batch, pbe, view, false, false, false);
2980    }
2981 
2982    if (stage == PIPE_SHADER_FRAGMENT &&
2983        agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
2984 
2985       struct agx_texture_packed *out =
2986          ((struct agx_texture_packed *)T_tex.cpu) +
2987          agx_nr_tex_descriptors_without_spilled_rts(cs);
2988 
2989       agx_upload_spilled_rt_descriptors(out, batch);
2990    }
2991 
2992    batch->texture_count[stage] = nr_tex_descriptors;
2993    batch->stage_uniforms[stage].texture_base = T_tex.gpu;
2994 }
2995 
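/*
 * Append a sampler descriptor to the heap and return its index. The maximally
 * sized backing BO is created on first use. There is no deduplication yet (see
 * the TODO below), so identical samplers occupy separate slots.
 */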
2996 uint16_t
2997 agx_sampler_heap_add(struct agx_device *dev, struct agx_sampler_heap *heap,
2998                      struct agx_sampler_packed *sampler)
2999 {
3000    /* Allocate (maximally sized) BO if we haven't already */
3001    if (!heap->bo) {
3002       heap->bo = agx_bo_create(dev, AGX_SAMPLER_HEAP_SIZE * AGX_SAMPLER_LENGTH,
3003                                AGX_BO_WRITEBACK, "Sampler heap");
3004 
3005       assert(heap->count == 0);
3006    }
3007 
3008    /* TODO search */
3009 
3010    /* Precondition: there is room in the heap */
3011    assert(heap->count < AGX_SAMPLER_HEAP_SIZE);
3012    struct agx_sampler_packed *samplers = heap->bo->ptr.cpu;
3013    memcpy(samplers + heap->count, sampler, sizeof(*sampler));
3014 
3015    return heap->count++;
3016 }
3017 
3018 static void
3019 agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
3020                     enum pipe_shader_type orig_stage)
3021 {
3022    struct agx_context *ctx = batch->ctx;
3023 
3024    /* Get samplers from merged stage but get txf status from cs */
3025    enum pipe_shader_type stage = merged_stage(ctx, orig_stage);
3026 
3027    unsigned nr_samplers = sampler_count(ctx, cs, stage);
3028    bool custom_borders = ctx->stage[stage].custom_borders;
3029 
3030    size_t sampler_length =
3031       AGX_SAMPLER_LENGTH + (custom_borders ? AGX_BORDER_LENGTH : 0);
3032 
3033    struct agx_ptr T =
3034       agx_pool_alloc_aligned(&batch->pool, sampler_length * nr_samplers, 64);
3035 
3036    uint8_t *out_sampler = T.cpu;
3037    for (unsigned i = 0; i < nr_samplers; ++i) {
3038       struct agx_sampler_state *sampler = ctx->stage[stage].samplers[i];
3039       struct agx_sampler_packed *out = (struct agx_sampler_packed *)out_sampler;
3040 
3041       if (cs->info.uses_txf && i == cs->info.txf_sampler) {
3042          agx_pack(out, SAMPLER, cfg) {
3043             /* Allow mipmapping. This is respected by txf, weirdly. */
3044             cfg.mip_filter = AGX_MIP_FILTER_NEAREST;
3045 
3046             /* Out-of-bounds reads must return 0 */
3047             cfg.wrap_s = AGX_WRAP_CLAMP_TO_BORDER;
3048             cfg.wrap_t = AGX_WRAP_CLAMP_TO_BORDER;
3049             cfg.wrap_r = AGX_WRAP_CLAMP_TO_BORDER;
3050             cfg.border_colour = AGX_BORDER_COLOUR_TRANSPARENT_BLACK;
3051          }
3052       } else if (sampler) {
3053          *out = sampler->desc;
3054 
3055          if (custom_borders) {
3056             STATIC_ASSERT(sizeof(sampler->border) == AGX_BORDER_LENGTH);
3057 
3058             memcpy(out_sampler + AGX_SAMPLER_LENGTH, &sampler->border,
3059                    AGX_BORDER_LENGTH);
3060          } else {
3061             assert(!sampler->uses_custom_border && "invalid combination");
3062          }
3063       } else {
3064          memset(out, 0, sampler_length);
3065       }
3066 
3067       out_sampler += sampler_length;
3068    }
3069 
3070    batch->sampler_count[orig_stage] = nr_samplers;
3071    batch->samplers[orig_stage] = T.gpu;
3072 }
3073 
3074 static void
3075 agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
3076 {
3077    struct agx_context *ctx = batch->ctx;
3078    if (!cs)
3079       return;
3080 
3081    enum pipe_shader_type stage = cs->stage;
3082    if (!ctx->stage[stage].dirty)
3083       return;
3084 
3085    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_CONST)
3086       agx_set_cbuf_uniforms(batch, stage);
3087 
3088    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SSBO)
3089       agx_set_ssbo_uniforms(batch, stage);
3090 
3091    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE)
3092       agx_upload_textures(batch, cs, stage);
3093 
3094    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
3095       agx_set_sampler_uniforms(batch, stage);
3096 
3097    if ((ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER) ||
3098        (ctx->stage[merged_stage(ctx, stage)].dirty & AGX_STAGE_DIRTY_SAMPLER))
3099       agx_upload_samplers(batch, cs, stage);
3100 
3101    struct agx_stage_uniforms *unif = &batch->stage_uniforms[stage];
3102 
3103    batch->uniforms.tables[AGX_SYSVAL_STAGE(stage)] =
3104       agx_pool_upload_aligned(&batch->pool, unif, sizeof(*unif), 16);
3105 }
3106 
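/*
 * Build the USC control stream binding everything a shader invocation needs:
 * texture and sampler state, pushed uniform ranges, tilebuffer or threadgroup
 * shared memory configuration, scratch, and the main/preamble code pointers.
 * Returns the USC handle produced by agx_usc_fini().
 */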
3107 static uint32_t
3108 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
3109                    enum pipe_shader_type phys_stage,
3110                    unsigned variable_shared_mem, size_t max_subgroups)
3111 {
3112    struct agx_context *ctx = batch->ctx;
3113    struct agx_usc_builder b =
3114       agx_alloc_usc_control(&batch->pipeline_pool, cs->push_range_count + 2);
3115 
3116    enum pipe_shader_type stage = cs->stage;
3117    enum pipe_shader_type merged = merged_stage(ctx, stage);
3118 
3119    if (batch->texture_count[merged]) {
3120       agx_usc_pack(&b, TEXTURE, cfg) {
3121          cfg.start = 0;
3122          cfg.count =
3123             MIN2(batch->texture_count[merged], AGX_NUM_TEXTURE_STATE_REGS);
3124          cfg.buffer = batch->stage_uniforms[merged].texture_base;
3125       }
3126    }
3127 
3128    if (batch->sampler_count[stage]) {
3129       agx_usc_pack(&b, SAMPLER, cfg) {
3130          cfg.start = 0;
3131          cfg.count = batch->sampler_count[stage];
3132          cfg.buffer = batch->samplers[stage];
3133       }
3134    }
3135 
3136    for (unsigned i = 0; i < cs->push_range_count; ++i) {
3137       agx_usc_uniform(
3138          &b, cs->push[i].uniform, cs->push[i].length,
3139          batch->uniforms.tables[cs->push[i].table] + cs->push[i].offset);
3140    }
3141 
3142    if (stage == PIPE_SHADER_FRAGMENT) {
3143       agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
3144    } else if (stage == PIPE_SHADER_COMPUTE || stage == PIPE_SHADER_TESS_CTRL) {
3145       unsigned size = cs->info.local_size + variable_shared_mem;
3146 
3147       agx_usc_pack(&b, SHARED, cfg) {
3148          cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE;
3149          cfg.bytes_per_threadgroup = size > 0 ? size : 65536;
3150          cfg.uses_shared_memory = size > 0;
3151       }
3152    } else {
3153       agx_usc_shared_none(&b);
3154    }
3155 
3156    agx_usc_pack(&b, SHADER, cfg) {
3157       if (stage == PIPE_SHADER_FRAGMENT)
3158          cfg.loads_varyings = cs->info.varyings.fs.nr_bindings > 0;
3159 
3160       cfg.code = cs->bo->ptr.gpu + cs->info.main_offset;
3161       cfg.unk_2 = (stage == PIPE_SHADER_FRAGMENT) ? 2 : 3;
3162    }
3163 
3164    uint32_t max_scratch_size =
3165       MAX2(cs->info.scratch_size, cs->info.preamble_scratch_size);
3166 
3167    if (max_scratch_size > 0) {
3168       unsigned preamble_size = (cs->info.preamble_scratch_size > 0) ? 1 : 0;
3169 
3170       switch (phys_stage) {
3171       case PIPE_SHADER_FRAGMENT:
3172          agx_scratch_alloc(&ctx->scratch_fs, max_scratch_size, max_subgroups);
3173          batch->fs_scratch = true;
3174          batch->fs_preamble_scratch =
3175             MAX2(batch->fs_preamble_scratch, preamble_size);
3176          break;
3177       case PIPE_SHADER_VERTEX:
3178          agx_scratch_alloc(&ctx->scratch_vs, max_scratch_size, max_subgroups);
3179          batch->vs_scratch = true;
3180          batch->vs_preamble_scratch =
3181             MAX2(batch->vs_preamble_scratch, preamble_size);
3182          break;
3183       default:
3184          agx_scratch_alloc(&ctx->scratch_cs, max_scratch_size, max_subgroups);
3185          batch->cs_scratch = true;
3186          batch->cs_preamble_scratch =
3187             MAX2(batch->cs_preamble_scratch, preamble_size);
3188          break;
3189       }
3190    }
3191 
3192    agx_usc_pack(&b, REGISTERS, cfg) {
3193       cfg.register_count = cs->info.nr_gprs;
3194       cfg.unk_1 = (stage == PIPE_SHADER_FRAGMENT);
3195       cfg.spill_size = cs->info.scratch_size
3196                           ? agx_scratch_get_bucket(cs->info.scratch_size)
3197                           : 0;
3198    }
3199 
3200    if (stage == PIPE_SHADER_FRAGMENT) {
3201       agx_usc_pack(&b, FRAGMENT_PROPERTIES, cfg) {
3202          bool writes_sample_mask = ctx->fs->info.writes_sample_mask;
3203          cfg.early_z_testing = !writes_sample_mask;
3204          cfg.unk_4 = 0x2;
3205          cfg.unk_5 = 0x0;
3206       }
3207    }
3208 
3209    if (cs->info.has_preamble) {
3210       agx_usc_pack(&b, PRESHADER, cfg) {
3211          cfg.code = cs->bo->ptr.gpu + cs->info.preamble_offset;
3212       }
3213    } else {
3214       agx_usc_pack(&b, NO_PRESHADER, cfg)
3215          ;
3216    }
3217 
3218    return agx_usc_fini(&b);
3219 }
3220 
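/*
 * Build the USC program used for background (load/clear) and end-of-tile
 * (store) "meta" work. Each render target gets an op derived from the batch's
 * clear/load masks and spill state, and the texture, PBE, clear-colour uniform
 * and sampler bindings that op needs are packed alongside the meta shader.
 */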
3221 uint64_t
3222 agx_build_meta(struct agx_batch *batch, bool store, bool partial_render)
3223 {
3224    struct agx_context *ctx = batch->ctx;
3225 
3226    /* Construct the key */
3227    struct agx_meta_key key = {.tib = batch->tilebuffer_layout};
3228 
3229    bool needs_textures_for_spilled_rts =
3230       agx_tilebuffer_spills(&batch->tilebuffer_layout) && !partial_render &&
3231       !store;
3232 
3233    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3234       struct pipe_surface *surf = batch->key.cbufs[rt];
3235 
3236       if (surf == NULL)
3237          continue;
3238 
3239       if (store) {
3240          /* TODO: Suppress stores to discarded render targets */
3241          key.op[rt] = AGX_META_OP_STORE;
3242       } else if (batch->tilebuffer_layout.spilled[rt] && partial_render) {
3243          /* Partial render programs exist only to store/load the tilebuffer to
3244           * main memory. When render targets are already spilled to main memory,
3245           * there's nothing to do.
3246           */
3247          key.op[rt] = AGX_META_OP_NONE;
3248       } else {
3249          bool valid = (batch->load & (PIPE_CLEAR_COLOR0 << rt));
3250          bool clear = (batch->clear & (PIPE_CLEAR_COLOR0 << rt));
3251          bool load = valid && !clear;
3252 
3253          /* Don't read back spilled render targets, they're already in memory */
3254          load &= !batch->tilebuffer_layout.spilled[rt];
3255 
3256          /* The background program used for partial renders must always load
3257           * whatever was stored in the mid-frame end-of-tile program.
3258           */
3259          load |= partial_render;
3260 
3261          key.op[rt] = load    ? AGX_META_OP_LOAD
3262                       : clear ? AGX_META_OP_CLEAR
3263                               : AGX_META_OP_NONE;
3264       }
3265    }
3266 
3267    /* Begin building the pipeline */
3268    struct agx_usc_builder b =
3269       agx_alloc_usc_control(&batch->pipeline_pool, 3 + PIPE_MAX_COLOR_BUFS);
3270 
3271    bool needs_sampler = false;
3272    unsigned uniforms = 0;
3273 
3274    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3275       if (key.op[rt] == AGX_META_OP_LOAD) {
3276          /* Each reloaded render target is textured */
3277          needs_sampler = true;
3278 
3279          /* The spilled-RT descriptors uploaded later would clobber this */
3280          if (needs_textures_for_spilled_rts)
3281             continue;
3282 
3283          struct agx_ptr texture =
3284             agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
3285          struct pipe_surface *surf = batch->key.cbufs[rt];
3286          assert(surf != NULL && "cannot load nonexistent attachment");
3287 
3288          struct agx_resource *rsrc = agx_resource(surf->texture);
3289          struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
3290 
3291          agx_pack_texture(texture.cpu, rsrc, surf->format, &sampler_view);
3292 
3293          agx_usc_pack(&b, TEXTURE, cfg) {
3294             /* Shifted to match eMRT indexing, could be optimized */
3295             cfg.start = rt * 2;
3296             cfg.count = 1;
3297             cfg.buffer = texture.gpu;
3298          }
3299 
3300       } else if (key.op[rt] == AGX_META_OP_CLEAR) {
3301          assert(batch->uploaded_clear_color[rt] && "set when cleared");
3302          agx_usc_uniform(&b, 4 + (8 * rt), 8, batch->uploaded_clear_color[rt]);
3303          uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
3304       } else if (key.op[rt] == AGX_META_OP_STORE) {
3305          struct pipe_image_view view =
3306             image_view_for_surface(batch->key.cbufs[rt]);
3307          struct agx_ptr pbe =
3308             agx_pool_alloc_aligned(&batch->pool, AGX_PBE_LENGTH, 256);
3309 
3310          /* The tilebuffer is already in sRGB space if needed. Do not convert */
3311          view.format = util_format_linear(view.format);
3312 
3313          agx_batch_upload_pbe(batch, pbe.cpu, &view, true, true, false);
3314 
3315          agx_usc_pack(&b, TEXTURE, cfg) {
3316             cfg.start = rt;
3317             cfg.count = 1;
3318             cfg.buffer = pbe.gpu;
3319          }
3320       }
3321    }
3322 
3323    if (needs_textures_for_spilled_rts) {
3324       /* Upload texture/PBE descriptors for each render target so we can clear
3325        * spilled render targets.
3326        */
3327       struct agx_ptr descs = agx_pool_alloc_aligned(
3328          &batch->pool, AGX_TEXTURE_LENGTH * 2 * batch->key.nr_cbufs, 64);
3329       agx_upload_spilled_rt_descriptors(descs.cpu, batch);
3330 
3331       agx_usc_pack(&b, TEXTURE, cfg) {
3332          cfg.start = 0;
3333          cfg.count = 2 * batch->key.nr_cbufs;
3334          cfg.buffer = descs.gpu;
3335       }
3336 
3337       /* Bind the base as u0_u1 for bindless access */
3338       agx_usc_uniform(&b, 0, 4,
3339                       agx_pool_upload_aligned(&batch->pool, &descs.gpu, 8, 8));
3340       uniforms = MAX2(uniforms, 4);
3341    }
3342 
3343    /* All render targets share a sampler */
3344    if (needs_sampler) {
3345       struct agx_ptr sampler =
3346          agx_pool_alloc_aligned(&batch->pool, AGX_SAMPLER_LENGTH, 64);
3347 
3348       agx_pack(sampler.cpu, SAMPLER, cfg) {
3349          cfg.magnify = AGX_FILTER_LINEAR;
3350          cfg.minify = AGX_FILTER_NEAREST;
3351          cfg.mip_filter = AGX_MIP_FILTER_NONE;
3352          cfg.wrap_s = AGX_WRAP_CLAMP_TO_EDGE;
3353          cfg.wrap_t = AGX_WRAP_CLAMP_TO_EDGE;
3354          cfg.wrap_r = AGX_WRAP_CLAMP_TO_EDGE;
3355          cfg.pixel_coordinates = true;
3356          cfg.compare_func = AGX_COMPARE_FUNC_ALWAYS;
3357       }
3358 
3359       agx_usc_pack(&b, SAMPLER, cfg) {
3360          cfg.start = 0;
3361          cfg.count = 1;
3362          cfg.buffer = sampler.gpu;
3363       }
3364    }
3365 
3366    agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
3367 
3368    /* Get the shader */
3369    key.reserved_preamble = uniforms;
3370    struct agx_meta_shader *shader = agx_get_meta_shader(&ctx->meta, &key);
3371    agx_batch_add_bo(batch, shader->bo);
3372 
3373    agx_usc_pack(&b, SHADER, cfg) {
3374       cfg.code = shader->ptr;
3375       cfg.unk_2 = 0;
3376    }
3377 
3378    agx_usc_pack(&b, REGISTERS, cfg)
3379       cfg.register_count = shader->info.nr_gprs;
3380 
3381    if (shader->info.has_preamble) {
3382       agx_usc_pack(&b, PRESHADER, cfg) {
3383          cfg.code = shader->ptr + shader->info.preamble_offset;
3384       }
3385    } else {
3386       agx_usc_pack(&b, NO_PRESHADER, cfg)
3387          ;
3388    }
3389 
3390    return agx_usc_fini(&b);
3391 }
3392 
3393 /*
3394  * Return the standard sample positions, packed into a 32-bit word with fixed
3395  * point nibbles for each x/y component of the (at most 4) samples. This is
3396  * suitable for programming the PPP_MULTISAMPLECTL control register.
3397  */
3398 static uint32_t
3399 agx_default_sample_positions(unsigned nr_samples)
3400 {
3401    switch (nr_samples) {
3402    case 1:
3403       return 0x88;
3404    case 2:
3405       return 0x44cc;
3406    case 4:
3407       return 0xeaa26e26;
3408    default:
3409       unreachable("Invalid sample count");
3410    }
3411 }
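
/*
 * Decoding the 4-sample word above as a worked example (an informal reading of
 * the packing: low nibble = x, high nibble = y, in 1/16ths of a pixel):
 *
 *    0xeaa26e26 -> bytes 0x26, 0x6e, 0xa2, 0xea
 *               -> (6, 2), (14, 6), (2, 10), (10, 14)
 *               -> (0.375, 0.125), (0.875, 0.375), (0.125, 0.625), (0.625, 0.875)
 *
 * which appears to match the standard 4x MSAA pattern.
 */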
3412 
3413 void
3414 agx_batch_init_state(struct agx_batch *batch)
3415 {
3416    if (batch->initialized)
3417       return;
3418 
3419    if (agx_batch_is_compute(batch)) {
3420       batch->initialized = true;
3421 
3422       struct agx_context *ctx = batch->ctx;
3423       struct agx_device *dev = agx_device(ctx->base.screen);
3424       uint8_t *out = batch->cdm.current;
3425 
3426       /* See the coherency comment on the VDM barrier below */
3427       agx_push(out, CDM_BARRIER, cfg) {
3428          cfg.usc_cache_inval = true;
3429          cfg.unk_5 = true;
3430          cfg.unk_6 = true;
3431          cfg.unk_8 = true;
3432          // cfg.unk_11 = true;
3433          // cfg.unk_20 = true;
3434          if (dev->params.num_clusters_total > 1) {
3435             // cfg.unk_24 = true;
3436             if (dev->params.gpu_generation == 13) {
3437                cfg.unk_4 = true;
3438                // cfg.unk_26 = true;
3439             }
3440          }
3441       }
3442 
3443       return;
3444    }
3445 
3446    /* Emit state on the batch that we don't change and so don't dirty track */
3447    uint8_t *out = batch->vdm.current;
3448 
3449    /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
3450     * with another that caused stale data to be cached and the CPU wrote to it
3451     * in the meantime.
3452     */
3453    agx_push(out, VDM_BARRIER, cfg) {
3454       cfg.usc_cache_inval = true;
3455    }
3456 
3457    struct agx_ppp_update ppp =
3458       agx_new_ppp_update(&batch->pool, (struct AGX_PPP_HEADER){
3459                                           .w_clamp = true,
3460                                           .occlusion_query_2 = true,
3461                                           .output_unknown = true,
3462                                           .varying_word_2 = true,
3463                                           .viewport_count = 1, /* irrelevant */
3464                                        });
3465 
3466    /* clang-format off */
3467    agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
3468    agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
3469    agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
3470    agx_ppp_push(&ppp, VARYING_2, cfg);
3471    /* clang-format on */
3472 
3473    agx_ppp_fini(&out, &ppp);
3474    batch->vdm.current = out;
3475 
3476    /* Mark it as initialized now, since agx_batch_writes() will check this. */
3477    batch->initialized = true;
3478 
3479    /* Choose a tilebuffer layout given the framebuffer key */
3480    enum pipe_format formats[PIPE_MAX_COLOR_BUFS] = {0};
3481    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3482       struct pipe_surface *surf = batch->key.cbufs[i];
3483       if (surf)
3484          formats[i] = surf->format;
3485    }
3486 
3487    batch->tilebuffer_layout = agx_build_tilebuffer_layout(
3488       formats, batch->key.nr_cbufs,
3489       util_framebuffer_get_num_samples(&batch->key),
3490       util_framebuffer_get_num_layers(&batch->key) > 1);
3491 
3492    if (agx_device(batch->ctx->base.screen)->debug & AGX_DBG_SMALLTILE)
3493       batch->tilebuffer_layout.tile_size = (struct agx_tile_size){16, 16};
3494 
3495    /* If the layout spilled render targets, we need to decompress those render
3496     * targets to ensure we can write to them.
3497     */
3498    if (agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
3499       for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3500          if (!batch->tilebuffer_layout.spilled[i])
3501             continue;
3502 
3503          struct pipe_surface *surf = batch->key.cbufs[i];
3504          if (!surf)
3505             continue;
3506 
3507          struct agx_resource *rsrc = agx_resource(surf->texture);
3508          if (rsrc->layout.writeable_image)
3509             continue;
3510 
3511          /* Decompress if we can and shadow if we can't. */
3512          if (rsrc->base.bind & PIPE_BIND_SHARED)
3513             unreachable("TODO");
3514          else
3515             agx_decompress(batch->ctx, rsrc, "Render target spilled");
3516       }
3517    }
3518 
3519    if (batch->key.zsbuf) {
3520       unsigned level = batch->key.zsbuf->u.tex.level;
3521       struct agx_resource *rsrc = agx_resource(batch->key.zsbuf->texture);
3522 
3523       agx_batch_writes(batch, rsrc, level);
3524 
3525       if (rsrc->separate_stencil)
3526          agx_batch_writes(batch, rsrc->separate_stencil, level);
3527    }
3528 
3529    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3530       if (batch->key.cbufs[i]) {
3531          struct agx_resource *rsrc = agx_resource(batch->key.cbufs[i]->texture);
3532          unsigned level = batch->key.cbufs[i]->u.tex.level;
3533 
3534          if (agx_resource_valid(rsrc, level))
3535             batch->load |= PIPE_CLEAR_COLOR0 << i;
3536 
3537          agx_batch_writes(batch, rsrc, batch->key.cbufs[i]->u.tex.level);
3538       }
3539    }
3540 
3541    /* Set up standard sample positions */
3542    batch->uniforms.ppp_multisamplectl =
3543       agx_default_sample_positions(batch->tilebuffer_layout.nr_samples);
3544 }
3545 
3546 static enum agx_object_type
3547 agx_point_object_type(struct agx_rasterizer *rast)
3548 {
3549    return (rast->base.sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT)
3550              ? AGX_OBJECT_TYPE_POINT_SPRITE_UV01
3551              : AGX_OBJECT_TYPE_POINT_SPRITE_UV10;
3552 }
3553 
3554 #define MAX_PPP_UPDATES 2
3555 #define IS_DIRTY(ST)    !!(ctx->dirty & AGX_DIRTY_##ST)
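
/*
 * IS_DIRTY(FOO) tests the AGX_DIRTY_FOO bit of ctx->dirty. agx_encode_state()
 * below emits at most MAX_PPP_UPDATES PPP updates per call, checked by the
 * assert at the end of that function.
 */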
3556 
3557 static uint8_t *
3558 agx_encode_state(struct agx_batch *batch, uint8_t *out)
3559 {
3560    struct agx_context *ctx = batch->ctx;
3561 
3562    /* If nothing is dirty, encode nothing */
3563    if (!ctx->dirty)
3564       return out;
3565 
3566    struct agx_rasterizer *rast = ctx->rast;
3567    unsigned ppp_updates = 0;
3568 
3569    struct agx_compiled_shader *vs = ctx->vs, *fs = ctx->fs;
3570    if (ctx->gs)
3571       vs = ctx->gs->gs_copy;
3572 
3573    bool varyings_dirty = false;
3574 
3575    if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) ||
3576        IS_DIRTY(PRIM)) {
3577       batch->varyings = agx_link_varyings_vs_fs(
3578          &batch->pipeline_pool, &vs->info.varyings.vs,
3579          &ctx->fs->info.varyings.fs, ctx->rast->base.flatshade_first,
3580          (batch->reduced_prim == MESA_PRIM_POINTS)
3581             ? ctx->rast->base.sprite_coord_enable
3582             : 0,
3583          &batch->generate_primitive_id);
3584 
3585       varyings_dirty = true;
3586       ppp_updates++;
3587    }
3588 
3589    if (IS_DIRTY(VS) || varyings_dirty) {
3590       agx_push(out, VDM_STATE, cfg) {
3591          cfg.vertex_shader_word_0_present = true;
3592          cfg.vertex_shader_word_1_present = true;
3593          cfg.vertex_outputs_present = true;
3594          cfg.vertex_unknown_present = true;
3595       }
3596 
3597       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_0, cfg) {
3598          cfg.uniform_register_count = vs->info.push_count;
3599          cfg.preshader_register_count = vs->info.nr_preamble_gprs;
3600          cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, vs);
3601          cfg.sampler_state_register_count =
3602             translate_sampler_state_count(ctx, vs, vs->stage);
3603       }
3604 
3605       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3606          cfg.pipeline = agx_build_pipeline(batch, vs, PIPE_SHADER_VERTEX, 0, 0);
3607       }
3608 
3609       agx_push(out, VDM_STATE_VERTEX_OUTPUTS, cfg) {
3610          cfg.output_count_1 = vs->info.varyings.vs.nr_index;
3611          cfg.output_count_2 = cfg.output_count_1;
3612       }
3613 
3614       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3615          cfg.flat_shading_control = ctx->rast->base.flatshade_first
3616                                        ? AGX_VDM_VERTEX_0
3617                                        : AGX_VDM_VERTEX_2;
3618          cfg.unknown_4 = cfg.unknown_5 = ctx->rast->base.rasterizer_discard;
3619 
3620          cfg.generate_primitive_id = batch->generate_primitive_id;
3621       }
3622 
3623       /* Pad up to a multiple of 8 bytes */
3624       memset(out, 0, 4);
3625       out += 4;
3626    }
3627 
3628    struct agx_pool *pool = &batch->pool;
3629 
3630    if ((ctx->dirty & AGX_DIRTY_RS) && ctx->rast->base.offset_tri) {
3631       agx_upload_depth_bias(batch, &ctx->rast->base);
3632       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
3633    }
3634 
3635    if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS |
3636                      AGX_DIRTY_RS | AGX_DIRTY_VS)) {
3637 
3638       agx_upload_viewport_scissor(pool, batch, &out, ctx->viewport,
3639                                   ctx->rast->base.scissor ? ctx->scissor : NULL,
3640                                   ctx->rast->base.clip_halfz,
3641                                   vs->info.nonzero_viewport);
3642    }
3643 
3644    bool is_points = batch->reduced_prim == MESA_PRIM_POINTS;
3645    bool is_lines = batch->reduced_prim == MESA_PRIM_LINES;
3646 
3647    bool object_type_dirty =
3648       IS_DIRTY(PRIM) || (is_points && IS_DIRTY(SPRITE_COORD_MODE));
3649 
3650    bool fragment_face_dirty =
3651       IS_DIRTY(ZS) || IS_DIRTY(STENCIL_REF) || IS_DIRTY(RS);
3652 
3653    enum agx_object_type object_type = is_points  ? agx_point_object_type(rast)
3654                                       : is_lines ? AGX_OBJECT_TYPE_LINE
3655                                                  : AGX_OBJECT_TYPE_TRIANGLE;
3656 
3657    struct AGX_PPP_HEADER dirty = {
3658       .fragment_control =
3659          IS_DIRTY(ZS) || IS_DIRTY(RS) || IS_DIRTY(PRIM) || IS_DIRTY(QUERY),
3660       .fragment_control_2 = IS_DIRTY(PRIM) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS),
3661       .fragment_front_face = fragment_face_dirty,
3662       .fragment_front_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3663       .fragment_front_stencil = IS_DIRTY(ZS),
3664       .fragment_back_face = fragment_face_dirty,
3665       .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3666       .fragment_back_stencil = IS_DIRTY(ZS),
3667       .output_select = IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG),
3668       .varying_counts_32 = IS_DIRTY(VS_PROG),
3669       .varying_counts_16 = IS_DIRTY(VS_PROG),
3670       .cull = IS_DIRTY(RS),
3671       .cull_2 = varyings_dirty,
3672       .fragment_shader =
3673          IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
3674       .occlusion_query = IS_DIRTY(QUERY),
3675       .output_size = IS_DIRTY(VS_PROG),
3676       .viewport_count = 1, /* irrelevant */
3677    };
3678 
3679    struct agx_ppp_update ppp = agx_new_ppp_update(pool, dirty);
3680 
3681    if (dirty.fragment_control) {
3682       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
3683          if (ctx->active_queries && ctx->occlusion_query) {
3684             if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3685                cfg.visibility_mode = AGX_VISIBILITY_MODE_COUNTING;
3686             else
3687                cfg.visibility_mode = AGX_VISIBILITY_MODE_BOOLEAN;
3688          }
3689 
3690          cfg.stencil_test_enable = ctx->zs->base.stencil[0].enabled;
3691          cfg.two_sided_stencil = ctx->zs->base.stencil[1].enabled;
3692          cfg.depth_bias_enable = rast->base.offset_tri;
3693 
3694          /* Always enable scissoring so we may scissor to the viewport (TODO:
3695           * optimize this out if the viewport is the default and the app does
3696           * not use the scissor test)
3697           */
3698          cfg.scissor_enable = true;
3699       }
3700    }
3701 
3702    if (dirty.fragment_control_2) {
3703       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
3704          /* This avoids broken derivatives along primitive edges */
3705          cfg.disable_tri_merging =
3706             (is_lines || is_points || ctx->fs->info.disable_tri_merging);
3707          cfg.tag_write_disable = ctx->fs->info.tag_write_disable ||
3708                                  ctx->rast->base.rasterizer_discard;
3709          cfg.pass_type = agx_pass_type_for_shader(&ctx->fs->info);
3710       }
3711    }
3712 
3713    if (dirty.fragment_front_face) {
3714       struct agx_fragment_face_packed front_face;
3715       agx_pack(&front_face, FRAGMENT_FACE, cfg) {
3716          cfg.stencil_reference = ctx->stencil_ref.ref_value[0];
3717          cfg.line_width = rast->line_width;
3718          cfg.polygon_mode = rast->polygon_mode;
3719       };
3720 
3721       front_face.opaque[0] |= ctx->zs->depth.opaque[0];
3722 
3723       agx_ppp_push_packed(&ppp, &front_face, FRAGMENT_FACE);
3724    }
3725 
3726    if (dirty.fragment_front_face_2)
3727       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->info);
3728 
3729    if (dirty.fragment_front_stencil) {
3730       agx_ppp_push_packed(&ppp, ctx->zs->front_stencil.opaque,
3731                           FRAGMENT_STENCIL);
3732    }
3733 
3734    if (dirty.fragment_back_face) {
3735       struct agx_fragment_face_packed back_face;
3736 
3737       agx_pack(&back_face, FRAGMENT_FACE, cfg) {
3738          bool twosided = ctx->zs->base.stencil[1].enabled;
3739          cfg.stencil_reference = ctx->stencil_ref.ref_value[twosided ? 1 : 0];
3740          cfg.line_width = rast->line_width;
3741          cfg.polygon_mode = rast->polygon_mode;
3742       };
3743 
3744       back_face.opaque[0] |= ctx->zs->depth.opaque[0];
3745       agx_ppp_push_packed(&ppp, &back_face, FRAGMENT_FACE);
3746    }
3747 
3748    if (dirty.fragment_back_face_2)
3749       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->info);
3750 
3751    if (dirty.fragment_back_stencil)
3752       agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL);
3753 
3754    if (dirty.output_select) {
3755       agx_ppp_push(&ppp, OUTPUT_SELECT, cfg) {
3756          cfg.varyings = !!fs->info.varyings.fs.nr_bindings;
3757          cfg.point_size = vs->info.writes_psiz;
3758          cfg.viewport_target = vs->info.writes_layer_viewport;
3759          cfg.render_target = vs->info.writes_layer_viewport;
3760          cfg.frag_coord_z = fs->info.varyings.fs.reads_z;
3761          cfg.clip_distance_plane_0 = vs->info.varyings.vs.nr_clip_dists > 0;
3762          cfg.clip_distance_plane_1 = vs->info.varyings.vs.nr_clip_dists > 1;
3763          cfg.clip_distance_plane_2 = vs->info.varyings.vs.nr_clip_dists > 2;
3764          cfg.clip_distance_plane_3 = vs->info.varyings.vs.nr_clip_dists > 3;
3765          cfg.clip_distance_plane_4 = vs->info.varyings.vs.nr_clip_dists > 4;
3766          cfg.clip_distance_plane_5 = vs->info.varyings.vs.nr_clip_dists > 5;
3767          cfg.clip_distance_plane_6 = vs->info.varyings.vs.nr_clip_dists > 6;
3768          cfg.clip_distance_plane_7 = vs->info.varyings.vs.nr_clip_dists > 7;
3769 
3770          assert(cfg.point_size || !is_points);
3771       }
3772    }
3773 
3774    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
3775 
3776    if (dirty.varying_counts_32) {
3777       agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
3778          cfg.smooth = vs->info.varyings.vs.num_32_smooth;
3779          cfg.flat = vs->info.varyings.vs.num_32_flat;
3780          cfg.linear = vs->info.varyings.vs.num_32_linear;
3781       }
3782 
3783       agx_ppp_push(&ppp, VARYING_COUNTS, cfg) {
3784          cfg.smooth = vs->info.varyings.vs.num_16_smooth;
3785          cfg.flat = vs->info.varyings.vs.num_16_flat;
3786          cfg.linear = vs->info.varyings.vs.num_16_linear;
3787       }
3788    }
3789 
3790    if (dirty.cull)
3791       agx_ppp_push_packed(&ppp, ctx->rast->cull, CULL);
3792 
3793    if (dirty.cull_2) {
3794       agx_ppp_push(&ppp, CULL_2, cfg) {
3795          cfg.needs_primitive_id = batch->generate_primitive_id;
3796       }
3797    }
3798 
3799    if (dirty.fragment_shader) {
3800       unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
3801 
3802       agx_ppp_push(&ppp, FRAGMENT_SHADER, cfg) {
3803          cfg.pipeline =
3804             agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT, 0, 0),
3805          cfg.uniform_register_count = ctx->fs->info.push_count;
3806          cfg.preshader_register_count = ctx->fs->info.nr_preamble_gprs;
3807          cfg.texture_state_register_count =
3808             agx_nr_tex_descriptors(batch, ctx->fs);
3809          cfg.sampler_state_register_count =
3810             translate_sampler_state_count(ctx, ctx->fs, PIPE_SHADER_FRAGMENT);
3811          cfg.cf_binding_count = ctx->fs->info.varyings.fs.nr_bindings;
3812          cfg.cf_bindings = batch->varyings;
3813 
3814          /* XXX: This is probably wrong */
3815          cfg.unknown_30 = frag_tex_count >= 4;
3816       }
3817    }
3818 
3819    if (dirty.occlusion_query) {
3820       agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
3821          if (ctx->active_queries && ctx->occlusion_query) {
3822             cfg.index = agx_get_oq_index(batch, ctx->occlusion_query);
3823          }
3824       }
3825    }
3826 
3827    if (dirty.output_size) {
3828       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg)
3829          cfg.count = vs->info.varyings.vs.nr_index;
3830    }
3831 
3832    agx_ppp_fini(&out, &ppp);
3833    ppp_updates++;
3834 
3835    assert(ppp_updates <= MAX_PPP_UPDATES);
3836    return out;
3837 }
3838 
3839 static enum agx_primitive
3840 agx_primitive_for_pipe(enum mesa_prim mode)
3841 {
3842    switch (mode) {
3843    case MESA_PRIM_POINTS:
3844       return AGX_PRIMITIVE_POINTS;
3845    case MESA_PRIM_LINES:
3846       return AGX_PRIMITIVE_LINES;
3847    case MESA_PRIM_LINE_STRIP:
3848       return AGX_PRIMITIVE_LINE_STRIP;
3849    case MESA_PRIM_LINE_LOOP:
3850       return AGX_PRIMITIVE_LINE_LOOP;
3851    case MESA_PRIM_TRIANGLES:
3852       return AGX_PRIMITIVE_TRIANGLES;
3853    case MESA_PRIM_TRIANGLE_STRIP:
3854       return AGX_PRIMITIVE_TRIANGLE_STRIP;
3855    case MESA_PRIM_TRIANGLE_FAN:
3856       return AGX_PRIMITIVE_TRIANGLE_FAN;
3857    case MESA_PRIM_QUADS:
3858       return AGX_PRIMITIVE_QUADS;
3859    case MESA_PRIM_QUAD_STRIP:
3860       return AGX_PRIMITIVE_QUAD_STRIP;
3861    default:
3862       unreachable("todo: other primitive types");
3863    }
3864 }
3865 
3866 static uint64_t
3867 agx_index_buffer_rsrc_ptr(struct agx_batch *batch,
3868                           const struct pipe_draw_info *info, size_t *extent)
3869 {
3870    assert(!info->has_user_indices && "cannot use user pointers with indirect");
3871 
3872    struct agx_resource *rsrc = agx_resource(info->index.resource);
3873    agx_batch_reads(batch, rsrc);
3874 
3875    *extent = ALIGN_POT(rsrc->layout.size_B, 4);
3876    return rsrc->bo->ptr.gpu;
3877 }
3878 
3879 static uint64_t
3880 agx_index_buffer_direct_ptr(struct agx_batch *batch,
3881                             const struct pipe_draw_start_count_bias *draw,
3882                             const struct pipe_draw_info *info, size_t *extent)
3883 {
3884    off_t offset = draw->start * info->index_size;
3885    uint32_t max_extent = draw->count * info->index_size;
3886 
3887    if (!info->has_user_indices) {
3888       uint64_t base = agx_index_buffer_rsrc_ptr(batch, info, extent);
3889 
3890       *extent = ALIGN_POT(MIN2(*extent - offset, max_extent), 4);
3891       return base + offset;
3892    } else {
3893       *extent = ALIGN_POT(max_extent, 4);
3894 
3895       return agx_pool_upload_aligned(&batch->pool,
3896                                      ((uint8_t *)info->index.user) + offset,
3897                                      draw->count * info->index_size, 64);
3898    }
3899 }
3900 
3901 static uint64_t
3902 agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
3903                      const struct pipe_draw_start_count_bias *draw,
3904                      size_t *extent)
3905 {
3906    if (draw)
3907       return agx_index_buffer_direct_ptr(batch, draw, info, extent);
3908    else
3909       return agx_index_buffer_rsrc_ptr(batch, info, extent);
3910 }
3911 
3912 static void
3913 agx_ensure_vdm_cmdbuf_has_space(struct agx_batch *batch, size_t space)
3914 {
3915    /* Assert that we have space for a link tag */
3916    assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
3917           "Encoder overflowed");
3918 
3919    /* Always leave room for a link tag, in case we run out of space later,
3920     * plus padding because VDM apparently overreads?
3921     *
3922     * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
3923     */
3924    space += AGX_VDM_STREAM_LINK_LENGTH + 0x800;
3925 
3926    /* If there is room in the command buffer, we're done */
3927    if (likely((batch->vdm.end - batch->vdm.current) >= space))
3928       return;
3929 
3930    /* Otherwise, we need to allocate a new command buffer. We use memory owned
3931     * by the batch to simplify lifetime management for the BO.
3932     */
3933    size_t size = 65536;
3934    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 256);
3935 
3936    /* Jump from the old command buffer to the new command buffer */
3937    agx_pack(batch->vdm.current, VDM_STREAM_LINK, cfg) {
3938       cfg.target_lo = T.gpu & BITFIELD_MASK(32);
3939       cfg.target_hi = T.gpu >> 32;
3940    }
3941 
3942    /* Swap out the command buffer */
3943    batch->vdm.current = T.cpu;
3944    batch->vdm.end = batch->vdm.current + size;
3945 }
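
/*
 * A typical caller (illustrative pattern, mirroring agx_batch_init_state above
 * rather than any specific call site) reserves the worst-case size of what it
 * is about to encode, then pushes into the reserved space:
 *
 *    agx_ensure_vdm_cmdbuf_has_space(batch, 0x100);
 *    uint8_t *out = batch->vdm.current;
 *    agx_push(out, VDM_BARRIER, cfg) {
 *       cfg.usc_cache_inval = true;
 *    }
 *    batch->vdm.current = out;
 */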
3946 
3947 #define COUNT_NONRESTART(T)                                                    \
3948    static unsigned count_nonrestart_##T(const T *indices, T restart,           \
3949                                         unsigned n)                            \
3950    {                                                                           \
3951       unsigned out = 0;                                                        \
3952       for (int i = 0; i < n; ++i) {                                            \
3953          if (indices[i] != restart)                                            \
3954             out++;                                                             \
3955       }                                                                        \
3956       return out;                                                              \
3957    }
3958 
3959 COUNT_NONRESTART(uint8_t)
3960 COUNT_NONRESTART(uint16_t)
3961 COUNT_NONRESTART(uint32_t)
3962 
3963 #undef COUNT_NONRESTART
3964 
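/*
 * Update input-assembly pipeline statistics for a direct draw on the CPU. When
 * primitive restart is enabled, the index buffer is mapped (or read through
 * the user pointer) so that restart indices can be excluded from the count.
 */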
3965 static void
3966 agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
3967                      const struct pipe_draw_start_count_bias *draws)
3968 {
3969    unsigned count = draws->count;
3970 
3971    if (info->primitive_restart && info->index_size) {
3972       struct pipe_transfer *transfer = NULL;
3973       unsigned offset = draws->start * info->index_size;
3974 
3975       const void *indices;
3976       if (info->has_user_indices) {
3977          indices = (uint8_t *)info->index.user + offset;
3978       } else {
3979          struct pipe_resource *rsrc = info->index.resource;
3980 
3981          indices =
3982             pipe_buffer_map_range(&ctx->base, rsrc, offset,
3983                                   agx_resource(rsrc)->layout.size_B - offset,
3984                                   PIPE_MAP_READ, &transfer);
3985       }
3986 
3987       if (info->index_size == 1)
3988          count = count_nonrestart_uint8_t(indices, info->restart_index, count);
3989       else if (info->index_size == 2)
3990          count = count_nonrestart_uint16_t(indices, info->restart_index, count);
3991       else
3992          count = count_nonrestart_uint32_t(indices, info->restart_index, count);
3993 
3994       if (transfer)
3995          pipe_buffer_unmap(&ctx->base, transfer);
3996    }
3997 
3998    count *= info->instance_count;
3999 
4000    agx_query_increment_cpu(
4001       ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
4002 
4003    agx_query_increment_cpu(
4004       ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
4005 }
4006 
4007 static uint64_t
4008 agx_allocate_geometry_count_buffer(
4009    struct agx_batch *batch, const struct pipe_draw_info *info,
4010    const struct pipe_draw_start_count_bias *draws)
4011 {
4012    unsigned prim_per_instance =
4013       u_decomposed_prims_for_vertices(info->mode, draws->count);
4014    unsigned prims = prim_per_instance * info->instance_count;
4015 
4016    unsigned stride = batch->ctx->gs->gs_count_words * 4;
4017    unsigned size = prims * stride;
4018 
4019    if (size)
4020       return agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
4021    else
4022       return 0;
4023 }
4024 
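/*
 * Lazily create the global geometry heap (128 MiB of PIPE_BIND_GLOBAL memory)
 * and upload an agx_geometry_state pointing at it. The upload is cached on the
 * batch so repeated draws reuse the same descriptor.
 */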
4025 static uint64_t
4026 agx_batch_geometry_state(struct agx_batch *batch)
4027 {
4028    struct agx_context *ctx = batch->ctx;
4029 
4030    if (!batch->geometry_state) {
4031       if (!ctx->heap) {
4032          ctx->heap = pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL,
4033                                         PIPE_USAGE_DEFAULT, 1024 * 1024 * 128);
4034       }
4035 
4036       struct agx_geometry_state state = {
4037          .heap = agx_resource(ctx->heap)->bo->ptr.gpu,
4038       };
4039 
4040       agx_batch_writes(batch, agx_resource(ctx->heap), 0);
4041 
4042       batch->geometry_state =
4043          agx_pool_upload_aligned(&batch->pool, &state, sizeof(state), 8);
4044    }
4045 
4046    return batch->geometry_state;
4047 }
4048 
4049 static void
4050 agx_upload_ia_params(struct agx_batch *batch, const struct pipe_draw_info *info,
4051                      const struct pipe_draw_indirect_info *indirect,
4052                      uint64_t input_index_buffer, size_t index_buffer_size_B,
4053                      uint64_t unroll_output)
4054 {
4055    struct agx_ia_state ia = {
4056       .heap = agx_batch_geometry_state(batch),
4057       .index_buffer = input_index_buffer,
4058       .index_size_B = info->index_size,
4059       .out_draws = unroll_output,
4060       .restart_index = info->restart_index,
4061       .index_buffer_size_B = index_buffer_size_B,
4062       .flatshade_first = batch->ctx->rast->base.flatshade_first,
4063    };
4064 
4065    if (indirect) {
4066       struct agx_resource *rsrc = agx_resource(indirect->buffer);
4067       agx_batch_reads(batch, rsrc);
4068 
4069       ia.draws = rsrc->bo->ptr.gpu + indirect->offset;
4070    }
4071 
4072    batch->uniforms.input_assembly =
4073       agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
4074 }
4075 
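/*
 * Upload the agx_geometry_params consumed by the lowered geometry shader:
 * transform feedback buffers and offset pointers, statistics query addresses,
 * and, for direct draws, precomputed grid sizes plus count/vertex buffer
 * allocations. Indirect draws only get the count buffer stride; the GS setup
 * shader computes and allocates the rest on the GPU.
 */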
4076 static uint64_t
4077 agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
4078                           size_t index_buffer_size_B,
4079                           const struct pipe_draw_info *info,
4080                           const struct pipe_draw_start_count_bias *draw,
4081                           const struct pipe_draw_indirect_info *indirect)
4082 {
4083    agx_upload_ia_params(batch, info, indirect, input_index_buffer,
4084                         index_buffer_size_B, 0);
4085 
4086    struct agx_geometry_params params = {
4087       .state = agx_batch_geometry_state(batch),
4088       .indirect_desc = batch->geom_indirect,
4089       .flat_outputs =
4090          batch->ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
4091       .input_topology = info->mode,
4092    };
4093 
4094    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->streamout.targets); ++i) {
4095       struct agx_streamout_target *so =
4096          agx_so_target(batch->ctx->streamout.targets[i]);
4097       struct agx_resource *rsrc = so ? agx_resource(so->offset) : NULL;
4098 
4099       uint32_t size;
4100       params.xfb_base_original[i] = agx_batch_get_so_address(batch, i, &size);
4101       params.xfb_size[i] = size;
4102 
4103       if (rsrc) {
4104          params.xfb_offs_ptrs[i] = rsrc->bo->ptr.gpu;
4105          agx_batch_writes(batch, rsrc, 0);
4106          batch->incoherent_writes = true;
4107       } else {
4108          params.xfb_offs_ptrs[i] = 0;
4109       }
4110    }
4111 
4112    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->prims_generated); ++i) {
4113       if (batch->ctx->prims_generated[i]) {
4114          params.prims_generated_counter[i] =
4115             agx_get_query_address(batch, batch->ctx->prims_generated[i]);
4116       }
4117    }
4118 
4119    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_prims_generated); ++i) {
4120       if (batch->ctx->tf_prims_generated[i]) {
4121          params.xfb_prims_generated_counter[i] =
4122             agx_get_query_address(batch, batch->ctx->tf_prims_generated[i]);
4123       }
4124    }
4125 
4126    if (batch->ctx->active_queries && batch->ctx->streamout.num_targets > 0) {
4127       for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_overflow); ++i) {
4128          if (batch->ctx->tf_overflow[i]) {
4129             params.xfb_overflow[i] =
4130                agx_get_query_address(batch, batch->ctx->tf_overflow[i]);
4131          }
4132       }
4133 
4134       if (batch->ctx->tf_any_overflow) {
4135          params.xfb_any_overflow =
4136             agx_get_query_address(batch, batch->ctx->tf_any_overflow);
4137       }
4138    }
4139 
4140    /* Calculate input primitive count for direct draws, and allocate the vertex
4141     * & count buffers. GPU calculates and allocates for indirect draws.
4142     */
4143    unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
4144    params.vs_outputs = batch->ctx->vs->info.outputs;
4145 
4146    if (indirect) {
4147       params.count_buffer_stride = count_buffer_stride;
4148    } else {
4149       params.gs_grid[0] =
4150          u_decomposed_prims_for_vertices(info->mode, draw->count);
4151 
4152       params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
4153 
4154       params.input_primitives = params.gs_grid[0] * info->instance_count;
4155       params.input_vertices = draw->count;
4156 
4157       unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
4158                                             params.vs_outputs);
4159       unsigned size = params.input_primitives * count_buffer_stride;
4160 
4161       if (size) {
4162          params.count_buffer =
4163             agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
4164       }
4165 
4166       if (vb_size) {
4167          params.vertex_buffer =
4168             agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4169       }
4170    }
4171 
4172    return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params),
4173                                           8, &batch->geom_params_bo);
4174 }
4175 
4176 static void
4177 agx_launch_gs_prerast(struct agx_batch *batch,
4178                       const struct pipe_draw_info *info,
4179                       const struct pipe_draw_start_count_bias *draws,
4180                       const struct pipe_draw_indirect_info *indirect)
4181 {
4182    struct agx_context *ctx = batch->ctx;
4183    struct agx_device *dev = agx_device(ctx->base.screen);
4184    struct agx_compiled_shader *gs = ctx->gs;
4185 
4186    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader->is_xfb_passthrough)
4187       perf_debug(dev, "Transform feedback");
4188    else
4189       perf_debug(dev, "Geometry shader");
4190 
4191    /* This is a graphics batch, so it may not have had a CDM encoder allocated
4192     * yet. Allocate that so we can start enqueueing compute work.
4193     */
4194    if (!batch->cdm.bo) {
4195       batch->cdm = agx_encoder_allocate(batch, dev);
4196    }
4197 
4198    assert(!info->primitive_restart && "should have been lowered");
4199 
4200    struct pipe_grid_info grid_vs = {.block = {1, 1, 1}};
4201    struct pipe_grid_info grid_gs = {.block = {1, 1, 1}};
4202    struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};
4203 
4204    /* Setup grids */
4205    if (indirect) {
4206       assert(indirect->buffer && "drawauto already handled");
4207 
4208       struct agx_gs_setup_indirect_key key = {
4209          .prim = info->mode,
4210       };
4211 
4212       const struct pipe_grid_info grid_setup = {
4213          .block = {1, 1, 1},
4214          .grid = {1, 1, 1},
4215       };
4216 
4217       agx_launch(batch, &grid_setup,
4218                  agx_build_meta_shader(ctx, agx_nir_gs_setup_indirect, &key,
4219                                        sizeof(key)),
4220                  PIPE_SHADER_COMPUTE);
4221 
4222       /* Wrap the pool allocation in a fake resource for meta-Gallium use */
4223       assert(batch->geom_params_bo != NULL);
4224       grid_vs.indirect = &grid_indirect_rsrc.base;
4225       grid_gs.indirect = &grid_indirect_rsrc.base;
4226 
4227       unsigned param_offs =
4228          (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu);
4229 
4230       grid_vs.indirect_offset =
4231          param_offs + offsetof(struct agx_geometry_params, vs_grid);
4232 
4233       grid_gs.indirect_offset =
4234          param_offs + offsetof(struct agx_geometry_params, gs_grid);
4235    } else {
4236       grid_vs.grid[0] = draws->count;
4237       grid_vs.grid[1] = info->instance_count;
4238       grid_vs.grid[2] = 1;
4239 
4240       grid_gs.grid[0] =
4241          u_decomposed_prims_for_vertices(info->mode, draws->count);
4242       grid_gs.grid[1] = info->instance_count;
4243       grid_gs.grid[2] = 1;
4244    }
4245 
4246    /* Launch the vertex shader first */
4247    agx_launch(batch, &grid_vs, ctx->vs, ctx->vs->stage);
4248 
4249    /* If there is a count shader, launch it and prefix sum the results. */
4250    if (gs->gs_count) {
4251       perf_debug(dev, "Geometry shader count");
4252       agx_launch(batch, &grid_gs, gs->gs_count, PIPE_SHADER_GEOMETRY);
4253 
4254       unsigned words = gs->gs_count_words;
4255       agx_launch(batch,
4256                  &(const struct pipe_grid_info){
4257                     .block = {32, gs->gs_count_words, 1},
4258                     .grid = {1, 1, 1},
4259                  },
4260                  agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words,
4261                                        sizeof(words)),
4262                  PIPE_SHADER_COMPUTE);
4263    }
4264 
4265    /* Pre-GS shader */
4266    agx_launch(batch,
4267               &(const struct pipe_grid_info){
4268                  .block = {1, 1, 1},
4269                  .grid = {1, 1, 1},
4270               },
4271               gs->pre_gs, PIPE_SHADER_COMPUTE);
4272 
4273    /* Pre-rast geometry shader */
4274    agx_launch(batch, &grid_gs, gs, PIPE_SHADER_GEOMETRY);
4275 }
4276 
4277 static void
4278 agx_draw_without_restart(struct agx_batch *batch,
4279                          const struct pipe_draw_info *info,
4280                          unsigned drawid_offset,
4281                          const struct pipe_draw_indirect_info *indirect,
4282                          const struct pipe_draw_start_count_bias *draw)
4283 {
4284    struct agx_context *ctx = batch->ctx;
4285    struct agx_device *dev = agx_device(ctx->base.screen);
4286 
4287    perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
4288 
4289    agx_batch_init_state(batch);
4290 
4291    size_t ib_extent = 0;
4292    uint64_t ib =
4293       agx_index_buffer_ptr(batch, info, indirect ? NULL : draw, &ib_extent);
4294 
4295    /* The rest of this function handles only the general case of indirect
4296     * multidraws, so synthesize an indexed indirect draw now if we need one for
4297     * a direct draw (necessarily only one). This unifies the code paths.
4298     */
4299    struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
4300 
4301    if (!indirect) {
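      /* Standard 5-word indexed indirect descriptor:
       * {count, instance_count, first_index, index_bias, start_instance}
       */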
4302       uint32_t desc[5] = {draw->count, info->instance_count, 0,
4303                           draw->index_bias, info->start_instance};
4304 
4305       u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
4306                     &indirect_synthesized.offset, &indirect_synthesized.buffer);
4307 
4308       indirect = &indirect_synthesized;
4309    }
4310 
4311    /* Next, we unroll the index buffer used by the indirect draw */
4312    if (!batch->cdm.bo)
4313       batch->cdm = agx_encoder_allocate(batch, dev);
4314 
4315    struct agx_unroll_restart_key key = {
4316       .prim = info->mode,
4317       .index_size_B = info->index_size,
4318    };
4319 
4320    /* Allocate output indirect draw descriptors. This is exact. */
4321    struct agx_resource out_draws_rsrc = {0};
4322    struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
4323       &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
4324       &out_draws_rsrc.bo);
4325 
4326    agx_upload_ia_params(batch, info, indirect, ib, ib_extent, out_draws.gpu);
4327 
4328    /* Unroll the index buffer for each draw */
4329    const struct pipe_grid_info grid_setup = {
4330       .block = {1, 1, 1},
4331       .grid = {indirect->draw_count, 1, 1},
4332    };
4333 
4334    agx_launch(
4335       batch, &grid_setup,
4336       agx_build_meta_shader(ctx, agx_nir_unroll_restart, &key, sizeof(key)),
4337       PIPE_SHADER_COMPUTE);
4338 
4339    /* Now draw the results without restart */
4340    struct pipe_draw_info new_info = *info;
4341    new_info.primitive_restart = false;
4342    new_info.mode = u_decomposed_prim(info->mode);
4343    new_info.index.resource = ctx->heap;
4344    new_info.has_user_indices = false;
4345 
4346    struct pipe_draw_indirect_info new_indirect = *indirect;
4347    new_indirect.buffer = &out_draws_rsrc.base;
4348    new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->ptr.gpu;
4349    new_indirect.stride = 5 * sizeof(uint32_t);
4350 
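   /* Flag the recursive draw_vbo call below so that agx_draw_vbo skips the IA
    * statistics handling guarded by active_draw_without_restart; that work was
    * already performed for the original draw.
    */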
4351    ctx->active_draw_without_restart = true;
4352    ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, draw,
4353                       1);
4354    ctx->active_draw_without_restart = false;
4355 }
4356 
4357 static bool
4358 agx_needs_passthrough_gs(struct agx_context *ctx,
4359                          const struct pipe_draw_info *info,
4360                          const struct pipe_draw_indirect_info *indirect,
4361                          bool *xfb_only)
4362 {
4363    /* If there is already a geometry shader in the pipeline, we do not need to
4364     * apply a passthrough GS of our own.
4365     */
4366    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader)
4367       return false;
4368 
4369    /* Rendering adjacency requires a GS, add a passthrough since we don't have
4370     * one.
4371     */
4372    if (info->mode == MESA_PRIM_LINES_ADJACENCY ||
4373        info->mode == MESA_PRIM_TRIANGLES_ADJACENCY ||
4374        info->mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY ||
4375        info->mode == MESA_PRIM_LINE_STRIP_ADJACENCY) {
4376       perf_debug_ctx(ctx, "Using passthrough GS due to adjacency primitives");
4377       return true;
4378    }
4379 
4380    /* Experimentally, G13 does not seem to pick the right provoking vertex for
4381     * triangle fans with first provoking. Inserting a GS for this case lets us
4382     * use our (correct) shader-based input assembly, translating to
4383     * appropriately oriented triangles and working around the hardware issue.
4384     * This warrants more investigation in case we're just misconfiguring the
4385     * hardware, but as tri fans are absent in Metal and GL defaults to last
4386     * vertex, this is a plausible part of the hardware to be broken (or absent).
4387     *
4388     * Affects piglit clipflat.
4389     */
4390    if (info->mode == MESA_PRIM_TRIANGLE_FAN &&
4391        ctx->rast->base.flatshade_first &&
4392        ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded) {
4393 
4394       perf_debug_ctx(ctx, "Using passthrough GS due to tri fan bug");
4395       return true;
4396    }
4397 
4398    /* TODO: this is sloppy, we should add a VDM kernel for this. */
4399    if (indirect && ctx->active_queries && ctx->prims_generated[0]) {
4400       perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query");
4401       return true;
4402    }
4403 
4404    /* Edge flags are emulated with a geometry shader */
4405    if (has_edgeflags(ctx, info->mode)) {
4406       perf_debug_ctx(ctx, "Using passthrough GS due to edge flags");
4407       return true;
4408    }
4409 
4410    /* Various pipeline statistics are implemented in the pre-GS shader. */
4411    if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
4412        ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
4413        ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) {
4414       perf_debug_ctx(ctx, "Using passthrough GS due to pipeline statistics");
4415       return true;
4416    }
4417 
4418    /* Transform feedback is layered on geometry shaders, so if transform
4419     * feedback is used, we need a GS.
4420     */
4421    if (ctx->stage[PIPE_SHADER_VERTEX].shader->has_xfb_info &&
4422        ctx->streamout.num_targets) {
4423       *xfb_only = true;
4424       return true;
4425    }
4426 
4427    /* Otherwise, we don't need one */
4428    return false;
4429 }
4430 
4431 static struct agx_uncompiled_shader *
4432 agx_get_passthrough_gs(struct agx_context *ctx,
4433                        struct agx_uncompiled_shader *prev_cso,
4434                        enum mesa_prim mode, bool xfb_passthrough)
4435 {
4436    bool edgeflags = has_edgeflags(ctx, mode);
4437 
4438    /* Only handle the polygon mode when edge flags are in use, because
4439     * nir_passthrough_gs doesn't handle transform feedback + polygon mode
4440     * properly. Technically this can break edge flags + transform feedback but
4441     * that's firmly in "doctor, it hurts when I do this" territory, and I'm not
4442     * sure that's even possible to hit. TODO: Reevaluate.
4443     */
4444    unsigned poly_mode =
4445       edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL;
4446 
4447    if (prev_cso->passthrough_progs[mode][poly_mode][edgeflags])
4448       return prev_cso->passthrough_progs[mode][poly_mode][edgeflags];
4449 
4450    struct blob_reader reader;
4451    blob_reader_init(&reader, prev_cso->early_serialized_nir.data,
4452                     prev_cso->early_serialized_nir.size);
4453    nir_shader *prev = nir_deserialize(NULL, &agx_nir_options, &reader);
4454 
4455    nir_shader *gs = nir_create_passthrough_gs(
4456       &agx_nir_options, prev, mode, rast_prim(mode, poly_mode), edgeflags,
4457       false /* force line strip out */);
4458 
4459    ralloc_free(prev);
4460 
4461    struct agx_uncompiled_shader *cso = pipe_shader_from_nir(&ctx->base, gs);
4462    cso->is_xfb_passthrough = xfb_passthrough;
4463    prev_cso->passthrough_progs[mode][poly_mode][edgeflags] = cso;
4464    return cso;
4465 }
4466 
4467 static void
4468 agx_apply_passthrough_gs(struct agx_context *ctx,
4469                          const struct pipe_draw_info *info,
4470                          unsigned drawid_offset,
4471                          const struct pipe_draw_indirect_info *indirect,
4472                          const struct pipe_draw_start_count_bias *draws,
4473                          unsigned num_draws, bool xfb_passthrough)
4474 {
4475    enum pipe_shader_type prev_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4476                                          ? PIPE_SHADER_TESS_EVAL
4477                                          : PIPE_SHADER_VERTEX;
4478    struct agx_uncompiled_shader *prev_cso = ctx->stage[prev_stage].shader;
4479 
4480    assert(ctx->stage[PIPE_SHADER_GEOMETRY].shader == NULL);
4481 
4482    /* Draw with passthrough */
4483    ctx->base.bind_gs_state(
4484       &ctx->base,
4485       agx_get_passthrough_gs(ctx, prev_cso, info->mode, xfb_passthrough));
4486    ctx->base.draw_vbo(&ctx->base, info, drawid_offset, indirect, draws,
4487                       num_draws);
4488    ctx->base.bind_gs_state(&ctx->base, NULL);
4489 }
4490 
4491 static void
4492 util_draw_multi_unroll_indirect(struct pipe_context *pctx,
4493                                 const struct pipe_draw_info *info,
4494                                 const struct pipe_draw_indirect_info *indirect,
4495                                 const struct pipe_draw_start_count_bias *draws)
4496 {
4497    for (unsigned i = 0; i < indirect->draw_count; ++i) {
4498       const struct pipe_draw_indirect_info subindirect = {
4499          .buffer = indirect->buffer,
4500          .count_from_stream_output = indirect->count_from_stream_output,
4501          .offset = indirect->offset + (i * indirect->stride),
4502          .draw_count = 1,
4503       };
4504 
4505       pctx->draw_vbo(pctx, info, i, &subindirect, draws, 1);
4506    }
4507 }
4508 
4509 static void
4510 util_draw_multi_upload_indirect(struct pipe_context *pctx,
4511                                 const struct pipe_draw_info *info,
4512                                 const struct pipe_draw_indirect_info *indirect,
4513                                 const struct pipe_draw_start_count_bias *draws)
4514 {
4515    struct pipe_draw_indirect_info indirect_ = *indirect;
4516    u_upload_data(pctx->const_uploader, 0, 4, 4, &indirect->draw_count,
4517                  &indirect_.indirect_draw_count_offset,
4518                  &indirect_.indirect_draw_count);
4519 
4520    pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1);
4521 }
4522 
4523 static void
4524 agx_upload_draw_params(struct agx_batch *batch,
4525                        const struct pipe_draw_indirect_info *indirect,
4526                        const struct pipe_draw_start_count_bias *draws,
4527                        const struct pipe_draw_info *info)
4528 {
4529    if (indirect) {
4530       struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
4531       uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
4532       agx_batch_reads(batch, indirect_rsrc);
4533 
4534       /* To implement draw parameters, we use the last 2 words of the
4535        * indirect draw descriptor. Offset by 3 words for indexed draw (5
4536        * total) and 2 words for non-indexed (4 total).  See the layouts of
4537        * indexed vs non-indexed draw descriptors.
4538        *
4539        * This gives us a consistent layout
4540        *
4541        *    uint32_t first_vertex;
4542        *    uint32_t base_instance;
4543        *
4544        * and we can implement load_first_vertex & load_base_instance without
4545        * checking for indexing.
4546        */
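      /* For reference, the standard descriptor layouts in 32-bit words are
       *
       *    indexed:     {count, instance_count, first_index,
       *                  index_bias, start_instance}
       *    non-indexed: {count, instance_count, start, start_instance}
       *
       * so the last two words line up as first_vertex/base_instance in both.
       */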
4547       uint32_t offset = info->index_size ? 3 : 2;
4548       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
4549    } else {
4550       /* Upload just those two words. */
4551       uint32_t params[2] = {
4552          info->index_size ? draws->index_bias : draws->start,
4553          info->start_instance,
4554       };
4555 
4556       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
4557          agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
4558    }
4559 }
4560 
4561 static void
4562 agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
4563                  unsigned drawid_offset,
4564                  const struct pipe_draw_indirect_info *indirect,
4565                  const struct pipe_draw_start_count_bias *draws,
4566                  unsigned num_draws)
4567 {
4568    struct agx_device *dev = agx_device(ctx->base.screen);
4569    perf_debug(dev, "Tessellation");
4570 
4571    struct agx_uncompiled_shader *tcs = ctx->stage[MESA_SHADER_TESS_CTRL].shader;
4572    struct agx_uncompiled_shader *tes = ctx->stage[MESA_SHADER_TESS_EVAL].shader;
4573 
4574    assert(tes != NULL && "required with patches");
4575 
4576    unsigned patch_vertices = ctx->patch_vertices;
4577 
4578    /* OpenGL allows omitting the tcs; fill in a passthrough program if needed.
4579     * In principle, we could optimize this case, but I don't think it matters.
4580     */
4581    bool unbind_tcs_when_done = false;
4582    if (!tcs) {
4583       struct agx_uncompiled_shader *vs = ctx->stage[MESA_SHADER_VERTEX].shader;
4584 
4585       assert(patch_vertices >= 1 &&
4586              patch_vertices <= ARRAY_SIZE(vs->passthrough_tcs));
4587 
4588       if (!vs->passthrough_tcs[patch_vertices - 1]) {
4589          struct blob_reader reader;
4590          blob_reader_init(&reader, vs->early_serialized_nir.data,
4591                           vs->early_serialized_nir.size);
4592          nir_shader *vs_nir = nir_deserialize(NULL, &agx_nir_options, &reader);
4593          nir_shader *nir = nir_create_passthrough_tcs(&agx_nir_options, vs_nir,
4594                                                       patch_vertices);
4595          ralloc_free(vs_nir);
4596 
4597          /* Lower the tess level sysvals and gather info, since mesa/st won't do
4598           * either for us.
4599           */
4600          NIR_PASS(_, nir, nir_lower_system_values);
4601 
4602          nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4603 
4604          vs->passthrough_tcs[patch_vertices - 1] =
4605             pipe_shader_from_nir(&ctx->base, nir);
4606       }
4607 
4608       tcs = vs->passthrough_tcs[patch_vertices - 1];
4609       ctx->base.bind_tcs_state(&ctx->base, tcs);
4610       unbind_tcs_when_done = true;
4611    }
4612 
4613    unsigned in_vertices = draws->count;
4614    unsigned in_patches = in_vertices / patch_vertices;
4615 
4616    if (in_patches == 0)
4617       return;
4618 
4619    /* TCS invocation counter increments once per patch */
4620    agx_query_increment_cpu(
4621       ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
4622       in_patches);
4623 
4624    struct agx_batch *batch = agx_get_compute_batch(ctx);
4625    agx_batch_init_state(batch);
4626 
4627    struct pipe_resource *heap =
4628       pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL, PIPE_USAGE_DEFAULT,
4629                          1024 * 1024 * 128);
4630 
4631    uint64_t heap_gpu = agx_resource(heap)->bo->ptr.gpu;
4632    uint8_t *heap_cpu = agx_resource(heap)->bo->ptr.cpu;
4633 
4634    unsigned unrolled_patch_count = in_patches * info->instance_count;
4635 
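   /* heap_water bump-allocates out of the scratch heap: TCS outputs first,
    * then per-patch coordinate offsets and indirect draw descriptors, and
    * finally per-patch index buffers and domain-point coordinates as the CPU
    * tessellator runs.
    */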
4636    uint32_t heap_water = 0;
4637    uint32_t tcs_out_offs = heap_water;
4638    heap_water += ALIGN(unrolled_patch_count * tcs->tess.output_stride, 4);
4639 
4640    agx_batch_writes(batch, agx_resource(heap), 0);
4641    batch->incoherent_writes = true;
4642 
4643    uint64_t ib = 0;
4644    size_t ib_extent = 0;
4645 
4646    if (info->index_size)
4647       ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
4648 
4649    agx_upload_ia_params(batch, info, indirect, ib, ib_extent, 0);
4650    agx_upload_draw_params(batch, indirect, draws, info);
4651 
4652    /* Setup parameters */
4653    struct agx_tess_params tess_params = {
4654       .tcs_buffer = heap_gpu + tcs_out_offs,
4655       .input_patch_size = patch_vertices,
4656       .output_patch_size = tcs->tess.output_patch_size,
4657       .tcs_patch_constants = tcs->tess.nr_patch_outputs,
4658       .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
4659       .patch_coord_buffer = heap_gpu,
4660       .patches_per_instance = in_patches,
4661    };
4662 
4663    memcpy(&tess_params.tess_level_outer_default, ctx->default_outer_level,
4664           sizeof(ctx->default_outer_level));
4665    memcpy(&tess_params.tess_level_inner_default, ctx->default_inner_level,
4666           sizeof(ctx->default_inner_level));
4667 
4668    batch->uniforms.tess_params =
4669       agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params));
4670 
4671    /* Run VS+TCS as compute */
4672    agx_upload_vbos(batch);
4673    agx_update_vs(ctx, info->index_size);
4674    agx_update_tcs(ctx, info);
4675    /* XXX */
4676    ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
4677    ctx->stage[PIPE_SHADER_TESS_EVAL].dirty = ~0;
4678    agx_update_descriptors(batch, ctx->vs);
4679    agx_update_descriptors(batch, ctx->tcs);
4680 
4681    struct pipe_grid_info tcs_grid = {
4682       .block = {MAX2(patch_vertices, tcs->tess.output_patch_size), 1, 1},
4683       .grid = {in_patches, info->instance_count, 1},
4684       /* XXX */
4685       .variable_shared_mem = 32768,
4686    };
4687 
4688    agx_launch(batch, &tcs_grid, ctx->tcs, PIPE_SHADER_TESS_CTRL);
4689 
4690    agx_flush_all(ctx, "HACK");
4691    agx_sync_all(ctx, "HACK");
4692 
4693    /* Setup batch */
4694    batch = agx_get_batch(ctx);
4695 
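   /* These properties may be specified in either the TCS or the TES; MAX2
    * presumably picks whichever stage set them, since the "unspecified" enum
    * values are 0.
    */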
4696    enum tess_primitive_mode mode =
4697       MAX2(tcs->tess.primitive, tes->tess.primitive);
4698    enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
4699 
4700    enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
4701                                         ? PIPE_TESS_SPACING_EQUAL
4702                                      : spacing == TESS_SPACING_FRACTIONAL_ODD
4703                                         ? PIPE_TESS_SPACING_FRACTIONAL_ODD
4704                                         : PIPE_TESS_SPACING_FRACTIONAL_EVEN;
4705 
4706    bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
4707    enum mesa_prim in_prim = mode == TESS_PRIMITIVE_ISOLINES ? MESA_PRIM_LINES
4708                             : mode == TESS_PRIMITIVE_QUADS
4709                                ? MESA_PRIM_QUADS
4710                                : MESA_PRIM_TRIANGLES;
4711    enum mesa_prim out_prim = point_mode ? MESA_PRIM_POINTS
4712                              : mode == TESS_PRIMITIVE_ISOLINES
4713                                 ? MESA_PRIM_LINES
4714                                 : MESA_PRIM_TRIANGLES;
4715 
4716    struct pipe_tessellator *tess =
4717       p_tess_init(in_prim, pspacing, tes->tess.ccw, point_mode);
4718 
4719    struct pipe_tessellator_data data = {0};
4720 
4721    /* Mem allocate */
4722    uint32_t patch_coord_offs_offs = heap_water;
4723    tess_params.patch_coord_offs = heap_gpu + heap_water;
4724    heap_water += align(4 * unrolled_patch_count, 4);
4725 
4726    uint32_t draws_off = heap_water;
4727    uint32_t *patch_draws = (uint32_t *)(heap_cpu + heap_water);
4728    heap_water += align(sizeof(uint32_t) * 5 * unrolled_patch_count, 4);
4729 
4730    uint32_t *patch_offs = (uint32_t *)(heap_cpu + patch_coord_offs_offs);
4731 
4732    for (unsigned patch = 0; patch < unrolled_patch_count; ++patch) {
4733       float *addr =
4734          (float *)(heap_cpu + tcs_out_offs + tcs->tess.output_stride * patch);
4735 
4736       struct pipe_tessellation_factors factors = {
4737          .outer_tf = {addr[0], addr[1], addr[2], addr[3]},
4738          .inner_tf = {addr[4], addr[5]},
4739       };
4740       p_tessellate(tess, &factors, &data);
4741 
4742       /* Mem allocate indices */
4743       uint32_t index_off = heap_water;
4744       uint16_t *indices = (uint16_t *)(heap_cpu + heap_water);
4745       heap_water += align(sizeof(*indices) * data.num_indices, 4);
4746 
4747       for (unsigned idx = 0; idx < data.num_indices; ++idx) {
4748          indices[idx] = data.indices[idx];
4749       }
4750 
4751       /* Mem allocate patch coords */
4752       heap_water = align(heap_water, 8);
4753       patch_offs[patch] = heap_water / 8;
4754       float *patch_coords = (float *)(heap_cpu + heap_water);
4755       heap_water += align(8 * data.num_domain_points, 4);
4756 
4757       for (unsigned p = 0; p < data.num_domain_points; ++p) {
4758          patch_coords[2 * p + 0] = data.domain_points_u[p];
4759          patch_coords[2 * p + 1] = data.domain_points_v[p];
4760       }
4761       assert(data.num_indices < 32768);
4762       assert(data.num_domain_points < 8192);
4763 
4764       /* Generate a draw for the patch */
4765       uint32_t *desc = patch_draws + (patch * 5);
4766 
4767       desc[0] = data.num_indices;                   /* count */
4768       desc[1] = 1;                                  /* instance_count */
4769       desc[2] = index_off / sizeof(*indices);       /* start */
4770       desc[3] = patch * LIBAGX_TES_PATCH_ID_STRIDE; /* index_bias */
4771       desc[4] = 0;                                  /* start_instance */
4772 
4773       /* TES invocation counter increments once per tessellated vertex */
4774       agx_query_increment_cpu(
4775          ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS],
4776          data.num_domain_points);
4777    }
4778    p_tess_destroy(tess);
4779 
4780    /* Run TES as VS */
4781    agx_batch_init_state(batch);
4782    void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
4783    void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
4784    ctx->base.bind_vs_state(&ctx->base, tes_cso);
4785    ctx->in_tess = true;
4786 
4787    struct pipe_draw_info draw_info = {
4788       .mode = out_prim,
4789       .index_size = 2,
4790       .index.resource = heap,
4791       .instance_count = 1,
4792       .view_mask = info->view_mask,
4793    };
4794 
4795    /* Issue one indirect draw per tessellated patch, sourced from the heap */
4796    struct pipe_draw_indirect_info copy_indirect = {
4797       .buffer = heap,
4798       .offset = draws_off,
4799       .stride = 5 * sizeof(uint32_t),
4800       .draw_count = in_patches * info->instance_count,
4801    };
4802 
4803    batch->uniforms.tess_params =
4804       agx_pool_upload(&batch->pool, &tess_params, sizeof(tess_params));
4805 
4806    ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1);
4807 
4808    /* Restore vertex state */
4809    ctx->base.bind_vs_state(&ctx->base, vs_cso);
4810    ctx->in_tess = false;
4811 
4812    pipe_resource_reference(&heap, NULL);
4813 
4814    if (unbind_tcs_when_done) {
4815       ctx->base.bind_tcs_state(&ctx->base, NULL);
4816    }
4817 }
4818 
4819 /*
4820  * From the ARB_texture_barrier spec:
4821  *
4822  *  Specifically, the values of rendered fragments are undefined if any
4823  *  shader stage fetches texels and the same texels are written via fragment
4824  *  shader outputs, even if the reads and writes are not in the same Draw
4825  *  call, unless any of the following exceptions apply:
4826  *
4827  *  - The reads and writes are from/to disjoint sets of texels (after
4828  *    accounting for texture filtering rules).
4829  *
4830  *  - There is only a single read and write of each texel, and the read is in
4831  *    the fragment shader invocation that writes the same texel (e.g. using
4832  *    "texelFetch2D(sampler, ivec2(gl_FragCoord.xy), 0);").
4833  *
4834  *  - If a texel has been written, then in order to safely read the result
4835  *    a texel fetch must be in a subsequent Draw separated by the command
4836  *
4837  *      void TextureBarrier(void);
4838  *
4839  *    TextureBarrier() will guarantee that writes have completed and caches
4840  *    have been invalidated before subsequent Draws are executed."
4841  *
4842  * The wording is subtle, but we are not required to flush implicitly for
4843  * feedback loops, even though we're a tiler. What we are required to do is
4844  * decompress framebuffers involved in feedback loops, because otherwise
4845  * the hardware will race itself with exception #1, where we have a disjoint
4846  * group of texels that intersects a compressed tile being written out.
4847  */
4848 static void
4849 agx_legalize_feedback_loops(struct agx_context *ctx)
4850 {
4851    /* Trust that u_blitter knows what it's doing */
4852    if (ctx->blitter->running)
4853       return;
4854 
4855    for (unsigned stage = 0; stage < ARRAY_SIZE(ctx->stage); ++stage) {
4856       if (!(ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE))
4857          continue;
4858 
4859       for (unsigned i = 0; i < ctx->stage[stage].texture_count; ++i) {
4860          if (!ctx->stage[stage].textures[i])
4861             continue;
4862 
4863          struct agx_resource *rsrc = ctx->stage[stage].textures[i]->rsrc;
4864 
4865          for (unsigned cb = 0; cb < ctx->framebuffer.nr_cbufs; ++cb) {
4866             if (ctx->framebuffer.cbufs[cb] &&
4867                 agx_resource(ctx->framebuffer.cbufs[cb]->texture) == rsrc) {
4868 
4869                if (rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED) {
4870                   /* Decompress if we can and shadow if we can't. */
4871                   if (rsrc->base.bind & PIPE_BIND_SHARED)
4872                      unreachable("TODO");
4873                   else
4874                      agx_decompress(ctx, rsrc, "Texture feedback loop");
4875                }
4876 
4877                /* Not required by the spec, just for debug */
4878                if (agx_device(ctx->base.screen)->debug & AGX_DBG_FEEDBACK)
4879                   agx_flush_writer(ctx, rsrc, "Feedback loop");
4880             }
4881          }
4882       }
4883    }
4884 }
4885 
4886 static void
4887 agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
4888              unsigned drawid_offset,
4889              const struct pipe_draw_indirect_info *indirect,
4890              const struct pipe_draw_start_count_bias *draws, unsigned num_draws)
4891 {
4892    struct agx_context *ctx = agx_context(pctx);
4893 
4894    if (unlikely(!agx_render_condition_check(ctx)))
4895       return;
4896 
4897    if (num_draws > 1) {
4898       util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
4899       return;
4900    }
4901 
4902    if (indirect && indirect->draw_count > 1 && !indirect->indirect_draw_count) {
4903       assert(drawid_offset == 0);
4904       assert(num_draws == 1);
4905 
4906       util_draw_multi_unroll_indirect(pctx, info, indirect, draws);
4907       return;
4908    }
4909 
4910    if (indirect && indirect->count_from_stream_output) {
4911       agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect);
4912       return;
4913    }
4914 
4915    /* TODO: stop cheating */
4916    if (indirect && indirect->indirect_draw_count) {
4917       perf_debug_ctx(ctx, "multi-draw indirect");
4918       util_draw_indirect(pctx, info, indirect);
4919       return;
4920    }
4921 
4922    /* TODO: stop cheating */
4923    if (info->mode == MESA_PRIM_PATCHES && indirect) {
4924       perf_debug_ctx(ctx, "indirect tessellation");
4925       util_draw_indirect(pctx, info, indirect);
4926       return;
4927    }
4928 
4929    /* TODO: stop cheating */
4930    if (ctx->active_queries && !ctx->active_draw_without_restart &&
4931        (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
4932         ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) &&
4933        indirect) {
4934 
4935       perf_debug_ctx(ctx, "indirect IA queries");
4936       util_draw_indirect(pctx, info, indirect);
4937       return;
4938    }
4939 
4940    if (info->mode == MESA_PRIM_PATCHES) {
4941       agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
4942       return;
4943    }
4944 
4945    bool xfb_passthrough = false;
4946    if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
4947       agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
4948                                num_draws, xfb_passthrough);
4949       return;
4950    }
4951 
4952    agx_legalize_feedback_loops(ctx);
4953 
4954    /* Only the rasterization stream counts */
4955    if (ctx->active_queries && ctx->prims_generated[0] &&
4956        !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
4957 
4958       assert(!indirect && "we force a passthrough GS for this");
4959       agx_primitives_update_direct(ctx, info, draws);
4960    }
4961 
4962    if (ctx->active_queries && !ctx->active_draw_without_restart &&
4963        (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
4964         ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) {
4965       assert(!indirect && "lowered");
4966       agx_ia_update_direct(ctx, info, draws);
4967    }
4968 
4969    struct agx_batch *batch = agx_get_batch(ctx);
4970 
4971    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
4972        info->index_size) {
4973 
4974       agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
4975       return;
4976    }
4977 
4978    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
4979 
4980    uint64_t ib = 0;
4981    size_t ib_extent = 0;
4982 
4983    if (info->index_size) {
4984       ib =
4985          agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
4986    }
4987 
4988 #ifndef NDEBUG
4989    if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
4990       agx_dirty_all(ctx);
4991 #endif
4992 
4993    agx_batch_init_state(batch);
4994 
4995    /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
4996     * agx_update_vs/agx_update_fs, which specialize based on primitive.
4997     */
4998    enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
4999    if (reduced_prim != batch->reduced_prim)
5000       ctx->dirty |= AGX_DIRTY_PRIM;
5001    batch->reduced_prim = reduced_prim;
5002 
5003    /* Update shaders first so we can use them after */
5004    if (agx_update_vs(ctx, info->index_size)) {
5005       ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
5006       ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0;
5007 
5008       agx_batch_add_bo(batch, ctx->vs->bo);
5009    } else if (ctx->stage[PIPE_SHADER_VERTEX].dirty ||
5010               (ctx->dirty & AGX_DIRTY_VERTEX))
5011       ctx->dirty |= AGX_DIRTY_VS;
5012 
5013    agx_update_gs(ctx, info, indirect);
5014 
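   /* For geometry shaders, reserve space that the geometry pipeline fills in
    * with the indirect draw descriptor consumed by the rasterization pass
    * below (see indirect_gs); 64 bytes presumably leaves alignment headroom
    * over the 5-word descriptor.
    */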
5015    if (ctx->gs) {
5016       batch->geom_indirect = agx_pool_alloc_aligned_with_bo(
5017                                 &batch->pool, 64, 4, &batch->geom_indirect_bo)
5018                                 .gpu;
5019 
5020       batch->uniforms.geometry_params =
5021          agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
5022 
5023       agx_batch_add_bo(batch, ctx->gs->bo);
5024       agx_batch_add_bo(batch, ctx->gs->gs_copy->bo);
5025    }
5026 
5027    /* Set draw ID */
5028    if (ctx->vs->info.uses_draw_id) {
5029       batch->uniforms.draw_id = drawid_offset;
5030 
5031       ctx->dirty |= AGX_DIRTY_VS;
5032    }
5033 
5034    if (agx_update_fs(batch)) {
5035       ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
5036       ctx->stage[PIPE_SHADER_FRAGMENT].dirty = ~0;
5037 
5038       agx_batch_add_bo(batch, ctx->fs->bo);
5039    } else if ((ctx->stage[PIPE_SHADER_FRAGMENT].dirty) ||
5040               (ctx->dirty & (AGX_DIRTY_BLEND_COLOR | AGX_DIRTY_SAMPLE_MASK))) {
5041       ctx->dirty |= AGX_DIRTY_FS;
5042    }
5043 
5044    if (ctx->vs->info.uses_base_param || ctx->gs) {
5045       agx_upload_draw_params(batch, indirect, draws, info);
5046 
5047       batch->uniforms.is_indexed_draw = (info->index_size > 0);
5048       ctx->dirty |= AGX_DIRTY_VS;
5049    }
5050 
5051    agx_update_descriptors(batch, ctx->vs);
5052    agx_update_descriptors(batch, ctx->gs);
5053    agx_update_descriptors(batch, ctx->fs);
5054 
5055    struct agx_compiled_shader *prerast = ctx->gs ? ctx->gs->gs_copy : ctx->vs;
5056 
5057    batch->uniforms.layer_id_written =
5058       (prerast && prerast->info.writes_layer_viewport) ? ~0 : 0;
5059 
5060    if (IS_DIRTY(VS) || IS_DIRTY(FS) || ctx->gs || IS_DIRTY(VERTEX) ||
5061        IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
5062        IS_DIRTY(RS) || IS_DIRTY(PRIM)) {
5063 
5064       if (IS_DIRTY(VERTEX)) {
5065          agx_upload_vbos(batch);
5066       }
5067 
5068       if (IS_DIRTY(BLEND_COLOR)) {
5069          memcpy(batch->uniforms.blend_constant, &ctx->blend_color,
5070                 sizeof(ctx->blend_color));
5071       }
5072 
5073       if (IS_DIRTY(RS)) {
5074          batch->uniforms.fixed_point_size = ctx->rast->base.point_size;
5075       }
5076 
5077       if (IS_DIRTY(QUERY)) {
5078          for (unsigned i = 0; i < ARRAY_SIZE(ctx->pipeline_statistics); ++i) {
5079             struct agx_query *query = ctx->pipeline_statistics[i];
5080             batch->uniforms.pipeline_statistics[i] =
5081                query ? agx_get_query_address(batch, query) : 0;
5082          }
5083       }
5084 
5085       if (IS_DIRTY(POLY_STIPPLE)) {
5086          STATIC_ASSERT(sizeof(ctx->poly_stipple) == 32 * 4);
5087 
5088          batch->uniforms.polygon_stipple = agx_pool_upload_aligned(
5089             &batch->pool, ctx->poly_stipple, sizeof(ctx->poly_stipple), 4);
5090       }
5091 
5092       agx_upload_uniforms(batch);
5093    }
5094 
5095    struct pipe_draw_info info_gs;
5096    struct pipe_draw_indirect_info indirect_gs;
5097 
5098    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
5099    struct agx_resource indirect_rsrc = {.bo = batch->geom_indirect_bo};
5100 
5101    if (ctx->gs) {
5102       /* Launch the pre-rasterization parts of the geometry shader */
5103       agx_launch_gs_prerast(batch, info, draws, indirect);
5104 
5105       if (ctx->rast->base.rasterizer_discard)
5106          return;
5107 
5108       /* Setup to rasterize the GS results */
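      /* The pre-rast GS wrote an index buffer into ctx->heap; presumably
       * output strips are separated with ~0 restart indices, hence restart is
       * enabled for everything except points.
       */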
5109       info_gs = (struct pipe_draw_info){
5110          .mode = ctx->gs->gs_output_mode,
5111          .index_size = 4,
5112          .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS,
5113          .restart_index = ~0,
5114          .index.resource = ctx->heap,
5115          .instance_count = 1,
5116          .view_mask = info->view_mask,
5117       };
5118 
5119       indirect_gs = (struct pipe_draw_indirect_info){
5120          .draw_count = 1,
5121          .buffer = &indirect_rsrc.base,
5122          .offset = batch->geom_indirect - indirect_rsrc.bo->ptr.gpu,
5123       };
5124 
5125       info = &info_gs;
5126       indirect = &indirect_gs;
5127 
5128       /* TODO: Deduplicate? */
5129       batch->reduced_prim = u_reduced_prim(info->mode);
5130       ctx->dirty |= AGX_DIRTY_PRIM;
5131 
5132       if (info_gs.index_size) {
5133          ib = agx_resource(ctx->heap)->bo->ptr.gpu;
5134          ib_extent = agx_resource(ctx->heap)->bo->size;
5135       } else {
5136          ib = 0;
5137          ib_extent = 0;
5138       }
5139 
5140       /* We need to reemit geometry descriptors since the txf sampler may change
5141        * between the GS prepass and the GS rast program.
5142        */
5143       agx_update_descriptors(batch, ctx->gs->gs_copy);
5144    }
5145 
5146    assert((!indirect || !indirect->indirect_draw_count) && "multidraw handled");
5147 
5148    /* Update batch masks based on current state */
5149    if (ctx->dirty & AGX_DIRTY_BLEND) {
5150       /* TODO: Any point to tracking load? */
5151       batch->draw |= ctx->blend->store;
5152       batch->resolve |= ctx->blend->store;
5153    }
5154 
5155    if (ctx->dirty & AGX_DIRTY_ZS) {
5156       batch->load |= ctx->zs->load;
5157       batch->draw |= ctx->zs->store;
5158       batch->resolve |= ctx->zs->store;
5159    }
5160 
5161    /* When we approach the end of a command buffer, cycle it out for a new one.
5162     * We only need to do this once per draw as long as we conservatively
5163     * estimate the maximum bytes of VDM commands that this draw will emit.
5164     */
5165    agx_ensure_vdm_cmdbuf_has_space(
5166       batch,
5167       (AGX_VDM_STATE_LENGTH * 2) + (AGX_PPP_STATE_LENGTH * MAX_PPP_UPDATES) +
5168          AGX_VDM_STATE_RESTART_INDEX_LENGTH +
5169          AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH +
5170          AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH +
5171          AGX_VDM_STATE_VERTEX_OUTPUTS_LENGTH +
5172          AGX_VDM_STATE_VERTEX_UNKNOWN_LENGTH + 4 /* padding */ +
5173          AGX_INDEX_LIST_LENGTH + AGX_INDEX_LIST_BUFFER_LO_LENGTH +
5174          AGX_INDEX_LIST_COUNT_LENGTH + AGX_INDEX_LIST_INSTANCES_LENGTH +
5175          AGX_INDEX_LIST_START_LENGTH + AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
5176 
5177    uint8_t *out = agx_encode_state(batch, batch->vdm.current);
5178 
5179    if (info->index_size) {
5180       agx_push(out, VDM_STATE, cfg)
5181          cfg.restart_index_present = true;
5182 
5183       agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
5184          cfg.value = info->restart_index;
5185    }
5186 
5187    agx_push(out, INDEX_LIST, cfg) {
5188       cfg.primitive = agx_primitive_for_pipe(info->mode);
5189 
5190       if (indirect != NULL) {
5191          cfg.indirect_buffer_present = true;
5192       } else {
5193          cfg.instance_count_present = true;
5194          cfg.index_count_present = true;
5195          cfg.start_present = true;
5196       }
5197 
5198       if (info->index_size) {
5199          cfg.restart_enable = info->primitive_restart;
5200          cfg.index_buffer_hi = (ib >> 32);
5201          cfg.index_size = agx_translate_index_size(info->index_size);
5202          cfg.index_buffer_present = true;
5203          cfg.index_buffer_size_present = true;
5204       }
5205    }
5206 
5207    if (info->index_size) {
5208       agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
5209          cfg.buffer_lo = ib & BITFIELD_MASK(32);
5210       }
5211    }
5212 
5213    if (indirect) {
5214       struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
5215       uint64_t address = indirect_rsrc->bo->ptr.gpu + indirect->offset;
5216 
5217       agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
5218          cfg.address_hi = address >> 32;
5219          cfg.address_lo = address & BITFIELD_MASK(32);
5220       }
5221    } else {
5222       agx_push(out, INDEX_LIST_COUNT, cfg)
5223          cfg.count = draws->count;
5224 
5225       agx_push(out, INDEX_LIST_INSTANCES, cfg)
5226          cfg.count = info->instance_count;
5227 
5228       agx_push(out, INDEX_LIST_START, cfg) {
5229          cfg.start = info->index_size ? draws->index_bias : draws->start;
5230       }
5231    }
5232 
5233    if (info->index_size) {
5234       agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
5235          cfg.size = ib_extent;
5236       }
5237    }
5238 
5239    batch->vdm.current = out;
5240    assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
5241           "Failed to reserve sufficient space in encoder");
5242    agx_dirty_reset_graphics(ctx);
5243 
5244    assert(batch == agx_get_batch(ctx) && "batch should not change under us");
5245 
5246    batch->draws++;
5247 
5248    /* The scissor/zbias arrays are indexed with 16-bit integers, imposing a
5249     * maximum of UINT16_MAX descriptors. Flush if the next draw would overflow.
5250     */
5251    if (unlikely(
5252           (((batch->scissor.size / AGX_SCISSOR_LENGTH) + AGX_MAX_VIEWPORTS) >
5253            UINT16_MAX) ||
5254           (batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH) >= UINT16_MAX)) {
5255       agx_flush_batch_for_reason(ctx, batch, "Scissor/depth bias overflow");
5256    } else if (unlikely(batch->draws > 100000)) {
5257       /* Mostly so drawoverhead doesn't OOM */
5258       agx_flush_batch_for_reason(ctx, batch, "Absurd number of draws");
5259    } else if (unlikely(batch->sampler_heap.count >
5260                        (AGX_SAMPLER_HEAP_SIZE - (PIPE_MAX_SAMPLERS * 6)))) {
5261       agx_flush_batch_for_reason(ctx, batch, "Sampler heap overflow");
5262    }
5263 }
5264 
5265 static void
5266 agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
5267 {
5268    struct agx_context *ctx = agx_context(pipe);
5269 
5270    /* Framebuffer fetch is coherent, so barriers are a no-op. */
5271    if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER)
5272       return;
5273 
5274    agx_flush_all(ctx, "Texture barrier");
5275 }
5276 
5277 void
5278 agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
5279            struct agx_compiled_shader *cs, enum pipe_shader_type stage)
5280 {
5281    struct agx_context *ctx = batch->ctx;
5282    struct agx_device *dev = agx_device(ctx->base.screen);
5283 
5284    /* To implement load_num_workgroups, the number of workgroups needs to be
5285     * available in GPU memory. This is either the indirect buffer, or just a
5286     * buffer we upload ourselves if not indirect.
5287     */
5288    if (info->indirect) {
5289       struct agx_resource *indirect = agx_resource(info->indirect);
5290       agx_batch_reads(batch, indirect);
5291 
5292       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] =
5293          indirect->bo->ptr.gpu + info->indirect_offset;
5294    } else {
5295       static_assert(sizeof(info->grid) == 12,
5296                     "matches indirect dispatch buffer");
5297 
5298       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = agx_pool_upload_aligned(
5299          &batch->pool, info->grid, sizeof(info->grid), 4);
5300    }
5301 
5302    util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
5303       if (!*res)
5304          continue;
5305 
5306       struct agx_resource *buffer = agx_resource(*res);
5307       agx_batch_writes(batch, buffer, 0);
5308       batch->incoherent_writes = true;
5309    }
5310 
5311    agx_batch_add_bo(batch, cs->bo);
5312 
5313    agx_update_descriptors(batch, cs);
5314    agx_upload_uniforms(batch);
5315 
5316    // TODO: This is broken.
5317    size_t subgroups_per_core = 0;
5318 #if 0
5319    if (!info->indirect) {
5320       size_t subgroups_per_workgroup =
5321          DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 32);
5322       subgroups_per_core =
5323          local_workgroups *
5324          DIV_ROUND_UP(info->grid[0] * info->grid[1] * info->grid[2],
5325                      ctx->scratch_cs.num_cores);
5326    }
5327 #endif
5328 
5329    /* TODO: Ensure space if we allow multiple kernels in a batch */
5330    uint8_t *out = batch->cdm.current;
5331 
5332    agx_push(out, CDM_LAUNCH, cfg) {
5333       if (info->indirect)
5334          cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
5335       else
5336          cfg.mode = AGX_CDM_MODE_DIRECT;
5337 
5338       cfg.uniform_register_count = cs->info.push_count;
5339       cfg.preshader_register_count = cs->info.nr_preamble_gprs;
5340       cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs);
5341       cfg.sampler_state_register_count =
5342          translate_sampler_state_count(ctx, cs, stage);
5343       cfg.pipeline =
5344          agx_build_pipeline(batch, cs, PIPE_SHADER_COMPUTE,
5345                             info->variable_shared_mem, subgroups_per_core);
5346    }
5347 
5348    /* Added in G14X */
5349    if (dev->params.gpu_generation >= 14 && dev->params.num_clusters_total > 1) {
5350       agx_push(out, CDM_UNK_G14X, cfg)
5351          ;
5352    }
5353 
5354    if (info->indirect) {
5355       agx_push(out, CDM_INDIRECT, cfg) {
5356          cfg.address_hi = batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] >> 32;
5357          cfg.address_lo =
5358             batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] & BITFIELD64_MASK(32);
5359       }
5360    } else {
5361       uint32_t size[3];
5362       for (unsigned d = 0; d < 3; ++d) {
5363          size[d] = ((info->grid[d] - 1) * info->block[d]) +
5364                    (info->last_block[d] ?: info->block[d]);
5365       }
5366 
5367       agx_push(out, CDM_GLOBAL_SIZE, cfg) {
5368          cfg.x = size[0];
5369          cfg.y = size[1];
5370          cfg.z = size[2];
5371       }
5372    }
5373 
5374    agx_push(out, CDM_LOCAL_SIZE, cfg) {
5375       cfg.x = info->block[0];
5376       cfg.y = info->block[1];
5377       cfg.z = info->block[2];
5378    }
5379 
5380    agx_push(out, CDM_BARRIER, cfg) {
5381       cfg.unk_5 = true;
5382       cfg.unk_6 = true;
5383       cfg.unk_8 = true;
5384       // cfg.unk_11 = true;
5385       // cfg.unk_20 = true;
5386       if (dev->params.num_clusters_total > 1) {
5387          // cfg.unk_24 = true;
5388          if (dev->params.gpu_generation == 13) {
5389             cfg.unk_4 = true;
5390             // cfg.unk_26 = true;
5391          }
5392       }
5393 
5394       /* With multiple launches in the same CDM stream, we can get cache
5395        * coherency (or sync?) issues. We hit this with blits, which need the
5396        * PBE cache flushed and the texture cache invalidated between
5397        * dispatches. Until we know exactly what each bit means, set these
5398        * after every launch to be safe. We can revisit in the future once we
5399        * figure out what the bits mean.
5400        */
5401       cfg.unk_0 = true;
5402       cfg.unk_1 = true;
5403       cfg.unk_2 = true;
5404       cfg.usc_cache_inval = true;
5405       cfg.unk_4 = true;
5406       cfg.unk_5 = true;
5407       cfg.unk_6 = true;
5408       cfg.unk_7 = true;
5409       cfg.unk_8 = true;
5410       cfg.unk_9 = true;
5411       cfg.unk_10 = true;
5412       cfg.unk_11 = true;
5413       cfg.unk_12 = true;
5414       cfg.unk_13 = true;
5415       cfg.unk_14 = true;
5416       cfg.unk_15 = true;
5417       cfg.unk_16 = true;
5418       cfg.unk_17 = true;
5419       cfg.unk_18 = true;
5420       cfg.unk_19 = true;
5421    }
5422 
5423    batch->cdm.current = out;
5424    assert(batch->cdm.current <= batch->cdm.end &&
5425           "Failed to reserve sufficient space in encoder");
5426 }
5427 
5428 static void
5429 agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
5430 {
5431    struct agx_context *ctx = agx_context(pipe);
5432    if (unlikely(!ctx->compute_blitter.active &&
5433                 !agx_render_condition_check(ctx)))
5434       return;
5435 
5436    /* Increment the pipeline stats query.
5437     *
5438     * TODO: Use the hardware counter for this, or at least an auxiliary compute
5439     * job so it doesn't stall.
5440     *
5441     * This has to happen before getting the batch, because it will invalidate
5442     * the batch due to the stall.
5443     */
5444    if (ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]) {
5445       uint32_t grid[3] = {info->grid[0], info->grid[1], info->grid[2]};
5446       if (info->indirect) {
5447          perf_debug_ctx(ctx, "Emulated indirect compute invocation query");
5448          pipe_buffer_read(pipe, info->indirect, info->indirect_offset,
5449                           sizeof(grid), grid);
5450       }
5451 
5452       unsigned workgroups = grid[0] * grid[1] * grid[2];
5453       unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
5454       unsigned count = workgroups * blocksize;
5455 
5456       agx_query_increment_cpu(
5457          ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS], count);
5458    }
5459 
5460    struct agx_batch *batch = agx_get_compute_batch(ctx);
5461    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
5462 
5463    agx_batch_init_state(batch);
5464 
5465    struct agx_uncompiled_shader *uncompiled =
5466       ctx->stage[PIPE_SHADER_COMPUTE].shader;
5467 
5468    /* There is exactly one variant, get it */
5469    struct agx_compiled_shader *cs =
5470       _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;
5471 
5472    agx_launch(batch, info, cs, PIPE_SHADER_COMPUTE);
5473 
5474    /* TODO: Dirty tracking? */
5475    agx_dirty_all(ctx);
5476 
5477    batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = 0;
5478 
5479    /* If the next dispatch might overflow, flush now. TODO: If this is ever hit
5480     * in practice, we can use CDM stream links.
5481     */
5482    size_t dispatch_upper_bound =
5483       AGX_CDM_LAUNCH_LENGTH + AGX_CDM_UNK_G14X_LENGTH +
5484       AGX_CDM_INDIRECT_LENGTH + AGX_CDM_GLOBAL_SIZE_LENGTH +
5485       AGX_CDM_LOCAL_SIZE_LENGTH + AGX_CDM_BARRIER_LENGTH;
5486 
5487    if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end)
5488       agx_flush_batch_for_reason(ctx, batch, "CDM overfull");
5489 }
5490 
5491 static void
5492 agx_set_global_binding(struct pipe_context *pipe, unsigned first,
5493                        unsigned count, struct pipe_resource **resources,
5494                        uint32_t **handles)
5495 {
5496    struct agx_context *ctx = agx_context(pipe);
5497    unsigned old_size =
5498       util_dynarray_num_elements(&ctx->global_buffers, *resources);
5499 
5500    if (old_size < first + count) {
5501       /* we are screwed no matter what */
5502       if (!util_dynarray_grow(&ctx->global_buffers, *resources,
5503                               (first + count) - old_size))
5504          unreachable("out of memory");
5505 
5506       for (unsigned i = old_size; i < first + count; i++)
5507          *util_dynarray_element(&ctx->global_buffers, struct pipe_resource *,
5508                                 i) = NULL;
5509    }
5510 
5511    for (unsigned i = 0; i < count; ++i) {
5512       struct pipe_resource **res = util_dynarray_element(
5513          &ctx->global_buffers, struct pipe_resource *, first + i);
5514       if (resources && resources[i]) {
5515          pipe_resource_reference(res, resources[i]);
5516 
5517          /* The handle points to uint32_t, but 64 bits of storage are
5518           * allocated behind it. On input it holds a byte offset into the
5519           * buffer; we must preserve that offset while patching in the
5520           * GPU address. */
5521          uint64_t addr = 0;
5522          struct agx_resource *rsrc = agx_resource(resources[i]);
5523 
5524          memcpy(&addr, handles[i], sizeof(addr));
5525          addr += rsrc->bo->ptr.gpu;
5526          memcpy(handles[i], &addr, sizeof(addr));
5527       } else {
5528          pipe_resource_reference(res, NULL);
5529       }
5530    }
5531 }
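
/* Example of the handle contract implemented above (illustrative caller, not
 * real driver or frontend code): the caller allocates 64 bits of storage per
 * handle, pre-loads it with a byte offset into the buffer, and reads an
 * absolute GPU address back out after the call.
 *
 *    uint64_t storage = 16;                        // offset into buf
 *    uint32_t *handle = (uint32_t *)&storage;
 *    pipe->set_global_binding(pipe, 0, 1, &buf, &handle);
 *    // storage now holds buf's GPU address plus 16
 */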
5532 
5533 void agx_init_state_functions(struct pipe_context *ctx);
5534 
5535 void
5536 agx_init_state_functions(struct pipe_context *ctx)
5537 {
5538    ctx->create_blend_state = agx_create_blend_state;
5539    ctx->create_depth_stencil_alpha_state = agx_create_zsa_state;
5540    ctx->create_fs_state = agx_create_shader_state;
5541    ctx->create_rasterizer_state = agx_create_rs_state;
5542    ctx->create_sampler_state = agx_create_sampler_state;
5543    ctx->create_sampler_view = agx_create_sampler_view;
5544    ctx->create_surface = agx_create_surface;
5545    ctx->create_vertex_elements_state = agx_create_vertex_elements;
5546    ctx->create_vs_state = agx_create_shader_state;
5547    ctx->create_gs_state = agx_create_shader_state;
5548    ctx->create_tcs_state = agx_create_shader_state;
5549    ctx->create_tes_state = agx_create_shader_state;
5550    ctx->create_compute_state = agx_create_compute_state;
5551    ctx->bind_blend_state = agx_bind_blend_state;
5552    ctx->bind_depth_stencil_alpha_state = agx_bind_zsa_state;
5553    ctx->bind_sampler_states = agx_bind_sampler_states;
5554    ctx->bind_fs_state = agx_bind_fs_state;
5555    ctx->bind_rasterizer_state = agx_bind_rasterizer_state;
5556    ctx->bind_vertex_elements_state = agx_bind_vertex_elements_state;
5557    ctx->bind_vs_state = agx_bind_vs_state;
5558    ctx->bind_gs_state = agx_bind_gs_state;
5559    ctx->bind_tcs_state = agx_bind_tcs_state;
5560    ctx->bind_tes_state = agx_bind_tes_state;
5561    ctx->bind_compute_state = agx_bind_cs_state;
5562    ctx->delete_blend_state = agx_delete_state;
5563    ctx->delete_depth_stencil_alpha_state = agx_delete_state;
5564    ctx->delete_fs_state = agx_delete_shader_state;
5565    ctx->delete_compute_state = agx_delete_shader_state;
5566    ctx->delete_rasterizer_state = agx_delete_state;
5567    ctx->delete_sampler_state = agx_delete_sampler_state;
5568    ctx->delete_vertex_elements_state = agx_delete_state;
5569    ctx->delete_vs_state = agx_delete_shader_state;
5570    ctx->delete_gs_state = agx_delete_shader_state;
5571    ctx->delete_tcs_state = agx_delete_shader_state;
5572    ctx->delete_tes_state = agx_delete_shader_state;
5573    ctx->set_blend_color = agx_set_blend_color;
5574    ctx->set_clip_state = agx_set_clip_state;
5575    ctx->set_constant_buffer = agx_set_constant_buffer;
5576    ctx->set_shader_buffers = agx_set_shader_buffers;
5577    ctx->set_shader_images = agx_set_shader_images;
5578    ctx->set_sampler_views = agx_set_sampler_views;
5579    ctx->set_framebuffer_state = agx_set_framebuffer_state;
5580    ctx->set_polygon_stipple = agx_set_polygon_stipple;
5581    ctx->set_patch_vertices = agx_set_patch_vertices;
5582    ctx->set_sample_mask = agx_set_sample_mask;
5583    ctx->set_scissor_states = agx_set_scissor_states;
5584    ctx->set_stencil_ref = agx_set_stencil_ref;
5585    ctx->set_vertex_buffers = agx_set_vertex_buffers;
5586    ctx->set_viewport_states = agx_set_viewport_states;
5587    ctx->sampler_view_destroy = agx_sampler_view_destroy;
5588    ctx->surface_destroy = agx_surface_destroy;
5589    ctx->draw_vbo = agx_draw_vbo;
5590    ctx->launch_grid = agx_launch_grid;
5591    ctx->set_global_binding = agx_set_global_binding;
5592    ctx->texture_barrier = agx_texture_barrier;
5593    ctx->get_compute_state_info = agx_get_compute_state_info;
5594    ctx->set_tess_state = agx_set_tess_state;
5595 }
5596