1 /*
2  * Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3  * Copyright (C) 2018 Alyssa Rosenzweig
4  * Copyright (C) 2020 Collabora Ltd.
5  * Copyright © 2017 Intel Corporation
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24  * SOFTWARE.
25  */
26 
27 #include "gallium/auxiliary/util/u_blend.h"
28 #include "pipe/p_defines.h"
29 #include "pipe/p_state.h"
30 #include "util/macros.h"
31 #include "util/u_draw.h"
32 #include "util/u_helpers.h"
33 #include "util/u_memory.h"
34 #include "util/u_prim.h"
35 #include "util/u_sample_positions.h"
36 #include "util/u_vbuf.h"
37 #include "util/u_viewport.h"
38 
39 #include "decode.h"
40 
41 #include "genxml/gen_macros.h"
42 
43 #include "pan_mod_conv_cso.h"
44 #include "pan_blend.h"
45 #include "pan_bo.h"
46 #include "pan_cmdstream.h"
47 #include "pan_context.h"
48 #include "pan_csf.h"
49 #include "pan_fb_preload.h"
50 #include "pan_format.h"
51 #include "pan_indirect_dispatch.h"
52 #include "pan_jm.h"
53 #include "pan_job.h"
54 #include "pan_pool.h"
55 #include "pan_resource.h"
56 #include "pan_samples.h"
57 #include "pan_shader.h"
58 #include "pan_texture.h"
59 #include "pan_util.h"
60 
61 /* JOBX() is used to select the job backend helpers to call from generic
62  * functions. */
63 #if PAN_ARCH <= 9
64 #define JOBX(__suffix) GENX(jm_##__suffix)
65 #elif PAN_ARCH <= 10
66 #define JOBX(__suffix) GENX(csf_##__suffix)
67 #else
68 #error "Unsupported arch"
69 #endif
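/* For illustration (made-up suffix): a call written as JOBX(emit_thing)()
 * expands to GENX(jm_emit_thing)() on v9 and earlier, and to
 * GENX(csf_emit_thing)() on v10. */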
70 
71 struct panfrost_sampler_state {
72    struct pipe_sampler_state base;
73    struct mali_sampler_packed hw;
74 };
75 
76 /* Misnomer: Sampler view corresponds to textures, not samplers */
77 
78 struct panfrost_sampler_view {
79    struct pipe_sampler_view base;
80    struct panfrost_pool_ref state;
81    struct mali_texture_packed bifrost_descriptor;
82    uint64_t texture_bo;
83    uint64_t texture_size;
84    uint64_t modifier;
85 
86    /* Pool used to allocate the descriptor. If NULL, defaults to the global
87     * descriptor pool. Can be set for short lived descriptors, useful for
88     * shader images on Valhall.
89     */
90    struct panfrost_pool *pool;
91 };
92 
93 /* Statically assert that PIPE_* enums match the hardware enums.
94  * (As long as they match, we don't need to translate them.)
95  */
96 static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
97 static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
98 static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
99 static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
100 static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
101 static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
102 static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
103 static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");
104 
105 static inline enum mali_sample_pattern
106 panfrost_sample_pattern(unsigned samples)
107 {
108    switch (samples) {
109    case 1:
110       return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
111    case 4:
112       return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
113    case 8:
114       return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
115    case 16:
116       return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
117    default:
118       unreachable("Unsupported sample count");
119    }
120 }
121 
122 static unsigned
123 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
124 {
125    /* CLAMP is only supported on Midgard, where it is broken for nearest
126     * filtering. Use CLAMP_TO_EDGE in that case.
127     */
128 
129    switch (w) {
130    case PIPE_TEX_WRAP_REPEAT:
131       return MALI_WRAP_MODE_REPEAT;
132    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
133       return MALI_WRAP_MODE_CLAMP_TO_EDGE;
134    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
135       return MALI_WRAP_MODE_CLAMP_TO_BORDER;
136    case PIPE_TEX_WRAP_MIRROR_REPEAT:
137       return MALI_WRAP_MODE_MIRRORED_REPEAT;
138    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
139       return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
140    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
141       return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
142 
143 #if PAN_ARCH <= 5
144    case PIPE_TEX_WRAP_CLAMP:
145       return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE
146                            : MALI_WRAP_MODE_CLAMP;
147    case PIPE_TEX_WRAP_MIRROR_CLAMP:
148       return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE
149                            : MALI_WRAP_MODE_MIRRORED_CLAMP;
150 #endif
151 
152    default:
153       unreachable("Invalid wrap");
154    }
155 }
156 
157 /* The hardware compares in the wrong order, so we have to flip before
158  * encoding. Yes, really. */
159 
160 static enum mali_func
161 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
162 {
163    return !cso->compare_mode
164              ? MALI_FUNC_NEVER
165              : panfrost_flip_compare_func((enum mali_func)cso->compare_func);
166 }
167 
168 static enum mali_mipmap_mode
169 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
170 {
171    switch (f) {
172    case PIPE_TEX_MIPFILTER_NEAREST:
173       return MALI_MIPMAP_MODE_NEAREST;
174    case PIPE_TEX_MIPFILTER_LINEAR:
175       return MALI_MIPMAP_MODE_TRILINEAR;
176 #if PAN_ARCH >= 6
177    case PIPE_TEX_MIPFILTER_NONE:
178       return MALI_MIPMAP_MODE_NONE;
179 #else
180    case PIPE_TEX_MIPFILTER_NONE:
181       return MALI_MIPMAP_MODE_NEAREST;
182 #endif
183    default:
184       unreachable("Invalid");
185    }
186 }
187 
188 #if PAN_ARCH == 7
189 static void
190 pan_afbc_reswizzle_border_color(const struct pipe_sampler_state *cso,
191                                 struct panfrost_sampler_state *so)
192 {
193    if (!panfrost_format_supports_afbc(PAN_ARCH, cso->border_color_format))
194       return;
195 
196    /* On v7, pan_texture.c composes the API swizzle with a bijective
197     * swizzle derived from the format, to allow more formats than the
198     * hardware otherwise supports. When packing border colours, we need to
199     * undo this bijection, by swizzling with its inverse.
200     */
201    unsigned mali_format =
202       GENX(panfrost_format_from_pipe_format)(cso->border_color_format)->hw;
203    enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12);
204 
205    unsigned char inverted_swizzle[4];
206    panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post,
207                            inverted_swizzle);
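   /* Illustration: if the decomposed post-swizzle is a plain red/blue swap
    * (a BGRA-style component order), that swap is its own inverse, so the
    * border colour's red and blue channels are exchanged below and end up in
    * the channels the texture unit will actually read. */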
208 
209    util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color,
210                                    inverted_swizzle,
211                                    false /* is_integer (irrelevant) */);
212 }
213 #endif
214 
215 static void *
216 panfrost_create_sampler_state(struct pipe_context *pctx,
217                               const struct pipe_sampler_state *cso)
218 {
219    struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
220    so->base = *cso;
221 
222 #if PAN_ARCH == 7
223    pan_afbc_reswizzle_border_color(cso, so);
224 #endif
225 
226    bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
227 
228    pan_pack(&so->hw, SAMPLER, cfg) {
229       cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
230       cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
231 
232       cfg.normalized_coordinates = !cso->unnormalized_coords;
233       cfg.lod_bias = cso->lod_bias;
234       cfg.minimum_lod = cso->min_lod;
235       cfg.maximum_lod = cso->max_lod;
236 
237       cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
238       cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
239       cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
240 
241       cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
242       cfg.compare_function = panfrost_sampler_compare_func(cso);
243       cfg.seamless_cube_map = cso->seamless_cube_map;
244 
245       cfg.border_color_r = so->base.border_color.ui[0];
246       cfg.border_color_g = so->base.border_color.ui[1];
247       cfg.border_color_b = so->base.border_color.ui[2];
248       cfg.border_color_a = so->base.border_color.ui[3];
249 
250 #if PAN_ARCH >= 6
251       if (cso->max_anisotropy > 1) {
252          cfg.maximum_anisotropy = cso->max_anisotropy;
253          cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
254       }
255 #else
256       /* Emulate disabled mipmapping by clamping the LOD as tight as
257        * possible (from 0 to epsilon = 1/256) */
258       if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
259          cfg.maximum_lod = cfg.minimum_lod + (1.0 / 256.0);
260 #endif
261    }
262 
263    return so;
264 }
265 
266 /* Get pointers to the blend shaders bound to each active render target. Used
267  * to emit the blend descriptors, as well as the fragment renderer state
268  * descriptor.
269  */
270 static void
271 panfrost_get_blend_shaders(struct panfrost_batch *batch,
272                            uint64_t *blend_shaders)
273 {
274    unsigned shader_offset = 0;
275    struct panfrost_bo *shader_bo = NULL;
276 
277    for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
278       if (batch->key.cbufs[c]) {
279          blend_shaders[c] =
280             panfrost_get_blend(batch, c, &shader_bo, &shader_offset);
281       }
282    }
283 
284    if (shader_bo)
285       perf_debug(batch->ctx, "Blend shader use");
286 }
287 
288 #if PAN_ARCH >= 5
289 UNUSED static uint16_t
290 pack_blend_constant(enum pipe_format format, float cons)
291 {
292    const struct util_format_description *format_desc =
293       util_format_description(format);
294 
295    unsigned chan_size = 0;
296 
297    for (unsigned i = 0; i < format_desc->nr_channels; i++)
298       chan_size = MAX2(format_desc->channel[i].size, chan_size);
299 
300    uint16_t unorm = (cons * ((1 << chan_size) - 1));
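   /* e.g. for an 8-bit UNORM channel and cons = 0.5: unorm = 127, and the
    * shift below gives 127 << 8 = 0x7f00. */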
301    return unorm << (16 - chan_size);
302 }
303 
304 static void
305 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
306                     uint64_t *blend_shaders)
307 {
308    unsigned rt_count = batch->key.nr_cbufs;
309    struct panfrost_context *ctx = batch->ctx;
310    const struct panfrost_blend_state *so = ctx->blend;
311    bool dithered = so->base.dither;
312 
313    /* Always have at least one render target for depth-only passes */
314    for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
315       struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
316 
317       /* Disable blending for unbacked render targets */
318       if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) {
319          pan_pack(packed, BLEND, cfg) {
320             cfg.enable = false;
321 #if PAN_ARCH >= 6
322             cfg.internal.mode = MALI_BLEND_MODE_OFF;
323 #endif
324          }
325 
326          continue;
327       }
328 
329       struct pan_blend_info info = so->info[i];
330       enum pipe_format format = batch->key.cbufs[i]->format;
331       float cons =
332          pan_blend_get_constant(info.constant_mask, ctx->blend_color.color);
333 
334       /* Word 0: Flags and constant */
335       pan_pack(packed, BLEND, cfg) {
336          cfg.srgb = util_format_is_srgb(format);
337          cfg.load_destination = info.load_dest;
338          cfg.round_to_fb_precision = !dithered;
339          cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
340 #if PAN_ARCH >= 6
341          if (!blend_shaders[i])
342             cfg.constant = pack_blend_constant(format, cons);
343 #else
344          cfg.blend_shader = (blend_shaders[i] != 0);
345 
346          if (blend_shaders[i])
347             cfg.shader_pc = blend_shaders[i];
348          else
349             cfg.constant = cons;
350 #endif
351       }
352 
353       if (!blend_shaders[i]) {
354          /* Word 1: Blend Equation */
355          STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
356          packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
357       }
358 
359 #if PAN_ARCH >= 6
360       struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
361       struct mali_internal_blend_packed *internal_blend_packed =
362          (struct mali_internal_blend_packed *)&packed->opaque[2];
363 
364       /* Words 2 and 3: Internal blend */
365       if (blend_shaders[i]) {
366          /* The blend shader's address needs to have
367           * the same top 32 bits as the fragment shader's.
368           * TODO: Ensure that's always the case.
369           */
370          assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) ==
371                                   (fs->bin.gpu & (0xffffffffull << 32)));
372 
373          pan_pack(internal_blend_packed, INTERNAL_BLEND, cfg) {
374             cfg.mode = MALI_BLEND_MODE_SHADER;
375             cfg.shader.pc = (uint32_t)blend_shaders[i];
376 
377 #if PAN_ARCH <= 7
378             unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
379             assert(!(ret_offset & 0x7));
380 
381             cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0;
382 #endif
383          }
384       } else {
385          pan_pack(internal_blend_packed, INTERNAL_BLEND, cfg) {
386             cfg.mode = info.opaque ? MALI_BLEND_MODE_OPAQUE
387                                    : MALI_BLEND_MODE_FIXED_FUNCTION;
388 
389             /* If we want the conversion to work properly,
390              * num_comps must be set to 4
391              */
392             cfg.fixed_function.num_comps = 4;
393             cfg.fixed_function.conversion.memory_format = GENX(
394                panfrost_dithered_format_from_pipe_format)(format, dithered);
395             cfg.fixed_function.rt = i;
396 
397 #if PAN_ARCH >= 7
398             if (cfg.mode == MALI_BLEND_MODE_FIXED_FUNCTION &&
399                 (cfg.fixed_function.conversion.memory_format & 0xff) ==
400                    MALI_RGB_COMPONENT_ORDER_RGB1) {
401                /* fixed function does not like RGB1 as the component order */
402                /* force this field to be the default 0 (RGBA) */
403                cfg.fixed_function.conversion.memory_format &= ~0xff;
404                cfg.fixed_function.conversion.memory_format |=
405                   MALI_RGB_COMPONENT_ORDER_RGBA;
406             }
407 #endif
408 #if PAN_ARCH <= 7
409             if (!info.opaque) {
410                cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
411                cfg.fixed_function.alpha_one_store = info.alpha_one_store;
412             }
413 
414             if (fs->info.fs.untyped_color_outputs) {
415                cfg.fixed_function.conversion.register_format = GENX(
416                   pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format);
417             } else {
418                cfg.fixed_function.conversion.register_format =
419                   fs->info.bifrost.blend[i].format;
420             }
421 #endif
422          }
423       }
424 #endif
425    }
426 }
427 #endif
428 
429 static uint64_t
430 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch,
431                                   enum pipe_shader_type stage)
432 {
433    struct panfrost_compiled_shader *ss = batch->ctx->prog[stage];
434 
435    panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
436    panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
437 
438    return ss->state.gpu;
439 }
440 
441 static float
442 panfrost_z_depth_offset(struct panfrost_context *ctx, float offset_units)
443 {
444    if (ctx->pipe_framebuffer.zsbuf) {
445       if (util_format_is_float(ctx->pipe_framebuffer.zsbuf->format)) {
446          /* no scaling necessary, hw will do this at run time */
447          return offset_units;
448       }
449    }
450    /* if fixed point, apply the minimum resolvable difference scaling here */
451    return 2.0f * offset_units;
452 }
453 
454 #if PAN_ARCH <= 7
455 /* Construct a partial RSD corresponding to no executed fragment shader, and
456  * merge with the existing partial RSD. */
457 
458 static void
459 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
460 {
461    struct mali_renderer_state_packed empty_rsd;
462 
463    pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
464 #if PAN_ARCH >= 6
465       cfg.properties.shader_modifies_coverage = true;
466       cfg.properties.allow_forward_pixel_to_kill = true;
467       cfg.properties.allow_forward_pixel_to_be_killed = true;
468       cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
469 
470       /* Alpha isn't written so these are vacuous */
471       cfg.multisample_misc.overdraw_alpha0 = true;
472       cfg.multisample_misc.overdraw_alpha1 = true;
473 #else
474       cfg.shader.shader = 0x1;
475       cfg.properties.work_register_count = 1;
476       cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
477       cfg.properties.force_early_z = true;
478 #endif
479    }
480 
481    pan_merge((*rsd), empty_rsd, RENDERER_STATE);
482 }
483 
484 static void
485 panfrost_prepare_fs_state(struct panfrost_context *ctx, uint64_t *blend_shaders,
486                           struct mali_renderer_state_packed *rsd)
487 {
488    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
489    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
490    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
491    struct panfrost_blend_state *so = ctx->blend;
492    bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
493    bool msaa = rast->multisample;
494 
495    unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
496 
497    bool has_blend_shader = false;
498 
499    for (unsigned c = 0; c < rt_count; ++c)
500       has_blend_shader |= (blend_shaders[c] != 0);
501 
502    bool has_oq = ctx->occlusion_query && ctx->active_queries;
503 
504    pan_pack(rsd, RENDERER_STATE, cfg) {
505       if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
506 #if PAN_ARCH >= 6
507          struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
508             fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
509             ctx->blend->base.alpha_to_coverage,
510             ctx->depth_stencil->zs_always_passes);
511 
512          cfg.properties.pixel_kill_operation = earlyzs.kill;
513          cfg.properties.zs_update_operation = earlyzs.update;
514 
515          cfg.properties.allow_forward_pixel_to_kill =
516             pan_allow_forward_pixel_to_kill(ctx, fs);
517 #else
518          cfg.properties.force_early_z =
519             fs->info.fs.can_early_z && !alpha_to_coverage &&
520             ((enum mali_func)zsa->base.alpha_func == MALI_FUNC_ALWAYS);
521 
522          /* TODO: Reduce this limit? */
523          if (has_blend_shader)
524             cfg.properties.work_register_count =
525                MAX2(fs->info.work_reg_count, 8);
526          else
527             cfg.properties.work_register_count = fs->info.work_reg_count;
528 
529          /* Hardware quirks around early-zs forcing without a
530           * depth buffer. Note this breaks occlusion queries. */
531          bool force_ez_with_discard = !zsa->enabled && !has_oq;
532 
533          cfg.properties.shader_reads_tilebuffer =
534             force_ez_with_discard && fs->info.fs.can_discard;
535          cfg.properties.shader_contains_discard =
536             !force_ez_with_discard && fs->info.fs.can_discard;
537 #endif
538       }
539 
540 #if PAN_ARCH == 4
541       if (rt_count > 0) {
542          cfg.multisample_misc.load_destination = so->info[0].load_dest;
543          cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
544          cfg.stencil_mask_misc.write_enable = so->info[0].enabled;
545          cfg.stencil_mask_misc.srgb =
546             util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
547          cfg.stencil_mask_misc.dither_disable = !so->base.dither;
548          cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
549 
550          if (blend_shaders[0]) {
551             cfg.blend_shader = blend_shaders[0];
552          } else {
553             cfg.blend_constant = pan_blend_get_constant(
554                so->info[0].constant_mask, ctx->blend_color.color);
555          }
556       } else {
557          /* If there is no colour buffer, leaving fields default is
558           * fine, except for blending which is nonnullable */
559          cfg.blend_equation.color_mask = 0xf;
560          cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
561          cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
562          cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
563          cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
564          cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
565          cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
566       }
567 #elif PAN_ARCH == 5
568       /* Workaround */
569       cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
570 #endif
571 
572       cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
573 
574       cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1);
575 
576 #if PAN_ARCH >= 6
577       /* MSAA blend shaders need to pass their sample ID to
578        * LD_TILE/ST_TILE, so we must preload it. Additionally, we
579        * need per-sample shading for the blend shader, accomplished
580        * by forcing per-sample shading for the whole program. */
581 
582       if (msaa && has_blend_shader) {
583          cfg.multisample_misc.evaluate_per_sample = true;
584          cfg.preload.fragment.sample_mask_id = true;
585       }
586 
587       /* Bifrost does not have native point sprites. Point sprites are
588        * lowered in the driver to gl_PointCoord reads. This field
589        * actually controls the orientation of gl_PointCoord. Both
590        * orientations are controlled with sprite_coord_mode in
591        * Gallium.
592        */
593       cfg.properties.point_sprite_coord_origin_max_y =
594          (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
595 
596       cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
597       cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
598 #endif
599 
600       cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
601       cfg.depth_units = panfrost_z_depth_offset(ctx, rast->offset_units);
602       cfg.depth_factor = rast->offset_scale;
603       cfg.depth_bias_clamp = rast->offset_clamp;
604 
605       bool back_enab = zsa->base.stencil[1].enabled;
606       cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
607       cfg.stencil_back.reference_value =
608          ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
609 
610 #if PAN_ARCH <= 5
611       /* v6+ fits register preload here, no alpha testing */
612       cfg.alpha_reference = zsa->base.alpha_ref_value;
613 #endif
614    }
615 }
616 
617 static void
618 panfrost_emit_frag_shader(struct panfrost_context *ctx,
619                           struct mali_renderer_state_packed *fragmeta,
620                           uint64_t *blend_shaders)
621 {
622    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
623    const struct panfrost_rasterizer *rast = ctx->rasterizer;
624    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
625 
626    /* We need to merge several partial renderer state descriptors,
627     * so stage to temporary storage rather than reading back write-combine
628     * memory, which will trash performance. */
629    struct mali_renderer_state_packed rsd;
630    panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
631 
632 #if PAN_ARCH == 4
633    if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
634       /* Word 14: SFBD Blend Equation */
635       STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
636       rsd.opaque[14] = ctx->blend->equation[0];
637    }
638 #endif
639 
640    /* Merge with CSO state and upload */
641    if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
642       struct mali_renderer_state_packed *partial_rsd =
643          (struct mali_renderer_state_packed *)&fs->partial_rsd;
644       STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
645       pan_merge(rsd, *partial_rsd, RENDERER_STATE);
646    } else {
647       pan_merge_empty_fs(&rsd);
648    }
649 
650    /* Word 8, 9 Misc state */
651    rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0];
652 
653    rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0];
654 
655    /* late patching of the merged RSD in case of line-smoothing */
656    if (u_reduced_prim(ctx->active_prim) == MESA_PRIM_LINES &&
657        rast->base.line_smooth) {
658       rsd.opaque[8] |= (1u << 16); // multisample_enable = 1
659       rsd.opaque[9] &= ~(1u << 30); // single_sampled_lines = 0
660    }
661 
662    /* Word 10, 11 Stencil Front and Back */
663    rsd.opaque[10] |= zsa->stencil_front.opaque[0];
664    rsd.opaque[11] |= zsa->stencil_back.opaque[0];
665 
666    memcpy(fragmeta, &rsd, sizeof(rsd));
667 }
668 
669 static uint64_t
670 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
671 {
672    struct panfrost_context *ctx = batch->ctx;
673    struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT];
674 
675    panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
676    panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_FRAGMENT);
677 
678    struct panfrost_ptr xfer;
679 
680 #if PAN_ARCH == 4
681    xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
682 #else
683    unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
684 
685    xfer =
686       pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE),
687                                     PAN_DESC_ARRAY(rt_count, BLEND));
688 #endif
689 
690    if (!xfer.cpu)
691       return 0;
692 
693    uint64_t blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
694    panfrost_get_blend_shaders(batch, blend_shaders);
695 
696    panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *)xfer.cpu,
697                              blend_shaders);
698 
699 #if PAN_ARCH >= 5
700    panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE),
701                        blend_shaders);
702 #endif
703 
704    return xfer.gpu;
705 }
706 #endif
707 
708 static uint64_t
709 panfrost_emit_viewport(struct panfrost_batch *batch)
710 {
711    struct panfrost_context *ctx = batch->ctx;
712    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
713    const struct pipe_scissor_state *ss = &ctx->scissor;
714    const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
715 
716    /* Derive min/max from translate/scale. Note since |x| >= 0 by
717     * definition, we have that -|x| <= |x| hence translate - |scale| <=
718     * translate + |scale|, so the ordering is correct here. */
719    float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
720    float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
721    float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
722    float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
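   /* e.g. an 800x600 viewport with translate = (400, 300) and scale =
    * (400, -300) yields [vp_minx, vp_maxx] = [0, 800] and
    * [vp_miny, vp_maxy] = [0, 600]; the fabsf() makes this independent of
    * any Y flip. */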
723 
724    float minz, maxz;
725    util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz);
726 
727    /* Scissor to the intersection of the viewport and the scissor, clamped
728     * to the framebuffer */
729 
730    unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0));
731    unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0));
732    unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0));
733    unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0));
734 
735    if (ss && rast->scissor) {
736       minx = MAX2(ss->minx, minx);
737       miny = MAX2(ss->miny, miny);
738       maxx = MIN2(ss->maxx, maxx);
739       maxy = MIN2(ss->maxy, maxy);
740    }
741 
742    /* Set the range to [1, 1) so max values don't wrap round */
743    if (maxx == 0 || maxy == 0)
744       maxx = maxy = minx = miny = 1;
745 
746    panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
747    batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
748 
749    /* [minx, maxx) and [miny, maxy) are exclusive ranges in the hardware */
750    maxx--;
751    maxy--;
752 
753    batch->minimum_z = minz;
754    batch->maximum_z = maxz;
755 
756 #if PAN_ARCH <= 7
757    struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
758 
759    if (!T.cpu)
760       return 0;
761 
762    pan_cast_and_pack(T.cpu, VIEWPORT, cfg) {
763       cfg.scissor_minimum_x = minx;
764       cfg.scissor_minimum_y = miny;
765       cfg.scissor_maximum_x = maxx;
766       cfg.scissor_maximum_y = maxy;
767 
768       cfg.minimum_z = batch->minimum_z;
769       cfg.maximum_z = batch->maximum_z;
770    }
771 
772    return T.gpu;
773 #else
774    pan_cast_and_pack(&batch->scissor, SCISSOR, cfg) {
775       cfg.scissor_minimum_x = minx;
776       cfg.scissor_minimum_y = miny;
777       cfg.scissor_maximum_x = maxx;
778       cfg.scissor_maximum_y = maxy;
779    }
780 
781    return 0;
782 #endif
783 }
784 
785 #if PAN_ARCH >= 9
786 /**
787  * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
788  * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
789  * CSO create time. However, the stencil reference values and shader
790  * interactions are dynamic state. Pack only the dynamic state here and OR
791  * together.
792  */
793 static uint64_t
794 panfrost_emit_depth_stencil(struct panfrost_batch *batch)
795 {
796    struct panfrost_context *ctx = batch->ctx;
797    const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
798    struct panfrost_rasterizer *rast = ctx->rasterizer;
799    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
800    bool back_enab = zsa->base.stencil[1].enabled;
801 
802    struct panfrost_ptr T =
803       pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
804 
805    if (!T.cpu)
806       return 0;
807 
808    struct mali_depth_stencil_packed dynamic;
809    pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
810       cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
811       cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
812 
813       cfg.stencil_from_shader = fs->info.fs.writes_stencil;
814       cfg.depth_source = pan_depth_source(&fs->info);
815 
816       cfg.depth_bias_enable = rast->base.offset_tri;
817       cfg.depth_units = panfrost_z_depth_offset(ctx, rast->base.offset_units);
818       cfg.depth_factor = rast->base.offset_scale;
819       cfg.depth_bias_clamp = rast->base.offset_clamp;
820 
821       assert(rast->base.depth_clip_near == rast->base.depth_clip_far);
822       cfg.depth_cull_enable = rast->base.depth_clip_near;
823       cfg.depth_clamp_mode = rast->base.depth_clamp
824                                 ? MALI_DEPTH_CLAMP_MODE_BOUNDS
825                                 : MALI_DEPTH_CLAMP_MODE_0_1;
826    }
827 
828    pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
829    memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));
830 
831    return T.gpu;
832 }
833 
834 /**
835  * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
836  * with Bifrost, but the container data structure is simplified.
837  */
838 static uint64_t
839 panfrost_emit_blend_valhall(struct panfrost_batch *batch)
840 {
841    unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);
842 
843    struct panfrost_ptr T =
844       pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);
845 
846    if (!T.cpu)
847       return 0;
848 
849    uint64_t blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
850    panfrost_get_blend_shaders(batch, blend_shaders);
851 
852    panfrost_emit_blend(batch, T.cpu, blend_shaders);
853 
854    /* Precalculate for the per-draw path */
855    bool has_blend_shader = false;
856 
857    for (unsigned i = 0; i < rt_count; ++i)
858       has_blend_shader |= !!blend_shaders[i];
859 
860    batch->ctx->valhall_has_blend_shader = has_blend_shader;
861 
862    return T.gpu;
863 }
864 
865 /**
866  * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
867  */
868 static uint64_t
869 panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
870 {
871    struct panfrost_context *ctx = batch->ctx;
872    unsigned buffer_count = util_last_bit(ctx->vb_mask);
873    struct panfrost_ptr T =
874       pan_pool_alloc_desc_array(&batch->pool.base, buffer_count, BUFFER);
875 
876    if (!T.cpu)
877       return 0;
878 
879    struct mali_buffer_packed *buffers = T.cpu;
880 
881    memset(buffers, 0, sizeof(*buffers) * buffer_count);
882 
883    u_foreach_bit(i, ctx->vb_mask) {
884       struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
885       struct pipe_resource *prsrc = vb.buffer.resource;
886       struct panfrost_resource *rsrc = pan_resource(prsrc);
887       assert(!vb.is_user_buffer);
888 
889       panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
890 
891       pan_pack(buffers + i, BUFFER, cfg) {
892          cfg.address = rsrc->image.data.base + vb.buffer_offset;
893 
894          cfg.size = prsrc->width0 - vb.buffer_offset;
895       }
896    }
897 
898    return T.gpu;
899 }
900 
901 static uint64_t
902 panfrost_emit_vertex_data(struct panfrost_batch *batch)
903 {
904    struct panfrost_context *ctx = batch->ctx;
905    struct panfrost_vertex_state *vtx = ctx->vertex;
906 
907    return pan_pool_upload_aligned(&batch->pool.base, vtx->attributes,
908                                   vtx->num_elements * pan_size(ATTRIBUTE),
909                                   pan_alignment(ATTRIBUTE));
910 }
911 
912 static void panfrost_update_sampler_view(struct panfrost_sampler_view *view,
913                                          struct pipe_context *pctx);
914 
915 static uint64_t
916 panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
917 {
918    struct panfrost_context *ctx = batch->ctx;
919    unsigned last_bit = util_last_bit(ctx->image_mask[stage]);
920 
921    struct panfrost_ptr T =
922       pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);
923 
924    struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
925 
926    for (int i = 0; i < last_bit; ++i) {
927       struct pipe_image_view *image = &ctx->images[stage][i];
928 
929       if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
930          memset(&out[i], 0, sizeof(out[i]));
931          continue;
932       }
933 
934       /* Construct a synthetic sampler view so we can use our usual
935        * sampler view code for the actual descriptor packing.
936        *
937        * Use the batch pool for a transient allocation, rather than
938        * allocating a long-lived descriptor.
939        */
940       struct panfrost_sampler_view view = {
941          .base = util_image_to_sampler_view(image),
942          .pool = &batch->pool,
943       };
944 
945       panfrost_update_sampler_view(&view, &ctx->base);
946       out[i] = view.bifrost_descriptor;
947 
948       panfrost_track_image_access(batch, stage, image);
949    }
950 
951    return T.gpu;
952 }
953 #endif
954 
955 static uint64_t
956 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
957                                  enum pipe_shader_type st,
958                                  struct panfrost_constant_buffer *buf,
959                                  unsigned index)
960 {
961    struct pipe_constant_buffer *cb = &buf->cb[index];
962    struct panfrost_resource *rsrc = pan_resource(cb->buffer);
963 
964    if (rsrc) {
965       panfrost_batch_read_rsrc(batch, rsrc, st);
966 
967       /* Alignment guaranteed by
968        * pipe_caps.constant_buffer_offset_alignment */
969       return rsrc->image.data.base + cb->buffer_offset;
970    } else if (cb->user_buffer) {
971       return pan_pool_upload_aligned(&batch->pool.base,
972                                      cb->user_buffer + cb->buffer_offset,
973                                      cb->buffer_size, 16);
974    } else {
975       unreachable("No constant buffer");
976    }
977 }
978 
979 struct sysval_uniform {
980    union {
981       float f[4];
982       int32_t i[4];
983       uint32_t u[4];
984       uint64_t du[2];
985    };
986 };
987 
988 static void
989 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
990                                       struct sysval_uniform *uniform)
991 {
992    struct panfrost_context *ctx = batch->ctx;
993    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
994 
995    uniform->f[0] = vp->scale[0];
996    uniform->f[1] = vp->scale[1];
997    uniform->f[2] = vp->scale[2];
998 }
999 
1000 static void
1001 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1002                                        struct sysval_uniform *uniform)
1003 {
1004    struct panfrost_context *ctx = batch->ctx;
1005    const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1006 
1007    uniform->f[0] = vp->translate[0];
1008    uniform->f[1] = vp->translate[1];
1009    uniform->f[2] = vp->translate[2];
1010 }
1011 
1012 static void
1013 panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1014                            enum pipe_shader_type st, unsigned int sysvalid,
1015                            struct sysval_uniform *uniform)
1016 {
1017    struct panfrost_context *ctx = batch->ctx;
1018    unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1019    unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1020    bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1021    struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1022 
1023    assert(dim);
1024 
1025    if (tex->target == PIPE_BUFFER) {
1026       assert(dim == 1);
1027       unsigned buf_size = tex->u.buf.size / util_format_get_blocksize(tex->format);
1028       uniform->i[0] = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1029       return;
1030    }
1031 
1032    uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1033 
1034    if (dim > 1)
1035       uniform->i[1] = u_minify(tex->texture->height0, tex->u.tex.first_level);
1036 
1037    if (dim > 2)
1038       uniform->i[2] = u_minify(tex->texture->depth0, tex->u.tex.first_level);
1039 
1040    if (is_array) {
1041       unsigned size = tex->texture->array_size;
1042 
1043       /* Internally, we store the number of 2D images (faces * array
1044        * size). Externally, we report the array size in terms of
1045        * complete cubes. So divide by the # of faces per cube.
1046        */
1047       if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
1048          size /= 6;
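      /* e.g. a cube map array with array_size = 12 (two cubes worth of
       * faces) reports a size of 2 to the shader. */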
1049 
1050       uniform->i[dim] = size;
1051    }
1052 }
1053 
1054 static void
1055 panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
1056                                   enum pipe_shader_type st,
1057                                   unsigned int sysvalid,
1058                                   struct sysval_uniform *uniform)
1059 {
1060    struct panfrost_context *ctx = batch->ctx;
1061    unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1062    unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1063    unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1064 
1065    assert(dim && dim < 4);
1066 
1067    struct pipe_image_view *image = &ctx->images[st][idx];
1068 
1069    if (image->resource->target == PIPE_BUFFER) {
1070       unsigned blocksize = util_format_get_blocksize(image->format);
1071       uniform->i[0] = image->resource->width0 / blocksize;
1072       return;
1073    }
1074 
1075    uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level);
1076 
1077    if (dim > 1)
1078       uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level);
1079 
1080    if (dim > 2)
1081       uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level);
1082 
1083    if (is_array)
1084       uniform->i[dim] = image->resource->array_size;
1085 }
1086 
1087 static void
1088 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1089                             enum pipe_shader_type st, unsigned ssbo_id,
1090                             struct sysval_uniform *uniform)
1091 {
1092    struct panfrost_context *ctx = batch->ctx;
1093 
1094    assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1095    struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1096 
1097    /* Compute address */
1098    struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1099    struct panfrost_bo *bo = rsrc->bo;
1100 
1101    panfrost_batch_write_rsrc(batch, rsrc, st);
1102 
1103    util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1104                   sb.buffer_size);
1105 
1106    /* Upload address and size as sysval */
1107    uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
1108    uniform->u[2] = sb.buffer_size;
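   /* The sysval thus fills one 16-byte slot: du[0] aliases u[0]/u[1] and
    * holds the 64-bit GPU address, u[2] holds the size in bytes, and u[3]
    * is not written. */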
1109 }
1110 
1111 static void
1112 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1113                                enum pipe_shader_type st, unsigned samp_idx,
1114                                struct sysval_uniform *uniform)
1115 {
1116    struct panfrost_context *ctx = batch->ctx;
1117    struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1118 
1119    uniform->f[0] = sampl->min_lod;
1120    uniform->f[1] = sampl->max_lod;
1121    uniform->f[2] = sampl->lod_bias;
1122 
1123    /* Even without any errata, Midgard represents "no mipmapping" as
1124     * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1125     * panfrost_create_sampler_state which also explains our choice of
1126     * epsilon value (again to keep behaviour consistent) */
1127 
1128    if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1129       uniform->f[1] = uniform->f[0] + (1.0 / 256.0);
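      /* e.g. min_lod = 2.0 yields max_lod = 2.00390625, mirroring the
       * mipmap-off clamp applied in the sampler descriptor. */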
1130 }
1131 
1132 static void
1133 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1134                                        struct sysval_uniform *uniform)
1135 {
1136    struct panfrost_context *ctx = batch->ctx;
1137 
1138    uniform->u[0] = ctx->compute_grid->grid[0];
1139    uniform->u[1] = ctx->compute_grid->grid[1];
1140    uniform->u[2] = ctx->compute_grid->grid[2];
1141 }
1142 
1143 static void
1144 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
1145                                         struct sysval_uniform *uniform)
1146 {
1147    struct panfrost_context *ctx = batch->ctx;
1148 
1149    uniform->u[0] = ctx->compute_grid->block[0];
1150    uniform->u[1] = ctx->compute_grid->block[1];
1151    uniform->u[2] = ctx->compute_grid->block[2];
1152 }
1153 
1154 static void
1155 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
1156                                 struct sysval_uniform *uniform)
1157 {
1158    struct panfrost_context *ctx = batch->ctx;
1159 
1160    uniform->u[0] = ctx->compute_grid->work_dim;
1161 }
1162 
1163 /* Sample positions are pushed in a Bifrost specific format on Bifrost. On
1164  * Midgard, we emulate the Bifrost path with some extra arithmetic in the
1165  * shader, to keep the code as unified as possible. */
1166 
1167 static void
1168 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
1169                                         struct sysval_uniform *uniform)
1170 {
1171    struct panfrost_context *ctx = batch->ctx;
1172    struct panfrost_device *dev = pan_device(ctx->base.screen);
1173 
1174    unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1175    uniform->du[0] =
1176       dev->sample_positions->ptr.gpu +
1177       panfrost_sample_positions_offset(panfrost_sample_pattern(samples));
1178 }
1179 
1180 static void
1181 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1182                                     struct sysval_uniform *uniform)
1183 {
1184    unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1185    uniform->u[0] = (samples > 1) ? ~0 : 0;
1186 }
1187 
1188 #if PAN_ARCH >= 6
1189 static void
1190 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1191                                      unsigned size_and_rt,
1192                                      struct sysval_uniform *uniform)
1193 {
1194    unsigned rt = size_and_rt & 0xF;
1195    unsigned size = size_and_rt >> 4;
1196 
1197    if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1198       enum pipe_format format = batch->key.cbufs[rt]->format;
1199       uniform->u[0] =
1200          GENX(pan_blend_get_internal_desc)(format, rt, size, false) >> 32;
1201    } else {
1202       pan_cast_and_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
1203          cfg.memory_format =
1204             GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_NONE)->hw;
1205    }
1206 }
1207 #endif
1208 
1209 static unsigned
1210 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1211 {
1212    return target->buffer_offset + (pan_so_target(target)->offset * stride);
1213 }
1214 
1215 static void
1216 panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
1217                         uint64_t ptr_gpu, struct panfrost_compiled_shader *ss,
1218                         enum pipe_shader_type st)
1219 {
1220    struct sysval_uniform *uniforms = ptr_cpu;
1221 
1222    for (unsigned i = 0; i < ss->sysvals.sysval_count; ++i) {
1223       int sysval = ss->sysvals.sysvals[i];
1224 
1225       switch (PAN_SYSVAL_TYPE(sysval)) {
1226       case PAN_SYSVAL_VIEWPORT_SCALE:
1227          panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]);
1228          break;
1229       case PAN_SYSVAL_VIEWPORT_OFFSET:
1230          panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]);
1231          break;
1232       case PAN_SYSVAL_TEXTURE_SIZE:
1233          panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1234                                     &uniforms[i]);
1235          break;
1236       case PAN_SYSVAL_SSBO:
1237          panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1238                                      &uniforms[i]);
1239          break;
1240 
1241       case PAN_SYSVAL_XFB: {
1242          unsigned buf = PAN_SYSVAL_ID(sysval);
1243          struct panfrost_compiled_shader *vs =
1244             batch->ctx->prog[PIPE_SHADER_VERTEX];
1245          struct pipe_stream_output_info *so = &vs->stream_output;
1246          unsigned stride = so->stride[buf] * 4;
1247 
1248          struct pipe_stream_output_target *target = NULL;
1249          if (buf < batch->ctx->streamout.num_targets)
1250             target = batch->ctx->streamout.targets[buf];
1251 
1252          if (!target) {
1253             /* Memory sink */
1254             uniforms[i].du[0] = 0x8ull << 60;
1255             break;
1256          }
1257 
1258          struct panfrost_resource *rsrc = pan_resource(target->buffer);
1259          unsigned offset = panfrost_xfb_offset(stride, target);
1260 
1261          util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
1262                         target->buffer_size - offset);
1263 
1264          panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1265 
1266          uniforms[i].du[0] = rsrc->image.data.base + offset;
1267          break;
1268       }
1269 
1270       case PAN_SYSVAL_NUM_VERTICES:
1271          uniforms[i].u[0] = batch->ctx->vertex_count;
1272          break;
1273 
1274       case PAN_SYSVAL_NUM_WORK_GROUPS:
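         /* Each sysval slot is 16 bytes, so component j of this sysval lives
          * at ptr_gpu + i * sizeof(*uniforms) + j * 4; remember those GPU
          * addresses in the batch. */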
1275          for (unsigned j = 0; j < 3; j++) {
1276             batch->num_wg_sysval[j] =
1277                ptr_gpu + (i * sizeof(*uniforms)) + (j * 4);
1278          }
1279          panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]);
1280          break;
1281       case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1282          panfrost_upload_local_group_size_sysval(batch, &uniforms[i]);
1283          break;
1284       case PAN_SYSVAL_WORK_DIM:
1285          panfrost_upload_work_dim_sysval(batch, &uniforms[i]);
1286          break;
1287       case PAN_SYSVAL_SAMPLER:
1288          panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1289                                         &uniforms[i]);
1290          break;
1291       case PAN_SYSVAL_IMAGE_SIZE:
1292          panfrost_upload_image_size_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1293                                            &uniforms[i]);
1294          break;
1295       case PAN_SYSVAL_SAMPLE_POSITIONS:
1296          panfrost_upload_sample_positions_sysval(batch, &uniforms[i]);
1297          break;
1298       case PAN_SYSVAL_MULTISAMPLED:
1299          panfrost_upload_multisampled_sysval(batch, &uniforms[i]);
1300          break;
1301 #if PAN_ARCH >= 6
1302       case PAN_SYSVAL_RT_CONVERSION:
1303          panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval),
1304                                               &uniforms[i]);
1305          break;
1306 #endif
1307       case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1308          uniforms[i].u[0] = batch->ctx->offset_start;
1309          uniforms[i].u[1] = batch->ctx->base_vertex;
1310          uniforms[i].u[2] = batch->ctx->base_instance;
1311          break;
1312       case PAN_SYSVAL_DRAWID:
1313          uniforms[i].u[0] = batch->ctx->drawid;
1314          break;
1315       default:
1316          assert(0);
1317       }
1318    }
1319 }
1320 
1321 static const void *
1322 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1323                                  struct panfrost_constant_buffer *buf,
1324                                  unsigned index)
1325 {
1326    struct pipe_constant_buffer *cb = &buf->cb[index];
1327    struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1328 
1329    if (rsrc) {
1330       if (panfrost_bo_mmap(rsrc->bo))
1331          return NULL;
1332 
1333       panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1334       panfrost_bo_wait(rsrc->bo, INT64_MAX, false);
1335 
1336       return rsrc->bo->ptr.cpu + cb->buffer_offset;
1337    } else if (cb->user_buffer) {
1338       return cb->user_buffer + cb->buffer_offset;
1339    } else
1340       unreachable("No constant buffer");
1341 }
1342 
1343 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
1344  * implemented with buffer descriptors in the resource table, sized in terms of
1345  * bytes. On Bifrost and older, UBOs have a special uniform buffer data
1346  * structure, sized in terms of entries.
1347  */
1348 static void
1349 panfrost_emit_ubo(void *base, unsigned index, uint64_t address, size_t size)
1350 {
1351 #if PAN_ARCH >= 9
1352    struct mali_buffer_packed *out = base;
1353 
1354    pan_pack(out + index, BUFFER, cfg) {
1355       cfg.size = size;
1356       cfg.address = address;
1357    }
1358 #else
1359    struct mali_uniform_buffer_packed *out = base;
1360 
1361    /* Issue (57) for the ARB_uniform_buffer_object spec says that
1362     * the buffer can be larger than the uniform data inside it,
1363     * so clamp ubo size to what hardware supports. */
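   /* e.g. a 100-byte UBO becomes DIV_ROUND_UP(100, 16) = 7 entries of 16
    * bytes each; the 1 << 12 clamp caps the descriptor at 64 KiB. */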
1364 
1365    pan_pack(out + index, UNIFORM_BUFFER, cfg) {
1366       cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
1367       cfg.pointer = address;
1368    }
1369 #endif
1370 }
1371 
1372 #if PAN_ARCH >= 9
1373 static uint64_t
1374 panfrost_emit_ssbos(struct panfrost_batch *batch, enum pipe_shader_type st)
1375 {
1376    struct panfrost_context *ctx = batch->ctx;
1377    unsigned ssbo_count = util_last_bit(ctx->ssbo_mask[st]);
1378 
1379    if (!ssbo_count)
1380       return 0;
1381 
1382    struct panfrost_ptr ssbos =
1383       pan_pool_alloc_desc_array(&batch->pool.base, ssbo_count, BUFFER);
1384    struct mali_buffer_packed *bufs = ssbos.cpu;
1385 
1386    memset(bufs, 0, sizeof(bufs[0]) * ssbo_count);
1387 
1388    u_foreach_bit(ssbo_id, ctx->ssbo_mask[st]) {
1389       struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1390       struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1391       struct panfrost_bo *bo = rsrc->bo;
1392 
1393       panfrost_batch_write_rsrc(batch, rsrc, st);
1394 
1395       util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1396                      sb.buffer_size);
1397       pan_pack(&bufs[ssbo_id], BUFFER, cfg) {
1398          cfg.size = sb.buffer_size;
1399          cfg.address = bo->ptr.gpu + sb.buffer_offset;
1400       }
1401    }
1402 
1403    return ssbos.gpu;
1404 }
1405 #endif
1406 
1407 static uint64_t
1408 panfrost_emit_const_buf(struct panfrost_batch *batch,
1409                         enum pipe_shader_type stage, unsigned *buffer_count,
1410                         uint64_t *push_constants, unsigned *pushed_words)
1411 {
1412    struct panfrost_context *ctx = batch->ctx;
1413    struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1414    struct panfrost_compiled_shader *ss = ctx->prog[stage];
1415 
1416    if (!ss)
1417       return 0;
1418 
1419    /* Allocate room for the sysval and the uniforms */
1420    size_t sys_size = sizeof(float) * 4 * ss->sysvals.sysval_count;
1421    struct panfrost_ptr transfer =
1422       pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1423 
1424    if (!transfer.cpu)
1425       return 0;
1426 
1427    /* Upload sysvals requested by the shader */
1428    uint8_t *sysvals = alloca(sys_size);
1429    panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage);
1430    memcpy(transfer.cpu, sysvals, sys_size);
1431 
1432    /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1433    struct panfrost_compiled_shader *shader = ctx->prog[stage];
1434    unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1435    unsigned sysval_ubo = sys_size ? ubo_count : ~0;
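   /* Worked example (hypothetical counts): a shader with three user UBOs plus
    * sysvals reports info.ubo_count = 4, so ubo_count = 3, the sysval UBO is
    * appended in slot 3, and ubo_count + 1 = 4 descriptors are allocated. */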
1436    unsigned desc_size;
1437    struct panfrost_ptr ubos = {0};
1438 
1439 #if PAN_ARCH >= 9
1440    desc_size = sizeof(struct mali_buffer_packed);
1441    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER);
1442 #else
1443    desc_size = sizeof(struct mali_uniform_buffer_packed);
1444    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1,
1445                                     UNIFORM_BUFFER);
1446 #endif
1447 
1448    if (!ubos.cpu)
1449       return 0;
1450 
1451    memset(ubos.cpu, 0, desc_size * (ubo_count + 1));
1452 
1453    if (buffer_count)
1454       *buffer_count = ubo_count + (sys_size ? 1 : 0);
1455 
1456    /* Upload sysval as a final UBO */
1457 
1458    if (sys_size)
1459       panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1460 
1461    /* The rest are honest-to-goodness UBOs */
1462 
1463    u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1464       size_t usz = buf->cb[ubo].buffer_size;
1465       uint64_t address = 0;
1466 
1467       if (usz > 0) {
1468          address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo);
1469       }
1470 
1471       panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1472    }
1473 
1474    if (pushed_words)
1475       *pushed_words = ss->info.push.count;
1476 
1477    if (ss->info.push.count == 0)
1478       return ubos.gpu;
1479 
1480    /* Copy push constants required by the shader */
1481    struct panfrost_ptr push_transfer =
1482       pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16);
1483 
1484    if (!push_transfer.cpu)
1485       return 0;
1486 
1487    uint32_t *push_cpu = (uint32_t *)push_transfer.cpu;
1488    *push_constants = push_transfer.gpu;
1489 
1490    for (unsigned i = 0; i < ss->info.push.count; ++i) {
1491       struct panfrost_ubo_word src = ss->info.push.words[i];
1492 
1493       if (src.ubo == sysval_ubo) {
1494          unsigned sysval_idx = src.offset / 16;
1495          unsigned sysval_comp = (src.offset % 16) / 4;
1496          unsigned sysval_type =
1497             PAN_SYSVAL_TYPE(ss->sysvals.sysvals[sysval_idx]);
1498          uint64_t ptr = push_transfer.gpu + (4 * i);
1499 
1500          if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS &&
1501              sysval_comp < ARRAY_SIZE(batch->num_wg_sysval))
1502             batch->num_wg_sysval[sysval_comp] = ptr;
1503       }
1504       /* Map the UBO, this should be cheap. For some buffers this may
1505        * read from write-combine memory which is slow, though :-(
1506        */
1507       const void *mapped_ubo =
1508          (src.ubo == sysval_ubo)
1509             ? sysvals
1510             : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1511 
1512       if (!mapped_ubo)
1513          return 0;
1514 
1515       /* TODO: Is there any benefit to combining ranges */
1516       memcpy(push_cpu + i, (uint8_t *)mapped_ubo + src.offset, 4);
1517    }
1518 
1519    return ubos.gpu;
1520 }
1521 
1522 /*
1523  * Choose the number of WLS instances to allocate. This must be a power-of-two.
1524  * The number of WLS instances limits the number of concurrent tasks on a given
1525  * shader core, setting to the (rounded) total number of tasks avoids any
1526  * throttling. Smaller values save memory at the expense of possible throttling.
1527  *
1528  * With indirect dispatch, we don't know at launch-time how many tasks will be
1529  * needed, so we use a conservative value that's unlikely to cause slowdown in
1530  * practice without wasting too much memory.
1531  */
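/* For example (illustrative numbers), a direct launch of a (3, 5, 2) grid
 * rounds each dimension up to a power of two and allocates
 * 4 * 8 * 2 = 64 WLS instances. */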
1532 static unsigned
1533 panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
1534 {
1535    if (grid->indirect) {
1536       /* May need tuning in the future, conservative guess */
1537       return 128;
1538    } else {
1539       return util_next_power_of_two(grid->grid[0]) *
1540              util_next_power_of_two(grid->grid[1]) *
1541              util_next_power_of_two(grid->grid[2]);
1542    }
1543 }
1544 
1545 static uint64_t
1546 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1547                             const struct pipe_grid_info *grid)
1548 {
1549    struct panfrost_context *ctx = batch->ctx;
1550    struct panfrost_device *dev = pan_device(ctx->base.screen);
1551    struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE];
1552    struct panfrost_ptr t =
1553       pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1554 
1555    struct pan_tls_info info = {
1556       .tls.size = ss->info.tls_size,
1557       .wls.size = ss->info.wls_size + grid->variable_shared_mem,
1558       .wls.instances = panfrost_choose_wls_instance_count(grid),
1559    };
1560 
1561    if (ss->info.tls_size) {
1562       struct panfrost_bo *bo = panfrost_batch_get_scratchpad(
1563          batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range);
1564 
1565       if (!bo)
1566          return 0;
1567 
1568       info.tls.ptr = bo->ptr.gpu;
1569    }
1570 
1571    if (info.wls.size) {
1572       unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
1573                       dev->core_id_range;
1574 
1575       struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
1576 
1577       if (!bo)
1578          return 0;
1579 
1580       info.wls.ptr = bo->ptr.gpu;
1581    }
1582 
1583    GENX(pan_emit_tls)(&info, t.cpu);
1584    return t.gpu;
1585 }
1586 
1587 #if PAN_ARCH <= 5
1588 static uint64_t
1589 panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st,
1590                       struct panfrost_sampler_view *view)
1591 {
1592    if (!view)
1593       return (uint64_t)0;
1594 
1595    struct pipe_sampler_view *pview = &view->base;
1596    struct panfrost_resource *rsrc = pan_resource(pview->texture);
1597 
1598    panfrost_batch_read_rsrc(batch, rsrc, st);
1599    panfrost_batch_add_bo(batch, view->state.bo, st);
1600 
1601    return view->state.gpu;
1602 }
1603 #endif
1604 
1605 static void
1606 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1607                                 struct pipe_context *pctx,
1608                                 struct pipe_resource *texture)
1609 {
1610    struct panfrost_device *device = pan_device(pctx->screen);
1611    struct panfrost_context *ctx = pan_context(pctx);
1612    struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1613    enum pipe_format format = so->base.format;
1614    assert(prsrc->bo);
1615 
1616    bool is_shadow = false;
1617    /* Format to access the stencil/depth portion of a Z32_S8 texture */
1618    if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1619       assert(prsrc->separate_stencil);
1620       texture = &prsrc->separate_stencil->base;
1621       prsrc = (struct panfrost_resource *)texture;
1622       format = texture->format;
1623    } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1624       format = PIPE_FORMAT_Z32_FLOAT;
1625    } else if (prsrc->shadow_image) {
1626       prsrc = prsrc->shadow_image;
1627       texture = &prsrc->base;
1628       format = texture->format;
1629       is_shadow = true;
1630    }
1631 
1632    so->texture_bo = prsrc->image.data.base;
1633    so->texture_size = prsrc->image.layout.data_size;
1634    so->modifier = prsrc->image.layout.modifier;
1635 
1636    /* MSAA only supported for 2D textures */
1637 
1638    assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D ||
1639           so->base.target == PIPE_TEXTURE_2D_ARRAY);
1640 
1641    enum mali_texture_dimension type =
1642       panfrost_translate_texture_dimension(so->base.target);
1643 
1644    bool is_buffer = (so->base.target == PIPE_BUFFER);
1645 
1646    unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1647    unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1648    unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1649    unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1650    unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1651    unsigned buf_size =
1652       (is_buffer ? so->base.u.buf.size : 0) / util_format_get_blocksize(format);
1653    buf_size = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1654 
1655    if (so->base.target == PIPE_TEXTURE_3D) {
1656       first_layer /= prsrc->image.layout.depth;
1657       last_layer /= prsrc->image.layout.depth;
1658       assert(!first_layer && !last_layer);
1659    }
1660 
1661    struct pan_image_view iview = {
1662       .format = format,
1663       .dim = type,
1664       .first_level = first_level,
1665       .last_level = last_level,
1666       .first_layer = first_layer,
1667       .last_layer = last_layer,
1668       .swizzle =
1669          {
1670             so->base.swizzle_r,
1671             so->base.swizzle_g,
1672             so->base.swizzle_b,
1673             so->base.swizzle_a,
1674          },
1675       .planes = {NULL},
1676       .buf.offset = buf_offset,
1677       .buf.size = buf_size,
1678    };
1679 
1680 #if PAN_ARCH >= 7
1681    /* v7+ doesn't have an _RRRR component order. */
1682    if (util_format_is_depth_or_stencil(format))
1683       GENX(panfrost_texture_swizzle_replicate_x)(&iview);
1684 #endif
1685 #if PAN_ARCH == 7
1686    /* v7 requires AFBC reswizzle */
1687    if (!util_format_is_depth_or_stencil(format) &&
1688        !panfrost_format_is_yuv(format) &&
1689        panfrost_format_supports_afbc(PAN_ARCH, format))
1690       GENX(panfrost_texture_afbc_reswizzle)(&iview);
1691 #endif
1692 
1693    panfrost_set_image_view_planes(&iview, texture);
1694 
1695    unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1696                    GENX(panfrost_estimate_texture_payload_size)(&iview);
1697 
1698    struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1699    struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1700 
1701    if (!payload.cpu) {
1702       mesa_loge("panfrost_create_sampler_view_bo failed");
1703       return;
1704    }
1705 
1706    so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1707 
1708    void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1709 
1710    if (PAN_ARCH <= 5) {
1711       payload.cpu += pan_size(TEXTURE);
1712       payload.gpu += pan_size(TEXTURE);
1713    }
1714 
1715    const struct util_format_description *desc =
1716       util_format_description(format);
1717 
1718    if ((device->debug & PAN_DBG_YUV) && panfrost_format_is_yuv(format) &&
1719        !(is_shadow && panfrost_format_supports_mtk_tiled(format))) {
1720       if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
1721          iview.swizzle[1] = PIPE_SWIZZLE_0;
1722          iview.swizzle[2] = PIPE_SWIZZLE_1;
1723       } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLANAR2) {
1724          iview.swizzle[1] = PIPE_SWIZZLE_0;
1725          iview.swizzle[2] = PIPE_SWIZZLE_0;
1726       }
1727    }
1728 
1729    if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC &&
1730        so->base.astc_decode_format == PIPE_ASTC_DECODE_FORMAT_UNORM8) {
1731       iview.astc.narrow = true;
1732    }
1733 
1734    GENX(panfrost_new_texture)(&iview, tex, &payload);
1735 }
1736 
1737 static void
1738 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1739                              struct pipe_context *pctx)
1740 {
1741    struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1742    if (view->texture_bo != rsrc->image.data.base ||
1743        view->texture_size != rsrc->image.layout.data_size ||
1744        view->modifier != rsrc->image.layout.modifier) {
1745       panfrost_bo_unreference(view->state.bo);
1746       panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1747    }
1748 }
1749 
1750 #if PAN_ARCH >= 6
1751 static void
1752 panfrost_emit_null_texture(struct mali_texture_packed *out)
1753 
1754 {
1755    /* Annoyingly, an all zero texture descriptor is not valid and will raise
1756     * a DATA_INVALID_FAULT if you try to texture it, instead of returning
1757     * 0000s! Fill in with something that will behave robustly.
1758     */
1759    pan_pack(out, TEXTURE, cfg) {
1760       cfg.dimension = MALI_TEXTURE_DIMENSION_2D;
1761       cfg.width = 1;
1762       cfg.height = 1;
1763       cfg.depth = 1;
1764       cfg.array_size = 1;
1765       cfg.format = MALI_PACK_FMT(CONSTANT, 0000, L);
1766 #if PAN_ARCH <= 7
1767       cfg.texel_ordering = MALI_TEXTURE_LAYOUT_LINEAR;
1768 #endif
1769    }
1770 }
1771 #endif
1772 
1773 static uint64_t
1774 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1775                                   enum pipe_shader_type stage)
1776 {
1777    struct panfrost_context *ctx = batch->ctx;
1778 
1779    unsigned actual_count = ctx->sampler_view_count[stage];
1780    unsigned needed_count = ctx->prog[stage]->info.texture_count;
1781    unsigned alloc_count = MAX2(actual_count, needed_count);
1782 
1783    if (!alloc_count)
1784       return 0;
1785 
1786 #if PAN_ARCH >= 6
1787    struct panfrost_ptr T =
1788       pan_pool_alloc_desc_array(&batch->pool.base, alloc_count, TEXTURE);
1789 
1790    if (!T.cpu)
1791       return 0;
1792 
1793    struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
1794 
1795    for (int i = 0; i < actual_count; ++i) {
1796       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1797 
1798       if (!view) {
1799          panfrost_emit_null_texture(&out[i]);
1800          continue;
1801       }
1802 
1803       struct pipe_sampler_view *pview = &view->base;
1804       struct panfrost_resource *rsrc = pan_resource(pview->texture);
1805 
1806       panfrost_update_sampler_view(view, &ctx->base);
1807       out[i] = view->bifrost_descriptor;
1808 
1809       panfrost_batch_read_rsrc(batch, rsrc, stage);
1810       panfrost_batch_add_bo(batch, view->state.bo, stage);
1811    }
1812 
1813    for (int i = actual_count; i < needed_count; ++i)
1814       panfrost_emit_null_texture(&out[i]);
1815 
1816    return T.gpu;
1817 #else
1818    uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1819 
1820    for (int i = 0; i < actual_count; ++i) {
1821       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1822 
1823       if (!view) {
1824          trampolines[i] = 0;
1825          continue;
1826       }
1827 
1828       panfrost_update_sampler_view(view, &ctx->base);
1829 
1830       trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1831    }
1832 
1833    for (int i = actual_count; i < needed_count; ++i)
1834       trampolines[i] = 0;
1835 
1836    return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1837                                   sizeof(uint64_t) * alloc_count,
1838                                   sizeof(uint64_t));
1839 #endif
1840 }
1841 
1842 static uint64_t
1843 panfrost_upload_wa_sampler(struct panfrost_batch *batch)
1844 {
1845    struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER);
1846    pan_cast_and_pack(T.cpu, SAMPLER, cfg)
1847       ;
1848    return T.gpu;
1849 }
1850 
1851 static uint64_t
1852 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1853                                   enum pipe_shader_type stage)
1854 {
1855    struct panfrost_context *ctx = batch->ctx;
1856 
1857    /* We always need at least 1 sampler for txf to work */
1858    if (!ctx->sampler_count[stage])
1859       return panfrost_upload_wa_sampler(batch);
1860 
1861    struct panfrost_ptr T = pan_pool_alloc_desc_array(
1862       &batch->pool.base, ctx->sampler_count[stage], SAMPLER);
1863 
1864    if (!T.cpu)
1865       return 0;
1866 
1867    struct mali_sampler_packed *out = (struct mali_sampler_packed *)T.cpu;
1868 
1869    for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1870       struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1871 
1872       out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1873    }
1874 
1875    return T.gpu;
1876 }
1877 
1878 #if PAN_ARCH <= 7
1879 /* Packs all image attribute descs and attribute buffer descs.
1880  * `first_image_buf_index` must be the index of the first image attribute buffer
1881  * descriptor.
1882  */
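/* For example (hypothetical index), with first_buf = 8, image 0 references
 * buffers 8 and 9 (record plus its 3D continuation) and image 1 references
 * buffers 10 and 11. */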
1883 static void
1884 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1885                    struct mali_attribute_packed *attribs, unsigned first_buf)
1886 {
1887    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1888 
1889    for (unsigned i = 0; i < last_bit; ++i) {
1890       enum pipe_format format = ctx->images[shader][i].format;
1891 
1892       pan_pack(attribs + i, ATTRIBUTE, cfg) {
1893          /* Continuation record means 2 buffers per image */
1894          cfg.buffer_index = first_buf + (i * 2);
1895          cfg.offset_enable = (PAN_ARCH <= 5);
1896          cfg.format = GENX(panfrost_format_from_pipe_format)(format)->hw;
1897       }
1898    }
1899 }
1900 
1901 static enum mali_attribute_type
1902 pan_modifier_to_attr_type(uint64_t modifier)
1903 {
1904    switch (modifier) {
1905    case DRM_FORMAT_MOD_LINEAR:
1906       return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1907    case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1908       return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1909    default:
1910       unreachable("Invalid modifier for attribute record");
1911    }
1912 }
1913 
1914 static void
1915 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1916                 struct mali_attribute_buffer_packed *bufs,
1917                 unsigned first_image_buf_index)
1918 {
1919    struct panfrost_context *ctx = batch->ctx;
1920    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1921 
1922    for (unsigned i = 0; i < last_bit; ++i) {
1923       struct pipe_image_view *image = &ctx->images[shader][i];
1924 
1925       if (!(ctx->image_mask[shader] & (1 << i)) ||
1926           !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1927          /* Unused image bindings */
1928          pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg)
1929             ;
1930          pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg)
1931             ;
1932          continue;
1933       }
1934 
1935       struct panfrost_resource *rsrc = pan_resource(image->resource);
1936 
1937       bool is_msaa = image->resource->nr_samples > 1;
1938 
1939       bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1940       bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1941 
1942       unsigned offset = is_buffer ? image->u.buf.offset
1943                                   : panfrost_texture_offset(
1944                                        &rsrc->image.layout, image->u.tex.level,
1945                                        (is_3d || is_msaa) ? 0 : image->u.tex.first_layer,
1946                                        (is_3d || is_msaa) ? image->u.tex.first_layer : 0);
1947 
1948       panfrost_track_image_access(batch, shader, image);
1949 
1950       pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1951          cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1952          cfg.pointer = rsrc->image.data.base + offset;
1953          cfg.stride = util_format_get_blocksize(image->format);
1954          cfg.size = panfrost_bo_size(rsrc->bo) - offset;
1955       }
1956 
1957       if (is_buffer) {
1958          pan_cast_and_pack(&bufs[(i * 2) + 1], ATTRIBUTE_BUFFER_CONTINUATION_3D,
1959                            cfg) {
1960             cfg.s_dimension =
1961                rsrc->base.width0 / util_format_get_blocksize(image->format);
1962             cfg.t_dimension = cfg.r_dimension = 1;
1963          }
1964 
1965          continue;
1966       }
1967 
1968       pan_cast_and_pack(&bufs[(i * 2) + 1], ATTRIBUTE_BUFFER_CONTINUATION_3D,
1969                         cfg) {
1970          unsigned level = image->u.tex.level;
1971          unsigned samples = rsrc->image.layout.nr_samples;
1972 
1973          cfg.s_dimension = u_minify(rsrc->base.width0, level);
1974          cfg.t_dimension = u_minify(rsrc->base.height0, level);
1975          cfg.r_dimension = is_3d ? u_minify(rsrc->image.layout.depth, level)
1976             : (image->u.tex.last_layer - image->u.tex.first_layer + 1);
1977 
1978          cfg.row_stride = rsrc->image.layout.slices[level].row_stride;
1979          if (cfg.r_dimension > 1) {
1980             cfg.slice_stride =
1981                panfrost_get_layer_stride(&rsrc->image.layout, level);
1982          }
1983 
1984          if (is_msaa) {
1985             if (cfg.r_dimension == 1) {
1986                /* regular multisampled images get the sample index in
1987                   the R dimension */
1988                cfg.r_dimension = samples;
1989                cfg.slice_stride =
1990                   panfrost_get_layer_stride(&rsrc->image.layout, level) / samples;
1991             } else {
1992                /* multisampled image arrays are emulated by making the
1993                   image "samples" times higher than the original image,
1994                   and fixing up the T coordinate by the sample number
1995                   to address the correct sample (on bifrost) */
1996                cfg.t_dimension *= samples;
1997             }
1998          }
1999       }
2000    }
2001 }
2002 
2003 static uint64_t
2004 panfrost_emit_image_attribs(struct panfrost_batch *batch, uint64_t *buffers,
2005                             enum pipe_shader_type type)
2006 {
2007    struct panfrost_context *ctx = batch->ctx;
2008    struct panfrost_compiled_shader *shader = ctx->prog[type];
2009 
2010    if (!shader->info.attribute_count) {
2011       *buffers = 0;
2012       return 0;
2013    }
2014 
2015    /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
2016    unsigned attr_count = shader->info.attribute_count;
2017    unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
2018 
2019    struct panfrost_ptr bufs =
2020       pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
2021 
2022    struct panfrost_ptr attribs =
2023       pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
2024 
2025    emit_image_attribs(ctx, type, attribs.cpu, 0);
2026    emit_image_bufs(batch, type, bufs.cpu, 0);
2027 
2028    /* We need an empty attrib buf to stop the prefetching on Bifrost */
2029 #if PAN_ARCH >= 6
2030    struct mali_attribute_buffer_packed *attrib_bufs = bufs.cpu;
2031 
2032    pan_pack(&attrib_bufs[buf_count - 1], ATTRIBUTE_BUFFER, cfg)
2033       ;
2034 #endif
2035 
2036    *buffers = bufs.gpu;
2037    return attribs.gpu;
2038 }
2039 
2040 static uint64_t
2041 panfrost_emit_vertex_data(struct panfrost_batch *batch, uint64_t *buffers)
2042 {
2043    struct panfrost_context *ctx = batch->ctx;
2044    struct panfrost_vertex_state *so = ctx->vertex;
2045    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2046    bool instanced = ctx->instance_count > 1;
2047    uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
2048    unsigned nr_images = util_last_bit(image_mask);
2049 
2050    /* Worst case: everything is NPOT, which is only possible if instancing
2051     * is enabled. Otherwise a single record is guaranteed.
2052     * Also, we allocate more memory than what's needed here if either instancing
2053     * is enabled or images are present; this can be improved. */
2054    unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
2055    unsigned nr_bufs =
2056       ((so->nr_bufs + nr_images) * bufs_per_attrib) + (PAN_ARCH >= 6 ? 1 : 0);
2057 
2058    unsigned count = vs->info.attribute_count;
2059 
2060    struct panfrost_compiled_shader *xfb =
2061       ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb;
2062 
2063    if (xfb)
2064       count = MAX2(count, xfb->info.attribute_count);
2065 
2066 #if PAN_ARCH <= 5
2067    /* Midgard needs vertexid/instanceid handled specially */
2068    bool special_vbufs = count >= PAN_VERTEX_ID;
2069 
2070    if (special_vbufs)
2071       nr_bufs += 2;
2072 #endif
2073 
2074    if (!nr_bufs) {
2075       *buffers = 0;
2076       return 0;
2077    }
2078 
2079    struct panfrost_ptr S =
2080       pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER);
2081    struct panfrost_ptr T =
2082       pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE);
2083 
2084    struct mali_attribute_buffer_packed *bufs =
2085       (struct mali_attribute_buffer_packed *)S.cpu;
2086 
2087    struct mali_attribute_packed *out = (struct mali_attribute_packed *)T.cpu;
2088 
2089    unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = {0};
2090    unsigned k = 0;
2091 
2092    for (unsigned i = 0; i < so->nr_bufs; ++i) {
2093       unsigned vbi = so->buffers[i].vbi;
2094       unsigned divisor = so->buffers[i].divisor;
2095       attrib_to_buffer[i] = k;
2096 
2097       if (!(ctx->vb_mask & (1 << vbi)))
2098          continue;
2099 
2100       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2101       struct panfrost_resource *rsrc;
2102 
2103       rsrc = pan_resource(buf->buffer.resource);
2104       if (!rsrc)
2105          continue;
2106 
2107       panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2108 
2109       /* Mask off lower bits, see offset fixup below */
2110       uint64_t raw_addr = rsrc->image.data.base + buf->buffer_offset;
2111       uint64_t addr = raw_addr & ~63;
2112 
2113       /* Since we advanced the base pointer, we shrink the buffer
2114        * size, but add the offset we subtracted */
2115       unsigned size =
2116          rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset;
2117 
2118       /* When there is a divisor, the hardware-level divisor is
2119        * the product of the instance divisor and the padded count */
2120       unsigned stride = so->strides[vbi];
2121       unsigned hw_divisor = ctx->padded_count * divisor;
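      /* Illustrative numbers: an instance divisor of 3 with a padded count of
       * 4 gives hw_divisor = 12, which is not a power of two, so the NPOT
       * path below emits a magic-divisor continuation record. */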
2122 
2123       if (ctx->instance_count <= 1) {
2124          /* Per-instance would be every attribute equal */
2125          if (divisor)
2126             stride = 0;
2127 
2128          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2129             cfg.pointer = addr;
2130             cfg.stride = stride;
2131             cfg.size = size;
2132          }
2133       } else if (!divisor) {
2134          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2135             cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
2136             cfg.pointer = addr;
2137             cfg.stride = stride;
2138             cfg.size = size;
2139             cfg.divisor = ctx->padded_count;
2140          }
2141       } else if (util_is_power_of_two_or_zero(hw_divisor)) {
2142          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2143             cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
2144             cfg.pointer = addr;
2145             cfg.stride = stride;
2146             cfg.size = size;
2147             cfg.divisor_r = __builtin_ctz(hw_divisor);
2148          }
2149 
2150       } else {
2151          unsigned shift = 0, extra_flags = 0;
2152 
2153          unsigned magic_divisor =
2154             panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
2155 
2156          /* Records with continuations must be aligned */
2157          k = ALIGN_POT(k, 2);
2158          attrib_to_buffer[i] = k;
2159 
2160          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2161             cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
2162             cfg.pointer = addr;
2163             cfg.stride = stride;
2164             cfg.size = size;
2165 
2166             cfg.divisor_r = shift;
2167             cfg.divisor_e = extra_flags;
2168          }
2169 
2170          pan_cast_and_pack(&bufs[k + 1], ATTRIBUTE_BUFFER_CONTINUATION_NPOT,
2171                            cfg) {
2172             cfg.divisor_numerator = magic_divisor;
2173             cfg.divisor = divisor;
2174          }
2175 
2176          ++k;
2177       }
2178 
2179       ++k;
2180    }
2181 
2182 #if PAN_ARCH <= 5
2183    /* Add special gl_VertexID/gl_InstanceID buffers */
2184    if (special_vbufs) {
2185       panfrost_vertex_id(ctx->padded_count,
2186                          (struct mali_attribute_vertex_id_packed *)&bufs[k],
2187                          ctx->instance_count > 1);
2188 
2189       pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2190          cfg.buffer_index = k++;
2191          cfg.format = so->formats[PAN_VERTEX_ID];
2192       }
2193 
2194       panfrost_instance_id(ctx->padded_count,
2195                            (struct mali_attribute_instance_id_packed *)&bufs[k],
2196                            ctx->instance_count > 1);
2197 
2198       pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2199          cfg.buffer_index = k++;
2200          cfg.format = so->formats[PAN_INSTANCE_ID];
2201       }
2202    }
2203 #endif
2204 
2205    if (nr_images) {
2206       k = ALIGN_POT(k, 2);
2207       emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2208       emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2209       k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2210    }
2211 
2212 #if PAN_ARCH >= 6
2213    /* We need an empty attrib buf to stop the prefetching on Bifrost */
2214    pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg)
2215       ;
2216 #endif
2217 
2218    /* Attribute addresses require 64-byte alignment, so let:
2219     *
2220     *      base' = base & ~63 = base - (base & 63)
2221     *      offset' = offset + (base & 63)
2222     *
2223     * Since base' + offset' = base + offset, these are equivalent
2224     * addressing modes and now base is 64 aligned.
2225     */
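   /* Illustrative: base = 0x1007 gives base & 63 = 7, so base' = 0x1000 and
    * the attribute offset grows by 7; the summed address is unchanged. */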
2226 
2227    /* While these are usually equal, they are not required to be. In some
2228     * cases, u_blitter passes too high a value for num_elements.
2229     */
2230    assert(vs->info.attributes_read_count <= so->num_elements);
2231 
2232    for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2233       unsigned vbi = so->pipe[i].vertex_buffer_index;
2234       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2235 
2236       /* BOs are aligned; just fixup for buffer_offset */
2237       signed src_offset = so->pipe[i].src_offset;
2238       src_offset += (buf->buffer_offset & 63);
2239 
2240       /* Base instance offset */
2241       if (ctx->base_instance && so->pipe[i].instance_divisor) {
2242          src_offset += (ctx->base_instance * so->pipe[i].src_stride) /
2243                        so->pipe[i].instance_divisor;
2244       }
2245 
2246       /* Also, somewhat obscurely per-instance data needs to be
2247        * offset in response to a delayed start in an indexed draw */
2248 
2249       if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2250          src_offset -= so->pipe[i].src_stride * ctx->offset_start;
2251 
2252       pan_pack(out + i, ATTRIBUTE, cfg) {
2253          cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2254          cfg.format = so->formats[i];
2255          cfg.offset = src_offset;
2256       }
2257    }
2258 
2259    *buffers = S.gpu;
2260    return T.gpu;
2261 }
2262 
2263 static uint64_t
2264 panfrost_emit_varyings(struct panfrost_batch *batch,
2265                        struct mali_attribute_buffer_packed *slot,
2266                        unsigned stride, unsigned count)
2267 {
2268    unsigned size = stride * count;
2269    uint64_t ptr =
2270       pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2271 
2272    pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2273       cfg.stride = stride;
2274       cfg.size = size;
2275       cfg.pointer = ptr;
2276    }
2277 
2278    return ptr;
2279 }
2280 
2281 /* Given a varying, figure out which index it corresponds to */
2282 
2283 static inline unsigned
2284 pan_varying_index(unsigned present, enum pan_special_varying v)
2285 {
2286    return util_bitcount(present & BITFIELD_MASK(v));
2287 }
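/* For example, if only GENERAL, POSITION and PSIZ are present (assuming that
 * enum ordering), pan_varying_index(present, PAN_VARY_PSIZ) counts the two
 * lower set bits and returns buffer index 2. */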
2288 
2289 /* Determines which varying buffers are required */
2290 
2291 static inline unsigned
2292 pan_varying_present(const struct panfrost_device *dev,
2293                     struct pan_shader_info *producer,
2294                     struct pan_shader_info *consumer, uint16_t point_coord_mask)
2295 {
2296    /* At the moment we always emit general and position buffers. Not
2297     * strictly necessary but usually harmless */
2298 
2299    unsigned present =
2300       BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2301 
2302    /* Enable special buffers by the shader info */
2303 
2304    if (producer->vs.writes_point_size)
2305       present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2306 
2307 #if PAN_ARCH <= 5
2308    /* On Midgard, these exist as real varyings. Later architectures use
2309     * LD_VAR_SPECIAL reads instead. */
2310 
2311    if (consumer->fs.reads_point_coord)
2312       present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2313 
2314    if (consumer->fs.reads_face)
2315       present |= BITFIELD_BIT(PAN_VARY_FACE);
2316 
2317    if (consumer->fs.reads_frag_coord)
2318       present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2319 
2320    /* Also, if we have a point sprite, we need a point coord buffer */
2321 
2322    for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
2323       gl_varying_slot loc = consumer->varyings.input[i].location;
2324 
2325       if (util_varying_is_point_coord(loc, point_coord_mask))
2326          present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2327    }
2328 #endif
2329 
2330    return present;
2331 }
2332 
2333 /* Emitters for varying records */
2334 
2335 static void
2336 pan_emit_vary(const struct panfrost_device *dev,
2337               struct mali_attribute_packed *out, unsigned buffer_index,
2338               mali_pixel_format format, unsigned offset)
2339 {
2340    pan_pack(out, ATTRIBUTE, cfg) {
2341       cfg.buffer_index = buffer_index;
2342       cfg.offset_enable = (PAN_ARCH <= 5);
2343       cfg.format = format;
2344       cfg.offset = offset;
2345    }
2346 }
2347 
2348 /* Special records */
2349 
2350 /* clang-format off */
2351 static const struct {
2352    unsigned components;
2353    enum mali_format format;
2354 } pan_varying_formats[PAN_VARY_MAX] = {
2355    [PAN_VARY_POSITION]  = { 4, MALI_SNAP_4   },
2356    [PAN_VARY_PSIZ]      = { 1, MALI_R16F     },
2357    [PAN_VARY_PNTCOORD]  = { 4, MALI_RGBA32F  },
2358    [PAN_VARY_FACE]      = { 1, MALI_R32I     },
2359    [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F  },
2360 };
2361 /* clang-format on */
2362 
2363 static mali_pixel_format
2364 pan_special_format(const struct panfrost_device *dev,
2365                    enum pan_special_varying buf)
2366 {
2367    assert(buf < PAN_VARY_MAX);
2368    mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2369 
2370 #if PAN_ARCH <= 6
2371    unsigned nr = pan_varying_formats[buf].components;
2372    format |= panfrost_get_default_swizzle(nr);
2373 #endif
2374 
2375    return format;
2376 }
2377 
2378 static void
2379 pan_emit_vary_special(const struct panfrost_device *dev,
2380                       struct mali_attribute_packed *out, unsigned present,
2381                       enum pan_special_varying buf)
2382 {
2383    pan_emit_vary(dev, out, pan_varying_index(present, buf),
2384                  pan_special_format(dev, buf), 0);
2385 }
2386 
2387 /* Negative indicates a varying is not found */
2388 
2389 static signed
2390 pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count,
2391               unsigned loc)
2392 {
2393    for (unsigned i = 0; i < vary_count; ++i) {
2394       if (vary[i].location == loc)
2395          return i;
2396    }
2397 
2398    return -1;
2399 }
2400 
2401 /* Assign varying locations for the general buffer. Returns the calculated
2402  * per-vertex stride, and outputs offsets into the passed array. Negative
2403  * offset indicates a varying is not used. */
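/* Illustrative layout: a vec4 varying followed by a float varying, both read
 * by the consumer, get offsets 0 and 16 and a stride of 20 bytes; an output
 * the consumer never reads gets offset -1 and contributes nothing. */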
2404 
2405 static unsigned
2406 pan_assign_varyings(const struct panfrost_device *dev,
2407                     struct pan_shader_info *producer,
2408                     struct pan_shader_info *consumer, signed *offsets)
2409 {
2410    unsigned producer_count = producer->varyings.output_count;
2411    unsigned consumer_count = consumer->varyings.input_count;
2412 
2413    const struct pan_shader_varying *producer_vars = producer->varyings.output;
2414    const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2415 
2416    unsigned stride = 0;
2417 
2418    for (unsigned i = 0; i < producer_count; ++i) {
2419       signed loc = pan_find_vary(consumer_vars, consumer_count,
2420                                  producer_vars[i].location);
2421       enum pipe_format format =
2422          loc >= 0 ? consumer_vars[loc].format : PIPE_FORMAT_NONE;
2423 
2424       if (format != PIPE_FORMAT_NONE) {
2425          offsets[i] = stride;
2426          stride += util_format_get_blocksize(format);
2427       } else {
2428          offsets[i] = -1;
2429       }
2430    }
2431 
2432    return stride;
2433 }
2434 
2435 /* Emitter for a single varying (attribute) descriptor */
2436 
2437 static void
2438 panfrost_emit_varying(const struct panfrost_device *dev,
2439                       struct mali_attribute_packed *out,
2440                       const struct pan_shader_varying varying,
2441                       enum pipe_format pipe_format, unsigned present,
2442                       uint16_t point_sprite_mask, signed offset,
2443                       enum pan_special_varying pos_varying)
2444 {
2445    /* Note: varying.format != pipe_format in some obscure cases due to a
2446     * limitation of the NIR linker. This should be fixed in the future to
2447     * eliminate the additional lookups. See:
2448     * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2449     */
2450    gl_varying_slot loc = varying.location;
2451    mali_pixel_format format =
2452       GENX(panfrost_format_from_pipe_format)(pipe_format)->hw;
2453 
2454    if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2455       pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2456    } else if (loc == VARYING_SLOT_POS) {
2457       pan_emit_vary_special(dev, out, present, pos_varying);
2458    } else if (loc == VARYING_SLOT_PSIZ) {
2459       pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2460    } else if (loc == VARYING_SLOT_FACE) {
2461       pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2462    } else if (offset < 0) {
2463       pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2464    } else {
2465       STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2466       pan_emit_vary(dev, out, 0, format, offset);
2467    }
2468 }
2469 
2470 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2471  * rather than draw time (under good conditions). */
2472 
2473 static void
2474 panfrost_emit_varying_descs(struct panfrost_pool *pool,
2475                             struct panfrost_compiled_shader *producer,
2476                             struct panfrost_compiled_shader *consumer,
2477                             uint16_t point_coord_mask, struct pan_linkage *out)
2478 {
2479    struct panfrost_device *dev = pool->dev;
2480    unsigned producer_count = producer->info.varyings.output_count;
2481    unsigned consumer_count = consumer->info.varyings.input_count;
2482 
2483    /* Offsets within the general varying buffer, indexed by location */
2484    signed offsets[PAN_MAX_VARYINGS];
2485    assert(producer_count <= ARRAY_SIZE(offsets));
2486    assert(consumer_count <= ARRAY_SIZE(offsets));
2487 
2488    /* Allocate enough descriptors for both shader stages */
2489    struct panfrost_ptr T = pan_pool_alloc_desc_array(
2490       &pool->base, producer_count + consumer_count, ATTRIBUTE);
2491 
2492    /* Take a reference if we're being put on the CSO */
2493    if (!pool->owned) {
2494       out->bo = pool->transient_bo;
2495       panfrost_bo_reference(out->bo);
2496    }
2497 
2498    struct mali_attribute_packed *descs = T.cpu;
2499    out->producer = producer_count ? T.gpu : 0;
2500    out->consumer =
2501       consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0;
2502 
2503    /* Lay out the varyings. Must use producer to lay out, in order to
2504     * respect transform feedback precisions. */
2505    out->present = pan_varying_present(dev, &producer->info, &consumer->info,
2506                                       point_coord_mask);
2507 
2508    out->stride =
2509       pan_assign_varyings(dev, &producer->info, &consumer->info, offsets);
2510 
2511    for (unsigned i = 0; i < producer_count; ++i) {
2512       signed j = pan_find_vary(consumer->info.varyings.input,
2513                                consumer->info.varyings.input_count,
2514                                producer->info.varyings.output[i].location);
2515 
2516       enum pipe_format format = (j >= 0)
2517                                    ? consumer->info.varyings.input[j].format
2518                                    : producer->info.varyings.output[i].format;
2519 
2520       panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i],
2521                             format, out->present, 0, offsets[i],
2522                             PAN_VARY_POSITION);
2523    }
2524 
2525    for (unsigned i = 0; i < consumer_count; ++i) {
2526       signed j = pan_find_vary(producer->info.varyings.output,
2527                                producer->info.varyings.output_count,
2528                                consumer->info.varyings.input[i].location);
2529 
2530       signed offset = (j >= 0) ? offsets[j] : -1;
2531 
2532       panfrost_emit_varying(
2533          dev, descs + producer_count + i, consumer->info.varyings.input[i],
2534          consumer->info.varyings.input[i].format, out->present,
2535          point_coord_mask, offset, PAN_VARY_FRAGCOORD);
2536    }
2537 }
2538 
2539 #if PAN_ARCH <= 5
2540 static void
2541 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2542                        unsigned present, enum pan_special_varying v,
2543                        unsigned special)
2544 {
2545    if (present & BITFIELD_BIT(v)) {
2546       unsigned idx = pan_varying_index(present, v);
2547 
2548       pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2549          cfg.special = special;
2550          cfg.type = 0;
2551       }
2552    }
2553 }
2554 #endif
2555 
2556 static void
2557 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2558                                  unsigned vertex_count,
2559                                  bool point_coord_replace)
2560 {
2561    struct panfrost_context *ctx = batch->ctx;
2562    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2563    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
2564 
2565    uint16_t point_coord_mask = 0;
2566 
2567    memset(&batch->varyings, 0, sizeof(batch->varyings));
2568 
2569 #if PAN_ARCH <= 5
2570    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2571 
2572    /* Point sprites are lowered on Bifrost and newer */
2573    if (point_coord_replace)
2574       point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2575 #endif
2576 
2577    /* In good conditions, we only need to link varyings once */
2578    bool prelink =
2579       (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable;
2580 
2581    /* Try to reduce copies */
2582    struct pan_linkage _linkage;
2583    struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2584 
2585    /* Emit ATTRIBUTE descriptors if needed */
2586    if (!prelink || vs->linkage.bo == NULL) {
2587       struct panfrost_pool *pool = prelink ? &ctx->descs : &batch->pool;
2588 
2589       panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2590    }
2591 
2592    unsigned present = linkage->present, stride = linkage->stride;
2593    unsigned count = util_bitcount(present);
2594    struct panfrost_ptr T =
2595       pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER);
2596 
2597    if (!T.cpu) {
2598       mesa_loge("panfrost_emit_varying_descriptor failed");
2599       return;
2600    }
2601 
2602    struct mali_attribute_buffer_packed *varyings =
2603       (struct mali_attribute_buffer_packed *)T.cpu;
2604 
2605    batch->varyings.nr_bufs = count;
2606 
2607 #if PAN_ARCH >= 6
2608    /* Suppress prefetch on Bifrost */
2609    memset(varyings + count, 0, sizeof(*varyings));
2610 #endif
2611 
2612    if (stride) {
2613       panfrost_emit_varyings(
2614          batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride,
2615          vertex_count);
2616    } else {
2617       /* The indirect draw code reads the stride field, make sure
2618        * that it is initialised */
2619       memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2620              sizeof(*varyings));
2621    }
2622 
2623    /* fp32 vec4 gl_Position */
2624    batch->varyings.pos = panfrost_emit_varyings(
2625       batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2626       sizeof(float) * 4, vertex_count);
2627 
2628    if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2629       batch->varyings.psiz = panfrost_emit_varyings(
2630          batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2,
2631          vertex_count);
2632    }
2633 
2634 #if PAN_ARCH <= 5
2635    pan_emit_special_input(
2636       varyings, present, PAN_VARY_PNTCOORD,
2637       (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2638          ? MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y
2639          : MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y);
2640    pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2641                           MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2642    pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2643                           MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2644 #endif
2645 
2646    batch->varyings.bufs = T.gpu;
2647    batch->varyings.vs = linkage->producer;
2648    batch->varyings.fs = linkage->consumer;
2649 }
2650 #endif
2651 
2652 static struct pan_tls_info
2653 get_tls_info(struct panfrost_device *dev, struct panfrost_batch *batch)
2654 {
2655    struct panfrost_bo *tls_bo = NULL;
2656    if (batch->stack_size) {
2657       tls_bo = panfrost_batch_get_scratchpad(batch, batch->stack_size,
2658                                              dev->thread_tls_alloc,
2659                                              dev->core_id_range);
2660       if (!tls_bo)
2661          mesa_loge("failed to allocate scratch-pad memory for stack");
2662    }
2663 
2664    return (struct pan_tls_info) {
2665       .tls =
2666          {
2667             .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2668             .size = batch->stack_size,
2669          },
2670    };
2671 }
2672 
2673 
2674 static void
2675 emit_tls(struct panfrost_batch *batch)
2676 {
2677    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2678 
2679    /* Emitted with the FB descriptor on Midgard. */
2680    if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2681       return;
2682 
2683    struct pan_tls_info tls = get_tls_info(dev, batch);
2684 
2685    assert(batch->tls.cpu);
2686    GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2687 }
2688 
2689 static void
2690 emit_fbd(struct panfrost_batch *batch, struct pan_fb_info *fb)
2691 {
2692    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2693 
2694    struct pan_tls_info tls = get_tls_info(dev, batch);
2695 
2696 #if PAN_ARCH >= 6
2697    fb->sample_positions =
2698       dev->sample_positions->ptr.gpu +
2699       panfrost_sample_positions_offset(pan_sample_pattern(fb->nr_samples));
2700 #endif
2701 
2702    JOBX(emit_fbds)(batch, fb, &tls);
2703 }
2704 
2705 /* Mark a surface as written */
2706 
2707 static void
2708 panfrost_initialize_surface(struct panfrost_batch *batch,
2709                             struct pipe_surface *surf)
2710 {
2711    if (surf) {
2712       struct panfrost_resource *rsrc = pan_resource(surf->texture);
2713       BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2714       if (rsrc->separate_stencil)
2715          BITSET_SET(rsrc->separate_stencil->valid.data, surf->u.tex.level);
2716       if (rsrc->shadow_image)
2717          BITSET_SET(rsrc->shadow_image->valid.data, surf->u.tex.level);
2718    }
2719 }
2720 
2721 /* Generate a fragment job. This should be called once per frame. (Usually,
2722  * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2723  */
2724 static void
2725 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2726 {
2727    /* Mark the affected buffers as initialized, since we're writing to it.
2728     * Also, add the surfaces we're writing to to the batch */
2729 
2730    struct pipe_framebuffer_state *fb = &batch->key;
2731 
2732    for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2733       panfrost_initialize_surface(batch, fb->cbufs[i]);
2734 
2735    panfrost_initialize_surface(batch, fb->zsbuf);
2736 
2737    /* The passed tile coords can be out of range in some cases, so we need
2738     * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2739     * Theoretically we also need to clamp the coordinates positive, but we
2740     * avoid that edge case as all four values are unsigned. Also,
2741     * theoretically we could clamp the minima, but if that has to happen
2742     * the asserts would fail anyway (since the maxima would get clamped
2743     * and then be smaller than the minima). An edge case of sorts occurs
2744     * when no scissors are added to draw, so by default min=~0 and max=0.
2745     * But that can't happen if any actual drawing occurs (beyond a
2746     * wallpaper reload), so this is again irrelevant in practice. */
2747 
2748    batch->maxx = MIN2(batch->maxx, fb->width);
2749    batch->maxy = MIN2(batch->maxy, fb->height);
2750 
2751    /* Rendering region must be at least 1x1; otherwise, there is nothing
2752     * to do and the whole job chain should have been discarded. */
2753 
2754    assert(batch->maxx > batch->minx);
2755    assert(batch->maxy > batch->miny);
2756 
2757    JOBX(emit_fragment_job)(batch, pfb);
2758 }
2759 
2760 /* Count generated primitives (when there is no geom/tess shaders) for
2761  * transform feedback */
2762 
2763 static void
2764 panfrost_statistics_record(struct panfrost_context *ctx,
2765                            const struct pipe_draw_info *info,
2766                            const struct pipe_draw_start_count_bias *draw)
2767 {
2768    if (!ctx->active_queries)
2769       return;
2770 
2771    uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2772    ctx->prims_generated += prims;
2773 
2774    if (!ctx->streamout.num_targets)
2775       return;
2776 
2777    ctx->tf_prims_generated += prims;
2778    ctx->dirty |= PAN_DIRTY_SO;
2779 }
2780 
2781 static void
2782 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2783 {
2784    unsigned count =
2785       u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2786 
2787    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2788       if (!ctx->streamout.targets[i])
2789          continue;
2790 
2791       pan_so_target(ctx->streamout.targets[i])->offset += count;
2792    }
2793 }
2794 
2795 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2796  * 3D state. In particular, it groups the fragment shader descriptor with
2797  * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2798  * of state are dirty tracked independently for the benefit of newer GPUs that
2799  * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2800  * dirty flags that trigger re-emits of the fragment RSD.
2801  *
2802  * Obscurely, occlusion queries are included. Occlusion query state is nominally
2803  * specified in the draw call descriptor, but must be considered when determining
2804  * early-Z state which is part of the RSD.
2805  */
2806 #define FRAGMENT_RSD_DIRTY_MASK                                                \
2807    (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER |   \
2808     PAN_DIRTY_OQ)
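/* Illustrative sketch (not code from this driver) of how these dirty bits get
 * set: a Gallium bind callback for any of the aggregated CSOs would flag the
 * corresponding PAN_DIRTY_* bit, e.g.
 *
 *    static void
 *    example_bind_blend_state(struct pipe_context *pctx, void *cso)
 *    {
 *       struct panfrost_context *ctx = pan_context(pctx);
 *       ctx->blend = cso;                  // hypothetical field name
 *       ctx->dirty |= PAN_DIRTY_BLEND;
 *    }
 *
 * so the next draw sees (dirty_3d & FRAGMENT_RSD_DIRTY_MASK) != 0 and
 * re-emits the fragment RSD in panfrost_update_shader_state().
 */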
2809 
2810 static inline void
2811 panfrost_update_shader_state(struct panfrost_batch *batch,
2812                              enum pipe_shader_type st)
2813 {
2814    struct panfrost_context *ctx = batch->ctx;
2815    struct panfrost_compiled_shader *ss = ctx->prog[st];
2816 
2817    bool frag = (st == PIPE_SHADER_FRAGMENT);
2818    unsigned dirty_3d = ctx->dirty;
2819    unsigned dirty = ctx->dirty_shader[st];
2820 
2821    if (dirty & (PAN_DIRTY_STAGE_TEXTURE | PAN_DIRTY_STAGE_SHADER)) {
2822       batch->textures[st] = panfrost_emit_texture_descriptors(batch, st);
2823    }
2824 
2825    if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2826       batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st);
2827    }
2828 
2829    /* On Bifrost and older, the fragment shader descriptor is fused
2830     * together with the renderer state; the combined renderer state
2831     * descriptor is emitted below. Otherwise, the shader descriptor is
2832     * standalone and is emitted here.
2833     */
2834    if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2835       batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2836    }
2837 
2838 #if PAN_ARCH >= 9
2839    if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2840       batch->images[st] =
2841          ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0;
2842    }
2843 
2844    if (dirty & PAN_DIRTY_STAGE_SSBO)
2845       batch->ssbos[st] = panfrost_emit_ssbos(batch, st);
2846 #endif
2847 
2848    if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2849       batch->uniform_buffers[st] = panfrost_emit_const_buf(
2850          batch, st, &batch->nr_uniform_buffers[st], &batch->push_uniforms[st],
2851          &batch->nr_push_uniforms[st]);
2852    }
2853 
2854 #if PAN_ARCH <= 7
2855    /* On Bifrost and older, if the fragment shader changes OR any renderer
2856     * state specified alongside the fragment shader changes, the whole renderer
2857     * state descriptor is dirtied and must be re-emitted.
2858     */
2859    if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
2860                 (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {
2861 
2862       batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2863    }
2864 
2865    /* Vertex shaders need to mix vertex data and image descriptors in the
2866     * attribute array. This is taken care of in panfrost_update_state_3d().
2867     */
2868    if (st != PIPE_SHADER_VERTEX && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
2869       batch->attribs[st] =
2870          panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st);
2871    }
2872 #endif
2873 }
2874 
2875 static inline void
2876 panfrost_update_state_3d(struct panfrost_batch *batch)
2877 {
2878    struct panfrost_context *ctx = batch->ctx;
2879    unsigned dirty = ctx->dirty;
2880 
2881    if (dirty & PAN_DIRTY_TLS_SIZE)
2882       panfrost_batch_adjust_stack_size(batch);
2883 
2884    if (dirty & PAN_DIRTY_BLEND)
2885       panfrost_set_batch_masks_blend(batch);
2886 
2887    if (dirty & PAN_DIRTY_ZS)
2888       panfrost_set_batch_masks_zs(batch);
2889 
2890 #if PAN_ARCH >= 9
2891    if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
2892        (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
2893       batch->depth_stencil = panfrost_emit_depth_stencil(batch);
2894 
2895    if (dirty & PAN_DIRTY_BLEND)
2896       batch->blend = panfrost_emit_blend_valhall(batch);
2897 
2898    if (dirty & PAN_DIRTY_VERTEX) {
2899       batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch);
2900 
2901       batch->attrib_bufs[PIPE_SHADER_VERTEX] =
2902          panfrost_emit_vertex_buffers(batch);
2903    }
2904 #else
2905    unsigned vt_shader_dirty = ctx->dirty_shader[PIPE_SHADER_VERTEX];
2906    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2907    struct panfrost_vertex_state *vstate = ctx->vertex;
2908    bool attr_offsetted_by_instance_base =
2909       vstate->attr_depends_on_base_instance_mask &
2910       BITFIELD_MASK(vs->info.attributes_read_count);
2911 
2912    /* Vertex data, vertex shader and images accessed by the vertex shader have
2913     * an impact on the attributes array, we need to re-emit anytime one of these
2914     * parameters changes. */
2915    if ((dirty & PAN_DIRTY_VERTEX) ||
2916        (vt_shader_dirty & (PAN_DIRTY_STAGE_IMAGE | PAN_DIRTY_STAGE_SHADER)) ||
2917        attr_offsetted_by_instance_base) {
2918       batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(
2919          batch, &batch->attrib_bufs[PIPE_SHADER_VERTEX]);
2920    }
2921 #endif
2922 }
2923 
2924 static void
2925 panfrost_launch_xfb(struct panfrost_batch *batch,
2926                     const struct pipe_draw_info *info, unsigned count)
2927 {
2928    struct panfrost_context *ctx = batch->ctx;
2929 
2930    /* Nothing to do */
2931    if (batch->ctx->streamout.num_targets == 0)
2932       return;
2933 
2934    /* TODO: XFB with index buffers */
2935    // assert(info->index_size == 0);
2936 
2937    if (!u_trim_pipe_prim(info->mode, &count))
2938       return;
2939 
2940    perf_debug(batch->ctx, "Emulating transform feedback");
2941 
2942    struct panfrost_uncompiled_shader *vs_uncompiled =
2943       ctx->uncompiled[PIPE_SHADER_VERTEX];
2944    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2945 
2946    vs_uncompiled->xfb->stream_output = vs->stream_output;
2947 
2948    uint64_t saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
2949    uint64_t saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
2950    uint64_t saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
2951    unsigned saved_nr_push_uniforms =
2952       batch->nr_push_uniforms[PIPE_SHADER_VERTEX];
2953 
2954    ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */
2955    ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb;
2956    batch->rsd[PIPE_SHADER_VERTEX] =
2957       panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
2958 
2959    batch->uniform_buffers[PIPE_SHADER_VERTEX] =
2960       panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
2961                               &batch->push_uniforms[PIPE_SHADER_VERTEX],
2962                               &batch->nr_push_uniforms[PIPE_SHADER_VERTEX]);
2963 
2964    JOBX(launch_xfb)(batch, info, count);
2965    batch->compute_count++;
2966 
2967    ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled;
2968    ctx->prog[PIPE_SHADER_VERTEX] = vs;
2969    batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
2970    batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
2971    batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
2972    batch->nr_push_uniforms[PIPE_SHADER_VERTEX] = saved_nr_push_uniforms;
2973 }
2974 
2975 /*
2976  * Increase the vertex count on the batch using a saturating add, and hope the
2977  * compiler can use the machine instruction here...
2978  */
2979 static inline void
2980 panfrost_increase_vertex_count(struct panfrost_batch *batch, uint32_t increment)
2981 {
2982    uint32_t sum = batch->vertex_count + increment;
2983 
2984    if (sum >= batch->vertex_count)
2985       batch->vertex_count = sum;
2986    else
2987       batch->vertex_count = UINT32_MAX;
2988 
2989 #if PAN_ARCH <= 5
2990    batch->tiler_ctx.midgard.vertex_count = batch->vertex_count;
2991 #endif
2992 }
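/* A minimal standalone sketch of the saturating add used above, assuming
 * plain ISO C (compilers can often lower this pattern to a branchless or
 * saturating-add sequence):
 *
 *    static inline uint32_t
 *    sat_add_u32(uint32_t a, uint32_t b)
 *    {
 *       uint32_t sum = a + b;              // unsigned wrap is well-defined
 *       return sum < a ? UINT32_MAX : sum; // clamp on overflow
 *    }
 */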
2993 
2994 /*
2995  * If we change whether we're drawing points, or whether point sprites are
2996  * enabled (specified in the rasterizer), we may need to rebind shaders
2997  * accordingly. This implicitly covers the case of rebinding framebuffers,
2998  * because all dirty flags are set there.
2999  */
3000 static void
3001 panfrost_update_active_prim(struct panfrost_context *ctx,
3002                             const struct pipe_draw_info *info)
3003 {
3004    const enum mesa_prim prev_prim = u_reduced_prim(ctx->active_prim);
3005    const enum mesa_prim new_prim = u_reduced_prim(info->mode);
3006 
3007    ctx->active_prim = info->mode;
3008 
3009    if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
3010        (prev_prim != new_prim)) {
3011       panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
3012    }
3013 }
3014 
3015 static unsigned
3016 panfrost_draw_get_vertex_count(struct panfrost_batch *batch,
3017                                const struct pipe_draw_info *info,
3018                                const struct pipe_draw_start_count_bias *draw,
3019                                bool idvs)
3020 {
3021    struct panfrost_context *ctx = batch->ctx;
3022    unsigned vertex_count = ctx->vertex_count;
3023    unsigned min_index = 0, max_index = 0;
3024 
3025    batch->indices = 0;
3026    if (info->index_size && PAN_ARCH >= 9) {
3027       batch->indices = panfrost_get_index_buffer(batch, info, draw);
3028 
3029       /* Use index count to estimate vertex count */
3030       panfrost_increase_vertex_count(batch, draw->count);
3031    } else if (info->index_size) {
3032       batch->indices = panfrost_get_index_buffer_bounded(
3033          batch, info, draw, &min_index, &max_index);
3034 
3035       /* Use the corresponding values */
3036       vertex_count = max_index - min_index + 1;
3037       ctx->offset_start = min_index + draw->index_bias;
3038       panfrost_increase_vertex_count(batch, vertex_count);
3039    } else {
3040       ctx->offset_start = draw->start;
3041       panfrost_increase_vertex_count(batch, vertex_count);
3042    }
3043 
3044    if (PAN_ARCH <= 9 && info->instance_count > 1) {
3045       unsigned count = vertex_count;
3046 
3047       /* Index-Driven Vertex Shading requires different instances to
3048        * have different cache lines for position results. Each vertex
3049        * position is 16 bytes and the Mali cache line is 64 bytes, so
3050        * the instance count must be aligned to 4 vertices.
3051        */
3052       if (idvs)
3053          count = ALIGN_POT(count, 4);
3054 
3055       ctx->padded_count = panfrost_padded_vertex_count(count);
3056    } else {
3057       ctx->padded_count = vertex_count;
3058    }
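   /* Worked example of the alignment above (illustrative): with IDVS,
    * instancing and vertex_count = 5, ALIGN_POT(5, 4) = 8, so each instance's
    * position outputs start on a fresh 64-byte cache line (4 * 16 bytes);
    * panfrost_padded_vertex_count() may then pad further per the hardware's
    * instancing rules. */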
3059 
3060    return vertex_count;
3061 }
3062 
3063 static void
3064 panfrost_single_draw_direct(struct panfrost_batch *batch,
3065                             const struct pipe_draw_info *info,
3066                             unsigned drawid_offset,
3067                             const struct pipe_draw_start_count_bias *draw)
3068 {
3069    if (!draw->count || !info->instance_count)
3070       return;
3071 
3072    struct panfrost_context *ctx = batch->ctx;
3073 
3074    panfrost_update_active_prim(ctx, info);
3075 
3076    /* Take into account a negative bias */
3077    ctx->vertex_count =
3078       draw->count + (info->index_size ? abs(draw->index_bias) : 0);
3079    ctx->instance_count = info->instance_count;
3080    ctx->base_vertex = info->index_size ? draw->index_bias : 0;
3081    ctx->base_instance = info->start_instance;
3082    ctx->drawid = drawid_offset;
3083 
3084    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
3085    bool idvs = vs->info.vs.idvs;
3086 
3087    UNUSED unsigned vertex_count =
3088       panfrost_draw_get_vertex_count(batch, info, draw, idvs);
3089 
3090    panfrost_statistics_record(ctx, info, draw);
3091 
3092    panfrost_update_state_3d(batch);
3093    panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3094    panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3095    panfrost_clean_state_3d(ctx);
3096 
3097    if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) {
3098       panfrost_launch_xfb(batch, info, draw->count);
3099    }
3100 
3101    /* Increment transform feedback offsets */
3102    panfrost_update_streamout_offsets(ctx);
3103 
3104    /* Any side effects must be handled by the XFB shader, so we only need
3105     * to run vertex shaders if we need rasterization.
3106     */
3107    if (panfrost_batch_skip_rasterization(batch))
3108       return;
3109 
3110 #if PAN_ARCH <= 7
3111    /* Emit all sorts of descriptors. */
3112    panfrost_emit_varying_descriptor(batch,
3113                                     ctx->padded_count * ctx->instance_count,
3114                                     info->mode == MESA_PRIM_POINTS);
3115 #endif
3116 
3117    JOBX(launch_draw)(batch, info, drawid_offset, draw, vertex_count);
3118    batch->draw_count++;
3119 }
3120 
3121 static bool
3122 panfrost_compatible_batch_state(struct panfrost_batch *batch,
3123                                 enum mesa_prim reduced_prim)
3124 {
3125    struct panfrost_context *ctx = batch->ctx;
3126    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3127 
3128    if (reduced_prim == MESA_PRIM_LINES &&
3129        !u_tristate_set(&batch->line_smoothing, rast->line_smooth))
3130       return false;
3131 
3132    /* Only applies on Valhall */
3133    if (PAN_ARCH < 9)
3134       return true;
3135 
3136    bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
3137    bool first = rast->flatshade_first;
3138 
3139    /* gl_PointCoord orientation only matters when drawing points, but
3140     * provoking vertex doesn't matter for points.
3141     */
3142    if (reduced_prim == MESA_PRIM_POINTS)
3143       return u_tristate_set(&batch->sprite_coord_origin, coord);
3144    else
3145       return u_tristate_set(&batch->first_provoking_vertex, first);
3146 }
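/* Sketch of the tri-state latching assumed above (hypothetical helper, not
 * the actual util implementation): the batch field starts unset, the first
 * draw latches its value, and only a later draw that disagrees fails the
 * check and forces a fresh batch in prepare_draw():
 *
 *    bool example_tristate_set(enum u_tristate *t, bool value)
 *    {
 *       enum u_tristate wanted = value ? U_TRISTATE_YES : U_TRISTATE_NO;
 *       if (*t == U_TRISTATE_UNSET) {
 *          *t = wanted;
 *          return true;
 *       }
 *       return *t == wanted;
 *    }
 */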
3147 
3148 static struct panfrost_batch *
3149 prepare_draw(struct pipe_context *pipe, const struct pipe_draw_info *info)
3150 {
3151    struct panfrost_context *ctx = pan_context(pipe);
3152    struct panfrost_device *dev = pan_device(pipe->screen);
3153 
3154    /* Do some common setup */
3155    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3156    if (!batch)
3157       return NULL;
3158 
3159    /* Don't add too many jobs to a single batch. Job manager hardware has a
3160     * hard limit of 65536 jobs per job chain. Given that a draw issues at most
3161     * 3 jobs (a vertex, a tiler and a compute job if XFB is enabled), we
3162     * could use 65536 / 3 as a limit, but we choose a smaller soft limit
3163     * (arbitrary) to avoid the risk of timeouts. This might not be a good
3164     * idea. */
3165    if (unlikely(batch->draw_count > 10000)) {
3166       batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3167       if (!batch)
3168          return NULL;
3169    }
3170 
3171    enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
3172 
3173    if (unlikely(!panfrost_compatible_batch_state(batch, reduced_prim))) {
3174       batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
3175       if (!batch)
3176          return NULL;
3177 
3178       ASSERTED bool succ = panfrost_compatible_batch_state(batch, reduced_prim);
3179       assert(succ && "must be able to set state for a fresh batch");
3180    }
3181 
3182    /* panfrost_batch_skip_rasterization reads
3183     * batch->scissor_culls_everything, which is set by
3184     * panfrost_emit_viewport, so call that first.
3185     */
3186    if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
3187       batch->viewport = panfrost_emit_viewport(batch);
3188 
3189    /* Mark everything dirty when debugging */
3190    if (unlikely(dev->debug & PAN_DBG_DIRTY))
3191       panfrost_dirty_state_all(ctx);
3192 
3193    /* Conservatively assume draw parameters always change */
3194    ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3195 
3196    return batch;
3197 }
3198 
3199 static void
3200 panfrost_draw_indirect(struct pipe_context *pipe,
3201                        const struct pipe_draw_info *info,
3202                        unsigned drawid_offset,
3203                        const struct pipe_draw_indirect_info *indirect)
3204 {
3205    struct panfrost_context *ctx = pan_context(pipe);
3206 
3207    if (!PAN_GPU_SUPPORTS_DRAW_INDIRECT || ctx->active_queries ||
3208        ctx->streamout.num_targets) {
3209       util_draw_indirect(pipe, info, drawid_offset, indirect);
3210       perf_debug(ctx, "Emulating indirect draw on the CPU");
3211       return;
3212    }
3213 
3214    struct panfrost_batch *batch = prepare_draw(pipe, info);
3215    if (!batch) {
3216       mesa_loge("prepare_draw failed");
3217       return;
3218    }
3219 
3220    struct pipe_draw_info tmp_info = *info;
3221 
3222    panfrost_batch_read_rsrc(batch, pan_resource(indirect->buffer),
3223                             PIPE_SHADER_VERTEX);
3224 
3225    panfrost_update_active_prim(ctx, &tmp_info);
3226 
3227    ctx->drawid = drawid_offset;
3228 
3229    batch->indices = 0;
3230    if (info->index_size) {
3231       struct panfrost_resource *index_buffer =
3232          pan_resource(info->index.resource);
3233       panfrost_batch_read_rsrc(batch, index_buffer, PIPE_SHADER_VERTEX);
3234       batch->indices = index_buffer->image.data.base;
3235    }
3236 
3237    panfrost_update_state_3d(batch);
3238    panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3239    panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3240    panfrost_clean_state_3d(ctx);
3241 
3242    /* Increment transform feedback offsets */
3243    panfrost_update_streamout_offsets(ctx);
3244 
3245    /* Any side effects must be handled by the XFB shader, so we only need
3246     * to run vertex shaders if we need rasterization.
3247     */
3248    if (panfrost_batch_skip_rasterization(batch))
3249       return;
3250 
3251    JOBX(launch_draw_indirect)(batch, &tmp_info, drawid_offset, indirect);
3252    batch->draw_count++;
3253 }
3254 
3255 static void
3256 panfrost_multi_draw_direct(struct pipe_context *pipe,
3257                            const struct pipe_draw_info *info,
3258                            unsigned drawid_offset,
3259                            const struct pipe_draw_start_count_bias *draws,
3260                            unsigned num_draws)
3261 {
3262    struct panfrost_context *ctx = pan_context(pipe);
3263    struct panfrost_batch *batch = prepare_draw(pipe, info);
3264    if (!batch) {
3265       mesa_loge("prepare_draw failed");
3266       return;
3267    }
3268 
3269    struct pipe_draw_info tmp_info = *info;
3270    unsigned drawid = drawid_offset;
3271 
3272    for (unsigned i = 0; i < num_draws; i++) {
3273       panfrost_single_draw_direct(batch, &tmp_info, drawid, &draws[i]);
3274 
3275       if (tmp_info.increment_draw_id) {
3276          ctx->dirty |= PAN_DIRTY_DRAWID;
3277          drawid++;
3278       }
3279    }
3280 }
3281 
3282 static void
3283 panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
3284                   unsigned drawid_offset,
3285                   const struct pipe_draw_indirect_info *indirect,
3286                   const struct pipe_draw_start_count_bias *draws,
3287                   unsigned num_draws)
3288 {
3289    struct panfrost_context *ctx = pan_context(pipe);
3290 
3291    if (!panfrost_render_condition_check(ctx))
3292       return;
3293 
3294    ctx->draw_calls++;
3295 
3296    if (indirect && indirect->buffer) {
3297       assert(num_draws == 1);
3298       panfrost_draw_indirect(pipe, info, drawid_offset, indirect);
3299    } else {
3300       panfrost_multi_draw_direct(pipe, info, drawid_offset, draws, num_draws);
3301    }
3302 }
3303 
3304 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3305  * construct the COMPUTE job and some of its payload.
3306  */
3307 
3308 static void
3309 panfrost_launch_grid_on_batch(struct pipe_context *pipe,
3310                               struct panfrost_batch *batch,
3311                               const struct pipe_grid_info *info)
3312 {
3313    struct panfrost_context *ctx = pan_context(pipe);
3314 
3315    util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
3316       if (!*res)
3317          continue;
3318 
3319       struct panfrost_resource *buffer = pan_resource(*res);
3320       panfrost_batch_write_rsrc(batch, buffer, PIPE_SHADER_COMPUTE);
3321    }
3322 
3323    if (info->indirect && !PAN_GPU_SUPPORTS_DISPATCH_INDIRECT) {
3324       struct pipe_transfer *transfer;
3325       uint32_t *params =
3326          pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
3327                                3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);
3328 
3329       struct pipe_grid_info direct = *info;
3330       direct.indirect = NULL;
3331       direct.grid[0] = params[0];
3332       direct.grid[1] = params[1];
3333       direct.grid[2] = params[2];
3334       pipe_buffer_unmap(pipe, transfer);
3335 
3336       if (params[0] && params[1] && params[2])
3337          panfrost_launch_grid_on_batch(pipe, batch, &direct);
3338 
3339       return;
3340    }
3341 
3342    ctx->compute_grid = info;
3343 
3344    /* Conservatively assume workgroup size changes every launch */
3345    ctx->dirty |= PAN_DIRTY_PARAMS;
3346 
3347    panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
3348 
3349    /* We want our compute thread descriptor to be per job.
3350     * Save the global one, and restore it when we're done emitting
3351     * the job.
3352     */
3353    uint64_t saved_tls = batch->tls.gpu;
3354    batch->tls.gpu = panfrost_emit_shared_memory(batch, info);
3355 
3356    /* if indirect, mark the indirect buffer as being read */
3357    if (info->indirect)
3358       panfrost_batch_read_rsrc(batch, pan_resource(info->indirect), PIPE_SHADER_COMPUTE);
3359 
3360    /* launch it */
3361    JOBX(launch_grid)(batch, info);
3362    batch->compute_count++;
3363    batch->tls.gpu = saved_tls;
3364 }
3365 
3366 static void
3367 panfrost_launch_grid(struct pipe_context *pipe,
3368                      const struct pipe_grid_info *info)
3369 {
3370    struct panfrost_context *ctx = pan_context(pipe);
3371 
3372    /* XXX - shouldn't be necessary with working memory barriers. Affected
3373     * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3374    panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
3375 
3376    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3377    panfrost_launch_grid_on_batch(pipe, batch, info);
3378 
3379    panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
3380 }
3381 
3382 #define AFBC_BLOCK_ALIGN 16
3383 
3384 static void
3385 panfrost_launch_convert_shader(struct panfrost_batch *batch, void *cso,
3386                             struct pipe_constant_buffer *cbuf,
3387                             unsigned nr_blocks)
3388 {
3389    struct pipe_context *pctx = &batch->ctx->base;
3390    void *saved_cso = NULL;
3391    struct pipe_constant_buffer saved_const = {};
3392    struct pipe_grid_info grid = {
3393       .block[0] = 1,
3394       .block[1] = 1,
3395       .block[2] = 1,
3396       .grid[0] = nr_blocks,
3397       .grid[1] = 1,
3398       .grid[2] = 1,
3399    };
3400 
3401    struct panfrost_constant_buffer *pbuf =
3402       &batch->ctx->constant_buffer[PIPE_SHADER_COMPUTE];
3403    saved_cso = batch->ctx->uncompiled[PIPE_SHADER_COMPUTE];
3404    util_copy_constant_buffer(&pbuf->cb[0], &saved_const, true);
3405 
3406    pctx->bind_compute_state(pctx, cso);
3407    pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, false, cbuf);
3408 
3409    panfrost_launch_grid_on_batch(pctx, batch, &grid);
3410 
3411    pctx->bind_compute_state(pctx, saved_cso);
3412    pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, true, &saved_const);
3413 }
3414 
3415 #define LAUNCH_CONVERT_SHADER(name, batch, rsrc, consts, nr_blocks)               \
3416    struct pan_mod_convert_shader_data *shaders =                                      \
3417       panfrost_get_mod_convert_shaders(batch->ctx, rsrc, AFBC_BLOCK_ALIGN);           \
3418    struct pipe_constant_buffer constant_buffer = {                             \
3419       .buffer_size = sizeof(consts),                                           \
3420       .user_buffer = &consts};                                                 \
3421    panfrost_launch_convert_shader(batch, shaders->name##_cso, &constant_buffer,   \
3422                                nr_blocks);
3423 
3424 static void
3425 panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
3426                    struct panfrost_bo *metadata, unsigned offset,
3427                    unsigned level)
3428 {
3429    struct pan_image_slice_layout *slice = &src->image.layout.slices[level];
3430    struct panfrost_afbc_size_info consts = {
3431       .src =
3432          src->image.data.base + src->image.data.offset + slice->offset,
3433       .metadata = metadata->ptr.gpu + offset,
3434    };
3435 
3436    panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3437    panfrost_batch_write_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3438 
3439    LAUNCH_CONVERT_SHADER(afbc_size, batch, src, consts, slice->afbc.nr_blocks);
3440 }
3441 
3442 static void
3443 panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
3444                    struct panfrost_bo *dst,
3445                    struct pan_image_slice_layout *dst_slice,
3446                    struct panfrost_bo *metadata, unsigned metadata_offset,
3447                    unsigned level)
3448 {
3449    struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
3450    struct panfrost_afbc_pack_info consts = {
3451       .src = src->image.data.base + src->image.data.offset +
3452              src_slice->offset,
3453       .dst = dst->ptr.gpu + dst_slice->offset,
3454       .metadata = metadata->ptr.gpu + metadata_offset,
3455       .header_size = dst_slice->afbc.header_size,
3456       .src_stride = src_slice->afbc.stride,
3457       .dst_stride = dst_slice->afbc.stride,
3458    };
3459 
3460    panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3461    panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
3462    panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3463 
3464    LAUNCH_CONVERT_SHADER(afbc_pack, batch, src, consts, dst_slice->afbc.nr_blocks);
3465 }
3466 
3467 static void
3468 panfrost_mtk_detile_compute(struct panfrost_context *ctx, struct pipe_blit_info *info)
3469 {
3470    struct pipe_context *pipe = &ctx->base;
3471    struct pipe_resource *y_src = info->src.resource;
3472    struct pipe_resource *uv_src = y_src->next;
3473    struct pipe_resource *y_dst = info->dst.resource;
3474    struct pipe_resource *uv_dst = y_dst->next;
3475 
3476    unsigned width = info->src.box.width;
3477    unsigned height = info->src.box.height;
3478    unsigned src_stride = pan_resource(y_src)->image.layout.slices[0].row_stride;
3479    unsigned dst_stride = pan_resource(y_dst)->image.layout.slices[0].row_stride;
3480 
3481    /* 4 images: y_src, uv_src, y_dst, uv_dst */
3482    struct pipe_image_view image[4] = { 0 };
3483 
3484    if (!uv_src) {
3485       /* single plane conversion; this must be R8 or R8G8 */
3486       assert(!uv_dst);
3487       if (y_src->format == PIPE_FORMAT_R8G8_UNORM) {
3488          /* R8G8 would be the single chroma plane of an image */
3489          /* adjust for dimensions of original luma plane */
3490          width *= 2;
3491          height *= 2;
3492          uv_src = y_src;
3493          uv_dst = y_dst;
3494          y_src = y_dst = NULL;
3495       }
3496    }
3497    image[0].resource = y_src;
3498    image[0].format = PIPE_FORMAT_R8G8B8A8_UINT;
3499    image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
3500    image[0].u.tex.level = info->src.level;
3501    image[0].u.tex.first_layer = 0;
3502    image[0].u.tex.last_layer = y_src ? (unsigned)(y_src->array_size - 1) : 0;
3503 
3504    image[1].resource = uv_src;
3505    image[1].format = PIPE_FORMAT_R8G8B8A8_UINT;
3506    image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_READ;
3507    image[1].u.tex.level = info->src.level;
3508    image[1].u.tex.first_layer = 0;
3509    image[1].u.tex.last_layer = uv_src ? (unsigned)(uv_src->array_size - 1) : 0;
3510 
3511    image[2].resource = y_dst;
3512    image[2].format = PIPE_FORMAT_R8G8B8A8_UINT;
3513    image[2].shader_access = image[2].access = PIPE_IMAGE_ACCESS_WRITE;
3514    image[2].u.tex.level = info->dst.level;
3515    image[2].u.tex.first_layer = 0;
3516    image[2].u.tex.last_layer = y_dst ? (unsigned)(y_dst->array_size - 1) : 0;
3517 
3518    image[3].resource = uv_dst;
3519    image[3].format = PIPE_FORMAT_R8G8B8A8_UINT;
3520    image[3].shader_access = image[3].access = PIPE_IMAGE_ACCESS_WRITE;
3521    image[3].u.tex.level = info->dst.level;
3522    image[3].u.tex.first_layer = 0;
3523    image[3].u.tex.last_layer = uv_dst ? (unsigned)(uv_dst->array_size - 1) : 0;
3524 
3525    struct panfrost_mtk_detile_info consts = {
3526       .tiles_per_stride = src_stride >> 4,
3527       .src_width = width,
3528       .src_height = height,
3529       .dst_stride = dst_stride,
3530    };
3531    panfrost_flush_all_batches(ctx, "mtk_detile pre-barrier");
3532 
3533    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3534    pipe->set_shader_images(pipe, PIPE_SHADER_COMPUTE, 0, 4, 0, image);
3535 
3536    /* launch the compute shader */
3537    struct pan_mod_convert_shader_data *shaders =
3538       panfrost_get_mod_convert_shaders(ctx, pan_resource(y_dst ? y_dst : uv_dst), AFBC_BLOCK_ALIGN);
3539    struct pipe_constant_buffer cbuf = {
3540       .buffer_size = sizeof(consts),
3541       .user_buffer = &consts};
3542 
3543    struct pipe_grid_info grid_info = {
3544       .block[0] = 4,
3545       .last_block[0] = (width/4) % 4,
3546       .block[1] = 16,
3547       .last_block[1] = height % 16,
3548       .block[2] = 1,
3549       .last_block[2] = 0,
3550       .grid[0] = DIV_ROUND_UP(width/4, 4),
3551       .grid[1] = DIV_ROUND_UP(height, 16),
3552       .grid[2] = 1,
3553    };
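   /* Worked example of the launch dimensions above (illustrative): with
    * width = 1280 and height = 720, width/4 = 320, so grid[0] =
    * DIV_ROUND_UP(320, 4) = 80 with last_block[0] = 320 % 4 = 0, and
    * grid[1] = DIV_ROUND_UP(720, 16) = 45 with last_block[1] = 720 % 16 = 0. */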
3554 
3555    struct pipe_constant_buffer saved_const = {};
3556    struct panfrost_constant_buffer *pbuf =
3557       &batch->ctx->constant_buffer[PIPE_SHADER_COMPUTE];
3558    void *saved_cso = batch->ctx->uncompiled[PIPE_SHADER_COMPUTE];
3559    void *cso = shaders->mtk_detile_cso;
3560    util_copy_constant_buffer(&pbuf->cb[0], &saved_const, true);
3561 
3562    pipe->bind_compute_state(pipe, cso);
3563    pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &cbuf);
3564 
3565    panfrost_launch_grid_on_batch(pipe, batch, &grid_info);
3566 
3567    pipe->bind_compute_state(pipe, saved_cso);
3568    pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, true, &saved_const);
3569 }
3570 
3571 static void *
3572 panfrost_create_rasterizer_state(struct pipe_context *pctx,
3573                                  const struct pipe_rasterizer_state *cso)
3574 {
3575    struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
3576 
3577    so->base = *cso;
3578 
3579 #if PAN_ARCH <= 7
3580    pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
3581       cfg.multisample_enable = cso->multisample;
3582       cfg.fixed_function_near_discard = cso->depth_clip_near;
3583       cfg.fixed_function_far_discard = cso->depth_clip_far;
3584       cfg.fixed_function_depth_range_fixed = !cso->depth_clamp;
3585       cfg.shader_depth_range_fixed = true;
3586    }
3587 
3588    pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
3589       cfg.front_facing_depth_bias = cso->offset_tri;
3590       cfg.back_facing_depth_bias = cso->offset_tri;
3591       cfg.single_sampled_lines = !cso->multisample;
3592    }
3593 #endif
3594 
3595    return so;
3596 }
3597 
3598 #if PAN_ARCH >= 9
3599 /*
3600  * Given a pipe_vertex_element, pack the corresponding Valhall attribute
3601  * descriptor. This function is called at CSO create time.
3602  */
3603 static void
3604 panfrost_pack_attribute(struct panfrost_device *dev,
3605                         const struct pipe_vertex_element el,
3606                         struct mali_attribute_packed *out)
3607 {
3608    pan_pack(out, ATTRIBUTE, cfg) {
3609       cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
3610       cfg.frequency = (el.instance_divisor > 0)
3611                          ? MALI_ATTRIBUTE_FREQUENCY_INSTANCE
3612                          : MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3613       cfg.format = GENX(panfrost_format_from_pipe_format)(el.src_format)->hw;
3614       cfg.offset = el.src_offset;
3615       cfg.buffer_index = el.vertex_buffer_index;
3616       cfg.stride = el.src_stride;
3617 
3618       if (el.instance_divisor == 0) {
3619          /* Per-vertex */
3620          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
3621          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3622          cfg.offset_enable = true;
3623       } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
3624          /* Per-instance, POT divisor */
3625          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
3626          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3627          cfg.divisor_r = __builtin_ctz(el.instance_divisor);
3628       } else {
3629          /* Per-instance, NPOT divisor */
3630          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
3631          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3632 
3633          cfg.divisor_d = panfrost_compute_magic_divisor(
3634             el.instance_divisor, &cfg.divisor_r, &cfg.divisor_e);
3635       }
3636    }
3637 }
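/* Worked example of the divisor encoding above (illustrative): a per-instance
 * attribute with instance_divisor = 8 (a power of two) takes the POT path with
 * divisor_r = __builtin_ctz(8) = 3, while instance_divisor = 6 takes the NPOT
 * path and lets panfrost_compute_magic_divisor() derive the magic-number
 * (divisor_d / divisor_r / divisor_e) encoding of "divide by 6".
 */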
3638 #endif
3639 
3640 static void *
3641 panfrost_create_vertex_elements_state(struct pipe_context *pctx,
3642                                       unsigned num_elements,
3643                                       const struct pipe_vertex_element *elements)
3644 {
3645    struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
3646    UNUSED struct panfrost_device *dev = pan_device(pctx->screen);
3647 
3648    so->num_elements = num_elements;
3649    memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
3650 
3651    for (unsigned i = 0; i < num_elements; ++i)
3652       so->strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
3653 #if PAN_ARCH >= 9
3654    for (unsigned i = 0; i < num_elements; ++i)
3655       panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
3656 #else
3657    /* Assign attribute buffers corresponding to the vertex buffers, keyed
3658     * for a particular divisor since that's how instancing works on Mali */
3659    for (unsigned i = 0; i < num_elements; ++i) {
3660       so->element_buffer[i] = pan_assign_vertex_buffer(
3661          so->buffers, &so->nr_bufs, elements[i].vertex_buffer_index,
3662          elements[i].instance_divisor);
3663       if (elements[i].instance_divisor)
3664          so->attr_depends_on_base_instance_mask |= BITFIELD_BIT(i);
3665    }
3666 
3667    for (int i = 0; i < num_elements; ++i) {
3668       enum pipe_format fmt = elements[i].src_format;
3669       so->formats[i] = GENX(panfrost_format_from_pipe_format)(fmt)->hw;
3670 
3671       assert(MALI_EXTRACT_INDEX(so->formats[i]) && "format must be supported");
3672    }
3673 
3674    /* Let's also prepare vertex builtins */
3675    so->formats[PAN_VERTEX_ID] =
3676       GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3677    so->formats[PAN_INSTANCE_ID] =
3678       GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3679 #endif
3680 
3681    return so;
3682 }
3683 
3684 static inline unsigned
3685 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
3686 {
3687    switch (in) {
3688    case PIPE_STENCIL_OP_KEEP:
3689       return MALI_STENCIL_OP_KEEP;
3690    case PIPE_STENCIL_OP_ZERO:
3691       return MALI_STENCIL_OP_ZERO;
3692    case PIPE_STENCIL_OP_REPLACE:
3693       return MALI_STENCIL_OP_REPLACE;
3694    case PIPE_STENCIL_OP_INCR:
3695       return MALI_STENCIL_OP_INCR_SAT;
3696    case PIPE_STENCIL_OP_DECR:
3697       return MALI_STENCIL_OP_DECR_SAT;
3698    case PIPE_STENCIL_OP_INCR_WRAP:
3699       return MALI_STENCIL_OP_INCR_WRAP;
3700    case PIPE_STENCIL_OP_DECR_WRAP:
3701       return MALI_STENCIL_OP_DECR_WRAP;
3702    case PIPE_STENCIL_OP_INVERT:
3703       return MALI_STENCIL_OP_INVERT;
3704    default:
3705       unreachable("Invalid stencil op");
3706    }
3707 }
3708 
3709 #if PAN_ARCH <= 7
3710 static inline void
3711 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
3712                     struct mali_stencil_packed *out)
3713 {
3714    pan_pack(out, STENCIL, s) {
3715       s.mask = in->valuemask;
3716       s.compare_function = (enum mali_func)in->func;
3717       s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
3718       s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
3719       s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
3720    }
3721 }
3722 #endif
3723 
3724 static bool
3725 pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
3726 {
3727    if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
3728       return false;
3729 
3730    if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
3731       return false;
3732 
3733    if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
3734       return false;
3735 
3736    return true;
3737 }
3738 
3739 static void *
3740 panfrost_create_depth_stencil_state(
3741    struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *zsa)
3742 {
3743    struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
3744    so->base = *zsa;
3745 
3746    const struct pipe_stencil_state front = zsa->stencil[0];
3747    const struct pipe_stencil_state back =
3748       zsa->stencil[1].enabled ? zsa->stencil[1] : front;
3749 
3750    enum mali_func depth_func =
3751       zsa->depth_enabled ? (enum mali_func)zsa->depth_func : MALI_FUNC_ALWAYS;
3752 
3753    /* Normalize (there's no separate enable) */
3754    if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
3755       so->base.alpha_func = MALI_FUNC_ALWAYS;
3756 
3757 #if PAN_ARCH <= 7
3758    /* Prepack relevant parts of the Renderer State Descriptor. They will
3759     * be ORed in at draw-time */
3760    pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
3761       cfg.depth_function = depth_func;
3762       cfg.depth_write_mask = zsa->depth_writemask;
3763    }
3764 
3765    pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
3766       cfg.stencil_enable = front.enabled;
3767       cfg.stencil_mask_front = front.writemask;
3768       cfg.stencil_mask_back = back.writemask;
3769 
3770 #if PAN_ARCH <= 5
3771       cfg.alpha_test_compare_function = (enum mali_func)so->base.alpha_func;
3772 #endif
3773    }
3774 
3775    /* Stencil tests have their own words in the RSD */
3776    pan_pipe_to_stencil(&front, &so->stencil_front);
3777    pan_pipe_to_stencil(&back, &so->stencil_back);
3778 #else
3779    /* Pack with nodefaults so only explicitly set fields affect pan_merge() when
3780     * emitting depth stencil descriptor */
3781    pan_cast_and_pack_nodefaults(&so->desc, DEPTH_STENCIL, cfg) {
3782       cfg.front_compare_function = (enum mali_func)front.func;
3783       cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
3784       cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
3785       cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);
3786 
3787       cfg.back_compare_function = (enum mali_func)back.func;
3788       cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
3789       cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
3790       cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);
3791 
3792       cfg.stencil_test_enable = front.enabled;
3793       cfg.front_write_mask = front.writemask;
3794       cfg.back_write_mask = back.writemask;
3795       cfg.front_value_mask = front.valuemask;
3796       cfg.back_value_mask = back.valuemask;
3797 
3798       cfg.depth_write_enable = zsa->depth_writemask;
3799       cfg.depth_function = depth_func;
3800    }
3801 #endif
3802 
3803    so->enabled = zsa->stencil[0].enabled ||
3804                  (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
3805 
3806    so->zs_always_passes = pipe_zs_always_passes(zsa);
3807    so->writes_zs = util_writes_depth_stencil(zsa);
3808 
3809    /* TODO: Bounds test should be easy */
3810    assert(!zsa->depth_bounds_test);
3811 
3812    return so;
3813 }
3814 
3815 static struct pipe_sampler_view *
3816 panfrost_create_sampler_view(struct pipe_context *pctx,
3817                              struct pipe_resource *texture,
3818                              const struct pipe_sampler_view *template)
3819 {
3820    struct panfrost_context *ctx = pan_context(pctx);
3821    struct panfrost_sampler_view *so =
3822       rzalloc(pctx, struct panfrost_sampler_view);
3823    struct panfrost_resource *ptexture = pan_resource(texture);
3824 
3825    pan_legalize_format(ctx, ptexture, template->format, false,
3826                        false);
3827    pipe_reference(NULL, &texture->reference);
3828 
3829    so->base = *template;
3830    so->base.texture = texture;
3831    so->base.reference.count = 1;
3832    so->base.context = pctx;
3833 
3834    panfrost_create_sampler_view_bo(so, pctx, texture);
3835 
3836    return (struct pipe_sampler_view *)so;
3837 }
3838 
3839 /* A given Gallium blend state can be encoded to the hardware in numerous,
3840  * dramatically divergent ways due to the interactions of blending with
3841  * framebuffer formats. Conceptually, there are two modes:
3842  *
3843  * - Fixed-function blending (for suitable framebuffer formats, suitable blend
3844  *   state, and suitable blend constant)
3845  *
3846  * - Blend shaders (for everything else)
3847  *
3848  * A given Gallium blend configuration will compile to exactly one
3849  * fixed-function blend state, if it compiles to any, although the constant
3850  * will vary across runs as that is tracked outside of the Gallium CSO.
3851  *
3852  * However, that same blend configuration will compile to many different blend
3853  * shaders, depending on the framebuffer formats active. The rationale is that
3854  * blend shaders override not just fixed-function blending but also
3855  * fixed-function format conversion, so blend shaders are keyed to a particular
3856  * framebuffer format. As an example, the tilebuffer format is identical for
3857  * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
3858  * blend shaders.
3859  *
3860  * All of this state is encapsulated in the panfrost_blend_state struct
3861  * (our subclass of pipe_blend_state).
3862  */
3863 
3864 /* Create a blend CSO. Essentially, try to compile a fixed-function
3865  * expression and initialize blend shaders */
3866 
3867 static void *
3868 panfrost_create_blend_state(struct pipe_context *pipe,
3869                             const struct pipe_blend_state *blend)
3870 {
3871    struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
3872    so->base = *blend;
3873 
3874    so->pan.logicop_enable = blend->logicop_enable;
3875    so->pan.logicop_func = blend->logicop_func;
3876    so->pan.rt_count = blend->max_rt + 1;
3877    so->pan.alpha_to_one = blend->alpha_to_one;
3878 
3879    for (unsigned c = 0; c < so->pan.rt_count; ++c) {
3880       unsigned g = blend->independent_blend_enable ? c : 0;
3881       const struct pipe_rt_blend_state pipe = blend->rt[g];
3882       struct pan_blend_equation equation = {0};
3883 
3884       equation.color_mask = pipe.colormask;
3885       equation.blend_enable = pipe.blend_enable;
3886 
3887       if (pipe.blend_enable) {
3888          equation.rgb_func = pipe.rgb_func;
3889          equation.rgb_src_factor = pipe.rgb_src_factor;
3890          equation.rgb_dst_factor = pipe.rgb_dst_factor;
3891          equation.alpha_func = pipe.alpha_func;
3892          equation.alpha_src_factor = pipe.alpha_src_factor;
3893          equation.alpha_dst_factor = pipe.alpha_dst_factor;
3894       }
3895 
3896       /* Determine some common properties */
3897       unsigned constant_mask = pan_blend_constant_mask(equation);
3898       const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
3899       so->info[c] = (struct pan_blend_info){
3900          .enabled = (equation.color_mask != 0) &&
3901                     !(blend->logicop_enable &&
3902                       blend->logicop_func == PIPE_LOGICOP_NOOP),
3903          .opaque = !blend->logicop_enable && pan_blend_is_opaque(equation),
3904          .constant_mask = constant_mask,
3905 
3906          /* TODO: check the dest for the logicop */
3907          .load_dest = blend->logicop_enable || pan_blend_reads_dest(equation),
3908 
3909          /* Could this possibly be fixed-function? */
3910          .fixed_function =
3911             !blend->logicop_enable &&
3912             pan_blend_can_fixed_function(equation, supports_2src) &&
3913             (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)),
3914 
3915          .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
3916          .alpha_one_store = pan_blend_alpha_one_store(equation),
3917       };
3918 
3919       so->pan.rts[c].equation = equation;
3920 
3921       /* Bifrost needs to know if any render target loads its
3922        * destination in the hot draw path, so precompute this */
3923       if (so->info[c].load_dest)
3924          so->load_dest_mask |= BITFIELD_BIT(c);
3925 
3926       /* Likewise, precompute which render targets are enabled at all, so
3927        * the hot draw path doesn't have to re-derive it */
3928       if (so->info[c].enabled)
3929          so->enabled_mask |= BITFIELD_BIT(c);
3930 
3931       /* Converting equations to Mali style is expensive, do it at
3932        * CSO create time instead of draw-time */
3933       if (so->info[c].fixed_function) {
3934          so->equation[c] = pan_pack_blend(equation);
3935       }
3936    }
3937 
3938    return so;
3939 }
3940 
3941 #if PAN_ARCH >= 9
3942 static enum mali_flush_to_zero_mode
3943 panfrost_ftz_mode(struct pan_shader_info *info)
3944 {
3945    if (info->ftz_fp32) {
3946       if (info->ftz_fp16)
3947          return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
3948       else
3949          return MALI_FLUSH_TO_ZERO_MODE_DX11;
3950    } else {
3951       /* We don't have a "flush FP16, preserve FP32" mode, but APIs
3952        * should not be able to generate that.
3953        */
3954       assert(!info->ftz_fp16 && !info->ftz_fp32);
3955       return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
3956    }
3957 }
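/* Summary of the mapping above:
 *
 *    ftz_fp32  ftz_fp16  flush-to-zero mode
 *    --------  --------  --------------------------------
 *       0         0      PRESERVE_SUBNORMALS
 *       1         0      DX11 (flush FP32, preserve FP16)
 *       1         1      ALWAYS
 *
 * (ftz_fp16 without ftz_fp32 is asserted unreachable above.)
 */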
3958 #endif
3959 
3960 static void
3961 prepare_shader(struct panfrost_compiled_shader *state,
3962                struct panfrost_pool *pool, bool upload)
3963 {
3964 #if PAN_ARCH <= 7
3965    struct mali_renderer_state_packed *out =
3966       (struct mali_renderer_state_packed *)&state->partial_rsd;
3967 
3968    if (upload) {
3969       struct panfrost_ptr ptr =
3970          pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
3971 
3972       state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3973       out = ptr.cpu;
3974    }
3975 
3976    pan_pack(out, RENDERER_STATE, cfg) {
3977       pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
3978    }
3979 #else
3980    assert(upload);
3981 
3982    /* The address in the shader program descriptor must be non-null, but
3983     * the entire shader program descriptor may be omitted.
3984     *
3985     * See dEQP-GLES31.functional.compute.basic.empty
3986     */
3987    if (!state->bin.gpu)
3988       return;
3989 
3990    bool vs = (state->info.stage == MESA_SHADER_VERTEX);
3991    bool secondary_enable = (vs && state->info.vs.secondary_enable);
3992 
3993    unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
3994    struct panfrost_ptr ptr =
3995       pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM);
3996 
3997    state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3998 
3999    struct mali_shader_program_packed *programs = ptr.cpu;
4000 
4001    /* Generic, or IDVS/points */
4002    pan_cast_and_pack(&programs[0], SHADER_PROGRAM, cfg) {
4003       cfg.stage = pan_shader_stage(&state->info);
4004 
4005       if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
4006          cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
4007       else if (vs)
4008          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
4009 
4010       cfg.register_allocation =
4011          pan_register_allocation(state->info.work_reg_count);
4012       cfg.binary = state->bin.gpu;
4013       cfg.preload.r48_r63 = (state->info.preload >> 48);
4014       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
4015 
4016       if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
4017          cfg.requires_helper_threads = state->info.contains_barrier;
4018    }
4019 
4020    if (!vs)
4021       return;
4022 
4023    /* IDVS/triangles */
4024    pan_pack(&programs[1], SHADER_PROGRAM, cfg) {
4025       cfg.stage = pan_shader_stage(&state->info);
4026       cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
4027       cfg.register_allocation =
4028          pan_register_allocation(state->info.work_reg_count);
4029       cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
4030       cfg.preload.r48_r63 = (state->info.preload >> 48);
4031       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
4032    }
4033 
4034    if (!secondary_enable)
4035       return;
4036 
4037    pan_pack(&programs[2], SHADER_PROGRAM, cfg) {
4038       unsigned work_count = state->info.vs.secondary_work_reg_count;
4039 
4040       cfg.stage = pan_shader_stage(&state->info);
4041       cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
4042       cfg.register_allocation = pan_register_allocation(work_count);
4043       cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
4044       cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
4045       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
4046    }
4047 #endif
4048 }
4049 
4050 static void
4051 screen_destroy(struct pipe_screen *pscreen)
4052 {
4053    struct panfrost_device *dev = pan_device(pscreen);
4054    GENX(pan_fb_preload_cache_cleanup)(&dev->fb_preload_cache);
4055 }
4056 
4057 static void
4058 panfrost_sampler_view_destroy(struct pipe_context *pctx,
4059                               struct pipe_sampler_view *pview)
4060 {
4061    struct panfrost_sampler_view *view = (struct panfrost_sampler_view *)pview;
4062 
4063    pipe_resource_reference(&pview->texture, NULL);
4064    panfrost_bo_unreference(view->state.bo);
4065    ralloc_free(view);
4066 }
4067 
4068 static void
4069 context_populate_vtbl(struct pipe_context *pipe)
4070 {
4071    pipe->draw_vbo = panfrost_draw_vbo;
4072    pipe->launch_grid = panfrost_launch_grid;
4073 
4074    pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
4075    pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
4076    pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
4077    pipe->create_sampler_view = panfrost_create_sampler_view;
4078    pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
4079    pipe->create_sampler_state = panfrost_create_sampler_state;
4080    pipe->create_blend_state = panfrost_create_blend_state;
4081 
4082    pipe->get_sample_position = u_default_get_sample_position;
4083 }
4084 
4085 static void
4086 context_init(struct panfrost_context *ctx)
4087 {
4088 }
4089 
4090 static void
4091 context_cleanup(struct panfrost_context *ctx)
4092 {
4093 }
4094 
4095 #if PAN_ARCH <= 5
4096 
4097 /* Returns the polygon list's GPU address if available, or otherwise allocates
4098  * the polygon list.  It's perfectly fast to use allocate/free BO directly,
4099  * since we'll hit the BO cache and this is one-per-batch anyway. */
4100 
4101 static uint64_t
4102 batch_get_polygon_list(struct panfrost_batch *batch)
4103 {
4104    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
4105 
4106    if (!batch->tiler_ctx.midgard.polygon_list) {
4107       bool has_draws = batch->draw_count > 0;
4108       unsigned size = panfrost_tiler_get_polygon_list_size(
4109          batch->key.width, batch->key.height, batch->vertex_count,
4110          !dev->model->quirks.no_hierarchical_tiling);
4111 
4112       /* Create the BO as invisible if we can. If there are no draws,
4113        * we need to write the polygon list manually because there's
4114        * no WRITE_VALUE job in the chain
4115        */
4116       bool init_polygon_list = !has_draws;
4117       batch->polygon_list_bo = panfrost_batch_create_bo(
4118          batch, size, init_polygon_list ? 0 : PAN_BO_INVISIBLE,
4119          PIPE_SHADER_VERTEX, "Polygon list");
4120 
4121       if (!batch->polygon_list_bo) {
4122          mesa_loge("failed to allocate memory for polygon-list");
4123          return 0;
4124       }
4125 
4126       batch->tiler_ctx.midgard.polygon_list = batch->polygon_list_bo->ptr.gpu;
4127       panfrost_batch_add_bo(batch, batch->polygon_list_bo,
4128                             PIPE_SHADER_FRAGMENT);
4129 
4130       if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) {
4131          assert(batch->polygon_list_bo->ptr.cpu);
4132          uint32_t *polygon_list_body =
4133             batch->polygon_list_bo->ptr.cpu +
4134             MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;
4135 
4136          /* Magic for Mali T720 */
4137          polygon_list_body[0] = 0xa0000000;
4138       } else if (init_polygon_list) {
4139          assert(batch->polygon_list_bo->ptr.cpu);
4140          uint32_t *header = batch->polygon_list_bo->ptr.cpu;
4141          memset(header, 0, size);
4142       }
4143 
4144       batch->tiler_ctx.midgard.disable = !has_draws;
4145       batch->tiler_ctx.midgard.no_hierarchical_tiling =
4146          dev->model->quirks.no_hierarchical_tiling;
4147       batch->tiler_ctx.midgard.heap.start = dev->tiler_heap->ptr.gpu;
4148       batch->tiler_ctx.midgard.heap.size = panfrost_bo_size(dev->tiler_heap);
4149    }
4150 
4151    return batch->tiler_ctx.midgard.polygon_list;
4152 }
4153 #endif
4154 
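/* On Midgard (arch <= 5) the batch's polygon list has to be hooked up with a
 * tiler initialization job; on newer architectures this is a no-op. */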
static void
init_polygon_list(struct panfrost_batch *batch)
{
#if PAN_ARCH <= 5
   uint64_t polygon_list = batch_get_polygon_list(batch);
   if (polygon_list)
      pan_jc_initialize_tiler(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                              polygon_list);
#endif
}

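/* Finalize and submit a batch: prepare the tiler and framebuffer preload, set
 * up the polygon list (Midgard), emit TLS, then emit the framebuffer
 * descriptor and fragment job if there is fragment work, before handing the
 * batch to the job backend. */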
static int
submit_batch(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   JOBX(prepare_tiler)(batch, fb);
   JOBX(preload_fb)(batch, fb);
   init_polygon_list(batch);

   /* Now that all draws are in, we can finally prepare the
    * FBD for the batch (if there is one). */

   emit_tls(batch);

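   /* Batches with no fragment work (e.g. compute-only) skip the FBD and
    * fragment job entirely. */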
   if (panfrost_has_fragment_job(batch)) {
      emit_fbd(batch, fb);
      emit_fragment_job(batch, fb);
   }

   return JOBX(submit_batch)(batch);
}

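/* Emit a GPU timestamp write to the given resource offset, flagging the batch
 * as needing cycle-count support for its time query. */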
static void
emit_write_timestamp(struct panfrost_batch *batch,
                     struct panfrost_resource *dst, unsigned offset)
{
   batch->need_job_req_cycle_count = true;
   batch->has_time_query = true;

   JOBX(emit_write_timestamp)(batch, dst, offset);
}

void
GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
{
   struct panfrost_device *dev = &screen->dev;

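   /* Hook up the per-architecture screen vtable entry points. */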
   screen->vtbl.prepare_shader = prepare_shader;
   screen->vtbl.screen_destroy = screen_destroy;
   screen->vtbl.context_populate_vtbl = context_populate_vtbl;
   screen->vtbl.context_init = JOBX(init_context);
   screen->vtbl.context_cleanup = JOBX(cleanup_context);
   screen->vtbl.init_batch = JOBX(init_batch);
   screen->vtbl.cleanup_batch = JOBX(cleanup_batch);
   screen->vtbl.submit_batch = submit_batch;
   screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
   screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
   screen->vtbl.compile_shader = GENX(pan_shader_compile);
   screen->vtbl.afbc_size = panfrost_afbc_size;
   screen->vtbl.afbc_pack = panfrost_afbc_pack;
   screen->vtbl.mtk_detile = panfrost_mtk_detile_compute;
   screen->vtbl.emit_write_timestamp = emit_write_timestamp;
   screen->vtbl.select_tile_size = GENX(pan_select_tile_size);

   GENX(pan_fb_preload_cache_init)
   (&dev->fb_preload_cache, panfrost_device_gpu_id(dev), &dev->blend_shaders,
    &screen->mempools.bin.base, &screen->mempools.desc.base);

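   /* Where supported, set up the indirect-dispatch helper from the screen's
    * shared binary/descriptor pools. */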
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   pan_indirect_dispatch_meta_init(
      &dev->indirect_dispatch, panfrost_device_gpu_id(dev),
      &screen->mempools.bin.base, &screen->mempools.desc.base);
#endif
}