/*
 * Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "gallium/auxiliary/util/u_blend.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/macros.h"
#include "util/u_draw.h"
#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_sample_positions.h"
#include "util/u_vbuf.h"
#include "util/u_viewport.h"

#include "decode.h"

#include "genxml/gen_macros.h"

#include "pan_afbc_cso.h"
#include "pan_blend.h"
#include "pan_blitter.h"
#include "pan_bo.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"
#include "pan_pool.h"
#include "pan_resource.h"
#include "pan_samples.h"
#include "pan_shader.h"
#include "pan_texture.h"
#include "pan_util.h"

/* JOBX() is used to select the job backend helpers to call from generic
 * functions. */
#if PAN_ARCH <= 9
#define JOBX(__suffix) GENX(jm_##__suffix)
#else
#error "Unsupported arch"
#endif
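/* Illustration: a generic call written as JOBX(foo)(batch, ...) resolves to
 * GENX(jm_foo)(batch, ...) here, i.e. the Job Manager backend; "foo" is just a
 * placeholder suffix for whichever helper is being dispatched. */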

struct panfrost_sampler_state {
   struct pipe_sampler_state base;
   struct mali_sampler_packed hw;
};

/* Misnomer: Sampler view corresponds to textures, not samplers */

struct panfrost_sampler_view {
   struct pipe_sampler_view base;
   struct panfrost_pool_ref state;
   struct mali_texture_packed bifrost_descriptor;
   mali_ptr texture_bo;
   uint64_t modifier;

   /* Pool used to allocate the descriptor. If NULL, defaults to the global
    * descriptor pool. Can be set for short lived descriptors, useful for
    * shader images on Valhall.
    */
   struct panfrost_pool *pool;
};

/* Statically assert that PIPE_* enums match the hardware enums.
 * (As long as they match, we don't need to translate them.)
 */
static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");

static inline enum mali_sample_pattern
panfrost_sample_pattern(unsigned samples)
{
   switch (samples) {
   case 1:
      return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
   case 4:
      return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
   case 8:
      return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
   case 16:
      return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
   default:
      unreachable("Unsupported sample count");
   }
}

static unsigned
translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
{
   /* CLAMP is only supported on Midgard, where it is broken for nearest
    * filtering. Use CLAMP_TO_EDGE in that case.
    */

   switch (w) {
   case PIPE_TEX_WRAP_REPEAT:
      return MALI_WRAP_MODE_REPEAT;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      return MALI_WRAP_MODE_CLAMP_TO_EDGE;
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      return MALI_WRAP_MODE_CLAMP_TO_BORDER;
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      return MALI_WRAP_MODE_MIRRORED_REPEAT;
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;

#if PAN_ARCH <= 5
   case PIPE_TEX_WRAP_CLAMP:
      return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE
                           : MALI_WRAP_MODE_CLAMP;
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE
                           : MALI_WRAP_MODE_MIRRORED_CLAMP;
#endif

   default:
      unreachable("Invalid wrap");
   }
}

/* The hardware compares in the wrong order, so we have to flip before
 * encoding. Yes, really. */
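/* (For example, an API LESS comparison ends up encoded as MALI_FUNC_GREATER,
 * since flipping the operand order of a < b gives b > a.) */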

static enum mali_func
panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
{
   return !cso->compare_mode
             ? MALI_FUNC_NEVER
             : panfrost_flip_compare_func((enum mali_func)cso->compare_func);
}

static enum mali_mipmap_mode
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
{
   switch (f) {
   case PIPE_TEX_MIPFILTER_NEAREST:
      return MALI_MIPMAP_MODE_NEAREST;
   case PIPE_TEX_MIPFILTER_LINEAR:
      return MALI_MIPMAP_MODE_TRILINEAR;
#if PAN_ARCH >= 6
   case PIPE_TEX_MIPFILTER_NONE:
      return MALI_MIPMAP_MODE_NONE;
#else
   case PIPE_TEX_MIPFILTER_NONE:
      return MALI_MIPMAP_MODE_NEAREST;
#endif
   default:
      unreachable("Invalid");
   }
}

static void *
panfrost_create_sampler_state(struct pipe_context *pctx,
                              const struct pipe_sampler_state *cso)
{
   struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
   so->base = *cso;

#if PAN_ARCH == 7
   /* On v7, pan_texture.c composes the API swizzle with a bijective
    * swizzle derived from the format, to allow more formats than the
    * hardware otherwise supports. When packing border colours, we need to
    * undo this bijection, by swizzling with its inverse.
    */
   unsigned mali_format =
      GENX(panfrost_format_from_pipe_format)(cso->border_color_format)->hw;
   enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12);

   unsigned char inverted_swizzle[4];
   panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post,
                           inverted_swizzle);

   util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color,
                                   inverted_swizzle,
                                   false /* is_integer (irrelevant) */);
#endif

   bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;

   pan_pack(&so->hw, SAMPLER, cfg) {
      cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
      cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;

      cfg.normalized_coordinates = !cso->unnormalized_coords;
      cfg.lod_bias = cso->lod_bias;
      cfg.minimum_lod = cso->min_lod;
      cfg.maximum_lod = cso->max_lod;

      cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
      cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
      cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);

      cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
      cfg.compare_function = panfrost_sampler_compare_func(cso);
      cfg.seamless_cube_map = cso->seamless_cube_map;

      cfg.border_color_r = so->base.border_color.ui[0];
      cfg.border_color_g = so->base.border_color.ui[1];
      cfg.border_color_b = so->base.border_color.ui[2];
      cfg.border_color_a = so->base.border_color.ui[3];

#if PAN_ARCH >= 6
      if (cso->max_anisotropy > 1) {
         cfg.maximum_anisotropy = cso->max_anisotropy;
         cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
      }
#else
      /* Emulate disabled mipmapping by clamping the LOD as tight as
       * possible (from 0 to epsilon = 1/256) */
      if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
         cfg.maximum_lod = cfg.minimum_lod + (1.0 / 256.0);
#endif
   }

   return so;
}

/* Get pointers to the blend shaders bound to each active render target. Used
 * to emit the blend descriptors, as well as the fragment renderer state
 * descriptor.
 */
static void
panfrost_get_blend_shaders(struct panfrost_batch *batch,
                           mali_ptr *blend_shaders)
{
   unsigned shader_offset = 0;
   struct panfrost_bo *shader_bo = NULL;

   for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
      if (batch->key.cbufs[c]) {
         blend_shaders[c] =
            panfrost_get_blend(batch, c, &shader_bo, &shader_offset);
      }
   }

   if (shader_bo)
      perf_debug_ctx(batch->ctx, "Blend shader use");
}

#if PAN_ARCH >= 5
UNUSED static uint16_t
pack_blend_constant(enum pipe_format format, float cons)
{
   const struct util_format_description *format_desc =
      util_format_description(format);

   unsigned chan_size = 0;

   for (unsigned i = 0; i < format_desc->nr_channels; i++)
      chan_size = MAX2(format_desc->channel[i].size, chan_size);

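   /* e.g. for an 8-bit UNORM channel, a constant of 0.5 packs as
    * (uint16_t)(0.5 * 255) = 127, placed in the top bits: 127 << 8 = 0x7f00 */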
   uint16_t unorm = (cons * ((1 << chan_size) - 1));
   return unorm << (16 - chan_size);
}

static void
panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
                    mali_ptr *blend_shaders)
{
   unsigned rt_count = batch->key.nr_cbufs;
   struct panfrost_context *ctx = batch->ctx;
   const struct panfrost_blend_state *so = ctx->blend;
   bool dithered = so->base.dither;

   /* Always have at least one render target for depth-only passes */
   for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
      struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));

      /* Disable blending for unbacked render targets */
      if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) {
         pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
            cfg.enable = false;
#if PAN_ARCH >= 6
            cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
         }

         continue;
      }

      struct pan_blend_info info = so->info[i];
      enum pipe_format format = batch->key.cbufs[i]->format;
      float cons =
         pan_blend_get_constant(info.constant_mask, ctx->blend_color.color);

      /* Word 0: Flags and constant */
      pan_pack(packed, BLEND, cfg) {
         cfg.srgb = util_format_is_srgb(format);
         cfg.load_destination = info.load_dest;
         cfg.round_to_fb_precision = !dithered;
         cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
#if PAN_ARCH >= 6
         if (!blend_shaders[i])
            cfg.constant = pack_blend_constant(format, cons);
#else
         cfg.blend_shader = (blend_shaders[i] != 0);

         if (blend_shaders[i])
            cfg.shader_pc = blend_shaders[i];
         else
            cfg.constant = cons;
#endif
      }

      if (!blend_shaders[i]) {
         /* Word 1: Blend Equation */
         STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
         packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
      }

#if PAN_ARCH >= 6
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      /* Words 2 and 3: Internal blend */
      if (blend_shaders[i]) {
         /* The blend shader's address needs to be at
          * the same top 32 bit as the fragment shader.
          * TODO: Ensure that's always the case.
          */
         assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) ==
                                  (fs->bin.gpu & (0xffffffffull << 32)));

         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
            cfg.mode = MALI_BLEND_MODE_SHADER;
            cfg.shader.pc = (u32)blend_shaders[i];

#if PAN_ARCH <= 7
            unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
            assert(!(ret_offset & 0x7));

            cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0;
#endif
         }
      } else {
         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
            cfg.mode = info.opaque ? MALI_BLEND_MODE_OPAQUE
                                   : MALI_BLEND_MODE_FIXED_FUNCTION;

            /* If we want the conversion to work properly,
             * num_comps must be set to 4
             */
            cfg.fixed_function.num_comps = 4;
            cfg.fixed_function.conversion.memory_format = GENX(
               panfrost_dithered_format_from_pipe_format)(format, dithered);
            cfg.fixed_function.rt = i;

#if PAN_ARCH <= 7
            if (!info.opaque) {
               cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
               cfg.fixed_function.alpha_one_store = info.alpha_one_store;
            }

            if (fs->info.fs.untyped_color_outputs) {
               cfg.fixed_function.conversion.register_format = GENX(
                  pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format);
            } else {
               cfg.fixed_function.conversion.register_format =
                  fs->info.bifrost.blend[i].format;
            }
#endif
         }
      }
#endif
   }
}
#endif

static mali_ptr
panfrost_emit_compute_shader_meta(struct panfrost_batch *batch,
                                  enum pipe_shader_type stage)
{
   struct panfrost_compiled_shader *ss = batch->ctx->prog[stage];

   panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
   panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);

   return ss->state.gpu;
}

#if PAN_ARCH <= 7
/* Construct a partial RSD corresponding to no executed fragment shader, and
 * merge with the existing partial RSD. */

static void
pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
{
   struct mali_renderer_state_packed empty_rsd;

   pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
#if PAN_ARCH >= 6
      cfg.properties.shader_modifies_coverage = true;
      cfg.properties.allow_forward_pixel_to_kill = true;
      cfg.properties.allow_forward_pixel_to_be_killed = true;
      cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;

      /* Alpha isn't written so these are vacuous */
      cfg.multisample_misc.overdraw_alpha0 = true;
      cfg.multisample_misc.overdraw_alpha1 = true;
#else
      cfg.shader.shader = 0x1;
      cfg.properties.work_register_count = 1;
      cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
      cfg.properties.force_early_z = true;
#endif
   }

   pan_merge((*rsd), empty_rsd, RENDERER_STATE);
}

static void
panfrost_prepare_fs_state(struct panfrost_context *ctx, mali_ptr *blend_shaders,
                          struct mali_renderer_state_packed *rsd)
{
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
   struct panfrost_blend_state *so = ctx->blend;
   bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
   bool msaa = rast->multisample;

   unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;

   bool has_blend_shader = false;

   for (unsigned c = 0; c < rt_count; ++c)
      has_blend_shader |= (blend_shaders[c] != 0);

   bool has_oq = ctx->occlusion_query && ctx->active_queries;

   pan_pack(rsd, RENDERER_STATE, cfg) {
      if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
#if PAN_ARCH >= 6
         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.properties.pixel_kill_operation = earlyzs.kill;
         cfg.properties.zs_update_operation = earlyzs.update;

         cfg.properties.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
#else
         cfg.properties.force_early_z =
            fs->info.fs.can_early_z && !alpha_to_coverage &&
            ((enum mali_func)zsa->base.alpha_func == MALI_FUNC_ALWAYS);

         /* TODO: Reduce this limit? */
         if (has_blend_shader)
            cfg.properties.work_register_count =
               MAX2(fs->info.work_reg_count, 8);
         else
            cfg.properties.work_register_count = fs->info.work_reg_count;

         /* Hardware quirks around early-zs forcing without a
          * depth buffer. Note this breaks occlusion queries. */
         bool force_ez_with_discard = !zsa->enabled && !has_oq;

         cfg.properties.shader_reads_tilebuffer =
            force_ez_with_discard && fs->info.fs.can_discard;
         cfg.properties.shader_contains_discard =
            !force_ez_with_discard && fs->info.fs.can_discard;
#endif
      }

#if PAN_ARCH == 4
      if (rt_count > 0) {
         cfg.multisample_misc.load_destination = so->info[0].load_dest;
         cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
         cfg.stencil_mask_misc.write_enable = so->info[0].enabled;
         cfg.stencil_mask_misc.srgb =
            util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
         cfg.stencil_mask_misc.dither_disable = !so->base.dither;
         cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;

         if (blend_shaders[0]) {
            cfg.blend_shader = blend_shaders[0];
         } else {
            cfg.blend_constant = pan_blend_get_constant(
               so->info[0].constant_mask, ctx->blend_color.color);
         }
      } else {
         /* If there is no colour buffer, leaving fields default is
          * fine, except for blending which is nonnullable */
         cfg.blend_equation.color_mask = 0xf;
         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
      }
#elif PAN_ARCH == 5
      /* Workaround */
      cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
#endif

      cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;

      cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1);

#if PAN_ARCH >= 6
      /* MSAA blend shaders need to pass their sample ID to
       * LD_TILE/ST_TILE, so we must preload it. Additionally, we
       * need per-sample shading for the blend shader, accomplished
       * by forcing per-sample shading for the whole program. */

      if (msaa && has_blend_shader) {
         cfg.multisample_misc.evaluate_per_sample = true;
         cfg.preload.fragment.sample_mask_id = true;
      }

      /* Bifrost does not have native point sprites. Point sprites are
       * lowered in the driver to gl_PointCoord reads. This field
       * actually controls the orientation of gl_PointCoord. Both
       * orientations are controlled with sprite_coord_mode in
       * Gallium.
       */
      cfg.properties.point_sprite_coord_origin_max_y =
         (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);

      cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
      cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
#endif

      cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
      cfg.depth_units = rast->offset_units * 2.0f;
      cfg.depth_factor = rast->offset_scale;
      cfg.depth_bias_clamp = rast->offset_clamp;

      bool back_enab = zsa->base.stencil[1].enabled;
      cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
      cfg.stencil_back.reference_value =
         ctx->stencil_ref.ref_value[back_enab ? 1 : 0];

#if PAN_ARCH <= 5
      /* v6+ fits register preload here, no alpha testing */
      cfg.alpha_reference = zsa->base.alpha_ref_value;
#endif
   }
}

static void
panfrost_emit_frag_shader(struct panfrost_context *ctx,
                          struct mali_renderer_state_packed *fragmeta,
                          mali_ptr *blend_shaders)
{
   const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
   const struct panfrost_rasterizer *rast = ctx->rasterizer;
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   /* We need to merge several partial renderer state descriptors,
    * so stage to temporary storage rather than reading back write-combine
    * memory, which will trash performance. */
   struct mali_renderer_state_packed rsd;
   panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);

#if PAN_ARCH == 4
   if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
      /* Word 14: SFBD Blend Equation */
      STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
      rsd.opaque[14] = ctx->blend->equation[0];
   }
#endif

   /* Merge with CSO state and upload */
   if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
      struct mali_renderer_state_packed *partial_rsd =
         (struct mali_renderer_state_packed *)&fs->partial_rsd;
      STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
      pan_merge(rsd, *partial_rsd, RENDERER_STATE);
   } else {
      pan_merge_empty_fs(&rsd);
   }

   /* Word 8, 9 Misc state */
   rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0];

   rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0];

   /* late patching of the merged RSD in case of line-smoothing */
   if (u_reduced_prim(ctx->active_prim) == MESA_PRIM_LINES &&
       rast->base.line_smooth) {
      rsd.opaque[8] |= (1u << 16); // multisample_enable = 1
      rsd.opaque[9] &= ~(1u << 30); // single_sampled_lines = 0
   }

   /* Word 10, 11 Stencil Front and Back */
   rsd.opaque[10] |= zsa->stencil_front.opaque[0];
   rsd.opaque[11] |= zsa->stencil_back.opaque[0];

   memcpy(fragmeta, &rsd, sizeof(rsd));
}

static mali_ptr
panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT];

   panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);

   struct panfrost_ptr xfer;

#if PAN_ARCH == 4
   xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
#else
   unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);

   xfer =
      pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE),
                                    PAN_DESC_ARRAY(rt_count, BLEND));
#endif

   mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
   panfrost_get_blend_shaders(batch, blend_shaders);

   panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *)xfer.cpu,
                             blend_shaders);

#if PAN_ARCH >= 5
   panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE),
                       blend_shaders);
#endif

   return xfer.gpu;
}
#endif

static mali_ptr
panfrost_emit_viewport(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
   const struct pipe_scissor_state *ss = &ctx->scissor;
   const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   /* Derive min/max from translate/scale. Note since |x| >= 0 by
    * definition, we have that -|x| <= |x| hence translate - |scale| <=
    * translate + |scale|, so the ordering is correct here. */
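   /* (e.g. a full-width viewport over an 800-pixel surface has translate[0] =
    * 400 and scale[0] = ±400, giving vp_minx = 0 and vp_maxx = 800) */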
   float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
   float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
   float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
   float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);

   float minz, maxz;
   util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz);

   /* Scissor to the intersection of the viewport and the scissor, clamped
    * to the framebuffer */

   unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0));
   unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0));
   unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0));
   unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0));

   if (ss && rast->scissor) {
      minx = MAX2(ss->minx, minx);
      miny = MAX2(ss->miny, miny);
      maxx = MIN2(ss->maxx, maxx);
      maxy = MIN2(ss->maxy, maxy);
   }

   /* Set the range to [1, 1) so max values don't wrap round */
   if (maxx == 0 || maxy == 0)
      maxx = maxy = minx = miny = 1;

   panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
   batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);

   /* Our [minx, maxx) and [miny, maxy) ranges are exclusive, but the hardware
    * scissor maxima are inclusive, so convert */
   maxx--;
   maxy--;

   batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY;
   batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY;

#if PAN_ARCH <= 7
   struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);

   pan_pack(T.cpu, VIEWPORT, cfg) {
      cfg.scissor_minimum_x = minx;
      cfg.scissor_minimum_y = miny;
      cfg.scissor_maximum_x = maxx;
      cfg.scissor_maximum_y = maxy;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;
   }

   return T.gpu;
#else
   pan_pack(&batch->scissor, SCISSOR, cfg) {
      cfg.scissor_minimum_x = minx;
      cfg.scissor_minimum_y = miny;
      cfg.scissor_maximum_x = maxx;
      cfg.scissor_maximum_y = maxy;
   }

   return 0;
#endif
}

#if PAN_ARCH >= 9
/**
 * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
 * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
 * CSO create time. However, the stencil reference values and shader
 * interactions are dynamic state. Pack only the dynamic state here and OR
 * together.
 */
static mali_ptr
panfrost_emit_depth_stencil(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
   struct panfrost_rasterizer *rast = ctx->rasterizer;
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
   bool back_enab = zsa->base.stencil[1].enabled;

   struct panfrost_ptr T =
      pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
   struct mali_depth_stencil_packed dynamic;

   pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
      cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
      cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];

      cfg.stencil_from_shader = fs->info.fs.writes_stencil;
      cfg.depth_source = pan_depth_source(&fs->info);

      cfg.depth_bias_enable = rast->base.offset_tri;
      cfg.depth_units = rast->base.offset_units * 2.0f;
      cfg.depth_factor = rast->base.offset_scale;
      cfg.depth_bias_clamp = rast->base.offset_clamp;
   }

   pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
   memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));

   return T.gpu;
}

/**
 * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
 * with Bifrost, but the container data structure is simplified.
 */
static mali_ptr
panfrost_emit_blend_valhall(struct panfrost_batch *batch)
{
   unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);

   struct panfrost_ptr T =
      pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);

   mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
   panfrost_get_blend_shaders(batch, blend_shaders);

   panfrost_emit_blend(batch, T.cpu, blend_shaders);

   /* Precalculate for the per-draw path */
   bool has_blend_shader = false;

   for (unsigned i = 0; i < rt_count; ++i)
      has_blend_shader |= !!blend_shaders[i];

   batch->ctx->valhall_has_blend_shader = has_blend_shader;

   return T.gpu;
}

/**
 * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
 */
static mali_ptr
panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned buffer_count = util_last_bit(ctx->vb_mask);
   struct panfrost_ptr T =
      pan_pool_alloc_desc_array(&batch->pool.base, buffer_count, BUFFER);
   struct mali_buffer_packed *buffers = T.cpu;

   u_foreach_bit(i, ctx->vb_mask) {
      struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
      struct pipe_resource *prsrc = vb.buffer.resource;
      struct panfrost_resource *rsrc = pan_resource(prsrc);
      assert(!vb.is_user_buffer);

      panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);

      pan_pack(buffers + i, BUFFER, cfg) {
         cfg.address = rsrc->image.data.base + vb.buffer_offset;

         cfg.size = prsrc->width0 - vb.buffer_offset;
      }
   }

   return T.gpu;
}

static mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_vertex_state *vtx = ctx->vertex;

   return pan_pool_upload_aligned(&batch->pool.base, vtx->attributes,
                                  vtx->num_elements * pan_size(ATTRIBUTE),
                                  pan_alignment(ATTRIBUTE));
}

static void panfrost_update_sampler_view(struct panfrost_sampler_view *view,
                                         struct pipe_context *pctx);

static mali_ptr
panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned last_bit = util_last_bit(ctx->image_mask[stage]);

   struct panfrost_ptr T =
      pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);

   struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;

   for (int i = 0; i < last_bit; ++i) {
      struct pipe_image_view *image = &ctx->images[stage][i];

      if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
         memset(&out[i], 0, sizeof(out[i]));
         continue;
      }

      /* Construct a synthetic sampler view so we can use our usual
       * sampler view code for the actual descriptor packing.
       *
       * Use the batch pool for a transient allocation, rather than
       * allocating a long-lived descriptor.
       */
      struct panfrost_sampler_view view = {
         .base = util_image_to_sampler_view(image),
         .pool = &batch->pool,
      };

      /* If we specify a cube map, the hardware internally treats it as
       * a 2D array. Since cube maps as images can confuse our common
       * texturing code, explicitly use a 2D array.
       *
       * Similar concerns apply to 3D textures.
       */
      if (view.base.target == PIPE_BUFFER)
         view.base.target = PIPE_BUFFER;
      else
         view.base.target = PIPE_TEXTURE_2D_ARRAY;

      panfrost_update_sampler_view(&view, &ctx->base);
      out[i] = view.bifrost_descriptor;

      panfrost_track_image_access(batch, stage, image);
   }

   return T.gpu;
}
#endif

static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
                                 enum pipe_shader_type st,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
   struct pipe_constant_buffer *cb = &buf->cb[index];
   struct panfrost_resource *rsrc = pan_resource(cb->buffer);

   if (rsrc) {
      panfrost_batch_read_rsrc(batch, rsrc, st);

      /* Alignment guaranteed by
       * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
      return rsrc->image.data.base + cb->buffer_offset;
   } else if (cb->user_buffer) {
      return pan_pool_upload_aligned(&batch->pool.base,
                                     cb->user_buffer + cb->buffer_offset,
                                     cb->buffer_size, 16);
   } else {
      unreachable("No constant buffer");
   }
}

struct sysval_uniform {
   union {
      float f[4];
      int32_t i[4];
      uint32_t u[4];
      uint64_t du[2];
   };
};

static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
                                      struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

   uniform->f[0] = vp->scale[0];
   uniform->f[1] = vp->scale[1];
   uniform->f[2] = vp->scale[2];
}

static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

   uniform->f[0] = vp->translate[0];
   uniform->f[1] = vp->translate[1];
   uniform->f[2] = vp->translate[2];
}

static void
panfrost_upload_txs_sysval(struct panfrost_batch *batch,
                           enum pipe_shader_type st, unsigned int sysvalid,
                           struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
   unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
   bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
   struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;

   assert(dim);

   if (tex->target == PIPE_BUFFER) {
      assert(dim == 1);
      uniform->i[0] = tex->u.buf.size / util_format_get_blocksize(tex->format);
      return;
   }

   uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);

   if (dim > 1)
      uniform->i[1] = u_minify(tex->texture->height0, tex->u.tex.first_level);

   if (dim > 2)
      uniform->i[2] = u_minify(tex->texture->depth0, tex->u.tex.first_level);

   if (is_array) {
      unsigned size = tex->texture->array_size;

      /* Internally, we store the number of 2D images (faces * array
       * size). Externally, we report the array size in terms of
       * complete cubes. So divide by the # of faces per cube.
       */
      if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
         size /= 6;

      uniform->i[dim] = size;
   }
}

static void
panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
                                  enum pipe_shader_type st,
                                  unsigned int sysvalid,
                                  struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
   unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
   unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);

   assert(dim && dim < 4);

   struct pipe_image_view *image = &ctx->images[st][idx];

   if (image->resource->target == PIPE_BUFFER) {
      unsigned blocksize = util_format_get_blocksize(image->format);
      uniform->i[0] = image->resource->width0 / blocksize;
      return;
   }

   uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level);

   if (dim > 1)
      uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level);

   if (dim > 2)
      uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level);

   if (is_array)
      uniform->i[dim] = image->resource->array_size;
}

static void
panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
                            enum pipe_shader_type st, unsigned ssbo_id,
                            struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;

   assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
   struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];

   /* Compute address */
   struct panfrost_resource *rsrc = pan_resource(sb.buffer);
   struct panfrost_bo *bo = rsrc->bo;

   panfrost_batch_write_rsrc(batch, rsrc, st);

   util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
                  sb.buffer_size);

   /* Upload address and size as sysval */
   uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
   uniform->u[2] = sb.buffer_size;
}

static void
panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
                               enum pipe_shader_type st, unsigned samp_idx,
                               struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;

   uniform->f[0] = sampl->min_lod;
   uniform->f[1] = sampl->max_lod;
   uniform->f[2] = sampl->lod_bias;

   /* Even without any errata, Midgard represents "no mipmapping" as
    * fixing the LOD with the clamps; keep behaviour consistent. c.f.
    * panfrost_create_sampler_state which also explains our choice of
    * epsilon value (again to keep behaviour consistent) */

   if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
      uniform->f[1] = uniform->f[0] + (1.0 / 256.0);
}

static void
panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;

   uniform->u[0] = ctx->compute_grid->grid[0];
   uniform->u[1] = ctx->compute_grid->grid[1];
   uniform->u[2] = ctx->compute_grid->grid[2];
}

static void
panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
                                        struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;

   uniform->u[0] = ctx->compute_grid->block[0];
   uniform->u[1] = ctx->compute_grid->block[1];
   uniform->u[2] = ctx->compute_grid->block[2];
}

static void
panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
                                struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;

   uniform->u[0] = ctx->compute_grid->work_dim;
}

/* Sample positions are pushed in a Bifrost specific format on Bifrost. On
 * Midgard, we emulate the Bifrost path with some extra arithmetic in the
 * shader, to keep the code as unified as possible. */

static void
panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
                                        struct sysval_uniform *uniform)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);

   unsigned samples = util_framebuffer_get_num_samples(&batch->key);
   uniform->du[0] =
      dev->sample_positions->ptr.gpu +
      panfrost_sample_positions_offset(panfrost_sample_pattern(samples));
}

static void
panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
                                    struct sysval_uniform *uniform)
{
   unsigned samples = util_framebuffer_get_num_samples(&batch->key);
   uniform->u[0] = (samples > 1) ? ~0 : 0;
}

#if PAN_ARCH >= 6
static void
panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
                                     unsigned size_and_rt,
                                     struct sysval_uniform *uniform)
{
   unsigned rt = size_and_rt & 0xF;
   unsigned size = size_and_rt >> 4;

   if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
      enum pipe_format format = batch->key.cbufs[rt]->format;
      uniform->u[0] =
         GENX(pan_blend_get_internal_desc)(format, rt, size, false) >> 32;
   } else {
      pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
         cfg.memory_format =
            GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_NONE)->hw;
   }
}
#endif

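/* The stream output target's saved offset counts vertices already written, so
 * it is scaled by the per-vertex stride (in bytes) to form a byte offset into
 * the buffer. */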
static unsigned
panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
{
   return target->buffer_offset + (pan_so_target(target)->offset * stride);
}

static void
panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
                        mali_ptr ptr_gpu, struct panfrost_compiled_shader *ss,
                        enum pipe_shader_type st)
{
   struct sysval_uniform *uniforms = ptr_cpu;

   for (unsigned i = 0; i < ss->sysvals.sysval_count; ++i) {
      int sysval = ss->sysvals.sysvals[i];

      switch (PAN_SYSVAL_TYPE(sysval)) {
      case PAN_SYSVAL_VIEWPORT_SCALE:
         panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_VIEWPORT_OFFSET:
         panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_TEXTURE_SIZE:
         panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval),
                                    &uniforms[i]);
         break;
      case PAN_SYSVAL_SSBO:
         panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval),
                                     &uniforms[i]);
         break;

      case PAN_SYSVAL_XFB: {
         unsigned buf = PAN_SYSVAL_ID(sysval);
         struct panfrost_compiled_shader *vs =
            batch->ctx->prog[PIPE_SHADER_VERTEX];
         struct pipe_stream_output_info *so = &vs->stream_output;
         unsigned stride = so->stride[buf] * 4;

         struct pipe_stream_output_target *target = NULL;
         if (buf < batch->ctx->streamout.num_targets)
            target = batch->ctx->streamout.targets[buf];

         if (!target) {
            /* Memory sink */
            uniforms[i].du[0] = 0x8ull << 60;
            break;
         }

         struct panfrost_resource *rsrc = pan_resource(target->buffer);
         unsigned offset = panfrost_xfb_offset(stride, target);

         util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
                        target->buffer_size - offset);

         panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);

         uniforms[i].du[0] = rsrc->image.data.base + offset;
         break;
      }

      case PAN_SYSVAL_NUM_VERTICES:
         uniforms[i].u[0] = batch->ctx->vertex_count;
         break;

      case PAN_SYSVAL_NUM_WORK_GROUPS:
         for (unsigned j = 0; j < 3; j++) {
            batch->num_wg_sysval[j] =
               ptr_gpu + (i * sizeof(*uniforms)) + (j * 4);
         }
         panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_LOCAL_GROUP_SIZE:
         panfrost_upload_local_group_size_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_WORK_DIM:
         panfrost_upload_work_dim_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_SAMPLER:
         panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval),
                                        &uniforms[i]);
         break;
      case PAN_SYSVAL_IMAGE_SIZE:
         panfrost_upload_image_size_sysval(batch, st, PAN_SYSVAL_ID(sysval),
                                           &uniforms[i]);
         break;
      case PAN_SYSVAL_SAMPLE_POSITIONS:
         panfrost_upload_sample_positions_sysval(batch, &uniforms[i]);
         break;
      case PAN_SYSVAL_MULTISAMPLED:
         panfrost_upload_multisampled_sysval(batch, &uniforms[i]);
         break;
#if PAN_ARCH >= 6
      case PAN_SYSVAL_RT_CONVERSION:
         panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval),
                                              &uniforms[i]);
         break;
#endif
      case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
         uniforms[i].u[0] = batch->ctx->offset_start;
         uniforms[i].u[1] = batch->ctx->base_vertex;
         uniforms[i].u[2] = batch->ctx->base_instance;
         break;
      case PAN_SYSVAL_DRAWID:
         uniforms[i].u[0] = batch->ctx->drawid;
         break;
      default:
         assert(0);
      }
   }
}

static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
   struct pipe_constant_buffer *cb = &buf->cb[index];
   struct panfrost_resource *rsrc = pan_resource(cb->buffer);

   if (rsrc) {
      panfrost_bo_mmap(rsrc->bo);
      panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
      panfrost_bo_wait(rsrc->bo, INT64_MAX, false);

      return rsrc->bo->ptr.cpu + cb->buffer_offset;
   } else if (cb->user_buffer) {
      return cb->user_buffer + cb->buffer_offset;
   } else
      unreachable("No constant buffer");
}

/* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
 * implemented with buffer descriptors in the resource table, sized in terms of
 * bytes. On Bifrost and older, UBOs have a special uniform buffer data
 * structure, sized in terms of entries.
 */
static void
panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size)
{
#if PAN_ARCH >= 9
   struct mali_buffer_packed *out = base;

   pan_pack(out + index, BUFFER, cfg) {
      cfg.size = size;
      cfg.address = address;
   }
#else
   struct mali_uniform_buffer_packed *out = base;

   /* Issue (57) for the ARB_uniform_buffer_object spec says that
    * the buffer can be larger than the uniform data inside it,
    * so clamp ubo size to what hardware supports. */

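   /* e.g. a 100-byte UBO occupies DIV_ROUND_UP(100, 16) = 7 entries; the
    * 1 << 12 entry cap below corresponds to 64 KiB of uniform data. */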
   pan_pack(out + index, UNIFORM_BUFFER, cfg) {
      cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
      cfg.pointer = address;
   }
#endif
}

static mali_ptr
panfrost_emit_const_buf(struct panfrost_batch *batch,
                        enum pipe_shader_type stage, unsigned *buffer_count,
                        mali_ptr *push_constants, unsigned *pushed_words)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
   struct panfrost_compiled_shader *ss = ctx->prog[stage];

   if (!ss)
      return 0;

   /* Allocate room for the sysval and the uniforms */
   size_t sys_size = sizeof(float) * 4 * ss->sysvals.sysval_count;
   struct panfrost_ptr transfer =
      pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);

   /* Upload sysvals requested by the shader */
   uint8_t *sysvals = alloca(sys_size);
   panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage);
   memcpy(transfer.cpu, sysvals, sys_size);

   /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
   struct panfrost_compiled_shader *shader = ctx->prog[stage];
   unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
   unsigned sysval_ubo = sys_size ? ubo_count : ~0;
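   /* e.g. a shader using 3 API UBOs plus sysvals has info.ubo_count == 4, so
    * ubo_count == 3 here and the sysvals are emitted as UBO index 3 */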
1338    struct panfrost_ptr ubos = {0};
1339 
1340 #if PAN_ARCH >= 9
1341    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER);
1342 #else
1343    ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1,
1344                                     UNIFORM_BUFFER);
1345 #endif
1346 
1347    if (buffer_count)
1348       *buffer_count = ubo_count + (sys_size ? 1 : 0);
1349 
1350    /* Upload sysval as a final UBO */
1351 
1352    if (sys_size)
1353       panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1354 
1355    /* The rest are honest-to-goodness UBOs */
1356 
1357    u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1358       size_t usz = buf->cb[ubo].buffer_size;
1359       mali_ptr address = 0;
1360 
1361       if (usz > 0) {
1362          address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo);
1363       }
1364 
1365       panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1366    }
1367 
1368    if (pushed_words)
1369       *pushed_words = ss->info.push.count;
1370 
1371    if (ss->info.push.count == 0)
1372       return ubos.gpu;
1373 
1374    /* Copy push constants required by the shader */
1375    struct panfrost_ptr push_transfer =
1376       pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16);
1377 
1378    uint32_t *push_cpu = (uint32_t *)push_transfer.cpu;
1379    *push_constants = push_transfer.gpu;
1380 
1381    for (unsigned i = 0; i < ss->info.push.count; ++i) {
1382       struct panfrost_ubo_word src = ss->info.push.words[i];
1383 
1384       if (src.ubo == sysval_ubo) {
1385          unsigned sysval_idx = src.offset / 16;
1386          unsigned sysval_comp = (src.offset % 16) / 4;
1387          unsigned sysval_type =
1388             PAN_SYSVAL_TYPE(ss->sysvals.sysvals[sysval_idx]);
1389          mali_ptr ptr = push_transfer.gpu + (4 * i);
1390 
1391          if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS)
1392             batch->num_wg_sysval[sysval_comp] = ptr;
1393       }
1394       /* Map the UBO; this should be cheap. For some buffers this may
1395        * read from write-combine memory which is slow, though :-(
1396        */
1397       const void *mapped_ubo =
1398          (src.ubo == sysval_ubo)
1399             ? sysvals
1400             : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1401 
1402       /* TODO: Is there any benefit to combining ranges */
1403       memcpy(push_cpu + i, (uint8_t *)mapped_ubo + src.offset, 4);
1404    }
1405 
1406    return ubos.gpu;
1407 }
1408 
1409 /*
1410  * Choose the number of WLS instances to allocate. This must be a power-of-two.
1411  * The number of WLS instances limits the number of concurrent tasks on a given
1412  * shader core; setting it to the (rounded) total number of tasks avoids any
1413  * throttling. Smaller values save memory at the expense of possible throttling.
1414  *
1415  * With indirect dispatch, we don't know at launch-time how many tasks will be
1416  * needed, so we use a conservative value that's unlikely to cause slowdown in
1417  * practice without wasting too much memory.
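 * For example, a direct 3x5x1 dispatch rounds up to 4 * 8 * 1 = 32 instances.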
1418  */
1419 static unsigned
1420 panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
1421 {
1422    if (grid->indirect) {
1423       /* May need tuning in the future, conservative guess */
1424       return 128;
1425    } else {
1426       return util_next_power_of_two(grid->grid[0]) *
1427              util_next_power_of_two(grid->grid[1]) *
1428              util_next_power_of_two(grid->grid[2]);
1429    }
1430 }
1431 
1432 static mali_ptr
1433 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1434                             const struct pipe_grid_info *grid)
1435 {
1436    struct panfrost_context *ctx = batch->ctx;
1437    struct panfrost_device *dev = pan_device(ctx->base.screen);
1438    struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE];
1439    struct panfrost_ptr t =
1440       pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1441 
1442    struct pan_tls_info info = {
1443       .tls.size = ss->info.tls_size,
1444       .wls.size = ss->info.wls_size + grid->variable_shared_mem,
1445       .wls.instances = panfrost_choose_wls_instance_count(grid),
1446    };
1447 
1448    if (ss->info.tls_size) {
1449       struct panfrost_bo *bo = panfrost_batch_get_scratchpad(
1450          batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range);
1451       info.tls.ptr = bo->ptr.gpu;
1452    }
1453 
1454    if (info.wls.size) {
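      /* Total WLS footprint: the adjusted per-instance size, times the
       * instance count, replicated for every shader core in the device. */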
1455       unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
1456                       dev->core_id_range;
1457 
1458       struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
1459 
1460       info.wls.ptr = bo->ptr.gpu;
1461    }
1462 
1463    GENX(pan_emit_tls)(&info, t.cpu);
1464    return t.gpu;
1465 }
1466 
1467 #if PAN_ARCH <= 5
1468 static mali_ptr
1469 panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st,
1470                       struct panfrost_sampler_view *view)
1471 {
1472    if (!view)
1473       return (mali_ptr)0;
1474 
1475    struct pipe_sampler_view *pview = &view->base;
1476    struct panfrost_resource *rsrc = pan_resource(pview->texture);
1477 
1478    panfrost_batch_read_rsrc(batch, rsrc, st);
1479    panfrost_batch_add_bo(batch, view->state.bo, st);
1480 
1481    return view->state.gpu;
1482 }
1483 #endif
1484 
1485 static void
1486 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1487                                 struct pipe_context *pctx,
1488                                 struct pipe_resource *texture)
1489 {
1490    struct panfrost_device *device = pan_device(pctx->screen);
1491    struct panfrost_context *ctx = pan_context(pctx);
1492    struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1493    enum pipe_format format = so->base.format;
1494    assert(prsrc->bo);
1495 
1496    /* Format to access the stencil/depth portion of a Z32_S8 texture */
1497    if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1498       assert(prsrc->separate_stencil);
1499       texture = &prsrc->separate_stencil->base;
1500       prsrc = (struct panfrost_resource *)texture;
1501       format = texture->format;
1502    } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1503       format = PIPE_FORMAT_Z32_FLOAT;
1504    }
1505 
1506    so->texture_bo = prsrc->image.data.base;
1507    so->modifier = prsrc->image.layout.modifier;
1508 
1509    /* MSAA only supported for 2D textures */
1510 
1511    assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D ||
1512           so->base.target == PIPE_TEXTURE_2D_ARRAY);
1513 
1514    enum mali_texture_dimension type =
1515       panfrost_translate_texture_dimension(so->base.target);
1516 
1517    bool is_buffer = (so->base.target == PIPE_BUFFER);
1518 
1519    unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1520    unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1521    unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1522    unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1523    unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1524    unsigned buf_size =
1525       (is_buffer ? so->base.u.buf.size : 0) / util_format_get_blocksize(format);
1526 
1527    if (so->base.target == PIPE_TEXTURE_3D) {
1528       first_layer /= prsrc->image.layout.depth;
1529       last_layer /= prsrc->image.layout.depth;
1530       assert(!first_layer && !last_layer);
1531    }
1532 
1533    struct pan_image_view iview = {
1534       .format = format,
1535       .dim = type,
1536       .first_level = first_level,
1537       .last_level = last_level,
1538       .first_layer = first_layer,
1539       .last_layer = last_layer,
1540       .swizzle =
1541          {
1542             so->base.swizzle_r,
1543             so->base.swizzle_g,
1544             so->base.swizzle_b,
1545             so->base.swizzle_a,
1546          },
1547       .planes = {NULL},
1548       .buf.offset = buf_offset,
1549       .buf.size = buf_size,
1550    };
1551 
1552    panfrost_set_image_view_planes(&iview, texture);
1553 
1554    unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1555                    GENX(panfrost_estimate_texture_payload_size)(&iview);
1556 
1557    struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1558    struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1559    so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1560 
1561    void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1562 
1563    if (PAN_ARCH <= 5) {
1564       payload.cpu += pan_size(TEXTURE);
1565       payload.gpu += pan_size(TEXTURE);
1566    }
1567 
1568    if ((device->debug & PAN_DBG_YUV) && panfrost_format_is_yuv(format)) {
1569       const struct util_format_description *desc =
1570          util_format_description(format);
1571 
1572       if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
1573          iview.swizzle[2] = PIPE_SWIZZLE_1;
1574       } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLANAR2) {
1575          iview.swizzle[1] = PIPE_SWIZZLE_0;
1576          iview.swizzle[2] = PIPE_SWIZZLE_0;
1577       }
1578    }
1579 
1580    GENX(panfrost_new_texture)(&iview, tex, &payload);
1581 }
1582 
1583 static void
1584 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1585                              struct pipe_context *pctx)
1586 {
1587    struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1588    if (view->texture_bo != rsrc->image.data.base ||
1589        view->modifier != rsrc->image.layout.modifier) {
1590       panfrost_bo_unreference(view->state.bo);
1591       panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1592    }
1593 }
1594 
1595 #if PAN_ARCH >= 6
1596 static void
1597 panfrost_emit_null_texture(struct mali_texture_packed *out)
1598 
1599 {
1600    /* Annoyingly, an all zero texture descriptor is not valid and will raise
1601     * a DATA_INVALID_FAULT if you try to texture it, instead of returning
1602     * 0000s! Fill in with something that will behave robustly.
1603     */
1604    pan_pack(out, TEXTURE, cfg) {
1605       cfg.dimension = MALI_TEXTURE_DIMENSION_2D;
1606       cfg.width = 1;
1607       cfg.height = 1;
1608       cfg.depth = 1;
1609       cfg.array_size = 1;
1610       cfg.format = MALI_PACK_FMT(CONSTANT, 0000, L);
1611 #if PAN_ARCH <= 7
1612       cfg.texel_ordering = MALI_TEXTURE_LAYOUT_LINEAR;
1613 #endif
1614    }
1615 }
1616 #endif
1617 
1618 static mali_ptr
1619 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1620                                   enum pipe_shader_type stage)
1621 {
1622    struct panfrost_context *ctx = batch->ctx;
1623 
1624    unsigned actual_count = ctx->sampler_view_count[stage];
1625    unsigned needed_count = ctx->prog[stage]->info.texture_count;
1626    unsigned alloc_count = MAX2(actual_count, needed_count);
1627 
1628    if (!alloc_count)
1629       return 0;
1630 
1631 #if PAN_ARCH >= 6
1632    struct panfrost_ptr T =
1633       pan_pool_alloc_desc_array(&batch->pool.base, alloc_count, TEXTURE);
1634    struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
1635 
1636    for (int i = 0; i < actual_count; ++i) {
1637       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1638 
1639       if (!view) {
1640          panfrost_emit_null_texture(&out[i]);
1641          continue;
1642       }
1643 
1644       struct pipe_sampler_view *pview = &view->base;
1645       struct panfrost_resource *rsrc = pan_resource(pview->texture);
1646 
1647       panfrost_update_sampler_view(view, &ctx->base);
1648       out[i] = view->bifrost_descriptor;
1649 
1650       panfrost_batch_read_rsrc(batch, rsrc, stage);
1651       panfrost_batch_add_bo(batch, view->state.bo, stage);
1652    }
1653 
1654    for (int i = actual_count; i < needed_count; ++i)
1655       panfrost_emit_null_texture(&out[i]);
1656 
1657    return T.gpu;
1658 #else
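   /* Pre-Bifrost, the texture array holds 64-bit pointers ("trampolines") to
    * the per-view descriptors rather than the packed descriptors themselves. */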
1659    uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1660 
1661    for (int i = 0; i < actual_count; ++i) {
1662       struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1663 
1664       if (!view) {
1665          trampolines[i] = 0;
1666          continue;
1667       }
1668 
1669       panfrost_update_sampler_view(view, &ctx->base);
1670 
1671       trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1672    }
1673 
1674    for (int i = actual_count; i < needed_count; ++i)
1675       trampolines[i] = 0;
1676 
1677    return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1678                                   sizeof(uint64_t) * alloc_count,
1679                                   sizeof(uint64_t));
1680 #endif
1681 }
1682 
1683 static mali_ptr
1684 panfrost_upload_wa_sampler(struct panfrost_batch *batch)
1685 {
1686    struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER);
1687    pan_pack(T.cpu, SAMPLER, cfg)
1688       ;
1689    return T.gpu;
1690 }
1691 
1692 static mali_ptr
1693 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1694                                   enum pipe_shader_type stage)
1695 {
1696    struct panfrost_context *ctx = batch->ctx;
1697 
1698    /* We always need at least 1 sampler for txf to work */
1699    if (!ctx->sampler_count[stage])
1700       return panfrost_upload_wa_sampler(batch);
1701 
1702    struct panfrost_ptr T = pan_pool_alloc_desc_array(
1703       &batch->pool.base, ctx->sampler_count[stage], SAMPLER);
1704    struct mali_sampler_packed *out = (struct mali_sampler_packed *)T.cpu;
1705 
1706    for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1707       struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1708 
1709       out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1710    }
1711 
1712    return T.gpu;
1713 }
1714 
1715 #if PAN_ARCH <= 7
1716 /* Packs all image attribute descs and attribute buffer descs.
1717  * `first_image_buf_index` must be the index of the first image attribute buffer
1718  * descriptor.
1719  */
1720 static void
1721 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1722                    struct mali_attribute_packed *attribs, unsigned first_buf)
1723 {
1724    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1725 
1726    for (unsigned i = 0; i < last_bit; ++i) {
1727       enum pipe_format format = ctx->images[shader][i].format;
1728 
1729       pan_pack(attribs + i, ATTRIBUTE, cfg) {
1730          /* Continuation record means 2 buffers per image */
1731          cfg.buffer_index = first_buf + (i * 2);
1732          cfg.offset_enable = (PAN_ARCH <= 5);
1733          cfg.format = GENX(panfrost_format_from_pipe_format)(format)->hw;
1734       }
1735    }
1736 }
1737 
1738 static enum mali_attribute_type
1739 pan_modifier_to_attr_type(uint64_t modifier)
1740 {
1741    switch (modifier) {
1742    case DRM_FORMAT_MOD_LINEAR:
1743       return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1744    case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1745       return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1746    default:
1747       unreachable("Invalid modifier for attribute record");
1748    }
1749 }
1750 
1751 static void
1752 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1753                 struct mali_attribute_buffer_packed *bufs,
1754                 unsigned first_image_buf_index)
1755 {
1756    struct panfrost_context *ctx = batch->ctx;
1757    unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1758 
1759    for (unsigned i = 0; i < last_bit; ++i) {
1760       struct pipe_image_view *image = &ctx->images[shader][i];
1761 
1762       if (!(ctx->image_mask[shader] & (1 << i)) ||
1763           !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1764          /* Unused image bindings */
1765          pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg)
1766             ;
1767          pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg)
1768             ;
1769          continue;
1770       }
1771 
1772       struct panfrost_resource *rsrc = pan_resource(image->resource);
1773 
1774       bool is_msaa = image->resource->nr_samples > 1;
1775 
1776       bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1777       bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1778 
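      /* For 3D and MSAA images, first_layer selects a surface (depth slice or
       * sample) rather than an array layer, hence the swapped arguments below. */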
1779       unsigned offset = is_buffer ? image->u.buf.offset
1780                                   : panfrost_texture_offset(
1781                                        &rsrc->image.layout, image->u.tex.level,
1782                                        (is_3d || is_msaa) ? 0 : image->u.tex.first_layer,
1783                                        (is_3d || is_msaa) ? image->u.tex.first_layer : 0);
1784 
1785       panfrost_track_image_access(batch, shader, image);
1786 
1787       pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1788          cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1789          cfg.pointer = rsrc->image.data.base + offset;
1790          cfg.stride = util_format_get_blocksize(image->format);
1791          cfg.size = panfrost_bo_size(rsrc->bo) - offset;
1792       }
1793 
1794       if (is_buffer) {
1795          pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1796             cfg.s_dimension =
1797                rsrc->base.width0 / util_format_get_blocksize(image->format);
1798             cfg.t_dimension = cfg.r_dimension = 1;
1799          }
1800 
1801          continue;
1802       }
1803 
1804       pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1805          unsigned level = image->u.tex.level;
1806          unsigned r_dim;
1807 
1808          if (is_3d) {
1809             r_dim = u_minify(rsrc->base.depth0, level);
1810          } else if (is_msaa) {
1811             r_dim = u_minify(image->resource->nr_samples, level);
1812          } else {
1813             r_dim = image->u.tex.last_layer - image->u.tex.first_layer + 1;
1814          }
1815          cfg.s_dimension = u_minify(rsrc->base.width0, level);
1816          cfg.t_dimension = u_minify(rsrc->base.height0, level);
1817          cfg.r_dimension = r_dim;
1818 
1819          cfg.row_stride = rsrc->image.layout.slices[level].row_stride;
1820 
1821          if (is_msaa) {
1822             unsigned samples = rsrc->base.nr_samples;
1823             cfg.slice_stride =
1824                panfrost_get_layer_stride(&rsrc->image.layout, level) / samples;
1825          } else if (rsrc->base.target != PIPE_TEXTURE_2D) {
1826             cfg.slice_stride =
1827                panfrost_get_layer_stride(&rsrc->image.layout, level);
1828          }
1829       }
1830    }
1831 }
1832 
1833 static mali_ptr
1834 panfrost_emit_image_attribs(struct panfrost_batch *batch, mali_ptr *buffers,
1835                             enum pipe_shader_type type)
1836 {
1837    struct panfrost_context *ctx = batch->ctx;
1838    struct panfrost_compiled_shader *shader = ctx->prog[type];
1839 
1840    if (!shader->info.attribute_count) {
1841       *buffers = 0;
1842       return 0;
1843    }
1844 
1845    /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1846    unsigned attr_count = shader->info.attribute_count;
1847    unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1848 
1849    struct panfrost_ptr bufs =
1850       pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1851 
1852    struct panfrost_ptr attribs =
1853       pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1854 
1855    emit_image_attribs(ctx, type, attribs.cpu, 0);
1856    emit_image_bufs(batch, type, bufs.cpu, 0);
1857 
1858    /* We need an empty attrib buf to stop the prefetching on Bifrost */
1859 #if PAN_ARCH >= 6
1860    pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1861             ATTRIBUTE_BUFFER, cfg)
1862       ;
1863 #endif
1864 
1865    *buffers = bufs.gpu;
1866    return attribs.gpu;
1867 }
1868 
1869 static mali_ptr
1870 panfrost_emit_vertex_data(struct panfrost_batch *batch, mali_ptr *buffers)
1871 {
1872    struct panfrost_context *ctx = batch->ctx;
1873    struct panfrost_vertex_state *so = ctx->vertex;
1874    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
1875    bool instanced = ctx->instance_count > 1;
1876    uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1877    unsigned nr_images = util_last_bit(image_mask);
1878 
1879    /* Worst case: everything is NPOT, which is only possible if instancing
1880     * is enabled. Otherwise a single record is guaranteed.
1881     * Also, we allocate more memory than what's needed here if either instancing
1882     * is enabled or images are present; this can be improved. */
1883    unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
1884    unsigned nr_bufs =
1885       ((so->nr_bufs + nr_images) * bufs_per_attrib) + (PAN_ARCH >= 6 ? 1 : 0);
1886 
1887    unsigned count = vs->info.attribute_count;
1888 
1889    struct panfrost_compiled_shader *xfb =
1890       ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb;
1891 
1892    if (xfb)
1893       count = MAX2(count, xfb->info.attribute_count);
1894 
1895 #if PAN_ARCH <= 5
1896    /* Midgard needs vertexid/instanceid handled specially */
1897    bool special_vbufs = count >= PAN_VERTEX_ID;
1898 
1899    if (special_vbufs)
1900       nr_bufs += 2;
1901 #endif
1902 
1903    if (!nr_bufs) {
1904       *buffers = 0;
1905       return 0;
1906    }
1907 
1908    struct panfrost_ptr S =
1909       pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER);
1910    struct panfrost_ptr T =
1911       pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE);
1912 
1913    struct mali_attribute_buffer_packed *bufs =
1914       (struct mali_attribute_buffer_packed *)S.cpu;
1915 
1916    struct mali_attribute_packed *out = (struct mali_attribute_packed *)T.cpu;
1917 
1918    unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = {0};
1919    unsigned k = 0;
1920 
1921    for (unsigned i = 0; i < so->nr_bufs; ++i) {
1922       unsigned vbi = so->buffers[i].vbi;
1923       unsigned divisor = so->buffers[i].divisor;
1924       attrib_to_buffer[i] = k;
1925 
1926       if (!(ctx->vb_mask & (1 << vbi)))
1927          continue;
1928 
1929       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1930       struct panfrost_resource *rsrc;
1931 
1932       rsrc = pan_resource(buf->buffer.resource);
1933       if (!rsrc)
1934          continue;
1935 
1936       panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1937 
1938       /* Mask off lower bits, see offset fixup below */
1939       mali_ptr raw_addr = rsrc->image.data.base + buf->buffer_offset;
1940       mali_ptr addr = raw_addr & ~63;
1941 
1942       /* Since we rounded the base pointer down, grow the size by the bytes
1943        * we stepped back over, minus the buffer offset already consumed */
1944       unsigned size =
1945          rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset;
1946 
1947       /* When there is a divisor, the hardware-level divisor is
1948        * the product of the instance divisor and the padded count */
1949       unsigned stride = so->strides[vbi];
1950       unsigned hw_divisor = ctx->padded_count * divisor;
1951 
1952       if (ctx->instance_count <= 1) {
1953          /* Per-instance would be every attribute equal */
1954          if (divisor)
1955             stride = 0;
1956 
1957          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1958             cfg.pointer = addr;
1959             cfg.stride = stride;
1960             cfg.size = size;
1961          }
1962       } else if (!divisor) {
1963          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1964             cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1965             cfg.pointer = addr;
1966             cfg.stride = stride;
1967             cfg.size = size;
1968             cfg.divisor = ctx->padded_count;
1969          }
1970       } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1971          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1972             cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1973             cfg.pointer = addr;
1974             cfg.stride = stride;
1975             cfg.size = size;
1976             cfg.divisor_r = __builtin_ctz(hw_divisor);
1977          }
1978 
1979       } else {
1980          unsigned shift = 0, extra_flags = 0;
1981 
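         /* An NPOT hardware divisor is implemented as a fixed-point multiply
          * and shift; the magic numerator goes in the continuation record
          * packed below. */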
1982          unsigned magic_divisor =
1983             panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1984 
1985          /* Records with continuations must be aligned */
1986          k = ALIGN_POT(k, 2);
1987          attrib_to_buffer[i] = k;
1988 
1989          pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1990             cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1991             cfg.pointer = addr;
1992             cfg.stride = stride;
1993             cfg.size = size;
1994 
1995             cfg.divisor_r = shift;
1996             cfg.divisor_e = extra_flags;
1997          }
1998 
1999          pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2000             cfg.divisor_numerator = magic_divisor;
2001             cfg.divisor = divisor;
2002          }
2003 
2004          ++k;
2005       }
2006 
2007       ++k;
2008    }
2009 
2010 #if PAN_ARCH <= 5
2011    /* Add special gl_VertexID/gl_InstanceID buffers */
2012    if (special_vbufs) {
2013       panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2014 
2015       pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2016          cfg.buffer_index = k++;
2017          cfg.format = so->formats[PAN_VERTEX_ID];
2018       }
2019 
2020       panfrost_instance_id(ctx->padded_count, &bufs[k],
2021                            ctx->instance_count > 1);
2022 
2023       pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2024          cfg.buffer_index = k++;
2025          cfg.format = so->formats[PAN_INSTANCE_ID];
2026       }
2027    }
2028 #endif
2029 
2030    if (nr_images) {
2031       k = ALIGN_POT(k, 2);
2032       emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2033       emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2034       k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2035    }
2036 
2037 #if PAN_ARCH >= 6
2038    /* We need an empty attrib buf to stop the prefetching on Bifrost */
2039    pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg)
2040       ;
2041 #endif
2042 
2043    /* Attribute addresses require 64-byte alignment, so let:
2044     *
2045     *      base' = base & ~63 = base - (base & 63)
2046     *      offset' = offset + (base & 63)
2047     *
2048     * Since base' + offset' = base + offset, these are equivalent
2049     * addressing modes and now base is 64 aligned.
2050     */
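   /* For example, base = 0x1007 gives base' = 0x1000 and folds 7 bytes into
    * the attribute offset. */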
2051 
2052    /* While these are usually equal, they are not required to be. In some
2053     * cases, u_blitter passes too high a value for num_elements.
2054     */
2055    assert(vs->info.attributes_read_count <= so->num_elements);
2056 
2057    for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2058       unsigned vbi = so->pipe[i].vertex_buffer_index;
2059       struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2060 
2061       /* BOs are aligned; just fixup for buffer_offset */
2062       signed src_offset = so->pipe[i].src_offset;
2063       src_offset += (buf->buffer_offset & 63);
2064 
2065       /* Base instance offset */
2066       if (ctx->base_instance && so->pipe[i].instance_divisor) {
2067          src_offset += (ctx->base_instance * so->pipe[i].src_stride) /
2068                        so->pipe[i].instance_divisor;
2069       }
2070 
2071       /* Also, somewhat obscurely, per-instance data needs to be
2072        * offset in response to a delayed start in an indexed draw */
2073 
2074       if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2075          src_offset -= so->pipe[i].src_stride * ctx->offset_start;
2076 
2077       pan_pack(out + i, ATTRIBUTE, cfg) {
2078          cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2079          cfg.format = so->formats[i];
2080          cfg.offset = src_offset;
2081       }
2082    }
2083 
2084    *buffers = S.gpu;
2085    return T.gpu;
2086 }
2087 
2088 static mali_ptr
2089 panfrost_emit_varyings(struct panfrost_batch *batch,
2090                        struct mali_attribute_buffer_packed *slot,
2091                        unsigned stride, unsigned count)
2092 {
2093    unsigned size = stride * count;
2094    mali_ptr ptr =
2095       pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2096 
2097    pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2098       cfg.stride = stride;
2099       cfg.size = size;
2100       cfg.pointer = ptr;
2101    }
2102 
2103    return ptr;
2104 }
2105 
2106 /* Given a varying, figure out which index it corresponds to */
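/* For example, if only GENERAL, POSITION and PSIZ are present, PSIZ lands in
 * buffer index 2 (the number of lower bits set). */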
2107 
2108 static inline unsigned
2109 pan_varying_index(unsigned present, enum pan_special_varying v)
2110 {
2111    return util_bitcount(present & BITFIELD_MASK(v));
2112 }
2113 
2114 /* Determines which varying buffers are required */
2115 
2116 static inline unsigned
2117 pan_varying_present(const struct panfrost_device *dev,
2118                     struct pan_shader_info *producer,
2119                     struct pan_shader_info *consumer, uint16_t point_coord_mask)
2120 {
2121    /* At the moment we always emit general and position buffers. Not
2122     * strictly necessary but usually harmless */
2123 
2124    unsigned present =
2125       BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2126 
2127    /* Enable special buffers by the shader info */
2128 
2129    if (producer->vs.writes_point_size)
2130       present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2131 
2132 #if PAN_ARCH <= 5
2133    /* On Midgard, these exist as real varyings. Later architectures use
2134     * LD_VAR_SPECIAL reads instead. */
2135 
2136    if (consumer->fs.reads_point_coord)
2137       present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2138 
2139    if (consumer->fs.reads_face)
2140       present |= BITFIELD_BIT(PAN_VARY_FACE);
2141 
2142    if (consumer->fs.reads_frag_coord)
2143       present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2144 
2145    /* Also, if we have a point sprite, we need a point coord buffer */
2146 
2147    for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
2148       gl_varying_slot loc = consumer->varyings.input[i].location;
2149 
2150       if (util_varying_is_point_coord(loc, point_coord_mask))
2151          present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2152    }
2153 #endif
2154 
2155    return present;
2156 }
2157 
2158 /* Emitters for varying records */
2159 
2160 static void
2161 pan_emit_vary(const struct panfrost_device *dev,
2162               struct mali_attribute_packed *out, unsigned buffer_index,
2163               mali_pixel_format format, unsigned offset)
2164 {
2165    pan_pack(out, ATTRIBUTE, cfg) {
2166       cfg.buffer_index = buffer_index;
2167       cfg.offset_enable = (PAN_ARCH <= 5);
2168       cfg.format = format;
2169       cfg.offset = offset;
2170    }
2171 }
2172 
2173 /* Special records */
2174 
2175 /* clang-format off */
2176 static const struct {
2177    unsigned components;
2178    enum mali_format format;
2179 } pan_varying_formats[PAN_VARY_MAX] = {
2180    [PAN_VARY_POSITION]  = { 4, MALI_SNAP_4   },
2181    [PAN_VARY_PSIZ]      = { 1, MALI_R16F     },
2182    [PAN_VARY_PNTCOORD]  = { 4, MALI_RGBA32F  },
2183    [PAN_VARY_FACE]      = { 1, MALI_R32I     },
2184    [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F  },
2185 };
2186 /* clang-format on */
2187 
2188 static mali_pixel_format
2189 pan_special_format(const struct panfrost_device *dev,
2190                    enum pan_special_varying buf)
2191 {
2192    assert(buf < PAN_VARY_MAX);
2193    mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2194 
2195 #if PAN_ARCH <= 6
2196    unsigned nr = pan_varying_formats[buf].components;
2197    format |= panfrost_get_default_swizzle(nr);
2198 #endif
2199 
2200    return format;
2201 }
2202 
2203 static void
2204 pan_emit_vary_special(const struct panfrost_device *dev,
2205                       struct mali_attribute_packed *out, unsigned present,
2206                       enum pan_special_varying buf)
2207 {
2208    pan_emit_vary(dev, out, pan_varying_index(present, buf),
2209                  pan_special_format(dev, buf), 0);
2210 }
2211 
2212 /* Negative indicates a varying is not found */
2213 
2214 static signed
2215 pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count,
2216               unsigned loc)
2217 {
2218    for (unsigned i = 0; i < vary_count; ++i) {
2219       if (vary[i].location == loc)
2220          return i;
2221    }
2222 
2223    return -1;
2224 }
2225 
2226 /* Assign varying locations for the general buffer. Returns the calculated
2227  * per-vertex stride, and outputs offsets into the passed array. Negative
2228  * offset indicates a varying is not used. */
2229 
2230 static unsigned
2231 pan_assign_varyings(const struct panfrost_device *dev,
2232                     struct pan_shader_info *producer,
2233                     struct pan_shader_info *consumer, signed *offsets)
2234 {
2235    unsigned producer_count = producer->varyings.output_count;
2236    unsigned consumer_count = consumer->varyings.input_count;
2237 
2238    const struct pan_shader_varying *producer_vars = producer->varyings.output;
2239    const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2240 
2241    unsigned stride = 0;
2242 
2243    for (unsigned i = 0; i < producer_count; ++i) {
2244       signed loc = pan_find_vary(consumer_vars, consumer_count,
2245                                  producer_vars[i].location);
2246       enum pipe_format format =
2247          loc >= 0 ? consumer_vars[loc].format : PIPE_FORMAT_NONE;
2248 
2249       if (format != PIPE_FORMAT_NONE) {
2250          offsets[i] = stride;
2251          stride += util_format_get_blocksize(format);
2252       } else {
2253          offsets[i] = -1;
2254       }
2255    }
2256 
2257    return stride;
2258 }
2259 
2260 /* Emitter for a single varying (attribute) descriptor */
2261 
2262 static void
2263 panfrost_emit_varying(const struct panfrost_device *dev,
2264                       struct mali_attribute_packed *out,
2265                       const struct pan_shader_varying varying,
2266                       enum pipe_format pipe_format, unsigned present,
2267                       uint16_t point_sprite_mask, signed offset,
2268                       enum pan_special_varying pos_varying)
2269 {
2270    /* Note: varying.format != pipe_format in some obscure cases due to a
2271     * limitation of the NIR linker. This should be fixed in the future to
2272     * eliminate the additional lookups. See:
2273     * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2274     */
2275    gl_varying_slot loc = varying.location;
2276    mali_pixel_format format =
2277       GENX(panfrost_format_from_pipe_format)(pipe_format)->hw;
2278 
2279    if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2280       pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2281    } else if (loc == VARYING_SLOT_POS) {
2282       pan_emit_vary_special(dev, out, present, pos_varying);
2283    } else if (loc == VARYING_SLOT_PSIZ) {
2284       pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2285    } else if (loc == VARYING_SLOT_FACE) {
2286       pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2287    } else if (offset < 0) {
2288       pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2289    } else {
2290       STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2291       pan_emit_vary(dev, out, 0, format, offset);
2292    }
2293 }
2294 
2295 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2296  * rather than draw time (under good conditions). */
2297 
2298 static void
2299 panfrost_emit_varying_descs(struct panfrost_pool *pool,
2300                             struct panfrost_compiled_shader *producer,
2301                             struct panfrost_compiled_shader *consumer,
2302                             uint16_t point_coord_mask, struct pan_linkage *out)
2303 {
2304    struct panfrost_device *dev = pool->dev;
2305    unsigned producer_count = producer->info.varyings.output_count;
2306    unsigned consumer_count = consumer->info.varyings.input_count;
2307 
2308    /* Offsets within the general varying buffer, indexed by location */
2309    signed offsets[PAN_MAX_VARYINGS];
2310    assert(producer_count <= ARRAY_SIZE(offsets));
2311    assert(consumer_count <= ARRAY_SIZE(offsets));
2312 
2313    /* Allocate enough descriptors for both shader stages */
2314    struct panfrost_ptr T = pan_pool_alloc_desc_array(
2315       &pool->base, producer_count + consumer_count, ATTRIBUTE);
2316 
2317    /* Take a reference if we're being put on the CSO */
2318    if (!pool->owned) {
2319       out->bo = pool->transient_bo;
2320       panfrost_bo_reference(out->bo);
2321    }
2322 
2323    struct mali_attribute_packed *descs = T.cpu;
2324    out->producer = producer_count ? T.gpu : 0;
2325    out->consumer =
2326       consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0;
2327 
2328    /* Lay out the varyings. Must use producer to lay out, in order to
2329     * respect transform feedback precisions. */
2330    out->present = pan_varying_present(dev, &producer->info, &consumer->info,
2331                                       point_coord_mask);
2332 
2333    out->stride =
2334       pan_assign_varyings(dev, &producer->info, &consumer->info, offsets);
2335 
2336    for (unsigned i = 0; i < producer_count; ++i) {
2337       signed j = pan_find_vary(consumer->info.varyings.input,
2338                                consumer->info.varyings.input_count,
2339                                producer->info.varyings.output[i].location);
2340 
2341       enum pipe_format format = (j >= 0)
2342                                    ? consumer->info.varyings.input[j].format
2343                                    : producer->info.varyings.output[i].format;
2344 
2345       panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i],
2346                             format, out->present, 0, offsets[i],
2347                             PAN_VARY_POSITION);
2348    }
2349 
2350    for (unsigned i = 0; i < consumer_count; ++i) {
2351       signed j = pan_find_vary(producer->info.varyings.output,
2352                                producer->info.varyings.output_count,
2353                                consumer->info.varyings.input[i].location);
2354 
2355       signed offset = (j >= 0) ? offsets[j] : -1;
2356 
2357       panfrost_emit_varying(
2358          dev, descs + producer_count + i, consumer->info.varyings.input[i],
2359          consumer->info.varyings.input[i].format, out->present,
2360          point_coord_mask, offset, PAN_VARY_FRAGCOORD);
2361    }
2362 }
2363 
2364 #if PAN_ARCH <= 5
2365 static void
2366 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2367                        unsigned present, enum pan_special_varying v,
2368                        unsigned special)
2369 {
2370    if (present & BITFIELD_BIT(v)) {
2371       unsigned idx = pan_varying_index(present, v);
2372 
2373       pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2374          cfg.special = special;
2375          cfg.type = 0;
2376       }
2377    }
2378 }
2379 #endif
2380 
2381 static void
2382 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2383                                  unsigned vertex_count,
2384                                  bool point_coord_replace)
2385 {
2386    struct panfrost_context *ctx = batch->ctx;
2387    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2388    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
2389 
2390    uint16_t point_coord_mask = 0;
2391 
2392    memset(&batch->varyings, 0, sizeof(batch->varyings));
2393 
2394 #if PAN_ARCH <= 5
2395    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2396 
2397    /* Point sprites are lowered on Bifrost and newer */
2398    if (point_coord_replace)
2399       point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2400 #endif
2401 
2402    /* In good conditions, we only need to link varyings once */
2403    bool prelink =
2404       (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable;
2405 
2406    /* Try to reduce copies */
2407    struct pan_linkage _linkage;
2408    struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2409 
2410    /* Emit ATTRIBUTE descriptors if needed */
2411    if (!prelink || vs->linkage.bo == NULL) {
2412       struct panfrost_pool *pool = prelink ? &ctx->descs : &batch->pool;
2413 
2414       panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2415    }
2416 
2417    unsigned present = linkage->present, stride = linkage->stride;
2418    unsigned count = util_bitcount(present);
2419    struct panfrost_ptr T =
2420       pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER);
2421    struct mali_attribute_buffer_packed *varyings =
2422       (struct mali_attribute_buffer_packed *)T.cpu;
2423 
2424    batch->varyings.nr_bufs = count;
2425 
2426 #if PAN_ARCH >= 6
2427    /* Suppress prefetch on Bifrost */
2428    memset(varyings + count, 0, sizeof(*varyings));
2429 #endif
2430 
2431    if (stride) {
2432       panfrost_emit_varyings(
2433          batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride,
2434          vertex_count);
2435    } else {
2436       /* The indirect draw code reads the stride field; make sure
2437        * that it is initialised */
2438       memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2439              sizeof(*varyings));
2440    }
2441 
2442    /* fp32 vec4 gl_Position */
2443    batch->varyings.pos = panfrost_emit_varyings(
2444       batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2445       sizeof(float) * 4, vertex_count);
2446 
2447    if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
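      /* Point size is a single fp16 value (MALI_R16F), hence the 2-byte
       * stride. */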
2448       batch->varyings.psiz = panfrost_emit_varyings(
2449          batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2,
2450          vertex_count);
2451    }
2452 
2453 #if PAN_ARCH <= 5
2454    pan_emit_special_input(
2455       varyings, present, PAN_VARY_PNTCOORD,
2456       (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2457          ? MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y
2458          : MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y);
2459    pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2460                           MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2461    pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2462                           MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2463 #endif
2464 
2465    batch->varyings.bufs = T.gpu;
2466    batch->varyings.vs = linkage->producer;
2467    batch->varyings.fs = linkage->consumer;
2468 }
2469 #endif
2470 
2471 static void
2472 emit_tls(struct panfrost_batch *batch)
2473 {
2474    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2475 
2476    /* Emitted with the FB descriptor on Midgard. */
2477    if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2478       return;
2479 
2480    struct panfrost_bo *tls_bo =
2481       batch->stack_size ? panfrost_batch_get_scratchpad(
2482                              batch, batch->stack_size, dev->thread_tls_alloc,
2483                              dev->core_id_range)
2484                         : NULL;
2485    struct pan_tls_info tls = {
2486       .tls =
2487          {
2488             .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2489             .size = batch->stack_size,
2490          },
2491    };
2492 
2493    assert(batch->tls.cpu);
2494    GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2495 }
2496 
2497 static void
2498 emit_fbd(struct panfrost_batch *batch, struct pan_fb_info *fb)
2499 {
2500    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2501    struct panfrost_bo *tls_bo =
2502       batch->stack_size ? panfrost_batch_get_scratchpad(
2503                              batch, batch->stack_size, dev->thread_tls_alloc,
2504                              dev->core_id_range)
2505                         : NULL;
2506    struct pan_tls_info tls = {
2507       .tls =
2508          {
2509             .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2510             .size = batch->stack_size,
2511          },
2512    };
2513 
2514 #if PAN_ARCH >= 6
2515    fb->sample_positions =
2516       dev->sample_positions->ptr.gpu +
2517       panfrost_sample_positions_offset(pan_sample_pattern(fb->nr_samples));
2518 #endif
2519 
2520    batch->framebuffer.gpu |=
2521       GENX(pan_emit_fbd)(fb, &tls, &batch->tiler_ctx, batch->framebuffer.cpu);
2522 }
2523 
2524 /* Mark a surface as written */
2525 
2526 static void
2527 panfrost_initialize_surface(struct panfrost_batch *batch,
2528                             struct pipe_surface *surf)
2529 {
2530    if (surf) {
2531       struct panfrost_resource *rsrc = pan_resource(surf->texture);
2532       BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2533    }
2534 }
2535 
2536 /* Generate a fragment job. This should be called once per frame. (Usually,
2537  * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2538  */
2539 static void
2540 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2541 {
2542    /* Mark the affected buffers as initialized, since we're writing to them.
2543     * Also, add the surfaces we're writing to into the batch */
2544 
2545    struct pipe_framebuffer_state *fb = &batch->key;
2546 
2547    for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2548       panfrost_initialize_surface(batch, fb->cbufs[i]);
2549 
2550    panfrost_initialize_surface(batch, fb->zsbuf);
2551 
2552    /* The passed tile coords can be out of range in some cases, so we need
2553     * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2554     * Theoretically we also need to clamp the coordinates positive, but we
2555     * avoid that edge case as all four values are unsigned. Also,
2556     * theoretically we could clamp the minima, but if that has to happen
2557     * the asserts would fail anyway (since the maxima would get clamped
2558     * and then be smaller than the minima). An edge case of sorts occurs
2559     * when no scissors are added to draw, so by default min=~0 and max=0.
2560     * But that can't happen if any actual drawing occurs (beyond a
2561     * wallpaper reload), so this is again irrelevant in practice. */
2562 
2563    batch->maxx = MIN2(batch->maxx, fb->width);
2564    batch->maxy = MIN2(batch->maxy, fb->height);
2565 
2566    /* Rendering region must be at least 1x1; otherwise, there is nothing
2567     * to do and the whole job chain should have been discarded. */
2568 
2569    assert(batch->maxx > batch->minx);
2570    assert(batch->maxy > batch->miny);
2571 
2572    JOBX(emit_fragment_job)(batch, pfb);
2573 }
2574 
2575 /* Count generated primitives (when there is no geom/tess shaders) for
2576  * transform feedback */
2577 
2578 static void
2579 panfrost_statistics_record(struct panfrost_context *ctx,
2580                            const struct pipe_draw_info *info,
2581                            const struct pipe_draw_start_count_bias *draw)
2582 {
2583    if (!ctx->active_queries)
2584       return;
2585 
2586    uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2587    ctx->prims_generated += prims;
2588 
2589    if (!ctx->streamout.num_targets)
2590       return;
2591 
2592    ctx->tf_prims_generated += prims;
2593    ctx->dirty |= PAN_DIRTY_SO;
2594 }
2595 
2596 static void
2597 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2598 {
2599    unsigned count =
2600       u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2601 
2602    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2603       if (!ctx->streamout.targets[i])
2604          continue;
2605 
2606       pan_so_target(ctx->streamout.targets[i])->offset += count;
2607    }
2608 }
2609 
2610 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2611  * 3D state. In particular, it groups the fragment shader descriptor with
2612  * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2613  * of state are dirty tracked independently for the benefit of newer GPUs that
2614  * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2615  * dirty flags that trigger re-emits of the fragment RSD.
2616  *
2617  * Obscurely, occlusion queries are included. Occlusion query state is nominally
2618  * specified in the draw call descriptor, but must be considered when determining
2619  * early-Z state which is part of the RSD.
2620  */
2621 #define FRAGMENT_RSD_DIRTY_MASK                                                \
2622    (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER |   \
2623     PAN_DIRTY_OQ)
2624 
2625 static inline void
2626 panfrost_update_shader_state(struct panfrost_batch *batch,
2627                              enum pipe_shader_type st)
2628 {
2629    struct panfrost_context *ctx = batch->ctx;
2630    struct panfrost_compiled_shader *ss = ctx->prog[st];
2631 
2632    bool frag = (st == PIPE_SHADER_FRAGMENT);
2633    unsigned dirty_3d = ctx->dirty;
2634    unsigned dirty = ctx->dirty_shader[st];
2635 
2636    if (dirty & (PAN_DIRTY_STAGE_TEXTURE | PAN_DIRTY_STAGE_SHADER)) {
2637       batch->textures[st] = panfrost_emit_texture_descriptors(batch, st);
2638    }
2639 
2640    if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2641       batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st);
2642    }
2643 
2644    /* On Bifrost and older, the fragment shader descriptor is fused
2645     * together with the renderer state; the combined renderer state
2646     * descriptor is emitted below. Otherwise, the shader descriptor is
2647     * standalone and is emitted here.
2648     */
2649    if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2650       batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2651    }
2652 
2653 #if PAN_ARCH >= 9
2654    if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2655       batch->images[st] =
2656          ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0;
2657    }
2658 #endif
2659 
2660    if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2661       batch->uniform_buffers[st] = panfrost_emit_const_buf(
2662          batch, st, &batch->nr_uniform_buffers[st], &batch->push_uniforms[st],
2663          &batch->nr_push_uniforms[st]);
2664    }
2665 
2666 #if PAN_ARCH <= 7
2667    /* On Bifrost and older, if the fragment shader changes OR any renderer
2668     * state specified with the fragment shader, the whole renderer state
2669     * descriptor is dirtied and must be re-emitted.
2670     */
2671    if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
2672                 (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {
2673 
2674       batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2675    }
2676 
2677    /* Vertex shaders need to mix vertex data and image descriptors in the
2678     * attribute array. This is taken care of in panfrost_update_state_3d().
2679     */
2680    if (st != PIPE_SHADER_VERTEX && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
2681       batch->attribs[st] =
2682          panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st);
2683    }
2684 #endif
2685 }
2686 
2687 static inline void
2688 panfrost_update_state_3d(struct panfrost_batch *batch)
2689 {
2690    struct panfrost_context *ctx = batch->ctx;
2691    unsigned dirty = ctx->dirty;
2692 
2693    if (dirty & PAN_DIRTY_TLS_SIZE)
2694       panfrost_batch_adjust_stack_size(batch);
2695 
2696    if (dirty & PAN_DIRTY_BLEND)
2697       panfrost_set_batch_masks_blend(batch);
2698 
2699    if (dirty & PAN_DIRTY_ZS)
2700       panfrost_set_batch_masks_zs(batch);
2701 
2702 #if PAN_ARCH >= 9
2703    if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
2704        (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
2705       batch->depth_stencil = panfrost_emit_depth_stencil(batch);
2706 
2707    if (dirty & PAN_DIRTY_BLEND)
2708       batch->blend = panfrost_emit_blend_valhall(batch);
2709 
2710    if (dirty & PAN_DIRTY_VERTEX) {
2711       batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch);
2712 
2713       batch->attrib_bufs[PIPE_SHADER_VERTEX] =
2714          panfrost_emit_vertex_buffers(batch);
2715    }
2716 #else
2717    unsigned vt_shader_dirty = ctx->dirty_shader[PIPE_SHADER_VERTEX];
2718 
2719    /* Vertex data, vertex shader and images accessed by the vertex shader have
2720     * an impact on the attributes array; we need to re-emit any time one of these
2721     * parameters changes. */
2722    if ((dirty & PAN_DIRTY_VERTEX) ||
2723        (vt_shader_dirty & (PAN_DIRTY_STAGE_IMAGE | PAN_DIRTY_STAGE_SHADER))) {
2724       batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(
2725          batch, &batch->attrib_bufs[PIPE_SHADER_VERTEX]);
2726    }
2727 #endif
2728 }
2729 
2730 static void
2731 panfrost_launch_xfb(struct panfrost_batch *batch,
2732                     const struct pipe_draw_info *info, unsigned count)
2733 {
2734    struct panfrost_context *ctx = batch->ctx;
2735 
2736    /* Nothing to do */
2737    if (batch->ctx->streamout.num_targets == 0)
2738       return;
2739 
2740    /* TODO: XFB with index buffers */
2741    // assert(info->index_size == 0);
2742    u_trim_pipe_prim(info->mode, &count);
2743 
2744    if (count == 0)
2745       return;
2746 
2747    perf_debug_ctx(batch->ctx, "Emulating transform feedback");
2748 
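   /* Temporarily swap in the transform feedback program as the vertex shader,
    * emit the XFB job, then restore the saved draw state. */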
2749    struct panfrost_uncompiled_shader *vs_uncompiled =
2750       ctx->uncompiled[PIPE_SHADER_VERTEX];
2751    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2752 
2753    vs_uncompiled->xfb->stream_output = vs->stream_output;
2754 
2755    mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
2756    mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
2757    mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
2758    unsigned saved_nr_push_uniforms =
2759       batch->nr_push_uniforms[PIPE_SHADER_VERTEX];
2760 
2761    ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */
2762    ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb;
2763    batch->rsd[PIPE_SHADER_VERTEX] =
2764       panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
2765 
2766    batch->uniform_buffers[PIPE_SHADER_VERTEX] =
2767       panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
2768                               &batch->push_uniforms[PIPE_SHADER_VERTEX],
2769                               &batch->nr_push_uniforms[PIPE_SHADER_VERTEX]);
2770 
2771    JOBX(launch_xfb)(batch, info, count);
2772    batch->compute_count++;
2773 
2774    ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled;
2775    ctx->prog[PIPE_SHADER_VERTEX] = vs;
2776    batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
2777    batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
2778    batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
2779    batch->nr_push_uniforms[PIPE_SHADER_VERTEX] = saved_nr_push_uniforms;
2780 }
2781 
2782 /*
2783  * Increase the vertex count on the batch using a saturating add, in the hope
2784  * that the compiler lowers it to a single saturating-add machine instruction.
2785  */
2786 static inline void
2787 panfrost_increase_vertex_count(struct panfrost_batch *batch, uint32_t increment)
2788 {
2789    uint32_t sum = batch->tiler_ctx.vertex_count + increment;
2790 
2791    if (sum >= batch->tiler_ctx.vertex_count)
2792       batch->tiler_ctx.vertex_count = sum;
2793    else
2794       batch->tiler_ctx.vertex_count = UINT32_MAX;
2795 }
2796 
2797 /*
2798  * If we change whether we're drawing points, or whether point sprites are
2799  * enabled (specified in the rasterizer), we may need to rebind shaders
2800  * accordingly. This implicitly covers the case of rebinding framebuffers,
2801  * because all dirty flags are set there.
2802  */
2803 static void
2804 panfrost_update_active_prim(struct panfrost_context *ctx,
2805                             const struct pipe_draw_info *info)
2806 {
2807    const enum mesa_prim prev_prim = u_reduced_prim(ctx->active_prim);
2808    const enum mesa_prim new_prim = u_reduced_prim(info->mode);
2809 
2810    ctx->active_prim = info->mode;
2811 
2812    if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
2813        (prev_prim != new_prim)) {
2814       panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
2815    }
2816 }
2817 
2818 static unsigned
2819 panfrost_draw_get_vertex_count(struct panfrost_batch *batch,
2820                                const struct pipe_draw_info *info,
2821                                const struct pipe_draw_start_count_bias *draw,
2822                                bool idvs)
2823 {
2824    struct panfrost_context *ctx = batch->ctx;
2825    unsigned vertex_count = ctx->vertex_count;
2826    unsigned min_index = 0, max_index = 0;
2827 
2828    batch->indices = 0;
2829    if (info->index_size && PAN_ARCH >= 9) {
2830       batch->indices = panfrost_get_index_buffer(batch, info, draw);
2831 
2832       /* Use index count to estimate vertex count */
2833       panfrost_increase_vertex_count(batch, draw->count);
2834    } else if (info->index_size) {
2835       batch->indices = panfrost_get_index_buffer_bounded(
2836          batch, info, draw, &min_index, &max_index);
2837 
2838       /* Use the min/max index bounds we just computed */
2839       vertex_count = max_index - min_index + 1;
2840       ctx->offset_start = min_index + draw->index_bias;
2841       panfrost_increase_vertex_count(batch, vertex_count);
2842    } else {
2843       ctx->offset_start = draw->start;
2844       panfrost_increase_vertex_count(batch, vertex_count);
2845    }
2846 
2847    if (info->instance_count > 1) {
2848       unsigned count = vertex_count;
2849 
2850       /* Index-Driven Vertex Shading requires different instances to
2851        * have different cache lines for position results. Each vertex
2852        * position is 16 bytes and the Mali cache line is 64 bytes, so
2853        * the instance count must be aligned to 4 vertices.
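       * (64 bytes / 16 bytes = 4; e.g. 10 vertices per instance are rounded
       * up to 12 here, before the extra padding applied below.)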
2854        */
2855       if (idvs)
2856          count = ALIGN_POT(count, 4);
2857 
2858       ctx->padded_count = panfrost_padded_vertex_count(count);
2859    } else {
2860       ctx->padded_count = vertex_count;
2861    }
2862 
2863    return vertex_count;
2864 }
2865 
2866 static void
2867 panfrost_direct_draw(struct panfrost_batch *batch,
2868                      const struct pipe_draw_info *info, unsigned drawid_offset,
2869                      const struct pipe_draw_start_count_bias *draw)
2870 {
2871    if (!draw->count || !info->instance_count)
2872       return;
2873 
2874    struct panfrost_context *ctx = batch->ctx;
2875 
2876    panfrost_update_active_prim(ctx, info);
2877 
2878    /* Take into account a negative bias */
2879    ctx->vertex_count =
2880       draw->count + (info->index_size ? abs(draw->index_bias) : 0);
2881    ctx->instance_count = info->instance_count;
2882    ctx->base_vertex = info->index_size ? draw->index_bias : 0;
2883    ctx->base_instance = info->start_instance;
2884    ctx->drawid = drawid_offset;
2885 
2886    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2887    bool idvs = vs->info.vs.idvs;
2888 
2889    UNUSED unsigned vertex_count =
2890       panfrost_draw_get_vertex_count(batch, info, draw, idvs);
2891 
2892    panfrost_statistics_record(ctx, info, draw);
2893 
2894    panfrost_update_state_3d(batch);
2895    panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
2896    panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
2897    panfrost_clean_state_3d(ctx);
2898 
2899    if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) {
2900       panfrost_launch_xfb(batch, info, draw->count);
2901    }
2902 
2903    /* Increment transform feedback offsets */
2904    panfrost_update_streamout_offsets(ctx);
2905 
2906    /* Any side effects must be handled by the XFB shader, so we only need
2907     * to run vertex shaders if we need rasterization.
2908     */
2909    if (panfrost_batch_skip_rasterization(batch))
2910       return;
2911 
2912 #if PAN_ARCH <= 7
2913    /* Emit all sorts of descriptors. */
2914    panfrost_emit_varying_descriptor(batch,
2915                                     ctx->padded_count * ctx->instance_count,
2916                                     info->mode == MESA_PRIM_POINTS);
2917 #endif
2918 
2919    JOBX(launch_draw)(batch, info, drawid_offset, draw, vertex_count);
2920    batch->draw_count++;
2921 }
2922 
2923 static bool
2924 panfrost_compatible_batch_state(struct panfrost_batch *batch,
2925                                 enum mesa_prim reduced_prim)
2926 {
2927    struct panfrost_context *ctx = batch->ctx;
2928    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2929 
2930    if (reduced_prim == MESA_PRIM_LINES &&
2931        !pan_tristate_set(&batch->line_smoothing, rast->line_smooth))
2932       return false;
2933 
2934    /* Only applies on Valhall */
2935    if (PAN_ARCH < 9)
2936       return true;
2937 
2938    bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
2939    bool first = rast->flatshade_first;
2940 
2941    /* gl_PointCoord orientation only matters when drawing points, but
2942     * provoking vertex doesn't matter for points.
2943     */
2944    if (reduced_prim == MESA_PRIM_POINTS)
2945       return pan_tristate_set(&batch->sprite_coord_origin, coord);
2946    else
2947       return pan_tristate_set(&batch->first_provoking_vertex, first);
2948 }
2949 
2950 static void
2951 panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
2952                   unsigned drawid_offset,
2953                   const struct pipe_draw_indirect_info *indirect,
2954                   const struct pipe_draw_start_count_bias *draws,
2955                   unsigned num_draws)
2956 {
2957    struct panfrost_context *ctx = pan_context(pipe);
2958    struct panfrost_device *dev = pan_device(pipe->screen);
2959 
2960    if (!panfrost_render_condition_check(ctx))
2961       return;
2962 
2963    ctx->draw_calls++;
2964 
2965    /* Emulate indirect draws on JM */
2966    if (indirect && indirect->buffer) {
2967       assert(num_draws == 1);
2968       util_draw_indirect(pipe, info, indirect);
2969       perf_debug(dev, "Emulating indirect draw on the CPU");
2970       return;
2971    }
2972 
2973    /* Do some common setup */
2974    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
2975 
2976    /* Don't add too many jobs to a single batch. Job manager hardware has a
2977     * hard limit of 65536 jobs per job chain. Given a draw issues a maximum
2978     * of 3 jobs (a vertex, a tiler and a compute job if XFB is enabled), we
2979     * could use 65536 / 3 as a limit, but we choose a smaller soft limit
2980     * (arbitrary) to avoid the risk of timeouts. This might not be a good
2981     * idea. */
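   /* With the 10000-draw soft limit below, a batch tops out around
    * 10000 * 3 = 30000 jobs, comfortably under the 65536 hard limit. */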
2982    if (unlikely(batch->draw_count > 10000))
2983       batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
2984 
2985    enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
2986 
2987    if (unlikely(!panfrost_compatible_batch_state(batch, reduced_prim))) {
2988       batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
2989 
2990       ASSERTED bool succ = panfrost_compatible_batch_state(batch, reduced_prim);
2991       assert(succ && "must be able to set state for a fresh batch");
2992    }
2993 
2994    /* panfrost_batch_skip_rasterization reads
2995     * batch->scissor_culls_everything, which is set by
2996     * panfrost_emit_viewport, so call that first.
2997     */
2998    if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
2999       batch->viewport = panfrost_emit_viewport(batch);
3000 
3001    /* Mark everything dirty when debugging */
3002    if (unlikely(dev->debug & PAN_DBG_DIRTY))
3003       panfrost_dirty_state_all(ctx);
3004 
3005    /* Conservatively assume draw parameters always change */
3006    ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3007 
3008    struct pipe_draw_info tmp_info = *info;
3009    unsigned drawid = drawid_offset;
3010 
3011    for (unsigned i = 0; i < num_draws; i++) {
3012       panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
3013 
3014       if (tmp_info.increment_draw_id) {
3015          ctx->dirty |= PAN_DIRTY_DRAWID;
3016          drawid++;
3017       }
3018    }
3019 }
3020 
3021 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3022  * construct the COMPUTE job and some of its payload.
3023  */
3024 
3025 static void
3026 panfrost_launch_grid_on_batch(struct pipe_context *pipe,
3027                               struct panfrost_batch *batch,
3028                               const struct pipe_grid_info *info)
3029 {
3030    struct panfrost_context *ctx = pan_context(pipe);
3031 
3032    if (info->indirect && !PAN_GPU_INDIRECTS) {
3033       struct pipe_transfer *transfer;
3034       uint32_t *params =
3035          pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
3036                                3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);
3037 
3038       struct pipe_grid_info direct = *info;
3039       direct.indirect = NULL;
3040       direct.grid[0] = params[0];
3041       direct.grid[1] = params[1];
3042       direct.grid[2] = params[2];
3043       pipe_buffer_unmap(pipe, transfer);
3044 
3045       if (params[0] && params[1] && params[2])
3046          panfrost_launch_grid_on_batch(pipe, batch, &direct);
3047 
3048       return;
3049    }
3050 
3051    ctx->compute_grid = info;
3052 
3053    /* Conservatively assume workgroup size changes every launch */
3054    ctx->dirty |= PAN_DIRTY_PARAMS;
3055 
3056    panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
3057 
3058    /* We want our compute thread descriptor to be per job.
3059     * Save the global one, and restore it when we're done emitting
3060     * the job.
3061     */
3062    mali_ptr saved_tls = batch->tls.gpu;
3063    batch->tls.gpu = panfrost_emit_shared_memory(batch, info);
3064 
3065    JOBX(launch_grid)(batch, info);
3066    batch->compute_count++;
3067    batch->tls.gpu = saved_tls;
3068 }
3069 
3070 static void
3071 panfrost_launch_grid(struct pipe_context *pipe,
3072                      const struct pipe_grid_info *info)
3073 {
3074    struct panfrost_context *ctx = pan_context(pipe);
3075 
3076    /* XXX - shouldn't be necessary with working memory barriers. Affected
3077     * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3078    panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
3079 
3080    struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3081    panfrost_launch_grid_on_batch(pipe, batch, info);
3082 
3083    panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
3084 }
3085 
3086 #define AFBC_BLOCK_ALIGN 16
3087 
3088 static void
3089 panfrost_launch_afbc_shader(struct panfrost_batch *batch, void *cso,
3090                             struct pipe_constant_buffer *cbuf,
3091                             unsigned nr_blocks)
3092 {
3093    struct pipe_context *pctx = &batch->ctx->base;
3094    void *saved_cso = NULL;
3095    struct pipe_constant_buffer saved_const = {};
3096    struct pipe_grid_info grid = {
3097       .block[0] = 1,
3098       .block[1] = 1,
3099       .block[2] = 1,
3100       .grid[0] = nr_blocks,
3101       .grid[1] = 1,
3102       .grid[2] = 1,
3103    };
3104 
3105    struct panfrost_constant_buffer *pbuf =
3106       &batch->ctx->constant_buffer[PIPE_SHADER_COMPUTE];
3107    saved_cso = batch->ctx->uncompiled[PIPE_SHADER_COMPUTE];
3108    util_copy_constant_buffer(&pbuf->cb[0], &saved_const, true);
3109 
3110    pctx->bind_compute_state(pctx, cso);
3111    pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, false, cbuf);
3112 
3113    panfrost_launch_grid_on_batch(pctx, batch, &grid);
3114 
3115    pctx->bind_compute_state(pctx, saved_cso);
3116    pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, true, &saved_const);
3117 }
3118 
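/* Convenience wrapper: look up the AFBC shaders for the resource, wrap the
 * given constants in a user constant buffer and dispatch one workgroup per
 * AFBC block through panfrost_launch_afbc_shader(). The macro declares local
 * variables, so expand it at most once per scope. */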
3119 #define LAUNCH_AFBC_SHADER(name, batch, rsrc, consts, nr_blocks)               \
3120    struct pan_afbc_shader_data *shaders =                                      \
3121       panfrost_afbc_get_shaders(batch->ctx, rsrc, AFBC_BLOCK_ALIGN);           \
3122    struct pipe_constant_buffer constant_buffer = {                             \
3123       .buffer_size = sizeof(consts),                                           \
3124       .user_buffer = &consts};                                                 \
3125    panfrost_launch_afbc_shader(batch, shaders->name##_cso, &constant_buffer,   \
3126                                nr_blocks);
3127 
3128 static void
3129 panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
3130                    struct panfrost_bo *metadata, unsigned offset,
3131                    unsigned level)
3132 {
3133    struct pan_image_slice_layout *slice = &src->image.layout.slices[level];
3134    struct panfrost_afbc_size_info consts = {
3135       .src =
3136          src->image.data.base + src->image.data.offset + slice->offset,
3137       .metadata = metadata->ptr.gpu + offset,
3138    };
3139 
3140    panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3141    panfrost_batch_write_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3142 
3143    LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
3144 }
3145 
3146 static void
3147 panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
3148                    struct panfrost_bo *dst,
3149                    struct pan_image_slice_layout *dst_slice,
3150                    struct panfrost_bo *metadata, unsigned metadata_offset,
3151                    unsigned level)
3152 {
3153    struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
3154    struct panfrost_afbc_pack_info consts = {
3155       .src = src->image.data.base + src->image.data.offset +
3156              src_slice->offset,
3157       .dst = dst->ptr.gpu + dst_slice->offset,
3158       .metadata = metadata->ptr.gpu + metadata_offset,
3159       .header_size = dst_slice->afbc.header_size,
3160       .src_stride = src_slice->afbc.stride,
3161       .dst_stride = dst_slice->afbc.stride,
3162    };
3163 
3164    panfrost_batch_write_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3165    panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
3166    panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3167 
3168    LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
3169 }
3170 
3171 static void *
3172 panfrost_create_rasterizer_state(struct pipe_context *pctx,
3173                                  const struct pipe_rasterizer_state *cso)
3174 {
3175    struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
3176 
3177    so->base = *cso;
3178 
3179 #if PAN_ARCH <= 7
3180    pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
3181       cfg.multisample_enable = cso->multisample;
3182       cfg.fixed_function_near_discard = cso->depth_clip_near;
3183       cfg.fixed_function_far_discard = cso->depth_clip_far;
3184       cfg.shader_depth_range_fixed = true;
3185    }
3186 
3187    pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
3188       cfg.front_facing_depth_bias = cso->offset_tri;
3189       cfg.back_facing_depth_bias = cso->offset_tri;
3190       cfg.single_sampled_lines = !cso->multisample;
3191    }
3192 #endif
3193 
3194    return so;
3195 }
3196 
3197 #if PAN_ARCH >= 9
3198 /*
3199  * Given a pipe_vertex_element, pack the corresponding Valhall attribute
3200  * descriptor. This function is called at CSO create time.
3201  */
3202 static void
3203 panfrost_pack_attribute(struct panfrost_device *dev,
3204                         const struct pipe_vertex_element el,
3205                         struct mali_attribute_packed *out)
3206 {
3207    pan_pack(out, ATTRIBUTE, cfg) {
3208       cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
3209       cfg.frequency = (el.instance_divisor > 0)
3210                          ? MALI_ATTRIBUTE_FREQUENCY_INSTANCE
3211                          : MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3212       cfg.format = GENX(panfrost_format_from_pipe_format)(el.src_format)->hw;
3213       cfg.offset = el.src_offset;
3214       cfg.buffer_index = el.vertex_buffer_index;
3215       cfg.stride = el.src_stride;
3216 
3217       if (el.instance_divisor == 0) {
3218          /* Per-vertex */
3219          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
3220          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3221          cfg.offset_enable = true;
3222       } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
3223          /* Per-instance, POT divisor */
3224          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
3225          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3226          cfg.divisor_r = __builtin_ctz(el.instance_divisor);
3227       } else {
3228          /* Per-instance, NPOT divisor */
3229          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
3230          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3231 
3232          cfg.divisor_d = panfrost_compute_magic_divisor(
3233             el.instance_divisor, &cfg.divisor_r, &cfg.divisor_e);
3234       }
3235    }
3236 }
3237 #endif
3238 
3239 static void *
3240 panfrost_create_vertex_elements_state(struct pipe_context *pctx,
3241                                       unsigned num_elements,
3242                                       const struct pipe_vertex_element *elements)
3243 {
3244    struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
3245    UNUSED struct panfrost_device *dev = pan_device(pctx->screen);
3246 
3247    so->num_elements = num_elements;
3248    memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
3249 
3250    for (unsigned i = 0; i < num_elements; ++i)
3251       so->strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
3252 #if PAN_ARCH >= 9
3253    for (unsigned i = 0; i < num_elements; ++i)
3254       panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
3255 #else
3256    /* Assign attribute buffers corresponding to the vertex buffers, keyed
3257     * for a particular divisor since that's how instancing works on Mali */
3258    for (unsigned i = 0; i < num_elements; ++i) {
3259       so->element_buffer[i] = pan_assign_vertex_buffer(
3260          so->buffers, &so->nr_bufs, elements[i].vertex_buffer_index,
3261          elements[i].instance_divisor);
3262    }
3263 
3264    for (int i = 0; i < num_elements; ++i) {
3265       enum pipe_format fmt = elements[i].src_format;
3266       so->formats[i] = GENX(panfrost_format_from_pipe_format)(fmt)->hw;
3267 
3268       assert(MALI_EXTRACT_INDEX(so->formats[i]) && "format must be supported");
3269    }
3270 
3271    /* Let's also prepare vertex builtins */
3272    so->formats[PAN_VERTEX_ID] =
3273       GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3274    so->formats[PAN_INSTANCE_ID] =
3275       GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3276 #endif
3277 
3278    return so;
3279 }
3280 
3281 static inline unsigned
3282 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
3283 {
3284    switch (in) {
3285    case PIPE_STENCIL_OP_KEEP:
3286       return MALI_STENCIL_OP_KEEP;
3287    case PIPE_STENCIL_OP_ZERO:
3288       return MALI_STENCIL_OP_ZERO;
3289    case PIPE_STENCIL_OP_REPLACE:
3290       return MALI_STENCIL_OP_REPLACE;
3291    case PIPE_STENCIL_OP_INCR:
3292       return MALI_STENCIL_OP_INCR_SAT;
3293    case PIPE_STENCIL_OP_DECR:
3294       return MALI_STENCIL_OP_DECR_SAT;
3295    case PIPE_STENCIL_OP_INCR_WRAP:
3296       return MALI_STENCIL_OP_INCR_WRAP;
3297    case PIPE_STENCIL_OP_DECR_WRAP:
3298       return MALI_STENCIL_OP_DECR_WRAP;
3299    case PIPE_STENCIL_OP_INVERT:
3300       return MALI_STENCIL_OP_INVERT;
3301    default:
3302       unreachable("Invalid stencil op");
3303    }
3304 }
3305 
3306 #if PAN_ARCH <= 7
3307 static inline void
3308 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
3309                     struct mali_stencil_packed *out)
3310 {
3311    pan_pack(out, STENCIL, s) {
3312       s.mask = in->valuemask;
3313       s.compare_function = (enum mali_func)in->func;
3314       s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
3315       s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
3316       s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
3317    }
3318 }
3319 #endif
3320 
3321 static bool
3322 pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
3323 {
3324    if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
3325       return false;
3326 
3327    if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
3328       return false;
3329 
3330    if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
3331       return false;
3332 
3333    return true;
3334 }
3335 
3336 static void *
3337 panfrost_create_depth_stencil_state(
3338    struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *zsa)
3339 {
3340    struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
3341    so->base = *zsa;
3342 
3343    const struct pipe_stencil_state front = zsa->stencil[0];
3344    const struct pipe_stencil_state back =
3345       zsa->stencil[1].enabled ? zsa->stencil[1] : front;
3346 
3347    enum mali_func depth_func =
3348       zsa->depth_enabled ? (enum mali_func)zsa->depth_func : MALI_FUNC_ALWAYS;
3349 
3350    /* Normalize (there's no separate enable) */
3351    if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
3352       so->base.alpha_func = MALI_FUNC_ALWAYS;
3353 
3354 #if PAN_ARCH <= 7
3355    /* Prepack relevant parts of the Renderer State Descriptor. They will
3356     * be ORed in at draw-time */
3357    pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
3358       cfg.depth_function = depth_func;
3359       cfg.depth_write_mask = zsa->depth_writemask;
3360    }
3361 
3362    pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
3363       cfg.stencil_enable = front.enabled;
3364       cfg.stencil_mask_front = front.writemask;
3365       cfg.stencil_mask_back = back.writemask;
3366 
3367 #if PAN_ARCH <= 5
3368       cfg.alpha_test_compare_function = (enum mali_func)so->base.alpha_func;
3369 #endif
3370    }
3371 
3372    /* Stencil tests have their own words in the RSD */
3373    pan_pipe_to_stencil(&front, &so->stencil_front);
3374    pan_pipe_to_stencil(&back, &so->stencil_back);
3375 #else
3376    pan_pack(&so->desc, DEPTH_STENCIL, cfg) {
3377       cfg.front_compare_function = (enum mali_func)front.func;
3378       cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
3379       cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
3380       cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);
3381 
3382       cfg.back_compare_function = (enum mali_func)back.func;
3383       cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
3384       cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
3385       cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);
3386 
3387       cfg.stencil_test_enable = front.enabled;
3388       cfg.front_write_mask = front.writemask;
3389       cfg.back_write_mask = back.writemask;
3390       cfg.front_value_mask = front.valuemask;
3391       cfg.back_value_mask = back.valuemask;
3392 
3393       cfg.depth_write_enable = zsa->depth_writemask;
3394       cfg.depth_function = depth_func;
3395    }
3396 #endif
3397 
3398    so->enabled = zsa->stencil[0].enabled ||
3399                  (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
3400 
3401    so->zs_always_passes = pipe_zs_always_passes(zsa);
3402    so->writes_zs = util_writes_depth_stencil(zsa);
3403 
3404    /* TODO: Bounds test should be easy */
3405    assert(!zsa->depth_bounds_test);
3406 
3407    return so;
3408 }
3409 
3410 static struct pipe_sampler_view *
3411 panfrost_create_sampler_view(struct pipe_context *pctx,
3412                              struct pipe_resource *texture,
3413                              const struct pipe_sampler_view *template)
3414 {
3415    struct panfrost_context *ctx = pan_context(pctx);
3416    struct panfrost_sampler_view *so =
3417       rzalloc(pctx, struct panfrost_sampler_view);
3418 
3419    pan_legalize_afbc_format(ctx, pan_resource(texture), template->format,
3420                             false, false);
3421 
3422    pipe_reference(NULL, &texture->reference);
3423 
3424    so->base = *template;
3425    so->base.texture = texture;
3426    so->base.reference.count = 1;
3427    so->base.context = pctx;
3428 
3429    panfrost_create_sampler_view_bo(so, pctx, texture);
3430 
3431    return (struct pipe_sampler_view *)so;
3432 }
3433 
3434 /* A given Gallium blend state can be encoded to the hardware in numerous,
3435  * dramatically divergent ways due to the interactions of blending with
3436  * framebuffer formats. Conceptually, there are two modes:
3437  *
3438  * - Fixed-function blending (for suitable framebuffer formats, suitable blend
3439  *   state, and suitable blend constant)
3440  *
3441  * - Blend shaders (for everything else)
3442  *
3443  * A given Gallium blend configuration will compile to exactly one
3444  * fixed-function blend state, if it compiles to any, although the constant
3445  * will vary across runs as that is tracked outside of the Gallium CSO.
3446  *
3447  * However, that same blend configuration will compile to many different blend
3448  * shaders, depending on the framebuffer formats active. The rationale is that
3449  * blend shaders override not just fixed-function blending but also
3450  * fixed-function format conversion, so blend shaders are keyed to a particular
3451  * framebuffer format. As an example, the tilebuffer format is identical for
3452  * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
3453  * blend shaders.
3454  *
3455  * All of this state is encapsulated in the panfrost_blend_state struct
3456  * (our subclass of pipe_blend_state).
3457  */
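
/* Concretely, in panfrost_create_blend_state below, enabling any logic op
 * rules out the fixed-function path and forces a destination read, while a
 * zero color write mask marks the render target as disabled outright. */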
3458 
3459 /* Create a blend CSO. Essentially, try to compile a fixed-function
3460  * expression and initialize blend shaders */
3461 
3462 static void *
3463 panfrost_create_blend_state(struct pipe_context *pipe,
3464                             const struct pipe_blend_state *blend)
3465 {
3466    struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
3467    so->base = *blend;
3468 
3469    so->pan.logicop_enable = blend->logicop_enable;
3470    so->pan.logicop_func = blend->logicop_func;
3471    so->pan.rt_count = blend->max_rt + 1;
3472 
3473    for (unsigned c = 0; c < so->pan.rt_count; ++c) {
3474       unsigned g = blend->independent_blend_enable ? c : 0;
3475       const struct pipe_rt_blend_state pipe = blend->rt[g];
3476       struct pan_blend_equation equation = {0};
3477 
3478       equation.color_mask = pipe.colormask;
3479       equation.blend_enable = pipe.blend_enable;
3480 
3481       if (pipe.blend_enable) {
3482          equation.rgb_func = pipe.rgb_func;
3483          equation.rgb_src_factor = pipe.rgb_src_factor;
3484          equation.rgb_dst_factor = pipe.rgb_dst_factor;
3485          equation.alpha_func = pipe.alpha_func;
3486          equation.alpha_src_factor = pipe.alpha_src_factor;
3487          equation.alpha_dst_factor = pipe.alpha_dst_factor;
3488       }
3489 
3490       /* Determine some common properties */
3491       unsigned constant_mask = pan_blend_constant_mask(equation);
3492       const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
3493       so->info[c] = (struct pan_blend_info){
3494          .enabled = (equation.color_mask != 0) &&
3495                     !(blend->logicop_enable &&
3496                       blend->logicop_func == PIPE_LOGICOP_NOOP),
3497          .opaque = !blend->logicop_enable && pan_blend_is_opaque(equation),
3498          .constant_mask = constant_mask,
3499 
3500          /* TODO: check the dest for the logicop */
3501          .load_dest = blend->logicop_enable || pan_blend_reads_dest(equation),
3502 
3503          /* Could this possibly be fixed-function? */
3504          .fixed_function =
3505             !blend->logicop_enable &&
3506             pan_blend_can_fixed_function(equation, supports_2src) &&
3507             (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)),
3508 
3509          .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
3510          .alpha_one_store = pan_blend_alpha_one_store(equation),
3511       };
3512 
3513       so->pan.rts[c].equation = equation;
3514 
3515       /* Bifrost needs to know if any render target loads its
3516        * destination in the hot draw path, so precompute this */
3517       if (so->info[c].load_dest)
3518          so->load_dest_mask |= BITFIELD_BIT(c);
3519 
3520       /* Likewise, precompute the mask of enabled render targets so it
3521        * doesn't need to be recomputed in the hot draw path */
3522       if (so->info[c].enabled)
3523          so->enabled_mask |= BITFIELD_BIT(c);
3524 
3525       /* Converting equations to Mali style is expensive, so do it at
3526        * CSO create time instead of draw-time */
3527       if (so->info[c].fixed_function) {
3528          so->equation[c] = pan_pack_blend(equation);
3529       }
3530    }
3531 
3532    return so;
3533 }
3534 
3535 #if PAN_ARCH >= 9
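/* Map the shader's flush-to-zero flags to a hardware FTZ mode: fp32+fp16
 * flush -> ALWAYS, fp32-only flush -> DX11, no flush -> PRESERVE_SUBNORMALS.
 * An fp16-only flush is not representable (and should not be generated). */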
3536 static enum mali_flush_to_zero_mode
3537 panfrost_ftz_mode(struct pan_shader_info *info)
3538 {
3539    if (info->ftz_fp32) {
3540       if (info->ftz_fp16)
3541          return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
3542       else
3543          return MALI_FLUSH_TO_ZERO_MODE_DX11;
3544    } else {
3545       /* We don't have a "flush FP16, preserve FP32" mode, but APIs
3546        * should not be able to generate that.
3547        */
3548       assert(!info->ftz_fp16 && !info->ftz_fp32);
3549       return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
3550    }
3551 }
3552 #endif
3553 
3554 static void
3555 prepare_shader(struct panfrost_compiled_shader *state,
3556                struct panfrost_pool *pool, bool upload)
3557 {
3558 #if PAN_ARCH <= 7
3559    void *out = &state->partial_rsd;
3560 
3561    if (upload) {
3562       struct panfrost_ptr ptr =
3563          pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
3564 
3565       state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3566       out = ptr.cpu;
3567    }
3568 
3569    pan_pack(out, RENDERER_STATE, cfg) {
3570       pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
3571    }
3572 #else
3573    assert(upload);
3574 
3575    /* The address in the shader program descriptor must be non-null, but
3576     * the entire shader program descriptor may be omitted.
3577     *
3578     * See dEQP-GLES31.functional.compute.basic.empty
3579     */
3580    if (!state->bin.gpu)
3581       return;
3582 
3583    bool vs = (state->info.stage == MESA_SHADER_VERTEX);
3584    bool secondary_enable = (vs && state->info.vs.secondary_enable);
3585 
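   /* Fragment and compute shaders get a single shader program descriptor.
    * Vertex shaders get two (IDVS points and IDVS triangles), plus a third
    * for the secondary (varying) shader when it is present. */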
3586    unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
3587    struct panfrost_ptr ptr =
3588       pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM);
3589 
3590    state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3591 
3592    /* Generic, or IDVS/points */
3593    pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) {
3594       cfg.stage = pan_shader_stage(&state->info);
3595 
3596       if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
3597          cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
3598       else if (vs)
3599          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
3600 
3601       cfg.register_allocation =
3602          pan_register_allocation(state->info.work_reg_count);
3603       cfg.binary = state->bin.gpu;
3604       cfg.preload.r48_r63 = (state->info.preload >> 48);
3605       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3606 
3607       if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
3608          cfg.requires_helper_threads = state->info.contains_barrier;
3609    }
3610 
3611    if (!vs)
3612       return;
3613 
3614    /* IDVS/triangles */
3615    pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) {
3616       cfg.stage = pan_shader_stage(&state->info);
3617       cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
3618       cfg.register_allocation =
3619          pan_register_allocation(state->info.work_reg_count);
3620       cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
3621       cfg.preload.r48_r63 = (state->info.preload >> 48);
3622       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3623    }
3624 
3625    if (!secondary_enable)
3626       return;
3627 
3628    pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) {
3629       unsigned work_count = state->info.vs.secondary_work_reg_count;
3630 
3631       cfg.stage = pan_shader_stage(&state->info);
3632       cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
3633       cfg.register_allocation = pan_register_allocation(work_count);
3634       cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
3635       cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
3636       cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3637    }
3638 #endif
3639 }
3640 
3641 static void
3642 screen_destroy(struct pipe_screen *pscreen)
3643 {
3644    struct panfrost_device *dev = pan_device(pscreen);
3645    GENX(pan_blitter_cache_cleanup)(&dev->blitter);
3646 }
3647 
3648 static void
3649 panfrost_sampler_view_destroy(struct pipe_context *pctx,
3650                               struct pipe_sampler_view *pview)
3651 {
3652    struct panfrost_sampler_view *view = (struct panfrost_sampler_view *)pview;
3653 
3654    pipe_resource_reference(&pview->texture, NULL);
3655    panfrost_bo_unreference(view->state.bo);
3656    ralloc_free(view);
3657 }
3658 
3659 static void
3660 context_populate_vtbl(struct pipe_context *pipe)
3661 {
3662    pipe->draw_vbo = panfrost_draw_vbo;
3663    pipe->launch_grid = panfrost_launch_grid;
3664 
3665    pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
3666    pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
3667    pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
3668    pipe->create_sampler_view = panfrost_create_sampler_view;
3669    pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
3670    pipe->create_sampler_state = panfrost_create_sampler_state;
3671    pipe->create_blend_state = panfrost_create_blend_state;
3672 
3673    pipe->get_sample_position = u_default_get_sample_position;
3674 }
3675 
3676 #if PAN_ARCH <= 5
3677 
3678 /* Returns the polygon list's GPU address if available, or otherwise allocates
3679  * the polygon list.  It's perfectly fast to use allocate/free BO directly,
3680  * the polygon list. It's perfectly fast to allocate/free the BO directly,
3681 
3682 static mali_ptr
3683 batch_get_polygon_list(struct panfrost_batch *batch)
3684 {
3685    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
3686 
3687    if (!batch->tiler_ctx.midgard.polygon_list) {
3688       bool has_draws = batch->draw_count > 0;
3689       unsigned size = panfrost_tiler_get_polygon_list_size(
3690          batch->key.width, batch->key.height, batch->tiler_ctx.vertex_count,
3691          !dev->model->quirks.no_hierarchical_tiling);
3692 
3693       /* Create the BO as invisible if we can. If there are no draws,
3694        * we need to write the polygon list manually because there's
3695        * no WRITE_VALUE job in the chain
3696        */
3697       bool init_polygon_list = !has_draws;
3698       batch->polygon_list_bo = panfrost_batch_create_bo(
3699          batch, size, init_polygon_list ? 0 : PAN_BO_INVISIBLE,
3700          PIPE_SHADER_VERTEX, "Polygon list");
3701       batch->tiler_ctx.midgard.polygon_list = batch->polygon_list_bo->ptr.gpu;
3702       panfrost_batch_add_bo(batch, batch->polygon_list_bo,
3703                             PIPE_SHADER_FRAGMENT);
3704 
3705       if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) {
3706          assert(batch->polygon_list_bo->ptr.cpu);
3707          uint32_t *polygon_list_body =
3708             batch->polygon_list_bo->ptr.cpu +
3709             MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;
3710 
3711          /* Magic for Mali T720 */
3712          polygon_list_body[0] = 0xa0000000;
3713       } else if (init_polygon_list) {
3714          assert(batch->polygon_list_bo->ptr.cpu);
3715          uint32_t *header = batch->polygon_list_bo->ptr.cpu;
3716          memset(header, 0, size);
3717       }
3718 
3719       batch->tiler_ctx.midgard.disable = !has_draws;
3720       batch->tiler_ctx.midgard.no_hierarchical_tiling =
3721          dev->model->quirks.no_hierarchical_tiling;
3722       batch->tiler_ctx.midgard.heap.start = dev->tiler_heap->ptr.gpu;
3723       batch->tiler_ctx.midgard.heap.size = panfrost_bo_size(dev->tiler_heap);
3724    }
3725 
3726    return batch->tiler_ctx.midgard.polygon_list;
3727 }
3728 #endif
3729 
3730 static void
3731 init_polygon_list(struct panfrost_batch *batch)
3732 {
3733 #if PAN_ARCH <= 5
3734    mali_ptr polygon_list = batch_get_polygon_list(batch);
3735    pan_jc_initialize_tiler(&batch->pool.base, &batch->jm.jobs.vtc_jc,
3736                            polygon_list);
3737 #endif
3738 }
3739 
3740 static int
3741 submit_batch(struct panfrost_batch *batch, struct pan_fb_info *fb)
3742 {
3743    JOBX(preload_fb)(batch, fb);
3744    init_polygon_list(batch);
3745 
3746    /* Now that all draws are in, we can finally prepare the
3747     * FBD for the batch (if there is one). */
3748 
3749    emit_tls(batch);
3750 
3751    if (panfrost_has_fragment_job(batch)) {
3752       emit_fbd(batch, fb);
3753       emit_fragment_job(batch, fb);
3754    }
3755 
3756    return JOBX(submit_batch)(batch);
3757 }
3758 
3759 void
3760 GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
3761 {
3762    struct panfrost_device *dev = &screen->dev;
3763 
3764    screen->vtbl.prepare_shader = prepare_shader;
3765    screen->vtbl.screen_destroy = screen_destroy;
3766    screen->vtbl.context_populate_vtbl = context_populate_vtbl;
3767    screen->vtbl.init_batch = JOBX(init_batch);
3768    screen->vtbl.submit_batch = submit_batch;
3769    screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
3770    screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
3771    screen->vtbl.compile_shader = GENX(pan_shader_compile);
3772    screen->vtbl.afbc_size = panfrost_afbc_size;
3773    screen->vtbl.afbc_pack = panfrost_afbc_pack;
3774 
3775    GENX(pan_blitter_cache_init)
3776    (&dev->blitter, panfrost_device_gpu_id(dev), &dev->blend_shaders,
3777     &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);
3778 
3779 #if PAN_GPU_INDIRECTS
3780    pan_indirect_dispatch_meta_init(
3781       &dev->indirect_dispatch, panfrost_device_gpu_id(dev),
3782       &screen->blitter.bin_pool.base, &screen->blitter.desc_pool.base);
3783 #endif
3784 }
3785