1 /*
2  * Copyright (C) 2018 Alyssa Rosenzweig
3  * Copyright (C) 2020 Collabora Ltd.
4  * Copyright © 2017 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 
26 #include "util/macros.h"
27 #include "util/u_prim.h"
28 #include "util/u_vbuf.h"
29 #include "util/u_helpers.h"
30 #include "util/u_draw.h"
31 #include "util/u_memory.h"
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "gallium/auxiliary/util/u_blend.h"
35 
36 #include "genxml/gen_macros.h"
37 
38 #include "pan_pool.h"
39 #include "pan_bo.h"
40 #include "pan_blend.h"
41 #include "pan_context.h"
42 #include "pan_job.h"
43 #include "pan_shader.h"
44 #include "pan_texture.h"
45 #include "pan_util.h"
46 #include "pan_indirect_draw.h"
47 #include "pan_indirect_dispatch.h"
48 #include "pan_blitter.h"
49 
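/* Whether indirect draws/dispatches get GPU-assisted handling (v7 only) */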
50 #define PAN_GPU_INDIRECTS (PAN_ARCH == 7)
51 
52 struct panfrost_rasterizer {
53         struct pipe_rasterizer_state base;
54 
55 #if PAN_ARCH <= 7
56         /* Partially packed RSD words */
57         struct mali_multisample_misc_packed multisample;
58         struct mali_stencil_mask_misc_packed stencil_misc;
59 #endif
60 };
61 
62 struct panfrost_zsa_state {
63         struct pipe_depth_stencil_alpha_state base;
64 
65         /* Is any depth, stencil, or alpha testing enabled? */
66         bool enabled;
67 
68         /* Do the depth and stencil tests always pass? This ignores write
69          * masks; we are only interested in whether pixels may be killed.
70          */
71         bool zs_always_passes;
72 
73         /* Are depth or stencil writes possible? */
74         bool writes_zs;
75 
76 #if PAN_ARCH <= 7
77         /* Prepacked words from the RSD */
78         struct mali_multisample_misc_packed rsd_depth;
79         struct mali_stencil_mask_misc_packed rsd_stencil;
80         struct mali_stencil_packed stencil_front, stencil_back;
81 #else
82         /* Depth/stencil descriptor template */
83         struct mali_depth_stencil_packed desc;
84 #endif
85 };
86 
87 struct panfrost_sampler_state {
88         struct pipe_sampler_state base;
89         struct mali_sampler_packed hw;
90 };
91 
92 /* Misnomer: Sampler view corresponds to textures, not samplers */
93 
94 struct panfrost_sampler_view {
95         struct pipe_sampler_view base;
96         struct panfrost_pool_ref state;
97         struct mali_texture_packed bifrost_descriptor;
98         mali_ptr texture_bo;
99         uint64_t modifier;
100 
101         /* Pool used to allocate the descriptor. If NULL, defaults to the global
102          * descriptor pool. Can be set for short-lived descriptors, useful for
103          * shader images on Valhall.
104          */
105         struct panfrost_pool *pool;
106 };
107 
108 struct panfrost_vertex_state {
109         unsigned num_elements;
110         struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS];
111 
112 #if PAN_ARCH >= 9
113         /* Packed attribute descriptor. All fields are set at CSO create time
114          * except for stride, which must be ORed in at draw time
115          */
116         struct mali_attribute_packed attributes[PIPE_MAX_ATTRIBS];
117 #else
118         /* buffers holds the attribute buffer descriptors; element_buffer gives
119          * the index into buffers for each vertex element */
120         struct pan_vertex_buffer buffers[PIPE_MAX_ATTRIBS];
121         unsigned element_buffer[PIPE_MAX_ATTRIBS];
122         unsigned nr_bufs;
123 
124         unsigned formats[PIPE_MAX_ATTRIBS];
125 #endif
126 };
127 
128 /* Statically assert that PIPE_* enums match the hardware enums.
129  * (As long as they match, we don't need to translate them.)
130  */
131 static_assert((int)PIPE_FUNC_NEVER    == MALI_FUNC_NEVER,     "must match");
132 static_assert((int)PIPE_FUNC_LESS     == MALI_FUNC_LESS,      "must match");
133 static_assert((int)PIPE_FUNC_EQUAL    == MALI_FUNC_EQUAL,     "must match");
134 static_assert((int)PIPE_FUNC_LEQUAL   == MALI_FUNC_LEQUAL,    "must match");
135 static_assert((int)PIPE_FUNC_GREATER  == MALI_FUNC_GREATER,   "must match");
136 static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
137 static_assert((int)PIPE_FUNC_GEQUAL   == MALI_FUNC_GEQUAL,    "must match");
138 static_assert((int)PIPE_FUNC_ALWAYS   == MALI_FUNC_ALWAYS,    "must match");
139 
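/* Map an API sample count to the hardware's multisample pattern enum */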
140 static inline enum mali_sample_pattern
141 panfrost_sample_pattern(unsigned samples)
142 {
143         switch (samples) {
144         case 1:  return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
145         case 4:  return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
146         case 8:  return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
147         case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
148         default: unreachable("Unsupported sample count");
149         }
150 }
151 
152 static unsigned
153 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
154 {
155         /* CLAMP is only supported on Midgard, where it is broken for nearest
156          * filtering. Use CLAMP_TO_EDGE in that case.
157          */
158 
159         switch (w) {
160         case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
161         case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
162         case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
163         case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
164         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
165         case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
166 
167 #if PAN_ARCH <= 5
168         case PIPE_TEX_WRAP_CLAMP:
169                 return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
170                                        MALI_WRAP_MODE_CLAMP;
171         case PIPE_TEX_WRAP_MIRROR_CLAMP:
172                 return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
173                                        MALI_WRAP_MODE_MIRRORED_CLAMP;
174 #endif
175 
176         default: unreachable("Invalid wrap");
177         }
178 }
179 
180 /* The hardware compares in the wrong order, so we have to flip before
181  * encoding. Yes, really. */
182 
183 static enum mali_func
184 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
185 {
186         return !cso->compare_mode ? MALI_FUNC_NEVER :
187                 panfrost_flip_compare_func((enum mali_func) cso->compare_func);
188 }
189 
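/* Translate a Gallium mip filter to the hardware mipmap mode. Pre-v6
 * hardware has no "none" mode, so disabled mipmapping is instead emulated
 * by clamping the LOD range in the sampler (see below).
 */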
190 static enum mali_mipmap_mode
191 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
192 {
193         switch (f) {
194         case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
195         case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
196 #if PAN_ARCH >= 6
197         case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
198 #else
199         case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST;
200 #endif
201         default: unreachable("Invalid");
202         }
203 }
204 
205 
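/* Sampler CSO create. The hardware descriptor depends only on the CSO, so
 * pack it once here instead of at draw time.
 */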
206 static void *
207 panfrost_create_sampler_state(
208         struct pipe_context *pctx,
209         const struct pipe_sampler_state *cso)
210 {
211         struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
212         so->base = *cso;
213 
214         bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST;
215 
216         pan_pack(&so->hw, SAMPLER, cfg) {
217                 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
218                 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
219 
220                 cfg.normalized_coordinates = cso->normalized_coords;
221                 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
222                 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
223                 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
224 
225                 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
226                 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
227                 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
228 
229                 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
230                 cfg.compare_function = panfrost_sampler_compare_func(cso);
231                 cfg.seamless_cube_map = cso->seamless_cube_map;
232 
233                 cfg.border_color_r = cso->border_color.ui[0];
234                 cfg.border_color_g = cso->border_color.ui[1];
235                 cfg.border_color_b = cso->border_color.ui[2];
236                 cfg.border_color_a = cso->border_color.ui[3];
237 
238 #if PAN_ARCH >= 6
239                 if (cso->max_anisotropy > 1) {
240                         cfg.maximum_anisotropy = cso->max_anisotropy;
241                         cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
242                 }
243 #else
244                 /* Emulate disabled mipmapping by clamping the LOD as tight as
245                  * possible (from 0 to epsilon = 1/256) */
246                 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
247                         cfg.maximum_lod = cfg.minimum_lod + 1;
248 #endif
249         }
250 
251         return so;
252 }
253 
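/* Determine whether the fragment shader needs to run at all. If it has no
 * side effects, writes no enabled colour target and no depth/stencil, the
 * draw can use an empty fragment state instead.
 */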
254 static bool
255 panfrost_fs_required(
256                 struct panfrost_shader_state *fs,
257                 struct panfrost_blend_state *blend,
258                 struct pipe_framebuffer_state *state,
259                 const struct panfrost_zsa_state *zsa)
260 {
261         /* Run the shader if it has side effects. This includes use of discard,
262          * which can affect the results of an occlusion query. */
263         if (fs->info.fs.sidefx)
264                 return true;
265 
266         /* Using an empty FS requires early-z to be enabled, but alpha test
267          * needs it disabled. Alpha test is only native on Midgard, so only
268          * check there.
269          */
270         if (PAN_ARCH <= 5 && zsa->base.alpha_func != PIPE_FUNC_ALWAYS)
271                 return true;
272 
273         /* If colour is written we need to execute */
274         for (unsigned i = 0; i < state->nr_cbufs; ++i) {
275                 if (state->cbufs[i] && !blend->info[i].no_colour)
276                         return true;
277         }
278 
279         /* If depth is written and not implied we need to execute.
280          * TODO: Predicate on Z/S writes being enabled */
281         return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
282 }
283 
284 /* Get pointers to the blend shaders bound to each active render target. Used
285  * to emit the blend descriptors, as well as the fragment renderer state
286  * descriptor.
287  */
288 static void
289 panfrost_get_blend_shaders(struct panfrost_batch *batch,
290                            mali_ptr *blend_shaders)
291 {
292         unsigned shader_offset = 0;
293         struct panfrost_bo *shader_bo = NULL;
294 
295         for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
296                 if (batch->key.cbufs[c]) {
297                         blend_shaders[c] = panfrost_get_blend(batch,
298                                         c, &shader_bo, &shader_offset);
299                 }
300         }
301 }
302 
303 #if PAN_ARCH >= 5
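/* Pack a float blend constant into the 16-bit UNORM encoding expected by
 * the hardware, scaled to the widest channel of the render target format.
 */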
304 UNUSED static uint16_t
305 pack_blend_constant(enum pipe_format format, float cons)
306 {
307         const struct util_format_description *format_desc =
308                 util_format_description(format);
309 
310         unsigned chan_size = 0;
311 
312         for (unsigned i = 0; i < format_desc->nr_channels; i++)
313                 chan_size = MAX2(format_desc->channel[i].size, chan_size);
314 
315         uint16_t unorm = (cons * ((1 << chan_size) - 1));
316         return unorm << (16 - chan_size);
317 }
318 
319 /*
320  * Determine whether to set the respective overdraw alpha flag.
321  *
322  * The overdraw alpha=1 flag should be set when alpha=1 implies full overdraw,
323  * equivalently, all enabled render targets have alpha_one_store set. Likewise,
324  * overdraw alpha=0 should be set when alpha=0 implies no overdraw,
325  * equivalently, all enabled render targets have alpha_zero_nop set.
326  */
327 static bool
328 panfrost_overdraw_alpha(const struct panfrost_context *ctx, bool zero)
329 {
330         const struct panfrost_blend_state *so = ctx->blend;
331 
332         for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) {
333                 const struct pan_blend_info info = so->info[i];
334 
335                 bool enabled = ctx->pipe_framebuffer.cbufs[i] && !info.no_colour;
336                 bool flag = zero ? info.alpha_zero_nop : info.alpha_one_store;
337 
338                 if (enabled && !flag)
339                         return false;
340         }
341 
342         return true;
343 }
344 
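/* Emit one BLEND descriptor per render target into the buffer at rts.
 * Unbacked or masked-off render targets get blending disabled outright.
 */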
345 static void
346 panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
347 {
348         unsigned rt_count = batch->key.nr_cbufs;
349         struct panfrost_context *ctx = batch->ctx;
350         const struct panfrost_blend_state *so = ctx->blend;
351         bool dithered = so->base.dither;
352 
353         /* Always have at least one render target for depth-only passes */
354         for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
355                 struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
356 
357                 /* Disable blending for unbacked render targets */
358                 if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
359                         pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
360                                 cfg.enable = false;
361 #if PAN_ARCH >= 6
362                                 cfg.internal.mode = MALI_BLEND_MODE_OFF;
363 #endif
364                         }
365 
366                         continue;
367                 }
368 
369                 struct pan_blend_info info = so->info[i];
370                 enum pipe_format format = batch->key.cbufs[i]->format;
371                 float cons = pan_blend_get_constant(info.constant_mask,
372                                                     ctx->blend_color.color);
373 
374                 /* Word 0: Flags and constant */
375                 pan_pack(packed, BLEND, cfg) {
376                         cfg.srgb = util_format_is_srgb(format);
377                         cfg.load_destination = info.load_dest;
378                         cfg.round_to_fb_precision = !dithered;
379                         cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
380 #if PAN_ARCH >= 6
381                         if (!blend_shaders[i])
382                                 cfg.constant = pack_blend_constant(format, cons);
383 #else
384                         cfg.blend_shader = (blend_shaders[i] != 0);
385 
386                         if (blend_shaders[i])
387                                 cfg.shader_pc = blend_shaders[i];
388                         else
389                                 cfg.constant = cons;
390 #endif
391                 }
392 
393                 if (!blend_shaders[i]) {
394                         /* Word 1: Blend Equation */
395                         STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
396                         packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
397                 }
398 
399 #if PAN_ARCH >= 6
400                 const struct panfrost_device *dev = pan_device(ctx->base.screen);
401                 struct panfrost_shader_state *fs =
402                         panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
403 
404                 /* Words 2 and 3: Internal blend */
405                 if (blend_shaders[i]) {
406                         /* The blend shader's address needs to have
407                          * the same top 32 bits as the fragment shader's.
408                          * TODO: Ensure that's always the case.
409                          */
410                         assert(!fs->bin.bo ||
411                                         (blend_shaders[i] & (0xffffffffull << 32)) ==
412                                         (fs->bin.gpu & (0xffffffffull << 32)));
413 
414                         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
415                                 cfg.mode = MALI_BLEND_MODE_SHADER;
416                                 cfg.shader.pc = (u32) blend_shaders[i];
417 
418 #if PAN_ARCH <= 7
419                                 unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
420                                 assert(!(ret_offset & 0x7));
421 
422                                 cfg.shader.return_value = ret_offset ?
423                                         fs->bin.gpu + ret_offset : 0;
424 #endif
425                         }
426                 } else {
427                         pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
428                                 cfg.mode = info.opaque ?
429                                         MALI_BLEND_MODE_OPAQUE :
430                                         MALI_BLEND_MODE_FIXED_FUNCTION;
431 
432                                 /* If we want the conversion to work properly,
433                                  * num_comps must be set to 4
434                                  */
435                                 cfg.fixed_function.num_comps = 4;
436                                 cfg.fixed_function.conversion.memory_format =
437                                         panfrost_format_to_bifrost_blend(dev, format, dithered);
438                                 cfg.fixed_function.conversion.register_format =
439                                         fs->info.bifrost.blend[i].format;
440                                 cfg.fixed_function.rt = i;
441 
442 #if PAN_ARCH <= 7
443                                 if (!info.opaque) {
444                                         cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
445                                         cfg.fixed_function.alpha_one_store = info.alpha_one_store;
446                                 }
447 #endif
448                         }
449                 }
450 #endif
451         }
452 }
453 #endif
454 
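/* Decide whether allow_forward_pixel_to_kill may be set: the shader must
 * support it, every bound colour target must be written, and neither
 * alpha-to-coverage nor destination-reading blending may be in use.
 */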
455 static inline bool
456 pan_allow_forward_pixel_to_kill(struct panfrost_context *ctx, struct panfrost_shader_state *fs)
457 {
458         /* Track if any colour buffer is reused across draws, either
459          * from reading it directly, or from failing to write it
460          */
461         unsigned rt_mask = ctx->fb_rt_mask;
462         uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);
463         bool blend_reads_dest = (ctx->blend->load_dest_mask & rt_mask);
464         bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
465 
466         return fs->info.fs.can_fpk &&
467                 !(rt_mask & ~rt_written) &&
468                 !alpha_to_coverage &&
469                 !blend_reads_dest;
470 }
471 
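/* Add the shader binary and its pre-packed state descriptor to the batch
 * and return the GPU address of the state descriptor.
 */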
472 static mali_ptr
473 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
474 {
475         struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
476 
477         panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
478         panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
479 
480         return ss->state.gpu;
481 }
482 
483 #if PAN_ARCH <= 7
484 /* Construct a partial RSD corresponding to no executed fragment shader, and
485  * merge with the existing partial RSD. */
486 
487 static void
488 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
489 {
490         struct mali_renderer_state_packed empty_rsd;
491 
492         pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
493 #if PAN_ARCH >= 6
494                 cfg.properties.shader_modifies_coverage = true;
495                 cfg.properties.allow_forward_pixel_to_kill = true;
496                 cfg.properties.allow_forward_pixel_to_be_killed = true;
497                 cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
498 
499                 /* Alpha isn't written so these are vacuous */
500                 cfg.multisample_misc.overdraw_alpha0 = true;
501                 cfg.multisample_misc.overdraw_alpha1 = true;
502 #else
503                 cfg.shader.shader = 0x1;
504                 cfg.properties.work_register_count = 1;
505                 cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
506                 cfg.properties.force_early_z = true;
507 #endif
508         }
509 
510         pan_merge((*rsd), empty_rsd, RENDERER_STATE);
511 }
512 
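/* Pack the draw-time portion of the fragment renderer state: early-ZS
 * policy, blend-related flags, multisampling state and stencil references.
 * The CSO-derived words are merged in by panfrost_emit_frag_shader.
 */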
513 static void
514 panfrost_prepare_fs_state(struct panfrost_context *ctx,
515                           mali_ptr *blend_shaders,
516                           struct mali_renderer_state_packed *rsd)
517 {
518         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
519         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
520         struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
521         struct panfrost_blend_state *so = ctx->blend;
522         bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
523         bool msaa = rast->multisample;
524 
525         unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
526 
527         bool has_blend_shader = false;
528 
529         for (unsigned c = 0; c < rt_count; ++c)
530                 has_blend_shader |= (blend_shaders[c] != 0);
531 
532         bool has_oq = ctx->occlusion_query && ctx->active_queries;
533 
534         pan_pack(rsd, RENDERER_STATE, cfg) {
535                 if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
536 #if PAN_ARCH >= 6
537                         struct pan_earlyzs_state earlyzs =
538                                pan_earlyzs_get(fs->earlyzs,
539                                                ctx->depth_stencil->writes_zs ||
540                                                has_oq,
541                                                ctx->blend->base.alpha_to_coverage,
542                                                ctx->depth_stencil->zs_always_passes);
543 
544                         cfg.properties.pixel_kill_operation = earlyzs.kill;
545                         cfg.properties.zs_update_operation = earlyzs.update;
546 
547                         cfg.properties.allow_forward_pixel_to_kill =
548                                 pan_allow_forward_pixel_to_kill(ctx, fs);
549 #else
550                         cfg.properties.force_early_z =
551                                 fs->info.fs.can_early_z && !alpha_to_coverage &&
552                                 ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);
553 
554                         /* TODO: Reduce this limit? */
555                         if (has_blend_shader)
556                                 cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8);
557                         else
558                                 cfg.properties.work_register_count = fs->info.work_reg_count;
559 
560                         /* Hardware quirks around early-zs forcing without a
561                          * depth buffer. Note this breaks occlusion queries. */
562                         bool force_ez_with_discard = !zsa->enabled && !has_oq;
563 
564                         cfg.properties.shader_reads_tilebuffer =
565                                 force_ez_with_discard && fs->info.fs.can_discard;
566                         cfg.properties.shader_contains_discard =
567                                 !force_ez_with_discard && fs->info.fs.can_discard;
568 #endif
569                 }
570 
571 #if PAN_ARCH == 4
572                 if (rt_count > 0) {
573                         cfg.multisample_misc.load_destination = so->info[0].load_dest;
574                         cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
575                         cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour;
576                         cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
577                         cfg.stencil_mask_misc.dither_disable = !so->base.dither;
578                         cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
579 
580                         if (blend_shaders[0]) {
581                                 cfg.blend_shader = blend_shaders[0];
582                         } else {
583                                 cfg.blend_constant = pan_blend_get_constant(
584                                                 so->info[0].constant_mask,
585                                                 ctx->blend_color.color);
586                         }
587                 } else {
588                         /* If there is no colour buffer, leaving the fields at their
589                          * defaults is fine, except for blending, which is non-nullable */
590                         cfg.blend_equation.color_mask = 0xf;
591                         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
592                         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
593                         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
594                         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
595                         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
596                         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
597                 }
598 #elif PAN_ARCH == 5
599                 /* Workaround */
600                 cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
601 #endif
602 
603                 cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
604 
605                 cfg.multisample_misc.evaluate_per_sample =
606                         msaa && (ctx->min_samples > 1);
607 
608 #if PAN_ARCH >= 6
609                 /* MSAA blend shaders need to pass their sample ID to
610                  * LD_TILE/ST_TILE, so we must preload it. Additionally, we
611                  * need per-sample shading for the blend shader, accomplished
612                  * by forcing per-sample shading for the whole program. */
613 
614                 if (msaa && has_blend_shader) {
615                         cfg.multisample_misc.evaluate_per_sample = true;
616                         cfg.preload.fragment.sample_mask_id = true;
617                 }
618 
619                 /* Flip gl_PointCoord (and point sprites) depending on API
620                  * setting on framebuffer orientation. We do not use
621                  * lower_wpos_pntc on Bifrost.
622                  */
623                 cfg.properties.point_sprite_coord_origin_max_y =
624                         (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
625 
626                 cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
627                 cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
628 #endif
629 
630                 cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
631                 cfg.depth_units = rast->offset_units * 2.0f;
632                 cfg.depth_factor = rast->offset_scale;
633 
634                 bool back_enab = zsa->base.stencil[1].enabled;
635                 cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
636                 cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
637 
638 #if PAN_ARCH <= 5
639                 /* v6+ fits register preload here, no alpha testing */
640                 cfg.alpha_reference = zsa->base.alpha_ref_value;
641 #endif
642         }
643 }
644 
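/* Combine the draw-time renderer state with the pre-packed words from the
 * fragment shader, ZSA and rasterizer CSOs to form the final descriptor.
 */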
645 static void
646 panfrost_emit_frag_shader(struct panfrost_context *ctx,
647                           struct mali_renderer_state_packed *fragmeta,
648                           mali_ptr *blend_shaders)
649 {
650         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
651         const struct panfrost_rasterizer *rast = ctx->rasterizer;
652         struct panfrost_shader_state *fs =
653                 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
654 
655         /* We need to merge several partial renderer state descriptors,
656          * so stage to temporary storage rather than reading back write-combine
657          * memory, which will trash performance. */
658         struct mali_renderer_state_packed rsd;
659         panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
660 
661 #if PAN_ARCH == 4
662         if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
663                 /* Word 14: SFBD Blend Equation */
664                 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
665                 rsd.opaque[14] = ctx->blend->equation[0];
666         }
667 #endif
668 
669         /* Merge with CSO state and upload */
670         if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
671                 struct mali_renderer_state_packed *partial_rsd =
672                         (struct mali_renderer_state_packed *)&fs->partial_rsd;
673                 STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
674                 pan_merge(rsd, *partial_rsd, RENDERER_STATE);
675         } else {
676                 pan_merge_empty_fs(&rsd);
677         }
678 
679         /* Word 8, 9 Misc state */
680         rsd.opaque[8] |= zsa->rsd_depth.opaque[0]
681                        | rast->multisample.opaque[0];
682 
683         rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]
684                        | rast->stencil_misc.opaque[0];
685 
686         /* Word 10, 11 Stencil Front and Back */
687         rsd.opaque[10] |= zsa->stencil_front.opaque[0];
688         rsd.opaque[11] |= zsa->stencil_back.opaque[0];
689 
690         memcpy(fragmeta, &rsd, sizeof(rsd));
691 }
692 
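/* Allocate the fragment renderer state descriptor (plus the trailing BLEND
 * descriptors on v5+) from the batch pool and fill it in for this draw.
 */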
693 static mali_ptr
694 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
695 {
696         struct panfrost_context *ctx = batch->ctx;
697         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
698 
699         panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
700 
701         struct panfrost_ptr xfer;
702 
703 #if PAN_ARCH == 4
704         xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
705 #else
706         unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
707 
708         xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,
709                                              PAN_DESC(RENDERER_STATE),
710                                              PAN_DESC_ARRAY(rt_count, BLEND));
711 #endif
712 
713         mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
714         panfrost_get_blend_shaders(batch, blend_shaders);
715 
716         panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);
717 
718 #if PAN_ARCH >= 5
719         panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders);
720 #endif
721 
722         return xfer.gpu;
723 }
724 #endif
725 
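/* Emit viewport/scissor state. Up to v7 this is a VIEWPORT descriptor; on
 * Valhall the scissor is stashed in the batch instead and no descriptor is
 * returned.
 */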
726 static mali_ptr
727 panfrost_emit_viewport(struct panfrost_batch *batch)
728 {
729         struct panfrost_context *ctx = batch->ctx;
730         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
731         const struct pipe_scissor_state *ss = &ctx->scissor;
732         const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
733 
734         /* Derive min/max from translate/scale. Note since |x| >= 0 by
735          * definition, we have that -|x| <= |x| hence translate - |scale| <=
736          * translate + |scale|, so the ordering is correct here. */
737         float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
738         float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
739         float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
740         float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
741         float minz = (vp->translate[2] - fabsf(vp->scale[2]));
742         float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
743 
744         /* Scissor to the intersection of the viewport and the scissor, clamped
745          * to the framebuffer */
746 
747         unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
748         unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
749         unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
750         unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));
751 
752         if (ss && rast->scissor) {
753                 minx = MAX2(ss->minx, minx);
754                 miny = MAX2(ss->miny, miny);
755                 maxx = MIN2(ss->maxx, maxx);
756                 maxy = MIN2(ss->maxy, maxy);
757         }
758 
759         /* Set the range to [1, 1) so max values don't wrap round */
760         if (maxx == 0 || maxy == 0)
761                 maxx = maxy = minx = miny = 1;
762 
763         panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
764         batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
765 
766         /* Our [minx, maxx) and [miny, maxy) ranges exclude the maximum, but the
                 hardware expects inclusive maxima, so subtract one */
767         maxx--;
768         maxy--;
769 
770         batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY;
771         batch->maximum_z = rast->depth_clip_far  ? maxz : +INFINITY;
772 
773 #if PAN_ARCH <= 7
774         struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
775 
776         pan_pack(T.cpu, VIEWPORT, cfg) {
777                 cfg.scissor_minimum_x = minx;
778                 cfg.scissor_minimum_y = miny;
779                 cfg.scissor_maximum_x = maxx;
780                 cfg.scissor_maximum_y = maxy;
781 
782                 cfg.minimum_z = batch->minimum_z;
783                 cfg.maximum_z = batch->maximum_z;
784         }
785 
786         return T.gpu;
787 #else
788         pan_pack(&batch->scissor, SCISSOR, cfg) {
789                 cfg.scissor_minimum_x = minx;
790                 cfg.scissor_minimum_y = miny;
791                 cfg.scissor_maximum_x = maxx;
792                 cfg.scissor_maximum_y = maxy;
793         }
794 
795         return 0;
796 #endif
797 }
798 
799 #if PAN_ARCH >= 9
800 /**
801  * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
802  * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
803  * CSO create time. However, the stencil reference values and shader
804  * interactions are dynamic state. Pack only the dynamic state here and OR
805  * together.
806  */
807 static mali_ptr
808 panfrost_emit_depth_stencil(struct panfrost_batch *batch)
809 {
810         struct panfrost_context *ctx = batch->ctx;
811         const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
812         struct panfrost_rasterizer *rast = ctx->rasterizer;
813         struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
814         bool back_enab = zsa->base.stencil[1].enabled;
815 
816         struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
817         struct mali_depth_stencil_packed dynamic;
818 
819         pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
820                 cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
821                 cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
822 
823                 cfg.stencil_from_shader = fs->info.fs.writes_stencil;
824                 cfg.depth_source = pan_depth_source(&fs->info);
825 
826                 cfg.depth_bias_enable = rast->base.offset_tri;
827                 cfg.depth_units = rast->base.offset_units * 2.0f;
828                 cfg.depth_factor = rast->base.offset_scale;
829                 cfg.depth_bias_clamp = rast->base.offset_clamp;
830         }
831 
832         pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
833         memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));
834 
835         return T.gpu;
836 }
837 
838 /**
839  * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
840  * with Bifrost, but the container data structure is simplified.
841  */
842 static mali_ptr
843 panfrost_emit_blend_valhall(struct panfrost_batch *batch)
844 {
845         unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);
846 
847         struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);
848 
849         mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
850         panfrost_get_blend_shaders(batch, blend_shaders);
851 
852         panfrost_emit_blend(batch, T.cpu, blend_shaders);
853 
854         /* Precalculate for the per-draw path */
855         bool has_blend_shader = false;
856 
857         for (unsigned i = 0; i < rt_count; ++i)
858                 has_blend_shader |= !!blend_shaders[i];
859 
860         batch->ctx->valhall_has_blend_shader = has_blend_shader;
861 
862         return T.gpu;
863 }
864 
865 /**
866  * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
867  */
868 static mali_ptr
869 panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
870 {
871         struct panfrost_context *ctx = batch->ctx;
872         unsigned buffer_count = util_last_bit(ctx->vb_mask);
873         struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
874                                                           buffer_count, BUFFER);
875         struct mali_buffer_packed *buffers = T.cpu;
876 
877         u_foreach_bit(i, ctx->vb_mask) {
878                 struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
879                 struct pipe_resource *prsrc = vb.buffer.resource;
880                 struct panfrost_resource *rsrc = pan_resource(prsrc);
881                 assert(!vb.is_user_buffer);
882 
883                 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
884 
885                 pan_pack(buffers + i, BUFFER, cfg) {
886                         cfg.address = rsrc->image.data.bo->ptr.gpu +
887                                       vb.buffer_offset;
888 
889                         cfg.size = prsrc->width0 - vb.buffer_offset;
890                 }
891         }
892 
893         return T.gpu;
894 }
895 
896 /**
897  * Emit Valhall attribute descriptors and associated (vertex) buffer
898  * descriptors at draw-time. The attribute descriptors are packed at CSO
899  * create time except for the stride, which is ORed in here. The buffer
900  * descriptors are packed here, though that could be moved into panfrost_set_vertex_buffers if needed.
901  */
902 static mali_ptr
903 panfrost_emit_vertex_data(struct panfrost_batch *batch)
904 {
905         struct panfrost_context *ctx = batch->ctx;
906         struct panfrost_vertex_state *vtx = ctx->vertex;
907         struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
908                                                           vtx->num_elements,
909                                                           ATTRIBUTE);
910         struct mali_attribute_packed *attributes = T.cpu;
911 
912         for (unsigned i = 0; i < vtx->num_elements; ++i) {
913                 struct mali_attribute_packed packed;
914                 unsigned vbi = vtx->pipe[i].vertex_buffer_index;
915 
916                 pan_pack(&packed, ATTRIBUTE, cfg) {
917                         cfg.stride = ctx->vertex_buffers[vbi].stride;
918                 }
919 
920                 pan_merge(packed, vtx->attributes[i], ATTRIBUTE);
921                 attributes[i] = packed;
922         }
923 
924         return T.gpu;
925 }
926 
927 /*
928  * Emit Valhall descriptors for shader images. Unlike previous generations,
929  * Valhall does not have a special descriptor for images. Standard texture
930  * descriptors are used. The binding is different in Gallium, however, so we
931  * translate.
932  */
933 static struct pipe_sampler_view
934 panfrost_pipe_image_to_sampler_view(struct pipe_image_view *v)
935 {
936         struct pipe_sampler_view out = {
937                 .format = v->format,
938                 .texture = v->resource,
939                 .target = v->resource->target,
940                 .swizzle_r = PIPE_SWIZZLE_X,
941                 .swizzle_g = PIPE_SWIZZLE_Y,
942                 .swizzle_b = PIPE_SWIZZLE_Z,
943                 .swizzle_a = PIPE_SWIZZLE_W
944         };
945 
946         if (out.target == PIPE_BUFFER) {
947                 out.u.buf.offset = v->u.buf.offset;
948                 out.u.buf.size = v->u.buf.size;
949         } else {
950                 out.u.tex.first_layer = v->u.tex.first_layer;
951                 out.u.tex.last_layer = v->u.tex.last_layer;
952 
953                 /* Single level only */
954                 out.u.tex.first_level = v->u.tex.level;
955                 out.u.tex.last_level = v->u.tex.level;
956         }
957 
958         return out;
959 }
960 
961 static void
962 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
963                              struct pipe_context *pctx);
964 
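/* Emit one texture descriptor per bound shader image, allocated from the
 * batch's transient pool.
 */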
965 static mali_ptr
966 panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
967 {
968         struct panfrost_context *ctx = batch->ctx;
969         unsigned last_bit = util_last_bit(ctx->image_mask[stage]);
970 
971         struct panfrost_ptr T =
972                 pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);
973 
974         struct mali_texture_packed *out = (struct mali_texture_packed *) T.cpu;
975 
976         for (int i = 0; i < last_bit; ++i) {
977                 struct pipe_image_view *image = &ctx->images[stage][i];
978 
979                 if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
980                         memset(&out[i], 0, sizeof(out[i]));
981                         continue;
982                 }
983 
984                 /* Construct a synthetic sampler view so we can use our usual
985                  * sampler view code for the actual descriptor packing.
986                  *
987                  * Use the batch pool for a transient allocation, rather than
988                  * allocating a long-lived descriptor.
989                  */
990                 struct panfrost_sampler_view view = {
991                         .base = panfrost_pipe_image_to_sampler_view(image),
992                         .pool = &batch->pool
993                 };
994 
995                 /* If we specify a cube map, the hardware internally treats it as
996                  * a 2D array. Since cube maps as images can confuse our common
997                  * texturing code, explicitly use a 2D array.
998                  *
999                  * Similar concerns apply to 3D textures.
1000                  */
1001                 if (view.base.target == PIPE_BUFFER)
1002                         view.base.target = PIPE_BUFFER;
1003                 else
1004                         view.base.target = PIPE_TEXTURE_2D_ARRAY;
1005 
1006                 panfrost_update_sampler_view(&view, &ctx->base);
1007                 out[i] = view.bifrost_descriptor;
1008 
1009                 panfrost_track_image_access(batch, stage, image);
1010         }
1011 
1012         return T.gpu;
1013 }
1014 #endif
1015 
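/* Return a GPU address for a constant buffer: either the backing resource
 * at its bound offset, or a transient upload of the user-pointer contents.
 */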
1016 static mali_ptr
1017 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1018                                  enum pipe_shader_type st,
1019                                  struct panfrost_constant_buffer *buf,
1020                                  unsigned index)
1021 {
1022         struct pipe_constant_buffer *cb = &buf->cb[index];
1023         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1024 
1025         if (rsrc) {
1026                 panfrost_batch_read_rsrc(batch, rsrc, st);
1027 
1028                 /* Alignment guaranteed by
1029                  * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1030                 return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
1031         } else if (cb->user_buffer) {
1032                 return pan_pool_upload_aligned(&batch->pool.base,
1033                                                cb->user_buffer +
1034                                                cb->buffer_offset,
1035                                                cb->buffer_size, 16);
1036         } else {
1037                 unreachable("No constant buffer");
1038         }
1039 }
1040 
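/* Each system value (driver-generated uniform) occupies one vec4 slot */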
1041 struct sysval_uniform {
1042         union {
1043                 float f[4];
1044                 int32_t i[4];
1045                 uint32_t u[4];
1046                 uint64_t du[2];
1047         };
1048 };
1049 
1050 static void
1051 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1052                                       struct sysval_uniform *uniform)
1053 {
1054         struct panfrost_context *ctx = batch->ctx;
1055         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1056 
1057         uniform->f[0] = vp->scale[0];
1058         uniform->f[1] = vp->scale[1];
1059         uniform->f[2] = vp->scale[2];
1060 }
1061 
1062 static void
1063 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1064                                        struct sysval_uniform *uniform)
1065 {
1066         struct panfrost_context *ctx = batch->ctx;
1067         const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1068 
1069         uniform->f[0] = vp->translate[0];
1070         uniform->f[1] = vp->translate[1];
1071         uniform->f[2] = vp->translate[2];
1072 }
1073 
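/* Upload a texture size sysval: width/height/depth minified to the view's
 * first level, plus the layer count for arrays (whole cubes for cube
 * arrays).
 */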
1074 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1075                                        enum pipe_shader_type st,
1076                                        unsigned int sysvalid,
1077                                        struct sysval_uniform *uniform)
1078 {
1079         struct panfrost_context *ctx = batch->ctx;
1080         unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1081         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1082         bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1083         struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1084 
1085         assert(dim);
1086 
1087         if (tex->target == PIPE_BUFFER) {
1088                 assert(dim == 1);
1089                 uniform->i[0] =
1090                         tex->u.buf.size / util_format_get_blocksize(tex->format);
1091                 return;
1092         }
1093 
1094         uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1095 
1096         if (dim > 1)
1097                 uniform->i[1] = u_minify(tex->texture->height0,
1098                                          tex->u.tex.first_level);
1099 
1100         if (dim > 2)
1101                 uniform->i[2] = u_minify(tex->texture->depth0,
1102                                          tex->u.tex.first_level);
1103 
1104         if (is_array) {
1105                 unsigned size = tex->texture->array_size;
1106 
1107                 /* Internally, we store the number of 2D images (faces * array
1108                  * size). Externally, we report the array size in terms of
1109                  * complete cubes. So divide by the # of faces per cube.
1110                  */
1111                 if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
1112                         size /= 6;
1113 
1114                 uniform->i[dim] = size;
1115         }
1116 }
1117 
1118 static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
1119                                               enum pipe_shader_type st,
1120                                               unsigned int sysvalid,
1121                                               struct sysval_uniform *uniform)
1122 {
1123         struct panfrost_context *ctx = batch->ctx;
1124         unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1125         unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1126         unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1127 
1128         assert(dim && dim < 4);
1129 
1130         struct pipe_image_view *image = &ctx->images[st][idx];
1131 
1132         if (image->resource->target == PIPE_BUFFER) {
1133                 unsigned blocksize = util_format_get_blocksize(image->format);
1134                 uniform->i[0] = image->resource->width0 / blocksize;
1135                 return;
1136         }
1137 
1138         uniform->i[0] = u_minify(image->resource->width0,
1139                                  image->u.tex.level);
1140 
1141         if (dim > 1)
1142                 uniform->i[1] = u_minify(image->resource->height0,
1143                                          image->u.tex.level);
1144 
1145         if (dim > 2)
1146                 uniform->i[2] = u_minify(image->resource->depth0,
1147                                          image->u.tex.level);
1148 
1149         if (is_array)
1150                 uniform->i[dim] = image->resource->array_size;
1151 }
1152 
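/* SSBOs are exposed to the shader as a sysval holding the 64-bit GPU
 * address followed by the 32-bit size.
 */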
1153 static void
1154 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1155                             enum pipe_shader_type st,
1156                             unsigned ssbo_id,
1157                             struct sysval_uniform *uniform)
1158 {
1159         struct panfrost_context *ctx = batch->ctx;
1160 
1161         assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1162         struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1163 
1164         /* Compute address */
1165         struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1166         struct panfrost_bo *bo = rsrc->image.data.bo;
1167 
1168         panfrost_batch_write_rsrc(batch, rsrc, st);
1169 
1170         util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1171                         sb.buffer_offset, sb.buffer_size);
1172 
1173         /* Upload address and size as sysval */
1174         uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
1175         uniform->u[2] = sb.buffer_size;
1176 }
1177 
1178 static void
1179 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1180                                enum pipe_shader_type st,
1181                                unsigned samp_idx,
1182                                struct sysval_uniform *uniform)
1183 {
1184         struct panfrost_context *ctx = batch->ctx;
1185         struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1186 
1187         uniform->f[0] = sampl->min_lod;
1188         uniform->f[1] = sampl->max_lod;
1189         uniform->f[2] = sampl->lod_bias;
1190 
1191         /* Even without any errata, Midgard represents "no mipmapping" as
1192          * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1193          * panfrost_create_sampler_state which also explains our choice of
1194          * epsilon value (again to keep behaviour consistent) */
1195 
1196         if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1197                 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1198 }
1199 
1200 static void
1201 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1202                                        struct sysval_uniform *uniform)
1203 {
1204         struct panfrost_context *ctx = batch->ctx;
1205 
1206         uniform->u[0] = ctx->compute_grid->grid[0];
1207         uniform->u[1] = ctx->compute_grid->grid[1];
1208         uniform->u[2] = ctx->compute_grid->grid[2];
1209 }
1210 
1211 static void
1212 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
1213                                         struct sysval_uniform *uniform)
1214 {
1215         struct panfrost_context *ctx = batch->ctx;
1216 
1217         uniform->u[0] = ctx->compute_grid->block[0];
1218         uniform->u[1] = ctx->compute_grid->block[1];
1219         uniform->u[2] = ctx->compute_grid->block[2];
1220 }
1221 
1222 static void
1223 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
1224                                 struct sysval_uniform *uniform)
1225 {
1226         struct panfrost_context *ctx = batch->ctx;
1227 
1228         uniform->u[0] = ctx->compute_grid->work_dim;
1229 }
1230 
1231 /* Sample positions are pushed in a Bifrost specific format on Bifrost. On
1232  * Midgard, we emulate the Bifrost path with some extra arithmetic in the
1233  * shader, to keep the code as unified as possible. */
1234 
1235 static void
1236 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
1237                                 struct sysval_uniform *uniform)
1238 {
1239         struct panfrost_context *ctx = batch->ctx;
1240         struct panfrost_device *dev = pan_device(ctx->base.screen);
1241 
1242         unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1243         uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));
1244 }
1245 
1246 static void
1247 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1248                                 struct sysval_uniform *uniform)
1249 {
1250         unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1251         uniform->u[0] = samples > 1;
1252 }
1253 
1254 #if PAN_ARCH >= 6
1255 static void
1256 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1257                 unsigned size_and_rt, struct sysval_uniform *uniform)
1258 {
1259         struct panfrost_context *ctx = batch->ctx;
1260         struct panfrost_device *dev = pan_device(ctx->base.screen);
1261         unsigned rt = size_and_rt & 0xF;
1262         unsigned size = size_and_rt >> 4;
1263 
1264         if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1265                 enum pipe_format format = batch->key.cbufs[rt]->format;
1266                 uniform->u[0] =
1267                         GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32;
1268         } else {
1269                 pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
1270                         cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;
1271         }
1272 }
1273 #endif
1274 
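/* Compute the byte offset into a transform feedback buffer: the target's base
 * buffer offset plus the accumulated streamout offset scaled by the per-vertex
 * stride. */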
1275 static unsigned
1276 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1277 {
1278         return target->buffer_offset + (pan_so_target(target)->offset * stride);
1279 }
1280 
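/* Fill in the driver-generated system values (sysvals) requested by a shader.
 * Each sysval occupies one 16-byte slot in the sysval buffer, filled according
 * to the type recorded in the shader's sysval table. */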
1281 static void
1282 panfrost_upload_sysvals(struct panfrost_batch *batch,
1283                         const struct panfrost_ptr *ptr,
1284                         struct panfrost_shader_state *ss,
1285                         enum pipe_shader_type st)
1286 {
1287         struct sysval_uniform *uniforms = ptr->cpu;
1288 
1289         for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
1290                 int sysval = ss->info.sysvals.sysvals[i];
1291 
1292                 switch (PAN_SYSVAL_TYPE(sysval)) {
1293                 case PAN_SYSVAL_VIEWPORT_SCALE:
1294                         panfrost_upload_viewport_scale_sysval(batch,
1295                                                               &uniforms[i]);
1296                         break;
1297                 case PAN_SYSVAL_VIEWPORT_OFFSET:
1298                         panfrost_upload_viewport_offset_sysval(batch,
1299                                                                &uniforms[i]);
1300                         break;
1301                 case PAN_SYSVAL_TEXTURE_SIZE:
1302                         panfrost_upload_txs_sysval(batch, st,
1303                                                    PAN_SYSVAL_ID(sysval),
1304                                                    &uniforms[i]);
1305                         break;
1306                 case PAN_SYSVAL_SSBO:
1307                         panfrost_upload_ssbo_sysval(batch, st,
1308                                                     PAN_SYSVAL_ID(sysval),
1309                                                     &uniforms[i]);
1310                         break;
1311 
1312                 case PAN_SYSVAL_XFB:
1313                 {
1314                         unsigned buf = PAN_SYSVAL_ID(sysval);
1315                         struct panfrost_shader_state *vs =
1316                                 panfrost_get_shader_state(batch->ctx, PIPE_SHADER_VERTEX);
1317                         struct pipe_stream_output_info *so = &vs->stream_output;
1318                         unsigned stride = so->stride[buf] * 4;
1319 
1320                         struct pipe_stream_output_target *target = NULL;
1321                         if (buf < batch->ctx->streamout.num_targets)
1322                                 target = batch->ctx->streamout.targets[buf];
1323 
1324                         if (!target) {
1325                                 /* Memory sink */
1326                                 uniforms[i].du[0] = 0x8ull << 60;
1327                                 break;
1328                         }
1329 
1330                         struct panfrost_resource *rsrc = pan_resource(target->buffer);
1331                         unsigned offset = panfrost_xfb_offset(stride, target);
1332 
1333                         util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1334                                 offset, target->buffer_size - offset);
1335 
1336                         panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1337 
1338                         uniforms[i].du[0] = rsrc->image.data.bo->ptr.gpu + offset;
1339                         break;
1340                 }
1341 
1342                 case PAN_SYSVAL_NUM_VERTICES:
1343                         uniforms[i].u[0] = batch->ctx->vertex_count;
1344                         break;
1345 
1346                 case PAN_SYSVAL_NUM_WORK_GROUPS:
1347                         for (unsigned j = 0; j < 3; j++) {
1348                                 batch->num_wg_sysval[j] =
1349                                         ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);
1350                         }
1351                         panfrost_upload_num_work_groups_sysval(batch,
1352                                                                &uniforms[i]);
1353                         break;
1354                 case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1355                         panfrost_upload_local_group_size_sysval(batch,
1356                                                                 &uniforms[i]);
1357                         break;
1358                 case PAN_SYSVAL_WORK_DIM:
1359                         panfrost_upload_work_dim_sysval(batch,
1360                                                         &uniforms[i]);
1361                         break;
1362                 case PAN_SYSVAL_SAMPLER:
1363                         panfrost_upload_sampler_sysval(batch, st,
1364                                                        PAN_SYSVAL_ID(sysval),
1365                                                        &uniforms[i]);
1366                         break;
1367                 case PAN_SYSVAL_IMAGE_SIZE:
1368                         panfrost_upload_image_size_sysval(batch, st,
1369                                                           PAN_SYSVAL_ID(sysval),
1370                                                           &uniforms[i]);
1371                         break;
1372                 case PAN_SYSVAL_SAMPLE_POSITIONS:
1373                         panfrost_upload_sample_positions_sysval(batch,
1374                                                         &uniforms[i]);
1375                         break;
1376                 case PAN_SYSVAL_MULTISAMPLED:
1377                         panfrost_upload_multisampled_sysval(batch,
1378                                                                &uniforms[i]);
1379                         break;
1380 #if PAN_ARCH >= 6
1381                 case PAN_SYSVAL_RT_CONVERSION:
1382                         panfrost_upload_rt_conversion_sysval(batch,
1383                                         PAN_SYSVAL_ID(sysval), &uniforms[i]);
1384                         break;
1385 #endif
1386                 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1387                         batch->ctx->first_vertex_sysval_ptr =
1388                                 ptr->gpu + (i * sizeof(*uniforms));
1389                         batch->ctx->base_vertex_sysval_ptr =
1390                                 batch->ctx->first_vertex_sysval_ptr + 4;
1391                         batch->ctx->base_instance_sysval_ptr =
1392                                 batch->ctx->first_vertex_sysval_ptr + 8;
1393 
1394                         uniforms[i].u[0] = batch->ctx->offset_start;
1395                         uniforms[i].u[1] = batch->ctx->base_vertex;
1396                         uniforms[i].u[2] = batch->ctx->base_instance;
1397                         break;
1398                 case PAN_SYSVAL_DRAWID:
1399                         uniforms[i].u[0] = batch->ctx->drawid;
1400                         break;
1401                 default:
1402                         assert(0);
1403                 }
1404         }
1405 }
1406 
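/* Map a constant buffer for CPU reads (used when copying push constants). For
 * a resource-backed UBO this mmaps the BO, flushes any GPU writer, and waits
 * for it; user buffers are returned directly. */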
1407 static const void *
1408 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1409                                  struct panfrost_constant_buffer *buf,
1410                                  unsigned index)
1411 {
1412         struct pipe_constant_buffer *cb = &buf->cb[index];
1413         struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1414 
1415         if (rsrc) {
1416                 panfrost_bo_mmap(rsrc->image.data.bo);
1417                 panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1418                 panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);
1419 
1420                 return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
1421         } else if (cb->user_buffer) {
1422                 return cb->user_buffer + cb->buffer_offset;
1423         } else
1424                 unreachable("No constant buffer");
1425 }
1426 
1427 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
1428  * implemented with buffer descriptors in the resource table, sized in terms of
1429  * bytes. On Bifrost and older, UBOs have a special uniform buffer data
1430  * structure, sized in terms of 16-byte entries.
1431  */
1432 static void
1433 panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size)
1434 {
1435 #if PAN_ARCH >= 9
1436         struct mali_buffer_packed *out = base;
1437 
1438         pan_pack(out + index, BUFFER, cfg) {
1439                 cfg.size = size;
1440                 cfg.address = address;
1441         }
1442 #else
1443         struct mali_uniform_buffer_packed *out = base;
1444 
1445         /* Issue (57) for the ARB_uniform_buffer_object spec says that
1446          * the buffer can be larger than the uniform data inside it,
1447          * so clamp the UBO size to what the hardware supports. */
1448 
1449         pan_pack(out + index, UNIFORM_BUFFER, cfg) {
1450                 cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
1451                 cfg.pointer = address;
1452         }
1453 #endif
1454 }
1455 
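/* Emit the uniform buffer descriptors for a shader stage: sysvals are uploaded
 * and appended as the final UBO, the API-visible UBOs follow at their own
 * indices, and any words the compiler asked to be pushed are copied out last.
 * Returns a pointer to the UBO descriptor array. */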
1456 static mali_ptr
1457 panfrost_emit_const_buf(struct panfrost_batch *batch,
1458                         enum pipe_shader_type stage,
1459                         unsigned *buffer_count,
1460                         mali_ptr *push_constants,
1461                         unsigned *pushed_words)
1462 {
1463         struct panfrost_context *ctx = batch->ctx;
1464         struct panfrost_shader_variants *all = ctx->shader[stage];
1465 
1466         if (!all)
1467                 return 0;
1468 
1469         struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1470         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1471 
1472         /* Allocate room for the sysval and the uniforms */
1473         size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
1474         struct panfrost_ptr transfer =
1475                 pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1476 
1477         /* Upload sysvals requested by the shader */
1478         panfrost_upload_sysvals(batch, &transfer, ss, stage);
1479 
1480         /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1481         struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
1482         unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1483         unsigned sysval_ubo = sys_size ? ubo_count : ~0;
1484         struct panfrost_ptr ubos = { 0 };
1485 
1486 #if PAN_ARCH >= 9
1487         ubos = pan_pool_alloc_desc_array(&batch->pool.base,
1488                                          ubo_count + 1,
1489                                          BUFFER);
1490 #else
1491         ubos = pan_pool_alloc_desc_array(&batch->pool.base,
1492                                          ubo_count + 1,
1493                                          UNIFORM_BUFFER);
1494 #endif
1495 
1496         if (buffer_count)
1497                 *buffer_count = ubo_count + (sys_size ? 1 : 0);
1498 
1499         /* Upload sysval as a final UBO */
1500 
1501         if (sys_size)
1502                 panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1503 
1504         /* The rest are honest-to-goodness UBOs */
1505 
1506         u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1507                 size_t usz = buf->cb[ubo].buffer_size;
1508                 mali_ptr address = 0;
1509 
1510                 if (usz > 0) {
1511                         address = panfrost_map_constant_buffer_gpu(batch,
1512                                         stage, buf, ubo);
1513                 }
1514 
1515                 panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1516         }
1517 
1518         if (pushed_words)
1519                 *pushed_words = ss->info.push.count;
1520 
1521         if (ss->info.push.count == 0)
1522                 return ubos.gpu;
1523 
1524         /* Copy push constants required by the shader */
1525         struct panfrost_ptr push_transfer =
1526                 pan_pool_alloc_aligned(&batch->pool.base,
1527                                        ss->info.push.count * 4, 16);
1528 
1529         uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
1530         *push_constants = push_transfer.gpu;
1531 
1532         for (unsigned i = 0; i < ss->info.push.count; ++i) {
1533                 struct panfrost_ubo_word src = ss->info.push.words[i];
1534 
1535                 if (src.ubo == sysval_ubo) {
1536                         unsigned sysval_idx = src.offset / 16;
1537                         unsigned sysval_comp = (src.offset % 16) / 4;
1538                         unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);
1539                         mali_ptr ptr = push_transfer.gpu + (4 * i);
1540 
1541                         switch (sysval_type) {
1542                         case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1543                                 switch (sysval_comp) {
1544                                 case 0:
1545                                         batch->ctx->first_vertex_sysval_ptr = ptr;
1546                                         break;
1547                                 case 1:
1548                                         batch->ctx->base_vertex_sysval_ptr = ptr;
1549                                         break;
1550                                 case 2:
1551                                         batch->ctx->base_instance_sysval_ptr = ptr;
1552                                         break;
1553                                 case 3:
1554                                         /* Spurious (Midgard doesn't pack) */
1555                                         break;
1556                                 default:
1557                                         unreachable("Invalid vertex/instance offset component\n");
1558                                 }
1559                                 break;
1560 
1561                         case PAN_SYSVAL_NUM_WORK_GROUPS:
1562                                 batch->num_wg_sysval[sysval_comp] = ptr;
1563                                 break;
1564 
1565                         default:
1566                                 break;
1567                         }
1568                 }
1569                 /* Map the UBO; this should be cheap. However, this reads
1570                  * from write-combine memory, which is _very_ slow. It might pay
1571                  * off to upload sysvals to a staging buffer on the CPU, on the
1572                  * assumption that sysvals will get pushed (TODO). */
1573 
1574                 const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
1575                         panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1576 
1577                 /* TODO: Is there any benefit to combining ranges? */
1578                 memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
1579         }
1580 
1581         return ubos.gpu;
1582 }
1583 
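/* Emit the LOCAL_STORAGE descriptor for a compute dispatch, allocating
 * thread-local storage (spill memory) and workgroup-local storage sized for
 * the launched grid as required by the shader. */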
1584 static mali_ptr
1585 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1586                             const struct pipe_grid_info *grid)
1587 {
1588         struct panfrost_context *ctx = batch->ctx;
1589         struct panfrost_device *dev = pan_device(ctx->base.screen);
1590         struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1591         struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1592         struct panfrost_ptr t =
1593                 pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1594 
1595         struct pan_tls_info info = {
1596                 .tls.size = ss->info.tls_size,
1597                 .wls.size = ss->info.wls_size,
1598                 .wls.dim.x = grid->grid[0],
1599                 .wls.dim.y = grid->grid[1],
1600                 .wls.dim.z = grid->grid[2],
1601         };
1602 
1603         if (ss->info.tls_size) {
1604                 struct panfrost_bo *bo =
1605                         panfrost_batch_get_scratchpad(batch,
1606                                                       ss->info.tls_size,
1607                                                       dev->thread_tls_alloc,
1608                                                       dev->core_id_range);
1609                 info.tls.ptr = bo->ptr.gpu;
1610         }
1611 
1612         if (ss->info.wls_size) {
1613                 unsigned size =
1614                         pan_wls_adjust_size(info.wls.size) *
1615                         pan_wls_instances(&info.wls.dim) *
1616                         dev->core_id_range;
1617 
1618                 struct panfrost_bo *bo =
1619                         panfrost_batch_get_shared_memory(batch, size, 1);
1620 
1621                 info.wls.ptr = bo->ptr.gpu;
1622         }
1623 
1624         GENX(pan_emit_tls)(&info, t.cpu);
1625         return t.gpu;
1626 }
1627 
1628 #if PAN_ARCH <= 5
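/* On Midgard, texture descriptors are referenced indirectly: the table passed
 * to the shader holds pointers to prebuilt descriptors, so we only track the
 * resource and descriptor BOs and return the descriptor's GPU address. */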
1629 static mali_ptr
1630 panfrost_get_tex_desc(struct panfrost_batch *batch,
1631                       enum pipe_shader_type st,
1632                       struct panfrost_sampler_view *view)
1633 {
1634         if (!view)
1635                 return (mali_ptr) 0;
1636 
1637         struct pipe_sampler_view *pview = &view->base;
1638         struct panfrost_resource *rsrc = pan_resource(pview->texture);
1639 
1640         panfrost_batch_read_rsrc(batch, rsrc, st);
1641         panfrost_batch_add_bo(batch, view->state.bo, st);
1642 
1643         return view->state.gpu;
1644 }
1645 #endif
1646 
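/* Build the hardware texture descriptor and payload for a sampler view,
 * handling format quirks: stencil-only views of Z32_S8X24 are redirected to
 * the separate stencil resource, and RGTC is faked as RGBA8 on GPUs without
 * native support. */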
1647 static void
1648 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1649                                 struct pipe_context *pctx,
1650                                 struct pipe_resource *texture)
1651 {
1652         struct panfrost_device *device = pan_device(pctx->screen);
1653         struct panfrost_context *ctx = pan_context(pctx);
1654         struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1655         enum pipe_format format = so->base.format;
1656         assert(prsrc->image.data.bo);
1657 
1658         /* Format to access the stencil/depth portion of a Z32_S8 texture */
1659         if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1660                 assert(prsrc->separate_stencil);
1661                 texture = &prsrc->separate_stencil->base;
1662                 prsrc = (struct panfrost_resource *)texture;
1663                 format = texture->format;
1664         } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1665                 format = PIPE_FORMAT_Z32_FLOAT;
1666         }
1667 
1668         const struct util_format_description *desc = util_format_description(format);
1669 
1670         bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);
1671 
1672         if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {
1673                 if (desc->is_snorm)
1674                         format = PIPE_FORMAT_R8G8B8A8_SNORM;
1675                 else
1676                         format = PIPE_FORMAT_R8G8B8A8_UNORM;
1677                 desc = util_format_description(format);
1678         }
1679 
1680         so->texture_bo = prsrc->image.data.bo->ptr.gpu;
1681         so->modifier = prsrc->image.layout.modifier;
1682 
1683         /* MSAA is only supported for 2D and 2D array textures */
1684 
1685         assert(texture->nr_samples <= 1 ||
1686                so->base.target == PIPE_TEXTURE_2D ||
1687                so->base.target == PIPE_TEXTURE_2D_ARRAY);
1688 
1689         enum mali_texture_dimension type =
1690                 panfrost_translate_texture_dimension(so->base.target);
1691 
1692         bool is_buffer = (so->base.target == PIPE_BUFFER);
1693 
1694         unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1695         unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1696         unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1697         unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1698         unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1699         unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /
1700                             util_format_get_blocksize(format);
1701 
1702         if (so->base.target == PIPE_TEXTURE_3D) {
1703                 first_layer /= prsrc->image.layout.depth;
1704                 last_layer /= prsrc->image.layout.depth;
1705                 assert(!first_layer && !last_layer);
1706         }
1707 
1708         struct pan_image_view iview = {
1709                 .format = format,
1710                 .dim = type,
1711                 .first_level = first_level,
1712                 .last_level = last_level,
1713                 .first_layer = first_layer,
1714                 .last_layer = last_layer,
1715                 .swizzle = {
1716                         so->base.swizzle_r,
1717                         so->base.swizzle_g,
1718                         so->base.swizzle_b,
1719                         so->base.swizzle_a,
1720                 },
1721                 .image = &prsrc->image,
1722 
1723                 .buf.offset = buf_offset,
1724                 .buf.size = buf_size,
1725         };
1726 
1727         unsigned size =
1728                 (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1729                 GENX(panfrost_estimate_texture_payload_size)(&iview);
1730 
1731         struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1732         struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1733         so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1734 
1735         void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1736 
1737         if (PAN_ARCH <= 5) {
1738                 payload.cpu += pan_size(TEXTURE);
1739                 payload.gpu += pan_size(TEXTURE);
1740         }
1741 
1742         GENX(panfrost_new_texture)(device, &iview, tex, &payload);
1743 }
1744 
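/* Recreate the texture descriptor if the backing BO or layout modifier has
 * changed since the view was created. */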
1745 static void
1746 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1747                              struct pipe_context *pctx)
1748 {
1749         struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1750         if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||
1751             view->modifier != rsrc->image.layout.modifier) {
1752                 panfrost_bo_unreference(view->state.bo);
1753                 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1754         }
1755 }
1756 
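/* Upload the texture descriptor table for a stage. On Bifrost and newer, the
 * packed descriptors are copied into a contiguous array; on Midgard, we upload
 * an array of pointers to the per-view descriptors instead. */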
1757 static mali_ptr
1758 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1759                                   enum pipe_shader_type stage)
1760 {
1761         struct panfrost_context *ctx = batch->ctx;
1762 
1763         if (!ctx->sampler_view_count[stage])
1764                 return 0;
1765 
1766 #if PAN_ARCH >= 6
1767         struct panfrost_ptr T =
1768                 pan_pool_alloc_desc_array(&batch->pool.base,
1769                                           ctx->sampler_view_count[stage],
1770                                           TEXTURE);
1771         struct mali_texture_packed *out =
1772                 (struct mali_texture_packed *) T.cpu;
1773 
1774         for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1775                 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1776 
1777                 if (!view) {
1778                         memset(&out[i], 0, sizeof(out[i]));
1779                         continue;
1780                 }
1781 
1782                 struct pipe_sampler_view *pview = &view->base;
1783                 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1784 
1785                 panfrost_update_sampler_view(view, &ctx->base);
1786                 out[i] = view->bifrost_descriptor;
1787 
1788                 panfrost_batch_read_rsrc(batch, rsrc, stage);
1789                 panfrost_batch_add_bo(batch, view->state.bo, stage);
1790         }
1791 
1792         return T.gpu;
1793 #else
1794         uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1795 
1796         for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1797                 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1798 
1799                 if (!view) {
1800                         trampolines[i] = 0;
1801                         continue;
1802                 }
1803 
1804                 panfrost_update_sampler_view(view, &ctx->base);
1805 
1806                 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1807         }
1808 
1809         return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1810                                        sizeof(uint64_t) *
1811                                        ctx->sampler_view_count[stage],
1812                                        sizeof(uint64_t));
1813 #endif
1814 }
1815 
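/* Upload the sampler descriptor table for a stage, copying the prepacked
 * hardware words from each bound sampler state; NULL slots become zeroed
 * descriptors. */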
1816 static mali_ptr
1817 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1818                                   enum pipe_shader_type stage)
1819 {
1820         struct panfrost_context *ctx = batch->ctx;
1821 
1822         if (!ctx->sampler_count[stage])
1823                 return 0;
1824 
1825         struct panfrost_ptr T =
1826                 pan_pool_alloc_desc_array(&batch->pool.base,
1827                                           ctx->sampler_count[stage],
1828                                           SAMPLER);
1829         struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu;
1830 
1831         for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1832                 struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1833 
1834                 out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1835         }
1836 
1837         return T.gpu;
1838 }
1839 
1840 #if PAN_ARCH <= 7
1841 /* Packs all image attribute descs and attribute buffer descs.
1842  * `first_image_buf_index` must be the index of the first image attribute buffer descriptor.
1843  */
1844 static void
1845 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1846                    struct mali_attribute_packed *attribs, unsigned first_buf)
1847 {
1848         struct panfrost_device *dev = pan_device(ctx->base.screen);
1849         unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1850 
1851         for (unsigned i = 0; i < last_bit; ++i) {
1852                 enum pipe_format format = ctx->images[shader][i].format;
1853 
1854                 pan_pack(attribs + i, ATTRIBUTE, cfg) {
1855                         /* Continuation record means 2 buffers per image */
1856                         cfg.buffer_index = first_buf + (i * 2);
1857                         cfg.offset_enable = (PAN_ARCH <= 5);
1858                         cfg.format = dev->formats[format].hw;
1859                 }
1860         }
1861 }
1862 
1863 static enum mali_attribute_type
1864 pan_modifier_to_attr_type(uint64_t modifier)
1865 {
1866         switch (modifier) {
1867         case DRM_FORMAT_MOD_LINEAR:
1868                 return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1869         case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1870                 return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1871         default:
1872                 unreachable("Invalid modifier for attribute record");
1873         }
1874 }
1875 
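/* Emit the attribute buffer records backing image bindings. Each image takes
 * two records: the base buffer and a 3D continuation carrying the image
 * dimensions and strides; unused bindings get zeroed records. */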
1876 static void
1877 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1878                 struct mali_attribute_buffer_packed *bufs,
1879                 unsigned first_image_buf_index)
1880 {
1881         struct panfrost_context *ctx = batch->ctx;
1882         unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1883 
1884         for (unsigned i = 0; i < last_bit; ++i) {
1885                 struct pipe_image_view *image = &ctx->images[shader][i];
1886 
1887                 if (!(ctx->image_mask[shader] & (1 << i)) ||
1888                     !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1889                         /* Unused image bindings */
1890                         pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);
1891                         pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);
1892                         continue;
1893                 }
1894 
1895                 struct panfrost_resource *rsrc = pan_resource(image->resource);
1896 
1897                 /* TODO: MSAA */
1898                 assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");
1899 
1900                 bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1901                 bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1902 
1903                 unsigned offset = is_buffer ? image->u.buf.offset :
1904                         panfrost_texture_offset(&rsrc->image.layout,
1905                                                 image->u.tex.level,
1906                                                 is_3d ? 0 : image->u.tex.first_layer,
1907                                                 is_3d ? image->u.tex.first_layer : 0);
1908 
1909                 panfrost_track_image_access(batch, shader, image);
1910 
1911                 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1912                         cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1913                         cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;
1914                         cfg.stride = util_format_get_blocksize(image->format);
1915                         cfg.size = rsrc->image.data.bo->size - offset;
1916                 }
1917 
1918                 if (is_buffer) {
1919                         pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1920                                 cfg.s_dimension = rsrc->base.width0 /
1921                                         util_format_get_blocksize(image->format);
1922                                 cfg.t_dimension = cfg.r_dimension = 1;
1923                         }
1924 
1925                         continue;
1926                 }
1927 
1928                 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1929                         unsigned level = image->u.tex.level;
1930 
1931                         cfg.s_dimension = u_minify(rsrc->base.width0, level);
1932                         cfg.t_dimension = u_minify(rsrc->base.height0, level);
1933                         cfg.r_dimension = is_3d ?
1934                                 u_minify(rsrc->base.depth0, level) :
1935                                 image->u.tex.last_layer - image->u.tex.first_layer + 1;
1936 
1937                         cfg.row_stride =
1938                                 rsrc->image.layout.slices[level].row_stride;
1939 
1940                         if (rsrc->base.target != PIPE_TEXTURE_2D) {
1941                                 cfg.slice_stride =
1942                                         panfrost_get_layer_stride(&rsrc->image.layout,
1943                                                                   level);
1944                         }
1945                 }
1946         }
1947 }
1948 
1949 static mali_ptr
1950 panfrost_emit_image_attribs(struct panfrost_batch *batch,
1951                             mali_ptr *buffers,
1952                             enum pipe_shader_type type)
1953 {
1954         struct panfrost_context *ctx = batch->ctx;
1955         struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);
1956 
1957         if (!shader->info.attribute_count) {
1958                 *buffers = 0;
1959                 return 0;
1960         }
1961 
1962         /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1963         unsigned attr_count = shader->info.attribute_count;
1964         unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1965 
1966         struct panfrost_ptr bufs =
1967                 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1968 
1969         struct panfrost_ptr attribs =
1970                 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1971 
1972         emit_image_attribs(ctx, type, attribs.cpu, 0);
1973         emit_image_bufs(batch, type, bufs.cpu, 0);
1974 
1975         /* We need an empty attrib buf to stop the prefetching on Bifrost */
1976 #if PAN_ARCH >= 6
1977         pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1978                  ATTRIBUTE_BUFFER, cfg);
1979 #endif
1980 
1981         *buffers = bufs.gpu;
1982         return attribs.gpu;
1983 }
1984 
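/* Emit the vertex attribute and attribute buffer descriptors for a draw,
 * covering vertex buffers (with instancing divisors), Midgard's special
 * gl_VertexID/gl_InstanceID buffers, and any images bound to the vertex
 * stage. Returns the attribute array and writes the buffer array address
 * through *buffers. */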
1985 static mali_ptr
1986 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1987                           mali_ptr *buffers)
1988 {
1989         struct panfrost_context *ctx = batch->ctx;
1990         struct panfrost_vertex_state *so = ctx->vertex;
1991         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1992         bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
1993         uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1994         unsigned nr_images = util_last_bit(image_mask);
1995 
1996         /* Worst case: everything is NPOT, which is only possible if instancing
1997          * is enabled. Otherwise a single record is guaranteed.
1998          * Also, we allocate more memory than needed here if either instancing
1999          * is enabled or images are present; this can be improved. */
2000         unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
2001         unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
2002                            (PAN_ARCH >= 6 ? 1 : 0);
2003 
2004         unsigned count = vs->info.attribute_count;
2005 
2006         if (vs->xfb)
2007                 count = MAX2(count, vs->xfb->info.attribute_count);
2008 
2009 #if PAN_ARCH <= 5
2010         /* Midgard needs vertexid/instanceid handled specially */
2011         bool special_vbufs = count >= PAN_VERTEX_ID;
2012 
2013         if (special_vbufs)
2014                 nr_bufs += 2;
2015 #endif
2016 
2017         if (!nr_bufs) {
2018                 *buffers = 0;
2019                 return 0;
2020         }
2021 
2022         struct panfrost_ptr S =
2023                 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
2024                                           ATTRIBUTE_BUFFER);
2025         struct panfrost_ptr T =
2026                 pan_pool_alloc_desc_array(&batch->pool.base, count,
2027                                           ATTRIBUTE);
2028 
2029         struct mali_attribute_buffer_packed *bufs =
2030                 (struct mali_attribute_buffer_packed *) S.cpu;
2031 
2032         struct mali_attribute_packed *out =
2033                 (struct mali_attribute_packed *) T.cpu;
2034 
2035         unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
2036         unsigned k = 0;
2037 
2038         for (unsigned i = 0; i < so->nr_bufs; ++i) {
2039                 unsigned vbi = so->buffers[i].vbi;
2040                 unsigned divisor = so->buffers[i].divisor;
2041                 attrib_to_buffer[i] = k;
2042 
2043                 if (!(ctx->vb_mask & (1 << vbi)))
2044                         continue;
2045 
2046                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2047                 struct panfrost_resource *rsrc;
2048 
2049                 rsrc = pan_resource(buf->buffer.resource);
2050                 if (!rsrc)
2051                         continue;
2052 
2053                 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2054 
2055                 /* Mask off lower bits, see offset fixup below */
2056                 mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;
2057                 mali_ptr addr = raw_addr & ~63;
2058 
2059                 /* Since we advanced the base pointer, we shrink the buffer
2060                  * size, but add the offset we subtracted */
2061                 unsigned size = rsrc->base.width0 + (raw_addr - addr)
2062                         - buf->buffer_offset;
2063 
2064                 /* When there is a divisor, the hardware-level divisor is
2065                  * the product of the instance divisor and the padded count */
2066                 unsigned stride = buf->stride;
2067 
2068                 if (ctx->indirect_draw) {
2069                         /* We allocated 2 records for each attribute buffer */
2070                         assert((k & 1) == 0);
2071 
2072                         /* With indirect draws we can't guess the vertex_count.
2073                          * Pre-set the address, stride and size fields; the
2074                          * compute shader does the rest.
2075                          */
2076                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2077                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D;
2078                                 cfg.pointer = addr;
2079                                 cfg.stride = stride;
2080                                 cfg.size = size;
2081                         }
2082 
2083                         /* We store the unmodified divisor in the continuation
2084                          * slot so the compute shader can retrieve it.
2085                          */
2086                         pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2087                                 cfg.divisor = divisor;
2088                         }
2089 
2090                         k += 2;
2091                         continue;
2092                 }
2093 
2094                 unsigned hw_divisor = ctx->padded_count * divisor;
2095 
2096                 if (ctx->instance_count <= 1) {
2097                         /* With one instance, per-instance data is constant; use a zero stride */
2098                         if (divisor)
2099                                 stride = 0;
2100 
2101                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2102                                 cfg.pointer = addr;
2103                                 cfg.stride = stride;
2104                                 cfg.size = size;
2105                         }
2106                 } else if (!divisor) {
2107                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2108                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
2109                                 cfg.pointer = addr;
2110                                 cfg.stride = stride;
2111                                 cfg.size = size;
2112                                 cfg.divisor = ctx->padded_count;
2113                         }
2114                 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
2115                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2116                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
2117                                 cfg.pointer = addr;
2118                                 cfg.stride = stride;
2119                                 cfg.size = size;
2120                                 cfg.divisor_r = __builtin_ctz(hw_divisor);
2121                         }
2122 
2123                 } else {
2124                         unsigned shift = 0, extra_flags = 0;
2125 
2126                         unsigned magic_divisor =
2127                                 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
2128 
2129                         /* Records with continuations must be aligned */
2130                         k = ALIGN_POT(k, 2);
2131                         attrib_to_buffer[i] = k;
2132 
2133                         pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2134                                 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
2135                                 cfg.pointer = addr;
2136                                 cfg.stride = stride;
2137                                 cfg.size = size;
2138 
2139                                 cfg.divisor_r = shift;
2140                                 cfg.divisor_e = extra_flags;
2141                         }
2142 
2143                         pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2144                                 cfg.divisor_numerator = magic_divisor;
2145                                 cfg.divisor = divisor;
2146                         }
2147 
2148                         ++k;
2149                 }
2150 
2151                 ++k;
2152         }
2153 
2154 #if PAN_ARCH <= 5
2155         /* Add special gl_VertexID/gl_InstanceID buffers */
2156         if (special_vbufs) {
2157                 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2158 
2159                 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2160                         cfg.buffer_index = k++;
2161                         cfg.format = so->formats[PAN_VERTEX_ID];
2162                 }
2163 
2164                 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2165 
2166                 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2167                         cfg.buffer_index = k++;
2168                         cfg.format = so->formats[PAN_INSTANCE_ID];
2169                 }
2170         }
2171 #endif
2172 
2173         if (nr_images) {
2174                 k = ALIGN_POT(k, 2);
2175                 emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2176                 emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2177                 k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2178         }
2179 
2180 #if PAN_ARCH >= 6
2181         /* We need an empty attrib buf to stop the prefetching on Bifrost */
2182         pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
2183 #endif
2184 
2185         /* Attribute addresses require 64-byte alignment, so let:
2186          *
2187          *      base' = base & ~63 = base - (base & 63)
2188          *      offset' = offset + (base & 63)
2189          *
2190          * Since base' + offset' = base + offset, these are equivalent
2191          * addressing modes and now base is 64-byte aligned.
2192          */
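        /* For example, base = 0x1007 and offset = 4 give base' = 0x1000 and
         * offset' = 11; both pairs address the same byte. */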
2193 
2194         /* While these are usually equal, they are not required to be. In some
2195          * cases, u_blitter passes too high a value for num_elements.
2196          */
2197         assert(vs->info.attributes_read_count <= so->num_elements);
2198 
2199         for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2200                 unsigned vbi = so->pipe[i].vertex_buffer_index;
2201                 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2202 
2203                 /* BOs are aligned; just fixup for buffer_offset */
2204                 signed src_offset = so->pipe[i].src_offset;
2205                 src_offset += (buf->buffer_offset & 63);
2206 
2207                 /* Base instance offset */
2208                 if (ctx->base_instance && so->pipe[i].instance_divisor) {
2209                         src_offset += (ctx->base_instance * buf->stride) /
2210                                       so->pipe[i].instance_divisor;
2211                 }
2212 
2213                 /* Also, somewhat obscurely, per-instance data needs to be
2214                  * offset in response to a delayed start in an indexed draw */
2215 
2216                 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2217                         src_offset -= buf->stride * ctx->offset_start;
2218 
2219                 pan_pack(out + i, ATTRIBUTE, cfg) {
2220                         cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2221                         cfg.format = so->formats[i];
2222                         cfg.offset = src_offset;
2223                 }
2224         }
2225 
2226         *buffers = S.gpu;
2227         return T.gpu;
2228 }
2229 
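/* Allocate a transient varying buffer of stride * count bytes and pack the
 * corresponding ATTRIBUTE_BUFFER record. For indirect draws the pointer is
 * left at zero, since the vertex count is not known on the CPU. */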
2230 static mali_ptr
2231 panfrost_emit_varyings(struct panfrost_batch *batch,
2232                 struct mali_attribute_buffer_packed *slot,
2233                 unsigned stride, unsigned count)
2234 {
2235         unsigned size = stride * count;
2236         mali_ptr ptr =
2237                 batch->ctx->indirect_draw ? 0 :
2238                 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2239 
2240         pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2241                 cfg.stride = stride;
2242                 cfg.size = size;
2243                 cfg.pointer = ptr;
2244         }
2245 
2246         return ptr;
2247 }
2248 
2249 /* Given a varying, figure out which index it corresponds to */
2250 
2251 static inline unsigned
2252 pan_varying_index(unsigned present, enum pan_special_varying v)
2253 {
2254         return util_bitcount(present & BITFIELD_MASK(v));
2255 }
2256 
2257 /* Determines which varying buffers are required */
2258 
2259 static inline unsigned
2260 pan_varying_present(const struct panfrost_device *dev,
2261                     struct pan_shader_info *producer,
2262                     struct pan_shader_info *consumer,
2263                     uint16_t point_coord_mask)
2264 {
2265         /* At the moment we always emit general and position buffers. Not
2266          * strictly necessary but usually harmless */
2267 
2268         unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2269 
2270         /* Enable special buffers by the shader info */
2271 
2272         if (producer->vs.writes_point_size)
2273                 present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2274 
2275 #if PAN_ARCH <= 5
2276         /* On Midgard, these exist as real varyings. Later architectures use
2277          * LD_VAR_SPECIAL reads instead. */
2278 
2279         if (consumer->fs.reads_point_coord)
2280                 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2281 
2282         if (consumer->fs.reads_face)
2283                 present |= BITFIELD_BIT(PAN_VARY_FACE);
2284 
2285         if (consumer->fs.reads_frag_coord)
2286                 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2287 
2288         /* Also, if we have a point sprite, we need a point coord buffer */
2289 
2290         for (unsigned i = 0; i < consumer->varyings.input_count; i++)  {
2291                 gl_varying_slot loc = consumer->varyings.input[i].location;
2292 
2293                 if (util_varying_is_point_coord(loc, point_coord_mask))
2294                         present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2295         }
2296 #endif
2297 
2298         return present;
2299 }
2300 
2301 /* Emitters for varying records */
2302 
2303 static void
2304 pan_emit_vary(const struct panfrost_device *dev,
2305               struct mali_attribute_packed *out,
2306               unsigned buffer_index,
2307               mali_pixel_format format, unsigned offset)
2308 {
2309         pan_pack(out, ATTRIBUTE, cfg) {
2310                 cfg.buffer_index = buffer_index;
2311                 cfg.offset_enable = (PAN_ARCH <= 5);
2312                 cfg.format = format;
2313                 cfg.offset = offset;
2314         }
2315 }
2316 
2317 /* Special records */
2318 
2319 static const struct {
2320        unsigned components;
2321        enum mali_format format;
2322 } pan_varying_formats[PAN_VARY_MAX] = {
2323         [PAN_VARY_POSITION]     = { 4, MALI_SNAP_4 },
2324         [PAN_VARY_PSIZ]         = { 1, MALI_R16F },
2325         [PAN_VARY_PNTCOORD]     = { 1, MALI_R16F },
2326         [PAN_VARY_FACE]         = { 1, MALI_R32I },
2327         [PAN_VARY_FRAGCOORD]    = { 4, MALI_RGBA32F },
2328 };
2329 
2330 static mali_pixel_format
2331 pan_special_format(const struct panfrost_device *dev,
2332                 enum pan_special_varying buf)
2333 {
2334         assert(buf < PAN_VARY_MAX);
2335         mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2336 
2337 #if PAN_ARCH <= 6
2338         unsigned nr = pan_varying_formats[buf].components;
2339         format |= panfrost_get_default_swizzle(nr);
2340 #endif
2341 
2342         return format;
2343 }
2344 
2345 static void
2346 pan_emit_vary_special(const struct panfrost_device *dev,
2347                       struct mali_attribute_packed *out,
2348                       unsigned present, enum pan_special_varying buf)
2349 {
2350         pan_emit_vary(dev, out, pan_varying_index(present, buf),
2351                         pan_special_format(dev, buf), 0);
2352 }
2353 
2354 /* Negative indicates a varying is not found */
2355 
2356 static signed
2357 pan_find_vary(const struct pan_shader_varying *vary,
2358                 unsigned vary_count, unsigned loc)
2359 {
2360         for (unsigned i = 0; i < vary_count; ++i) {
2361                 if (vary[i].location == loc)
2362                         return i;
2363         }
2364 
2365         return -1;
2366 }
2367 
2368 /* Assign varying locations for the general buffer. Returns the calculated
2369  * per-vertex stride, and outputs offsets into the passed array. Negative
2370  * offset indicates a varying is not used. */
2371 
2372 static unsigned
2373 pan_assign_varyings(const struct panfrost_device *dev,
2374                     struct pan_shader_info *producer,
2375                     struct pan_shader_info *consumer,
2376                     signed *offsets)
2377 {
2378         unsigned producer_count = producer->varyings.output_count;
2379         unsigned consumer_count = consumer->varyings.input_count;
2380 
2381         const struct pan_shader_varying *producer_vars = producer->varyings.output;
2382         const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2383 
2384         unsigned stride = 0;
2385 
2386         for (unsigned i = 0; i < producer_count; ++i) {
2387                 signed loc = pan_find_vary(consumer_vars, consumer_count,
2388                                 producer_vars[i].location);
2389 
2390                 if (loc >= 0) {
2391                         offsets[i] = stride;
2392 
2393                         enum pipe_format format = consumer_vars[loc].format;
2394                         stride += util_format_get_blocksize(format);
2395                 } else {
2396                         offsets[i] = -1;
2397                 }
2398         }
2399 
2400         return stride;
2401 }
2402 
2403 /* Emitter for a single varying (attribute) descriptor */
2404 
2405 static void
2406 panfrost_emit_varying(const struct panfrost_device *dev,
2407                       struct mali_attribute_packed *out,
2408                       const struct pan_shader_varying varying,
2409                       enum pipe_format pipe_format,
2410                       unsigned present,
2411                       uint16_t point_sprite_mask,
2412                       signed offset,
2413                       enum pan_special_varying pos_varying)
2414 {
2415         /* Note: varying.format != pipe_format in some obscure cases due to a
2416          * limitation of the NIR linker. This should be fixed in the future to
2417          * eliminate the additional lookups. See:
2418          * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2419          */
2420         gl_varying_slot loc = varying.location;
2421         mali_pixel_format format = dev->formats[pipe_format].hw;
2422 
2423         if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2424                 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2425         } else if (loc == VARYING_SLOT_POS) {
2426                 pan_emit_vary_special(dev, out, present, pos_varying);
2427         } else if (loc == VARYING_SLOT_PSIZ) {
2428                 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2429         } else if (loc == VARYING_SLOT_FACE) {
2430                 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2431         } else if (offset < 0) {
2432                 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2433         } else {
2434                 STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2435                 pan_emit_vary(dev, out, 0, format, offset);
2436         }
2437 }
2438 
2439 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2440  * rather than draw time (under good conditions). */
2441 
2442 static void
2443 panfrost_emit_varying_descs(
2444                 struct panfrost_pool *pool,
2445                 struct panfrost_shader_state *producer,
2446                 struct panfrost_shader_state *consumer,
2447                 uint16_t point_coord_mask,
2448                 struct pan_linkage *out)
2449 {
2450         struct panfrost_device *dev = pool->base.dev;
2451         unsigned producer_count = producer->info.varyings.output_count;
2452         unsigned consumer_count = consumer->info.varyings.input_count;
2453 
2454         /* Offsets within the general varying buffer, indexed by location */
2455         signed offsets[PAN_MAX_VARYINGS];
2456         assert(producer_count <= ARRAY_SIZE(offsets));
2457         assert(consumer_count <= ARRAY_SIZE(offsets));
2458 
2459         /* Allocate enough descriptors for both shader stages */
2460         struct panfrost_ptr T =
2461                 pan_pool_alloc_desc_array(&pool->base,
2462                                           producer_count + consumer_count,
2463                                           ATTRIBUTE);
2464 
2465         /* Take a reference if we're being put on the CSO */
2466         if (!pool->owned) {
2467                 out->bo = pool->transient_bo;
2468                 panfrost_bo_reference(out->bo);
2469         }
2470 
2471         struct mali_attribute_packed *descs = T.cpu;
2472         out->producer = producer_count ? T.gpu : 0;
2473         out->consumer = consumer_count ? T.gpu +
2474                 (pan_size(ATTRIBUTE) * producer_count) : 0;
2475 
2476         /* Lay out the varyings. Must use producer to lay out, in order to
2477          * respect transform feedback precisions. */
2478         out->present = pan_varying_present(dev, &producer->info,
2479                         &consumer->info, point_coord_mask);
2480 
2481         out->stride = pan_assign_varyings(dev, &producer->info,
2482                         &consumer->info, offsets);
2483 
2484         for (unsigned i = 0; i < producer_count; ++i) {
2485                 signed j = pan_find_vary(consumer->info.varyings.input,
2486                                 consumer->info.varyings.input_count,
2487                                 producer->info.varyings.output[i].location);
2488 
2489                 enum pipe_format format = (j >= 0) ?
2490                         consumer->info.varyings.input[j].format :
2491                         producer->info.varyings.output[i].format;
2492 
2493                 panfrost_emit_varying(dev, descs + i,
2494                                 producer->info.varyings.output[i], format,
2495                                 out->present, 0, offsets[i], PAN_VARY_POSITION);
2496         }
2497 
2498         for (unsigned i = 0; i < consumer_count; ++i) {
2499                 signed j = pan_find_vary(producer->info.varyings.output,
2500                                 producer->info.varyings.output_count,
2501                                 consumer->info.varyings.input[i].location);
2502 
2503                 signed offset = (j >= 0) ? offsets[j] : -1;
2504 
2505                 panfrost_emit_varying(dev, descs + producer_count + i,
2506                                 consumer->info.varyings.input[i],
2507                                 consumer->info.varyings.input[i].format,
2508                                 out->present, point_coord_mask,
2509                                 offset, PAN_VARY_FRAGCOORD);
2510         }
2511 }
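/* Layout of the allocation made above: the producer's ATTRIBUTE descriptors
 * come first, immediately followed by the consumer's, in one contiguous
 * slab. For example, with 4 vertex outputs and 3 fragment inputs, 7
 * descriptors are allocated; out->producer points at the base and
 * out->consumer at base + 4 * pan_size(ATTRIBUTE).
 */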
2512 
2513 #if PAN_ARCH <= 5
2514 static void
2515 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2516                 unsigned present,
2517                 enum pan_special_varying v,
2518                 unsigned special)
2519 {
2520         if (present & BITFIELD_BIT(v)) {
2521                 unsigned idx = pan_varying_index(present, v);
2522 
2523                 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2524                         cfg.special = special;
2525                         cfg.type = 0;
2526                 }
2527         }
2528 }
2529 #endif
2530 
2531 static void
2532 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2533                                  unsigned vertex_count,
2534                                  mali_ptr *vs_attribs,
2535                                  mali_ptr *fs_attribs,
2536                                  mali_ptr *buffers,
2537                                  unsigned *buffer_count,
2538                                  mali_ptr *position,
2539                                  mali_ptr *psiz,
2540                                  bool point_coord_replace)
2541 {
2542         /* Load the shaders */
2543         struct panfrost_context *ctx = batch->ctx;
2544         struct panfrost_shader_state *vs, *fs;
2545 
2546         vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2547         fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2548 
2549         uint16_t point_coord_mask = 0;
2550 
2551 #if PAN_ARCH <= 5
2552         /* Point sprites are lowered on Bifrost and newer */
2553         if (point_coord_replace)
2554                 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2555 #endif
2556 
2557         /* In good conditions, we only need to link varyings once */
2558         bool prelink =
2559                 (point_coord_mask == 0) &&
2560                 !vs->info.separable &&
2561                 !fs->info.separable;
2562 
2563         /* Try to reduce copies */
2564         struct pan_linkage _linkage;
2565         struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2566 
2567         /* Emit ATTRIBUTE descriptors if needed */
2568         if (!prelink || vs->linkage.bo == NULL) {
2569                 struct panfrost_pool *pool =
2570                         prelink ? &ctx->descs : &batch->pool;
2571 
2572                 panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2573         }
2574 
2575         unsigned present = linkage->present, stride = linkage->stride;
2576         unsigned count = util_bitcount(present);
2577         struct panfrost_ptr T =
2578                 pan_pool_alloc_desc_array(&batch->pool.base,
2579                                           count + 1,
2580                                           ATTRIBUTE_BUFFER);
2581         struct mali_attribute_buffer_packed *varyings =
2582                 (struct mali_attribute_buffer_packed *) T.cpu;
2583 
2584         if (buffer_count)
2585                 *buffer_count = count;
2586 
2587 #if PAN_ARCH >= 6
2588         /* Suppress prefetch on Bifrost */
2589         memset(varyings + count, 0, sizeof(*varyings));
2590 #endif
2591 
2592         if (stride) {
2593                 panfrost_emit_varyings(batch,
2594                                 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2595                                 stride, vertex_count);
2596         } else {
2597                 /* The indirect draw code reads the stride field, so make
2598                  * sure that it is initialised */
2599                 memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2600                        sizeof(*varyings));
2601         }
2602 
2603         /* fp32 vec4 gl_Position */
2604         *position = panfrost_emit_varyings(batch,
2605                         &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2606                         sizeof(float) * 4, vertex_count);
2607 
2608         if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2609                 *psiz = panfrost_emit_varyings(batch,
2610                                 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2611                                 2, vertex_count);
2612         }
2613 
2614 #if PAN_ARCH <= 5
2615         pan_emit_special_input(varyings, present,
2616                         PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2617         pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2618                         MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2619         pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2620                         MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2621 #endif
2622 
2623         *buffers = T.gpu;
2624         *vs_attribs = linkage->producer;
2625         *fs_attribs = linkage->consumer;
2626 }
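/* Example of the buffer layout emitted above: with
 * present = GENERAL | POSITION | PSIZ, three ATTRIBUTE_BUFFER descriptors
 * are filled (the interleaved general buffer of `stride` bytes per vertex,
 * fp32 vec4 positions, and 2-byte point sizes), plus a zero-filled trailing
 * descriptor on Bifrost and newer to suppress prefetch. Each buffer's slot
 * presumably comes from pan_varying_index(), which compacts `present` so
 * only varyings that are actually in use occupy slots.
 */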
2627 
2628 /*
2629  * Emit jobs required for the rasterization pipeline. If there are side effects
2630  * from the vertex shader, these are handled ahead-of-time with a compute
2631  * shader. This function should not be called if rasterization is skipped.
2632  */
2633 static void
2634 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2635                                 const struct panfrost_ptr *vertex_job,
2636                                 const struct panfrost_ptr *tiler_job)
2637 {
2638         struct panfrost_context *ctx = batch->ctx;
2639 
2640         /* XXX - set job_barrier in case buffers get ping-ponged and we need to
2641          * enforce ordering; this has a perf hit! See
2642          * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations
2643          */
2644         unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2645                                            MALI_JOB_TYPE_VERTEX, true, false,
2646                                            ctx->indirect_draw ?
2647                                            batch->indirect_draw_job_id : 0,
2648                                            0, vertex_job, false);
2649 
2650         panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2651                          MALI_JOB_TYPE_TILER, false, false,
2652                          vertex, 0, tiler_job, false);
2653 }
2654 #endif
2655 
2656 static void
2657 emit_tls(struct panfrost_batch *batch)
2658 {
2659         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2660 
2661         /* Emitted with the FB descriptor on Midgard. */
2662         if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2663                 return;
2664 
2665         struct panfrost_bo *tls_bo =
2666                 batch->stack_size ?
2667                 panfrost_batch_get_scratchpad(batch,
2668                                               batch->stack_size,
2669                                               dev->thread_tls_alloc,
2670                                               dev->core_id_range):
2671                 NULL;
2672         struct pan_tls_info tls = {
2673                 .tls = {
2674                         .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2675                         .size = batch->stack_size,
2676                 },
2677         };
2678 
2679         assert(batch->tls.cpu);
2680         GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2681 }
2682 
2683 static void
2684 emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
2685 {
2686         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2687         struct panfrost_bo *tls_bo =
2688                 batch->stack_size ?
2689                 panfrost_batch_get_scratchpad(batch,
2690                                               batch->stack_size,
2691                                               dev->thread_tls_alloc,
2692                                               dev->core_id_range):
2693                 NULL;
2694         struct pan_tls_info tls = {
2695                 .tls = {
2696                         .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2697                         .size = batch->stack_size,
2698                 },
2699         };
2700 
2701         batch->framebuffer.gpu |=
2702                 GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx,
2703                                    batch->framebuffer.cpu);
2704 }
2705 
2706 /* Mark a surface as written */
2707 
2708 static void
2709 panfrost_initialize_surface(struct panfrost_batch *batch,
2710                             struct pipe_surface *surf)
2711 {
2712         if (surf) {
2713                 struct panfrost_resource *rsrc = pan_resource(surf->texture);
2714                 BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2715         }
2716 }
2717 
2718 /* Generate a fragment job. This should be called once per frame. (Usually,
2719  * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2720  */
2721 static mali_ptr
2722 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2723 {
2724         /* Mark the affected buffers as initialized, since we're writing to
2725          * them. Also, attach the surfaces being written to the batch. */
2726 
2727         struct pipe_framebuffer_state *fb = &batch->key;
2728 
2729         for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2730                 panfrost_initialize_surface(batch, fb->cbufs[i]);
2731 
2732         panfrost_initialize_surface(batch, fb->zsbuf);
2733 
2734         /* The passed tile coords can be out of range in some cases, so we need
2735          * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2736          * Theoretically we also need to clamp the coordinates positive, but we
2737          * avoid that edge case as all four values are unsigned. Also,
2738          * theoretically we could clamp the minima, but if that has to happen
2739          * the asserts would fail anyway (since the maxima would get clamped
2740          * and then be smaller than the minima). An edge case of sorts occurs
2741          * when no scissors are added to draw, so by default min=~0 and max=0.
2742          * But that can't happen if any actual drawing occurs (beyond a
2743          * wallpaper reload), so this is again irrelevant in practice. */
2744 
2745         batch->maxx = MIN2(batch->maxx, fb->width);
2746         batch->maxy = MIN2(batch->maxy, fb->height);
2747 
2748         /* Rendering region must be at least 1x1; otherwise, there is nothing
2749          * to do and the whole job chain should have been discarded. */
2750 
2751         assert(batch->maxx > batch->minx);
2752         assert(batch->maxy > batch->miny);
2753 
2754         struct panfrost_ptr transfer =
2755                 pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);
2756 
2757         GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu,
2758                                     transfer.cpu);
2759 
2760         return transfer.gpu;
2761 }
2762 
2763 #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;
2764 
2765 static uint8_t
2766 pan_draw_mode(enum pipe_prim_type mode)
2767 {
2768         switch (mode) {
2769                 DEFINE_CASE(POINTS);
2770                 DEFINE_CASE(LINES);
2771                 DEFINE_CASE(LINE_LOOP);
2772                 DEFINE_CASE(LINE_STRIP);
2773                 DEFINE_CASE(TRIANGLES);
2774                 DEFINE_CASE(TRIANGLE_STRIP);
2775                 DEFINE_CASE(TRIANGLE_FAN);
2776                 DEFINE_CASE(QUADS);
2777                 DEFINE_CASE(POLYGON);
2778 #if PAN_ARCH <= 6
2779                 DEFINE_CASE(QUAD_STRIP);
2780 #endif
2781 
2782         default:
2783                 unreachable("Invalid draw mode");
2784         }
2785 }
2786 
2787 #undef DEFINE_CASE
2788 
2789 /* Count generated primitives (when there are no geom/tess shaders) for
2790  * transform feedback */
2791 
2792 static void
2793 panfrost_statistics_record(
2794                 struct panfrost_context *ctx,
2795                 const struct pipe_draw_info *info,
2796                 const struct pipe_draw_start_count_bias *draw)
2797 {
2798         if (!ctx->active_queries)
2799                 return;
2800 
2801         uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2802         ctx->prims_generated += prims;
2803 
2804         if (!ctx->streamout.num_targets)
2805                 return;
2806 
2807         ctx->tf_prims_generated += prims;
2808         ctx->dirty |= PAN_DIRTY_SO;
2809 }
2810 
2811 static void
2812 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2813 {
2814         unsigned count = u_stream_outputs_for_vertices(ctx->active_prim,
2815                                                        ctx->vertex_count);
2816 
2817         for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2818                 if (!ctx->streamout.targets[i])
2819                         continue;
2820 
2821                 pan_so_target(ctx->streamout.targets[i])->offset += count;
2822         }
2823 }
2824 
2825 static inline enum mali_index_type
2826 panfrost_translate_index_size(unsigned size)
2827 {
2828         STATIC_ASSERT(MALI_INDEX_TYPE_NONE  == 0);
2829         STATIC_ASSERT(MALI_INDEX_TYPE_UINT8  == 1);
2830         STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);
2831 
2832         return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;
2833 }
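/* The static asserts above let the index size in bytes double as the enum
 * value for the small cases: size 0 -> NONE, 1 -> UINT8, 2 -> UINT16, and
 * only the 4-byte case needs the explicit mapping to UINT32.
 */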
2834 
2835 #if PAN_ARCH <= 7
2836 static inline void
2837 pan_emit_draw_descs(struct panfrost_batch *batch,
2838                 struct MALI_DRAW *d, enum pipe_shader_type st)
2839 {
2840         d->offset_start = batch->ctx->offset_start;
2841         d->instance_size = batch->ctx->instance_count > 1 ?
2842                            batch->ctx->padded_count : 1;
2843 
2844         d->uniform_buffers = batch->uniform_buffers[st];
2845         d->push_uniforms = batch->push_uniforms[st];
2846         d->textures = batch->textures[st];
2847         d->samplers = batch->samplers[st];
2848 }
2849 
2850 static void
2851 panfrost_draw_emit_vertex_section(struct panfrost_batch *batch,
2852                                   mali_ptr vs_vary, mali_ptr varyings,
2853                                   mali_ptr attribs, mali_ptr attrib_bufs,
2854                                   void *section)
2855 {
2856         pan_pack(section, DRAW, cfg) {
2857                 cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
2858                 cfg.attributes = attribs;
2859                 cfg.attribute_buffers = attrib_bufs;
2860                 cfg.varyings = vs_vary;
2861                 cfg.varying_buffers = vs_vary ? varyings : 0;
2862                 cfg.thread_storage = batch->tls.gpu;
2863                 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
2864         }
2865 }
2866 
2867 static void
2868 panfrost_draw_emit_vertex(struct panfrost_batch *batch,
2869                           const struct pipe_draw_info *info,
2870                           void *invocation_template,
2871                           mali_ptr vs_vary, mali_ptr varyings,
2872                           mali_ptr attribs, mali_ptr attrib_bufs,
2873                           void *job)
2874 {
2875         void *section =
2876                 pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
2877         memcpy(section, invocation_template, pan_size(INVOCATION));
2878 
2879         pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
2880                 cfg.job_task_split = 5;
2881         }
2882 
2883         section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
2884         panfrost_draw_emit_vertex_section(batch, vs_vary, varyings,
2885                                           attribs, attrib_bufs, section);
2886 }
2887 #endif
2888 
2889 static void
2890 panfrost_emit_primitive_size(struct panfrost_context *ctx,
2891                              bool points, mali_ptr size_array,
2892                              void *prim_size)
2893 {
2894         struct panfrost_rasterizer *rast = ctx->rasterizer;
2895 
2896         pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {
2897                 if (panfrost_writes_point_size(ctx)) {
2898                         cfg.size_array = size_array;
2899                 } else {
2900                         cfg.constant = points ?
2901                                        rast->base.point_size :
2902                                        rast->base.line_width;
2903                 }
2904         }
2905 }
2906 
2907 static bool
2908 panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)
2909 {
2910         /* As a reminder, primitive_restart should always be checked before
2911          * any access to restart_index. */
2912         return info->primitive_restart &&
2913                 info->restart_index == (unsigned)BITFIELD_MASK(info->index_size * 8);
2914 }
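/* For example, with a 16-bit index buffer the restart is "implicit" only
 * when restart_index == 0xFFFF (all index bits set); any other restart
 * value takes the explicit path in panfrost_emit_primitive().
 */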
2915 
2916 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2917  * 3D state. In particular, it groups the fragment shader descriptor with
2918  * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2919  * of state are dirty tracked independently for the benefit of newer GPUs that
2920  * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2921  * dirty flags that trigger re-emits of the fragment RSD.
2922  *
2923  * Obscurely, occlusion queries are included. Occlusion query state is nominally
2924  * specified in the draw call descriptor, but must be considered when
2925  * determining early-Z state, which is part of the RSD.
2926  */
2927 #define FRAGMENT_RSD_DIRTY_MASK ( \
2928         PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | \
2929         PAN_DIRTY_RASTERIZER | PAN_DIRTY_OQ)
2930 
2931 static inline void
2932 panfrost_update_shader_state(struct panfrost_batch *batch,
2933                              enum pipe_shader_type st)
2934 {
2935         struct panfrost_context *ctx = batch->ctx;
2936         struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
2937 
2938         bool frag = (st == PIPE_SHADER_FRAGMENT);
2939         unsigned dirty_3d = ctx->dirty;
2940         unsigned dirty = ctx->dirty_shader[st];
2941 
2942         if (dirty & PAN_DIRTY_STAGE_TEXTURE) {
2943                 batch->textures[st] =
2944                         panfrost_emit_texture_descriptors(batch, st);
2945         }
2946 
2947         if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2948                 batch->samplers[st] =
2949                         panfrost_emit_sampler_descriptors(batch, st);
2950         }
2951 
2952         /* On Bifrost and older, the fragment shader descriptor is fused
2953          * together with the renderer state; the combined renderer state
2954          * descriptor is emitted below. Otherwise, the shader descriptor is
2955          * standalone and is emitted here.
2956          */
2957         if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2958                 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2959         }
2960 
2961 #if PAN_ARCH >= 9
2962         if (dirty & PAN_DIRTY_STAGE_IMAGE)
2963                 batch->images[st] = panfrost_emit_images(batch, st);
2964 #endif
2965 
2966         if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2967                 batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,
2968                                 NULL, &batch->push_uniforms[st], NULL);
2969         }
2970 
2971 #if PAN_ARCH <= 7
2972         /* On Bifrost and older, if the fragment shader changes OR any renderer
2973          * state grouped with the fragment shader changes, the whole renderer
2974          * state descriptor is dirtied and must be re-emitted.
2975          */
2976         if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
2977                      (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {
2978 
2979                 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2980         }
2981 
2982         if (frag && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
2983                 batch->attribs[st] = panfrost_emit_image_attribs(batch,
2984                                 &batch->attrib_bufs[st], st);
2985         }
2986 #endif
2987 }
2988 
2989 static inline void
2990 panfrost_update_state_3d(struct panfrost_batch *batch)
2991 {
2992         struct panfrost_context *ctx = batch->ctx;
2993         unsigned dirty = ctx->dirty;
2994 
2995         if (dirty & PAN_DIRTY_TLS_SIZE)
2996                 panfrost_batch_adjust_stack_size(batch);
2997 
2998         if (dirty & PAN_DIRTY_BLEND)
2999                 panfrost_set_batch_masks_blend(batch);
3000 
3001         if (dirty & PAN_DIRTY_ZS)
3002                 panfrost_set_batch_masks_zs(batch);
3003 
3004 #if PAN_ARCH >= 9
3005         if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
3006             (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
3007                 batch->depth_stencil = panfrost_emit_depth_stencil(batch);
3008 
3009         if (dirty & PAN_DIRTY_BLEND)
3010                 batch->blend = panfrost_emit_blend_valhall(batch);
3011 
3012         if (dirty & PAN_DIRTY_VERTEX) {
3013                 batch->attribs[PIPE_SHADER_VERTEX] =
3014                         panfrost_emit_vertex_data(batch);
3015 
3016                 batch->attrib_bufs[PIPE_SHADER_VERTEX] =
3017                         panfrost_emit_vertex_buffers(batch);
3018         }
3019 #endif
3020 }
3021 
3022 #if PAN_ARCH >= 6
3023 static mali_ptr
3024 panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count)
3025 {
3026         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
3027 
3028         if (!vertex_count)
3029                 return 0;
3030 
3031         if (batch->tiler_ctx.bifrost)
3032                 return batch->tiler_ctx.bifrost;
3033 
3034         struct panfrost_ptr t =
3035                 pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);
3036 
3037         GENX(pan_emit_tiler_heap)(dev, t.cpu);
3038 
3039         mali_ptr heap = t.gpu;
3040 
3041         t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
3042         GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height,
3043                                  util_framebuffer_get_num_samples(&batch->key),
3044                                  pan_tristate_get(batch->first_provoking_vertex),
3045                                  heap, t.cpu);
3046 
3047         batch->tiler_ctx.bifrost = t.gpu;
3048         return batch->tiler_ctx.bifrost;
3049 }
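/* The tiler heap and context above are allocated lazily, at most once per
 * batch: later draws in the same batch reuse the cached
 * batch->tiler_ctx.bifrost pointer instead of emitting new descriptors.
 */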
3050 #endif
3051 
3052 /* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
3053  * jobs and Valhall IDVS jobs
3054  */
3055 static void
3056 panfrost_emit_primitive(struct panfrost_context *ctx,
3057                         const struct pipe_draw_info *info,
3058                         const struct pipe_draw_start_count_bias *draw,
3059                         mali_ptr indices, bool secondary_shader, void *out)
3060 {
3061         UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3062 
3063         bool lines = (info->mode == PIPE_PRIM_LINES ||
3064                       info->mode == PIPE_PRIM_LINE_LOOP ||
3065                       info->mode == PIPE_PRIM_LINE_STRIP);
3066 
3067         pan_pack(out, PRIMITIVE, cfg) {
3068                 cfg.draw_mode = pan_draw_mode(info->mode);
3069                 if (panfrost_writes_point_size(ctx))
3070                         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
3071 
3072 #if PAN_ARCH <= 8
3073                 /* For line primitives, PRIMITIVE.first_provoking_vertex must
3074                  * be set to true and the provoking vertex is selected with
3075                  * DRAW.flat_shading_vertex.
3076                  */
3077                 if (lines)
3078                         cfg.first_provoking_vertex = true;
3079                 else
3080                         cfg.first_provoking_vertex = rast->flatshade_first;
3081 
3082                 if (panfrost_is_implicit_prim_restart(info)) {
3083                         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
3084                 } else if (info->primitive_restart) {
3085                         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
3086                         cfg.primitive_restart_index = info->restart_index;
3087                 }
3088 
3089                 cfg.job_task_split = 6;
3090 #else
3091                 struct panfrost_shader_state *fs =
3092                         panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3093 
3094                 cfg.allow_rotating_primitives = !(lines || fs->info.bifrost.uses_flat_shading);
3095                 cfg.primitive_restart = info->primitive_restart;
3096 
3097                 /* Non-fixed restart indices should have been lowered */
3098                 assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
3099 #endif
3100 
3101                 cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
3102                 cfg.index_type = panfrost_translate_index_size(info->index_size);
3103 
3104 
3105                 if (PAN_ARCH >= 9) {
3106                         /* Base vertex offset on Valhall is used for both
3107                          * indexed and non-indexed draws, in a simple way for
3108                          * either. Handle both cases.
3109                          */
3110                         if (cfg.index_type)
3111                                 cfg.base_vertex_offset = draw->index_bias;
3112                         else
3113                                 cfg.base_vertex_offset = draw->start;
3114 
3115                         /* Indices are moved outside the primitive descriptor
3116                          * on Valhall, so we don't need to set that here
3117                          */
3118                 } else if (cfg.index_type) {
3119                         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
3120 
3121 #if PAN_ARCH <= 7
3122                         cfg.indices = indices;
3123 #endif
3124                 }
3125 
3126 #if PAN_ARCH >= 6
3127                 cfg.secondary_shader = secondary_shader;
3128 #endif
3129         }
3130 }
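/* Worked example of the base_vertex_offset logic above: for an indexed draw
 * with draw->index_bias = 5, Valhall programs base_vertex_offset = 5
 * directly (the index buffer pointer lives outside this descriptor), while
 * older GPUs program 5 - ctx->offset_start and set PRIMITIVE.indices here.
 * Non-indexed draws on Valhall use draw->start instead.
 */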
3131 
3132 #if PAN_ARCH >= 9
3133 static mali_ptr
3134 panfrost_emit_resources(struct panfrost_batch *batch,
3135                         enum pipe_shader_type stage,
3136                         mali_ptr ubos, unsigned ubo_count)
3137 {
3138         struct panfrost_context *ctx = batch->ctx;
3139         struct panfrost_ptr T;
3140         unsigned nr_tables = 12;
3141 
3142         /* Although individual resources need only 16 byte alignment, the
3143          * resource table as a whole must be 64-byte aligned.
3144          */
3145         T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), 64);
3146         memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));
3147 
3148         panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count);
3149 
3150         panfrost_make_resource_table(T, PAN_TABLE_TEXTURE,
3151                                      batch->textures[stage],
3152                                      ctx->sampler_view_count[stage]);
3153 
3154         panfrost_make_resource_table(T, PAN_TABLE_SAMPLER,
3155                                      batch->samplers[stage],
3156                                      ctx->sampler_count[stage]);
3157 
3158         panfrost_make_resource_table(T, PAN_TABLE_IMAGE,
3159                                      batch->images[stage],
3160                                      util_last_bit(ctx->image_mask[stage]));
3161 
3162         if (stage == PIPE_SHADER_VERTEX) {
3163                 panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE,
3164                                              batch->attribs[stage],
3165                                              ctx->vertex->num_elements);
3166 
3167                 panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER,
3168                                              batch->attrib_bufs[stage],
3169                                              util_last_bit(ctx->vb_mask));
3170         }
3171 
3172         return T.gpu | nr_tables;
3173 }
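/* Because the table above is 64-byte aligned, the low six bits of its
 * address are zero, so the table count can be OR'd into the returned
 * pointer (T.gpu | nr_tables) and presumably recovered later by masking.
 */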
3174 
3175 static void
3176 panfrost_emit_shader(struct panfrost_batch *batch,
3177                      struct MALI_SHADER_ENVIRONMENT *cfg,
3178                      enum pipe_shader_type stage,
3179                      mali_ptr shader_ptr,
3180                      mali_ptr thread_storage)
3181 {
3182         unsigned fau_words = 0, ubo_count = 0;
3183         mali_ptr ubos, resources;
3184 
3185         ubos = panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau,
3186                                        &fau_words);
3187 
3188         resources = panfrost_emit_resources(batch, stage, ubos, ubo_count);
3189 
3190         cfg->thread_storage = thread_storage;
3191         cfg->shader = shader_ptr;
3192         cfg->resources = resources;
3193 
3194         /* Each entry of FAU is 64-bits */
3195         cfg->fau_count = DIV_ROUND_UP(fau_words, 2);
3196 }
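/* The conversion above presumably treats fau_words as a count of 32-bit
 * words reported by panfrost_emit_const_buf(), so e.g. 9 pushed words round
 * up to 5 64-bit FAU entries.
 */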
3197 #endif
3198 
3199 static void
3200 panfrost_emit_draw(void *out,
3201                    struct panfrost_batch *batch,
3202                    bool fs_required,
3203                    enum pipe_prim_type prim,
3204                    mali_ptr pos, mali_ptr fs_vary, mali_ptr varyings)
3205 {
3206         struct panfrost_context *ctx = batch->ctx;
3207         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3208         bool polygon = (prim == PIPE_PRIM_TRIANGLES);
3209 
3210         pan_pack(out, DRAW, cfg) {
3211                 /*
3212                  * From the Gallium documentation,
3213                  * pipe_rasterizer_state::cull_face "indicates which faces of
3214                  * polygons to cull". Points and lines are not considered
3215                  * polygons and should be drawn even if all faces are culled.
3216                  * The hardware does not take primitive type into account when
3217                  * culling, so we need to do that check ourselves.
3218                  */
3219                 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
3220                 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
3221                 cfg.front_face_ccw = rast->front_ccw;
3222 
3223                 if (ctx->occlusion_query && ctx->active_queries) {
3224                         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3225                                 cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
3226                         else
3227                                 cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
3228 
3229                         struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
3230                         cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
3231                         panfrost_batch_write_rsrc(ctx->batch, rsrc,
3232                                               PIPE_SHADER_FRAGMENT);
3233                 }
3234 
3235 #if PAN_ARCH >= 9
3236                 struct panfrost_shader_state *fs =
3237                         panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3238 
3239                 cfg.multisample_enable = rast->multisample;
3240                 cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
3241 
3242                 /* Use per-sample shading if required by the API. Also use it when a
3243                  * blend shader is used with multisampling, as this is handled
3244                  * by a single ST_TILE in the blend shader with the current
3245                  * sample ID, requiring per-sample shading.
3246                  */
3247                 cfg.evaluate_per_sample =
3248                         (rast->multisample &&
3249                          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));
3250 
3251                 cfg.single_sampled_lines = !rast->multisample;
3252 
3253                 cfg.vertex_array.packet = true;
3254 
3255                 cfg.minimum_z = batch->minimum_z;
3256                 cfg.maximum_z = batch->maximum_z;
3257 
3258                 cfg.depth_stencil = batch->depth_stencil;
3259 
3260                 if (fs_required) {
3261                         bool has_oq = ctx->occlusion_query && ctx->active_queries;
3262 
3263                         struct pan_earlyzs_state earlyzs =
3264                                pan_earlyzs_get(fs->earlyzs,
3265                                                ctx->depth_stencil->writes_zs || has_oq,
3266                                                ctx->blend->base.alpha_to_coverage,
3267                                                ctx->depth_stencil->zs_always_passes);
3268 
3269                         cfg.pixel_kill_operation = earlyzs.kill;
3270                         cfg.zs_update_operation = earlyzs.update;
3271 
3272                         cfg.allow_forward_pixel_to_kill = pan_allow_forward_pixel_to_kill(ctx, fs);
3273                         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;
3274 
3275                         /* Mask of render targets that may be written. A render
3276                          * target may be written if the fragment shader writes
3277                          * to it AND it actually exists. If the render target
3278                          * doesn't actually exist, the blend descriptor will be
3279                          * OFF so it may be omitted from the mask.
3280                          *
3281                          * Only set when there is a fragment shader, since
3282                          * otherwise no colour updates are possible.
3283                          */
3284                         cfg.render_target_mask =
3285                                 (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
3286                                 ctx->fb_rt_mask;
3287 
3288                         /* Also use per-sample shading if required by the shader
3289                          */
3290                         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
3291 
3292                         /* Unlike Bifrost, alpha-to-coverage must be included in
3293                          * this identically-named flag. Confusing, isn't it?
3294                          */
3295                         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
3296                                                        fs->info.fs.can_discard ||
3297                                                        ctx->blend->base.alpha_to_coverage;
3298 
3299                         /* Blend descriptors are only accessed by a BLEND
3300                          * instruction on Valhall. It follows that if the
3301                          * fragment shader is omitted, we may also omit the
3302                          * blend descriptors.
3303                          */
3304                         cfg.blend = batch->blend;
3305                         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
3306                         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
3307 
3308                         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
3309                         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
3310 
3311                         panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
3312                                              batch->rsd[PIPE_SHADER_FRAGMENT],
3313                                              batch->tls.gpu);
3314                 } else {
3315                         /* These operations need to be FORCE to benefit from the
3316                          * depth-only pass optimizations.
3317                          */
3318                         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
3319                         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
3320 
3321                         /* No shader and no blend => no shader or blend
3322                          * reasons to disable FPK. The only FPK-related state
3323                          * not covered is alpha-to-coverage which we don't set
3324                          * without blend.
3325                          */
3326                         cfg.allow_forward_pixel_to_kill = true;
3327 
3328                         /* No shader => no shader side effects */
3329                         cfg.allow_forward_pixel_to_be_killed = true;
3330 
3331                         /* Alpha isn't written so these are vacuous */
3332                         cfg.overdraw_alpha0 = true;
3333                         cfg.overdraw_alpha1 = true;
3334                 }
3335 #else
3336                 cfg.position = pos;
3337                 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
3338                 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
3339                 cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
3340                 cfg.viewport = batch->viewport;
3341                 cfg.varyings = fs_vary;
3342                 cfg.varying_buffers = fs_vary ? varyings : 0;
3343                 cfg.thread_storage = batch->tls.gpu;
3344 
3345                 /* For all primitives but lines DRAW.flat_shading_vertex must
3346                  * be set to 0 and the provoking vertex is selected with the
3347                  * PRIMITIVE.first_provoking_vertex field.
3348                  */
3349                 if (prim == PIPE_PRIM_LINES) {
3350                         /* The logic is inverted across arches. */
3351                         cfg.flat_shading_vertex = rast->flatshade_first
3352                                                 ^ (PAN_ARCH <= 5);
3353                 }
3354 
3355                 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
3356 #endif
3357         }
3358 }
3359 
3360 #if PAN_ARCH >= 9
3361 static void
3362 panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
3363                             const struct pipe_draw_info *info,
3364                             const struct pipe_draw_start_count_bias *draw,
3365                             mali_ptr indices, bool secondary_shader,
3366                             void *job)
3367 {
3368         struct panfrost_context *ctx = batch->ctx;
3369 
3370         struct panfrost_shader_state *vs =
3371                 panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3372 
3373         struct panfrost_shader_state *fs =
3374                 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3375 
3376         bool fs_required = panfrost_fs_required(fs, ctx->blend,
3377                                                 &ctx->pipe_framebuffer,
3378                                                 ctx->depth_stencil);
3379 
3380         /* Varying shaders only feed data to the fragment shader, so if we omit
3381          * the fragment shader, we should omit the varying shader too.
3382          */
3383         secondary_shader &= fs_required;
3384 
3385         panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader,
3386                                 pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));
3387 
3388         pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
3389                 cfg.count = info->instance_count;
3390         }
3391 
3392         pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
3393                 if (secondary_shader) {
3394                         unsigned v = vs->info.varyings.output_count;
3395                         unsigned f = fs->info.varyings.input_count;
3396                         unsigned slots = MAX2(v, f);
3397                         slots += util_bitcount(fs->key.fixed_varying_mask);
3398                         unsigned size = slots * 16;
3399 
3400                         /* Assumes 16 byte slots. We could do better. */
3401                         cfg.vertex_packet_stride = size + 16;
3402                         cfg.vertex_attribute_stride = size;
3403                 } else {
3404                         /* Hardware requirement for "no varyings" */
3405                         cfg.vertex_packet_stride = 16;
3406                         cfg.vertex_attribute_stride = 0;
3407                 }
3408         }
3409 
3410         pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
3411                 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
3412         }
3413 
3414         STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
3415         memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR),
3416                &batch->scissor, pan_size(SCISSOR));
3417 
3418         panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0,
3419                                      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));
3420 
3421         pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
3422                 cfg.address = indices;
3423         }
3424 
3425         panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW),
3426                            batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0);
3427 
3428         pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
3429                 /* IDVS/points vertex shader */
3430                 mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX];
3431 
3432                 /* IDVS/triangle vertex shader */
3433                 if (vs_ptr && info->mode != PIPE_PRIM_POINTS)
3434                         vs_ptr += pan_size(SHADER_PROGRAM);
3435 
3436                 panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr,
3437                                      batch->tls.gpu);
3438         }
3439 
3440         pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
3441                 /* If a varying shader is used, we configure it with the same
3442                  * state as the position shader for backwards compatible
3443                  * behaviour with Bifrost. This could be optimized.
3444                  */
3445                 if (!secondary_shader) continue;
3446 
3447                 mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] +
3448                                 (2 * pan_size(SHADER_PROGRAM));
3449 
3450                 panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX,
3451                              ptr, batch->tls.gpu);
3452         }
3453 }
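/* Example of the ALLOCATION sizing above: with 4 vertex-shader outputs,
 * 3 fragment-shader inputs and one fixed varying, slots = MAX2(4, 3) + 1 =
 * 5, giving a 5 * 16 = 80 byte vertex_attribute_stride and an 80 + 16 = 96
 * byte vertex_packet_stride (the extra 16 bytes presumably holding
 * per-vertex data such as the position).
 */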
3454 #endif
3455 
3456 #if PAN_ARCH <= 7
3457 static void
3458 panfrost_draw_emit_tiler(struct panfrost_batch *batch,
3459                          const struct pipe_draw_info *info,
3460                          const struct pipe_draw_start_count_bias *draw,
3461                          void *invocation_template,
3462                          mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
3463                          mali_ptr pos, mali_ptr psiz, bool secondary_shader,
3464                          void *job)
3465 {
3466         struct panfrost_context *ctx = batch->ctx;
3467 
3468         void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
3469         memcpy(section, invocation_template, pan_size(INVOCATION));
3470 
3471         panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader,
3472                                 pan_section_ptr(job, TILER_JOB, PRIMITIVE));
3473 
3474         void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
3475         enum pipe_prim_type prim = u_reduced_prim(info->mode);
3476 
3477 #if PAN_ARCH >= 6
3478         pan_section_pack(job, TILER_JOB, TILER, cfg) {
3479                 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
3480         }
3481 
3482         pan_section_pack(job, TILER_JOB, PADDING, cfg);
3483 #endif
3484 
3485         panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW),
3486                            batch, true, prim, pos, fs_vary, varyings);
3487 
3488         panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size);
3489 }
3490 #endif
3491 
3492 static void
3493 panfrost_launch_xfb(struct panfrost_batch *batch,
3494                     const struct pipe_draw_info *info,
3495                     mali_ptr attribs, mali_ptr attrib_bufs,
3496                     unsigned count)
3497 {
3498         struct panfrost_context *ctx = batch->ctx;
3499 
3500         struct panfrost_ptr t =
3501                 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3502 
3503         /* Nothing to do */
3504         if (batch->ctx->streamout.num_targets == 0)
3505                 return;
3506 
3507         /* TODO: XFB with index buffers */
3508         //assert(info->index_size == 0);
3509         u_trim_pipe_prim(info->mode, &count);
3510 
3511         if (count == 0)
3512                 return;
3513 
3514         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3515         struct panfrost_shader_variants v = { .variants = vs->xfb };
3516 
3517         vs->xfb->stream_output = vs->stream_output;
3518 
3519         struct panfrost_shader_variants *saved_vs = ctx->shader[PIPE_SHADER_VERTEX];
3520         mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
3521         mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
3522         mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
3523 
3524         ctx->shader[PIPE_SHADER_VERTEX] = &v;
3525         batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
3526 
3527 #if PAN_ARCH >= 9
3528         pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
3529                 cfg.workgroup_size_x = 1;
3530                 cfg.workgroup_size_y = 1;
3531                 cfg.workgroup_size_z = 1;
3532 
3533                 cfg.workgroup_count_x = count;
3534                 cfg.workgroup_count_y = info->instance_count;
3535                 cfg.workgroup_count_z = 1;
3536 
3537                 panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX,
3538                                      batch->rsd[PIPE_SHADER_VERTEX],
3539                                      batch->tls.gpu);
3540 
3541                 /* TODO: Indexing. Also, this is a legacy feature... */
3542                 cfg.compute.attribute_offset = batch->ctx->offset_start;
3543 
3544                 /* Transform feedback shaders do not use barriers or shared
3545                  * memory, so we may merge workgroups.
3546                  */
3547                 cfg.allow_merging_workgroups = true;
3548                 cfg.task_increment = 1;
3549                 cfg.task_axis = MALI_TASK_AXIS_Z;
3550         }
3551 #else
3552         struct mali_invocation_packed invocation;
3553 
3554         panfrost_pack_work_groups_compute(&invocation,
3555                         1, count, info->instance_count,
3556                         1, 1, 1, PAN_ARCH <= 5, false);
3557 
3558         batch->uniform_buffers[PIPE_SHADER_VERTEX] =
3559                 panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
3560                                 &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL);
3561 
3562         panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0,
3563                                   attribs, attrib_bufs, t.cpu);
3564 #endif
3565         enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
3566 #if PAN_ARCH <= 5
3567         job_type = MALI_JOB_TYPE_VERTEX;
3568 #endif
3569         panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type,
3570                          true, false, 0, 0, &t, false);
3571 
3572         ctx->shader[PIPE_SHADER_VERTEX] = saved_vs;
3573         batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
3574         batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
3575         batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
3576 }
3577 
3578 static void
3579 panfrost_direct_draw(struct panfrost_batch *batch,
3580                      const struct pipe_draw_info *info,
3581                      unsigned drawid_offset,
3582                      const struct pipe_draw_start_count_bias *draw)
3583 {
3584         if (!draw->count || !info->instance_count)
3585                 return;
3586 
3587         struct panfrost_context *ctx = batch->ctx;
3588 
3589         /* If we change whether we're drawing points, or whether point sprites
3590          * are enabled (specified in the rasterizer), we may need to rebind
3591          * shaders accordingly. This implicitly covers the case of rebinding
3592          * framebuffers, because all dirty flags are set there.
3593          */
3594         if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
3595             ((ctx->active_prim == PIPE_PRIM_POINTS) ^
3596              (info->mode       == PIPE_PRIM_POINTS))) {
3597 
3598                 ctx->active_prim = info->mode;
3599                 panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
3600         }
3601 
3602         /* Take into account a negative bias */
3603         ctx->indirect_draw = false;
3604         ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);
3605         ctx->instance_count = info->instance_count;
3606         ctx->base_vertex = info->index_size ? draw->index_bias : 0;
3607         ctx->base_instance = info->start_instance;
3608         ctx->active_prim = info->mode;
3609         ctx->drawid = drawid_offset;
3610 
3611         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3612 
3613         bool idvs = vs->info.vs.idvs;
3614         bool secondary_shader = vs->info.vs.secondary_enable;
3615 
3616         UNUSED struct panfrost_ptr tiler, vertex;
3617 
3618         if (idvs) {
3619 #if PAN_ARCH >= 9
3620                 tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
3621 #elif PAN_ARCH >= 6
3622                 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
3623 #else
3624                 unreachable("IDVS is unsupported on Midgard");
3625 #endif
3626         } else {
3627                 vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3628                 tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
3629         }
3630 
3631         unsigned vertex_count = ctx->vertex_count;
3632 
3633         unsigned min_index = 0, max_index = 0;
3634         mali_ptr indices = 0;
3635 
3636         if (info->index_size && PAN_ARCH >= 9) {
3637                 indices = panfrost_get_index_buffer(batch, info, draw);
3638         } else if (info->index_size) {
3639                 indices = panfrost_get_index_buffer_bounded(batch, info, draw,
3640                                                             &min_index,
3641                                                             &max_index);
3642 
3643                 /* Use the index-derived bounds rather than the raw draw count */
3644                 vertex_count = max_index - min_index + 1;
3645                 ctx->offset_start = min_index + draw->index_bias;
3646         } else {
3647                 ctx->offset_start = draw->start;
3648         }
3649 
3650         if (info->instance_count > 1) {
3651                 unsigned count = vertex_count;
3652 
3653                 /* Index-Driven Vertex Shading requires different instances to
3654                  * have different cache lines for position results. Each vertex
3655                  * position is 16 bytes and the Mali cache line is 64 bytes, so
3656                  * the instance count must be aligned to 4 vertices.
3657                  */
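                /* Worked example: 64-byte lines / 16-byte positions = 4
                 * positions per line, so e.g. a 10-vertex instance is padded
                 * to 12 vertices here, before panfrost_padded_vertex_count()
                 * pads it further below.
                 */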
3658                 if (idvs)
3659                         count = ALIGN_POT(count, 4);
3660 
3661                 ctx->padded_count = panfrost_padded_vertex_count(count);
3662         } else
3663                 ctx->padded_count = vertex_count;
3664 
3665         panfrost_statistics_record(ctx, info, draw);
3666 
3667 #if PAN_ARCH <= 7
3668         struct mali_invocation_packed invocation;
3669         if (info->instance_count > 1) {
3670                 panfrost_pack_work_groups_compute(&invocation,
3671                                                   1, vertex_count, info->instance_count,
3672                                                   1, 1, 1, true, false);
3673         } else {
3674                 pan_pack(&invocation, INVOCATION, cfg) {
3675                         cfg.invocations = MALI_POSITIVE(vertex_count);
3676                         cfg.size_y_shift = 0;
3677                         cfg.size_z_shift = 0;
3678                         cfg.workgroups_x_shift = 0;
3679                         cfg.workgroups_y_shift = 0;
3680                         cfg.workgroups_z_shift = 32;
3681                         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
3682                 }
3683         }
3684 
3685         /* Emit all sorts of descriptors. */
3686         mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
3687 
3688         panfrost_emit_varying_descriptor(batch,
3689                                          ctx->padded_count *
3690                                          ctx->instance_count,
3691                                          &vs_vary, &fs_vary, &varyings,
3692                                          NULL, &pos, &psiz,
3693                                          info->mode == PIPE_PRIM_POINTS);
3694 
3695         mali_ptr attribs, attrib_bufs;
3696         attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
3697 #endif
3698 
3699         panfrost_update_state_3d(batch);
3700         panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3701         panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3702         panfrost_clean_state_3d(ctx);
3703 
3704         if (vs->xfb) {
3705 #if PAN_ARCH >= 9
3706                 mali_ptr attribs = 0, attrib_bufs = 0;
3707 #endif
3708                 panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count);
3709         }
3710 
3711         /* Increment transform feedback offsets */
3712         panfrost_update_streamout_offsets(ctx);
3713 
3714         /* Any side effects must be handled by the XFB shader, so we only need
3715          * to run vertex shaders if we need rasterization.
3716          */
3717         if (panfrost_batch_skip_rasterization(batch))
3718                 return;
3719 
3720 #if PAN_ARCH >= 9
3721         assert(idvs && "Memory allocated IDVS required on Valhall");
3722 
3723         panfrost_emit_malloc_vertex(batch, info, draw, indices,
3724                                     secondary_shader, tiler.cpu);
3725 
3726         panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3727                          MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0,
3728                          0, &tiler, false);
3729 #else
3730         /* Fire off the draw itself */
3731         panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
3732                                  fs_vary, varyings, pos, psiz, secondary_shader,
3733                                  tiler.cpu);
3734         if (idvs) {
3735 #if PAN_ARCH >= 6
3736                 panfrost_draw_emit_vertex_section(batch,
3737                                   vs_vary, varyings,
3738                                   attribs, attrib_bufs,
3739                                   pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));
3740 
3741                 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3742                                  MALI_JOB_TYPE_INDEXED_VERTEX, false, false,
3743                                  0, 0, &tiler, false);
3744 #endif
3745         } else {
3746                 panfrost_draw_emit_vertex(batch, info, &invocation,
3747                                           vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
3748                 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3749         }
3750 #endif
3751 }
3752 
3753 #if PAN_GPU_INDIRECTS
3754 static void
3755 panfrost_indirect_draw(struct panfrost_batch *batch,
3756                        const struct pipe_draw_info *info,
3757                        unsigned drawid_offset,
3758                        const struct pipe_draw_indirect_info *indirect,
3759                        const struct pipe_draw_start_count_bias *draw)
3760 {
3761         /* Indirect draw count and multi-draw not supported. */
3762         assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);
3763 
3764         struct panfrost_context *ctx = batch->ctx;
3765         struct panfrost_device *dev = pan_device(ctx->base.screen);
3766 
3767         /* TODO: update statistics (see panfrost_statistics_record()) */
3768         /* TODO: Increment transform feedback offsets */
3769         assert(ctx->streamout.num_targets == 0);
3770 
3771         ctx->active_prim = info->mode;
3772         ctx->drawid = drawid_offset;
3773         ctx->indirect_draw = true;
3774 
3775         struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3776 
3777         bool idvs = vs->info.vs.idvs;
3778         bool secondary_shader = vs->info.vs.secondary_enable;
3779 
3780         struct panfrost_ptr tiler = { 0 }, vertex = { 0 };
3781 
3782         if (idvs) {
3783 #if PAN_ARCH >= 6
3784                 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
3785 #else
3786                 unreachable("IDVS is unsupported on Midgard");
3787 #endif
3788         } else {
3789                 vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3790                 tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
3791         }
3792 
3793         struct panfrost_bo *index_buf = NULL;
3794 
3795         if (info->index_size) {
3796                 assert(!info->has_user_indices);
3797                 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
3798                 index_buf = rsrc->image.data.bo;
3799                 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
3800         }
3801 
3802         mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
3803         unsigned varying_buf_count;
3804 
3805         /* We want to create templates, so set all count fields to 0 to
3806          * reflect that.
3807          */
3808         ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;
3809         ctx->offset_start = 0;
3810 
3811         /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the
3812          * vertex shader uses gl_VertexID or gl_BaseVertex.
3813          */
3814         ctx->first_vertex_sysval_ptr = 0;
3815         ctx->base_vertex_sysval_ptr = 0;
3816         ctx->base_instance_sysval_ptr = 0;
3817 
3818         panfrost_update_state_3d(batch);
3819         panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3820         panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3821         panfrost_clean_state_3d(ctx);
3822 
3823         bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);
3824 
3825         panfrost_emit_varying_descriptor(batch, 0,
3826                                          &vs_vary, &fs_vary, &varyings,
3827                                          &varying_buf_count, &pos, &psiz,
3828                                          point_coord_replace);
3829 
3830         mali_ptr attribs, attrib_bufs;
3831         attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
3832 
3833         /* Zeroed invocation; the compute job will update it. */
3834         static struct mali_invocation_packed invocation;
3835 
3836         /* Fire off the draw itself */
3837         panfrost_draw_emit_tiler(batch, info, draw, &invocation,
3838                                  index_buf ? index_buf->ptr.gpu : 0,
3839                                  fs_vary, varyings, pos, psiz, secondary_shader,
3840                                  tiler.cpu);
3841         if (idvs) {
3842 #if PAN_ARCH >= 6
3843                 panfrost_draw_emit_vertex_section(batch,
3844                                   vs_vary, varyings,
3845                                   attribs, attrib_bufs,
3846                                   pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));
3847 #endif
3848         } else {
3849                 panfrost_draw_emit_vertex(batch, info, &invocation,
3850                                           vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
3851         }
3852 
3853         /* Add the varying heap BO to the batch if we're allocating varyings. */
3854         if (varyings) {
3855                 panfrost_batch_add_bo(batch,
3856                                       dev->indirect_draw_shaders.varying_heap,
3857                                       PIPE_SHADER_VERTEX);
3858         }
3859 
3860         assert(indirect->buffer);
3861 
3862         struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);
3863 
3864         /* Don't count images: those attributes don't need to be patched. */
3865         unsigned attrib_count =
3866                 vs->info.attribute_count -
3867                 util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);
3868 
3869         panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);
3870 
3871         struct pan_indirect_draw_info draw_info = {
3872                 .last_indirect_draw = batch->indirect_draw_job_id,
3873                 .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,
3874                 .index_buf = index_buf ? index_buf->ptr.gpu : 0,
3875                 .first_vertex_sysval = ctx->first_vertex_sysval_ptr,
3876                 .base_vertex_sysval = ctx->base_vertex_sysval_ptr,
3877                 .base_instance_sysval = ctx->base_instance_sysval_ptr,
3878                 .vertex_job = vertex.gpu,
3879                 .tiler_job = tiler.gpu,
3880                 .attrib_bufs = attrib_bufs,
3881                 .attribs = attribs,
3882                 .attrib_count = attrib_count,
3883                 .varying_bufs = varyings,
3884                 .index_size = info->index_size,
3885         };
3886 
3887         if (panfrost_writes_point_size(ctx))
3888                 draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;
3889 
3890         if (vs->info.vs.writes_point_size)
3891                 draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;
3892 
3893         if (idvs)
3894                 draw_info.flags |= PAN_INDIRECT_DRAW_IDVS;
3895 
3896         if (info->primitive_restart) {
3897                 draw_info.restart_index = info->restart_index;
3898                 draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;
3899         }
3900 
3901         batch->indirect_draw_job_id =
3902                 GENX(panfrost_emit_indirect_draw)(&batch->pool.base,
3903                                                   &batch->scoreboard,
3904                                                   &draw_info,
3905                                                   &batch->indirect_draw_ctx);
3906 
3907         if (idvs) {
3908                 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3909                                  MALI_JOB_TYPE_INDEXED_VERTEX, false, false,
3910                                  0, 0, &tiler, false);
3911         } else {
3912                 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3913         }
3914 }
3915 #endif
3916 
3917 static bool
3918 panfrost_compatible_batch_state(struct panfrost_batch *batch,
3919                                 bool points)
3920 {
3921         /* Only applies on Valhall */
3922         if (PAN_ARCH < 9)
3923                 return true;
3924 
3925         struct panfrost_context *ctx = batch->ctx;
3926         struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3927 
3928         bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
3929         bool first = rast->flatshade_first;
3930 
3931         /* gl_PointCoord orientation only matters when drawing points, but
3932          * provoking vertex doesn't matter for points.
3933          */
3934         if (points)
3935                 return pan_tristate_set(&batch->sprite_coord_origin, coord);
3936         else
3937                 return pan_tristate_set(&batch->first_provoking_vertex, first);
3938 }
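
/* Used by panfrost_draw_vbo: if the batch has already committed to the
 * opposite sprite-coord origin or provoking-vertex convention, the caller
 * starts a fresh batch rather than mixing incompatible state.
 */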
3939 
3940 static void
3941 panfrost_draw_vbo(struct pipe_context *pipe,
3942                   const struct pipe_draw_info *info,
3943                   unsigned drawid_offset,
3944                   const struct pipe_draw_indirect_info *indirect,
3945                   const struct pipe_draw_start_count_bias *draws,
3946                   unsigned num_draws)
3947 {
3948         struct panfrost_context *ctx = pan_context(pipe);
3949         struct panfrost_device *dev = pan_device(pipe->screen);
3950 
3951         if (!panfrost_render_condition_check(ctx))
3952                 return;
3953 
3954         /* Emulate indirect draws unless we're using the experimental path */
3955         if ((!(dev->debug & PAN_DBG_INDIRECT) || !PAN_GPU_INDIRECTS) && indirect && indirect->buffer) {
3956                 assert(num_draws == 1);
3957                 util_draw_indirect(pipe, info, indirect);
3958                 return;
3959         }
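        /* Note: util_draw_indirect maps the indirect buffer on the CPU and
         * replays its contents as direct draws through this same entry point.
         */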
3960 
3961         /* Do some common setup */
3962         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3963 
3964         /* Don't add too many jobs to a single batch. Hardware has a hard limit
3965          * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to
3966          * avoid the risk of timeouts. This might not be a good idea. */
3967         if (unlikely(batch->scoreboard.job_index > 10000))
3968                 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3969 
3970         bool points = (info->mode == PIPE_PRIM_POINTS);
3971 
3972         if (unlikely(!panfrost_compatible_batch_state(batch, points))) {
3973                 batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
3974 
3975                 ASSERTED bool succ = panfrost_compatible_batch_state(batch, points);
3976                 assert(succ && "must be able to set state for a fresh batch");
3977         }
3978 
3979         /* panfrost_batch_skip_rasterization reads
3980          * batch->scissor_culls_everything, which is set by
3981          * panfrost_emit_viewport, so call that first.
3982          */
3983         if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
3984                 batch->viewport = panfrost_emit_viewport(batch);
3985 
3986         /* Mark everything dirty when debugging */
3987         if (unlikely(dev->debug & PAN_DBG_DIRTY))
3988                 panfrost_dirty_state_all(ctx);
3989 
3990         /* Conservatively assume draw parameters always change */
3991         ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3992 
3993         if (indirect) {
3994                 assert(num_draws == 1);
3995                 assert(PAN_GPU_INDIRECTS);
3996 
3997 #if PAN_GPU_INDIRECTS
3998                 if (indirect->count_from_stream_output) {
3999                         struct pipe_draw_start_count_bias tmp_draw = *draws;
4000                         struct panfrost_streamout_target *so =
4001                                 pan_so_target(indirect->count_from_stream_output);
4002 
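                        /* Emulated draw-transform-feedback: take the draw
                         * count from the stream-output target's recorded
                         * offset instead of the CPU-side draw info.
                         */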
4003                         tmp_draw.start = 0;
4004                         tmp_draw.count = so->offset;
4005                         tmp_draw.index_bias = 0;
4006                         panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);
4007                         return;
4008                 }
4009 
4010                 panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);
4011                 return;
4012 #endif
4013         }
4014 
4015         struct pipe_draw_info tmp_info = *info;
4016         unsigned drawid = drawid_offset;
4017 
4018         for (unsigned i = 0; i < num_draws; i++) {
4019                 panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
4020 
4021                 if (tmp_info.increment_draw_id) {
4022                         ctx->dirty |= PAN_DIRTY_DRAWID;
4023                         drawid++;
4024                 }
4025         }
4026 
4027 }
4028 
4029 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
4030  * construct the COMPUTE job and some of its payload.
4031  */
4032 
4033 static void
4034 panfrost_launch_grid(struct pipe_context *pipe,
4035                 const struct pipe_grid_info *info)
4036 {
4037         struct panfrost_context *ctx = pan_context(pipe);
4038 
4039         /* XXX - shouldn't be necessary with working memory barriers. Affected
4040          * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
4041         panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
4042 
4043         struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
4044 
4045         struct panfrost_shader_state *cs =
4046                 &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
4047 
4048         /* Indirect dispatch can't handle workgroup local storage since that
4049          * would require dynamic memory allocation. Fall back to a CPU read of the grid in this case. */
4050         if (info->indirect && ((cs->info.wls_size != 0) || !PAN_GPU_INDIRECTS)) {
4051                 struct pipe_transfer *transfer;
4052                 uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,
4053                                 info->indirect_offset,
4054                                 3 * sizeof(uint32_t),
4055                                 PIPE_MAP_READ,
4056                                 &transfer);
4057 
4058                 struct pipe_grid_info direct = *info;
4059                 direct.indirect = NULL;
4060                 direct.grid[0] = params[0];
4061                 direct.grid[1] = params[1];
4062                 direct.grid[2] = params[2];
4063                 pipe_buffer_unmap(pipe, transfer);
4064 
4065                 if (params[0] && params[1] && params[2])
4066                         panfrost_launch_grid(pipe, &direct);
4067 
4068                 return;
4069         }
4070 
4071         ctx->compute_grid = info;
4072 
4073         struct panfrost_ptr t =
4074                 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
4075 
4076         /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
4077          * reuse the graphics path for this by lowering to Gallium */
4078 
4079         struct pipe_constant_buffer ubuf = {
4080                 .buffer = NULL,
4081                 .buffer_offset = 0,
4082                 .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->req_input_mem,
4083                 .user_buffer = info->input
4084         };
4085 
4086         if (info->input)
4087                 pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);
4088 
4089         /* Invoke according to the grid info */
4090 
4091         unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };
4092 
4093         if (info->indirect)
4094                 num_wg[0] = num_wg[1] = num_wg[2] = 1;
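        /* Placeholder counts; on the PAN_GPU_INDIRECTS path they are meant to
         * be overwritten on the GPU by the indirect dispatch job emitted
         * below.
         */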
4095 
4096         panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
4097 
4098 #if PAN_ARCH <= 7
4099         panfrost_pack_work_groups_compute(pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION),
4100                                           num_wg[0], num_wg[1], num_wg[2],
4101                                           info->block[0], info->block[1],
4102                                           info->block[2],
4103                                           false, info->indirect != NULL);
4104 
4105         pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
4106                 cfg.job_task_split =
4107                         util_logbase2_ceil(info->block[0] + 1) +
4108                         util_logbase2_ceil(info->block[1] + 1) +
4109                         util_logbase2_ceil(info->block[2] + 1);
4110         }
4111 
4112         pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
4113                 cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
4114                 cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
4115                 cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
4116                 cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
4117                 cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
4118                 cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
4119                 cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
4120         }
4121 #else
4122         pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
4123                 cfg.workgroup_size_x = info->block[0];
4124                 cfg.workgroup_size_y = info->block[1];
4125                 cfg.workgroup_size_z = info->block[2];
4126 
4127                 cfg.workgroup_count_x = num_wg[0];
4128                 cfg.workgroup_count_y = num_wg[1];
4129                 cfg.workgroup_count_z = num_wg[2];
4130 
4131                 panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
4132                                      batch->rsd[PIPE_SHADER_COMPUTE],
4133                                      panfrost_emit_shared_memory(batch, info));
4134 
4135                 cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups;
4136                 cfg.task_increment = 1;
4137                 cfg.task_axis = MALI_TASK_AXIS_Z;
4138         }
4139 #endif
4140 
4141         unsigned indirect_dep = 0;
4142 #if PAN_GPU_INDIRECTS
4143         if (info->indirect) {
4144                 struct pan_indirect_dispatch_info indirect = {
4145                         .job = t.gpu,
4146                         .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
4147                                         info->indirect_offset,
4148                         .num_wg_sysval = {
4149                                 batch->num_wg_sysval[0],
4150                                 batch->num_wg_sysval[1],
4151                                 batch->num_wg_sysval[2],
4152                         },
4153                 };
4154 
4155                 indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base,
4156                                                                 &batch->scoreboard,
4157                                                                 &indirect);
4158         }
4159 #endif
4160 
4161         panfrost_add_job(&batch->pool.base, &batch->scoreboard,
4162                          MALI_JOB_TYPE_COMPUTE, true, false,
4163                          indirect_dep, 0, &t, false);
4164         panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
4165 }
4166 
4167 static void *
4168 panfrost_create_rasterizer_state(
4169         struct pipe_context *pctx,
4170         const struct pipe_rasterizer_state *cso)
4171 {
4172         struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
4173 
4174         so->base = *cso;
4175 
4176         /* Guaranteed with the core GL call, so don't expose ARB_polygon_offset */
4177         assert(cso->offset_clamp == 0.0);
4178 
4179 #if PAN_ARCH <= 7
4180         pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
4181                 cfg.multisample_enable = cso->multisample;
4182                 cfg.fixed_function_near_discard = cso->depth_clip_near;
4183                 cfg.fixed_function_far_discard = cso->depth_clip_far;
4184                 cfg.shader_depth_range_fixed = true;
4185         }
4186 
4187         pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
4188                 cfg.front_facing_depth_bias = cso->offset_tri;
4189                 cfg.back_facing_depth_bias = cso->offset_tri;
4190                 cfg.single_sampled_lines = !cso->multisample;
4191         }
4192 #endif
4193 
4194         return so;
4195 }
4196 
4197 #if PAN_ARCH >= 9
4198 /*
4199  * Given a pipe_vertex_element, pack the corresponding Valhall attribute
4200  * descriptor. This function is called at CSO create time. Since
4201  * pipe_vertex_element lacks a stride, the packed attribute descriptor will not
4202  * be uploaded until draw time.
4203  */
4204 static void
4205 panfrost_pack_attribute(struct panfrost_device *dev,
4206                         const struct pipe_vertex_element el,
4207                         struct mali_attribute_packed *out)
4208 {
4209         pan_pack(out, ATTRIBUTE, cfg) {
4210                 cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
4211                 cfg.frequency = (el.instance_divisor > 0) ?
4212                         MALI_ATTRIBUTE_FREQUENCY_INSTANCE :
4213                         MALI_ATTRIBUTE_FREQUENCY_VERTEX;
4214                 cfg.format = dev->formats[el.src_format].hw;
4215                 cfg.offset = el.src_offset;
4216                 cfg.buffer_index = el.vertex_buffer_index;
4217 
4218                 if (el.instance_divisor == 0) {
4219                         /* Per-vertex */
4220                         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
4221                         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
4222                         cfg.offset_enable = true;
4223                 } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
4224                         /* Per-instance, POT divisor */
4225                         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
4226                         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
4227                         cfg.divisor_r = __builtin_ctz(el.instance_divisor);
4228                 } else {
4229                         /* Per-instance, NPOT divisor */
4230                         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
4231                         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
4232 
4233                         cfg.divisor_d =
4234                                 panfrost_compute_magic_divisor(el.instance_divisor,
4235                                                 &cfg.divisor_r, &cfg.divisor_e);
4236                 }
4237         }
4238 }
4239 #endif
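
/* Illustrative note (not taken from hardware documentation): with
 * instance_divisor = 8 the power-of-two path above sets divisor_r = ctz(8) =
 * 3, presumably letting the per-instance fetch index be derived as
 * instance >> 3, while an NPOT divisor such as 6 goes through the
 * magic-divisor encoding (divisor_d/_r/_e) so the index can be computed
 * without an integer divide.
 */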
4240 
4241 static void *
4242 panfrost_create_vertex_elements_state(
4243         struct pipe_context *pctx,
4244         unsigned num_elements,
4245         const struct pipe_vertex_element *elements)
4246 {
4247         struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
4248         struct panfrost_device *dev = pan_device(pctx->screen);
4249 
4250         so->num_elements = num_elements;
4251         memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
4252 
4253 #if PAN_ARCH >= 9
4254         for (unsigned i = 0; i < num_elements; ++i)
4255                 panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
4256 #else
4257         /* Assign attribute buffers corresponding to the vertex buffers, keyed
4258          * for a particular divisor since that's how instancing works on Mali */
4259         for (unsigned i = 0; i < num_elements; ++i) {
4260                 so->element_buffer[i] = pan_assign_vertex_buffer(
4261                                 so->buffers, &so->nr_bufs,
4262                                 elements[i].vertex_buffer_index,
4263                                 elements[i].instance_divisor);
4264         }
4265 
4266         for (int i = 0; i < num_elements; ++i) {
4267                 enum pipe_format fmt = elements[i].src_format;
4268                 so->formats[i] = dev->formats[fmt].hw;
4269         }
4270 
4271         /* Let's also prepare vertex builtins */
4272         so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
4273         so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
4274 #endif
4275 
4276         return so;
4277 }
4278 
4279 static inline unsigned
4280 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
4281 {
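        /* GL's non-wrapping INCR/DECR map onto the saturating Mali ops; the
         * _WRAP variants map onto the wrapping ones.
         */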
4282         switch (in) {
4283         case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;
4284         case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;
4285         case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;
4286         case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;
4287         case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;
4288         case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
4289         case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
4290         case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;
4291         default: unreachable("Invalid stencil op");
4292         }
4293 }
4294 
4295 #if PAN_ARCH <= 7
4296 static inline void
4297 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
4298                     struct mali_stencil_packed *out)
4299 {
4300         pan_pack(out, STENCIL, s) {
4301                 s.mask = in->valuemask;
4302                 s.compare_function = (enum mali_func) in->func;
4303                 s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
4304                 s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
4305                 s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
4306         }
4307 }
4308 #endif
4309 
4310 static bool
4311 pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
4312 {
4313         if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
4314                 return false;
4315 
4316         if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
4317                 return false;
4318 
4319         if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
4320                 return false;
4321 
4322         return true;
4323 }
4324 
4325 static void *
4326 panfrost_create_depth_stencil_state(struct pipe_context *pipe,
4327                                     const struct pipe_depth_stencil_alpha_state *zsa)
4328 {
4329         struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
4330         so->base = *zsa;
4331 
4332         const struct pipe_stencil_state front = zsa->stencil[0];
4333         const struct pipe_stencil_state back =
4334                 zsa->stencil[1].enabled ? zsa->stencil[1] : front;
4335 
4336         enum mali_func depth_func = zsa->depth_enabled ?
4337                 (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;
4338 
4339         /* Normalize (there's no separate enable) */
4340         if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
4341                 so->base.alpha_func = MALI_FUNC_ALWAYS;
4342 
4343 #if PAN_ARCH <= 7
4344         /* Prepack relevant parts of the Renderer State Descriptor. They will
4345          * be ORed in at draw-time */
4346         pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
4347                 cfg.depth_function = depth_func;
4348                 cfg.depth_write_mask = zsa->depth_writemask;
4349         }
4350 
4351         pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
4352                 cfg.stencil_enable = front.enabled;
4353                 cfg.stencil_mask_front = front.writemask;
4354                 cfg.stencil_mask_back = back.writemask;
4355 
4356 #if PAN_ARCH <= 5
4357                 cfg.alpha_test_compare_function =
4358                         (enum mali_func) so->base.alpha_func;
4359 #endif
4360         }
4361 
4362         /* Stencil tests have their own words in the RSD */
4363         pan_pipe_to_stencil(&front, &so->stencil_front);
4364         pan_pipe_to_stencil(&back, &so->stencil_back);
4365 #else
4366         pan_pack(&so->desc, DEPTH_STENCIL, cfg) {
4367                 cfg.front_compare_function = (enum mali_func) front.func;
4368                 cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
4369                 cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
4370                 cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);
4371 
4372                 cfg.back_compare_function = (enum mali_func) back.func;
4373                 cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
4374                 cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
4375                 cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);
4376 
4377                 cfg.stencil_test_enable = front.enabled;
4378                 cfg.front_write_mask = front.writemask;
4379                 cfg.back_write_mask = back.writemask;
4380                 cfg.front_value_mask = front.valuemask;
4381                 cfg.back_value_mask = back.valuemask;
4382 
4383                 cfg.depth_write_enable = zsa->depth_writemask;
4384                 cfg.depth_function = depth_func;
4385         }
4386 #endif
4387 
4388         so->enabled = zsa->stencil[0].enabled ||
4389                 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
4390 
4391         so->zs_always_passes = pipe_zs_always_passes(zsa);
4392         so->writes_zs = util_writes_depth_stencil(zsa);
4393 
4394         /* TODO: Bounds test should be easy */
4395         assert(!zsa->depth_bounds_test);
4396 
4397         return so;
4398 }
4399 
4400 static struct pipe_sampler_view *
4401 panfrost_create_sampler_view(
4402         struct pipe_context *pctx,
4403         struct pipe_resource *texture,
4404         const struct pipe_sampler_view *template)
4405 {
4406         struct panfrost_context *ctx = pan_context(pctx);
4407         struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);
4408 
4409         pan_legalize_afbc_format(ctx, pan_resource(texture), template->format);
4410 
4411         pipe_reference(NULL, &texture->reference);
4412 
4413         so->base = *template;
4414         so->base.texture = texture;
4415         so->base.reference.count = 1;
4416         so->base.context = pctx;
4417 
4418         panfrost_create_sampler_view_bo(so, pctx, texture);
4419 
4420         return (struct pipe_sampler_view *) so;
4421 }
4422 
4423 /* A given Gallium blend state can be encoded to the hardware in numerous,
4424  * dramatically divergent ways due to the interactions of blending with
4425  * framebuffer formats. Conceptually, there are two modes:
4426  *
4427  * - Fixed-function blending (for suitable framebuffer formats, suitable blend
4428  *   state, and suitable blend constant)
4429  *
4430  * - Blend shaders (for everything else)
4431  *
4432  * A given Gallium blend configuration will compile to exactly one
4433  * fixed-function blend state, if it compiles to any, although the constant
4434  * will vary across runs as that is tracked outside of the Gallium CSO.
4435  *
4436  * However, that same blend configuration will compile to many different blend
4437  * shaders, depending on the framebuffer formats active. The rationale is that
4438  * blend shaders override not just fixed-function blending but also
4439  * fixed-function format conversion, so blend shaders are keyed to a particular
4440  * framebuffer format. As an example, the tilebuffer format is identical for
4441  * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
4442  * blend shaders.
4443  *
4444  * All of this state is encapsulated in the panfrost_blend_state struct
4445  * (our subclass of pipe_blend_state).
4446  */
4447 
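/* For example, classic alpha blending (SRC_ALPHA / ONE_MINUS_SRC_ALPHA with a
 * full colour mask) can normally be encoded as a fixed-function equation,
 * while the same state with logic ops enabled, or targeting a format such as
 * RG16UI, falls back to a blend shader keyed on the framebuffer format.
 */
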
4448 /* Create a blend CSO. Essentially, try to compile a fixed-function
4449  * expression and initialize blend shaders */
4450 
4451 static void *
4452 panfrost_create_blend_state(struct pipe_context *pipe,
4453                             const struct pipe_blend_state *blend)
4454 {
4455         struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
4456         so->base = *blend;
4457 
4458         so->pan.logicop_enable = blend->logicop_enable;
4459         so->pan.logicop_func = blend->logicop_func;
4460         so->pan.rt_count = blend->max_rt + 1;
4461 
4462         for (unsigned c = 0; c < so->pan.rt_count; ++c) {
4463                 unsigned g = blend->independent_blend_enable ? c : 0;
4464                 const struct pipe_rt_blend_state pipe = blend->rt[g];
4465                 struct pan_blend_equation equation = {0};
4466 
4467                 equation.color_mask = pipe.colormask;
4468                 equation.blend_enable = pipe.blend_enable;
4469 
4470                 if (pipe.blend_enable) {
4471                         equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
4472                         equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);
4473                         equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);
4474                         equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);
4475                         equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);
4476                         equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
4477                         equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);
4478                         equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);
4479                         equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);
4480                         equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);
4481                 }
4482 
4483                 /* Determine some common properties */
4484                 unsigned constant_mask = pan_blend_constant_mask(equation);
4485                 const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
4486                 so->info[c] = (struct pan_blend_info) {
4487                         .no_colour = (equation.color_mask == 0),
4488                         .opaque = pan_blend_is_opaque(equation),
4489                         .constant_mask = constant_mask,
4490 
4491                         /* TODO: check the dest for the logicop */
4492                         .load_dest = blend->logicop_enable ||
4493                                 pan_blend_reads_dest(equation),
4494 
4495                         /* Could this possibly be fixed-function? */
4496                         .fixed_function = !blend->logicop_enable &&
4497                                 pan_blend_can_fixed_function(equation,
4498                                                              supports_2src) &&
4499                                 (!constant_mask ||
4500                                  pan_blend_supports_constant(PAN_ARCH, c)),
4501 
4502                         .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
4503                         .alpha_one_store = pan_blend_alpha_one_store(equation),
4504                 };
4505 
4506                 so->pan.rts[c].equation = equation;
4507 
4508                 /* Bifrost needs to know if any render target loads its
4509                  * destination in the hot draw path, so precompute this */
4510                 if (so->info[c].load_dest)
4511                         so->load_dest_mask |= BITFIELD_BIT(c);
4512 
4513                 /* Converting equations to Mali style is expensive, so do it at
4514                  * CSO create time instead of draw-time */
4515                 if (so->info[c].fixed_function) {
4516                         so->equation[c] = pan_pack_blend(equation);
4517                 }
4518         }
4519 
4520         return so;
4521 }
4522 
4523 static void
4524 prepare_shader(struct panfrost_shader_state *state,
4525             struct panfrost_pool *pool, bool upload)
4526 {
4527 #if PAN_ARCH <= 7
4528         void *out = &state->partial_rsd;
4529 
4530         if (upload) {
4531                 struct panfrost_ptr ptr =
4532                         pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
4533 
4534                 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
4535                 out = ptr.cpu;
4536         }
4537 
4538         pan_pack(out, RENDERER_STATE, cfg) {
4539                 pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
4540 
4541         }
4542 #else
4543         assert(upload);
4544 
4545         /* The address in the shader program descriptor must be non-null, but
4546          * the entire shader program descriptor may be omitted.
4547          *
4548          * See dEQP-GLES31.functional.compute.basic.empty
4549          */
4550         if (!state->bin.gpu)
4551                 return;
4552 
4553         bool vs = (state->info.stage == MESA_SHADER_VERTEX);
4554         bool secondary_enable = (vs && state->info.vs.secondary_enable);
4555 
4556         unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
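        /* Descriptor array layout, matching the packs below: index 0 is the
         * generic (or IDVS/points) program, index 1 the IDVS/triangles
         * program, and index 2 the secondary shader.
         */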
4557         struct panfrost_ptr ptr = pan_pool_alloc_desc_array(&pool->base,
4558                                                             nr_variants,
4559                                                             SHADER_PROGRAM);
4560 
4561         state->state = panfrost_pool_take_ref(pool, ptr.gpu);
4562 
4563         /* Generic, or IDVS/points */
4564         pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) {
4565                 cfg.stage = pan_shader_stage(&state->info);
4566                 cfg.primary_shader = true;
4567                 cfg.register_allocation = pan_register_allocation(state->info.work_reg_count);
4568                 cfg.binary = state->bin.gpu;
4569                 cfg.preload.r48_r63 = (state->info.preload >> 48);
4570 
4571                 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
4572                         cfg.requires_helper_threads = state->info.contains_barrier;
4573         }
4574 
4575         if (!vs)
4576                 return;
4577 
4578         /* IDVS/triangles */
4579         pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) {
4580                 cfg.stage = pan_shader_stage(&state->info);
4581                 cfg.primary_shader = true;
4582                 cfg.register_allocation = pan_register_allocation(state->info.work_reg_count);
4583                 cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
4584                 cfg.preload.r48_r63 = (state->info.preload >> 48);
4585         }
4586 
4587         if (!secondary_enable)
4588                 return;
4589 
4590         pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) {
4591                 unsigned work_count = state->info.vs.secondary_work_reg_count;
4592 
4593                 cfg.stage = pan_shader_stage(&state->info);
4594                 cfg.primary_shader = false;
4595                 cfg.register_allocation = pan_register_allocation(work_count);
4596                 cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
4597                 cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
4598         }
4599 #endif
4600 }
4601 
4602 static void
4603 panfrost_get_sample_position(struct pipe_context *context,
4604                              unsigned sample_count,
4605                              unsigned sample_index,
4606                              float *out_value)
4607 {
4608         panfrost_query_sample_position(
4609                         panfrost_sample_pattern(sample_count),
4610                         sample_index,
4611                         out_value);
4612 }
4613 
4614 static void
4615 screen_destroy(struct pipe_screen *pscreen)
4616 {
4617         struct panfrost_device *dev = pan_device(pscreen);
4618         GENX(pan_blitter_cleanup)(dev);
4619 
4620 #if PAN_GPU_INDIRECTS
4621         GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
4622         GENX(pan_indirect_dispatch_cleanup)(dev);
4623 #endif
4624 }
4625 
4626 static void
4627 preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
4628 {
4629         GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
4630                              PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
4631 }
4632 
4633 static void
4634 init_batch(struct panfrost_batch *batch)
4635 {
4636         /* Reserve the framebuffer and local storage descriptors */
4637         batch->framebuffer =
4638 #if PAN_ARCH == 4
4639                 pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
4640 #else
4641                 pan_pool_alloc_desc_aggregate(&batch->pool.base,
4642                                               PAN_DESC(FRAMEBUFFER),
4643                                               PAN_DESC(ZS_CRC_EXTENSION),
4644                                               PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
4645 
4646                 batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD;
4647 #endif
4648 
4649 #if PAN_ARCH >= 6
4650         batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
4651 #else
4652         /* On Midgard, the TLS is embedded in the FB descriptor */
4653         batch->tls = batch->framebuffer;
4654 #endif
4655 }
4656 
4657 static void
4658 panfrost_sampler_view_destroy(
4659         struct pipe_context *pctx,
4660         struct pipe_sampler_view *pview)
4661 {
4662         struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;
4663 
4664         pipe_resource_reference(&pview->texture, NULL);
4665         panfrost_bo_unreference(view->state.bo);
4666         ralloc_free(view);
4667 }
4668 
4669 static void
4670 context_init(struct pipe_context *pipe)
4671 {
4672         pipe->draw_vbo           = panfrost_draw_vbo;
4673         pipe->launch_grid        = panfrost_launch_grid;
4674 
4675         pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
4676         pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
4677         pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
4678         pipe->create_sampler_view = panfrost_create_sampler_view;
4679         pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
4680         pipe->create_sampler_state = panfrost_create_sampler_state;
4681         pipe->create_blend_state = panfrost_create_blend_state;
4682 
4683         pipe->get_sample_position = panfrost_get_sample_position;
4684 }
4685 
4686 #if PAN_ARCH <= 5
4687 
4688 /* Returns the polygon list's GPU address if available, or otherwise allocates
4689  * the polygon list.  It's perfectly fast to use allocate/free BO directly,
4690  * since we'll hit the BO cache and this is one-per-batch anyway. */
4691 
4692 static mali_ptr
4693 batch_get_polygon_list(struct panfrost_batch *batch)
4694 {
4695         struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
4696 
4697         if (!batch->tiler_ctx.midgard.polygon_list) {
4698                 bool has_draws = batch->scoreboard.first_tiler != NULL;
4699                 unsigned size =
4700                         panfrost_tiler_get_polygon_list_size(dev,
4701                                                              batch->key.width,
4702                                                              batch->key.height,
4703                                                              has_draws);
4704                 size = util_next_power_of_two(size);
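                /* Rounding up to a power of two presumably makes BO cache
                 * hits more likely across slightly different framebuffer
                 * sizes.
                 */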
4705 
4706                 /* Create the BO as invisible if we can. In the non-hierarchical tiler case,
4707                  * we need to write the polygon list manually because there's no WRITE_VALUE
4708                  * job in the chain (maybe we should add one...). */
4709                 bool init_polygon_list = !has_draws && dev->model->quirks.no_hierarchical_tiling;
4710                 batch->tiler_ctx.midgard.polygon_list =
4711                         panfrost_batch_create_bo(batch, size,
4712                                                  init_polygon_list ? 0 : PAN_BO_INVISIBLE,
4713                                                  PIPE_SHADER_VERTEX,
4714                                                  "Polygon list");
4715                 panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list,
4716                                 PIPE_SHADER_FRAGMENT);
4717 
4718                 if (init_polygon_list) {
4719                         assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu);
4720                         uint32_t *polygon_list_body =
4721                                 batch->tiler_ctx.midgard.polygon_list->ptr.cpu +
4722                                 MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;
4723 
4724                         /* Magic for Mali T720 */
4725                         polygon_list_body[0] = 0xa0000000;
4726                 }
4727 
4728                 batch->tiler_ctx.midgard.disable = !has_draws;
4729         }
4730 
4731         return batch->tiler_ctx.midgard.polygon_list->ptr.gpu;
4732 }
4733 #endif
4734 
4735 static void
4736 init_polygon_list(struct panfrost_batch *batch)
4737 {
4738 #if PAN_ARCH <= 5
4739         mali_ptr polygon_list = batch_get_polygon_list(batch);
4740         panfrost_scoreboard_initialize_tiler(&batch->pool.base,
4741                                              &batch->scoreboard,
4742                                              polygon_list);
4743 #endif
4744 }
4745 
4746 void
4747 GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
4748 {
4749         struct panfrost_device *dev = &screen->dev;
4750 
4751         screen->vtbl.prepare_shader = prepare_shader;
4752         screen->vtbl.emit_tls    = emit_tls;
4753         screen->vtbl.emit_fbd    = emit_fbd;
4754         screen->vtbl.emit_fragment_job = emit_fragment_job;
4755         screen->vtbl.screen_destroy = screen_destroy;
4756         screen->vtbl.preload     = preload;
4757         screen->vtbl.context_init = context_init;
4758         screen->vtbl.init_batch = init_batch;
4759         screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
4760         screen->vtbl.init_polygon_list = init_polygon_list;
4761         screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
4762         screen->vtbl.compile_shader = GENX(pan_shader_compile);
4763 
4764         GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base,
4765                                &screen->blitter.desc_pool.base);
4766 #if PAN_GPU_INDIRECTS
4767         GENX(pan_indirect_dispatch_init)(dev);
4768         GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base);
4769 #endif
4770 }
4771