1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 * Copyright © 2017 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 */
25
26 #include "util/macros.h"
27 #include "util/u_prim.h"
28 #include "util/u_vbuf.h"
29 #include "util/u_helpers.h"
30 #include "util/u_draw.h"
31 #include "util/u_memory.h"
32 #include "pipe/p_defines.h"
33 #include "pipe/p_state.h"
34 #include "gallium/auxiliary/util/u_blend.h"
35
36 #include "genxml/gen_macros.h"
37
38 #include "pan_pool.h"
39 #include "pan_bo.h"
40 #include "pan_blend.h"
41 #include "pan_context.h"
42 #include "pan_job.h"
43 #include "pan_shader.h"
44 #include "pan_texture.h"
45 #include "pan_util.h"
46 #include "pan_indirect_draw.h"
47 #include "pan_indirect_dispatch.h"
48 #include "pan_blitter.h"
49
50 #define PAN_GPU_INDIRECTS (PAN_ARCH == 7)
51
52 struct panfrost_rasterizer {
53 struct pipe_rasterizer_state base;
54
55 #if PAN_ARCH <= 7
56 /* Partially packed RSD words */
57 struct mali_multisample_misc_packed multisample;
58 struct mali_stencil_mask_misc_packed stencil_misc;
59 #endif
60 };
61
62 struct panfrost_zsa_state {
63 struct pipe_depth_stencil_alpha_state base;
64
65 /* Is any depth, stencil, or alpha testing enabled? */
66 bool enabled;
67
68 /* Do the depth and stencil tests always pass? This ignores write
69 * masks; we are only interested in whether pixels may be killed.
70 */
71 bool zs_always_passes;
72
73 /* Are depth or stencil writes possible? */
74 bool writes_zs;
75
76 #if PAN_ARCH <= 7
77 /* Prepacked words from the RSD */
78 struct mali_multisample_misc_packed rsd_depth;
79 struct mali_stencil_mask_misc_packed rsd_stencil;
80 struct mali_stencil_packed stencil_front, stencil_back;
81 #else
82 /* Depth/stencil descriptor template */
83 struct mali_depth_stencil_packed desc;
84 #endif
85 };
86
87 struct panfrost_sampler_state {
88 struct pipe_sampler_state base;
89 struct mali_sampler_packed hw;
90 };
91
92 /* Misnomer: Sampler view corresponds to textures, not samplers */
93
94 struct panfrost_sampler_view {
95 struct pipe_sampler_view base;
96 struct panfrost_pool_ref state;
97 struct mali_texture_packed bifrost_descriptor;
98 mali_ptr texture_bo;
99 uint64_t modifier;
100
101 /* Pool used to allocate the descriptor. If NULL, defaults to the global
102 * descriptor pool. Can be set for short-lived descriptors, useful for
103 * shader images on Valhall.
104 */
105 struct panfrost_pool *pool;
106 };
107
108 struct panfrost_vertex_state {
109 unsigned num_elements;
110 struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS];
111
112 #if PAN_ARCH >= 9
113 /* Packed attribute descriptor. All fields are set at CSO create time
114 * except for stride, which must be ORed in at draw time
115 */
116 struct mali_attribute_packed attributes[PIPE_MAX_ATTRIBS];
117 #else
118 /* buffers holds the attribute buffer descriptors; element_buffer maps
119 * each vertex element to an index into buffers */
120 struct pan_vertex_buffer buffers[PIPE_MAX_ATTRIBS];
121 unsigned element_buffer[PIPE_MAX_ATTRIBS];
122 unsigned nr_bufs;
123
124 unsigned formats[PIPE_MAX_ATTRIBS];
125 #endif
126 };
127
128 /* Statically assert that PIPE_* enums match the hardware enums.
129 * (As long as they match, we don't need to translate them.)
130 */
131 static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
132 static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
133 static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
134 static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
135 static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
136 static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
137 static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
138 static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");
139
140 static inline enum mali_sample_pattern
141 panfrost_sample_pattern(unsigned samples)
142 {
143 switch (samples) {
144 case 1: return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
145 case 4: return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
146 case 8: return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
147 case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
148 default: unreachable("Unsupported sample count");
149 }
150 }
151
152 static unsigned
153 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
154 {
155 /* CLAMP is only supported on Midgard, where it is broken for nearest
156 * filtering. Use CLAMP_TO_EDGE in that case.
157 */
158
159 switch (w) {
160 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
161 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
162 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
163 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
164 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
165 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
166
167 #if PAN_ARCH <= 5
168 case PIPE_TEX_WRAP_CLAMP:
169 return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
170 MALI_WRAP_MODE_CLAMP;
171 case PIPE_TEX_WRAP_MIRROR_CLAMP:
172 return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
173 MALI_WRAP_MODE_MIRRORED_CLAMP;
174 #endif
175
176 default: unreachable("Invalid wrap");
177 }
178 }
179
180 /* The hardware compares in the wrong order, so we have to flip before
181 * encoding. Yes, really. */
182
183 static enum mali_func
184 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
185 {
186 return !cso->compare_mode ? MALI_FUNC_NEVER :
187 panfrost_flip_compare_func((enum mali_func) cso->compare_func);
188 }
189
190 static enum mali_mipmap_mode
191 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
192 {
193 switch (f) {
194 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
195 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
196 #if PAN_ARCH >= 6
197 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
198 #else
199 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST;
200 #endif
201 default: unreachable("Invalid");
202 }
203 }
204
205
206 static void *
207 panfrost_create_sampler_state(
208 struct pipe_context *pctx,
209 const struct pipe_sampler_state *cso)
210 {
211 struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
212 so->base = *cso;
213
214 bool using_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
215
216 pan_pack(&so->hw, SAMPLER, cfg) {
217 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
218 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
219
220 cfg.normalized_coordinates = cso->normalized_coords;
221 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
222 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
223 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
224
225 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
226 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
227 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
228
229 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
230 cfg.compare_function = panfrost_sampler_compare_func(cso);
231 cfg.seamless_cube_map = cso->seamless_cube_map;
232
233 cfg.border_color_r = cso->border_color.ui[0];
234 cfg.border_color_g = cso->border_color.ui[1];
235 cfg.border_color_b = cso->border_color.ui[2];
236 cfg.border_color_a = cso->border_color.ui[3];
237
238 #if PAN_ARCH >= 6
239 if (cso->max_anisotropy > 1) {
240 cfg.maximum_anisotropy = cso->max_anisotropy;
241 cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
242 }
243 #else
244 /* Emulate disabled mipmapping by clamping the LOD as tight as
245 * possible (from 0 to epsilon = 1/256) */
246 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
247 cfg.maximum_lod = cfg.minimum_lod + 1;
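/* The +1 here is one unit of the fixed-point LOD encoding, i.e. the
 * 1/256 epsilon mentioned above. */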
248 #endif
249 }
250
251 return so;
252 }
253
254 static bool
255 panfrost_fs_required(
256 struct panfrost_shader_state *fs,
257 struct panfrost_blend_state *blend,
258 struct pipe_framebuffer_state *state,
259 const struct panfrost_zsa_state *zsa)
260 {
261 /* If the shader generally has side effects, it must run. This includes
262 * use of discard, which can affect the results of an occlusion query. */
263 if (fs->info.fs.sidefx)
264 return true;
265
266 /* Using an empty FS requires early-z to be enabled, but alpha test
267 * needs it disabled. Alpha test is only native on Midgard, so only
268 * check there.
269 */
270 if (PAN_ARCH <= 5 && zsa->base.alpha_func != PIPE_FUNC_ALWAYS)
271 return true;
272
273 /* If colour is written we need to execute */
274 for (unsigned i = 0; i < state->nr_cbufs; ++i) {
275 if (state->cbufs[i] && !blend->info[i].no_colour)
276 return true;
277 }
278
279 /* If depth is written and not implied we need to execute.
280 * TODO: Predicate on Z/S writes being enabled */
281 return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
282 }
283
284 /* Get pointers to the blend shaders bound to each active render target. Used
285 * to emit the blend descriptors, as well as the fragment renderer state
286 * descriptor.
287 */
288 static void
289 panfrost_get_blend_shaders(struct panfrost_batch *batch,
290 mali_ptr *blend_shaders)
291 {
292 unsigned shader_offset = 0;
293 struct panfrost_bo *shader_bo = NULL;
294
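/* panfrost_get_blend packs any required blend shaders into a single BO
 * shared across render targets (tracked by shader_bo/shader_offset) and
 * returns each shader's GPU address, or 0 when fixed-function blending
 * suffices for that target. */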
295 for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
296 if (batch->key.cbufs[c]) {
297 blend_shaders[c] = panfrost_get_blend(batch,
298 c, &shader_bo, &shader_offset);
299 }
300 }
301 }
302
303 #if PAN_ARCH >= 5
304 UNUSED static uint16_t
305 pack_blend_constant(enum pipe_format format, float cons)
306 {
307 const struct util_format_description *format_desc =
308 util_format_description(format);
309
310 unsigned chan_size = 0;
311
312 for (unsigned i = 0; i < format_desc->nr_channels; i++)
313 chan_size = MAX2(format_desc->channel[i].size, chan_size);
314
315 uint16_t unorm = (cons * ((1 << chan_size) - 1));
316 return unorm << (16 - chan_size);
317 }
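/* Worked example: for an 8-bit UNORM render target with a blend constant
 * of 0.5, chan_size is 8, so unorm = (uint16_t)(0.5 * 255) = 127 and the
 * packed value is 127 << 8 = 0x7F00, i.e. the constant occupies the top
 * chan_size bits of the 16-bit field. */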
318
319 /*
320 * Determine whether to set the respective overdraw alpha flag.
321 *
322 * The overdraw alpha=1 flag should be set when alpha=1 implies full overdraw,
323 * equivalently, all enabled render targets have alpha_one_store set. Likewise,
324 * overdraw alpha=0 should be set when alpha=0 implies no overdraw,
325 * equivalently, all enabled render targets have alpha_zero_nop set.
326 */
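/* For example, with blending disabled and a full colour write mask, a
 * source alpha of 1.0 fully overwrites the destination, so the blend
 * analysis is expected to mark that target alpha_one_store (an assumption
 * about pan_blend's analysis, not re-derived here). */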
327 static bool
328 panfrost_overdraw_alpha(const struct panfrost_context *ctx, bool zero)
329 {
330 const struct panfrost_blend_state *so = ctx->blend;
331
332 for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) {
333 const struct pan_blend_info info = so->info[i];
334
335 bool enabled = ctx->pipe_framebuffer.cbufs[i] && !info.no_colour;
336 bool flag = zero ? info.alpha_zero_nop : info.alpha_one_store;
337
338 if (enabled && !flag)
339 return false;
340 }
341
342 return true;
343 }
344
345 static void
346 panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
347 {
348 unsigned rt_count = batch->key.nr_cbufs;
349 struct panfrost_context *ctx = batch->ctx;
350 const struct panfrost_blend_state *so = ctx->blend;
351 bool dithered = so->base.dither;
352
353 /* Always have at least one render target for depth-only passes */
354 for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
355 struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
356
357 /* Disable blending for unbacked render targets */
358 if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
359 pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
360 cfg.enable = false;
361 #if PAN_ARCH >= 6
362 cfg.internal.mode = MALI_BLEND_MODE_OFF;
363 #endif
364 }
365
366 continue;
367 }
368
369 struct pan_blend_info info = so->info[i];
370 enum pipe_format format = batch->key.cbufs[i]->format;
371 float cons = pan_blend_get_constant(info.constant_mask,
372 ctx->blend_color.color);
373
374 /* Word 0: Flags and constant */
375 pan_pack(packed, BLEND, cfg) {
376 cfg.srgb = util_format_is_srgb(format);
377 cfg.load_destination = info.load_dest;
378 cfg.round_to_fb_precision = !dithered;
379 cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
380 #if PAN_ARCH >= 6
381 if (!blend_shaders[i])
382 cfg.constant = pack_blend_constant(format, cons);
383 #else
384 cfg.blend_shader = (blend_shaders[i] != 0);
385
386 if (blend_shaders[i])
387 cfg.shader_pc = blend_shaders[i];
388 else
389 cfg.constant = cons;
390 #endif
391 }
392
393 if (!blend_shaders[i]) {
394 /* Word 1: Blend Equation */
395 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
396 packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
397 }
398
399 #if PAN_ARCH >= 6
400 const struct panfrost_device *dev = pan_device(ctx->base.screen);
401 struct panfrost_shader_state *fs =
402 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
403
404 /* Words 2 and 3: Internal blend */
405 if (blend_shaders[i]) {
406 /* The blend shader's address needs to have the same
407 * top 32 bits as the fragment shader's address.
408 * TODO: Ensure that's always the case.
409 */
410 assert(!fs->bin.bo ||
411 (blend_shaders[i] & (0xffffffffull << 32)) ==
412 (fs->bin.gpu & (0xffffffffull << 32)));
413
414 pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
415 cfg.mode = MALI_BLEND_MODE_SHADER;
416 cfg.shader.pc = (u32) blend_shaders[i];
417
418 #if PAN_ARCH <= 7
419 unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
420 assert(!(ret_offset & 0x7));
421
422 cfg.shader.return_value = ret_offset ?
423 fs->bin.gpu + ret_offset : 0;
424 #endif
425 }
426 } else {
427 pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) {
428 cfg.mode = info.opaque ?
429 MALI_BLEND_MODE_OPAQUE :
430 MALI_BLEND_MODE_FIXED_FUNCTION;
431
432 /* If we want the conversion to work properly,
433 * num_comps must be set to 4
434 */
435 cfg.fixed_function.num_comps = 4;
436 cfg.fixed_function.conversion.memory_format =
437 panfrost_format_to_bifrost_blend(dev, format, dithered);
438 cfg.fixed_function.conversion.register_format =
439 fs->info.bifrost.blend[i].format;
440 cfg.fixed_function.rt = i;
441
442 #if PAN_ARCH <= 7
443 if (!info.opaque) {
444 cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
445 cfg.fixed_function.alpha_one_store = info.alpha_one_store;
446 }
447 #endif
448 }
449 }
450 #endif
451 }
452 }
453 #endif
454
455 static inline bool
456 pan_allow_forward_pixel_to_kill(struct panfrost_context *ctx, struct panfrost_shader_state *fs)
457 {
458 /* Track if any colour buffer is reused across draws, either
459 * from reading it directly, or from failing to write it
460 */
461 unsigned rt_mask = ctx->fb_rt_mask;
462 uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0);
463 bool blend_reads_dest = (ctx->blend->load_dest_mask & rt_mask);
464 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
465
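/* Forward pixel kill is only safe when the shader supports it, every
 * bound render target is fully written by the shader, blending never
 * reads the destination, and alpha-to-coverage cannot reduce coverage
 * after the fact. */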
466 return fs->info.fs.can_fpk &&
467 !(rt_mask & ~rt_written) &&
468 !alpha_to_coverage &&
469 !blend_reads_dest;
470 }
471
472 static mali_ptr
473 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
474 {
475 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
476
477 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
478 panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
479
480 return ss->state.gpu;
481 }
482
483 #if PAN_ARCH <= 7
484 /* Construct a partial RSD corresponding to no executed fragment shader, and
485 * merge with the existing partial RSD. */
486
487 static void
488 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
489 {
490 struct mali_renderer_state_packed empty_rsd;
491
492 pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
493 #if PAN_ARCH >= 6
494 cfg.properties.shader_modifies_coverage = true;
495 cfg.properties.allow_forward_pixel_to_kill = true;
496 cfg.properties.allow_forward_pixel_to_be_killed = true;
497 cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
498
499 /* Alpha isn't written so these are vacuous */
500 cfg.multisample_misc.overdraw_alpha0 = true;
501 cfg.multisample_misc.overdraw_alpha1 = true;
502 #else
503 cfg.shader.shader = 0x1;
504 cfg.properties.work_register_count = 1;
505 cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
506 cfg.properties.force_early_z = true;
507 #endif
508 }
509
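/* pan_merge ORs the packed words of empty_rsd into *rsd, combining this
 * "no fragment shader" state with the partially packed RSD built by the
 * caller. */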
510 pan_merge((*rsd), empty_rsd, RENDERER_STATE);
511 }
512
513 static void
514 panfrost_prepare_fs_state(struct panfrost_context *ctx,
515 mali_ptr *blend_shaders,
516 struct mali_renderer_state_packed *rsd)
517 {
518 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
519 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
520 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
521 struct panfrost_blend_state *so = ctx->blend;
522 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
523 bool msaa = rast->multisample;
524
525 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
526
527 bool has_blend_shader = false;
528
529 for (unsigned c = 0; c < rt_count; ++c)
530 has_blend_shader |= (blend_shaders[c] != 0);
531
532 bool has_oq = ctx->occlusion_query && ctx->active_queries;
533
534 pan_pack(rsd, RENDERER_STATE, cfg) {
535 if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
536 #if PAN_ARCH >= 6
537 struct pan_earlyzs_state earlyzs =
538 pan_earlyzs_get(fs->earlyzs,
539 ctx->depth_stencil->writes_zs ||
540 has_oq,
541 ctx->blend->base.alpha_to_coverage,
542 ctx->depth_stencil->zs_always_passes);
543
544 cfg.properties.pixel_kill_operation = earlyzs.kill;
545 cfg.properties.zs_update_operation = earlyzs.update;
546
547 cfg.properties.allow_forward_pixel_to_kill =
548 pan_allow_forward_pixel_to_kill(ctx, fs);
549 #else
550 cfg.properties.force_early_z =
551 fs->info.fs.can_early_z && !alpha_to_coverage &&
552 ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS);
553
554 /* TODO: Reduce this limit? */
555 if (has_blend_shader)
556 cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8);
557 else
558 cfg.properties.work_register_count = fs->info.work_reg_count;
559
560 /* Hardware quirks around early-zs forcing without a
561 * depth buffer. Note this breaks occlusion queries. */
562 bool force_ez_with_discard = !zsa->enabled && !has_oq;
563
564 cfg.properties.shader_reads_tilebuffer =
565 force_ez_with_discard && fs->info.fs.can_discard;
566 cfg.properties.shader_contains_discard =
567 !force_ez_with_discard && fs->info.fs.can_discard;
568 #endif
569 }
570
571 #if PAN_ARCH == 4
572 if (rt_count > 0) {
573 cfg.multisample_misc.load_destination = so->info[0].load_dest;
574 cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
575 cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour;
576 cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
577 cfg.stencil_mask_misc.dither_disable = !so->base.dither;
578 cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
579
580 if (blend_shaders[0]) {
581 cfg.blend_shader = blend_shaders[0];
582 } else {
583 cfg.blend_constant = pan_blend_get_constant(
584 so->info[0].constant_mask,
585 ctx->blend_color.color);
586 }
587 } else {
588 /* If there is no colour buffer, leaving the fields at their defaults is
589 * fine, except for the blend equation, which cannot be left null */
590 cfg.blend_equation.color_mask = 0xf;
591 cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
592 cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
593 cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
594 cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
595 cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
596 cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
597 }
598 #elif PAN_ARCH == 5
599 /* Workaround */
600 cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
601 #endif
602
603 cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
604
605 cfg.multisample_misc.evaluate_per_sample =
606 msaa && (ctx->min_samples > 1);
607
608 #if PAN_ARCH >= 6
609 /* MSAA blend shaders need to pass their sample ID to
610 * LD_TILE/ST_TILE, so we must preload it. Additionally, we
611 * need per-sample shading for the blend shader, accomplished
612 * by forcing per-sample shading for the whole program. */
613
614 if (msaa && has_blend_shader) {
615 cfg.multisample_misc.evaluate_per_sample = true;
616 cfg.preload.fragment.sample_mask_id = true;
617 }
618
619 /* Flip gl_PointCoord (and point sprites) depending on API
620 * setting on framebuffer orientation. We do not use
621 * lower_wpos_pntc on Bifrost.
622 */
623 cfg.properties.point_sprite_coord_origin_max_y =
624 (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
625
626 cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
627 cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
628 #endif
629
630 cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
631 cfg.depth_units = rast->offset_units * 2.0f;
632 cfg.depth_factor = rast->offset_scale;
633
634 bool back_enab = zsa->base.stencil[1].enabled;
635 cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
636 cfg.stencil_back.reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
637
638 #if PAN_ARCH <= 5
639 /* v6+ fits register preload here, no alpha testing */
640 cfg.alpha_reference = zsa->base.alpha_ref_value;
641 #endif
642 }
643 }
644
645 static void
646 panfrost_emit_frag_shader(struct panfrost_context *ctx,
647 struct mali_renderer_state_packed *fragmeta,
648 mali_ptr *blend_shaders)
649 {
650 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
651 const struct panfrost_rasterizer *rast = ctx->rasterizer;
652 struct panfrost_shader_state *fs =
653 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
654
655 /* We need to merge several partial renderer state descriptors, so
656 * stage to temporary storage rather than reading back write-combine
657 * memory, which would trash performance. */
658 struct mali_renderer_state_packed rsd;
659 panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
660
661 #if PAN_ARCH == 4
662 if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
663 /* Word 14: SFBD Blend Equation */
664 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
665 rsd.opaque[14] = ctx->blend->equation[0];
666 }
667 #endif
668
669 /* Merge with CSO state and upload */
670 if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
671 struct mali_renderer_state_packed *partial_rsd =
672 (struct mali_renderer_state_packed *)&fs->partial_rsd;
673 STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
674 pan_merge(rsd, *partial_rsd, RENDERER_STATE);
675 } else {
676 pan_merge_empty_fs(&rsd);
677 }
678
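/* ORing the raw opaque words below is valid because the CSO-time
 * templates and the fields packed above occupy disjoint bitfields of the
 * same descriptor words. */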
679 /* Word 8, 9 Misc state */
680 rsd.opaque[8] |= zsa->rsd_depth.opaque[0]
681 | rast->multisample.opaque[0];
682
683 rsd.opaque[9] |= zsa->rsd_stencil.opaque[0]
684 | rast->stencil_misc.opaque[0];
685
686 /* Word 10, 11 Stencil Front and Back */
687 rsd.opaque[10] |= zsa->stencil_front.opaque[0];
688 rsd.opaque[11] |= zsa->stencil_back.opaque[0];
689
690 memcpy(fragmeta, &rsd, sizeof(rsd));
691 }
692
693 static mali_ptr
694 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
695 {
696 struct panfrost_context *ctx = batch->ctx;
697 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
698
699 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
700
701 struct panfrost_ptr xfer;
702
703 #if PAN_ARCH == 4
704 xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
705 #else
706 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
707
708 xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base,
709 PAN_DESC(RENDERER_STATE),
710 PAN_DESC_ARRAY(rt_count, BLEND));
711 #endif
712
713 mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
714 panfrost_get_blend_shaders(batch, blend_shaders);
715
716 panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders);
717
718 #if PAN_ARCH >= 5
719 panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders);
720 #endif
721
722 return xfer.gpu;
723 }
724 #endif
725
726 static mali_ptr
727 panfrost_emit_viewport(struct panfrost_batch *batch)
728 {
729 struct panfrost_context *ctx = batch->ctx;
730 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
731 const struct pipe_scissor_state *ss = &ctx->scissor;
732 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
733
734 /* Derive min/max from translate/scale. Note since |x| >= 0 by
735 * definition, we have that -|x| <= |x| hence translate - |scale| <=
736 * translate + |scale|, so the ordering is correct here. */
737 float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
738 float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
739 float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
740 float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
741 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
742 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
743
744 /* Scissor to the intersection of the viewport and the scissor, clamped
745 * to the framebuffer */
746
747 unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
748 unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
749 unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
750 unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));
751
752 if (ss && rast->scissor) {
753 minx = MAX2(ss->minx, minx);
754 miny = MAX2(ss->miny, miny);
755 maxx = MIN2(ss->maxx, maxx);
756 maxy = MIN2(ss->maxy, maxy);
757 }
758
759 /* Set the range to [1, 1) so max values don't wrap round */
760 if (maxx == 0 || maxy == 0)
761 maxx = maxy = minx = miny = 1;
762
763 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
764 batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
765
766 /* Our [minx, maxx) and [miny, maxy) ranges are exclusive, but the hardware scissor maxima are inclusive, hence the decrement */
767 maxx--;
768 maxy--;
769
770 batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY;
771 batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY;
772
773 #if PAN_ARCH <= 7
774 struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
775
776 pan_pack(T.cpu, VIEWPORT, cfg) {
777 cfg.scissor_minimum_x = minx;
778 cfg.scissor_minimum_y = miny;
779 cfg.scissor_maximum_x = maxx;
780 cfg.scissor_maximum_y = maxy;
781
782 cfg.minimum_z = batch->minimum_z;
783 cfg.maximum_z = batch->maximum_z;
784 }
785
786 return T.gpu;
787 #else
788 pan_pack(&batch->scissor, SCISSOR, cfg) {
789 cfg.scissor_minimum_x = minx;
790 cfg.scissor_minimum_y = miny;
791 cfg.scissor_maximum_x = maxx;
792 cfg.scissor_maximum_y = maxy;
793 }
794
795 return 0;
796 #endif
797 }
798
799 #if PAN_ARCH >= 9
800 /**
801 * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
802 * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
803 * CSO create time. However, the stencil reference values and shader
804 * interactions are dynamic state. Pack only the dynamic state here and OR
805 * together.
806 */
807 static mali_ptr
808 panfrost_emit_depth_stencil(struct panfrost_batch *batch)
809 {
810 struct panfrost_context *ctx = batch->ctx;
811 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
812 struct panfrost_rasterizer *rast = ctx->rasterizer;
813 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
814 bool back_enab = zsa->base.stencil[1].enabled;
815
816 struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
817 struct mali_depth_stencil_packed dynamic;
818
819 pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
820 cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
821 cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
822
823 cfg.stencil_from_shader = fs->info.fs.writes_stencil;
824 cfg.depth_source = pan_depth_source(&fs->info);
825
826 cfg.depth_bias_enable = rast->base.offset_tri;
827 cfg.depth_units = rast->base.offset_units * 2.0f;
828 cfg.depth_factor = rast->base.offset_scale;
829 cfg.depth_bias_clamp = rast->base.offset_clamp;
830 }
831
832 pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
833 memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));
834
835 return T.gpu;
836 }
837
838 /**
839 * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
840 * with Bifrost, but the container data structure is simplified.
841 */
842 static mali_ptr
843 panfrost_emit_blend_valhall(struct panfrost_batch *batch)
844 {
845 unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);
846
847 struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);
848
849 mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
850 panfrost_get_blend_shaders(batch, blend_shaders);
851
852 panfrost_emit_blend(batch, T.cpu, blend_shaders);
853
854 /* Precalculate for the per-draw path */
855 bool has_blend_shader = false;
856
857 for (unsigned i = 0; i < rt_count; ++i)
858 has_blend_shader |= !!blend_shaders[i];
859
860 batch->ctx->valhall_has_blend_shader = has_blend_shader;
861
862 return T.gpu;
863 }
864
865 /**
866 * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
867 */
868 static mali_ptr
869 panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
870 {
871 struct panfrost_context *ctx = batch->ctx;
872 unsigned buffer_count = util_last_bit(ctx->vb_mask);
873 struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
874 buffer_count, BUFFER);
875 struct mali_buffer_packed *buffers = T.cpu;
876
877 u_foreach_bit(i, ctx->vb_mask) {
878 struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
879 struct pipe_resource *prsrc = vb.buffer.resource;
880 struct panfrost_resource *rsrc = pan_resource(prsrc);
881 assert(!vb.is_user_buffer);
882
883 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
884
885 pan_pack(buffers + i, BUFFER, cfg) {
886 cfg.address = rsrc->image.data.bo->ptr.gpu +
887 vb.buffer_offset;
888
889 cfg.size = prsrc->width0 - vb.buffer_offset;
890 }
891 }
892
893 return T.gpu;
894 }
895
896 /**
897 * Emit Valhall attribute descriptors and associated (vertex) buffer
898 * descriptors at draw-time. The attribute descriptors are packed at CSO create
899 * time except for the stride field, which is merged in at draw time. The buffer
900 * descriptors are packed here, though that could be moved into panfrost_set_vertex_buffers if needed.
901 */
902 static mali_ptr
903 panfrost_emit_vertex_data(struct panfrost_batch *batch)
904 {
905 struct panfrost_context *ctx = batch->ctx;
906 struct panfrost_vertex_state *vtx = ctx->vertex;
907 struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
908 vtx->num_elements,
909 ATTRIBUTE);
910 struct mali_attribute_packed *attributes = T.cpu;
911
912 for (unsigned i = 0; i < vtx->num_elements; ++i) {
913 struct mali_attribute_packed packed;
914 unsigned vbi = vtx->pipe[i].vertex_buffer_index;
915
916 pan_pack(&packed, ATTRIBUTE, cfg) {
917 cfg.stride = ctx->vertex_buffers[vbi].stride;
918 }
919
920 pan_merge(packed, vtx->attributes[i], ATTRIBUTE);
921 attributes[i] = packed;
922 }
923
924 return T.gpu;
925 }
926
927 /*
928 * Emit Valhall descriptors for shader images. Unlike previous generations,
929 * Valhall does not have a special descriptor for images. Standard texture
930 * descriptors are used. The binding is different in Gallium, however, so we
931 * translate.
932 */
933 static struct pipe_sampler_view
934 panfrost_pipe_image_to_sampler_view(struct pipe_image_view *v)
935 {
936 struct pipe_sampler_view out = {
937 .format = v->format,
938 .texture = v->resource,
939 .target = v->resource->target,
940 .swizzle_r = PIPE_SWIZZLE_X,
941 .swizzle_g = PIPE_SWIZZLE_Y,
942 .swizzle_b = PIPE_SWIZZLE_Z,
943 .swizzle_a = PIPE_SWIZZLE_W
944 };
945
946 if (out.target == PIPE_BUFFER) {
947 out.u.buf.offset = v->u.buf.offset;
948 out.u.buf.size = v->u.buf.size;
949 } else {
950 out.u.tex.first_layer = v->u.tex.first_layer;
951 out.u.tex.last_layer = v->u.tex.last_layer;
952
953 /* Single level only */
954 out.u.tex.first_level = v->u.tex.level;
955 out.u.tex.last_level = v->u.tex.level;
956 }
957
958 return out;
959 }
960
961 static void
962 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
963 struct pipe_context *pctx);
964
965 static mali_ptr
966 panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
967 {
968 struct panfrost_context *ctx = batch->ctx;
969 unsigned last_bit = util_last_bit(ctx->image_mask[stage]);
970
971 struct panfrost_ptr T =
972 pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);
973
974 struct mali_texture_packed *out = (struct mali_texture_packed *) T.cpu;
975
976 for (int i = 0; i < last_bit; ++i) {
977 struct pipe_image_view *image = &ctx->images[stage][i];
978
979 if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
980 memset(&out[i], 0, sizeof(out[i]));
981 continue;
982 }
983
984 /* Construct a synthetic sampler view so we can use our usual
985 * sampler view code for the actual descriptor packing.
986 *
987 * Use the batch pool for a transient allocation, rather than
988 * allocating a long-lived descriptor.
989 */
990 struct panfrost_sampler_view view = {
991 .base = panfrost_pipe_image_to_sampler_view(image),
992 .pool = &batch->pool
993 };
994
995 /* If we specify a cube map, the hardware internally treats it as
996 * a 2D array. Since cube maps as images can confuse our common
997 * texturing code, explicitly use a 2D array.
998 *
999 * Similar concerns apply to 3D textures.
1000 */
1001 if (view.base.target == PIPE_BUFFER)
1002 view.base.target = PIPE_BUFFER;
1003 else
1004 view.base.target = PIPE_TEXTURE_2D_ARRAY;
1005
1006 panfrost_update_sampler_view(&view, &ctx->base);
1007 out[i] = view.bifrost_descriptor;
1008
1009 panfrost_track_image_access(batch, stage, image);
1010 }
1011
1012 return T.gpu;
1013 }
1014 #endif
1015
1016 static mali_ptr
1017 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1018 enum pipe_shader_type st,
1019 struct panfrost_constant_buffer *buf,
1020 unsigned index)
1021 {
1022 struct pipe_constant_buffer *cb = &buf->cb[index];
1023 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1024
1025 if (rsrc) {
1026 panfrost_batch_read_rsrc(batch, rsrc, st);
1027
1028 /* Alignment guaranteed by
1029 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1030 return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
1031 } else if (cb->user_buffer) {
1032 return pan_pool_upload_aligned(&batch->pool.base,
1033 cb->user_buffer +
1034 cb->buffer_offset,
1035 cb->buffer_size, 16);
1036 } else {
1037 unreachable("No constant buffer");
1038 }
1039 }
1040
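/* Each sysval occupies one 16-byte (vec4) uniform slot, viewed as floats,
 * 32-bit integers, or a pair of 64-bit values as required. */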
1041 struct sysval_uniform {
1042 union {
1043 float f[4];
1044 int32_t i[4];
1045 uint32_t u[4];
1046 uint64_t du[2];
1047 };
1048 };
1049
1050 static void
1051 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1052 struct sysval_uniform *uniform)
1053 {
1054 struct panfrost_context *ctx = batch->ctx;
1055 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1056
1057 uniform->f[0] = vp->scale[0];
1058 uniform->f[1] = vp->scale[1];
1059 uniform->f[2] = vp->scale[2];
1060 }
1061
1062 static void
1063 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1064 struct sysval_uniform *uniform)
1065 {
1066 struct panfrost_context *ctx = batch->ctx;
1067 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1068
1069 uniform->f[0] = vp->translate[0];
1070 uniform->f[1] = vp->translate[1];
1071 uniform->f[2] = vp->translate[2];
1072 }
1073
1074 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1075 enum pipe_shader_type st,
1076 unsigned int sysvalid,
1077 struct sysval_uniform *uniform)
1078 {
1079 struct panfrost_context *ctx = batch->ctx;
1080 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1081 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1082 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1083 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1084
1085 assert(dim);
1086
1087 if (tex->target == PIPE_BUFFER) {
1088 assert(dim == 1);
1089 uniform->i[0] =
1090 tex->u.buf.size / util_format_get_blocksize(tex->format);
1091 return;
1092 }
1093
1094 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1095
1096 if (dim > 1)
1097 uniform->i[1] = u_minify(tex->texture->height0,
1098 tex->u.tex.first_level);
1099
1100 if (dim > 2)
1101 uniform->i[2] = u_minify(tex->texture->depth0,
1102 tex->u.tex.first_level);
1103
1104 if (is_array) {
1105 unsigned size = tex->texture->array_size;
1106
1107 /* Internally, we store the number of 2D images (faces * array
1108 * size). Externally, we report the array size in terms of
1109 * complete cubes. So divide by the # of faces per cube.
1110 */
1111 if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
1112 size /= 6;
1113
1114 uniform->i[dim] = size;
1115 }
1116 }
1117
1118 static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
1119 enum pipe_shader_type st,
1120 unsigned int sysvalid,
1121 struct sysval_uniform *uniform)
1122 {
1123 struct panfrost_context *ctx = batch->ctx;
1124 unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1125 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1126 unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1127
1128 assert(dim && dim < 4);
1129
1130 struct pipe_image_view *image = &ctx->images[st][idx];
1131
1132 if (image->resource->target == PIPE_BUFFER) {
1133 unsigned blocksize = util_format_get_blocksize(image->format);
1134 uniform->i[0] = image->resource->width0 / blocksize;
1135 return;
1136 }
1137
1138 uniform->i[0] = u_minify(image->resource->width0,
1139 image->u.tex.level);
1140
1141 if (dim > 1)
1142 uniform->i[1] = u_minify(image->resource->height0,
1143 image->u.tex.level);
1144
1145 if (dim > 2)
1146 uniform->i[2] = u_minify(image->resource->depth0,
1147 image->u.tex.level);
1148
1149 if (is_array)
1150 uniform->i[dim] = image->resource->array_size;
1151 }
1152
1153 static void
1154 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1155 enum pipe_shader_type st,
1156 unsigned ssbo_id,
1157 struct sysval_uniform *uniform)
1158 {
1159 struct panfrost_context *ctx = batch->ctx;
1160
1161 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1162 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1163
1164 /* Compute address */
1165 struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1166 struct panfrost_bo *bo = rsrc->image.data.bo;
1167
1168 panfrost_batch_write_rsrc(batch, rsrc, st);
1169
1170 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1171 sb.buffer_offset, sb.buffer_size);
1172
1173 /* Upload address and size as sysval */
1174 uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
1175 uniform->u[2] = sb.buffer_size;
1176 }
1177
1178 static void
1179 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1180 enum pipe_shader_type st,
1181 unsigned samp_idx,
1182 struct sysval_uniform *uniform)
1183 {
1184 struct panfrost_context *ctx = batch->ctx;
1185 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1186
1187 uniform->f[0] = sampl->min_lod;
1188 uniform->f[1] = sampl->max_lod;
1189 uniform->f[2] = sampl->lod_bias;
1190
1191 /* Even without any errata, Midgard represents "no mipmapping" as
1192 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1193 * panfrost_create_sampler_state which also explains our choice of
1194 * epsilon value (again to keep behaviour consistent) */
1195
1196 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1197 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1198 }
1199
1200 static void
1201 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1202 struct sysval_uniform *uniform)
1203 {
1204 struct panfrost_context *ctx = batch->ctx;
1205
1206 uniform->u[0] = ctx->compute_grid->grid[0];
1207 uniform->u[1] = ctx->compute_grid->grid[1];
1208 uniform->u[2] = ctx->compute_grid->grid[2];
1209 }
1210
1211 static void
1212 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
1213 struct sysval_uniform *uniform)
1214 {
1215 struct panfrost_context *ctx = batch->ctx;
1216
1217 uniform->u[0] = ctx->compute_grid->block[0];
1218 uniform->u[1] = ctx->compute_grid->block[1];
1219 uniform->u[2] = ctx->compute_grid->block[2];
1220 }
1221
1222 static void
1223 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
1224 struct sysval_uniform *uniform)
1225 {
1226 struct panfrost_context *ctx = batch->ctx;
1227
1228 uniform->u[0] = ctx->compute_grid->work_dim;
1229 }
1230
1231 /* Sample positions are pushed in a Bifrost-specific format on Bifrost. On
1232 * Midgard, we emulate the Bifrost path with some extra arithmetic in the
1233 * shader, to keep the code as unified as possible. */
1234
1235 static void
1236 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
1237 struct sysval_uniform *uniform)
1238 {
1239 struct panfrost_context *ctx = batch->ctx;
1240 struct panfrost_device *dev = pan_device(ctx->base.screen);
1241
1242 unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1243 uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples));
1244 }
1245
1246 static void
1247 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1248 struct sysval_uniform *uniform)
1249 {
1250 unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1251 uniform->u[0] = samples > 1;
1252 }
1253
1254 #if PAN_ARCH >= 6
1255 static void
1256 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1257 unsigned size_and_rt, struct sysval_uniform *uniform)
1258 {
1259 struct panfrost_context *ctx = batch->ctx;
1260 struct panfrost_device *dev = pan_device(ctx->base.screen);
1261 unsigned rt = size_and_rt & 0xF;
1262 unsigned size = size_and_rt >> 4;
1263
1264 if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1265 enum pipe_format format = batch->key.cbufs[rt]->format;
1266 uniform->u[0] =
1267 GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32;
1268 } else {
1269 pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
1270 cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw;
1271 }
1272 }
1273 #endif
1274
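/* The stream output target's offset counts vertices already streamed out,
 * so the byte offset is that count times the stride plus the buffer
 * binding offset. */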
1275 static unsigned
1276 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1277 {
1278 return target->buffer_offset + (pan_so_target(target)->offset * stride);
1279 }
1280
1281 static void
1282 panfrost_upload_sysvals(struct panfrost_batch *batch,
1283 const struct panfrost_ptr *ptr,
1284 struct panfrost_shader_state *ss,
1285 enum pipe_shader_type st)
1286 {
1287 struct sysval_uniform *uniforms = ptr->cpu;
1288
1289 for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) {
1290 int sysval = ss->info.sysvals.sysvals[i];
1291
1292 switch (PAN_SYSVAL_TYPE(sysval)) {
1293 case PAN_SYSVAL_VIEWPORT_SCALE:
1294 panfrost_upload_viewport_scale_sysval(batch,
1295 &uniforms[i]);
1296 break;
1297 case PAN_SYSVAL_VIEWPORT_OFFSET:
1298 panfrost_upload_viewport_offset_sysval(batch,
1299 &uniforms[i]);
1300 break;
1301 case PAN_SYSVAL_TEXTURE_SIZE:
1302 panfrost_upload_txs_sysval(batch, st,
1303 PAN_SYSVAL_ID(sysval),
1304 &uniforms[i]);
1305 break;
1306 case PAN_SYSVAL_SSBO:
1307 panfrost_upload_ssbo_sysval(batch, st,
1308 PAN_SYSVAL_ID(sysval),
1309 &uniforms[i]);
1310 break;
1311
1312 case PAN_SYSVAL_XFB:
1313 {
1314 unsigned buf = PAN_SYSVAL_ID(sysval);
1315 struct panfrost_shader_state *vs =
1316 panfrost_get_shader_state(batch->ctx, PIPE_SHADER_VERTEX);
1317 struct pipe_stream_output_info *so = &vs->stream_output;
1318 unsigned stride = so->stride[buf] * 4;
1319
1320 struct pipe_stream_output_target *target = NULL;
1321 if (buf < batch->ctx->streamout.num_targets)
1322 target = batch->ctx->streamout.targets[buf];
1323
1324 if (!target) {
1325 /* Memory sink */
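/* (Writes to this address range are assumed to be discarded
 * by the GPU, so unbound XFB output simply goes nowhere.) */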
1326 uniforms[i].du[0] = 0x8ull << 60;
1327 break;
1328 }
1329
1330 struct panfrost_resource *rsrc = pan_resource(target->buffer);
1331 unsigned offset = panfrost_xfb_offset(stride, target);
1332
1333 util_range_add(&rsrc->base, &rsrc->valid_buffer_range,
1334 offset, target->buffer_size - offset);
1335
1336 panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1337
1338 uniforms[i].du[0] = rsrc->image.data.bo->ptr.gpu + offset;
1339 break;
1340 }
1341
1342 case PAN_SYSVAL_NUM_VERTICES:
1343 uniforms[i].u[0] = batch->ctx->vertex_count;
1344 break;
1345
1346 case PAN_SYSVAL_NUM_WORK_GROUPS:
1347 for (unsigned j = 0; j < 3; j++) {
1348 batch->num_wg_sysval[j] =
1349 ptr->gpu + (i * sizeof(*uniforms)) + (j * 4);
1350 }
1351 panfrost_upload_num_work_groups_sysval(batch,
1352 &uniforms[i]);
1353 break;
1354 case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1355 panfrost_upload_local_group_size_sysval(batch,
1356 &uniforms[i]);
1357 break;
1358 case PAN_SYSVAL_WORK_DIM:
1359 panfrost_upload_work_dim_sysval(batch,
1360 &uniforms[i]);
1361 break;
1362 case PAN_SYSVAL_SAMPLER:
1363 panfrost_upload_sampler_sysval(batch, st,
1364 PAN_SYSVAL_ID(sysval),
1365 &uniforms[i]);
1366 break;
1367 case PAN_SYSVAL_IMAGE_SIZE:
1368 panfrost_upload_image_size_sysval(batch, st,
1369 PAN_SYSVAL_ID(sysval),
1370 &uniforms[i]);
1371 break;
1372 case PAN_SYSVAL_SAMPLE_POSITIONS:
1373 panfrost_upload_sample_positions_sysval(batch,
1374 &uniforms[i]);
1375 break;
1376 case PAN_SYSVAL_MULTISAMPLED:
1377 panfrost_upload_multisampled_sysval(batch,
1378 &uniforms[i]);
1379 break;
1380 #if PAN_ARCH >= 6
1381 case PAN_SYSVAL_RT_CONVERSION:
1382 panfrost_upload_rt_conversion_sysval(batch,
1383 PAN_SYSVAL_ID(sysval), &uniforms[i]);
1384 break;
1385 #endif
1386 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1387 batch->ctx->first_vertex_sysval_ptr =
1388 ptr->gpu + (i * sizeof(*uniforms));
1389 batch->ctx->base_vertex_sysval_ptr =
1390 batch->ctx->first_vertex_sysval_ptr + 4;
1391 batch->ctx->base_instance_sysval_ptr =
1392 batch->ctx->first_vertex_sysval_ptr + 8;
1393
1394 uniforms[i].u[0] = batch->ctx->offset_start;
1395 uniforms[i].u[1] = batch->ctx->base_vertex;
1396 uniforms[i].u[2] = batch->ctx->base_instance;
1397 break;
1398 case PAN_SYSVAL_DRAWID:
1399 uniforms[i].u[0] = batch->ctx->drawid;
1400 break;
1401 default:
1402 assert(0);
1403 }
1404 }
1405 }
1406
1407 static const void *
1408 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1409 struct panfrost_constant_buffer *buf,
1410 unsigned index)
1411 {
1412 struct pipe_constant_buffer *cb = &buf->cb[index];
1413 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1414
1415 if (rsrc) {
1416 panfrost_bo_mmap(rsrc->image.data.bo);
1417 panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1418 panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);
1419
1420 return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
1421 } else if (cb->user_buffer) {
1422 return cb->user_buffer + cb->buffer_offset;
1423 } else
1424 unreachable("No constant buffer");
1425 }
1426
1427 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
1428 * implemented with buffer descriptors in the resource table, sized in terms of
1429 * bytes. On Bifrost and older, UBOs have a dedicated uniform buffer
1430 * descriptor, sized in terms of 16-byte entries.
1431 */
1432 static void
1433 panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size)
1434 {
1435 #if PAN_ARCH >= 9
1436 struct mali_buffer_packed *out = base;
1437
1438 pan_pack(out + index, BUFFER, cfg) {
1439 cfg.size = size;
1440 cfg.address = address;
1441 }
1442 #else
1443 struct mali_uniform_buffer_packed *out = base;
1444
1445 /* Issue (57) for the ARB_uniform_buffer_object spec says that
1446 * the buffer can be larger than the uniform data inside it,
1447 * so clamp ubo size to what hardware supports. */
1448
1449 pan_pack(out + index, UNIFORM_BUFFER, cfg) {
1450 cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
1451 cfg.pointer = address;
1452 }
1453 #endif
1454 }
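/* Example: a 4 KiB UBO on Bifrost packs as DIV_ROUND_UP(4096, 16) = 256
 * sixteen-byte entries, with the entry count clamped to 1 << 12 (64 KiB);
 * on Valhall the size is simply stored in bytes. */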
1455
1456 static mali_ptr
1457 panfrost_emit_const_buf(struct panfrost_batch *batch,
1458 enum pipe_shader_type stage,
1459 unsigned *buffer_count,
1460 mali_ptr *push_constants,
1461 unsigned *pushed_words)
1462 {
1463 struct panfrost_context *ctx = batch->ctx;
1464 struct panfrost_shader_variants *all = ctx->shader[stage];
1465
1466 if (!all)
1467 return 0;
1468
1469 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1470 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1471
1472 /* Allocate room for the sysval and the uniforms */
1473 size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
1474 struct panfrost_ptr transfer =
1475 pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1476
1477 /* Upload sysvals requested by the shader */
1478 panfrost_upload_sysvals(batch, &transfer, ss, stage);
1479
1480 /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1481 struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
1482 unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1483 unsigned sysval_ubo = sys_size ? ubo_count : ~0;
1484 struct panfrost_ptr ubos = { 0 };
1485
1486 #if PAN_ARCH >= 9
1487 ubos = pan_pool_alloc_desc_array(&batch->pool.base,
1488 ubo_count + 1,
1489 BUFFER);
1490 #else
1491 ubos = pan_pool_alloc_desc_array(&batch->pool.base,
1492 ubo_count + 1,
1493 UNIFORM_BUFFER);
1494 #endif
1495
1496 if (buffer_count)
1497 *buffer_count = ubo_count + (sys_size ? 1 : 0);
1498
1499 /* Upload sysval as a final UBO */
1500
1501 if (sys_size)
1502 panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1503
1504 /* The rest are honest-to-goodness UBOs */
1505
1506 u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1507 size_t usz = buf->cb[ubo].buffer_size;
1508 mali_ptr address = 0;
1509
1510 if (usz > 0) {
1511 address = panfrost_map_constant_buffer_gpu(batch,
1512 stage, buf, ubo);
1513 }
1514
1515 panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1516 }
1517
1518 if (pushed_words)
1519 *pushed_words = ss->info.push.count;
1520
1521 if (ss->info.push.count == 0)
1522 return ubos.gpu;
1523
1524 /* Copy push constants required by the shader */
1525 struct panfrost_ptr push_transfer =
1526 pan_pool_alloc_aligned(&batch->pool.base,
1527 ss->info.push.count * 4, 16);
1528
1529 uint32_t *push_cpu = (uint32_t *) push_transfer.cpu;
1530 *push_constants = push_transfer.gpu;
1531
1532 for (unsigned i = 0; i < ss->info.push.count; ++i) {
1533 struct panfrost_ubo_word src = ss->info.push.words[i];
1534
1535 if (src.ubo == sysval_ubo) {
1536 unsigned sysval_idx = src.offset / 16;
1537 unsigned sysval_comp = (src.offset % 16) / 4;
1538 unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]);
1539 mali_ptr ptr = push_transfer.gpu + (4 * i);
1540
1541 switch (sysval_type) {
1542 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1543 switch (sysval_comp) {
1544 case 0:
1545 batch->ctx->first_vertex_sysval_ptr = ptr;
1546 break;
1547 case 1:
1548 batch->ctx->base_vertex_sysval_ptr = ptr;
1549 break;
1550 case 2:
1551 batch->ctx->base_instance_sysval_ptr = ptr;
1552 break;
1553 case 3:
1554 /* Spurious (Midgard doesn't pack) */
1555 break;
1556 default:
1557 unreachable("Invalid vertex/instance offset component\n");
1558 }
1559 break;
1560
1561 case PAN_SYSVAL_NUM_WORK_GROUPS:
1562 batch->num_wg_sysval[sysval_comp] = ptr;
1563 break;
1564
1565 default:
1566 break;
1567 }
1568 }
1569 /* Map the UBO; this should be cheap. However, this is reading
1570 * from write-combine memory which is _very_ slow. It might pay
1571 * off to upload sysvals to a staging buffer on the CPU on the
1572 * assumption sysvals will get pushed (TODO) */
1573
1574 const void *mapped_ubo = (src.ubo == sysval_ubo) ? transfer.cpu :
1575 panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1576
1577 /* TODO: Is there any benefit to combining ranges */
1578 memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4);
1579 }
1580
1581 return ubos.gpu;
1582 }
1583
1584 static mali_ptr
1585 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1586 const struct pipe_grid_info *grid)
1587 {
1588 struct panfrost_context *ctx = batch->ctx;
1589 struct panfrost_device *dev = pan_device(ctx->base.screen);
1590 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1591 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1592 struct panfrost_ptr t =
1593 pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1594
1595 struct pan_tls_info info = {
1596 .tls.size = ss->info.tls_size,
1597 .wls.size = ss->info.wls_size,
1598 .wls.dim.x = grid->grid[0],
1599 .wls.dim.y = grid->grid[1],
1600 .wls.dim.z = grid->grid[2],
1601 };
1602
1603 if (ss->info.tls_size) {
1604 struct panfrost_bo *bo =
1605 panfrost_batch_get_scratchpad(batch,
1606 ss->info.tls_size,
1607 dev->thread_tls_alloc,
1608 dev->core_id_range);
1609 info.tls.ptr = bo->ptr.gpu;
1610 }
1611
1612 if (ss->info.wls_size) {
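/* Workgroup-local storage is presumably needed by every concurrent
 * workgroup on every core, so scale the per-workgroup size by the
 * instance count and the core count. */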
1613 unsigned size =
1614 pan_wls_adjust_size(info.wls.size) *
1615 pan_wls_instances(&info.wls.dim) *
1616 dev->core_id_range;
1617
1618 struct panfrost_bo *bo =
1619 panfrost_batch_get_shared_memory(batch, size, 1);
1620
1621 info.wls.ptr = bo->ptr.gpu;
1622 }
1623
1624 GENX(pan_emit_tls)(&info, t.cpu);
1625 return t.gpu;
1626 }
1627
1628 #if PAN_ARCH <= 5
1629 static mali_ptr
1630 panfrost_get_tex_desc(struct panfrost_batch *batch,
1631 enum pipe_shader_type st,
1632 struct panfrost_sampler_view *view)
1633 {
1634 if (!view)
1635 return (mali_ptr) 0;
1636
1637 struct pipe_sampler_view *pview = &view->base;
1638 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1639
1640 panfrost_batch_read_rsrc(batch, rsrc, st);
1641 panfrost_batch_add_bo(batch, view->state.bo, st);
1642
1643 return view->state.gpu;
1644 }
1645 #endif
1646
1647 static void
1648 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1649 struct pipe_context *pctx,
1650 struct pipe_resource *texture)
1651 {
1652 struct panfrost_device *device = pan_device(pctx->screen);
1653 struct panfrost_context *ctx = pan_context(pctx);
1654 struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1655 enum pipe_format format = so->base.format;
1656 assert(prsrc->image.data.bo);
1657
1658 /* Format to access the stencil/depth portion of a Z32_S8 texture */
1659 if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1660 assert(prsrc->separate_stencil);
1661 texture = &prsrc->separate_stencil->base;
1662 prsrc = (struct panfrost_resource *)texture;
1663 format = texture->format;
1664 } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1665 format = PIPE_FORMAT_Z32_FLOAT;
1666 }
1667
1668 const struct util_format_description *desc = util_format_description(format);
1669
1670 bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM);
1671
1672 if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) {
1673 if (desc->is_snorm)
1674 format = PIPE_FORMAT_R8G8B8A8_SNORM;
1675 else
1676 format = PIPE_FORMAT_R8G8B8A8_UNORM;
1677 desc = util_format_description(format);
1678 }
1679
1680 so->texture_bo = prsrc->image.data.bo->ptr.gpu;
1681 so->modifier = prsrc->image.layout.modifier;
1682
1683 /* MSAA only supported for 2D textures */
1684
1685 assert(texture->nr_samples <= 1 ||
1686 so->base.target == PIPE_TEXTURE_2D ||
1687 so->base.target == PIPE_TEXTURE_2D_ARRAY);
1688
1689 enum mali_texture_dimension type =
1690 panfrost_translate_texture_dimension(so->base.target);
1691
1692 bool is_buffer = (so->base.target == PIPE_BUFFER);
1693
1694 unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1695 unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1696 unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1697 unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1698 unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
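/* The Gallium buffer view size is in bytes; dividing by the texel
 * block size yields the element count used by the image view. */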
1699 unsigned buf_size = (is_buffer ? so->base.u.buf.size : 0) /
1700 util_format_get_blocksize(format);
1701
1702 if (so->base.target == PIPE_TEXTURE_3D) {
1703 first_layer /= prsrc->image.layout.depth;
1704 last_layer /= prsrc->image.layout.depth;
1705 assert(!first_layer && !last_layer);
1706 }
1707
1708 struct pan_image_view iview = {
1709 .format = format,
1710 .dim = type,
1711 .first_level = first_level,
1712 .last_level = last_level,
1713 .first_layer = first_layer,
1714 .last_layer = last_layer,
1715 .swizzle = {
1716 so->base.swizzle_r,
1717 so->base.swizzle_g,
1718 so->base.swizzle_b,
1719 so->base.swizzle_a,
1720 },
1721 .image = &prsrc->image,
1722
1723 .buf.offset = buf_offset,
1724 .buf.size = buf_size,
1725 };
1726
1727 unsigned size =
1728 (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1729 GENX(panfrost_estimate_texture_payload_size)(&iview);
1730
1731 struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1732 struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1733 so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1734
1735 void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1736
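/* On Midgard (v5 and older) the texture descriptor itself sits at the
 * start of the allocation with the payload following it, so step past
 * it; on Bifrost and newer the descriptor lives in the CSO instead. */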
1737 if (PAN_ARCH <= 5) {
1738 payload.cpu += pan_size(TEXTURE);
1739 payload.gpu += pan_size(TEXTURE);
1740 }
1741
1742 GENX(panfrost_new_texture)(device, &iview, tex, &payload);
1743 }
1744
1745 static void
1746 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1747 struct pipe_context *pctx)
1748 {
1749 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1750 if (view->texture_bo != rsrc->image.data.bo->ptr.gpu ||
1751 view->modifier != rsrc->image.layout.modifier) {
1752 panfrost_bo_unreference(view->state.bo);
1753 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1754 }
1755 }
1756
1757 static mali_ptr
1758 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1759 enum pipe_shader_type stage)
1760 {
1761 struct panfrost_context *ctx = batch->ctx;
1762
1763 if (!ctx->sampler_view_count[stage])
1764 return 0;
1765
1766 #if PAN_ARCH >= 6
1767 struct panfrost_ptr T =
1768 pan_pool_alloc_desc_array(&batch->pool.base,
1769 ctx->sampler_view_count[stage],
1770 TEXTURE);
1771 struct mali_texture_packed *out =
1772 (struct mali_texture_packed *) T.cpu;
1773
1774 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1775 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1776
1777 if (!view) {
1778 memset(&out[i], 0, sizeof(out[i]));
1779 continue;
1780 }
1781
1782 struct pipe_sampler_view *pview = &view->base;
1783 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1784
1785 panfrost_update_sampler_view(view, &ctx->base);
1786 out[i] = view->bifrost_descriptor;
1787
1788 panfrost_batch_read_rsrc(batch, rsrc, stage);
1789 panfrost_batch_add_bo(batch, view->state.bo, stage);
1790 }
1791
1792 return T.gpu;
1793 #else
1794 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1795
1796 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1797 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1798
1799 if (!view) {
1800 trampolines[i] = 0;
1801 continue;
1802 }
1803
1804 panfrost_update_sampler_view(view, &ctx->base);
1805
1806 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1807 }
1808
1809 return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1810 sizeof(uint64_t) *
1811 ctx->sampler_view_count[stage],
1812 sizeof(uint64_t));
1813 #endif
1814 }
1815
1816 static mali_ptr
1817 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1818 enum pipe_shader_type stage)
1819 {
1820 struct panfrost_context *ctx = batch->ctx;
1821
1822 if (!ctx->sampler_count[stage])
1823 return 0;
1824
1825 struct panfrost_ptr T =
1826 pan_pool_alloc_desc_array(&batch->pool.base,
1827 ctx->sampler_count[stage],
1828 SAMPLER);
1829 struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu;
1830
1831 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1832 struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1833
1834 out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1835 }
1836
1837 return T.gpu;
1838 }
1839
1840 #if PAN_ARCH <= 7
1841 /* Packs all image attribute descs and attribute buffer descs.
1842 * `first_buf` must be the index of the first image attribute buffer descriptor.
1843 */
1844 static void
1845 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1846 struct mali_attribute_packed *attribs, unsigned first_buf)
1847 {
1848 struct panfrost_device *dev = pan_device(ctx->base.screen);
1849 unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1850
1851 for (unsigned i = 0; i < last_bit; ++i) {
1852 enum pipe_format format = ctx->images[shader][i].format;
1853
1854 pan_pack(attribs + i, ATTRIBUTE, cfg) {
1855 /* Continuation record means 2 buffers per image */
1856 cfg.buffer_index = first_buf + (i * 2);
1857 cfg.offset_enable = (PAN_ARCH <= 5);
1858 cfg.format = dev->formats[format].hw;
1859 }
1860 }
1861 }
1862
1863 static enum mali_attribute_type
1864 pan_modifier_to_attr_type(uint64_t modifier)
1865 {
1866 switch (modifier) {
1867 case DRM_FORMAT_MOD_LINEAR:
1868 return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1869 case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1870 return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1871 default:
1872 unreachable("Invalid modifier for attribute record");
1873 }
1874 }
1875
1876 static void
1877 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1878 struct mali_attribute_buffer_packed *bufs,
1879 unsigned first_image_buf_index)
1880 {
1881 struct panfrost_context *ctx = batch->ctx;
1882 unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1883
1884 for (unsigned i = 0; i < last_bit; ++i) {
1885 struct pipe_image_view *image = &ctx->images[shader][i];
1886
1887 if (!(ctx->image_mask[shader] & (1 << i)) ||
1888 !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1889 /* Unused image bindings */
1890 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg);
1891 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg);
1892 continue;
1893 }
1894
1895 struct panfrost_resource *rsrc = pan_resource(image->resource);
1896
1897 /* TODO: MSAA */
1898 assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported");
1899
1900 bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1901 bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1902
1903 unsigned offset = is_buffer ? image->u.buf.offset :
1904 panfrost_texture_offset(&rsrc->image.layout,
1905 image->u.tex.level,
1906 is_3d ? 0 : image->u.tex.first_layer,
1907 is_3d ? image->u.tex.first_layer : 0);
1908
1909 panfrost_track_image_access(batch, shader, image);
1910
1911 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1912 cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1913 cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset;
1914 cfg.stride = util_format_get_blocksize(image->format);
1915 cfg.size = rsrc->image.data.bo->size - offset;
1916 }
1917
1918 if (is_buffer) {
1919 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1920 cfg.s_dimension = rsrc->base.width0 /
1921 util_format_get_blocksize(image->format);
1922 cfg.t_dimension = cfg.r_dimension = 1;
1923 }
1924
1925 continue;
1926 }
1927
1928 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) {
1929 unsigned level = image->u.tex.level;
1930
1931 cfg.s_dimension = u_minify(rsrc->base.width0, level);
1932 cfg.t_dimension = u_minify(rsrc->base.height0, level);
1933 cfg.r_dimension = is_3d ?
1934 u_minify(rsrc->base.depth0, level) :
1935 image->u.tex.last_layer - image->u.tex.first_layer + 1;
1936
1937 cfg.row_stride =
1938 rsrc->image.layout.slices[level].row_stride;
1939
1940 if (rsrc->base.target != PIPE_TEXTURE_2D) {
1941 cfg.slice_stride =
1942 panfrost_get_layer_stride(&rsrc->image.layout,
1943 level);
1944 }
1945 }
1946 }
1947 }
1948
1949 static mali_ptr
1950 panfrost_emit_image_attribs(struct panfrost_batch *batch,
1951 mali_ptr *buffers,
1952 enum pipe_shader_type type)
1953 {
1954 struct panfrost_context *ctx = batch->ctx;
1955 struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);
1956
1957 if (!shader->info.attribute_count) {
1958 *buffers = 0;
1959 return 0;
1960 }
1961
1962 /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
1963 unsigned attr_count = shader->info.attribute_count;
1964 unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
1965
1966 struct panfrost_ptr bufs =
1967 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
1968
1969 struct panfrost_ptr attribs =
1970 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
1971
1972 emit_image_attribs(ctx, type, attribs.cpu, 0);
1973 emit_image_bufs(batch, type, bufs.cpu, 0);
1974
1975 /* We need an empty attrib buf to stop the prefetching on Bifrost */
1976 #if PAN_ARCH >= 6
1977 pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
1978 ATTRIBUTE_BUFFER, cfg);
1979 #endif
1980
1981 *buffers = bufs.gpu;
1982 return attribs.gpu;
1983 }
1984
1985 static mali_ptr
1986 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1987 mali_ptr *buffers)
1988 {
1989 struct panfrost_context *ctx = batch->ctx;
1990 struct panfrost_vertex_state *so = ctx->vertex;
1991 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1992 bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
1993 uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
1994 unsigned nr_images = util_last_bit(image_mask);
1995
1996 /* Worst case: everything is NPOT, which is only possible if instancing
1997 * is enabled. Otherwise a single record is guaranteed.
1998 * Also, we allocate more memory than needed here if either instancing
1999 * is enabled or images are present; this could be improved. */
2000 unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
2001 unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
2002 (PAN_ARCH >= 6 ? 1 : 0);
2003
2004 unsigned count = vs->info.attribute_count;
2005
2006 if (vs->xfb)
2007 count = MAX2(count, vs->xfb->info.attribute_count);
2008
2009 #if PAN_ARCH <= 5
2010 /* Midgard needs vertexid/instanceid handled specially */
2011 bool special_vbufs = count >= PAN_VERTEX_ID;
2012
2013 if (special_vbufs)
2014 nr_bufs += 2;
2015 #endif
2016
2017 if (!nr_bufs) {
2018 *buffers = 0;
2019 return 0;
2020 }
2021
2022 struct panfrost_ptr S =
2023 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs,
2024 ATTRIBUTE_BUFFER);
2025 struct panfrost_ptr T =
2026 pan_pool_alloc_desc_array(&batch->pool.base, count,
2027 ATTRIBUTE);
2028
2029 struct mali_attribute_buffer_packed *bufs =
2030 (struct mali_attribute_buffer_packed *) S.cpu;
2031
2032 struct mali_attribute_packed *out =
2033 (struct mali_attribute_packed *) T.cpu;
2034
2035 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
2036 unsigned k = 0;
2037
2038 for (unsigned i = 0; i < so->nr_bufs; ++i) {
2039 unsigned vbi = so->buffers[i].vbi;
2040 unsigned divisor = so->buffers[i].divisor;
2041 attrib_to_buffer[i] = k;
2042
2043 if (!(ctx->vb_mask & (1 << vbi)))
2044 continue;
2045
2046 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2047 struct panfrost_resource *rsrc;
2048
2049 rsrc = pan_resource(buf->buffer.resource);
2050 if (!rsrc)
2051 continue;
2052
2053 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2054
2055 /* Mask off lower bits, see offset fixup below */
2056 mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset;
2057 mali_ptr addr = raw_addr & ~63;
2058
2059 /* Since we advanced the base pointer, we shrink the buffer
2060 * size, but add the offset we subtracted */
2061 unsigned size = rsrc->base.width0 + (raw_addr - addr)
2062 - buf->buffer_offset;
2063
2064 /* When there is a divisor, the hardware-level divisor is
2065 * the product of the instance divisor and the padded count */
2066 unsigned stride = buf->stride;
2067
2068 if (ctx->indirect_draw) {
2069 /* We allocated 2 records for each attribute buffer */
2070 assert((k & 1) == 0);
2071
2072 /* With indirect draws we can't guess the vertex_count.
2073 * Pre-set the address, stride and size fields; the
2074 * compute shader does the rest.
2075 */
2076 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2077 cfg.type = MALI_ATTRIBUTE_TYPE_1D;
2078 cfg.pointer = addr;
2079 cfg.stride = stride;
2080 cfg.size = size;
2081 }
2082
2083 /* We store the unmodified divisor in the continuation
2084 * slot so the compute shader can retrieve it.
2085 */
2086 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2087 cfg.divisor = divisor;
2088 }
2089
2090 k += 2;
2091 continue;
2092 }
2093
2094 unsigned hw_divisor = ctx->padded_count * divisor;
2095
2096 if (ctx->instance_count <= 1) {
2097 /* With a single instance, per-instance attributes are constant, so a zero stride suffices */
2098 if (divisor)
2099 stride = 0;
2100
2101 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2102 cfg.pointer = addr;
2103 cfg.stride = stride;
2104 cfg.size = size;
2105 }
2106 } else if (!divisor) {
2107 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2108 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
2109 cfg.pointer = addr;
2110 cfg.stride = stride;
2111 cfg.size = size;
2112 cfg.divisor = ctx->padded_count;
2113 }
2114 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
2115 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2116 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
2117 cfg.pointer = addr;
2118 cfg.stride = stride;
2119 cfg.size = size;
2120 cfg.divisor_r = __builtin_ctz(hw_divisor);
2121 }
2122
2123 } else {
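/* Non-power-of-two hardware divisors are implemented with a
 * fixed-point "magic" reciprocal plus a shift and extra-flags
 * correction; the raw divisor is also kept in the continuation
 * record below. */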
2124 unsigned shift = 0, extra_flags = 0;
2125
2126 unsigned magic_divisor =
2127 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
2128
2129 /* Records with continuations must be aligned */
2130 k = ALIGN_POT(k, 2);
2131 attrib_to_buffer[i] = k;
2132
2133 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2134 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
2135 cfg.pointer = addr;
2136 cfg.stride = stride;
2137 cfg.size = size;
2138
2139 cfg.divisor_r = shift;
2140 cfg.divisor_e = extra_flags;
2141 }
2142
2143 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
2144 cfg.divisor_numerator = magic_divisor;
2145 cfg.divisor = divisor;
2146 }
2147
2148 ++k;
2149 }
2150
2151 ++k;
2152 }
2153
2154 #if PAN_ARCH <= 5
2155 /* Add special gl_VertexID/gl_InstanceID buffers */
2156 if (special_vbufs) {
2157 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2158
2159 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2160 cfg.buffer_index = k++;
2161 cfg.format = so->formats[PAN_VERTEX_ID];
2162 }
2163
2164 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
2165
2166 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2167 cfg.buffer_index = k++;
2168 cfg.format = so->formats[PAN_INSTANCE_ID];
2169 }
2170 }
2171 #endif
2172
2173 if (nr_images) {
2174 k = ALIGN_POT(k, 2);
2175 emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2176 emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2177 k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2178 }
2179
2180 #if PAN_ARCH >= 6
2181 /* We need an empty attrib buf to stop the prefetching on Bifrost */
2182 pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg);
2183 #endif
2184
2185 /* Attribute addresses require 64-byte alignment, so let:
2186 *
2187 * base' = base & ~63 = base - (base & 63)
2188 * offset' = offset + (base & 63)
2189 *
2190 * Since base' + offset' = base + offset, these are equivalent
2191 * addressing modes, and now the base is 64-byte aligned.
2192 */
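/* Worked example with illustrative values: base = 0x1007, offset = 0x10
 *   base'   = 0x1007 & ~63         = 0x1000
 *   offset' = 0x10 + (0x1007 & 63) = 0x17
 * so base' + offset' = 0x1017 = base + offset.
 */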
2193
2194 /* While these are usually equal, they are not required to be. In some
2195 * cases, u_blitter passes too high a value for num_elements.
2196 */
2197 assert(vs->info.attributes_read_count <= so->num_elements);
2198
2199 for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2200 unsigned vbi = so->pipe[i].vertex_buffer_index;
2201 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2202
2203 /* BOs are aligned; just fixup for buffer_offset */
2204 signed src_offset = so->pipe[i].src_offset;
2205 src_offset += (buf->buffer_offset & 63);
2206
2207 /* Base instance offset */
2208 if (ctx->base_instance && so->pipe[i].instance_divisor) {
2209 src_offset += (ctx->base_instance * buf->stride) /
2210 so->pipe[i].instance_divisor;
2211 }
2212
2213 /* Also, somewhat obscurely, per-instance data needs to be
2214 * offset in response to a delayed start in an indexed draw */
2215
2216 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2217 src_offset -= buf->stride * ctx->offset_start;
2218
2219 pan_pack(out + i, ATTRIBUTE, cfg) {
2220 cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2221 cfg.format = so->formats[i];
2222 cfg.offset = src_offset;
2223 }
2224 }
2225
2226 *buffers = S.gpu;
2227 return T.gpu;
2228 }
2229
2230 static mali_ptr
2231 panfrost_emit_varyings(struct panfrost_batch *batch,
2232 struct mali_attribute_buffer_packed *slot,
2233 unsigned stride, unsigned count)
2234 {
2235 unsigned size = stride * count;
2236 mali_ptr ptr =
2237 batch->ctx->indirect_draw ? 0 :
2238 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2239
2240 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2241 cfg.stride = stride;
2242 cfg.size = size;
2243 cfg.pointer = ptr;
2244 }
2245
2246 return ptr;
2247 }
2248
2249 /* Given a varying, figure out which index it corresponds to */
2250
2251 static inline unsigned
2252 pan_varying_index(unsigned present, enum pan_special_varying v)
2253 {
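/* For example, with the enum order used here (GENERAL, POSITION,
 * PSIZ, ...) and present = GENERAL | POSITION | PSIZ, PAN_VARY_PSIZ
 * maps to buffer index 2, since two lower bits are set. */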
2254 return util_bitcount(present & BITFIELD_MASK(v));
2255 }
2256
2257 /* Determines which varying buffers are required */
2258
2259 static inline unsigned
2260 pan_varying_present(const struct panfrost_device *dev,
2261 struct pan_shader_info *producer,
2262 struct pan_shader_info *consumer,
2263 uint16_t point_coord_mask)
2264 {
2265 /* At the moment we always emit general and position buffers. Not
2266 * strictly necessary but usually harmless */
2267
2268 unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2269
2270 /* Enable special buffers by the shader info */
2271
2272 if (producer->vs.writes_point_size)
2273 present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2274
2275 #if PAN_ARCH <= 5
2276 /* On Midgard, these exist as real varyings. Later architectures use
2277 * LD_VAR_SPECIAL reads instead. */
2278
2279 if (consumer->fs.reads_point_coord)
2280 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2281
2282 if (consumer->fs.reads_face)
2283 present |= BITFIELD_BIT(PAN_VARY_FACE);
2284
2285 if (consumer->fs.reads_frag_coord)
2286 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2287
2288 /* Also, if we have a point sprite, we need a point coord buffer */
2289
2290 for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
2291 gl_varying_slot loc = consumer->varyings.input[i].location;
2292
2293 if (util_varying_is_point_coord(loc, point_coord_mask))
2294 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2295 }
2296 #endif
2297
2298 return present;
2299 }
2300
2301 /* Emitters for varying records */
2302
2303 static void
2304 pan_emit_vary(const struct panfrost_device *dev,
2305 struct mali_attribute_packed *out,
2306 unsigned buffer_index,
2307 mali_pixel_format format, unsigned offset)
2308 {
2309 pan_pack(out, ATTRIBUTE, cfg) {
2310 cfg.buffer_index = buffer_index;
2311 cfg.offset_enable = (PAN_ARCH <= 5);
2312 cfg.format = format;
2313 cfg.offset = offset;
2314 }
2315 }
2316
2317 /* Special records */
2318
2319 static const struct {
2320 unsigned components;
2321 enum mali_format format;
2322 } pan_varying_formats[PAN_VARY_MAX] = {
2323 [PAN_VARY_POSITION] = { 4, MALI_SNAP_4 },
2324 [PAN_VARY_PSIZ] = { 1, MALI_R16F },
2325 [PAN_VARY_PNTCOORD] = { 1, MALI_R16F },
2326 [PAN_VARY_FACE] = { 1, MALI_R32I },
2327 [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F },
2328 };
2329
2330 static mali_pixel_format
2331 pan_special_format(const struct panfrost_device *dev,
2332 enum pan_special_varying buf)
2333 {
2334 assert(buf < PAN_VARY_MAX);
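/* The hardware format enum lives in the upper bits of the pixel
 * format word; on v6 and older the low bits additionally carry a
 * swizzle, ORed in below. */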
2335 mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2336
2337 #if PAN_ARCH <= 6
2338 unsigned nr = pan_varying_formats[buf].components;
2339 format |= panfrost_get_default_swizzle(nr);
2340 #endif
2341
2342 return format;
2343 }
2344
2345 static void
2346 pan_emit_vary_special(const struct panfrost_device *dev,
2347 struct mali_attribute_packed *out,
2348 unsigned present, enum pan_special_varying buf)
2349 {
2350 pan_emit_vary(dev, out, pan_varying_index(present, buf),
2351 pan_special_format(dev, buf), 0);
2352 }
2353
2354 /* Negative indicates a varying is not found */
2355
2356 static signed
2357 pan_find_vary(const struct pan_shader_varying *vary,
2358 unsigned vary_count, unsigned loc)
2359 {
2360 for (unsigned i = 0; i < vary_count; ++i) {
2361 if (vary[i].location == loc)
2362 return i;
2363 }
2364
2365 return -1;
2366 }
2367
2368 /* Assign varying locations for the general buffer. Returns the calculated
2369 * per-vertex stride, and outputs offsets into the passed array. Negative
2370 * offset indicates a varying is not used. */
2371
2372 static unsigned
2373 pan_assign_varyings(const struct panfrost_device *dev,
2374 struct pan_shader_info *producer,
2375 struct pan_shader_info *consumer,
2376 signed *offsets)
2377 {
2378 unsigned producer_count = producer->varyings.output_count;
2379 unsigned consumer_count = consumer->varyings.input_count;
2380
2381 const struct pan_shader_varying *producer_vars = producer->varyings.output;
2382 const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2383
2384 unsigned stride = 0;
2385
2386 for (unsigned i = 0; i < producer_count; ++i) {
2387 signed loc = pan_find_vary(consumer_vars, consumer_count,
2388 producer_vars[i].location);
2389
2390 if (loc >= 0) {
2391 offsets[i] = stride;
2392
2393 enum pipe_format format = consumer_vars[loc].format;
2394 stride += util_format_get_blocksize(format);
2395 } else {
2396 offsets[i] = -1;
2397 }
2398 }
2399
2400 return stride;
2401 }
2402
2403 /* Emitter for a single varying (attribute) descriptor */
2404
2405 static void
2406 panfrost_emit_varying(const struct panfrost_device *dev,
2407 struct mali_attribute_packed *out,
2408 const struct pan_shader_varying varying,
2409 enum pipe_format pipe_format,
2410 unsigned present,
2411 uint16_t point_sprite_mask,
2412 signed offset,
2413 enum pan_special_varying pos_varying)
2414 {
2415 /* Note: varying.format != pipe_format in some obscure cases due to a
2416 * limitation of the NIR linker. This should be fixed in the future to
2417 * eliminate the additional lookups. See:
2418 * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2419 */
2420 gl_varying_slot loc = varying.location;
2421 mali_pixel_format format = dev->formats[pipe_format].hw;
2422
2423 if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2424 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2425 } else if (loc == VARYING_SLOT_POS) {
2426 pan_emit_vary_special(dev, out, present, pos_varying);
2427 } else if (loc == VARYING_SLOT_PSIZ) {
2428 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2429 } else if (loc == VARYING_SLOT_FACE) {
2430 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2431 } else if (offset < 0) {
2432 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2433 } else {
2434 STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2435 pan_emit_vary(dev, out, 0, format, offset);
2436 }
2437 }
2438
2439 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2440 * rather than draw time (under good conditions). */
2441
2442 static void
2443 panfrost_emit_varying_descs(
2444 struct panfrost_pool *pool,
2445 struct panfrost_shader_state *producer,
2446 struct panfrost_shader_state *consumer,
2447 uint16_t point_coord_mask,
2448 struct pan_linkage *out)
2449 {
2450 struct panfrost_device *dev = pool->base.dev;
2451 unsigned producer_count = producer->info.varyings.output_count;
2452 unsigned consumer_count = consumer->info.varyings.input_count;
2453
2454 /* Offsets within the general varying buffer, indexed by producer output slot */
2455 signed offsets[PAN_MAX_VARYINGS];
2456 assert(producer_count <= ARRAY_SIZE(offsets));
2457 assert(consumer_count <= ARRAY_SIZE(offsets));
2458
2459 /* Allocate enough descriptors for both shader stages */
2460 struct panfrost_ptr T =
2461 pan_pool_alloc_desc_array(&pool->base,
2462 producer_count + consumer_count,
2463 ATTRIBUTE);
2464
2465 /* Take a reference if we're being put on the CSO */
2466 if (!pool->owned) {
2467 out->bo = pool->transient_bo;
2468 panfrost_bo_reference(out->bo);
2469 }
2470
2471 struct mali_attribute_packed *descs = T.cpu;
2472 out->producer = producer_count ? T.gpu : 0;
2473 out->consumer = consumer_count ? T.gpu +
2474 (pan_size(ATTRIBUTE) * producer_count) : 0;
2475
2476 /* Lay out the varyings. Must use producer to lay out, in order to
2477 * respect transform feedback precisions. */
2478 out->present = pan_varying_present(dev, &producer->info,
2479 &consumer->info, point_coord_mask);
2480
2481 out->stride = pan_assign_varyings(dev, &producer->info,
2482 &consumer->info, offsets);
2483
2484 for (unsigned i = 0; i < producer_count; ++i) {
2485 signed j = pan_find_vary(consumer->info.varyings.input,
2486 consumer->info.varyings.input_count,
2487 producer->info.varyings.output[i].location);
2488
2489 enum pipe_format format = (j >= 0) ?
2490 consumer->info.varyings.input[j].format :
2491 producer->info.varyings.output[i].format;
2492
2493 panfrost_emit_varying(dev, descs + i,
2494 producer->info.varyings.output[i], format,
2495 out->present, 0, offsets[i], PAN_VARY_POSITION);
2496 }
2497
2498 for (unsigned i = 0; i < consumer_count; ++i) {
2499 signed j = pan_find_vary(producer->info.varyings.output,
2500 producer->info.varyings.output_count,
2501 consumer->info.varyings.input[i].location);
2502
2503 signed offset = (j >= 0) ? offsets[j] : -1;
2504
2505 panfrost_emit_varying(dev, descs + producer_count + i,
2506 consumer->info.varyings.input[i],
2507 consumer->info.varyings.input[i].format,
2508 out->present, point_coord_mask,
2509 offset, PAN_VARY_FRAGCOORD);
2510 }
2511 }
2512
2513 #if PAN_ARCH <= 5
2514 static void
2515 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2516 unsigned present,
2517 enum pan_special_varying v,
2518 unsigned special)
2519 {
2520 if (present & BITFIELD_BIT(v)) {
2521 unsigned idx = pan_varying_index(present, v);
2522
2523 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2524 cfg.special = special;
2525 cfg.type = 0;
2526 }
2527 }
2528 }
2529 #endif
2530
2531 static void
2532 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2533 unsigned vertex_count,
2534 mali_ptr *vs_attribs,
2535 mali_ptr *fs_attribs,
2536 mali_ptr *buffers,
2537 unsigned *buffer_count,
2538 mali_ptr *position,
2539 mali_ptr *psiz,
2540 bool point_coord_replace)
2541 {
2542 /* Load the shaders */
2543 struct panfrost_context *ctx = batch->ctx;
2544 struct panfrost_shader_state *vs, *fs;
2545
2546 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2547 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2548
2549 uint16_t point_coord_mask = 0;
2550
2551 #if PAN_ARCH <= 5
2552 /* Point sprites are lowered on Bifrost and newer */
2553 if (point_coord_replace)
2554 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2555 #endif
2556
2557 /* In good conditions, we only need to link varyings once */
2558 bool prelink =
2559 (point_coord_mask == 0) &&
2560 !vs->info.separable &&
2561 !fs->info.separable;
2562
2563 /* Try to reduce copies */
2564 struct pan_linkage _linkage;
2565 struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2566
2567 /* Emit ATTRIBUTE descriptors if needed */
2568 if (!prelink || vs->linkage.bo == NULL) {
2569 struct panfrost_pool *pool =
2570 prelink ? &ctx->descs : &batch->pool;
2571
2572 panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2573 }
2574
2575 unsigned present = linkage->present, stride = linkage->stride;
2576 unsigned count = util_bitcount(present);
2577 struct panfrost_ptr T =
2578 pan_pool_alloc_desc_array(&batch->pool.base,
2579 count + 1,
2580 ATTRIBUTE_BUFFER);
2581 struct mali_attribute_buffer_packed *varyings =
2582 (struct mali_attribute_buffer_packed *) T.cpu;
2583
2584 if (buffer_count)
2585 *buffer_count = count;
2586
2587 #if PAN_ARCH >= 6
2588 /* Suppress prefetch on Bifrost */
2589 memset(varyings + count, 0, sizeof(*varyings));
2590 #endif
2591
2592 if (stride) {
2593 panfrost_emit_varyings(batch,
2594 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2595 stride, vertex_count);
2596 } else {
2597 /* The indirect draw code reads the stride field, so make
2598 * sure that it is initialised */
2599 memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2600 sizeof(*varyings));
2601 }
2602
2603 /* fp32 vec4 gl_Position */
2604 *position = panfrost_emit_varyings(batch,
2605 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2606 sizeof(float) * 4, vertex_count);
2607
2608 if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
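/* gl_PointSize is emitted as fp16, hence the 2-byte per-vertex stride */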
2609 *psiz = panfrost_emit_varyings(batch,
2610 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2611 2, vertex_count);
2612 }
2613
2614 #if PAN_ARCH <= 5
2615 pan_emit_special_input(varyings, present,
2616 PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2617 pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2618 MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2619 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2620 MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2621 #endif
2622
2623 *buffers = T.gpu;
2624 *vs_attribs = linkage->producer;
2625 *fs_attribs = linkage->consumer;
2626 }
2627
2628 /*
2629 * Emit jobs required for the rasterization pipeline. If there are side effects
2630 * from the vertex shader, these are handled ahead-of-time with a compute
2631 * shader. This function should not be called if rasterization is skipped.
2632 */
2633 static void
2634 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2635 const struct panfrost_ptr *vertex_job,
2636 const struct panfrost_ptr *tiler_job)
2637 {
2638 struct panfrost_context *ctx = batch->ctx;
2639
2640 /* XXX - set job_barrier in case buffers get ping-ponged and we need to
2641 * enforce ordering, this has a perf hit! See
2642 * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations
2643 */
2644 unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2645 MALI_JOB_TYPE_VERTEX, true, false,
2646 ctx->indirect_draw ?
2647 batch->indirect_draw_job_id : 0,
2648 0, vertex_job, false);
2649
2650 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
2651 MALI_JOB_TYPE_TILER, false, false,
2652 vertex, 0, tiler_job, false);
2653 }
2654 #endif
2655
2656 static void
2657 emit_tls(struct panfrost_batch *batch)
2658 {
2659 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2660
2661 /* Emitted with the FB descriptor on Midgard. */
2662 if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2663 return;
2664
2665 struct panfrost_bo *tls_bo =
2666 batch->stack_size ?
2667 panfrost_batch_get_scratchpad(batch,
2668 batch->stack_size,
2669 dev->thread_tls_alloc,
2670 dev->core_id_range):
2671 NULL;
2672 struct pan_tls_info tls = {
2673 .tls = {
2674 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2675 .size = batch->stack_size,
2676 },
2677 };
2678
2679 assert(batch->tls.cpu);
2680 GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2681 }
2682
2683 static void
2684 emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
2685 {
2686 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2687 struct panfrost_bo *tls_bo =
2688 batch->stack_size ?
2689 panfrost_batch_get_scratchpad(batch,
2690 batch->stack_size,
2691 dev->thread_tls_alloc,
2692 dev->core_id_range):
2693 NULL;
2694 struct pan_tls_info tls = {
2695 .tls = {
2696 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2697 .size = batch->stack_size,
2698 },
2699 };
2700
2701 batch->framebuffer.gpu |=
2702 GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx,
2703 batch->framebuffer.cpu);
2704 }
2705
2706 /* Mark a surface as written */
2707
2708 static void
2709 panfrost_initialize_surface(struct panfrost_batch *batch,
2710 struct pipe_surface *surf)
2711 {
2712 if (surf) {
2713 struct panfrost_resource *rsrc = pan_resource(surf->texture);
2714 BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2715 }
2716 }
2717
2718 /* Generate a fragment job. This should be called once per frame. (Usually,
2719 * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2720 */
2721 static mali_ptr
2722 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2723 {
2724 /* Mark the affected buffers as initialized, since we're writing to them.
2725 * Also, add the surfaces being written to the batch */
2726
2727 struct pipe_framebuffer_state *fb = &batch->key;
2728
2729 for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2730 panfrost_initialize_surface(batch, fb->cbufs[i]);
2731
2732 panfrost_initialize_surface(batch, fb->zsbuf);
2733
2734 /* The passed tile coords can be out of range in some cases, so we need
2735 * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2736 * Theoretically we also need to clamp the coordinates positive, but we
2737 * avoid that edge case as all four values are unsigned. Also,
2738 * theoretically we could clamp the minima, but if that has to happen
2739 * the asserts would fail anyway (since the maxima would get clamped
2740 * and then be smaller than the minima). An edge case of sorts occurs
2741 * when no scissors are added to draw, so by default min=~0 and max=0.
2742 * But that can't happen if any actual drawing occurs (beyond a
2743 * wallpaper reload), so this is again irrelevant in practice. */
2744
2745 batch->maxx = MIN2(batch->maxx, fb->width);
2746 batch->maxy = MIN2(batch->maxy, fb->height);
2747
2748 /* Rendering region must be at least 1x1; otherwise, there is nothing
2749 * to do and the whole job chain should have been discarded. */
2750
2751 assert(batch->maxx > batch->minx);
2752 assert(batch->maxy > batch->miny);
2753
2754 struct panfrost_ptr transfer =
2755 pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);
2756
2757 GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu,
2758 transfer.cpu);
2759
2760 return transfer.gpu;
2761 }
2762
2763 #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c;
2764
2765 static uint8_t
2766 pan_draw_mode(enum pipe_prim_type mode)
2767 {
2768 switch (mode) {
2769 DEFINE_CASE(POINTS);
2770 DEFINE_CASE(LINES);
2771 DEFINE_CASE(LINE_LOOP);
2772 DEFINE_CASE(LINE_STRIP);
2773 DEFINE_CASE(TRIANGLES);
2774 DEFINE_CASE(TRIANGLE_STRIP);
2775 DEFINE_CASE(TRIANGLE_FAN);
2776 DEFINE_CASE(QUADS);
2777 DEFINE_CASE(POLYGON);
2778 #if PAN_ARCH <= 6
2779 DEFINE_CASE(QUAD_STRIP);
2780 #endif
2781
2782 default:
2783 unreachable("Invalid draw mode");
2784 }
2785 }
2786
2787 #undef DEFINE_CASE
2788
2789 /* Count generated primitives (when there is no geom/tess shaders) for
2790 * transform feedback */
2791
2792 static void
2793 panfrost_statistics_record(
2794 struct panfrost_context *ctx,
2795 const struct pipe_draw_info *info,
2796 const struct pipe_draw_start_count_bias *draw)
2797 {
2798 if (!ctx->active_queries)
2799 return;
2800
2801 uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2802 ctx->prims_generated += prims;
2803
2804 if (!ctx->streamout.num_targets)
2805 return;
2806
2807 ctx->tf_prims_generated += prims;
2808 ctx->dirty |= PAN_DIRTY_SO;
2809 }
2810
2811 static void
2812 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2813 {
2814 unsigned count = u_stream_outputs_for_vertices(ctx->active_prim,
2815 ctx->vertex_count);
2816
2817 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2818 if (!ctx->streamout.targets[i])
2819 continue;
2820
2821 pan_so_target(ctx->streamout.targets[i])->offset += count;
2822 }
2823 }
2824
2825 static inline enum mali_index_type
2826 panfrost_translate_index_size(unsigned size)
2827 {
2828 STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0);
2829 STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1);
2830 STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2);
2831
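/* Index sizes of 1 and 2 bytes map directly onto the hardware enum
 * values (asserted above); only 32-bit indices need explicit
 * translation. */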
2832 return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size;
2833 }
2834
2835 #if PAN_ARCH <= 7
2836 static inline void
2837 pan_emit_draw_descs(struct panfrost_batch *batch,
2838 struct MALI_DRAW *d, enum pipe_shader_type st)
2839 {
2840 d->offset_start = batch->ctx->offset_start;
2841 d->instance_size = batch->ctx->instance_count > 1 ?
2842 batch->ctx->padded_count : 1;
2843
2844 d->uniform_buffers = batch->uniform_buffers[st];
2845 d->push_uniforms = batch->push_uniforms[st];
2846 d->textures = batch->textures[st];
2847 d->samplers = batch->samplers[st];
2848 }
2849
2850 static void
2851 panfrost_draw_emit_vertex_section(struct panfrost_batch *batch,
2852 mali_ptr vs_vary, mali_ptr varyings,
2853 mali_ptr attribs, mali_ptr attrib_bufs,
2854 void *section)
2855 {
2856 pan_pack(section, DRAW, cfg) {
2857 cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
2858 cfg.attributes = attribs;
2859 cfg.attribute_buffers = attrib_bufs;
2860 cfg.varyings = vs_vary;
2861 cfg.varying_buffers = vs_vary ? varyings : 0;
2862 cfg.thread_storage = batch->tls.gpu;
2863 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
2864 }
2865 }
2866
2867 static void
2868 panfrost_draw_emit_vertex(struct panfrost_batch *batch,
2869 const struct pipe_draw_info *info,
2870 void *invocation_template,
2871 mali_ptr vs_vary, mali_ptr varyings,
2872 mali_ptr attribs, mali_ptr attrib_bufs,
2873 void *job)
2874 {
2875 void *section =
2876 pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
2877 memcpy(section, invocation_template, pan_size(INVOCATION));
2878
2879 pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
2880 cfg.job_task_split = 5;
2881 }
2882
2883 section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
2884 panfrost_draw_emit_vertex_section(batch, vs_vary, varyings,
2885 attribs, attrib_bufs, section);
2886 }
2887 #endif
2888
2889 static void
2890 panfrost_emit_primitive_size(struct panfrost_context *ctx,
2891 bool points, mali_ptr size_array,
2892 void *prim_size)
2893 {
2894 struct panfrost_rasterizer *rast = ctx->rasterizer;
2895
2896 pan_pack(prim_size, PRIMITIVE_SIZE, cfg) {
2897 if (panfrost_writes_point_size(ctx)) {
2898 cfg.size_array = size_array;
2899 } else {
2900 cfg.constant = points ?
2901 rast->base.point_size :
2902 rast->base.line_width;
2903 }
2904 }
2905 }
2906
2907 static bool
2908 panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info)
2909 {
2910 /* As a reminder, primitive_restart should always be checked before any
2911 access to restart_index. */
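/* An "implicit" restart index is the all-ones pattern for the given
 * index size, which matches the hardware's implicit restart mode. */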
2912 return info->primitive_restart &&
2913 info->restart_index == (unsigned)BITFIELD_MASK(info->index_size * 8);
2914 }
2915
2916 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2917 * 3D state. In particular, it groups the fragment shader descriptor with
2918 * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2919 * of state are dirty tracked independently for the benefit of newer GPUs that
2920 * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2921 * dirty flags that trigger re-emits of the fragment RSD.
2922 *
2923 * Obscurely, occlusion queries are included. Occlusion query state is nominally
2924 * specified in the draw call descriptor, but must be considered when determining
2925 * early-Z state, which is part of the RSD.
2926 */
2927 #define FRAGMENT_RSD_DIRTY_MASK ( \
2928 PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | \
2929 PAN_DIRTY_RASTERIZER | PAN_DIRTY_OQ)
2930
2931 static inline void
2932 panfrost_update_shader_state(struct panfrost_batch *batch,
2933 enum pipe_shader_type st)
2934 {
2935 struct panfrost_context *ctx = batch->ctx;
2936 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
2937
2938 bool frag = (st == PIPE_SHADER_FRAGMENT);
2939 unsigned dirty_3d = ctx->dirty;
2940 unsigned dirty = ctx->dirty_shader[st];
2941
2942 if (dirty & PAN_DIRTY_STAGE_TEXTURE) {
2943 batch->textures[st] =
2944 panfrost_emit_texture_descriptors(batch, st);
2945 }
2946
2947 if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2948 batch->samplers[st] =
2949 panfrost_emit_sampler_descriptors(batch, st);
2950 }
2951
2952 /* On Bifrost and older, the fragment shader descriptor is fused
2953 * together with the renderer state; the combined renderer state
2954 * descriptor is emitted below. Otherwise, the shader descriptor is
2955 * standalone and is emitted here.
2956 */
2957 if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2958 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2959 }
2960
2961 #if PAN_ARCH >= 9
2962 if (dirty & PAN_DIRTY_STAGE_IMAGE)
2963 batch->images[st] = panfrost_emit_images(batch, st);
2964 #endif
2965
2966 if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2967 batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st,
2968 NULL, &batch->push_uniforms[st], NULL);
2969 }
2970
2971 #if PAN_ARCH <= 7
2972 /* On Bifrost and older, if the fragment shader changes OR any renderer
2973 * state specified alongside the fragment shader changes, the whole renderer
2974 * state descriptor is dirtied and must be re-emitted.
2975 */
2976 if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
2977 (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {
2978
2979 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2980 }
2981
2982 if (frag && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
2983 batch->attribs[st] = panfrost_emit_image_attribs(batch,
2984 &batch->attrib_bufs[st], st);
2985 }
2986 #endif
2987 }
2988
2989 static inline void
2990 panfrost_update_state_3d(struct panfrost_batch *batch)
2991 {
2992 struct panfrost_context *ctx = batch->ctx;
2993 unsigned dirty = ctx->dirty;
2994
2995 if (dirty & PAN_DIRTY_TLS_SIZE)
2996 panfrost_batch_adjust_stack_size(batch);
2997
2998 if (dirty & PAN_DIRTY_BLEND)
2999 panfrost_set_batch_masks_blend(batch);
3000
3001 if (dirty & PAN_DIRTY_ZS)
3002 panfrost_set_batch_masks_zs(batch);
3003
3004 #if PAN_ARCH >= 9
3005 if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
3006 (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
3007 batch->depth_stencil = panfrost_emit_depth_stencil(batch);
3008
3009 if (dirty & PAN_DIRTY_BLEND)
3010 batch->blend = panfrost_emit_blend_valhall(batch);
3011
3012 if (dirty & PAN_DIRTY_VERTEX) {
3013 batch->attribs[PIPE_SHADER_VERTEX] =
3014 panfrost_emit_vertex_data(batch);
3015
3016 batch->attrib_bufs[PIPE_SHADER_VERTEX] =
3017 panfrost_emit_vertex_buffers(batch);
3018 }
3019 #endif
3020 }
3021
3022 #if PAN_ARCH >= 6
3023 static mali_ptr
3024 panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count)
3025 {
3026 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
3027
3028 if (!vertex_count)
3029 return 0;
3030
3031 if (batch->tiler_ctx.bifrost)
3032 return batch->tiler_ctx.bifrost;
3033
3034 struct panfrost_ptr t =
3035 pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);
3036
3037 GENX(pan_emit_tiler_heap)(dev, t.cpu);
3038
3039 mali_ptr heap = t.gpu;
3040
3041 t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
3042 GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height,
3043 util_framebuffer_get_num_samples(&batch->key),
3044 pan_tristate_get(batch->first_provoking_vertex),
3045 heap, t.cpu);
3046
3047 batch->tiler_ctx.bifrost = t.gpu;
3048 return batch->tiler_ctx.bifrost;
3049 }
3050 #endif
3051
3052 /* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
3053 * jobs and Valhall IDVS jobs
3054 */
3055 static void
3056 panfrost_emit_primitive(struct panfrost_context *ctx,
3057 const struct pipe_draw_info *info,
3058 const struct pipe_draw_start_count_bias *draw,
3059 mali_ptr indices, bool secondary_shader, void *out)
3060 {
3061 UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3062
3063 bool lines = (info->mode == PIPE_PRIM_LINES ||
3064 info->mode == PIPE_PRIM_LINE_LOOP ||
3065 info->mode == PIPE_PRIM_LINE_STRIP);
3066
3067 pan_pack(out, PRIMITIVE, cfg) {
3068 cfg.draw_mode = pan_draw_mode(info->mode);
3069 if (panfrost_writes_point_size(ctx))
3070 cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
3071
3072 #if PAN_ARCH <= 8
3073 /* For line primitives, PRIMITIVE.first_provoking_vertex must
3074 * be set to true and the provoking vertex is selected with
3075 * DRAW.flat_shading_vertex.
3076 */
3077 if (lines)
3078 cfg.first_provoking_vertex = true;
3079 else
3080 cfg.first_provoking_vertex = rast->flatshade_first;
3081
3082 if (panfrost_is_implicit_prim_restart(info)) {
3083 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
3084 } else if (info->primitive_restart) {
3085 cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
3086 cfg.primitive_restart_index = info->restart_index;
3087 }
3088
3089 cfg.job_task_split = 6;
3090 #else
3091 struct panfrost_shader_state *fs =
3092 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3093
3094 cfg.allow_rotating_primitives = !(lines || fs->info.bifrost.uses_flat_shading);
3095 cfg.primitive_restart = info->primitive_restart;
3096
3097 /* Non-fixed restart indices should have been lowered */
3098 assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
3099 #endif
3100
3101 cfg.index_count = ctx->indirect_draw ? 1 : draw->count;
3102 cfg.index_type = panfrost_translate_index_size(info->index_size);
3103
3104
3105 if (PAN_ARCH >= 9) {
3106 /* Base vertex offset on Valhall is used for both
3107 * indexed and non-indexed draws, in a simple way for
3108 * either. Handle both cases.
3109 */
3110 if (cfg.index_type)
3111 cfg.base_vertex_offset = draw->index_bias;
3112 else
3113 cfg.base_vertex_offset = draw->start;
3114
3115 /* Indices are moved outside the primitive descriptor
3116 * on Valhall, so we don't need to set that here
3117 */
3118 } else if (cfg.index_type) {
3119 cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;
3120
3121 #if PAN_ARCH <= 7
3122 cfg.indices = indices;
3123 #endif
3124 }
3125
3126 #if PAN_ARCH >= 6
3127 cfg.secondary_shader = secondary_shader;
3128 #endif
3129 }
3130 }
3131
3132 #if PAN_ARCH >= 9
3133 static mali_ptr
3134 panfrost_emit_resources(struct panfrost_batch *batch,
3135 enum pipe_shader_type stage,
3136 mali_ptr ubos, unsigned ubo_count)
3137 {
3138 struct panfrost_context *ctx = batch->ctx;
3139 struct panfrost_ptr T;
3140 unsigned nr_tables = 12;
3141
3142 /* Although individual resources need only 16 byte alignment, the
3143 * resource table as a whole must be 64-byte aligned.
3144 */
3145 T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), 64);
3146 memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));
3147
3148 panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count);
3149
3150 panfrost_make_resource_table(T, PAN_TABLE_TEXTURE,
3151 batch->textures[stage],
3152 ctx->sampler_view_count[stage]);
3153
3154 panfrost_make_resource_table(T, PAN_TABLE_SAMPLER,
3155 batch->samplers[stage],
3156 ctx->sampler_count[stage]);
3157
3158 panfrost_make_resource_table(T, PAN_TABLE_IMAGE,
3159 batch->images[stage],
3160 util_last_bit(ctx->image_mask[stage]));
3161
3162 if (stage == PIPE_SHADER_VERTEX) {
3163 panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE,
3164 batch->attribs[stage],
3165 ctx->vertex->num_elements);
3166
3167 panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER,
3168 batch->attrib_bufs[stage],
3169 util_last_bit(ctx->vb_mask));
3170 }
3171
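/* The table is 64-byte aligned, so the low bits of the returned
 * pointer are used to carry the number of resource tables. */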
3172 return T.gpu | nr_tables;
3173 }
3174
3175 static void
3176 panfrost_emit_shader(struct panfrost_batch *batch,
3177 struct MALI_SHADER_ENVIRONMENT *cfg,
3178 enum pipe_shader_type stage,
3179 mali_ptr shader_ptr,
3180 mali_ptr thread_storage)
3181 {
3182 unsigned fau_words = 0, ubo_count = 0;
3183 mali_ptr ubos, resources;
3184
3185 ubos = panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau,
3186 &fau_words);
3187
3188 resources = panfrost_emit_resources(batch, stage, ubos, ubo_count);
3189
3190 cfg->thread_storage = thread_storage;
3191 cfg->shader = shader_ptr;
3192 cfg->resources = resources;
3193
3194 /* Each entry of FAU is 64-bits */
3195 cfg->fau_count = DIV_ROUND_UP(fau_words, 2);
3196 }
3197 #endif
3198
3199 static void
3200 panfrost_emit_draw(void *out,
3201 struct panfrost_batch *batch,
3202 bool fs_required,
3203 enum pipe_prim_type prim,
3204 mali_ptr pos, mali_ptr fs_vary, mali_ptr varyings)
3205 {
3206 struct panfrost_context *ctx = batch->ctx;
3207 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3208 bool polygon = (prim == PIPE_PRIM_TRIANGLES);
3209
3210 pan_pack(out, DRAW, cfg) {
3211 /*
3212 * From the Gallium documentation,
3213 * pipe_rasterizer_state::cull_face "indicates which faces of
3214 * polygons to cull". Points and lines are not considered
3215 * polygons and should be drawn even if all faces are culled.
3216 * The hardware does not take primitive type into account when
3217 * culling, so we need to do that check ourselves.
3218 */
3219 cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
3220 cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
3221 cfg.front_face_ccw = rast->front_ccw;
3222
3223 if (ctx->occlusion_query && ctx->active_queries) {
3224 if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3225 cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
3226 else
3227 cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
3228
3229 struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
3230 cfg.occlusion = rsrc->image.data.bo->ptr.gpu;
3231 panfrost_batch_write_rsrc(ctx->batch, rsrc,
3232 PIPE_SHADER_FRAGMENT);
3233 }
3234
3235 #if PAN_ARCH >= 9
3236 struct panfrost_shader_state *fs =
3237 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3238
3239 cfg.multisample_enable = rast->multisample;
3240 cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
3241
3242 /* Use per-sample shading if required by the API. Also use it
3243  * when a blend shader is used with multisampling, as this is
3244  * handled by a single ST_TILE in the blend shader with the
3245  * current sample ID, requiring per-sample shading.
3246 */
3247 cfg.evaluate_per_sample =
3248 (rast->multisample &&
3249 ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));
3250
3251 cfg.single_sampled_lines = !rast->multisample;
3252
3253 cfg.vertex_array.packet = true;
3254
3255 cfg.minimum_z = batch->minimum_z;
3256 cfg.maximum_z = batch->maximum_z;
3257
3258 cfg.depth_stencil = batch->depth_stencil;
3259
3260 if (fs_required) {
3261 bool has_oq = ctx->occlusion_query && ctx->active_queries;
3262
3263 struct pan_earlyzs_state earlyzs =
3264 pan_earlyzs_get(fs->earlyzs,
3265 ctx->depth_stencil->writes_zs || has_oq,
3266 ctx->blend->base.alpha_to_coverage,
3267 ctx->depth_stencil->zs_always_passes);
3268
3269 cfg.pixel_kill_operation = earlyzs.kill;
3270 cfg.zs_update_operation = earlyzs.update;
3271
3272 cfg.allow_forward_pixel_to_kill = pan_allow_forward_pixel_to_kill(ctx, fs);
3273 cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;
3274
3275 /* Mask of render targets that may be written. A render
3276 * target may be written if the fragment shader writes
3277 * to it AND it actually exists. If the render target
3278 * doesn't actually exist, the blend descriptor will be
3279 * OFF so it may be omitted from the mask.
3280 *
3281 * Only set when there is a fragment shader, since
3282 * otherwise no colour updates are possible.
3283 */
3284 cfg.render_target_mask =
3285 (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
3286 ctx->fb_rt_mask;
3287
3288 /* Also use per-sample shading if required by the shader
3289 */
3290 cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
3291
3292 /* Unlike Bifrost, alpha-to-coverage must be included in
3293 * this identically-named flag. Confusing, isn't it?
3294 */
3295 cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
3296 fs->info.fs.can_discard ||
3297 ctx->blend->base.alpha_to_coverage;
3298
3299 /* Blend descriptors are only accessed by a BLEND
3300 * instruction on Valhall. It follows that if the
3301 * fragment shader is omitted, we may also omit the
3302 * blend descriptors.
3303 */
3304 cfg.blend = batch->blend;
3305 cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
3306 cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
3307
3308 cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
3309 cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
3310
3311 panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
3312 batch->rsd[PIPE_SHADER_FRAGMENT],
3313 batch->tls.gpu);
3314 } else {
3315 /* These operations need to be FORCE to benefit from the
3316 * depth-only pass optimizations.
3317 */
3318 cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
3319 cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
3320
3321 /* No shader and no blend => no shader or blend
3322 * reasons to disable FPK. The only FPK-related state
3323 * not covered is alpha-to-coverage which we don't set
3324 * without blend.
3325 */
3326 cfg.allow_forward_pixel_to_kill = true;
3327
3328 /* No shader => no shader side effects */
3329 cfg.allow_forward_pixel_to_be_killed = true;
3330
3331 /* Alpha isn't written so these are vacuous */
3332 cfg.overdraw_alpha0 = true;
3333 cfg.overdraw_alpha1 = true;
3334 }
3335 #else
3336 cfg.position = pos;
3337 cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
3338 cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
3339 cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
3340 cfg.viewport = batch->viewport;
3341 cfg.varyings = fs_vary;
3342 cfg.varying_buffers = fs_vary ? varyings : 0;
3343 cfg.thread_storage = batch->tls.gpu;
3344
3345 /* For all primitives but lines, DRAW.flat_shading_vertex must
3346  * be set to 0, and the provoking vertex is selected with the
3347 * PRIMITIVE.first_provoking_vertex field.
3348 */
3349 if (prim == PIPE_PRIM_LINES) {
3350 /* The logic is inverted across arches. */
3351 cfg.flat_shading_vertex = rast->flatshade_first
3352 ^ (PAN_ARCH <= 5);
3353 }
3354
3355 pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
3356 #endif
3357 }
3358 }
3359
3360 #if PAN_ARCH >= 9
3361 static void
3362 panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
3363 const struct pipe_draw_info *info,
3364 const struct pipe_draw_start_count_bias *draw,
3365 mali_ptr indices, bool secondary_shader,
3366 void *job)
3367 {
3368 struct panfrost_context *ctx = batch->ctx;
3369
3370 struct panfrost_shader_state *vs =
3371 panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3372
3373 struct panfrost_shader_state *fs =
3374 panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
3375
3376 bool fs_required = panfrost_fs_required(fs, ctx->blend,
3377 &ctx->pipe_framebuffer,
3378 ctx->depth_stencil);
3379
3380 /* Varying shaders only feed data to the fragment shader, so if we omit
3381 * the fragment shader, we should omit the varying shader too.
3382 */
3383 secondary_shader &= fs_required;
3384
3385 panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader,
3386 pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));
3387
3388 pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
3389 cfg.count = info->instance_count;
3390 }
3391
3392 pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
3393 if (secondary_shader) {
3394 unsigned v = vs->info.varyings.output_count;
3395 unsigned f = fs->info.varyings.input_count;
3396 unsigned slots = MAX2(v, f);
3397 slots += util_bitcount(fs->key.fixed_varying_mask);
3398 unsigned size = slots * 16;
3399
3400 /* Assumes 16 byte slots. We could do better. */
3401 cfg.vertex_packet_stride = size + 16;
3402 cfg.vertex_attribute_stride = size;
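/* Illustrative example: 4 VS outputs, 3 FS inputs and one fixed
 * varying give 5 slots, i.e. an 80 byte attribute stride and a
 * 96 byte packet stride (the same fixed 16 byte overhead as the
 * no-varyings case below).
 */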
3403 } else {
3404 /* Hardware requirement for "no varyings" */
3405 cfg.vertex_packet_stride = 16;
3406 cfg.vertex_attribute_stride = 0;
3407 }
3408 }
3409
3410 pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
3411 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
3412 }
3413
3414 STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
3415 memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR),
3416 &batch->scissor, pan_size(SCISSOR));
3417
3418 panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0,
3419 pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));
3420
3421 pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
3422 cfg.address = indices;
3423 }
3424
3425 panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW),
3426 batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0);
3427
3428 pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
3429 /* IDVS/points vertex shader */
3430 mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX];
3431
3432 /* IDVS/triangle vertex shader */
3433 if (vs_ptr && info->mode != PIPE_PRIM_POINTS)
3434 vs_ptr += pan_size(SHADER_PROGRAM);
3435
3436 panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr,
3437 batch->tls.gpu);
3438 }
3439
3440 pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
3441 /* If a varying shader is used, we configure it with the same
3442 * state as the position shader for backwards compatible
3443 * behaviour with Bifrost. This could be optimized.
3444 */
3445 if (!secondary_shader) continue;
3446
3447 mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] +
3448 (2 * pan_size(SHADER_PROGRAM));
3449
3450 panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX,
3451 ptr, batch->tls.gpu);
3452 }
3453 }
3454 #endif
3455
3456 #if PAN_ARCH <= 7
3457 static void
3458 panfrost_draw_emit_tiler(struct panfrost_batch *batch,
3459 const struct pipe_draw_info *info,
3460 const struct pipe_draw_start_count_bias *draw,
3461 void *invocation_template,
3462 mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings,
3463 mali_ptr pos, mali_ptr psiz, bool secondary_shader,
3464 void *job)
3465 {
3466 struct panfrost_context *ctx = batch->ctx;
3467
3468 void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
3469 memcpy(section, invocation_template, pan_size(INVOCATION));
3470
3471 panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader,
3472 pan_section_ptr(job, TILER_JOB, PRIMITIVE));
3473
3474 void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
3475 enum pipe_prim_type prim = u_reduced_prim(info->mode);
3476
3477 #if PAN_ARCH >= 6
3478 pan_section_pack(job, TILER_JOB, TILER, cfg) {
3479 cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0);
3480 }
3481
3482 pan_section_pack(job, TILER_JOB, PADDING, cfg);
3483 #endif
3484
3485 panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW),
3486 batch, true, prim, pos, fs_vary, varyings);
3487
3488 panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size);
3489 }
3490 #endif
3491
3492 static void
3493 panfrost_launch_xfb(struct panfrost_batch *batch,
3494 const struct pipe_draw_info *info,
3495 mali_ptr attribs, mali_ptr attrib_bufs,
3496 unsigned count)
3497 {
3498 struct panfrost_context *ctx = batch->ctx;
3499
3500 struct panfrost_ptr t =
3501 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3502
3503 /* Nothing to do */
3504 if (batch->ctx->streamout.num_targets == 0)
3505 return;
3506
3507 /* TODO: XFB with index buffers */
3508 //assert(info->index_size == 0);
3509 u_trim_pipe_prim(info->mode, &count);
3510
3511 if (count == 0)
3512 return;
3513
3514 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3515 struct panfrost_shader_variants v = { .variants = vs->xfb };
3516
3517 vs->xfb->stream_output = vs->stream_output;
3518
3519 struct panfrost_shader_variants *saved_vs = ctx->shader[PIPE_SHADER_VERTEX];
3520 mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
3521 mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
3522 mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
3523
3524 ctx->shader[PIPE_SHADER_VERTEX] = &v;
3525 batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
3526
3527 #if PAN_ARCH >= 9
3528 pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
3529 cfg.workgroup_size_x = 1;
3530 cfg.workgroup_size_y = 1;
3531 cfg.workgroup_size_z = 1;
3532
3533 cfg.workgroup_count_x = count;
3534 cfg.workgroup_count_y = info->instance_count;
3535 cfg.workgroup_count_z = 1;
3536
3537 panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX,
3538 batch->rsd[PIPE_SHADER_VERTEX],
3539 batch->tls.gpu);
3540
3541 /* TODO: Indexing. Also, this is a legacy feature... */
3542 cfg.compute.attribute_offset = batch->ctx->offset_start;
3543
3544 /* Transform feedback shaders do not use barriers or shared
3545 * memory, so we may merge workgroups.
3546 */
3547 cfg.allow_merging_workgroups = true;
3548 cfg.task_increment = 1;
3549 cfg.task_axis = MALI_TASK_AXIS_Z;
3550 }
3551 #else
3552 struct mali_invocation_packed invocation;
3553
3554 panfrost_pack_work_groups_compute(&invocation,
3555 1, count, info->instance_count,
3556 1, 1, 1, PAN_ARCH <= 5, false);
3557
3558 batch->uniform_buffers[PIPE_SHADER_VERTEX] =
3559 panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
3560 &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL);
3561
3562 panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0,
3563 attribs, attrib_bufs, t.cpu);
3564 #endif
3565 enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
3566 #if PAN_ARCH <= 5
3567 job_type = MALI_JOB_TYPE_VERTEX;
3568 #endif
3569 panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type,
3570 true, false, 0, 0, &t, false);
3571
3572 ctx->shader[PIPE_SHADER_VERTEX] = saved_vs;
3573 batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
3574 batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
3575 batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
3576 }
3577
3578 static void
3579 panfrost_direct_draw(struct panfrost_batch *batch,
3580 const struct pipe_draw_info *info,
3581 unsigned drawid_offset,
3582 const struct pipe_draw_start_count_bias *draw)
3583 {
3584 if (!draw->count || !info->instance_count)
3585 return;
3586
3587 struct panfrost_context *ctx = batch->ctx;
3588
3589 /* If we change whether we're drawing points, or whether point sprites
3590 * are enabled (specified in the rasterizer), we may need to rebind
3591 * shaders accordingly. This implicitly covers the case of rebinding
3592 * framebuffers, because all dirty flags are set there.
3593 */
3594 if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
3595 ((ctx->active_prim == PIPE_PRIM_POINTS) ^
3596 (info->mode == PIPE_PRIM_POINTS))) {
3597
3598 ctx->active_prim = info->mode;
3599 panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
3600 }
3601
3602 /* Take into account a negative bias */
3603 ctx->indirect_draw = false;
3604 ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0);
3605 ctx->instance_count = info->instance_count;
3606 ctx->base_vertex = info->index_size ? draw->index_bias : 0;
3607 ctx->base_instance = info->start_instance;
3608 ctx->active_prim = info->mode;
3609 ctx->drawid = drawid_offset;
3610
3611 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3612
3613 bool idvs = vs->info.vs.idvs;
3614 bool secondary_shader = vs->info.vs.secondary_enable;
3615
3616 UNUSED struct panfrost_ptr tiler, vertex;
3617
3618 if (idvs) {
3619 #if PAN_ARCH >= 9
3620 tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
3621 #elif PAN_ARCH >= 6
3622 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
3623 #else
3624 unreachable("IDVS is unsupported on Midgard");
3625 #endif
3626 } else {
3627 vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3628 tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
3629 }
3630
3631 unsigned vertex_count = ctx->vertex_count;
3632
3633 unsigned min_index = 0, max_index = 0;
3634 mali_ptr indices = 0;
3635
3636 if (info->index_size && PAN_ARCH >= 9) {
3637 indices = panfrost_get_index_buffer(batch, info, draw);
3638 } else if (info->index_size) {
3639 indices = panfrost_get_index_buffer_bounded(batch, info, draw,
3640 &min_index,
3641 &max_index);
3642
3643 /* Use the corresponding values */
3644 vertex_count = max_index - min_index + 1;
3645 ctx->offset_start = min_index + draw->index_bias;
3646 } else {
3647 ctx->offset_start = draw->start;
3648 }
3649
3650 if (info->instance_count > 1) {
3651 unsigned count = vertex_count;
3652
3653 /* Index-Driven Vertex Shading requires different instances to
3654 * have different cache lines for position results. Each vertex
3655 * position is 16 bytes and the Mali cache line is 64 bytes, so
3656 * the per-instance vertex count must be aligned to 4 vertices.
3657 */
3658 if (idvs)
3659 count = ALIGN_POT(count, 4);
3660
3661 ctx->padded_count = panfrost_padded_vertex_count(count);
3662 } else
3663 ctx->padded_count = vertex_count;
3664
3665 panfrost_statistics_record(ctx, info, draw);
3666
3667 #if PAN_ARCH <= 7
3668 struct mali_invocation_packed invocation;
3669 if (info->instance_count > 1) {
3670 panfrost_pack_work_groups_compute(&invocation,
3671 1, vertex_count, info->instance_count,
3672 1, 1, 1, true, false);
3673 } else {
3674 pan_pack(&invocation, INVOCATION, cfg) {
3675 cfg.invocations = MALI_POSITIVE(vertex_count);
3676 cfg.size_y_shift = 0;
3677 cfg.size_z_shift = 0;
3678 cfg.workgroups_x_shift = 0;
3679 cfg.workgroups_y_shift = 0;
3680 cfg.workgroups_z_shift = 32;
3681 cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
3682 }
3683 }
3684
3685 /* Emit all sorts of descriptors. */
3686 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
3687
3688 panfrost_emit_varying_descriptor(batch,
3689 ctx->padded_count *
3690 ctx->instance_count,
3691 &vs_vary, &fs_vary, &varyings,
3692 NULL, &pos, &psiz,
3693 info->mode == PIPE_PRIM_POINTS);
3694
3695 mali_ptr attribs, attrib_bufs;
3696 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
3697 #endif
3698
3699 panfrost_update_state_3d(batch);
3700 panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3701 panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3702 panfrost_clean_state_3d(ctx);
3703
3704 if (vs->xfb) {
3705 #if PAN_ARCH >= 9
3706 mali_ptr attribs = 0, attrib_bufs = 0;
3707 #endif
3708 panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count);
3709 }
3710
3711 /* Increment transform feedback offsets */
3712 panfrost_update_streamout_offsets(ctx);
3713
3714 /* Any side effects must be handled by the XFB shader, so we only need
3715 * to run vertex shaders if we need rasterization.
3716 */
3717 if (panfrost_batch_skip_rasterization(batch))
3718 return;
3719
3720 #if PAN_ARCH >= 9
3721 assert(idvs && "Memory allocated IDVS required on Valhall");
3722
3723 panfrost_emit_malloc_vertex(batch, info, draw, indices,
3724 secondary_shader, tiler.cpu);
3725
3726 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3727 MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0,
3728 0, &tiler, false);
3729 #else
3730 /* Fire off the draw itself */
3731 panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices,
3732 fs_vary, varyings, pos, psiz, secondary_shader,
3733 tiler.cpu);
3734 if (idvs) {
3735 #if PAN_ARCH >= 6
3736 panfrost_draw_emit_vertex_section(batch,
3737 vs_vary, varyings,
3738 attribs, attrib_bufs,
3739 pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));
3740
3741 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3742 MALI_JOB_TYPE_INDEXED_VERTEX, false, false,
3743 0, 0, &tiler, false);
3744 #endif
3745 } else {
3746 panfrost_draw_emit_vertex(batch, info, &invocation,
3747 vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
3748 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3749 }
3750 #endif
3751 }
3752
3753 #if PAN_GPU_INDIRECTS
3754 static void
3755 panfrost_indirect_draw(struct panfrost_batch *batch,
3756 const struct pipe_draw_info *info,
3757 unsigned drawid_offset,
3758 const struct pipe_draw_indirect_info *indirect,
3759 const struct pipe_draw_start_count_bias *draw)
3760 {
3761 /* Indirect draw count and multi-draw not supported. */
3762 assert(indirect->draw_count == 1 && !indirect->indirect_draw_count);
3763
3764 struct panfrost_context *ctx = batch->ctx;
3765 struct panfrost_device *dev = pan_device(ctx->base.screen);
3766
3767 /* TODO: update statistics (see panfrost_statistics_record()) */
3768 /* TODO: Increment transform feedback offsets */
3769 assert(ctx->streamout.num_targets == 0);
3770
3771 ctx->active_prim = info->mode;
3772 ctx->drawid = drawid_offset;
3773 ctx->indirect_draw = true;
3774
3775 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
3776
3777 bool idvs = vs->info.vs.idvs;
3778 bool secondary_shader = vs->info.vs.secondary_enable;
3779
3780 struct panfrost_ptr tiler = { 0 }, vertex = { 0 };
3781
3782 if (idvs) {
3783 #if PAN_ARCH >= 6
3784 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
3785 #else
3786 unreachable("IDVS is unsupported on Midgard");
3787 #endif
3788 } else {
3789 vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
3790 tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
3791 }
3792
3793 struct panfrost_bo *index_buf = NULL;
3794
3795 if (info->index_size) {
3796 assert(!info->has_user_indices);
3797 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
3798 index_buf = rsrc->image.data.bo;
3799 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
3800 }
3801
3802 mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0;
3803 unsigned varying_buf_count;
3804
3805 /* We want to create templates, so set all count fields to 0 to
3806  * reflect that.
3807 */
3808 ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0;
3809 ctx->offset_start = 0;
3810
3811 /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the
3812 * vertex shader uses gl_VertexID or gl_BaseVertex.
3813 */
3814 ctx->first_vertex_sysval_ptr = 0;
3815 ctx->base_vertex_sysval_ptr = 0;
3816 ctx->base_instance_sysval_ptr = 0;
3817
3818 panfrost_update_state_3d(batch);
3819 panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3820 panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3821 panfrost_clean_state_3d(ctx);
3822
3823 bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS);
3824
3825 panfrost_emit_varying_descriptor(batch, 0,
3826 &vs_vary, &fs_vary, &varyings,
3827 &varying_buf_count, &pos, &psiz,
3828 point_coord_replace);
3829
3830 mali_ptr attribs, attrib_bufs;
3831 attribs = panfrost_emit_vertex_data(batch, &attrib_bufs);
3832
3833 /* Zero-ed invocation, the compute job will update it. */
3834 static struct mali_invocation_packed invocation;
3835
3836 /* Fire off the draw itself */
3837 panfrost_draw_emit_tiler(batch, info, draw, &invocation,
3838 index_buf ? index_buf->ptr.gpu : 0,
3839 fs_vary, varyings, pos, psiz, secondary_shader,
3840 tiler.cpu);
3841 if (idvs) {
3842 #if PAN_ARCH >= 6
3843 panfrost_draw_emit_vertex_section(batch,
3844 vs_vary, varyings,
3845 attribs, attrib_bufs,
3846 pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));
3847 #endif
3848 } else {
3849 panfrost_draw_emit_vertex(batch, info, &invocation,
3850 vs_vary, varyings, attribs, attrib_bufs, vertex.cpu);
3851 }
3852
3853 /* Add the varying heap BO to the batch if we're allocating varyings. */
3854 if (varyings) {
3855 panfrost_batch_add_bo(batch,
3856 dev->indirect_draw_shaders.varying_heap,
3857 PIPE_SHADER_VERTEX);
3858 }
3859
3860 assert(indirect->buffer);
3861
3862 struct panfrost_resource *draw_buf = pan_resource(indirect->buffer);
3863
3864 /* Don't count images: those attributes don't need to be patched. */
3865 unsigned attrib_count =
3866 vs->info.attribute_count -
3867 util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]);
3868
3869 panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX);
3870
3871 struct pan_indirect_draw_info draw_info = {
3872 .last_indirect_draw = batch->indirect_draw_job_id,
3873 .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset,
3874 .index_buf = index_buf ? index_buf->ptr.gpu : 0,
3875 .first_vertex_sysval = ctx->first_vertex_sysval_ptr,
3876 .base_vertex_sysval = ctx->base_vertex_sysval_ptr,
3877 .base_instance_sysval = ctx->base_instance_sysval_ptr,
3878 .vertex_job = vertex.gpu,
3879 .tiler_job = tiler.gpu,
3880 .attrib_bufs = attrib_bufs,
3881 .attribs = attribs,
3882 .attrib_count = attrib_count,
3883 .varying_bufs = varyings,
3884 .index_size = info->index_size,
3885 };
3886
3887 if (panfrost_writes_point_size(ctx))
3888 draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE;
3889
3890 if (vs->info.vs.writes_point_size)
3891 draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ;
3892
3893 if (idvs)
3894 draw_info.flags |= PAN_INDIRECT_DRAW_IDVS;
3895
3896 if (info->primitive_restart) {
3897 draw_info.restart_index = info->restart_index;
3898 draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART;
3899 }
3900
3901 batch->indirect_draw_job_id =
3902 GENX(panfrost_emit_indirect_draw)(&batch->pool.base,
3903 &batch->scoreboard,
3904 &draw_info,
3905 &batch->indirect_draw_ctx);
3906
3907 if (idvs) {
3908 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
3909 MALI_JOB_TYPE_INDEXED_VERTEX, false, false,
3910 0, 0, &tiler, false);
3911 } else {
3912 panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler);
3913 }
3914 }
3915 #endif
3916
3917 static bool
3918 panfrost_compatible_batch_state(struct panfrost_batch *batch,
3919 bool points)
3920 {
3921 /* Only applies on Valhall */
3922 if (PAN_ARCH < 9)
3923 return true;
3924
3925 struct panfrost_context *ctx = batch->ctx;
3926 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3927
3928 bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
3929 bool first = rast->flatshade_first;
3930
3931 /* gl_PointCoord orientation only matters when drawing points, but
3932 * provoking vertex doesn't matter for points.
3933 */
3934 if (points)
3935 return pan_tristate_set(&batch->sprite_coord_origin, coord);
3936 else
3937 return pan_tristate_set(&batch->first_provoking_vertex, first);
3938 }
3939
3940 static void
3941 panfrost_draw_vbo(struct pipe_context *pipe,
3942 const struct pipe_draw_info *info,
3943 unsigned drawid_offset,
3944 const struct pipe_draw_indirect_info *indirect,
3945 const struct pipe_draw_start_count_bias *draws,
3946 unsigned num_draws)
3947 {
3948 struct panfrost_context *ctx = pan_context(pipe);
3949 struct panfrost_device *dev = pan_device(pipe->screen);
3950
3951 if (!panfrost_render_condition_check(ctx))
3952 return;
3953
3954 /* Emulate indirect draws unless we're using the experimental path */
3955 if ((!(dev->debug & PAN_DBG_INDIRECT) || !PAN_GPU_INDIRECTS) && indirect && indirect->buffer) {
3956 assert(num_draws == 1);
3957 util_draw_indirect(pipe, info, indirect);
3958 return;
3959 }
3960
3961 /* Do some common setup */
3962 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3963
3964 /* Don't add too many jobs to a single batch. Hardware has a hard limit
3965 * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to
3966 * avoid the risk of timeouts. This might not be a good idea. */
3967 if (unlikely(batch->scoreboard.job_index > 10000))
3968 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3969
3970 bool points = (info->mode == PIPE_PRIM_POINTS);
3971
3972 if (unlikely(!panfrost_compatible_batch_state(batch, points))) {
3973 batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
3974
3975 ASSERTED bool succ = panfrost_compatible_batch_state(batch, points);
3976 assert(succ && "must be able to set state for a fresh batch");
3977 }
3978
3979 /* panfrost_batch_skip_rasterization reads
3980 * batch->scissor_culls_everything, which is set by
3981 * panfrost_emit_viewport, so call that first.
3982 */
3983 if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
3984 batch->viewport = panfrost_emit_viewport(batch);
3985
3986 /* Mark everything dirty when debugging */
3987 if (unlikely(dev->debug & PAN_DBG_DIRTY))
3988 panfrost_dirty_state_all(ctx);
3989
3990 /* Conservatively assume draw parameters always change */
3991 ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3992
3993 if (indirect) {
3994 assert(num_draws == 1);
3995 assert(PAN_GPU_INDIRECTS);
3996
3997 #if PAN_GPU_INDIRECTS
3998 if (indirect->count_from_stream_output) {
3999 struct pipe_draw_start_count_bias tmp_draw = *draws;
4000 struct panfrost_streamout_target *so =
4001 pan_so_target(indirect->count_from_stream_output);
4002
4003 tmp_draw.start = 0;
4004 tmp_draw.count = so->offset;
4005 tmp_draw.index_bias = 0;
4006 panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw);
4007 return;
4008 }
4009
4010 panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]);
4011 return;
4012 #endif
4013 }
4014
4015 struct pipe_draw_info tmp_info = *info;
4016 unsigned drawid = drawid_offset;
4017
4018 for (unsigned i = 0; i < num_draws; i++) {
4019 panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]);
4020
4021 if (tmp_info.increment_draw_id) {
4022 ctx->dirty |= PAN_DIRTY_DRAWID;
4023 drawid++;
4024 }
4025 }
4026
4027 }
4028
4029 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
4030 * construct the COMPUTE job and some of its payload.
4031 */
4032
4033 static void
4034 panfrost_launch_grid(struct pipe_context *pipe,
4035 const struct pipe_grid_info *info)
4036 {
4037 struct panfrost_context *ctx = pan_context(pipe);
4038
4039 /* XXX - shouldn't be necessary with working memory barriers. Affected
4040 * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
4041 panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
4042
4043 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
4044
4045 struct panfrost_shader_state *cs =
4046 &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0];
4047
4048 /* Indirect dispatch can't handle workgroup local storage since that
4049 * would require dynamic memory allocation. Bail in this case. */
4050 if (info->indirect && ((cs->info.wls_size != 0) || !PAN_GPU_INDIRECTS)) {
4051 struct pipe_transfer *transfer;
4052 uint32_t *params = pipe_buffer_map_range(pipe, info->indirect,
4053 info->indirect_offset,
4054 3 * sizeof(uint32_t),
4055 PIPE_MAP_READ,
4056 &transfer);
4057
4058 struct pipe_grid_info direct = *info;
4059 direct.indirect = NULL;
4060 direct.grid[0] = params[0];
4061 direct.grid[1] = params[1];
4062 direct.grid[2] = params[2];
4063 pipe_buffer_unmap(pipe, transfer);
4064
4065 if (params[0] && params[1] && params[2])
4066 panfrost_launch_grid(pipe, &direct);
4067
4068 return;
4069 }
4070
4071 ctx->compute_grid = info;
4072
4073 struct panfrost_ptr t =
4074 pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
4075
4076 /* We implement OpenCL inputs as uniforms (or a UBO -- same thing), so
4077 * reuse the graphics path for this by lowering to Gallium */
4078
4079 struct pipe_constant_buffer ubuf = {
4080 .buffer = NULL,
4081 .buffer_offset = 0,
4082 .buffer_size = ctx->shader[PIPE_SHADER_COMPUTE]->req_input_mem,
4083 .user_buffer = info->input
4084 };
4085
4086 if (info->input)
4087 pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &ubuf);
4088
4089 /* Invoke according to the grid info */
4090
4091 unsigned num_wg[3] = { info->grid[0], info->grid[1], info->grid[2] };
4092
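/* For indirect dispatches these are placeholder counts; the indirect
 * dispatch job emitted further down patches in the real workgroup
 * counts on the GPU.
 */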
4093 if (info->indirect)
4094 num_wg[0] = num_wg[1] = num_wg[2] = 1;
4095
4096 panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
4097
4098 #if PAN_ARCH <= 7
4099 panfrost_pack_work_groups_compute(pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION),
4100 num_wg[0], num_wg[1], num_wg[2],
4101 info->block[0], info->block[1],
4102 info->block[2],
4103 false, info->indirect != NULL);
4104
4105 pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
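/* For example, an 8x8x1 workgroup yields a split of
 * ceil(log2(9)) + ceil(log2(9)) + ceil(log2(2)) = 4 + 4 + 1 = 9.
 */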
4106 cfg.job_task_split =
4107 util_logbase2_ceil(info->block[0] + 1) +
4108 util_logbase2_ceil(info->block[1] + 1) +
4109 util_logbase2_ceil(info->block[2] + 1);
4110 }
4111
4112 pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
4113 cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
4114 cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE);
4115 cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
4116 cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
4117 cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
4118 cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
4119 cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
4120 }
4121 #else
4122 pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
4123 cfg.workgroup_size_x = info->block[0];
4124 cfg.workgroup_size_y = info->block[1];
4125 cfg.workgroup_size_z = info->block[2];
4126
4127 cfg.workgroup_count_x = num_wg[0];
4128 cfg.workgroup_count_y = num_wg[1];
4129 cfg.workgroup_count_z = num_wg[2];
4130
4131 panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
4132 batch->rsd[PIPE_SHADER_COMPUTE],
4133 panfrost_emit_shared_memory(batch, info));
4134
4135 cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups;
4136 cfg.task_increment = 1;
4137 cfg.task_axis = MALI_TASK_AXIS_Z;
4138 }
4139 #endif
4140
4141 unsigned indirect_dep = 0;
4142 #if PAN_GPU_INDIRECTS
4143 if (info->indirect) {
4144 struct pan_indirect_dispatch_info indirect = {
4145 .job = t.gpu,
4146 .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
4147 info->indirect_offset,
4148 .num_wg_sysval = {
4149 batch->num_wg_sysval[0],
4150 batch->num_wg_sysval[1],
4151 batch->num_wg_sysval[2],
4152 },
4153 };
4154
4155 indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base,
4156 &batch->scoreboard,
4157 &indirect);
4158 }
4159 #endif
4160
4161 panfrost_add_job(&batch->pool.base, &batch->scoreboard,
4162 MALI_JOB_TYPE_COMPUTE, true, false,
4163 indirect_dep, 0, &t, false);
4164 panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
4165 }
4166
4167 static void *
4168 panfrost_create_rasterizer_state(
4169 struct pipe_context *pctx,
4170 const struct pipe_rasterizer_state *cso)
4171 {
4172 struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
4173
4174 so->base = *cso;
4175
4176 /* Guaranteed with the core GL call, so don't expose ARB_polygon_offset */
4177 assert(cso->offset_clamp == 0.0);
4178
4179 #if PAN_ARCH <= 7
4180 pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
4181 cfg.multisample_enable = cso->multisample;
4182 cfg.fixed_function_near_discard = cso->depth_clip_near;
4183 cfg.fixed_function_far_discard = cso->depth_clip_far;
4184 cfg.shader_depth_range_fixed = true;
4185 }
4186
4187 pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
4188 cfg.front_facing_depth_bias = cso->offset_tri;
4189 cfg.back_facing_depth_bias = cso->offset_tri;
4190 cfg.single_sampled_lines = !cso->multisample;
4191 }
4192 #endif
4193
4194 return so;
4195 }
4196
4197 #if PAN_ARCH >= 9
4198 /*
4199 * Given a pipe_vertex_element, pack the corresponding Valhall attribute
4200 * descriptor. This function is called at CSO create time. Since
4201 * pipe_vertex_element lacks a stride, the packed attribute descriptor will not
4202 * be uploaded until draw time.
4203 */
4204 static void
4205 panfrost_pack_attribute(struct panfrost_device *dev,
4206 const struct pipe_vertex_element el,
4207 struct mali_attribute_packed *out)
4208 {
4209 pan_pack(out, ATTRIBUTE, cfg) {
4210 cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
4211 cfg.frequency = (el.instance_divisor > 0) ?
4212 MALI_ATTRIBUTE_FREQUENCY_INSTANCE :
4213 MALI_ATTRIBUTE_FREQUENCY_VERTEX;
4214 cfg.format = dev->formats[el.src_format].hw;
4215 cfg.offset = el.src_offset;
4216 cfg.buffer_index = el.vertex_buffer_index;
4217
4218 if (el.instance_divisor == 0) {
4219 /* Per-vertex */
4220 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
4221 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
4222 cfg.offset_enable = true;
4223 } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
4224 /* Per-instance, POT divisor */
4225 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
4226 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
4227 cfg.divisor_r = __builtin_ctz(el.instance_divisor);
4228 } else {
4229 /* Per-instance, NPOT divisor */
4230 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
4231 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
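/* NPOT divisors are presumably implemented as a multiply-and-shift
 * reciprocal in hardware, hence the (divisor_d, divisor_r,
 * divisor_e) triple computed below instead of a plain divide.
 */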
4232
4233 cfg.divisor_d =
4234 panfrost_compute_magic_divisor(el.instance_divisor,
4235 &cfg.divisor_r, &cfg.divisor_e);
4236 }
4237 }
4238 }
4239 #endif
4240
4241 static void *
4242 panfrost_create_vertex_elements_state(
4243 struct pipe_context *pctx,
4244 unsigned num_elements,
4245 const struct pipe_vertex_element *elements)
4246 {
4247 struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
4248 struct panfrost_device *dev = pan_device(pctx->screen);
4249
4250 so->num_elements = num_elements;
4251 memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
4252
4253 #if PAN_ARCH >= 9
4254 for (unsigned i = 0; i < num_elements; ++i)
4255 panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
4256 #else
4257 /* Assign attribute buffers corresponding to the vertex buffers, keyed
4258 * for a particular divisor since that's how instancing works on Mali */
4259 for (unsigned i = 0; i < num_elements; ++i) {
4260 so->element_buffer[i] = pan_assign_vertex_buffer(
4261 so->buffers, &so->nr_bufs,
4262 elements[i].vertex_buffer_index,
4263 elements[i].instance_divisor);
4264 }
4265
4266 for (int i = 0; i < num_elements; ++i) {
4267 enum pipe_format fmt = elements[i].src_format;
4268 so->formats[i] = dev->formats[fmt].hw;
4269 }
4270
4271 /* Let's also prepare vertex builtins */
4272 so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
4273 so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
4274 #endif
4275
4276 return so;
4277 }
4278
4279 static inline unsigned
4280 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
4281 {
4282 switch (in) {
4283 case PIPE_STENCIL_OP_KEEP: return MALI_STENCIL_OP_KEEP;
4284 case PIPE_STENCIL_OP_ZERO: return MALI_STENCIL_OP_ZERO;
4285 case PIPE_STENCIL_OP_REPLACE: return MALI_STENCIL_OP_REPLACE;
4286 case PIPE_STENCIL_OP_INCR: return MALI_STENCIL_OP_INCR_SAT;
4287 case PIPE_STENCIL_OP_DECR: return MALI_STENCIL_OP_DECR_SAT;
4288 case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
4289 case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
4290 case PIPE_STENCIL_OP_INVERT: return MALI_STENCIL_OP_INVERT;
4291 default: unreachable("Invalid stencil op");
4292 }
4293 }
4294
4295 #if PAN_ARCH <= 7
4296 static inline void
4297 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
4298 struct mali_stencil_packed *out)
4299 {
4300 pan_pack(out, STENCIL, s) {
4301 s.mask = in->valuemask;
4302 s.compare_function = (enum mali_func) in->func;
4303 s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
4304 s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
4305 s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
4306 }
4307 }
4308 #endif
4309
4310 static bool
4311 pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
4312 {
4313 if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
4314 return false;
4315
4316 if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
4317 return false;
4318
4319 if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
4320 return false;
4321
4322 return true;
4323 }
4324
4325 static void *
4326 panfrost_create_depth_stencil_state(struct pipe_context *pipe,
4327 const struct pipe_depth_stencil_alpha_state *zsa)
4328 {
4329 struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
4330 so->base = *zsa;
4331
4332 const struct pipe_stencil_state front = zsa->stencil[0];
4333 const struct pipe_stencil_state back =
4334 zsa->stencil[1].enabled ? zsa->stencil[1] : front;
4335
4336 enum mali_func depth_func = zsa->depth_enabled ?
4337 (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;
4338
4339 /* Normalize (there's no separate enable) */
4340 if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
4341 so->base.alpha_func = MALI_FUNC_ALWAYS;
4342
4343 #if PAN_ARCH <= 7
4344 /* Prepack relevant parts of the Renderer State Descriptor. They will
4345 * be ORed in at draw-time */
4346 pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
4347 cfg.depth_function = depth_func;
4348 cfg.depth_write_mask = zsa->depth_writemask;
4349 }
4350
4351 pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
4352 cfg.stencil_enable = front.enabled;
4353 cfg.stencil_mask_front = front.writemask;
4354 cfg.stencil_mask_back = back.writemask;
4355
4356 #if PAN_ARCH <= 5
4357 cfg.alpha_test_compare_function =
4358 (enum mali_func) so->base.alpha_func;
4359 #endif
4360 }
4361
4362 /* Stencil tests have their own words in the RSD */
4363 pan_pipe_to_stencil(&front, &so->stencil_front);
4364 pan_pipe_to_stencil(&back, &so->stencil_back);
4365 #else
4366 pan_pack(&so->desc, DEPTH_STENCIL, cfg) {
4367 cfg.front_compare_function = (enum mali_func) front.func;
4368 cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
4369 cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
4370 cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);
4371
4372 cfg.back_compare_function = (enum mali_func) back.func;
4373 cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
4374 cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
4375 cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);
4376
4377 cfg.stencil_test_enable = front.enabled;
4378 cfg.front_write_mask = front.writemask;
4379 cfg.back_write_mask = back.writemask;
4380 cfg.front_value_mask = front.valuemask;
4381 cfg.back_value_mask = back.valuemask;
4382
4383 cfg.depth_write_enable = zsa->depth_writemask;
4384 cfg.depth_function = depth_func;
4385 }
4386 #endif
4387
4388 so->enabled = zsa->stencil[0].enabled ||
4389 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
4390
4391 so->zs_always_passes = pipe_zs_always_passes(zsa);
4392 so->writes_zs = util_writes_depth_stencil(zsa);
4393
4394 /* TODO: Bounds test should be easy */
4395 assert(!zsa->depth_bounds_test);
4396
4397 return so;
4398 }
4399
4400 static struct pipe_sampler_view *
4401 panfrost_create_sampler_view(
4402 struct pipe_context *pctx,
4403 struct pipe_resource *texture,
4404 const struct pipe_sampler_view *template)
4405 {
4406 struct panfrost_context *ctx = pan_context(pctx);
4407 struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view);
4408
4409 pan_legalize_afbc_format(ctx, pan_resource(texture), template->format);
4410
4411 pipe_reference(NULL, &texture->reference);
4412
4413 so->base = *template;
4414 so->base.texture = texture;
4415 so->base.reference.count = 1;
4416 so->base.context = pctx;
4417
4418 panfrost_create_sampler_view_bo(so, pctx, texture);
4419
4420 return (struct pipe_sampler_view *) so;
4421 }
4422
4423 /* A given Gallium blend state can be encoded to the hardware in numerous,
4424 * dramatically divergent ways due to the interactions of blending with
4425 * framebuffer formats. Conceptually, there are two modes:
4426 *
4427 * - Fixed-function blending (for suitable framebuffer formats, suitable blend
4428 * state, and suitable blend constant)
4429 *
4430 * - Blend shaders (for everything else)
4431 *
4432 * A given Gallium blend configuration will compile to exactly one
4433 * fixed-function blend state, if it compiles to any, although the constant
4434 * will vary across runs as that is tracked outside of the Gallium CSO.
4435 *
4436 * However, that same blend configuration will compile to many different blend
4437 * shaders, depending on the framebuffer formats active. The rationale is that
4438 * blend shaders override not just fixed-function blending but also
4439 * fixed-function format conversion, so blend shaders are keyed to a particular
4440 * framebuffer format. As an example, the tilebuffer format is identical for
4441 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
4442 * blend shaders.
4443 *
4444 * All of this state is encapsulated in the panfrost_blend_state struct
4445 * (our subclass of pipe_blend_state).
4446 */
4447
4448 /* Create a blend CSO. Essentially, try to compile a fixed-function
4449 * expression and initialize blend shaders */
4450
4451 static void *
4452 panfrost_create_blend_state(struct pipe_context *pipe,
4453 const struct pipe_blend_state *blend)
4454 {
4455 struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
4456 so->base = *blend;
4457
4458 so->pan.logicop_enable = blend->logicop_enable;
4459 so->pan.logicop_func = blend->logicop_func;
4460 so->pan.rt_count = blend->max_rt + 1;
4461
4462 for (unsigned c = 0; c < so->pan.rt_count; ++c) {
4463 unsigned g = blend->independent_blend_enable ? c : 0;
4464 const struct pipe_rt_blend_state pipe = blend->rt[g];
4465 struct pan_blend_equation equation = {0};
4466
4467 equation.color_mask = pipe.colormask;
4468 equation.blend_enable = pipe.blend_enable;
4469
4470 if (pipe.blend_enable) {
4471 equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
4472 equation.rgb_src_factor = util_blend_factor_to_shader(pipe.rgb_src_factor);
4473 equation.rgb_invert_src_factor = util_blend_factor_is_inverted(pipe.rgb_src_factor);
4474 equation.rgb_dst_factor = util_blend_factor_to_shader(pipe.rgb_dst_factor);
4475 equation.rgb_invert_dst_factor = util_blend_factor_is_inverted(pipe.rgb_dst_factor);
4476 equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
4477 equation.alpha_src_factor = util_blend_factor_to_shader(pipe.alpha_src_factor);
4478 equation.alpha_invert_src_factor = util_blend_factor_is_inverted(pipe.alpha_src_factor);
4479 equation.alpha_dst_factor = util_blend_factor_to_shader(pipe.alpha_dst_factor);
4480 equation.alpha_invert_dst_factor = util_blend_factor_is_inverted(pipe.alpha_dst_factor);
4481 }
4482
4483 /* Determine some common properties */
4484 unsigned constant_mask = pan_blend_constant_mask(equation);
4485 const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
4486 so->info[c] = (struct pan_blend_info) {
4487 .no_colour = (equation.color_mask == 0),
4488 .opaque = pan_blend_is_opaque(equation),
4489 .constant_mask = constant_mask,
4490
4491 /* TODO: check the dest for the logicop */
4492 .load_dest = blend->logicop_enable ||
4493 pan_blend_reads_dest(equation),
4494
4495 /* Could this possibly be fixed-function? */
4496 .fixed_function = !blend->logicop_enable &&
4497 pan_blend_can_fixed_function(equation,
4498 supports_2src) &&
4499 (!constant_mask ||
4500 pan_blend_supports_constant(PAN_ARCH, c)),
4501
4502 .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
4503 .alpha_one_store = pan_blend_alpha_one_store(equation),
4504 };
4505
4506 so->pan.rts[c].equation = equation;
4507
4508 /* Bifrost needs to know if any render target loads its
4509 * destination in the hot draw path, so precompute this */
4510 if (so->info[c].load_dest)
4511 so->load_dest_mask |= BITFIELD_BIT(c);
4512
4513 /* Converting equations to Mali style is expensive, do it at
4514 * CSO create time instead of draw-time */
4515 if (so->info[c].fixed_function) {
4516 so->equation[c] = pan_pack_blend(equation);
4517 }
4518 }
4519
4520 return so;
4521 }
4522
4523 static void
4524 prepare_shader(struct panfrost_shader_state *state,
4525 struct panfrost_pool *pool, bool upload)
4526 {
4527 #if PAN_ARCH <= 7
4528 void *out = &state->partial_rsd;
4529
4530 if (upload) {
4531 struct panfrost_ptr ptr =
4532 pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
4533
4534 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
4535 out = ptr.cpu;
4536 }
4537
4538 pan_pack(out, RENDERER_STATE, cfg) {
4539 pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
4541 }
4542 #else
4543 assert(upload);
4544
4545 /* The address in the shader program descriptor must be non-null, but
4546 * the entire shader program descriptor may be omitted.
4547 *
4548 * See dEQP-GLES31.functional.compute.basic.empty
4549 */
4550 if (!state->bin.gpu)
4551 return;
4552
4553 bool vs = (state->info.stage == MESA_SHADER_VERTEX);
4554 bool secondary_enable = (vs && state->info.vs.secondary_enable);
4555
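/* The shader program descriptors appear to be laid out as an array:
 * [0] generic (or IDVS position for points), [1] IDVS position for
 * triangles, [2] IDVS varying shader -- matching the offsets applied
 * in panfrost_emit_malloc_vertex.
 */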
4556 unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
4557 struct panfrost_ptr ptr = pan_pool_alloc_desc_array(&pool->base,
4558 nr_variants,
4559 SHADER_PROGRAM);
4560
4561 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
4562
4563 /* Generic, or IDVS/points */
4564 pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) {
4565 cfg.stage = pan_shader_stage(&state->info);
4566 cfg.primary_shader = true;
4567 cfg.register_allocation = pan_register_allocation(state->info.work_reg_count);
4568 cfg.binary = state->bin.gpu;
4569 cfg.preload.r48_r63 = (state->info.preload >> 48);
4570
4571 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
4572 cfg.requires_helper_threads = state->info.contains_barrier;
4573 }
4574
4575 if (!vs)
4576 return;
4577
4578 /* IDVS/triangles */
4579 pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) {
4580 cfg.stage = pan_shader_stage(&state->info);
4581 cfg.primary_shader = true;
4582 cfg.register_allocation = pan_register_allocation(state->info.work_reg_count);
4583 cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
4584 cfg.preload.r48_r63 = (state->info.preload >> 48);
4585 }
4586
4587 if (!secondary_enable)
4588 return;
4589
4590 pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) {
4591 unsigned work_count = state->info.vs.secondary_work_reg_count;
4592
4593 cfg.stage = pan_shader_stage(&state->info);
4594 cfg.primary_shader = false;
4595 cfg.register_allocation = pan_register_allocation(work_count);
4596 cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
4597 cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
4598 }
4599 #endif
4600 }
4601
4602 static void
4603 panfrost_get_sample_position(struct pipe_context *context,
4604 unsigned sample_count,
4605 unsigned sample_index,
4606 float *out_value)
4607 {
4608 panfrost_query_sample_position(
4609 panfrost_sample_pattern(sample_count),
4610 sample_index,
4611 out_value);
4612 }
4613
4614 static void
4615 screen_destroy(struct pipe_screen *pscreen)
4616 {
4617 struct panfrost_device *dev = pan_device(pscreen);
4618 GENX(pan_blitter_cleanup)(dev);
4619
4620 #if PAN_GPU_INDIRECTS
4621 GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
4622 GENX(pan_indirect_dispatch_cleanup)(dev);
4623 #endif
4624 }
4625
4626 static void
4627 preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
4628 {
4629 GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
4630 PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
4631 }
4632
4633 static void
4634 init_batch(struct panfrost_batch *batch)
4635 {
4636 /* Reserve the framebuffer and local storage descriptors */
4637 batch->framebuffer =
4638 #if PAN_ARCH == 4
4639 pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
4640 #else
4641 pan_pool_alloc_desc_aggregate(&batch->pool.base,
4642 PAN_DESC(FRAMEBUFFER),
4643 PAN_DESC(ZS_CRC_EXTENSION),
4644 PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
4645
4646 batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD;
4647 #endif
4648
4649 #if PAN_ARCH >= 6
4650 batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
4651 #else
4652 /* On Midgard, the TLS is embedded in the FB descriptor */
4653 batch->tls = batch->framebuffer;
4654 #endif
4655 }
4656
4657 static void
4658 panfrost_sampler_view_destroy(
4659 struct pipe_context *pctx,
4660 struct pipe_sampler_view *pview)
4661 {
4662 struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;
4663
4664 pipe_resource_reference(&pview->texture, NULL);
4665 panfrost_bo_unreference(view->state.bo);
4666 ralloc_free(view);
4667 }
4668
4669 static void
4670 context_init(struct pipe_context *pipe)
4671 {
4672 pipe->draw_vbo = panfrost_draw_vbo;
4673 pipe->launch_grid = panfrost_launch_grid;
4674
4675 pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
4676 pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
4677 pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
4678 pipe->create_sampler_view = panfrost_create_sampler_view;
4679 pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
4680 pipe->create_sampler_state = panfrost_create_sampler_state;
4681 pipe->create_blend_state = panfrost_create_blend_state;
4682
4683 pipe->get_sample_position = panfrost_get_sample_position;
4684 }
4685
4686 #if PAN_ARCH <= 5
4687
4688 /* Returns the polygon list's GPU address if available, or otherwise allocates
4689 * the polygon list. It's perfectly fast to allocate and free the BO directly,
4690 * since we'll hit the BO cache and this is one-per-batch anyway. */
4691
4692 static mali_ptr
4693 batch_get_polygon_list(struct panfrost_batch *batch)
4694 {
4695 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
4696
4697 if (!batch->tiler_ctx.midgard.polygon_list) {
4698 bool has_draws = batch->scoreboard.first_tiler != NULL;
4699 unsigned size =
4700 panfrost_tiler_get_polygon_list_size(dev,
4701 batch->key.width,
4702 batch->key.height,
4703 has_draws);
4704 size = util_next_power_of_two(size);
4705
4706 /* Create the BO as invisible if we can. In the non-hierarchical tiler case,
4707 * we need to write the polygon list manually because there's no WRITE_VALUE
4708 * job in the chain (maybe we should add one...). */
4709 bool init_polygon_list = !has_draws && dev->model->quirks.no_hierarchical_tiling;
4710 batch->tiler_ctx.midgard.polygon_list =
4711 panfrost_batch_create_bo(batch, size,
4712 init_polygon_list ? 0 : PAN_BO_INVISIBLE,
4713 PIPE_SHADER_VERTEX,
4714 "Polygon list");
4715 panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list,
4716 PIPE_SHADER_FRAGMENT);
4717
4718 if (init_polygon_list) {
4719 assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu);
4720 uint32_t *polygon_list_body =
4721 batch->tiler_ctx.midgard.polygon_list->ptr.cpu +
4722 MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;
4723
4724 /* Magic for Mali T720 */
4725 polygon_list_body[0] = 0xa0000000;
4726 }
4727
4728 batch->tiler_ctx.midgard.disable = !has_draws;
4729 }
4730
4731 return batch->tiler_ctx.midgard.polygon_list->ptr.gpu;
4732 }
4733 #endif
4734
4735 static void
4736 init_polygon_list(struct panfrost_batch *batch)
4737 {
4738 #if PAN_ARCH <= 5
4739 mali_ptr polygon_list = batch_get_polygon_list(batch);
4740 panfrost_scoreboard_initialize_tiler(&batch->pool.base,
4741 &batch->scoreboard,
4742 polygon_list);
4743 #endif
4744 }
4745
4746 void
4747 GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
4748 {
4749 struct panfrost_device *dev = &screen->dev;
4750
4751 screen->vtbl.prepare_shader = prepare_shader;
4752 screen->vtbl.emit_tls = emit_tls;
4753 screen->vtbl.emit_fbd = emit_fbd;
4754 screen->vtbl.emit_fragment_job = emit_fragment_job;
4755 screen->vtbl.screen_destroy = screen_destroy;
4756 screen->vtbl.preload = preload;
4757 screen->vtbl.context_init = context_init;
4758 screen->vtbl.init_batch = init_batch;
4759 screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
4760 screen->vtbl.init_polygon_list = init_polygon_list;
4761 screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
4762 screen->vtbl.compile_shader = GENX(pan_shader_compile);
4763
4764 GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base,
4765 &screen->blitter.desc_pool.base);
4766 #if PAN_GPU_INDIRECTS
4767 GENX(pan_indirect_dispatch_init)(dev);
4768 GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base);
4769 #endif
4770 }
4771