1 /*
2 * Copyright (C) 2023 Amazon.com, Inc. or its affiliates.
3 * Copyright (C) 2018 Alyssa Rosenzweig
4 * Copyright (C) 2020 Collabora Ltd.
5 * Copyright © 2017 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26
27 #include "gallium/auxiliary/util/u_blend.h"
28 #include "pipe/p_defines.h"
29 #include "pipe/p_state.h"
30 #include "util/macros.h"
31 #include "util/u_draw.h"
32 #include "util/u_helpers.h"
33 #include "util/u_memory.h"
34 #include "util/u_prim.h"
35 #include "util/u_sample_positions.h"
36 #include "util/u_vbuf.h"
37 #include "util/u_viewport.h"
38
39 #include "decode.h"
40
41 #include "genxml/gen_macros.h"
42
43 #include "pan_afbc_cso.h"
44 #include "pan_blend.h"
45 #include "pan_bo.h"
46 #include "pan_cmdstream.h"
47 #include "pan_context.h"
48 #include "pan_csf.h"
49 #include "pan_fb_preload.h"
50 #include "pan_format.h"
51 #include "pan_indirect_dispatch.h"
52 #include "pan_jm.h"
53 #include "pan_job.h"
54 #include "pan_pool.h"
55 #include "pan_resource.h"
56 #include "pan_samples.h"
57 #include "pan_shader.h"
58 #include "pan_texture.h"
59 #include "pan_util.h"
60
61 /* JOBX() is used to select the job backend helpers to call from generic
62 * functions. */
63 #if PAN_ARCH <= 9
64 #define JOBX(__suffix) GENX(jm_##__suffix)
65 #elif PAN_ARCH <= 10
66 #define JOBX(__suffix) GENX(csf_##__suffix)
67 #else
68 #error "Unsupported arch"
69 #endif
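/* Illustrative note (not driver code): with this macro a generic draw path can
 * call JOBX(some_helper)(...) and get the per-backend implementation picked at
 * compile time -- the Job Manager (jm_*) variant on v9 and older, the CSF
 * (csf_*) variant on v10. "some_helper" is a hypothetical suffix used only for
 * illustration:
 *
 *    JOBX(some_helper)(batch);   // v7 build  -> GENX(jm_some_helper)(batch)
 *    JOBX(some_helper)(batch);   // v10 build -> GENX(csf_some_helper)(batch)
 */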
70
71 struct panfrost_sampler_state {
72 struct pipe_sampler_state base;
73 struct mali_sampler_packed hw;
74 };
75
76 /* Misnomer: Sampler view corresponds to textures, not samplers */
77
78 struct panfrost_sampler_view {
79 struct pipe_sampler_view base;
80 struct panfrost_pool_ref state;
81 struct mali_texture_packed bifrost_descriptor;
82 uint64_t texture_bo;
83 uint64_t texture_size;
84 uint64_t modifier;
85
86 /* Pool used to allocate the descriptor. If NULL, defaults to the global
87 * descriptor pool. Can be set for short lived descriptors, useful for
88 * shader images on Valhall.
89 */
90 struct panfrost_pool *pool;
91 };
92
93 /* Statically assert that PIPE_* enums match the hardware enums.
94 * (As long as they match, we don't need to translate them.)
95 */
96 static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
97 static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
98 static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
99 static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
100 static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
101 static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
102 static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
103 static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");
104
105 static inline enum mali_sample_pattern
106 panfrost_sample_pattern(unsigned samples)
107 {
108 switch (samples) {
109 case 1:
110 return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
111 case 4:
112 return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
113 case 8:
114 return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
115 case 16:
116 return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
117 default:
118 unreachable("Unsupported sample count");
119 }
120 }
121
122 static unsigned
123 translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
124 {
125 /* CLAMP is only supported on Midgard, where it is broken for nearest
126 * filtering. Use CLAMP_TO_EDGE in that case.
127 */
128
129 switch (w) {
130 case PIPE_TEX_WRAP_REPEAT:
131 return MALI_WRAP_MODE_REPEAT;
132 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
133 return MALI_WRAP_MODE_CLAMP_TO_EDGE;
134 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
135 return MALI_WRAP_MODE_CLAMP_TO_BORDER;
136 case PIPE_TEX_WRAP_MIRROR_REPEAT:
137 return MALI_WRAP_MODE_MIRRORED_REPEAT;
138 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
139 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
140 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
141 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
142
143 #if PAN_ARCH <= 5
144 case PIPE_TEX_WRAP_CLAMP:
145 return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE
146 : MALI_WRAP_MODE_CLAMP;
147 case PIPE_TEX_WRAP_MIRROR_CLAMP:
148 return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE
149 : MALI_WRAP_MODE_MIRRORED_CLAMP;
150 #endif
151
152 default:
153 unreachable("Invalid wrap");
154 }
155 }
156
157 /* The hardware compares in the wrong order, so we have to flip before
158 * encoding. Yes, really. */
159
160 static enum mali_func
161 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
162 {
163 return !cso->compare_mode
164 ? MALI_FUNC_NEVER
165 : panfrost_flip_compare_func((enum mali_func)cso->compare_func);
166 }
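/* Worked example (assuming panfrost_flip_compare_func swaps the asymmetric
 * comparisons, e.g. LESS <-> GREATER and LEQUAL <-> GEQUAL): a sampler with
 * compare_mode enabled and compare_func = PIPE_FUNC_LEQUAL is encoded as
 * MALI_FUNC_GEQUAL, so the hardware's reversed operand order still yields the
 * LEQUAL result the API asked for. With compare_mode disabled, MALI_FUNC_NEVER
 * is packed and the field is simply not exercised by non-shadow lookups.
 */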
167
168 static enum mali_mipmap_mode
169 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
170 {
171 switch (f) {
172 case PIPE_TEX_MIPFILTER_NEAREST:
173 return MALI_MIPMAP_MODE_NEAREST;
174 case PIPE_TEX_MIPFILTER_LINEAR:
175 return MALI_MIPMAP_MODE_TRILINEAR;
176 #if PAN_ARCH >= 6
177 case PIPE_TEX_MIPFILTER_NONE:
178 return MALI_MIPMAP_MODE_NONE;
179 #else
180 case PIPE_TEX_MIPFILTER_NONE:
181 return MALI_MIPMAP_MODE_NEAREST;
182 #endif
183 default:
184 unreachable("Invalid");
185 }
186 }
187
188 #if PAN_ARCH == 7
189 static void
190 pan_afbc_reswizzle_border_color(const struct pipe_sampler_state *cso,
191 struct panfrost_sampler_state *so)
192 {
193 if (!panfrost_format_supports_afbc(PAN_ARCH, cso->border_color_format))
194 return;
195
196 /* On v7, pan_texture.c composes the API swizzle with a bijective
197 * swizzle derived from the format, to allow more formats than the
198 * hardware otherwise supports. When packing border colours, we need to
199 * undo this bijection, by swizzling with its inverse.
200 */
201 unsigned mali_format =
202 GENX(panfrost_format_from_pipe_format)(cso->border_color_format)->hw;
203 enum mali_rgb_component_order order = mali_format & BITFIELD_MASK(12);
204
205 unsigned char inverted_swizzle[4];
206 panfrost_invert_swizzle(GENX(pan_decompose_swizzle)(order).post,
207 inverted_swizzle);
208
209 util_format_apply_color_swizzle(&so->base.border_color, &cso->border_color,
210 inverted_swizzle,
211 false /* is_integer (irrelevant) */);
212 }
213 #endif
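/* Illustrative example of the inversion above (a sketch, not additional
 * driver logic): if pan_decompose_swizzle() reports a post-swizzle that
 * exchanges the R and B channels (as a BGRA-style reordering would), that
 * swap is its own inverse, so the border colour is packed with R and B
 * exchanged and lands in the channels the reswizzled texture actually reads.
 * The exact component orders involved depend on the per-format tables used by
 * pan_texture.c.
 */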
214
215 static void *
216 panfrost_create_sampler_state(struct pipe_context *pctx,
217 const struct pipe_sampler_state *cso)
218 {
219 struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
220 so->base = *cso;
221
222 #if PAN_ARCH == 7
223 pan_afbc_reswizzle_border_color(cso, so);
224 #endif
225
226 bool using_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
227
228 pan_pack(&so->hw, SAMPLER, cfg) {
229 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
230 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
231
232 cfg.normalized_coordinates = !cso->unnormalized_coords;
233 cfg.lod_bias = cso->lod_bias;
234 cfg.minimum_lod = cso->min_lod;
235 cfg.maximum_lod = cso->max_lod;
236
237 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
238 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
239 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);
240
241 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
242 cfg.compare_function = panfrost_sampler_compare_func(cso);
243 cfg.seamless_cube_map = cso->seamless_cube_map;
244
245 cfg.border_color_r = so->base.border_color.ui[0];
246 cfg.border_color_g = so->base.border_color.ui[1];
247 cfg.border_color_b = so->base.border_color.ui[2];
248 cfg.border_color_a = so->base.border_color.ui[3];
249
250 #if PAN_ARCH >= 6
251 if (cso->max_anisotropy > 1) {
252 cfg.maximum_anisotropy = cso->max_anisotropy;
253 cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
254 }
255 #else
256 /* Emulate disabled mipmapping by clamping the LOD as tight as
257 * possible (from 0 to epsilon = 1/256) */
258 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
259 cfg.maximum_lod = cfg.minimum_lod + (1.0 / 256.0);
260 #endif
261 }
262
263 return so;
264 }
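/* Worked example of the Midgard mipmap emulation above: with
 * min_mip_filter = PIPE_TEX_MIPFILTER_NONE and min_lod = 0, the packed sampler
 * ends up with an LOD range of [0, 1/256], which pins sampling to the base
 * level while still going through the normal mipmapped path.
 */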
265
266 /* Get pointers to the blend shaders bound to each active render target. Used
267 * to emit the blend descriptors, as well as the fragment renderer state
268 * descriptor.
269 */
270 static void
271 panfrost_get_blend_shaders(struct panfrost_batch *batch,
272 uint64_t *blend_shaders)
273 {
274 unsigned shader_offset = 0;
275 struct panfrost_bo *shader_bo = NULL;
276
277 for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
278 if (batch->key.cbufs[c]) {
279 blend_shaders[c] =
280 panfrost_get_blend(batch, c, &shader_bo, &shader_offset);
281 }
282 }
283
284 if (shader_bo)
285 perf_debug(batch->ctx, "Blend shader use");
286 }
287
288 #if PAN_ARCH >= 5
289 UNUSED static uint16_t
290 pack_blend_constant(enum pipe_format format, float cons)
291 {
292 const struct util_format_description *format_desc =
293 util_format_description(format);
294
295 unsigned chan_size = 0;
296
297 for (unsigned i = 0; i < format_desc->nr_channels; i++)
298 chan_size = MAX2(format_desc->channel[i].size, chan_size);
299
300 uint16_t unorm = (cons * ((1 << chan_size) - 1));
301 return unorm << (16 - chan_size);
302 }
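/* Worked example: for an RGBA8 render target the widest channel is 8 bits, so
 * a blend constant of 0.5 packs as (uint16_t)(0.5 * 255) = 127, shifted left
 * by 16 - 8 = 8 bits, giving 0x7F00. Formats with wider channels shift less
 * and keep more of the constant's precision.
 */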
303
304 static void
305 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
306 uint64_t *blend_shaders)
307 {
308 unsigned rt_count = batch->key.nr_cbufs;
309 struct panfrost_context *ctx = batch->ctx;
310 const struct panfrost_blend_state *so = ctx->blend;
311 bool dithered = so->base.dither;
312
313 /* Always have at least one render target for depth-only passes */
314 for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
315 struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));
316
317 /* Disable blending for unbacked render targets */
318 if (rt_count == 0 || !batch->key.cbufs[i] || !so->info[i].enabled) {
319 pan_pack(packed, BLEND, cfg) {
320 cfg.enable = false;
321 #if PAN_ARCH >= 6
322 cfg.internal.mode = MALI_BLEND_MODE_OFF;
323 #endif
324 }
325
326 continue;
327 }
328
329 struct pan_blend_info info = so->info[i];
330 enum pipe_format format = batch->key.cbufs[i]->format;
331 float cons =
332 pan_blend_get_constant(info.constant_mask, ctx->blend_color.color);
333
334 /* Word 0: Flags and constant */
335 pan_pack(packed, BLEND, cfg) {
336 cfg.srgb = util_format_is_srgb(format);
337 cfg.load_destination = info.load_dest;
338 cfg.round_to_fb_precision = !dithered;
339 cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
340 #if PAN_ARCH >= 6
341 if (!blend_shaders[i])
342 cfg.constant = pack_blend_constant(format, cons);
343 #else
344 cfg.blend_shader = (blend_shaders[i] != 0);
345
346 if (blend_shaders[i])
347 cfg.shader_pc = blend_shaders[i];
348 else
349 cfg.constant = cons;
350 #endif
351 }
352
353 if (!blend_shaders[i]) {
354 /* Word 1: Blend Equation */
355 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
356 packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
357 }
358
359 #if PAN_ARCH >= 6
360 struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
361 struct mali_internal_blend_packed *internal_blend_packed =
362 (struct mali_internal_blend_packed *)&packed->opaque[2];
363
364 /* Words 2 and 3: Internal blend */
365 if (blend_shaders[i]) {
366 /* The blend shader's address must share the same
367 * top 32 bits as the fragment shader's.
368 * TODO: Ensure that's always the case.
369 */
370 assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) ==
371 (fs->bin.gpu & (0xffffffffull << 32)));
372
373 pan_pack(internal_blend_packed, INTERNAL_BLEND, cfg) {
374 cfg.mode = MALI_BLEND_MODE_SHADER;
375 cfg.shader.pc = (uint32_t)blend_shaders[i];
376
377 #if PAN_ARCH <= 7
378 unsigned ret_offset = fs->info.bifrost.blend[i].return_offset;
379 assert(!(ret_offset & 0x7));
380
381 cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0;
382 #endif
383 }
384 } else {
385 pan_pack(internal_blend_packed, INTERNAL_BLEND, cfg) {
386 cfg.mode = info.opaque ? MALI_BLEND_MODE_OPAQUE
387 : MALI_BLEND_MODE_FIXED_FUNCTION;
388
389 /* If we want the conversion to work properly,
390 * num_comps must be set to 4
391 */
392 cfg.fixed_function.num_comps = 4;
393 cfg.fixed_function.conversion.memory_format = GENX(
394 panfrost_dithered_format_from_pipe_format)(format, dithered);
395 cfg.fixed_function.rt = i;
396
397 #if PAN_ARCH >= 7
398 if (cfg.mode == MALI_BLEND_MODE_FIXED_FUNCTION &&
399 (cfg.fixed_function.conversion.memory_format & 0xff) ==
400 MALI_RGB_COMPONENT_ORDER_RGB1) {
401 /* fixed function does not like RGB1 as the component order */
402 /* force this field to be the default 0 (RGBA) */
403 cfg.fixed_function.conversion.memory_format &= ~0xff;
404 cfg.fixed_function.conversion.memory_format |=
405 MALI_RGB_COMPONENT_ORDER_RGBA;
406 }
407 #endif
408 #if PAN_ARCH <= 7
409 if (!info.opaque) {
410 cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop;
411 cfg.fixed_function.alpha_one_store = info.alpha_one_store;
412 }
413
414 if (fs->info.fs.untyped_color_outputs) {
415 cfg.fixed_function.conversion.register_format = GENX(
416 pan_fixup_blend_type)(fs->info.bifrost.blend[i].type, format);
417 } else {
418 cfg.fixed_function.conversion.register_format =
419 fs->info.bifrost.blend[i].format;
420 }
421 #endif
422 }
423 }
424 #endif
425 }
426 }
427 #endif
428
429 static uint64_t
430 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch,
431 enum pipe_shader_type stage)
432 {
433 struct panfrost_compiled_shader *ss = batch->ctx->prog[stage];
434
435 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX);
436 panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX);
437
438 return ss->state.gpu;
439 }
440
441 static float
442 panfrost_z_depth_offset(struct panfrost_context *ctx, float offset_units)
443 {
444 if (ctx->pipe_framebuffer.zsbuf) {
445 if (util_format_is_float(ctx->pipe_framebuffer.zsbuf->format)) {
446 /* no scaling necessary, hw will do this at run time */
447 return offset_units;
448 }
449 }
450 /* if fixed point, apply the minimum resolvable difference scaling here */
451 return 2.0f * offset_units;
452 }
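/* Worked example: a polygon offset of offset_units = 1.0 against a
 * Z32_FLOAT attachment is returned unchanged (the hardware scales it at run
 * time), while the same value against a fixed-point depth buffer such as Z24
 * comes back as 2.0, applying the minimum-resolvable-difference scaling on
 * the CPU instead.
 */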
453
454 #if PAN_ARCH <= 7
455 /* Construct a partial RSD corresponding to no executed fragment shader, and
456 * merge with the existing partial RSD. */
457
458 static void
459 pan_merge_empty_fs(struct mali_renderer_state_packed *rsd)
460 {
461 struct mali_renderer_state_packed empty_rsd;
462
463 pan_pack(&empty_rsd, RENDERER_STATE, cfg) {
464 #if PAN_ARCH >= 6
465 cfg.properties.shader_modifies_coverage = true;
466 cfg.properties.allow_forward_pixel_to_kill = true;
467 cfg.properties.allow_forward_pixel_to_be_killed = true;
468 cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
469
470 /* Alpha isn't written so these are vacuous */
471 cfg.multisample_misc.overdraw_alpha0 = true;
472 cfg.multisample_misc.overdraw_alpha1 = true;
473 #else
474 cfg.shader.shader = 0x1;
475 cfg.properties.work_register_count = 1;
476 cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
477 cfg.properties.force_early_z = true;
478 #endif
479 }
480
481 pan_merge((*rsd), empty_rsd, RENDERER_STATE);
482 }
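/* Note on the merge above (a sketch of the intent, not new behaviour): the
 * "empty" RSD only sets the fields that matter when no fragment shader runs,
 * and pan_merge() folds it into the caller's partially packed descriptor the
 * same way the opaque-word ORs in panfrost_emit_frag_shader() below fold in
 * the ZSA and rasterizer state.
 */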
483
484 static void
485 panfrost_prepare_fs_state(struct panfrost_context *ctx, uint64_t *blend_shaders,
486 struct mali_renderer_state_packed *rsd)
487 {
488 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
489 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
490 struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
491 struct panfrost_blend_state *so = ctx->blend;
492 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
493 bool msaa = rast->multisample;
494
495 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
496
497 bool has_blend_shader = false;
498
499 for (unsigned c = 0; c < rt_count; ++c)
500 has_blend_shader |= (blend_shaders[c] != 0);
501
502 bool has_oq = ctx->occlusion_query && ctx->active_queries;
503
504 pan_pack(rsd, RENDERER_STATE, cfg) {
505 if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) {
506 #if PAN_ARCH >= 6
507 struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
508 fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
509 ctx->blend->base.alpha_to_coverage,
510 ctx->depth_stencil->zs_always_passes);
511
512 cfg.properties.pixel_kill_operation = earlyzs.kill;
513 cfg.properties.zs_update_operation = earlyzs.update;
514
515 cfg.properties.allow_forward_pixel_to_kill =
516 pan_allow_forward_pixel_to_kill(ctx, fs);
517 #else
518 cfg.properties.force_early_z =
519 fs->info.fs.can_early_z && !alpha_to_coverage &&
520 ((enum mali_func)zsa->base.alpha_func == MALI_FUNC_ALWAYS);
521
522 /* TODO: Reduce this limit? */
523 if (has_blend_shader)
524 cfg.properties.work_register_count =
525 MAX2(fs->info.work_reg_count, 8);
526 else
527 cfg.properties.work_register_count = fs->info.work_reg_count;
528
529 /* Hardware quirks around early-zs forcing without a
530 * depth buffer. Note this breaks occlusion queries. */
531 bool force_ez_with_discard = !zsa->enabled && !has_oq;
532
533 cfg.properties.shader_reads_tilebuffer =
534 force_ez_with_discard && fs->info.fs.can_discard;
535 cfg.properties.shader_contains_discard =
536 !force_ez_with_discard && fs->info.fs.can_discard;
537 #endif
538 }
539
540 #if PAN_ARCH == 4
541 if (rt_count > 0) {
542 cfg.multisample_misc.load_destination = so->info[0].load_dest;
543 cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
544 cfg.stencil_mask_misc.write_enable = so->info[0].enabled;
545 cfg.stencil_mask_misc.srgb =
546 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
547 cfg.stencil_mask_misc.dither_disable = !so->base.dither;
548 cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;
549
550 if (blend_shaders[0]) {
551 cfg.blend_shader = blend_shaders[0];
552 } else {
553 cfg.blend_constant = pan_blend_get_constant(
554 so->info[0].constant_mask, ctx->blend_color.color);
555 }
556 } else {
557 /* If there is no colour buffer, leaving fields default is
558 * fine, except for blending which is nonnullable */
559 cfg.blend_equation.color_mask = 0xf;
560 cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
561 cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
562 cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
563 cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
564 cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
565 cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
566 }
567 #elif PAN_ARCH == 5
568 /* Workaround */
569 cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
570 #endif
571
572 cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
573
574 cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1);
575
576 #if PAN_ARCH >= 6
577 /* MSAA blend shaders need to pass their sample ID to
578 * LD_TILE/ST_TILE, so we must preload it. Additionally, we
579 * need per-sample shading for the blend shader, accomplished
580 * by forcing per-sample shading for the whole program. */
581
582 if (msaa && has_blend_shader) {
583 cfg.multisample_misc.evaluate_per_sample = true;
584 cfg.preload.fragment.sample_mask_id = true;
585 }
586
587 /* Bifrost does not have native point sprites. Point sprites are
588 * lowered in the driver to gl_PointCoord reads. This field
589 * actually controls the orientation of gl_PointCoord. Both
590 * orientations are controlled with sprite_coord_mode in
591 * Gallium.
592 */
593 cfg.properties.point_sprite_coord_origin_max_y =
594 (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
595
596 cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
597 cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
598 #endif
599
600 cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
601 cfg.depth_units = panfrost_z_depth_offset(ctx, rast->offset_units);
602 cfg.depth_factor = rast->offset_scale;
603 cfg.depth_bias_clamp = rast->offset_clamp;
604
605 bool back_enab = zsa->base.stencil[1].enabled;
606 cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
607 cfg.stencil_back.reference_value =
608 ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
609
610 #if PAN_ARCH <= 5
611 /* v6+ fits register preload here, no alpha testing */
612 cfg.alpha_reference = zsa->base.alpha_ref_value;
613 #endif
614 }
615 }
616
617 static void
618 panfrost_emit_frag_shader(struct panfrost_context *ctx,
619 struct mali_renderer_state_packed *fragmeta,
620 uint64_t *blend_shaders)
621 {
622 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
623 const struct panfrost_rasterizer *rast = ctx->rasterizer;
624 struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
625
626 /* We need to merge several partial renderer state descriptors,
627 * so stage to temporary storage rather than reading back write-combine
628 * memory, which will trash performance. */
629 struct mali_renderer_state_packed rsd;
630 panfrost_prepare_fs_state(ctx, blend_shaders, &rsd);
631
632 #if PAN_ARCH == 4
633 if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) {
634 /* Word 14: SFBD Blend Equation */
635 STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
636 rsd.opaque[14] = ctx->blend->equation[0];
637 }
638 #endif
639
640 /* Merge with CSO state and upload */
641 if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) {
642 struct mali_renderer_state_packed *partial_rsd =
643 (struct mali_renderer_state_packed *)&fs->partial_rsd;
644 STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd));
645 pan_merge(rsd, *partial_rsd, RENDERER_STATE);
646 } else {
647 pan_merge_empty_fs(&rsd);
648 }
649
650 /* Word 8, 9 Misc state */
651 rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0];
652
653 rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0];
654
655 /* late patching of the merged RSD in case of line-smoothing */
656 if (u_reduced_prim(ctx->active_prim) == MESA_PRIM_LINES &&
657 rast->base.line_smooth) {
658 rsd.opaque[8] |= (1u << 16); // multisample_enable = 1
659 rsd.opaque[9] &= ~(1u << 30); // single_sampled_lines = 0
660 }
661
662 /* Word 10, 11 Stencil Front and Back */
663 rsd.opaque[10] |= zsa->stencil_front.opaque[0];
664 rsd.opaque[11] |= zsa->stencil_back.opaque[0];
665
666 memcpy(fragmeta, &rsd, sizeof(rsd));
667 }
668
669 static uint64_t
670 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
671 {
672 struct panfrost_context *ctx = batch->ctx;
673 struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_FRAGMENT];
674
675 panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT);
676 panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_FRAGMENT);
677
678 struct panfrost_ptr xfer;
679
680 #if PAN_ARCH == 4
681 xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE);
682 #else
683 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
684
685 xfer =
686 pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE),
687 PAN_DESC_ARRAY(rt_count, BLEND));
688 #endif
689
690 if (!xfer.cpu)
691 return 0;
692
693 uint64_t blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
694 panfrost_get_blend_shaders(batch, blend_shaders);
695
696 panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *)xfer.cpu,
697 blend_shaders);
698
699 #if PAN_ARCH >= 5
700 panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE),
701 blend_shaders);
702 #endif
703
704 return xfer.gpu;
705 }
706 #endif
707
708 static uint64_t
709 panfrost_emit_viewport(struct panfrost_batch *batch)
710 {
711 struct panfrost_context *ctx = batch->ctx;
712 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
713 const struct pipe_scissor_state *ss = &ctx->scissor;
714 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
715
716 /* Derive min/max from translate/scale. Note since |x| >= 0 by
717 * definition, we have that -|x| <= |x| hence translate - |scale| <=
718 * translate + |scale|, so the ordering is correct here. */
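/* Worked example: translate[0] = 400, scale[0] = 400 gives an X range of
 * [0, 800]; a negated scale (flipped viewport) produces the same range
 * because of the fabsf() below. */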
719 float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
720 float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
721 float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
722 float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
723
724 float minz, maxz;
725 util_viewport_zmin_zmax(vp, rast->clip_halfz, &minz, &maxz);
726
727 /* Scissor to the intersection of the viewport and the scissor, clamped
728 * to the framebuffer */
729
730 unsigned minx = MIN2(batch->key.width, MAX2((int)vp_minx, 0));
731 unsigned maxx = MIN2(batch->key.width, MAX2((int)vp_maxx, 0));
732 unsigned miny = MIN2(batch->key.height, MAX2((int)vp_miny, 0));
733 unsigned maxy = MIN2(batch->key.height, MAX2((int)vp_maxy, 0));
734
735 if (ss && rast->scissor) {
736 minx = MAX2(ss->minx, minx);
737 miny = MAX2(ss->miny, miny);
738 maxx = MIN2(ss->maxx, maxx);
739 maxy = MIN2(ss->maxy, maxy);
740 }
741
742 /* Set the range to [1, 1) so max values don't wrap round */
743 if (maxx == 0 || maxy == 0)
744 maxx = maxy = minx = miny = 1;
745
746 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
747 batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);
748
749 /* [minx, maxx) and [miny, maxy) are exclusive ranges in the hardware */
750 maxx--;
751 maxy--;
752
753 batch->minimum_z = minz;
754 batch->maximum_z = maxz;
755
756 #if PAN_ARCH <= 7
757 struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);
758
759 if (!T.cpu)
760 return 0;
761
762 pan_cast_and_pack(T.cpu, VIEWPORT, cfg) {
763 cfg.scissor_minimum_x = minx;
764 cfg.scissor_minimum_y = miny;
765 cfg.scissor_maximum_x = maxx;
766 cfg.scissor_maximum_y = maxy;
767
768 cfg.minimum_z = batch->minimum_z;
769 cfg.maximum_z = batch->maximum_z;
770 }
771
772 return T.gpu;
773 #else
774 pan_cast_and_pack(&batch->scissor, SCISSOR, cfg) {
775 cfg.scissor_minimum_x = minx;
776 cfg.scissor_minimum_y = miny;
777 cfg.scissor_maximum_x = maxx;
778 cfg.scissor_maximum_y = maxy;
779 }
780
781 return 0;
782 #endif
783 }
784
785 #if PAN_ARCH >= 9
786 /**
787 * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
788 * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
789 * CSO create time. However, the stencil reference values and shader
790 * interactions are dynamic state. Pack only the dynamic state here and OR
791 * together.
792 */
793 static uint64_t
794 panfrost_emit_depth_stencil(struct panfrost_batch *batch)
795 {
796 struct panfrost_context *ctx = batch->ctx;
797 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
798 struct panfrost_rasterizer *rast = ctx->rasterizer;
799 struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
800 bool back_enab = zsa->base.stencil[1].enabled;
801
802 struct panfrost_ptr T =
803 pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
804
805 if (!T.cpu)
806 return 0;
807
808 struct mali_depth_stencil_packed dynamic;
809 pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
810 cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
811 cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
812
813 cfg.stencil_from_shader = fs->info.fs.writes_stencil;
814 cfg.depth_source = pan_depth_source(&fs->info);
815
816 cfg.depth_bias_enable = rast->base.offset_tri;
817 cfg.depth_units = panfrost_z_depth_offset(ctx, rast->base.offset_units);
818 cfg.depth_factor = rast->base.offset_scale;
819 cfg.depth_bias_clamp = rast->base.offset_clamp;
820
821 assert(rast->base.depth_clip_near == rast->base.depth_clip_far);
822 cfg.depth_cull_enable = rast->base.depth_clip_near;
823 cfg.depth_clamp_mode = rast->base.depth_clamp
824 ? MALI_DEPTH_CLAMP_MODE_BOUNDS
825 : MALI_DEPTH_CLAMP_MODE_0_1;
826 }
827
828 pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
829 memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));
830
831 return T.gpu;
832 }
833
834 /**
835 * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
836 * with Bifrost, but the container data structure is simplified.
837 */
838 static uint64_t
839 panfrost_emit_blend_valhall(struct panfrost_batch *batch)
840 {
841 unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);
842
843 struct panfrost_ptr T =
844 pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);
845
846 if (!T.cpu)
847 return 0;
848
849 uint64_t blend_shaders[PIPE_MAX_COLOR_BUFS] = {0};
850 panfrost_get_blend_shaders(batch, blend_shaders);
851
852 panfrost_emit_blend(batch, T.cpu, blend_shaders);
853
854 /* Precalculate for the per-draw path */
855 bool has_blend_shader = false;
856
857 for (unsigned i = 0; i < rt_count; ++i)
858 has_blend_shader |= !!blend_shaders[i];
859
860 batch->ctx->valhall_has_blend_shader = has_blend_shader;
861
862 return T.gpu;
863 }
864
865 /**
866 * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
867 */
868 static uint64_t
869 panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
870 {
871 struct panfrost_context *ctx = batch->ctx;
872 unsigned buffer_count = util_last_bit(ctx->vb_mask);
873 struct panfrost_ptr T =
874 pan_pool_alloc_desc_array(&batch->pool.base, buffer_count, BUFFER);
875
876 if (!T.cpu)
877 return 0;
878
879 struct mali_buffer_packed *buffers = T.cpu;
880
881 memset(buffers, 0, sizeof(*buffers) * buffer_count);
882
883 u_foreach_bit(i, ctx->vb_mask) {
884 struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
885 struct pipe_resource *prsrc = vb.buffer.resource;
886 struct panfrost_resource *rsrc = pan_resource(prsrc);
887 assert(!vb.is_user_buffer);
888
889 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
890
891 pan_pack(buffers + i, BUFFER, cfg) {
892 cfg.address = rsrc->image.data.base + vb.buffer_offset;
893
894 cfg.size = prsrc->width0 - vb.buffer_offset;
895 }
896 }
897
898 return T.gpu;
899 }
900
901 static uint64_t
902 panfrost_emit_vertex_data(struct panfrost_batch *batch)
903 {
904 struct panfrost_context *ctx = batch->ctx;
905 struct panfrost_vertex_state *vtx = ctx->vertex;
906
907 return pan_pool_upload_aligned(&batch->pool.base, vtx->attributes,
908 vtx->num_elements * pan_size(ATTRIBUTE),
909 pan_alignment(ATTRIBUTE));
910 }
911
912 static void panfrost_update_sampler_view(struct panfrost_sampler_view *view,
913 struct pipe_context *pctx);
914
915 static uint64_t
916 panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
917 {
918 struct panfrost_context *ctx = batch->ctx;
919 unsigned last_bit = util_last_bit(ctx->image_mask[stage]);
920
921 struct panfrost_ptr T =
922 pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);
923
924 struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
925
926 for (int i = 0; i < last_bit; ++i) {
927 struct pipe_image_view *image = &ctx->images[stage][i];
928
929 if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
930 memset(&out[i], 0, sizeof(out[i]));
931 continue;
932 }
933
934 /* Construct a synthetic sampler view so we can use our usual
935 * sampler view code for the actual descriptor packing.
936 *
937 * Use the batch pool for a transient allocation, rather than
938 * allocating a long-lived descriptor.
939 */
940 struct panfrost_sampler_view view = {
941 .base = util_image_to_sampler_view(image),
942 .pool = &batch->pool,
943 };
944
945 panfrost_update_sampler_view(&view, &ctx->base);
946 out[i] = view.bifrost_descriptor;
947
948 panfrost_track_image_access(batch, stage, image);
949 }
950
951 return T.gpu;
952 }
953 #endif
954
955 static uint64_t
956 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
957 enum pipe_shader_type st,
958 struct panfrost_constant_buffer *buf,
959 unsigned index)
960 {
961 struct pipe_constant_buffer *cb = &buf->cb[index];
962 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
963
964 if (rsrc) {
965 panfrost_batch_read_rsrc(batch, rsrc, st);
966
967 /* Alignment guaranteed by
968 * pipe_caps.constant_buffer_offset_alignment */
969 return rsrc->image.data.base + cb->buffer_offset;
970 } else if (cb->user_buffer) {
971 return pan_pool_upload_aligned(&batch->pool.base,
972 cb->user_buffer + cb->buffer_offset,
973 cb->buffer_size, 16);
974 } else {
975 unreachable("No constant buffer");
976 }
977 }
978
979 struct sysval_uniform {
980 union {
981 float f[4];
982 int32_t i[4];
983 uint32_t u[4];
984 uint64_t du[2];
985 };
986 };
987
988 static void
989 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
990 struct sysval_uniform *uniform)
991 {
992 struct panfrost_context *ctx = batch->ctx;
993 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
994
995 uniform->f[0] = vp->scale[0];
996 uniform->f[1] = vp->scale[1];
997 uniform->f[2] = vp->scale[2];
998 }
999
1000 static void
1001 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1002 struct sysval_uniform *uniform)
1003 {
1004 struct panfrost_context *ctx = batch->ctx;
1005 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1006
1007 uniform->f[0] = vp->translate[0];
1008 uniform->f[1] = vp->translate[1];
1009 uniform->f[2] = vp->translate[2];
1010 }
1011
1012 static void
1013 panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1014 enum pipe_shader_type st, unsigned int sysvalid,
1015 struct sysval_uniform *uniform)
1016 {
1017 struct panfrost_context *ctx = batch->ctx;
1018 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1019 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1020 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1021 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1022
1023 assert(dim);
1024
1025 if (tex->target == PIPE_BUFFER) {
1026 assert(dim == 1);
1027 unsigned buf_size = tex->u.buf.size / util_format_get_blocksize(tex->format);
1028 uniform->i[0] = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1029 return;
1030 }
1031
1032 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1033
1034 if (dim > 1)
1035 uniform->i[1] = u_minify(tex->texture->height0, tex->u.tex.first_level);
1036
1037 if (dim > 2)
1038 uniform->i[2] = u_minify(tex->texture->depth0, tex->u.tex.first_level);
1039
1040 if (is_array) {
1041 unsigned size = tex->texture->array_size;
1042
1043 /* Internally, we store the number of 2D images (faces * array
1044 * size). Externally, we report the array size in terms of
1045 * complete cubes. So divide by the # of faces per cube.
1046 */
1047 if (tex->target == PIPE_TEXTURE_CUBE_ARRAY)
1048 size /= 6;
1049
1050 uniform->i[dim] = size;
1051 }
1052 }
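/* Worked example for the cube-array case above: a cube-map array holding two
 * complete cubes is stored internally as 12 2D images, so array_size is 12
 * and the sysval reports 12 / 6 = 2, matching the layer count the API
 * expects textureSize() to return for a cube array.
 */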
1053
1054 static void
1055 panfrost_upload_image_size_sysval(struct panfrost_batch *batch,
1056 enum pipe_shader_type st,
1057 unsigned int sysvalid,
1058 struct sysval_uniform *uniform)
1059 {
1060 struct panfrost_context *ctx = batch->ctx;
1061 unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1062 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1063 unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1064
1065 assert(dim && dim < 4);
1066
1067 struct pipe_image_view *image = &ctx->images[st][idx];
1068
1069 if (image->resource->target == PIPE_BUFFER) {
1070 unsigned blocksize = util_format_get_blocksize(image->format);
1071 uniform->i[0] = image->resource->width0 / blocksize;
1072 return;
1073 }
1074
1075 uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level);
1076
1077 if (dim > 1)
1078 uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level);
1079
1080 if (dim > 2)
1081 uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level);
1082
1083 if (is_array)
1084 uniform->i[dim] = image->resource->array_size;
1085 }
1086
1087 static void
1088 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1089 enum pipe_shader_type st, unsigned ssbo_id,
1090 struct sysval_uniform *uniform)
1091 {
1092 struct panfrost_context *ctx = batch->ctx;
1093
1094 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1095 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1096
1097 /* Compute address */
1098 struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1099 struct panfrost_bo *bo = rsrc->bo;
1100
1101 panfrost_batch_write_rsrc(batch, rsrc, st);
1102
1103 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1104 sb.buffer_size);
1105
1106 /* Upload address and size as sysval */
1107 uniform->du[0] = bo->ptr.gpu + sb.buffer_offset;
1108 uniform->u[2] = sb.buffer_size;
1109 }
1110
1111 static void
1112 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1113 enum pipe_shader_type st, unsigned samp_idx,
1114 struct sysval_uniform *uniform)
1115 {
1116 struct panfrost_context *ctx = batch->ctx;
1117 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1118
1119 uniform->f[0] = sampl->min_lod;
1120 uniform->f[1] = sampl->max_lod;
1121 uniform->f[2] = sampl->lod_bias;
1122
1123 /* Even without any errata, Midgard represents "no mipmapping" as
1124 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1125 * panfrost_create_sampler_state which also explains our choice of
1126 * epsilon value (again to keep behaviour consistent) */
1127
1128 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1129 uniform->f[1] = uniform->f[0] + (1.0 / 256.0);
1130 }
1131
1132 static void
1133 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1134 struct sysval_uniform *uniform)
1135 {
1136 struct panfrost_context *ctx = batch->ctx;
1137
1138 uniform->u[0] = ctx->compute_grid->grid[0];
1139 uniform->u[1] = ctx->compute_grid->grid[1];
1140 uniform->u[2] = ctx->compute_grid->grid[2];
1141 }
1142
1143 static void
1144 panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch,
1145 struct sysval_uniform *uniform)
1146 {
1147 struct panfrost_context *ctx = batch->ctx;
1148
1149 uniform->u[0] = ctx->compute_grid->block[0];
1150 uniform->u[1] = ctx->compute_grid->block[1];
1151 uniform->u[2] = ctx->compute_grid->block[2];
1152 }
1153
1154 static void
1155 panfrost_upload_work_dim_sysval(struct panfrost_batch *batch,
1156 struct sysval_uniform *uniform)
1157 {
1158 struct panfrost_context *ctx = batch->ctx;
1159
1160 uniform->u[0] = ctx->compute_grid->work_dim;
1161 }
1162
1163 /* Sample positions are pushed in a Bifrost-specific format on Bifrost. On
1164 * Midgard, we emulate the Bifrost path with some extra arithmetic in the
1165 * shader, to keep the code as unified as possible. */
1166
1167 static void
1168 panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch,
1169 struct sysval_uniform *uniform)
1170 {
1171 struct panfrost_context *ctx = batch->ctx;
1172 struct panfrost_device *dev = pan_device(ctx->base.screen);
1173
1174 unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1175 uniform->du[0] =
1176 dev->sample_positions->ptr.gpu +
1177 panfrost_sample_positions_offset(panfrost_sample_pattern(samples));
1178 }
1179
1180 static void
1181 panfrost_upload_multisampled_sysval(struct panfrost_batch *batch,
1182 struct sysval_uniform *uniform)
1183 {
1184 unsigned samples = util_framebuffer_get_num_samples(&batch->key);
1185 uniform->u[0] = (samples > 1) ? ~0 : 0;
1186 }
1187
1188 #if PAN_ARCH >= 6
1189 static void
1190 panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch,
1191 unsigned size_and_rt,
1192 struct sysval_uniform *uniform)
1193 {
1194 unsigned rt = size_and_rt & 0xF;
1195 unsigned size = size_and_rt >> 4;
1196
1197 if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) {
1198 enum pipe_format format = batch->key.cbufs[rt]->format;
1199 uniform->u[0] =
1200 GENX(pan_blend_get_internal_desc)(format, rt, size, false) >> 32;
1201 } else {
1202 pan_cast_and_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg)
1203 cfg.memory_format =
1204 GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_NONE)->hw;
1205 }
1206 }
1207 #endif
1208
1209 static unsigned
1210 panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target)
1211 {
1212 return target->buffer_offset + (pan_so_target(target)->offset * stride);
1213 }
1214
1215 static void
1216 panfrost_upload_sysvals(struct panfrost_batch *batch, void *ptr_cpu,
1217 uint64_t ptr_gpu, struct panfrost_compiled_shader *ss,
1218 enum pipe_shader_type st)
1219 {
1220 struct sysval_uniform *uniforms = ptr_cpu;
1221
1222 for (unsigned i = 0; i < ss->sysvals.sysval_count; ++i) {
1223 int sysval = ss->sysvals.sysvals[i];
1224
1225 switch (PAN_SYSVAL_TYPE(sysval)) {
1226 case PAN_SYSVAL_VIEWPORT_SCALE:
1227 panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]);
1228 break;
1229 case PAN_SYSVAL_VIEWPORT_OFFSET:
1230 panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]);
1231 break;
1232 case PAN_SYSVAL_TEXTURE_SIZE:
1233 panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1234 &uniforms[i]);
1235 break;
1236 case PAN_SYSVAL_SSBO:
1237 panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1238 &uniforms[i]);
1239 break;
1240
1241 case PAN_SYSVAL_XFB: {
1242 unsigned buf = PAN_SYSVAL_ID(sysval);
1243 struct panfrost_compiled_shader *vs =
1244 batch->ctx->prog[PIPE_SHADER_VERTEX];
1245 struct pipe_stream_output_info *so = &vs->stream_output;
1246 unsigned stride = so->stride[buf] * 4;
1247
1248 struct pipe_stream_output_target *target = NULL;
1249 if (buf < batch->ctx->streamout.num_targets)
1250 target = batch->ctx->streamout.targets[buf];
1251
1252 if (!target) {
1253 /* Memory sink */
1254 uniforms[i].du[0] = 0x8ull << 60;
1255 break;
1256 }
1257
1258 struct panfrost_resource *rsrc = pan_resource(target->buffer);
1259 unsigned offset = panfrost_xfb_offset(stride, target);
1260
1261 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
1262 target->buffer_size - offset);
1263
1264 panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
1265
1266 uniforms[i].du[0] = rsrc->image.data.base + offset;
1267 break;
1268 }
1269
1270 case PAN_SYSVAL_NUM_VERTICES:
1271 uniforms[i].u[0] = batch->ctx->vertex_count;
1272 break;
1273
1274 case PAN_SYSVAL_NUM_WORK_GROUPS:
1275 for (unsigned j = 0; j < 3; j++) {
1276 batch->num_wg_sysval[j] =
1277 ptr_gpu + (i * sizeof(*uniforms)) + (j * 4);
1278 }
1279 panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]);
1280 break;
1281 case PAN_SYSVAL_LOCAL_GROUP_SIZE:
1282 panfrost_upload_local_group_size_sysval(batch, &uniforms[i]);
1283 break;
1284 case PAN_SYSVAL_WORK_DIM:
1285 panfrost_upload_work_dim_sysval(batch, &uniforms[i]);
1286 break;
1287 case PAN_SYSVAL_SAMPLER:
1288 panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1289 &uniforms[i]);
1290 break;
1291 case PAN_SYSVAL_IMAGE_SIZE:
1292 panfrost_upload_image_size_sysval(batch, st, PAN_SYSVAL_ID(sysval),
1293 &uniforms[i]);
1294 break;
1295 case PAN_SYSVAL_SAMPLE_POSITIONS:
1296 panfrost_upload_sample_positions_sysval(batch, &uniforms[i]);
1297 break;
1298 case PAN_SYSVAL_MULTISAMPLED:
1299 panfrost_upload_multisampled_sysval(batch, &uniforms[i]);
1300 break;
1301 #if PAN_ARCH >= 6
1302 case PAN_SYSVAL_RT_CONVERSION:
1303 panfrost_upload_rt_conversion_sysval(batch, PAN_SYSVAL_ID(sysval),
1304 &uniforms[i]);
1305 break;
1306 #endif
1307 case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
1308 uniforms[i].u[0] = batch->ctx->offset_start;
1309 uniforms[i].u[1] = batch->ctx->base_vertex;
1310 uniforms[i].u[2] = batch->ctx->base_instance;
1311 break;
1312 case PAN_SYSVAL_DRAWID:
1313 uniforms[i].u[0] = batch->ctx->drawid;
1314 break;
1315 default:
1316 assert(0);
1317 }
1318 }
1319 }
1320
1321 static const void *
1322 panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
1323 struct panfrost_constant_buffer *buf,
1324 unsigned index)
1325 {
1326 struct pipe_constant_buffer *cb = &buf->cb[index];
1327 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1328
1329 if (rsrc) {
1330 if (panfrost_bo_mmap(rsrc->bo))
1331 return NULL;
1332
1333 panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
1334 panfrost_bo_wait(rsrc->bo, INT64_MAX, false);
1335
1336 return rsrc->bo->ptr.cpu + cb->buffer_offset;
1337 } else if (cb->user_buffer) {
1338 return cb->user_buffer + cb->buffer_offset;
1339 } else
1340 unreachable("No constant buffer");
1341 }
1342
1343 /* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
1344 * implemented with buffer descriptors in the resource table, sized in terms of
1345 * bytes. On Bifrost and older, UBOs have a special uniform buffer data
1346 * structure, sized in terms of entries.
1347 */
1348 static void
1349 panfrost_emit_ubo(void *base, unsigned index, uint64_t address, size_t size)
1350 {
1351 #if PAN_ARCH >= 9
1352 struct mali_buffer_packed *out = base;
1353
1354 pan_pack(out + index, BUFFER, cfg) {
1355 cfg.size = size;
1356 cfg.address = address;
1357 }
1358 #else
1359 struct mali_uniform_buffer_packed *out = base;
1360
1361 /* Issue (57) for the ARB_uniform_buffer_object spec says that
1362 * the buffer can be larger than the uniform data inside it,
1363 * so clamp ubo size to what hardware supports. */
1364
1365 pan_pack(out + index, UNIFORM_BUFFER, cfg) {
1366 cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
1367 cfg.pointer = address;
1368 }
1369 #endif
1370 }
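/* Worked example: a 100-byte UBO is described on Valhall as a plain buffer
 * with size = 100, while on Bifrost and older it becomes a uniform buffer
 * with entries = DIV_ROUND_UP(100, 16) = 7 sixteen-byte entries, capped at
 * the 4096-entry (64 KiB) hardware maximum.
 */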
1371
1372 #if PAN_ARCH >= 9
1373 static uint64_t
1374 panfrost_emit_ssbos(struct panfrost_batch *batch, enum pipe_shader_type st)
1375 {
1376 struct panfrost_context *ctx = batch->ctx;
1377 unsigned ssbo_count = util_last_bit(ctx->ssbo_mask[st]);
1378
1379 if (!ssbo_count)
1380 return 0;
1381
1382 struct panfrost_ptr ssbos =
1383 pan_pool_alloc_desc_array(&batch->pool.base, ssbo_count, BUFFER);
1384 struct mali_buffer_packed *bufs = ssbos.cpu;
1385
1386 memset(bufs, 0, sizeof(bufs[0]) * ssbo_count);
1387
1388 u_foreach_bit(ssbo_id, ctx->ssbo_mask[st]) {
1389 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1390 struct panfrost_resource *rsrc = pan_resource(sb.buffer);
1391 struct panfrost_bo *bo = rsrc->bo;
1392
1393 panfrost_batch_write_rsrc(batch, rsrc, st);
1394
1395 util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset,
1396 sb.buffer_size);
1397 pan_pack(&bufs[ssbo_id], BUFFER, cfg) {
1398 cfg.size = sb.buffer_size;
1399 cfg.address = bo->ptr.gpu + sb.buffer_offset;
1400 }
1401 }
1402
1403 return ssbos.gpu;
1404 }
1405 #endif
1406
1407 static uint64_t
1408 panfrost_emit_const_buf(struct panfrost_batch *batch,
1409 enum pipe_shader_type stage, unsigned *buffer_count,
1410 uint64_t *push_constants, unsigned *pushed_words)
1411 {
1412 struct panfrost_context *ctx = batch->ctx;
1413 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1414 struct panfrost_compiled_shader *ss = ctx->prog[stage];
1415
1416 if (!ss)
1417 return 0;
1418
1419 /* Allocate room for the sysval and the uniforms */
1420 size_t sys_size = sizeof(float) * 4 * ss->sysvals.sysval_count;
1421 struct panfrost_ptr transfer =
1422 pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);
1423
1424 if (!transfer.cpu)
1425 return 0;
1426
1427 /* Upload sysvals requested by the shader */
1428 uint8_t *sysvals = alloca(sys_size);
1429 panfrost_upload_sysvals(batch, sysvals, transfer.gpu, ss, stage);
1430 memcpy(transfer.cpu, sysvals, sys_size);
1431
1432 /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
1433 struct panfrost_compiled_shader *shader = ctx->prog[stage];
1434 unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
1435 unsigned sysval_ubo = sys_size ? ubo_count : ~0;
1436 unsigned desc_size;
1437 struct panfrost_ptr ubos = {0};
1438
1439 #if PAN_ARCH >= 9
1440 desc_size = sizeof(struct mali_buffer_packed);
1441 ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER);
1442 #else
1443 desc_size = sizeof(struct mali_uniform_buffer_packed);
1444 ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1,
1445 UNIFORM_BUFFER);
1446 #endif
1447
1448 if (!ubos.cpu)
1449 return 0;
1450
1451 memset(ubos.cpu, 0, desc_size * (ubo_count + 1));
1452
1453 if (buffer_count)
1454 *buffer_count = ubo_count + (sys_size ? 1 : 0);
1455
1456 /* Upload sysval as a final UBO */
1457
1458 if (sys_size)
1459 panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size);
1460
1461 /* The rest are honest-to-goodness UBOs */
1462
1463 u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) {
1464 size_t usz = buf->cb[ubo].buffer_size;
1465 uint64_t address = 0;
1466
1467 if (usz > 0) {
1468 address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo);
1469 }
1470
1471 panfrost_emit_ubo(ubos.cpu, ubo, address, usz);
1472 }
1473
1474 if (pushed_words)
1475 *pushed_words = ss->info.push.count;
1476
1477 if (ss->info.push.count == 0)
1478 return ubos.gpu;
1479
1480 /* Copy push constants required by the shader */
1481 struct panfrost_ptr push_transfer =
1482 pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16);
1483
1484 if (!push_transfer.cpu)
1485 return 0;
1486
1487 uint32_t *push_cpu = (uint32_t *)push_transfer.cpu;
1488 *push_constants = push_transfer.gpu;
1489
1490 for (unsigned i = 0; i < ss->info.push.count; ++i) {
1491 struct panfrost_ubo_word src = ss->info.push.words[i];
1492
1493 if (src.ubo == sysval_ubo) {
1494 unsigned sysval_idx = src.offset / 16;
1495 unsigned sysval_comp = (src.offset % 16) / 4;
1496 unsigned sysval_type =
1497 PAN_SYSVAL_TYPE(ss->sysvals.sysvals[sysval_idx]);
1498 uint64_t ptr = push_transfer.gpu + (4 * i);
1499
1500 if (sysval_type == PAN_SYSVAL_NUM_WORK_GROUPS &&
1501 sysval_comp < ARRAY_SIZE(batch->num_wg_sysval))
1502 batch->num_wg_sysval[sysval_comp] = ptr;
1503 }
1504 /* Map the UBO, this should be cheap. For some buffers this may
1505 * read from write-combine memory which is slow, though :-(
1506 */
1507 const void *mapped_ubo =
1508 (src.ubo == sysval_ubo)
1509 ? sysvals
1510 : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo);
1511
1512 if (!mapped_ubo)
1513 return 0;
1514
1515 /* TODO: Is there any benefit to combining ranges */
1516 memcpy(push_cpu + i, (uint8_t *)mapped_ubo + src.offset, 4);
1517 }
1518
1519 return ubos.gpu;
1520 }
1521
1522 /*
1523 * Choose the number of WLS instances to allocate. This must be a power-of-two.
1524 * The number of WLS instances limits the number of concurrent tasks on a given
1525 * shader core, setting to the (rounded) total number of tasks avoids any
1526 * throttling. Smaller values save memory at the expense of possible throttling.
1527 *
1528 * With indirect dispatch, we don't know at launch-time how many tasks will be
1529 * needed, so we use a conservative value that's unlikely to cause slowdown in
1530 * practice without wasting too much memory.
1531 */
1532 static unsigned
1533 panfrost_choose_wls_instance_count(const struct pipe_grid_info *grid)
1534 {
1535 if (grid->indirect) {
1536 /* May need tuning in the future, conservative guess */
1537 return 128;
1538 } else {
1539 return util_next_power_of_two(grid->grid[0]) *
1540 util_next_power_of_two(grid->grid[1]) *
1541 util_next_power_of_two(grid->grid[2]);
1542 }
1543 }
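/* Worked example: a direct dispatch of grid = {3, 5, 1} rounds each dimension
 * up to a power of two (4, 8, 1) and allocates 32 WLS instances; being a
 * product of powers of two, the result is itself the required power of two.
 * An indirect dispatch always gets the conservative 128.
 */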
1544
1545 static uint64_t
1546 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1547 const struct pipe_grid_info *grid)
1548 {
1549 struct panfrost_context *ctx = batch->ctx;
1550 struct panfrost_device *dev = pan_device(ctx->base.screen);
1551 struct panfrost_compiled_shader *ss = ctx->prog[PIPE_SHADER_COMPUTE];
1552 struct panfrost_ptr t =
1553 pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
1554
1555 struct pan_tls_info info = {
1556 .tls.size = ss->info.tls_size,
1557 .wls.size = ss->info.wls_size + grid->variable_shared_mem,
1558 .wls.instances = panfrost_choose_wls_instance_count(grid),
1559 };
1560
1561 if (ss->info.tls_size) {
1562 struct panfrost_bo *bo = panfrost_batch_get_scratchpad(
1563 batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range);
1564 info.tls.ptr = bo->ptr.gpu;
1565 }
1566
1567 if (info.wls.size) {
1568 unsigned size = pan_wls_adjust_size(info.wls.size) * info.wls.instances *
1569 dev->core_id_range;
1570
1571 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1);
1572
1573 info.wls.ptr = bo->ptr.gpu;
1574 }
1575
1576 GENX(pan_emit_tls)(&info, t.cpu);
1577 return t.gpu;
1578 }
1579
1580 #if PAN_ARCH <= 5
1581 static uint64_t
1582 panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st,
1583 struct panfrost_sampler_view *view)
1584 {
1585 if (!view)
1586 return (uint64_t)0;
1587
1588 struct pipe_sampler_view *pview = &view->base;
1589 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1590
1591 panfrost_batch_read_rsrc(batch, rsrc, st);
1592 panfrost_batch_add_bo(batch, view->state.bo, st);
1593
1594 return view->state.gpu;
1595 }
1596 #endif
1597
1598 static void
1599 panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so,
1600 struct pipe_context *pctx,
1601 struct pipe_resource *texture)
1602 {
1603 struct panfrost_device *device = pan_device(pctx->screen);
1604 struct panfrost_context *ctx = pan_context(pctx);
1605 struct panfrost_resource *prsrc = (struct panfrost_resource *)texture;
1606 enum pipe_format format = so->base.format;
1607 assert(prsrc->bo);
1608
1609 /* Format to access the stencil/depth portion of a Z32_S8 texture */
1610 if (format == PIPE_FORMAT_X32_S8X24_UINT) {
1611 assert(prsrc->separate_stencil);
1612 texture = &prsrc->separate_stencil->base;
1613 prsrc = (struct panfrost_resource *)texture;
1614 format = texture->format;
1615 } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
1616 format = PIPE_FORMAT_Z32_FLOAT;
1617 }
1618
1619 so->texture_bo = prsrc->image.data.base;
1620 so->texture_size = prsrc->image.layout.data_size;
1621 so->modifier = prsrc->image.layout.modifier;
1622
1623 /* MSAA only supported for 2D textures */
1624
1625 assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D ||
1626 so->base.target == PIPE_TEXTURE_2D_ARRAY);
1627
1628 enum mali_texture_dimension type =
1629 panfrost_translate_texture_dimension(so->base.target);
1630
1631 bool is_buffer = (so->base.target == PIPE_BUFFER);
1632
1633 unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level;
1634 unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level;
1635 unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer;
1636 unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer;
1637 unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0;
1638 unsigned buf_size =
1639 (is_buffer ? so->base.u.buf.size : 0) / util_format_get_blocksize(format);
1640 buf_size = MIN2(buf_size, PAN_MAX_TEXEL_BUFFER_ELEMENTS);
1641
1642 if (so->base.target == PIPE_TEXTURE_3D) {
1643 first_layer /= prsrc->image.layout.depth;
1644 last_layer /= prsrc->image.layout.depth;
1645 assert(!first_layer && !last_layer);
1646 }
1647
1648 struct pan_image_view iview = {
1649 .format = format,
1650 .dim = type,
1651 .first_level = first_level,
1652 .last_level = last_level,
1653 .first_layer = first_layer,
1654 .last_layer = last_layer,
1655 .swizzle =
1656 {
1657 so->base.swizzle_r,
1658 so->base.swizzle_g,
1659 so->base.swizzle_b,
1660 so->base.swizzle_a,
1661 },
1662 .planes = {NULL},
1663 .buf.offset = buf_offset,
1664 .buf.size = buf_size,
1665 };
1666
1667 #if PAN_ARCH >= 7
1668 /* v7+ doesn't have an _RRRR component order. */
1669 if (util_format_is_depth_or_stencil(format))
1670 GENX(panfrost_texture_swizzle_replicate_x)(&iview);
1671 #endif
1672 #if PAN_ARCH == 7
1673 /* v7 requires AFBC reswizzle */
1674 if (!util_format_is_depth_or_stencil(format) &&
1675 !panfrost_format_is_yuv(format) &&
1676 panfrost_format_supports_afbc(PAN_ARCH, format))
1677 GENX(panfrost_texture_afbc_reswizzle)(&iview);
1678 #endif
1679
1680 panfrost_set_image_view_planes(&iview, texture);
1681
1682 unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) +
1683 GENX(panfrost_estimate_texture_payload_size)(&iview);
1684
1685 struct panfrost_pool *pool = so->pool ?: &ctx->descs;
1686 struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64);
1687
1688 if (!payload.cpu) {
1689 mesa_loge("panfrost_create_sampler_view_bo failed");
1690 return;
1691 }
1692
1693 so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu);
1694
1695 void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu;
1696
1697 if (PAN_ARCH <= 5) {
1698 payload.cpu += pan_size(TEXTURE);
1699 payload.gpu += pan_size(TEXTURE);
1700 }
1701
1702 const struct util_format_description *desc =
1703 util_format_description(format);
1704
1705 if ((device->debug & PAN_DBG_YUV) && panfrost_format_is_yuv(format)) {
1706
1707 if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
1708 iview.swizzle[2] = PIPE_SWIZZLE_1;
1709 } else if (desc->layout == UTIL_FORMAT_LAYOUT_PLANAR2) {
1710 iview.swizzle[1] = PIPE_SWIZZLE_0;
1711 iview.swizzle[2] = PIPE_SWIZZLE_0;
1712 }
1713 }
1714
1715 if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC &&
1716 so->base.astc_decode_format == PIPE_ASTC_DECODE_FORMAT_UNORM8) {
1717 iview.astc.narrow = true;
1718 }
1719
1720 GENX(panfrost_new_texture)(&iview, tex, &payload);
1721 }
1722
1723 static void
1724 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1725 struct pipe_context *pctx)
1726 {
1727 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1728 if (view->texture_bo != rsrc->image.data.base ||
1729 view->texture_size != rsrc->image.layout.data_size ||
1730 view->modifier != rsrc->image.layout.modifier) {
1731 panfrost_bo_unreference(view->state.bo);
1732 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1733 }
1734 }
1735
1736 #if PAN_ARCH >= 6
1737 static void
1738 panfrost_emit_null_texture(struct mali_texture_packed *out)
1739
1740 {
1741 /* Annoyingly, an all zero texture descriptor is not valid and will raise
1742 * a DATA_INVALID_FAULT if you try to texture it, instead of returning
1743 * 0000s! Fill in with something that will behave robustly.
1744 */
1745 pan_pack(out, TEXTURE, cfg) {
1746 cfg.dimension = MALI_TEXTURE_DIMENSION_2D;
1747 cfg.width = 1;
1748 cfg.height = 1;
1749 cfg.depth = 1;
1750 cfg.array_size = 1;
1751 cfg.format = MALI_PACK_FMT(CONSTANT, 0000, L);
1752 #if PAN_ARCH <= 7
1753 cfg.texel_ordering = MALI_TEXTURE_LAYOUT_LINEAR;
1754 #endif
1755 }
1756 }
1757 #endif
1758
1759 static uint64_t
1760 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1761 enum pipe_shader_type stage)
1762 {
1763 struct panfrost_context *ctx = batch->ctx;
1764
1765 unsigned actual_count = ctx->sampler_view_count[stage];
1766 unsigned needed_count = ctx->prog[stage]->info.texture_count;
1767 unsigned alloc_count = MAX2(actual_count, needed_count);
1768
1769 if (!alloc_count)
1770 return 0;
1771
1772 #if PAN_ARCH >= 6
1773 struct panfrost_ptr T =
1774 pan_pool_alloc_desc_array(&batch->pool.base, alloc_count, TEXTURE);
1775
1776 if (!T.cpu)
1777 return 0;
1778
1779 struct mali_texture_packed *out = (struct mali_texture_packed *)T.cpu;
1780
1781 for (int i = 0; i < actual_count; ++i) {
1782 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1783
1784 if (!view) {
1785 panfrost_emit_null_texture(&out[i]);
1786 continue;
1787 }
1788
1789 struct pipe_sampler_view *pview = &view->base;
1790 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1791
1792 panfrost_update_sampler_view(view, &ctx->base);
1793 out[i] = view->bifrost_descriptor;
1794
1795 panfrost_batch_read_rsrc(batch, rsrc, stage);
1796 panfrost_batch_add_bo(batch, view->state.bo, stage);
1797 }
1798
1799 for (int i = actual_count; i < needed_count; ++i)
1800 panfrost_emit_null_texture(&out[i]);
1801
1802 return T.gpu;
1803 #else
1804 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1805
1806 for (int i = 0; i < actual_count; ++i) {
1807 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1808
1809 if (!view) {
1810 trampolines[i] = 0;
1811 continue;
1812 }
1813
1814 panfrost_update_sampler_view(view, &ctx->base);
1815
1816 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1817 }
1818
1819 for (int i = actual_count; i < needed_count; ++i)
1820 trampolines[i] = 0;
1821
1822 return pan_pool_upload_aligned(&batch->pool.base, trampolines,
1823 sizeof(uint64_t) * alloc_count,
1824 sizeof(uint64_t));
1825 #endif
1826 }
1827
1828 static uint64_t
1829 panfrost_upload_wa_sampler(struct panfrost_batch *batch)
1830 {
1831 struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, SAMPLER);
1832 pan_cast_and_pack(T.cpu, SAMPLER, cfg)
1833 ;
1834 return T.gpu;
1835 }
1836
1837 static uint64_t
1838 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1839 enum pipe_shader_type stage)
1840 {
1841 struct panfrost_context *ctx = batch->ctx;
1842
1843 /* We always need at least 1 sampler for txf to work */
1844 if (!ctx->sampler_count[stage])
1845 return panfrost_upload_wa_sampler(batch);
1846
1847 struct panfrost_ptr T = pan_pool_alloc_desc_array(
1848 &batch->pool.base, ctx->sampler_count[stage], SAMPLER);
1849
1850 if (!T.cpu)
1851 return 0;
1852
1853 struct mali_sampler_packed *out = (struct mali_sampler_packed *)T.cpu;
1854
1855 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) {
1856 struct panfrost_sampler_state *st = ctx->samplers[stage][i];
1857
1858 out[i] = st ? st->hw : (struct mali_sampler_packed){0};
1859 }
1860
1861 return T.gpu;
1862 }
1863
1864 #if PAN_ARCH <= 7
1865 /* Packs all image attribute descs and attribute buffer descs.
1866 * `first_image_buf_index` must be the index of the first image attribute buffer
1867 * descriptor.
1868 */
1869 static void
1870 emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader,
1871 struct mali_attribute_packed *attribs, unsigned first_buf)
1872 {
1873 unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1874
1875 for (unsigned i = 0; i < last_bit; ++i) {
1876 enum pipe_format format = ctx->images[shader][i].format;
1877
1878 pan_pack(attribs + i, ATTRIBUTE, cfg) {
1879 /* Continuation record means 2 buffers per image */
1880 cfg.buffer_index = first_buf + (i * 2);
1881 cfg.offset_enable = (PAN_ARCH <= 5);
1882 cfg.format = GENX(panfrost_format_from_pipe_format)(format)->hw;
1883 }
1884 }
1885 }
1886
1887 static enum mali_attribute_type
1888 pan_modifier_to_attr_type(uint64_t modifier)
1889 {
1890 switch (modifier) {
1891 case DRM_FORMAT_MOD_LINEAR:
1892 return MALI_ATTRIBUTE_TYPE_3D_LINEAR;
1893 case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED:
1894 return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED;
1895 default:
1896 unreachable("Invalid modifier for attribute record");
1897 }
1898 }
1899
1900 static void
1901 emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader,
1902 struct mali_attribute_buffer_packed *bufs,
1903 unsigned first_image_buf_index)
1904 {
1905 struct panfrost_context *ctx = batch->ctx;
1906 unsigned last_bit = util_last_bit(ctx->image_mask[shader]);
1907
1908 for (unsigned i = 0; i < last_bit; ++i) {
1909 struct pipe_image_view *image = &ctx->images[shader][i];
1910
1911 if (!(ctx->image_mask[shader] & (1 << i)) ||
1912 !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) {
1913 /* Unused image bindings */
1914 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg)
1915 ;
1916 pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg)
1917 ;
1918 continue;
1919 }
1920
1921 struct panfrost_resource *rsrc = pan_resource(image->resource);
1922
1923 bool is_msaa = image->resource->nr_samples > 1;
1924
1925 bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D;
1926 bool is_buffer = rsrc->base.target == PIPE_BUFFER;
1927
1928 unsigned offset = is_buffer ? image->u.buf.offset
1929 : panfrost_texture_offset(
1930 &rsrc->image.layout, image->u.tex.level,
1931 (is_3d || is_msaa) ? 0 : image->u.tex.first_layer,
1932 (is_3d || is_msaa) ? image->u.tex.first_layer : 0);
1933
1934 panfrost_track_image_access(batch, shader, image);
1935
1936 pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) {
1937 cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier);
1938 cfg.pointer = rsrc->image.data.base + offset;
1939 cfg.stride = util_format_get_blocksize(image->format);
1940 cfg.size = panfrost_bo_size(rsrc->bo) - offset;
1941 }
1942
1943 if (is_buffer) {
1944 pan_cast_and_pack(&bufs[(i * 2) + 1], ATTRIBUTE_BUFFER_CONTINUATION_3D,
1945 cfg) {
1946 cfg.s_dimension =
1947 rsrc->base.width0 / util_format_get_blocksize(image->format);
1948 cfg.t_dimension = cfg.r_dimension = 1;
1949 }
1950
1951 continue;
1952 }
1953
1954 pan_cast_and_pack(&bufs[(i * 2) + 1], ATTRIBUTE_BUFFER_CONTINUATION_3D,
1955 cfg) {
1956 unsigned level = image->u.tex.level;
1957 unsigned samples = rsrc->image.layout.nr_samples;
1958
1959 cfg.s_dimension = u_minify(rsrc->base.width0, level);
1960 cfg.t_dimension = u_minify(rsrc->base.height0, level);
1961 cfg.r_dimension = is_3d ? u_minify(rsrc->image.layout.depth, level)
1962 : (image->u.tex.last_layer - image->u.tex.first_layer + 1);
1963
1964 cfg.row_stride = rsrc->image.layout.slices[level].row_stride;
1965 if (cfg.r_dimension > 1) {
1966 cfg.slice_stride =
1967 panfrost_get_layer_stride(&rsrc->image.layout, level);
1968 }
1969
1970 if (is_msaa) {
1971 if (cfg.r_dimension == 1) {
1972 /* regular multisampled images get the sample index in
1973 the R dimension */
1974 cfg.r_dimension = samples;
1975 cfg.slice_stride =
1976 panfrost_get_layer_stride(&rsrc->image.layout, level) / samples;
1977 } else {
1978 /* multisampled image arrays are emulated by making the
1979 image "samples" times higher than the original image,
1980 and fixing up the T coordinate by the sample number
1981 to address the correct sample (on bifrost) */
1982 cfg.t_dimension *= samples;
1983 }
1984 }
1985 }
1986 }
1987 }
1988
1989 static uint64_t
1990 panfrost_emit_image_attribs(struct panfrost_batch *batch, uint64_t *buffers,
1991 enum pipe_shader_type type)
1992 {
1993 struct panfrost_context *ctx = batch->ctx;
1994 struct panfrost_compiled_shader *shader = ctx->prog[type];
1995
1996 if (!shader->info.attribute_count) {
1997 *buffers = 0;
1998 return 0;
1999 }
2000
2001 /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
2002 unsigned attr_count = shader->info.attribute_count;
2003 unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);
2004
2005 struct panfrost_ptr bufs =
2006 pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);
2007
2008 struct panfrost_ptr attribs =
2009 pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);
2010
2011 emit_image_attribs(ctx, type, attribs.cpu, 0);
2012 emit_image_bufs(batch, type, bufs.cpu, 0);
2013
2014 /* We need an empty attrib buf to stop the prefetching on Bifrost */
2015 #if PAN_ARCH >= 6
2016 struct mali_attribute_buffer_packed *attrib_bufs = bufs.cpu;
2017
2018 pan_pack(&attrib_bufs[buf_count - 1], ATTRIBUTE_BUFFER, cfg)
2019 ;
2020 #endif
2021
2022 *buffers = bufs.gpu;
2023 return attribs.gpu;
2024 }
2025
2026 static uint64_t
2027 panfrost_emit_vertex_data(struct panfrost_batch *batch, uint64_t *buffers)
2028 {
2029 struct panfrost_context *ctx = batch->ctx;
2030 struct panfrost_vertex_state *so = ctx->vertex;
2031 struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2032 bool instanced = ctx->instance_count > 1;
2033 uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
2034 unsigned nr_images = util_last_bit(image_mask);
2035
2036 /* Worst case: everything is NPOT, which is only possible if instancing
2037 * is enabled. Otherwise a single record is guaranteed.
2038 * Also, we allocate more memory than what's needed here if either instancing
2039 * is enabled or images are present; this could be improved. */
2040 unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
2041 unsigned nr_bufs =
2042 ((so->nr_bufs + nr_images) * bufs_per_attrib) + (PAN_ARCH >= 6 ? 1 : 0);
2043
2044 unsigned count = vs->info.attribute_count;
2045
2046 struct panfrost_compiled_shader *xfb =
2047 ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb;
2048
2049 if (xfb)
2050 count = MAX2(count, xfb->info.attribute_count);
2051
2052 #if PAN_ARCH <= 5
2053 /* Midgard needs vertexid/instanceid handled specially */
2054 bool special_vbufs = count >= PAN_VERTEX_ID;
2055
2056 if (special_vbufs)
2057 nr_bufs += 2;
2058 #endif
2059
2060 if (!nr_bufs) {
2061 *buffers = 0;
2062 return 0;
2063 }
2064
2065 struct panfrost_ptr S =
2066 pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER);
2067 struct panfrost_ptr T =
2068 pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE);
2069
2070 struct mali_attribute_buffer_packed *bufs =
2071 (struct mali_attribute_buffer_packed *)S.cpu;
2072
2073 struct mali_attribute_packed *out = (struct mali_attribute_packed *)T.cpu;
2074
2075 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = {0};
2076 unsigned k = 0;
2077
2078 for (unsigned i = 0; i < so->nr_bufs; ++i) {
2079 unsigned vbi = so->buffers[i].vbi;
2080 unsigned divisor = so->buffers[i].divisor;
2081 attrib_to_buffer[i] = k;
2082
2083 if (!(ctx->vb_mask & (1 << vbi)))
2084 continue;
2085
2086 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2087 struct panfrost_resource *rsrc;
2088
2089 rsrc = pan_resource(buf->buffer.resource);
2090 if (!rsrc)
2091 continue;
2092
2093 panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);
2094
2095 /* Mask off lower bits, see offset fixup below */
2096 uint64_t raw_addr = rsrc->image.data.base + buf->buffer_offset;
2097 uint64_t addr = raw_addr & ~63;
2098
2099 /* Since we advanced the base pointer, we shrink the buffer
2100 * size, but add the offset we subtracted */
2101 unsigned size =
2102 rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset;
2103
2104 /* When there is a divisor, the hardware-level divisor is
2105 * the product of the instance divisor and the padded count */
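/* Worked example (illustrative): with an instance divisor of 3 and a
 * padded_count of 4, the hardware divisor is 12, which is not a power of
 * two and therefore takes the NPOT divisor path below.
 */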
2106 unsigned stride = so->strides[vbi];
2107 unsigned hw_divisor = ctx->padded_count * divisor;
2108
2109 if (ctx->instance_count <= 1) {
2110 /* With a single instance, per-instance data is the same for every vertex */
2111 if (divisor)
2112 stride = 0;
2113
2114 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2115 cfg.pointer = addr;
2116 cfg.stride = stride;
2117 cfg.size = size;
2118 }
2119 } else if (!divisor) {
2120 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2121 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
2122 cfg.pointer = addr;
2123 cfg.stride = stride;
2124 cfg.size = size;
2125 cfg.divisor = ctx->padded_count;
2126 }
2127 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
2128 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2129 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
2130 cfg.pointer = addr;
2131 cfg.stride = stride;
2132 cfg.size = size;
2133 cfg.divisor_r = __builtin_ctz(hw_divisor);
2134 }
2135
2136 } else {
2137 unsigned shift = 0, extra_flags = 0;
2138
2139 unsigned magic_divisor =
2140 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
2141
2142 /* Records with continuations must be aligned */
2143 k = ALIGN_POT(k, 2);
2144 attrib_to_buffer[i] = k;
2145
2146 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
2147 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
2148 cfg.pointer = addr;
2149 cfg.stride = stride;
2150 cfg.size = size;
2151
2152 cfg.divisor_r = shift;
2153 cfg.divisor_e = extra_flags;
2154 }
2155
2156 pan_cast_and_pack(&bufs[k + 1], ATTRIBUTE_BUFFER_CONTINUATION_NPOT,
2157 cfg) {
2158 cfg.divisor_numerator = magic_divisor;
2159 cfg.divisor = divisor;
2160 }
2161
2162 ++k;
2163 }
2164
2165 ++k;
2166 }
2167
2168 #if PAN_ARCH <= 5
2169 /* Add special gl_VertexID/gl_InstanceID buffers */
2170 if (special_vbufs) {
2171 panfrost_vertex_id(ctx->padded_count,
2172 (struct mali_attribute_vertex_id_packed *)&bufs[k],
2173 ctx->instance_count > 1);
2174
2175 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
2176 cfg.buffer_index = k++;
2177 cfg.format = so->formats[PAN_VERTEX_ID];
2178 }
2179
2180 panfrost_instance_id(ctx->padded_count,
2181 (struct mali_attribute_instance_id_packed *)&bufs[k],
2182 ctx->instance_count > 1);
2183
2184 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
2185 cfg.buffer_index = k++;
2186 cfg.format = so->formats[PAN_INSTANCE_ID];
2187 }
2188 }
2189 #endif
2190
2191 if (nr_images) {
2192 k = ALIGN_POT(k, 2);
2193 emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k);
2194 emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k);
2195 k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2);
2196 }
2197
2198 #if PAN_ARCH >= 6
2199 /* We need an empty attrib buf to stop the prefetching on Bifrost */
2200 pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg)
2201 ;
2202 #endif
2203
2204 /* Attribute addresses require 64-byte alignment, so let:
2205 *
2206 * base' = base & ~63 = base - (base & 63)
2207 * offset' = offset + (base & 63)
2208 *
2209 * Since base' + offset' = base + offset, these are equivalent
2210 * addressing modes and now base is 64 aligned.
2211 */
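/* Worked example (illustrative): base = 0x10000027 gives base' = 0x10000000,
 * and the low 0x27 bytes are carried into the attribute offset, so
 * base' + offset' still addresses the same data.
 */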
2212
2213 /* While these are usually equal, they are not required to be. In some
2214 * cases, u_blitter passes too high a value for num_elements.
2215 */
2216 assert(vs->info.attributes_read_count <= so->num_elements);
2217
2218 for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) {
2219 unsigned vbi = so->pipe[i].vertex_buffer_index;
2220 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
2221
2222 /* BOs are aligned; just fixup for buffer_offset */
2223 signed src_offset = so->pipe[i].src_offset;
2224 src_offset += (buf->buffer_offset & 63);
2225
2226 /* Base instance offset */
2227 if (ctx->base_instance && so->pipe[i].instance_divisor) {
2228 src_offset += (ctx->base_instance * so->pipe[i].src_stride) /
2229 so->pipe[i].instance_divisor;
2230 }
2231
2232 /* Also, somewhat obscurely, per-instance data needs to be
2233 * offset in response to a delayed start in an indexed draw */
2234
2235 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
2236 src_offset -= so->pipe[i].src_stride * ctx->offset_start;
2237
2238 pan_pack(out + i, ATTRIBUTE, cfg) {
2239 cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]];
2240 cfg.format = so->formats[i];
2241 cfg.offset = src_offset;
2242 }
2243 }
2244
2245 *buffers = S.gpu;
2246 return T.gpu;
2247 }
2248
2249 static uint64_t
2250 panfrost_emit_varyings(struct panfrost_batch *batch,
2251 struct mali_attribute_buffer_packed *slot,
2252 unsigned stride, unsigned count)
2253 {
2254 unsigned size = stride * count;
2255 uint64_t ptr =
2256 pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu;
2257
2258 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
2259 cfg.stride = stride;
2260 cfg.size = size;
2261 cfg.pointer = ptr;
2262 }
2263
2264 return ptr;
2265 }
2266
2267 /* Given a varying, figure out which index it corresponds to */
2268
2269 static inline unsigned
2270 pan_varying_index(unsigned present, enum pan_special_varying v)
2271 {
2272 return util_bitcount(present & BITFIELD_MASK(v));
2273 }
2274
2275 /* Determines which varying buffers are required */
2276
2277 static inline unsigned
2278 pan_varying_present(const struct panfrost_device *dev,
2279 struct pan_shader_info *producer,
2280 struct pan_shader_info *consumer, uint16_t point_coord_mask)
2281 {
2282 /* At the moment we always emit general and position buffers. Not
2283 * strictly necessary but usually harmless */
2284
2285 unsigned present =
2286 BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION);
2287
2288 /* Enable special buffers by the shader info */
2289
2290 if (producer->vs.writes_point_size)
2291 present |= BITFIELD_BIT(PAN_VARY_PSIZ);
2292
2293 #if PAN_ARCH <= 5
2294 /* On Midgard, these exist as real varyings. Later architectures use
2295 * LD_VAR_SPECIAL reads instead. */
2296
2297 if (consumer->fs.reads_point_coord)
2298 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2299
2300 if (consumer->fs.reads_face)
2301 present |= BITFIELD_BIT(PAN_VARY_FACE);
2302
2303 if (consumer->fs.reads_frag_coord)
2304 present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD);
2305
2306 /* Also, if we have a point sprite, we need a point coord buffer */
2307
2308 for (unsigned i = 0; i < consumer->varyings.input_count; i++) {
2309 gl_varying_slot loc = consumer->varyings.input[i].location;
2310
2311 if (util_varying_is_point_coord(loc, point_coord_mask))
2312 present |= BITFIELD_BIT(PAN_VARY_PNTCOORD);
2313 }
2314 #endif
2315
2316 return present;
2317 }
2318
2319 /* Emitters for varying records */
2320
2321 static void
2322 pan_emit_vary(const struct panfrost_device *dev,
2323 struct mali_attribute_packed *out, unsigned buffer_index,
2324 mali_pixel_format format, unsigned offset)
2325 {
2326 pan_pack(out, ATTRIBUTE, cfg) {
2327 cfg.buffer_index = buffer_index;
2328 cfg.offset_enable = (PAN_ARCH <= 5);
2329 cfg.format = format;
2330 cfg.offset = offset;
2331 }
2332 }
2333
2334 /* Special records */
2335
2336 /* clang-format off */
2337 static const struct {
2338 unsigned components;
2339 enum mali_format format;
2340 } pan_varying_formats[PAN_VARY_MAX] = {
2341 [PAN_VARY_POSITION] = { 4, MALI_SNAP_4 },
2342 [PAN_VARY_PSIZ] = { 1, MALI_R16F },
2343 [PAN_VARY_PNTCOORD] = { 4, MALI_RGBA32F },
2344 [PAN_VARY_FACE] = { 1, MALI_R32I },
2345 [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F },
2346 };
2347 /* clang-format on */
2348
2349 static mali_pixel_format
2350 pan_special_format(const struct panfrost_device *dev,
2351 enum pan_special_varying buf)
2352 {
2353 assert(buf < PAN_VARY_MAX);
2354 mali_pixel_format format = (pan_varying_formats[buf].format << 12);
2355
2356 #if PAN_ARCH <= 6
2357 unsigned nr = pan_varying_formats[buf].components;
2358 format |= panfrost_get_default_swizzle(nr);
2359 #endif
2360
2361 return format;
2362 }
2363
2364 static void
2365 pan_emit_vary_special(const struct panfrost_device *dev,
2366 struct mali_attribute_packed *out, unsigned present,
2367 enum pan_special_varying buf)
2368 {
2369 pan_emit_vary(dev, out, pan_varying_index(present, buf),
2370 pan_special_format(dev, buf), 0);
2371 }
2372
2373 /* Negative indicates a varying is not found */
2374
2375 static signed
2376 pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count,
2377 unsigned loc)
2378 {
2379 for (unsigned i = 0; i < vary_count; ++i) {
2380 if (vary[i].location == loc)
2381 return i;
2382 }
2383
2384 return -1;
2385 }
2386
2387 /* Assign varying locations for the general buffer. Returns the calculated
2388 * per-vertex stride, and outputs offsets into the passed array. Negative
2389 * offset indicates a varying is not used. */
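/* For instance (illustrative): two fp32 vec4 varyings read by the consumer
 * get offsets 0 and 16 and yield a per-vertex stride of 32 bytes; a varying
 * the consumer never reads gets offset -1 and adds nothing to the stride.
 */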
2390
2391 static unsigned
2392 pan_assign_varyings(const struct panfrost_device *dev,
2393 struct pan_shader_info *producer,
2394 struct pan_shader_info *consumer, signed *offsets)
2395 {
2396 unsigned producer_count = producer->varyings.output_count;
2397 unsigned consumer_count = consumer->varyings.input_count;
2398
2399 const struct pan_shader_varying *producer_vars = producer->varyings.output;
2400 const struct pan_shader_varying *consumer_vars = consumer->varyings.input;
2401
2402 unsigned stride = 0;
2403
2404 for (unsigned i = 0; i < producer_count; ++i) {
2405 signed loc = pan_find_vary(consumer_vars, consumer_count,
2406 producer_vars[i].location);
2407 enum pipe_format format =
2408 loc >= 0 ? consumer_vars[loc].format : PIPE_FORMAT_NONE;
2409
2410 if (format != PIPE_FORMAT_NONE) {
2411 offsets[i] = stride;
2412 stride += util_format_get_blocksize(format);
2413 } else {
2414 offsets[i] = -1;
2415 }
2416 }
2417
2418 return stride;
2419 }
2420
2421 /* Emitter for a single varying (attribute) descriptor */
2422
2423 static void
2424 panfrost_emit_varying(const struct panfrost_device *dev,
2425 struct mali_attribute_packed *out,
2426 const struct pan_shader_varying varying,
2427 enum pipe_format pipe_format, unsigned present,
2428 uint16_t point_sprite_mask, signed offset,
2429 enum pan_special_varying pos_varying)
2430 {
2431 /* Note: varying.format != pipe_format in some obscure cases due to a
2432 * limitation of the NIR linker. This should be fixed in the future to
2433 * eliminate the additional lookups. See:
2434 * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex
2435 */
2436 gl_varying_slot loc = varying.location;
2437 mali_pixel_format format =
2438 GENX(panfrost_format_from_pipe_format)(pipe_format)->hw;
2439
2440 if (util_varying_is_point_coord(loc, point_sprite_mask)) {
2441 pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD);
2442 } else if (loc == VARYING_SLOT_POS) {
2443 pan_emit_vary_special(dev, out, present, pos_varying);
2444 } else if (loc == VARYING_SLOT_PSIZ) {
2445 pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ);
2446 } else if (loc == VARYING_SLOT_FACE) {
2447 pan_emit_vary_special(dev, out, present, PAN_VARY_FACE);
2448 } else if (offset < 0) {
2449 pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0);
2450 } else {
2451 STATIC_ASSERT(PAN_VARY_GENERAL == 0);
2452 pan_emit_vary(dev, out, 0, format, offset);
2453 }
2454 }
2455
2456 /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time,
2457 * rather than draw time (under good conditions). */
2458
2459 static void
2460 panfrost_emit_varying_descs(struct panfrost_pool *pool,
2461 struct panfrost_compiled_shader *producer,
2462 struct panfrost_compiled_shader *consumer,
2463 uint16_t point_coord_mask, struct pan_linkage *out)
2464 {
2465 struct panfrost_device *dev = pool->dev;
2466 unsigned producer_count = producer->info.varyings.output_count;
2467 unsigned consumer_count = consumer->info.varyings.input_count;
2468
2469 /* Offsets within the general varying buffer, indexed by location */
2470 signed offsets[PAN_MAX_VARYINGS];
2471 assert(producer_count <= ARRAY_SIZE(offsets));
2472 assert(consumer_count <= ARRAY_SIZE(offsets));
2473
2474 /* Allocate enough descriptors for both shader stages */
2475 struct panfrost_ptr T = pan_pool_alloc_desc_array(
2476 &pool->base, producer_count + consumer_count, ATTRIBUTE);
2477
2478 /* Take a reference if we're being put on the CSO */
2479 if (!pool->owned) {
2480 out->bo = pool->transient_bo;
2481 panfrost_bo_reference(out->bo);
2482 }
2483
2484 struct mali_attribute_packed *descs = T.cpu;
2485 out->producer = producer_count ? T.gpu : 0;
2486 out->consumer =
2487 consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0;
2488
2489 /* Lay out the varyings. Must use producer to lay out, in order to
2490 * respect transform feedback precisions. */
2491 out->present = pan_varying_present(dev, &producer->info, &consumer->info,
2492 point_coord_mask);
2493
2494 out->stride =
2495 pan_assign_varyings(dev, &producer->info, &consumer->info, offsets);
2496
2497 for (unsigned i = 0; i < producer_count; ++i) {
2498 signed j = pan_find_vary(consumer->info.varyings.input,
2499 consumer->info.varyings.input_count,
2500 producer->info.varyings.output[i].location);
2501
2502 enum pipe_format format = (j >= 0)
2503 ? consumer->info.varyings.input[j].format
2504 : producer->info.varyings.output[i].format;
2505
2506 panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i],
2507 format, out->present, 0, offsets[i],
2508 PAN_VARY_POSITION);
2509 }
2510
2511 for (unsigned i = 0; i < consumer_count; ++i) {
2512 signed j = pan_find_vary(producer->info.varyings.output,
2513 producer->info.varyings.output_count,
2514 consumer->info.varyings.input[i].location);
2515
2516 signed offset = (j >= 0) ? offsets[j] : -1;
2517
2518 panfrost_emit_varying(
2519 dev, descs + producer_count + i, consumer->info.varyings.input[i],
2520 consumer->info.varyings.input[i].format, out->present,
2521 point_coord_mask, offset, PAN_VARY_FRAGCOORD);
2522 }
2523 }
2524
2525 #if PAN_ARCH <= 5
2526 static void
2527 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
2528 unsigned present, enum pan_special_varying v,
2529 unsigned special)
2530 {
2531 if (present & BITFIELD_BIT(v)) {
2532 unsigned idx = pan_varying_index(present, v);
2533
2534 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
2535 cfg.special = special;
2536 cfg.type = 0;
2537 }
2538 }
2539 }
2540 #endif
2541
2542 static void
2543 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2544 unsigned vertex_count,
2545 bool point_coord_replace)
2546 {
2547 struct panfrost_context *ctx = batch->ctx;
2548 struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2549 struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
2550
2551 uint16_t point_coord_mask = 0;
2552
2553 memset(&batch->varyings, 0, sizeof(batch->varyings));
2554
2555 #if PAN_ARCH <= 5
2556 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
2557
2558 /* Point sprites are lowered on Bifrost and newer */
2559 if (point_coord_replace)
2560 point_coord_mask = ctx->rasterizer->base.sprite_coord_enable;
2561 #endif
2562
2563 /* In good conditions, we only need to link varyings once */
2564 bool prelink =
2565 (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable;
2566
2567 /* Try to reduce copies */
2568 struct pan_linkage _linkage;
2569 struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage;
2570
2571 /* Emit ATTRIBUTE descriptors if needed */
2572 if (!prelink || vs->linkage.bo == NULL) {
2573 struct panfrost_pool *pool = prelink ? &ctx->descs : &batch->pool;
2574
2575 panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage);
2576 }
2577
2578 unsigned present = linkage->present, stride = linkage->stride;
2579 unsigned count = util_bitcount(present);
2580 struct panfrost_ptr T =
2581 pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER);
2582
2583 if (!T.cpu) {
2584 mesa_loge("panfrost_emit_varying_descriptor failed");
2585 return;
2586 }
2587
2588 struct mali_attribute_buffer_packed *varyings =
2589 (struct mali_attribute_buffer_packed *)T.cpu;
2590
2591 batch->varyings.nr_bufs = count;
2592
2593 #if PAN_ARCH >= 6
2594 /* Suppress prefetch on Bifrost */
2595 memset(varyings + count, 0, sizeof(*varyings));
2596 #endif
2597
2598 if (stride) {
2599 panfrost_emit_varyings(
2600 batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride,
2601 vertex_count);
2602 } else {
2603 /* The indirect draw code reads the stride field; make sure
2604 * that it is initialised */
2605 memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0,
2606 sizeof(*varyings));
2607 }
2608
2609 /* fp32 vec4 gl_Position */
2610 batch->varyings.pos = panfrost_emit_varyings(
2611 batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2612 sizeof(float) * 4, vertex_count);
2613
2614 if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) {
2615 batch->varyings.psiz = panfrost_emit_varyings(
2616 batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2,
2617 vertex_count);
2618 }
2619
2620 #if PAN_ARCH <= 5
2621 pan_emit_special_input(
2622 varyings, present, PAN_VARY_PNTCOORD,
2623 (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2624 ? MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MAX_Y
2625 : MALI_ATTRIBUTE_SPECIAL_POINT_COORD_MIN_Y);
2626 pan_emit_special_input(varyings, present, PAN_VARY_FACE,
2627 MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2628 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
2629 MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2630 #endif
2631
2632 batch->varyings.bufs = T.gpu;
2633 batch->varyings.vs = linkage->producer;
2634 batch->varyings.fs = linkage->consumer;
2635 }
2636 #endif
2637
2638 static void
2639 emit_tls(struct panfrost_batch *batch)
2640 {
2641 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2642
2643 /* Emitted with the FB descriptor on Midgard. */
2644 if (PAN_ARCH <= 5 && batch->framebuffer.gpu)
2645 return;
2646
2647 struct panfrost_bo *tls_bo =
2648 batch->stack_size ? panfrost_batch_get_scratchpad(
2649 batch, batch->stack_size, dev->thread_tls_alloc,
2650 dev->core_id_range)
2651 : NULL;
2652 struct pan_tls_info tls = {
2653 .tls =
2654 {
2655 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2656 .size = batch->stack_size,
2657 },
2658 };
2659
2660 assert(batch->tls.cpu);
2661 GENX(pan_emit_tls)(&tls, batch->tls.cpu);
2662 }
2663
2664 static void
2665 emit_fbd(struct panfrost_batch *batch, struct pan_fb_info *fb)
2666 {
2667 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
2668 struct panfrost_bo *tls_bo =
2669 batch->stack_size ? panfrost_batch_get_scratchpad(
2670 batch, batch->stack_size, dev->thread_tls_alloc,
2671 dev->core_id_range)
2672 : NULL;
2673 struct pan_tls_info tls = {
2674 .tls =
2675 {
2676 .ptr = tls_bo ? tls_bo->ptr.gpu : 0,
2677 .size = batch->stack_size,
2678 },
2679 };
2680
2681 #if PAN_ARCH >= 6
2682 fb->sample_positions =
2683 dev->sample_positions->ptr.gpu +
2684 panfrost_sample_positions_offset(pan_sample_pattern(fb->nr_samples));
2685 #endif
2686
2687 JOBX(emit_fbds)(batch, fb, &tls);
2688 }
2689
2690 /* Mark a surface as written */
2691
2692 static void
2693 panfrost_initialize_surface(struct panfrost_batch *batch,
2694 struct pipe_surface *surf)
2695 {
2696 if (surf) {
2697 struct panfrost_resource *rsrc = pan_resource(surf->texture);
2698 BITSET_SET(rsrc->valid.data, surf->u.tex.level);
2699 if (rsrc->separate_stencil)
2700 BITSET_SET(rsrc->separate_stencil->valid.data, surf->u.tex.level);
2701 }
2702 }
2703
2704 /* Generate a fragment job. This should be called once per frame. (Usually,
2705 * this corresponds to eglSwapBuffers or one of glFlush, glFinish)
2706 */
2707 static void
2708 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
2709 {
2710 /* Mark the affected buffers as initialized, since we're writing to them.
2711 * Also, add the surfaces being written to the batch */
2712
2713 struct pipe_framebuffer_state *fb = &batch->key;
2714
2715 for (unsigned i = 0; i < fb->nr_cbufs; ++i)
2716 panfrost_initialize_surface(batch, fb->cbufs[i]);
2717
2718 panfrost_initialize_surface(batch, fb->zsbuf);
2719
2720 /* The passed tile coords can be out of range in some cases, so we need
2721 * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT.
2722 * Theoretically we also need to clamp the coordinates positive, but we
2723 * avoid that edge case as all four values are unsigned. Also,
2724 * theoretically we could clamp the minima, but if that has to happen
2725 * the asserts would fail anyway (since the maxima would get clamped
2726 * and then be smaller than the minima). An edge case of sorts occurs
2727 * when no scissors are added to draw, so by default min=~0 and max=0.
2728 * But that can't happen if any actual drawing occurs (beyond a
2729 * wallpaper reload), so this is again irrelevant in practice. */
2730
2731 batch->maxx = MIN2(batch->maxx, fb->width);
2732 batch->maxy = MIN2(batch->maxy, fb->height);
2733
2734 /* Rendering region must be at least 1x1; otherwise, there is nothing
2735 * to do and the whole job chain should have been discarded. */
2736
2737 assert(batch->maxx > batch->minx);
2738 assert(batch->maxy > batch->miny);
2739
2740 JOBX(emit_fragment_job)(batch, pfb);
2741 }
2742
2743 /* Count generated primitives (when there are no geom/tess shaders) for
2744 * transform feedback */
2745
2746 static void
2747 panfrost_statistics_record(struct panfrost_context *ctx,
2748 const struct pipe_draw_info *info,
2749 const struct pipe_draw_start_count_bias *draw)
2750 {
2751 if (!ctx->active_queries)
2752 return;
2753
2754 uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
2755 ctx->prims_generated += prims;
2756
2757 if (!ctx->streamout.num_targets)
2758 return;
2759
2760 ctx->tf_prims_generated += prims;
2761 ctx->dirty |= PAN_DIRTY_SO;
2762 }
2763
2764 static void
2765 panfrost_update_streamout_offsets(struct panfrost_context *ctx)
2766 {
2767 unsigned count =
2768 u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count);
2769
2770 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2771 if (!ctx->streamout.targets[i])
2772 continue;
2773
2774 pan_so_target(ctx->streamout.targets[i])->offset += count;
2775 }
2776 }
2777
2778 /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of
2779 * 3D state. In particular, it groups the fragment shader descriptor with
2780 * depth/stencil, blend, polygon offset, and multisampling state. These pieces
2781 * of state are dirty tracked independently for the benefit of newer GPUs that
2782 * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D
2783 * dirty flags that trigger re-emits of the fragment RSD.
2784 *
2785 * Obscurely, occlusion queries are included. Occlusion query state is nominally
2786 * specified in the draw call descriptor, but must be considered when determining
2787 * early-Z state, which is part of the RSD.
2788 */
2789 #define FRAGMENT_RSD_DIRTY_MASK \
2790 (PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | PAN_DIRTY_RASTERIZER | \
2791 PAN_DIRTY_OQ)
2792
2793 static inline void
2794 panfrost_update_shader_state(struct panfrost_batch *batch,
2795 enum pipe_shader_type st)
2796 {
2797 struct panfrost_context *ctx = batch->ctx;
2798 struct panfrost_compiled_shader *ss = ctx->prog[st];
2799
2800 bool frag = (st == PIPE_SHADER_FRAGMENT);
2801 unsigned dirty_3d = ctx->dirty;
2802 unsigned dirty = ctx->dirty_shader[st];
2803
2804 if (dirty & (PAN_DIRTY_STAGE_TEXTURE | PAN_DIRTY_STAGE_SHADER)) {
2805 batch->textures[st] = panfrost_emit_texture_descriptors(batch, st);
2806 }
2807
2808 if (dirty & PAN_DIRTY_STAGE_SAMPLER) {
2809 batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st);
2810 }
2811
2812 /* On Bifrost and older, the fragment shader descriptor is fused
2813 * together with the renderer state; the combined renderer state
2814 * descriptor is emitted below. Otherwise, the shader descriptor is
2815 * standalone and is emitted here.
2816 */
2817 if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) {
2818 batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st);
2819 }
2820
2821 #if PAN_ARCH >= 9
2822 if (dirty & PAN_DIRTY_STAGE_IMAGE) {
2823 batch->images[st] =
2824 ctx->image_mask[st] ? panfrost_emit_images(batch, st) : 0;
2825 }
2826
2827 if (dirty & PAN_DIRTY_STAGE_SSBO)
2828 batch->ssbos[st] = panfrost_emit_ssbos(batch, st);
2829 #endif
2830
2831 if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) {
2832 batch->uniform_buffers[st] = panfrost_emit_const_buf(
2833 batch, st, &batch->nr_uniform_buffers[st], &batch->push_uniforms[st],
2834 &batch->nr_push_uniforms[st]);
2835 }
2836
2837 #if PAN_ARCH <= 7
2838 /* On Bifrost and older, if the fragment shader changes OR any renderer
2839 * state specified with the fragment shader, the whole renderer state
2840 * descriptor is dirtied and must be re-emitted.
2841 */
2842 if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) ||
2843 (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) {
2844
2845 batch->rsd[st] = panfrost_emit_frag_shader_meta(batch);
2846 }
2847
2848 /* Vertex shaders need to mix vertex data and image descriptors in the
2849 * attribute array. This is taken care of in panfrost_update_state_3d().
2850 */
2851 if (st != PIPE_SHADER_VERTEX && (dirty & PAN_DIRTY_STAGE_IMAGE)) {
2852 batch->attribs[st] =
2853 panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st);
2854 }
2855 #endif
2856 }
2857
2858 static inline void
2859 panfrost_update_state_3d(struct panfrost_batch *batch)
2860 {
2861 struct panfrost_context *ctx = batch->ctx;
2862 unsigned dirty = ctx->dirty;
2863
2864 if (dirty & PAN_DIRTY_TLS_SIZE)
2865 panfrost_batch_adjust_stack_size(batch);
2866
2867 if (dirty & PAN_DIRTY_BLEND)
2868 panfrost_set_batch_masks_blend(batch);
2869
2870 if (dirty & PAN_DIRTY_ZS)
2871 panfrost_set_batch_masks_zs(batch);
2872
2873 #if PAN_ARCH >= 9
2874 if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) ||
2875 (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER))
2876 batch->depth_stencil = panfrost_emit_depth_stencil(batch);
2877
2878 if (dirty & PAN_DIRTY_BLEND)
2879 batch->blend = panfrost_emit_blend_valhall(batch);
2880
2881 if (dirty & PAN_DIRTY_VERTEX) {
2882 batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch);
2883
2884 batch->attrib_bufs[PIPE_SHADER_VERTEX] =
2885 panfrost_emit_vertex_buffers(batch);
2886 }
2887 #else
2888 unsigned vt_shader_dirty = ctx->dirty_shader[PIPE_SHADER_VERTEX];
2889 struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2890 struct panfrost_vertex_state *vstate = ctx->vertex;
2891 bool attr_offsetted_by_instance_base =
2892 vstate->attr_depends_on_base_instance_mask &
2893 BITFIELD_MASK(vs->info.attributes_read_count);
2894
2895 /* Vertex data, vertex shader and images accessed by the vertex shader have
2896 * an impact on the attributes array; we need to re-emit it any time one of these
2897 * parameters changes. */
2898 if ((dirty & PAN_DIRTY_VERTEX) ||
2899 (vt_shader_dirty & (PAN_DIRTY_STAGE_IMAGE | PAN_DIRTY_STAGE_SHADER)) ||
2900 attr_offsetted_by_instance_base) {
2901 batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(
2902 batch, &batch->attrib_bufs[PIPE_SHADER_VERTEX]);
2903 }
2904 #endif
2905 }
2906
2907 static void
2908 panfrost_launch_xfb(struct panfrost_batch *batch,
2909 const struct pipe_draw_info *info, unsigned count)
2910 {
2911 struct panfrost_context *ctx = batch->ctx;
2912
2913 /* Nothing to do */
2914 if (batch->ctx->streamout.num_targets == 0)
2915 return;
2916
2917 /* TODO: XFB with index buffers */
2918 // assert(info->index_size == 0);
2919
2920 if (!u_trim_pipe_prim(info->mode, &count))
2921 return;
2922
2923 perf_debug(batch->ctx, "Emulating transform feedback");
2924
2925 struct panfrost_uncompiled_shader *vs_uncompiled =
2926 ctx->uncompiled[PIPE_SHADER_VERTEX];
2927 struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
2928
2929 vs_uncompiled->xfb->stream_output = vs->stream_output;
2930
2931 uint64_t saved_rsd = batch->rsd[PIPE_SHADER_VERTEX];
2932 uint64_t saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX];
2933 uint64_t saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX];
2934 unsigned saved_nr_push_uniforms =
2935 batch->nr_push_uniforms[PIPE_SHADER_VERTEX];
2936
2937 ctx->uncompiled[PIPE_SHADER_VERTEX] = NULL; /* should not be read */
2938 ctx->prog[PIPE_SHADER_VERTEX] = vs_uncompiled->xfb;
2939 batch->rsd[PIPE_SHADER_VERTEX] =
2940 panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX);
2941
2942 batch->uniform_buffers[PIPE_SHADER_VERTEX] =
2943 panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL,
2944 &batch->push_uniforms[PIPE_SHADER_VERTEX],
2945 &batch->nr_push_uniforms[PIPE_SHADER_VERTEX]);
2946
2947 JOBX(launch_xfb)(batch, info, count);
2948 batch->compute_count++;
2949
2950 ctx->uncompiled[PIPE_SHADER_VERTEX] = vs_uncompiled;
2951 ctx->prog[PIPE_SHADER_VERTEX] = vs;
2952 batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd;
2953 batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo;
2954 batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push;
2955 batch->nr_push_uniforms[PIPE_SHADER_VERTEX] = saved_nr_push_uniforms;
2956 }
2957
2958 /*
2959 * Increase the vertex count on the batch using a saturating add, and hope the
2960 * compiler can use the machine instruction here...
2961 */
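/* Worked example (illustrative): if vertex_count is UINT32_MAX - 2 and the
 * increment is 10, the 32-bit sum wraps below the old value, so the count is
 * clamped to UINT32_MAX instead of wrapping around.
 */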
2962 static inline void
2963 panfrost_increase_vertex_count(struct panfrost_batch *batch, uint32_t increment)
2964 {
2965 uint32_t sum = batch->vertex_count + increment;
2966
2967 if (sum >= batch->vertex_count)
2968 batch->vertex_count = sum;
2969 else
2970 batch->vertex_count = UINT32_MAX;
2971
2972 #if PAN_ARCH <= 5
2973 batch->tiler_ctx.midgard.vertex_count = batch->vertex_count;
2974 #endif
2975 }
2976
2977 /*
2978 * If we change whether we're drawing points, or whether point sprites are
2979 * enabled (specified in the rasterizer), we may need to rebind shaders
2980 * accordingly. This implicitly covers the case of rebinding framebuffers,
2981 * because all dirty flags are set there.
2982 */
2983 static void
2984 panfrost_update_active_prim(struct panfrost_context *ctx,
2985 const struct pipe_draw_info *info)
2986 {
2987 const enum mesa_prim prev_prim = u_reduced_prim(ctx->active_prim);
2988 const enum mesa_prim new_prim = u_reduced_prim(info->mode);
2989
2990 ctx->active_prim = info->mode;
2991
2992 if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
2993 (prev_prim != new_prim)) {
2994 panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
2995 }
2996 }
2997
2998 static unsigned
2999 panfrost_draw_get_vertex_count(struct panfrost_batch *batch,
3000 const struct pipe_draw_info *info,
3001 const struct pipe_draw_start_count_bias *draw,
3002 bool idvs)
3003 {
3004 struct panfrost_context *ctx = batch->ctx;
3005 unsigned vertex_count = ctx->vertex_count;
3006 unsigned min_index = 0, max_index = 0;
3007
3008 batch->indices = 0;
3009 if (info->index_size && PAN_ARCH >= 9) {
3010 batch->indices = panfrost_get_index_buffer(batch, info, draw);
3011
3012 /* Use index count to estimate vertex count */
3013 panfrost_increase_vertex_count(batch, draw->count);
3014 } else if (info->index_size) {
3015 batch->indices = panfrost_get_index_buffer_bounded(
3016 batch, info, draw, &min_index, &max_index);
3017
3018 /* Use the corresponding values */
3019 vertex_count = max_index - min_index + 1;
3020 ctx->offset_start = min_index + draw->index_bias;
3021 panfrost_increase_vertex_count(batch, vertex_count);
3022 } else {
3023 ctx->offset_start = draw->start;
3024 panfrost_increase_vertex_count(batch, vertex_count);
3025 }
3026
3027 if (PAN_ARCH <= 9 && info->instance_count > 1) {
3028 unsigned count = vertex_count;
3029
3030 /* Index-Driven Vertex Shading requires different instances to
3031 * have different cache lines for position results. Each vertex
3032 * position is 16 bytes and the Mali cache line is 64 bytes, so
3033 * the instance count must be aligned to 4 vertices.
3034 */
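/* For example (illustrative): with IDVS and 10 vertices per instance, the
 * count is first aligned up to 12 (4 x 16-byte positions fill one 64-byte
 * cache line) before the hardware padding below is applied.
 */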
3035 if (idvs)
3036 count = ALIGN_POT(count, 4);
3037
3038 ctx->padded_count = panfrost_padded_vertex_count(count);
3039 } else {
3040 ctx->padded_count = vertex_count;
3041 }
3042
3043 return vertex_count;
3044 }
3045
3046 static void
3047 panfrost_single_draw_direct(struct panfrost_batch *batch,
3048 const struct pipe_draw_info *info,
3049 unsigned drawid_offset,
3050 const struct pipe_draw_start_count_bias *draw)
3051 {
3052 if (!draw->count || !info->instance_count)
3053 return;
3054
3055 struct panfrost_context *ctx = batch->ctx;
3056
3057 panfrost_update_active_prim(ctx, info);
3058
3059 /* Take into account a negative bias */
3060 ctx->vertex_count =
3061 draw->count + (info->index_size ? abs(draw->index_bias) : 0);
3062 ctx->instance_count = info->instance_count;
3063 ctx->base_vertex = info->index_size ? draw->index_bias : 0;
3064 ctx->base_instance = info->start_instance;
3065 ctx->drawid = drawid_offset;
3066
3067 struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
3068 bool idvs = vs->info.vs.idvs;
3069
3070 UNUSED unsigned vertex_count =
3071 panfrost_draw_get_vertex_count(batch, info, draw, idvs);
3072
3073 panfrost_statistics_record(ctx, info, draw);
3074
3075 panfrost_update_state_3d(batch);
3076 panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3077 panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3078 panfrost_clean_state_3d(ctx);
3079
3080 if (ctx->uncompiled[PIPE_SHADER_VERTEX]->xfb) {
3081 panfrost_launch_xfb(batch, info, draw->count);
3082 }
3083
3084 /* Increment transform feedback offsets */
3085 panfrost_update_streamout_offsets(ctx);
3086
3087 /* Any side effects must be handled by the XFB shader, so we only need
3088 * to run vertex shaders if we need rasterization.
3089 */
3090 if (panfrost_batch_skip_rasterization(batch))
3091 return;
3092
3093 #if PAN_ARCH <= 7
3094 /* Emit all sort of descriptors. */
3095 panfrost_emit_varying_descriptor(batch,
3096 ctx->padded_count * ctx->instance_count,
3097 info->mode == MESA_PRIM_POINTS);
3098 #endif
3099
3100 JOBX(launch_draw)(batch, info, drawid_offset, draw, vertex_count);
3101 batch->draw_count++;
3102 }
3103
3104 static bool
3105 panfrost_compatible_batch_state(struct panfrost_batch *batch,
3106 enum mesa_prim reduced_prim)
3107 {
3108 struct panfrost_context *ctx = batch->ctx;
3109 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
3110
3111 if (reduced_prim == MESA_PRIM_LINES &&
3112 !u_tristate_set(&batch->line_smoothing, rast->line_smooth))
3113 return false;
3114
3115 /* Only applies on Valhall */
3116 if (PAN_ARCH < 9)
3117 return true;
3118
3119 bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
3120 bool first = rast->flatshade_first;
3121
3122 /* gl_PointCoord orientation only matters when drawing points, but
3123 * provoking vertex doesn't matter for points.
3124 */
3125 if (reduced_prim == MESA_PRIM_POINTS)
3126 return u_tristate_set(&batch->sprite_coord_origin, coord);
3127 else
3128 return u_tristate_set(&batch->first_provoking_vertex, first);
3129 }
3130
3131 static struct panfrost_batch *
3132 prepare_draw(struct pipe_context *pipe, const struct pipe_draw_info *info)
3133 {
3134 struct panfrost_context *ctx = pan_context(pipe);
3135 struct panfrost_device *dev = pan_device(pipe->screen);
3136
3137 /* Do some common setup */
3138 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3139 if (!batch)
3140 return NULL;
3141
3142 /* Don't add too many jobs to a single batch. Job manager hardware has a
3143 * hard limit of 65536 jobs per job chain. Given that a draw issues a maximum
3144 * of 3 jobs (a vertex, a tiler, and a compute job if XFB is enabled), we
3145 * could use 65536 / 3 as a limit, but we choose a smaller soft limit
3146 * (arbitrary) to avoid the risk of timeouts. This might not be a good
3147 * idea. */
3148 if (unlikely(batch->draw_count > 10000)) {
3149 batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws");
3150 if (!batch)
3151 return NULL;
3152 }
3153
3154 enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
3155
3156 if (unlikely(!panfrost_compatible_batch_state(batch, reduced_prim))) {
3157 batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
3158 if (!batch)
3159 return NULL;
3160
3161 ASSERTED bool succ = panfrost_compatible_batch_state(batch, reduced_prim);
3162 assert(succ && "must be able to set state for a fresh batch");
3163 }
3164
3165 /* panfrost_batch_skip_rasterization reads
3166 * batch->scissor_culls_everything, which is set by
3167 * panfrost_emit_viewport, so call that first.
3168 */
3169 if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR))
3170 batch->viewport = panfrost_emit_viewport(batch);
3171
3172 /* Mark everything dirty when debugging */
3173 if (unlikely(dev->debug & PAN_DBG_DIRTY))
3174 panfrost_dirty_state_all(ctx);
3175
3176 /* Conservatively assume draw parameters always change */
3177 ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID;
3178
3179 return batch;
3180 }
3181
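/* Indirect draw path. When the GPU lacks native indirect draw support, or
 * when queries or transform feedback are active, the draw is emulated on
 * the CPU via util_draw_indirect(), which reads the parameter buffer and
 * replays the draws as direct draws. */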
3182 static void
3183 panfrost_draw_indirect(struct pipe_context *pipe,
3184 const struct pipe_draw_info *info,
3185 unsigned drawid_offset,
3186 const struct pipe_draw_indirect_info *indirect)
3187 {
3188 struct panfrost_context *ctx = pan_context(pipe);
3189
3190 if (!PAN_GPU_SUPPORTS_DRAW_INDIRECT || ctx->active_queries ||
3191 ctx->streamout.num_targets) {
3192 util_draw_indirect(pipe, info, drawid_offset, indirect);
3193 perf_debug(ctx, "Emulating indirect draw on the CPU");
3194 return;
3195 }
3196
3197 struct panfrost_batch *batch = prepare_draw(pipe, info);
3198 if (!batch) {
3199 mesa_loge("prepare_draw failed");
3200 return;
3201 }
3202
3203 struct pipe_draw_info tmp_info = *info;
3204
3205 panfrost_batch_read_rsrc(batch, pan_resource(indirect->buffer),
3206 PIPE_SHADER_VERTEX);
3207
3208 panfrost_update_active_prim(ctx, &tmp_info);
3209
3210 ctx->drawid = drawid_offset;
3211
3212 batch->indices = 0;
3213 if (info->index_size) {
3214 struct panfrost_resource *index_buffer =
3215 pan_resource(info->index.resource);
3216 panfrost_batch_read_rsrc(batch, index_buffer, PIPE_SHADER_VERTEX);
3217 batch->indices = index_buffer->image.data.base;
3218 }
3219
3220 panfrost_update_state_3d(batch);
3221 panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX);
3222 panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT);
3223 panfrost_clean_state_3d(ctx);
3224
3225 /* Increment transform feedback offsets */
3226 panfrost_update_streamout_offsets(ctx);
3227
3228 /* Any side effects must be handled by the XFB shader, so we only need
3229 * to run vertex shaders if we need rasterization.
3230 */
3231 if (panfrost_batch_skip_rasterization(batch))
3232 return;
3233
3234 JOBX(launch_draw_indirect)(batch, &tmp_info, drawid_offset, indirect);
3235 batch->draw_count++;
3236 }
3237
3238 static void
3239 panfrost_multi_draw_direct(struct pipe_context *pipe,
3240 const struct pipe_draw_info *info,
3241 unsigned drawid_offset,
3242 const struct pipe_draw_start_count_bias *draws,
3243 unsigned num_draws)
3244 {
3245 struct panfrost_context *ctx = pan_context(pipe);
3246 struct panfrost_batch *batch = prepare_draw(pipe, info);
3247 if (!batch) {
3248 mesa_loge("prepare_draw failed");
3249 return;
3250 }
3251
3252 struct pipe_draw_info tmp_info = *info;
3253 unsigned drawid = drawid_offset;
3254
3255 for (unsigned i = 0; i < num_draws; i++) {
3256 panfrost_single_draw_direct(batch, &tmp_info, drawid, &draws[i]);
3257
3258 if (tmp_info.increment_draw_id) {
3259 ctx->dirty |= PAN_DIRTY_DRAWID;
3260 drawid++;
3261 }
3262 }
3263 }
3264
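/* Expand a multi-draw into individual direct draws, bumping the draw ID
 * between iterations when the state tracker requested it via
 * increment_draw_id. */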
3265 static void
3266 panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
3267 unsigned drawid_offset,
3268 const struct pipe_draw_indirect_info *indirect,
3269 const struct pipe_draw_start_count_bias *draws,
3270 unsigned num_draws)
3271 {
3272 struct panfrost_context *ctx = pan_context(pipe);
3273
3274 if (!panfrost_render_condition_check(ctx))
3275 return;
3276
3277 ctx->draw_calls++;
3278
3279 if (indirect && indirect->buffer) {
3280 assert(num_draws == 1);
3281 panfrost_draw_indirect(pipe, info, drawid_offset, indirect);
3282 } else {
3283 panfrost_multi_draw_direct(pipe, info, drawid_offset, draws, num_draws);
3284 }
3285 }
3286
3287 /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
3288 * construct the COMPUTE job and some of its payload.
3289 */
3290
3291 static void
3292 panfrost_launch_grid_on_batch(struct pipe_context *pipe,
3293 struct panfrost_batch *batch,
3294 const struct pipe_grid_info *info)
3295 {
3296 struct panfrost_context *ctx = pan_context(pipe);
3297
3298 util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
3299 if (!*res)
3300 continue;
3301
3302 struct panfrost_resource *buffer = pan_resource(*res);
3303 panfrost_batch_write_rsrc(batch, buffer, PIPE_SHADER_COMPUTE);
3304 }
3305
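/* Without native indirect dispatch, read the three grid dimensions back on
 * the CPU and re-launch as a direct dispatch, skipping empty grids. */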
3306 if (info->indirect && !PAN_GPU_SUPPORTS_DISPATCH_INDIRECT) {
3307 struct pipe_transfer *transfer;
3308 uint32_t *params =
3309 pipe_buffer_map_range(pipe, info->indirect, info->indirect_offset,
3310 3 * sizeof(uint32_t), PIPE_MAP_READ, &transfer);
3311
3312 struct pipe_grid_info direct = *info;
3313 direct.indirect = NULL;
3314 direct.grid[0] = params[0];
3315 direct.grid[1] = params[1];
3316 direct.grid[2] = params[2];
3317 pipe_buffer_unmap(pipe, transfer);
3318
3319 if (params[0] && params[1] && params[2])
3320 panfrost_launch_grid_on_batch(pipe, batch, &direct);
3321
3322 return;
3323 }
3324
3325 ctx->compute_grid = info;
3326
3327 /* Conservatively assume workgroup size changes every launch */
3328 ctx->dirty |= PAN_DIRTY_PARAMS;
3329
3330 panfrost_update_shader_state(batch, PIPE_SHADER_COMPUTE);
3331
3332 /* We want the compute thread descriptor to be per-job.
3333 * Save the global one, and restore it once the job has been
3334 * emitted.
3335 */
3336 uint64_t saved_tls = batch->tls.gpu;
3337 batch->tls.gpu = panfrost_emit_shared_memory(batch, info);
3338
3339 /* if indirect, mark the indirect buffer as being read */
3340 if (info->indirect)
3341 panfrost_batch_read_rsrc(batch, pan_resource(info->indirect), PIPE_SHADER_COMPUTE);
3342
3343 /* launch it */
3344 JOBX(launch_grid)(batch, info);
3345 batch->compute_count++;
3346 batch->tls.gpu = saved_tls;
3347 }
3348
3349 static void
3350 panfrost_launch_grid(struct pipe_context *pipe,
3351 const struct pipe_grid_info *info)
3352 {
3353 struct panfrost_context *ctx = pan_context(pipe);
3354
3355 /* XXX - shouldn't be necessary with working memory barriers. Affected
3356 * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */
3357 panfrost_flush_all_batches(ctx, "Launch grid pre-barrier");
3358
3359 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
3360 panfrost_launch_grid_on_batch(pipe, batch, info);
3361
3362 panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
3363 }
3364
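/* Alignment value handed to panfrost_afbc_get_shaders() for the AFBC size
 * and pack shaders launched below. */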
3365 #define AFBC_BLOCK_ALIGN 16
3366
3367 static void
3368 panfrost_launch_afbc_shader(struct panfrost_batch *batch, void *cso,
3369 struct pipe_constant_buffer *cbuf,
3370 unsigned nr_blocks)
3371 {
3372 struct pipe_context *pctx = &batch->ctx->base;
3373 void *saved_cso = NULL;
3374 struct pipe_constant_buffer saved_const = {};
3375 struct pipe_grid_info grid = {
3376 .block[0] = 1,
3377 .block[1] = 1,
3378 .block[2] = 1,
3379 .grid[0] = nr_blocks,
3380 .grid[1] = 1,
3381 .grid[2] = 1,
3382 };
3383
3384 struct panfrost_constant_buffer *pbuf =
3385 &batch->ctx->constant_buffer[PIPE_SHADER_COMPUTE];
3386 saved_cso = batch->ctx->uncompiled[PIPE_SHADER_COMPUTE];
3387 util_copy_constant_buffer(&pbuf->cb[0], &saved_const, true);
3388
3389 pctx->bind_compute_state(pctx, cso);
3390 pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, false, cbuf);
3391
3392 panfrost_launch_grid_on_batch(pctx, batch, &grid);
3393
3394 pctx->bind_compute_state(pctx, saved_cso);
3395 pctx->set_constant_buffer(pctx, PIPE_SHADER_COMPUTE, 0, true, &saved_const);
3396 }
3397
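/* Note: this macro declares local variables (shaders, constant_buffer), so
 * it can only be used once per scope and must appear where declarations are
 * legal. */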
3398 #define LAUNCH_AFBC_SHADER(name, batch, rsrc, consts, nr_blocks) \
3399 struct pan_afbc_shader_data *shaders = \
3400 panfrost_afbc_get_shaders(batch->ctx, rsrc, AFBC_BLOCK_ALIGN); \
3401 struct pipe_constant_buffer constant_buffer = { \
3402 .buffer_size = sizeof(consts), \
3403 .user_buffer = &consts}; \
3404 panfrost_launch_afbc_shader(batch, shaders->name##_cso, &constant_buffer, \
3405 nr_blocks);
3406
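/* Launch the AFBC "size" compute shader: walk the superblocks of the given
 * mip level of src and write their payload sizes into the metadata BO at
 * the given offset, for the pack pass below to consume. */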
3407 static void
3408 panfrost_afbc_size(struct panfrost_batch *batch, struct panfrost_resource *src,
3409 struct panfrost_bo *metadata, unsigned offset,
3410 unsigned level)
3411 {
3412 struct pan_image_slice_layout *slice = &src->image.layout.slices[level];
3413 struct panfrost_afbc_size_info consts = {
3414 .src =
3415 src->image.data.base + src->image.data.offset + slice->offset,
3416 .metadata = metadata->ptr.gpu + offset,
3417 };
3418
3419 panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3420 panfrost_batch_write_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3421
3422 LAUNCH_AFBC_SHADER(size, batch, src, consts, slice->afbc.nr_blocks);
3423 }
3424
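/* Launch the AFBC "pack" compute shader: repack the superblock payloads of
 * the given mip level of src into dst using the layout described by
 * dst_slice, consulting the per-block metadata produced by
 * panfrost_afbc_size(). */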
3425 static void
3426 panfrost_afbc_pack(struct panfrost_batch *batch, struct panfrost_resource *src,
3427 struct panfrost_bo *dst,
3428 struct pan_image_slice_layout *dst_slice,
3429 struct panfrost_bo *metadata, unsigned metadata_offset,
3430 unsigned level)
3431 {
3432 struct pan_image_slice_layout *src_slice = &src->image.layout.slices[level];
3433 struct panfrost_afbc_pack_info consts = {
3434 .src = src->image.data.base + src->image.data.offset +
3435 src_slice->offset,
3436 .dst = dst->ptr.gpu + dst_slice->offset,
3437 .metadata = metadata->ptr.gpu + metadata_offset,
3438 .header_size = dst_slice->afbc.header_size,
3439 .src_stride = src_slice->afbc.stride,
3440 .dst_stride = dst_slice->afbc.stride,
3441 };
3442
3443 panfrost_batch_read_rsrc(batch, src, PIPE_SHADER_COMPUTE);
3444 panfrost_batch_write_bo(batch, dst, PIPE_SHADER_COMPUTE);
3445 panfrost_batch_add_bo(batch, metadata, PIPE_SHADER_COMPUTE);
3446
3447 LAUNCH_AFBC_SHADER(pack, batch, src, consts, dst_slice->afbc.nr_blocks);
3448 }
3449
3450 static void *
3451 panfrost_create_rasterizer_state(struct pipe_context *pctx,
3452 const struct pipe_rasterizer_state *cso)
3453 {
3454 struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
3455
3456 so->base = *cso;
3457
3458 #if PAN_ARCH <= 7
3459 pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
3460 cfg.multisample_enable = cso->multisample;
3461 cfg.fixed_function_near_discard = cso->depth_clip_near;
3462 cfg.fixed_function_far_discard = cso->depth_clip_far;
3463 cfg.fixed_function_depth_range_fixed = !cso->depth_clamp;
3464 cfg.shader_depth_range_fixed = true;
3465 }
3466
3467 pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
3468 cfg.front_facing_depth_bias = cso->offset_tri;
3469 cfg.back_facing_depth_bias = cso->offset_tri;
3470 cfg.single_sampled_lines = !cso->multisample;
3471 }
3472 #endif
3473
3474 return so;
3475 }
3476
3477 #if PAN_ARCH >= 9
3478 /*
3479 * Given a pipe_vertex_element, pack the corresponding Valhall attribute
3480 * descriptor. This function is called at CSO create time.
3481 */
3482 static void
3483 panfrost_pack_attribute(struct panfrost_device *dev,
3484 const struct pipe_vertex_element el,
3485 struct mali_attribute_packed *out)
3486 {
3487 pan_pack(out, ATTRIBUTE, cfg) {
3488 cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
3489 cfg.frequency = (el.instance_divisor > 0)
3490 ? MALI_ATTRIBUTE_FREQUENCY_INSTANCE
3491 : MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3492 cfg.format = GENX(panfrost_format_from_pipe_format)(el.src_format)->hw;
3493 cfg.offset = el.src_offset;
3494 cfg.buffer_index = el.vertex_buffer_index;
3495 cfg.stride = el.src_stride;
3496
3497 if (el.instance_divisor == 0) {
3498 /* Per-vertex */
3499 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
3500 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
3501 cfg.offset_enable = true;
3502 } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
3503 /* Per-instance, POT divisor */
3504 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
3505 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3506 cfg.divisor_r = __builtin_ctz(el.instance_divisor);
3507 } else {
3508 /* Per-instance, NPOT divisor */
3509 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
3510 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
3511
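/* The magic-divisor trick turns division by an arbitrary (non-power-of-two)
 * instance divisor into a multiply-and-shift; the helper returns the
 * multiplier (divisor_d) and fills in the remaining divisor_r/divisor_e
 * terms. */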
3512 cfg.divisor_d = panfrost_compute_magic_divisor(
3513 el.instance_divisor, &cfg.divisor_r, &cfg.divisor_e);
3514 }
3515 }
3516 }
3517 #endif
3518
3519 static void *
3520 panfrost_create_vertex_elements_state(struct pipe_context *pctx,
3521 unsigned num_elements,
3522 const struct pipe_vertex_element *elements)
3523 {
3524 struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
3525 UNUSED struct panfrost_device *dev = pan_device(pctx->screen);
3526
3527 so->num_elements = num_elements;
3528 memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
3529
3530 for (unsigned i = 0; i < num_elements; ++i)
3531 so->strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
3532 #if PAN_ARCH >= 9
3533 for (unsigned i = 0; i < num_elements; ++i)
3534 panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
3535 #else
3536 /* Assign attribute buffers corresponding to the vertex buffers, keyed
3537 * to a particular divisor, since that's how instancing works on Mali */
3538 for (unsigned i = 0; i < num_elements; ++i) {
3539 so->element_buffer[i] = pan_assign_vertex_buffer(
3540 so->buffers, &so->nr_bufs, elements[i].vertex_buffer_index,
3541 elements[i].instance_divisor);
3542 if (elements[i].instance_divisor)
3543 so->attr_depends_on_base_instance_mask |= BITFIELD_BIT(i);
3544 }
3545
3546 for (int i = 0; i < num_elements; ++i) {
3547 enum pipe_format fmt = elements[i].src_format;
3548 so->formats[i] = GENX(panfrost_format_from_pipe_format)(fmt)->hw;
3549
3550 assert(MALI_EXTRACT_INDEX(so->formats[i]) && "format must be supported");
3551 }
3552
3553 /* Let's also prepare vertex builtins */
3554 so->formats[PAN_VERTEX_ID] =
3555 GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3556 so->formats[PAN_INSTANCE_ID] =
3557 GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32_UINT)->hw;
3558 #endif
3559
3560 return so;
3561 }
3562
3563 static inline unsigned
3564 pan_pipe_to_stencil_op(enum pipe_stencil_op in)
3565 {
3566 switch (in) {
3567 case PIPE_STENCIL_OP_KEEP:
3568 return MALI_STENCIL_OP_KEEP;
3569 case PIPE_STENCIL_OP_ZERO:
3570 return MALI_STENCIL_OP_ZERO;
3571 case PIPE_STENCIL_OP_REPLACE:
3572 return MALI_STENCIL_OP_REPLACE;
3573 case PIPE_STENCIL_OP_INCR:
3574 return MALI_STENCIL_OP_INCR_SAT;
3575 case PIPE_STENCIL_OP_DECR:
3576 return MALI_STENCIL_OP_DECR_SAT;
3577 case PIPE_STENCIL_OP_INCR_WRAP:
3578 return MALI_STENCIL_OP_INCR_WRAP;
3579 case PIPE_STENCIL_OP_DECR_WRAP:
3580 return MALI_STENCIL_OP_DECR_WRAP;
3581 case PIPE_STENCIL_OP_INVERT:
3582 return MALI_STENCIL_OP_INVERT;
3583 default:
3584 unreachable("Invalid stencil op");
3585 }
3586 }
3587
3588 #if PAN_ARCH <= 7
3589 static inline void
3590 pan_pipe_to_stencil(const struct pipe_stencil_state *in,
3591 struct mali_stencil_packed *out)
3592 {
3593 pan_pack(out, STENCIL, s) {
3594 s.mask = in->valuemask;
3595 s.compare_function = (enum mali_func)in->func;
3596 s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
3597 s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
3598 s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
3599 }
3600 }
3601 #endif
3602
3603 static bool
3604 pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
3605 {
3606 if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
3607 return false;
3608
3609 if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
3610 return false;
3611
3612 if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
3613 return false;
3614
3615 return true;
3616 }
3617
3618 static void *
3619 panfrost_create_depth_stencil_state(
3620 struct pipe_context *pipe, const struct pipe_depth_stencil_alpha_state *zsa)
3621 {
3622 struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
3623 so->base = *zsa;
3624
3625 const struct pipe_stencil_state front = zsa->stencil[0];
3626 const struct pipe_stencil_state back =
3627 zsa->stencil[1].enabled ? zsa->stencil[1] : front;
3628
3629 enum mali_func depth_func =
3630 zsa->depth_enabled ? (enum mali_func)zsa->depth_func : MALI_FUNC_ALWAYS;
3631
3632 /* Normalize (there's no separate enable) */
3633 if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
3634 so->base.alpha_func = MALI_FUNC_ALWAYS;
3635
3636 #if PAN_ARCH <= 7
3637 /* Prepack relevant parts of the Renderer State Descriptor. They will
3638 * be ORed in at draw-time */
3639 pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
3640 cfg.depth_function = depth_func;
3641 cfg.depth_write_mask = zsa->depth_writemask;
3642 }
3643
3644 pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
3645 cfg.stencil_enable = front.enabled;
3646 cfg.stencil_mask_front = front.writemask;
3647 cfg.stencil_mask_back = back.writemask;
3648
3649 #if PAN_ARCH <= 5
3650 cfg.alpha_test_compare_function = (enum mali_func)so->base.alpha_func;
3651 #endif
3652 }
3653
3654 /* Stencil tests have their own words in the RSD */
3655 pan_pipe_to_stencil(&front, &so->stencil_front);
3656 pan_pipe_to_stencil(&back, &so->stencil_back);
3657 #else
3658 /* Pack with nodefaults so that only explicitly set fields affect
3659 * pan_merge() when emitting the depth/stencil descriptor */
3660 pan_cast_and_pack_nodefaults(&so->desc, DEPTH_STENCIL, cfg) {
3661 cfg.front_compare_function = (enum mali_func)front.func;
3662 cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
3663 cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
3664 cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);
3665
3666 cfg.back_compare_function = (enum mali_func)back.func;
3667 cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
3668 cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
3669 cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);
3670
3671 cfg.stencil_test_enable = front.enabled;
3672 cfg.front_write_mask = front.writemask;
3673 cfg.back_write_mask = back.writemask;
3674 cfg.front_value_mask = front.valuemask;
3675 cfg.back_value_mask = back.valuemask;
3676
3677 cfg.depth_write_enable = zsa->depth_writemask;
3678 cfg.depth_function = depth_func;
3679 }
3680 #endif
3681
3682 so->enabled = zsa->stencil[0].enabled ||
3683 (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);
3684
3685 so->zs_always_passes = pipe_zs_always_passes(zsa);
3686 so->writes_zs = util_writes_depth_stencil(zsa);
3687
3688 /* TODO: Bounds test should be easy */
3689 assert(!zsa->depth_bounds_test);
3690
3691 return so;
3692 }
3693
3694 static struct pipe_sampler_view *
3695 panfrost_create_sampler_view(struct pipe_context *pctx,
3696 struct pipe_resource *texture,
3697 const struct pipe_sampler_view *template)
3698 {
3699 struct panfrost_context *ctx = pan_context(pctx);
3700 struct panfrost_sampler_view *so =
3701 rzalloc(pctx, struct panfrost_sampler_view);
3702
3703 pan_legalize_format(ctx, pan_resource(texture), template->format, false,
3704 false);
3705
3706 pipe_reference(NULL, &texture->reference);
3707
3708 so->base = *template;
3709 so->base.texture = texture;
3710 so->base.reference.count = 1;
3711 so->base.context = pctx;
3712
3713 panfrost_create_sampler_view_bo(so, pctx, texture);
3714
3715 return (struct pipe_sampler_view *)so;
3716 }
3717
3718 /* A given Gallium blend state can be encoded to the hardware in numerous,
3719 * dramatically divergent ways due to the interactions of blending with
3720 * framebuffer formats. Conceptually, there are two modes:
3721 *
3722 * - Fixed-function blending (for suitable framebuffer formats, suitable blend
3723 * state, and suitable blend constant)
3724 *
3725 * - Blend shaders (for everything else)
3726 *
3727 * A given Gallium blend configuration will compile to exactly one
3728 * fixed-function blend state, if it compiles to any, although the constant
3729 * will vary across runs as that is tracked outside of the Gallium CSO.
3730 *
3731 * However, that same blend configuration will compile to many different blend
3732 * shaders, depending on the framebuffer formats active. The rationale is that
3733 * blend shaders override not just fixed-function blending but also
3734 * fixed-function format conversion, so blend shaders are keyed to a particular
3735 * framebuffer format. As an example, the tilebuffer format is identical for
3736 * RG16F and RG16UI -- both are simply 32-bit raw pixels -- so both require
3737 * blend shaders.
3738 *
3739 * All of this state is encapsulated in the panfrost_blend_state struct
3740 * (our subclass of pipe_blend_state).
3741 */
3742
3743 /* Create a blend CSO. Essentially, try to compile a fixed-function
3744 * expression and initialize blend shaders */
3745
3746 static void *
3747 panfrost_create_blend_state(struct pipe_context *pipe,
3748 const struct pipe_blend_state *blend)
3749 {
3750 struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
3751 so->base = *blend;
3752
3753 so->pan.logicop_enable = blend->logicop_enable;
3754 so->pan.logicop_func = blend->logicop_func;
3755 so->pan.rt_count = blend->max_rt + 1;
3756 so->pan.alpha_to_one = blend->alpha_to_one;
3757
3758 for (unsigned c = 0; c < so->pan.rt_count; ++c) {
3759 unsigned g = blend->independent_blend_enable ? c : 0;
3760 const struct pipe_rt_blend_state pipe = blend->rt[g];
3761 struct pan_blend_equation equation = {0};
3762
3763 equation.color_mask = pipe.colormask;
3764 equation.blend_enable = pipe.blend_enable;
3765
3766 if (pipe.blend_enable) {
3767 equation.rgb_func = pipe.rgb_func;
3768 equation.rgb_src_factor = pipe.rgb_src_factor;
3769 equation.rgb_dst_factor = pipe.rgb_dst_factor;
3770 equation.alpha_func = pipe.alpha_func;
3771 equation.alpha_src_factor = pipe.alpha_src_factor;
3772 equation.alpha_dst_factor = pipe.alpha_dst_factor;
3773 }
3774
3775 /* Determine some common properties */
3776 unsigned constant_mask = pan_blend_constant_mask(equation);
3777 const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);
3778 so->info[c] = (struct pan_blend_info){
3779 .enabled = (equation.color_mask != 0) &&
3780 !(blend->logicop_enable &&
3781 blend->logicop_func == PIPE_LOGICOP_NOOP),
3782 .opaque = !blend->logicop_enable && pan_blend_is_opaque(equation),
3783 .constant_mask = constant_mask,
3784
3785 /* TODO: check the dest for the logicop */
3786 .load_dest = blend->logicop_enable || pan_blend_reads_dest(equation),
3787
3788 /* Could this possibly be fixed-function? */
3789 .fixed_function =
3790 !blend->logicop_enable &&
3791 pan_blend_can_fixed_function(equation, supports_2src) &&
3792 (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)),
3793
3794 .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
3795 .alpha_one_store = pan_blend_alpha_one_store(equation),
3796 };
3797
3798 so->pan.rts[c].equation = equation;
3799
3800 /* Bifrost needs to know if any render target loads its
3801 * destination in the hot draw path, so precompute this */
3802 if (so->info[c].load_dest)
3803 so->load_dest_mask |= BITFIELD_BIT(c);
3804
3805 /* Likewise, precompute which render targets are enabled at all, so
3806 * the hot draw path doesn't have to re-derive it */
3807 if (so->info[c].enabled)
3808 so->enabled_mask |= BITFIELD_BIT(c);
3809
3810 /* Converting equations to Mali style is expensive, so do it at
3811 * CSO create time instead of draw time */
3812 if (so->info[c].fixed_function) {
3813 so->equation[c] = pan_pack_blend(equation);
3814 }
3815 }
3816
3817 return so;
3818 }
3819
3820 #if PAN_ARCH >= 9
3821 static enum mali_flush_to_zero_mode
3822 panfrost_ftz_mode(struct pan_shader_info *info)
3823 {
3824 if (info->ftz_fp32) {
3825 if (info->ftz_fp16)
3826 return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
3827 else
3828 return MALI_FLUSH_TO_ZERO_MODE_DX11;
3829 } else {
3830 /* We don't have a "flush FP16, preserve FP32" mode, but APIs
3831 * should not be able to generate that.
3832 */
3833 assert(!info->ftz_fp16 && !info->ftz_fp32);
3834 return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
3835 }
3836 }
3837 #endif
3838
3839 static void
3840 prepare_shader(struct panfrost_compiled_shader *state,
3841 struct panfrost_pool *pool, bool upload)
3842 {
3843 #if PAN_ARCH <= 7
3844 struct mali_renderer_state_packed *out =
3845 (struct mali_renderer_state_packed *)&state->partial_rsd;
3846
3847 if (upload) {
3848 struct panfrost_ptr ptr =
3849 pan_pool_alloc_desc(&pool->base, RENDERER_STATE);
3850
3851 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3852 out = ptr.cpu;
3853 }
3854
3855 pan_pack(out, RENDERER_STATE, cfg) {
3856 pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
3857 }
3858 #else
3859 assert(upload);
3860
3861 /* The address in the shader program descriptor must be non-null, but
3862 * the entire shader program descriptor may be omitted.
3863 *
3864 * See dEQP-GLES31.functional.compute.basic.empty
3865 */
3866 if (!state->bin.gpu)
3867 return;
3868
3869 bool vs = (state->info.stage == MESA_SHADER_VERTEX);
3870 bool secondary_enable = (vs && state->info.vs.secondary_enable);
3871
3872 unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;
3873 struct panfrost_ptr ptr =
3874 pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM);
3875
3876 state->state = panfrost_pool_take_ref(pool, ptr.gpu);
3877
3878 struct mali_shader_program_packed *programs = ptr.cpu;
3879
3880 /* Generic, or IDVS/points */
3881 pan_cast_and_pack(&programs[0], SHADER_PROGRAM, cfg) {
3882 cfg.stage = pan_shader_stage(&state->info);
3883
3884 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
3885 cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
3886 else if (vs)
3887 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
3888
3889 cfg.register_allocation =
3890 pan_register_allocation(state->info.work_reg_count);
3891 cfg.binary = state->bin.gpu;
3892 cfg.preload.r48_r63 = (state->info.preload >> 48);
3893 cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3894
3895 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
3896 cfg.requires_helper_threads = state->info.contains_barrier;
3897 }
3898
3899 if (!vs)
3900 return;
3901
3902 /* IDVS/triangles */
3903 pan_pack(&programs[1], SHADER_PROGRAM, cfg) {
3904 cfg.stage = pan_shader_stage(&state->info);
3905 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
3906 cfg.register_allocation =
3907 pan_register_allocation(state->info.work_reg_count);
3908 cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
3909 cfg.preload.r48_r63 = (state->info.preload >> 48);
3910 cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3911 }
3912
3913 if (!secondary_enable)
3914 return;
3915
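/* IDVS secondary program (the varying-shader portion of the split vertex
 * shader), with its own register count and preload set. */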
3916 pan_pack(&programs[2], SHADER_PROGRAM, cfg) {
3917 unsigned work_count = state->info.vs.secondary_work_reg_count;
3918
3919 cfg.stage = pan_shader_stage(&state->info);
3920 cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
3921 cfg.register_allocation = pan_register_allocation(work_count);
3922 cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
3923 cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
3924 cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
3925 }
3926 #endif
3927 }
3928
3929 static void
3930 screen_destroy(struct pipe_screen *pscreen)
3931 {
3932 struct panfrost_device *dev = pan_device(pscreen);
3933 GENX(pan_fb_preload_cache_cleanup)(&dev->fb_preload_cache);
3934 }
3935
3936 static void
3937 panfrost_sampler_view_destroy(struct pipe_context *pctx,
3938 struct pipe_sampler_view *pview)
3939 {
3940 struct panfrost_sampler_view *view = (struct panfrost_sampler_view *)pview;
3941
3942 pipe_resource_reference(&pview->texture, NULL);
3943 panfrost_bo_unreference(view->state.bo);
3944 ralloc_free(view);
3945 }
3946
3947 static void
3948 context_populate_vtbl(struct pipe_context *pipe)
3949 {
3950 pipe->draw_vbo = panfrost_draw_vbo;
3951 pipe->launch_grid = panfrost_launch_grid;
3952
3953 pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
3954 pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
3955 pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
3956 pipe->create_sampler_view = panfrost_create_sampler_view;
3957 pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
3958 pipe->create_sampler_state = panfrost_create_sampler_state;
3959 pipe->create_blend_state = panfrost_create_blend_state;
3960
3961 pipe->get_sample_position = u_default_get_sample_position;
3962 }
3963
3964 static void
3965 context_init(struct panfrost_context *ctx)
3966 {
3967 }
3968
3969 static void
3970 context_cleanup(struct panfrost_context *ctx)
3971 {
3972 }
3973
3974 #if PAN_ARCH <= 5
3975
3976 /* Returns the polygon list's GPU address if available, or otherwise
3977 * allocates the polygon list. It is perfectly fast to allocate and free the
3978 * BO directly, since we'll hit the BO cache and this is one-per-batch anyway. */
3979
3980 static uint64_t
3981 batch_get_polygon_list(struct panfrost_batch *batch)
3982 {
3983 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
3984
3985 if (!batch->tiler_ctx.midgard.polygon_list) {
3986 bool has_draws = batch->draw_count > 0;
3987 unsigned size = panfrost_tiler_get_polygon_list_size(
3988 batch->key.width, batch->key.height, batch->vertex_count,
3989 !dev->model->quirks.no_hierarchical_tiling);
3990
3991 /* Create the BO as invisible if we can. If there are no draws,
3992 * we need to write the polygon list manually because there's
3993 * no WRITE_VALUE job in the chain
3994 */
3995 bool init_polygon_list = !has_draws;
3996 batch->polygon_list_bo = panfrost_batch_create_bo(
3997 batch, size, init_polygon_list ? 0 : PAN_BO_INVISIBLE,
3998 PIPE_SHADER_VERTEX, "Polygon list");
3999 batch->tiler_ctx.midgard.polygon_list = batch->polygon_list_bo->ptr.gpu;
4000 panfrost_batch_add_bo(batch, batch->polygon_list_bo,
4001 PIPE_SHADER_FRAGMENT);
4002
4003 if (init_polygon_list && dev->model->quirks.no_hierarchical_tiling) {
4004 assert(batch->polygon_list_bo->ptr.cpu);
4005 uint32_t *polygon_list_body =
4006 batch->polygon_list_bo->ptr.cpu +
4007 MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;
4008
4009 /* Magic for Mali T720 */
4010 polygon_list_body[0] = 0xa0000000;
4011 } else if (init_polygon_list) {
4012 assert(batch->polygon_list_bo->ptr.cpu);
4013 uint32_t *header = batch->polygon_list_bo->ptr.cpu;
4014 memset(header, 0, size);
4015 }
4016
4017 batch->tiler_ctx.midgard.disable = !has_draws;
4018 batch->tiler_ctx.midgard.no_hierarchical_tiling =
4019 dev->model->quirks.no_hierarchical_tiling;
4020 batch->tiler_ctx.midgard.heap.start = dev->tiler_heap->ptr.gpu;
4021 batch->tiler_ctx.midgard.heap.size = panfrost_bo_size(dev->tiler_heap);
4022 }
4023
4024 return batch->tiler_ctx.midgard.polygon_list;
4025 }
4026 #endif
4027
4028 static void
4029 init_polygon_list(struct panfrost_batch *batch)
4030 {
4031 #if PAN_ARCH <= 5
4032 uint64_t polygon_list = batch_get_polygon_list(batch);
4033 pan_jc_initialize_tiler(&batch->pool.base, &batch->jm.jobs.vtc_jc,
4034 polygon_list);
4035 #endif
4036 }
4037
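/* Finalize and submit a batch: set up the tiler and framebuffer preload,
 * initialize the polygon list on Midgard, emit TLS, and, if the batch has a
 * fragment job, emit the framebuffer descriptor and fragment job before
 * handing the batch to the JM/CSF backend. */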
4038 static int
4039 submit_batch(struct panfrost_batch *batch, struct pan_fb_info *fb)
4040 {
4041 JOBX(prepare_tiler)(batch, fb);
4042 JOBX(preload_fb)(batch, fb);
4043 init_polygon_list(batch);
4044
4045 /* Now that all draws are in, we can finally prepare the
4046 * FBD for the batch (if there is one). */
4047
4048 emit_tls(batch);
4049
4050 if (panfrost_has_fragment_job(batch)) {
4051 emit_fbd(batch, fb);
4052 emit_fragment_job(batch, fb);
4053 }
4054
4055 return JOBX(submit_batch)(batch);
4056 }
4057
4058 static void
4059 emit_write_timestamp(struct panfrost_batch *batch,
4060 struct panfrost_resource *dst, unsigned offset)
4061 {
4062 batch->need_job_req_cycle_count = true;
4063 batch->has_time_query = true;
4064
4065 JOBX(emit_write_timestamp)(batch, dst, offset);
4066 }
4067
4068 void
4069 GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
4070 {
4071 struct panfrost_device *dev = &screen->dev;
4072
4073 screen->vtbl.prepare_shader = prepare_shader;
4074 screen->vtbl.screen_destroy = screen_destroy;
4075 screen->vtbl.context_populate_vtbl = context_populate_vtbl;
4076 screen->vtbl.context_init = JOBX(init_context);
4077 screen->vtbl.context_cleanup = JOBX(cleanup_context);
4078 screen->vtbl.init_batch = JOBX(init_batch);
4079 screen->vtbl.cleanup_batch = JOBX(cleanup_batch);
4080 screen->vtbl.submit_batch = submit_batch;
4081 screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
4082 screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
4083 screen->vtbl.compile_shader = GENX(pan_shader_compile);
4084 screen->vtbl.afbc_size = panfrost_afbc_size;
4085 screen->vtbl.afbc_pack = panfrost_afbc_pack;
4086 screen->vtbl.emit_write_timestamp = emit_write_timestamp;
4087 screen->vtbl.select_tile_size = GENX(pan_select_tile_size);
4088
4089 GENX(pan_fb_preload_cache_init)
4090 (&dev->fb_preload_cache, panfrost_device_gpu_id(dev), &dev->blend_shaders,
4091 &screen->mempools.bin.base, &screen->mempools.desc.base);
4092
4093 #if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
4094 pan_indirect_dispatch_meta_init(
4095 &dev->indirect_dispatch, panfrost_device_gpu_id(dev),
4096 &screen->mempools.bin.base, &screen->mempools.desc.base);
4097 #endif
4098 }
4099