/*
 * Copyright (C) 2020-2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 *   Boris Brezillon <boris.brezillon@collabora.com>
 */

#include <math.h>
#include <stdio.h>
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "pan_blend.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_fb_preload.h"
#include "pan_jc.h"
#include "pan_pool.h"
#include "pan_shader.h"
#include "pan_texture.h"

#if PAN_ARCH >= 6
/* On Midgard, the native preload infrastructure (via MFBD preloads) is broken
 * or missing in many cases. We instead use software paths as fallbacks, which
 * are done as TILER jobs. No vertex shader is necessary since we can supply
 * screen-space coordinates directly.
 *
 * This is primarily designed as a fallback for preloads but could be extended
 * for other clears/blits if needed in the future. */
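
/* Illustrative sketch (not part of the driver logic): "supplying screen-space
 * coordinates directly" means the position buffer for the preload draw is a
 * pre-transformed quad covering the framebuffer, e.g. for a hypothetical
 * 640x480 target:
 *
 *    float rect[] = {
 *         0.0,   0.0, 0, 1.0,
 *       640.0,   0.0, 0, 1.0,
 *         0.0, 480.0, 0, 1.0,
 *       640.0, 480.0, 0, 1.0,
 *    };
 *
 * matching the rect[] uploaded by GENX(pan_preload_fb) at the bottom of this
 * file, so no vertex shader ever runs. */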

static enum mali_register_file_format
nir_type_to_reg_fmt(nir_alu_type in)
{
   switch (in) {
   case nir_type_float32:
      return MALI_REGISTER_FILE_FORMAT_F32;
   case nir_type_int32:
      return MALI_REGISTER_FILE_FORMAT_I32;
   case nir_type_uint32:
      return MALI_REGISTER_FILE_FORMAT_U32;
   default:
      unreachable("Invalid type");
   }
}
#endif

/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index
 * of the resource within the table. For simplicity, we put one type of
 * resource in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary.
 */
enum pan_preload_resource_table {
   PAN_BLIT_TABLE_ATTRIBUTE = 0,
   PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
   PAN_BLIT_TABLE_SAMPLER,
   PAN_BLIT_TABLE_TEXTURE,

   PAN_BLIT_NUM_RESOURCE_TABLES
};
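
/* For example (illustrative), a shader-visible handle for the first texture
 * of the preload shader is composed as
 *
 *    pan_res_handle(PAN_BLIT_TABLE_TEXTURE, 0)
 *
 * i.e. the table index combined with the index within that table, which is
 * exactly what the *_hw_index() helpers below do on PAN_ARCH >= 9. */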

struct pan_preload_surface {
   gl_frag_result loc : 4;
   nir_alu_type type : 8;
   enum mali_texture_dimension dim : 2;
   bool array : 1;
   unsigned samples : 5;
};

struct pan_preload_shader_key {
   struct pan_preload_surface surfaces[8];
};

struct pan_preload_shader_data {
   struct pan_preload_shader_key key;
   struct pan_shader_info info;
   uint64_t address;
   unsigned blend_ret_offsets[8];
   nir_alu_type blend_types[8];
};

struct pan_preload_blend_shader_key {
   enum pipe_format format;
   nir_alu_type type;
   unsigned rt : 3;
   unsigned nr_samples : 5;
   unsigned pad : 24;
};

struct pan_preload_blend_shader_data {
   struct pan_preload_blend_shader_key key;
   uint64_t address;
};

struct pan_preload_rsd_key {
   struct {
      enum pipe_format format;
      nir_alu_type type : 8;
      unsigned samples : 5;
      enum mali_texture_dimension dim : 2;
      bool array : 1;
   } rts[8], z, s;
};

struct pan_preload_rsd_data {
   struct pan_preload_rsd_key key;
   uint64_t address;
};
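
/* Note (illustrative): these key structs are used as raw hash-table keys via
 * DERIVE_HASH_TABLE() at the end of this file, which hashes and compares the
 * key bytes directly. That is why lookups build keys with = {0} / rzalloc and
 * why pan_preload_blend_shader_key carries an explicit pad bitfield: stray
 * padding bytes would break hashing and comparison. */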

#if PAN_ARCH >= 5
static void
pan_preload_emit_blend(unsigned rt, const struct pan_image_view *iview,
                       const struct pan_preload_shader_data *preload_shader,
                       uint64_t blend_shader, struct mali_blend_packed *out)
{
   assert(blend_shader == 0 || PAN_ARCH <= 5);

   pan_pack(out, BLEND, cfg) {
      if (!iview) {
         cfg.enable = false;
#if PAN_ARCH >= 6
         cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
         continue;
      }

      cfg.round_to_fb_precision = true;
      cfg.srgb = util_format_is_srgb(iview->format);

#if PAN_ARCH >= 6
      cfg.internal.mode = MALI_BLEND_MODE_OPAQUE;
#endif

      if (!blend_shader) {
         cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.color_mask = 0xf;

#if PAN_ARCH >= 6
         nir_alu_type type = preload_shader->key.surfaces[rt].type;

         cfg.internal.fixed_function.num_comps = 4;
         cfg.internal.fixed_function.conversion.memory_format = GENX(
            panfrost_dithered_format_from_pipe_format)(iview->format, false);
         cfg.internal.fixed_function.conversion.register_format =
            nir_type_to_reg_fmt(type);

         cfg.internal.fixed_function.rt = rt;
#endif
      } else {
#if PAN_ARCH <= 5
         cfg.blend_shader = true;
         cfg.shader_pc = blend_shader;
#endif
      }
   }
}
#endif

struct pan_preload_views {
   unsigned rt_count;
   const struct pan_image_view *rts[8];
   const struct pan_image_view *z;
   const struct pan_image_view *s;
};

static bool
pan_preload_is_ms(struct pan_preload_views *views)
{
   for (unsigned i = 0; i < views->rt_count; i++) {
      if (views->rts[i]) {
         if (pan_image_view_get_nr_samples(views->rts[i]) > 1)
            return true;
      }
   }

   if (views->z && pan_image_view_get_nr_samples(views->z) > 1)
      return true;

   if (views->s && pan_image_view_get_nr_samples(views->s) > 1)
      return true;

   return false;
}

#if PAN_ARCH >= 5
static void
pan_preload_emit_blends(const struct pan_preload_shader_data *preload_shader,
                        struct pan_preload_views *views,
                        uint64_t *blend_shaders, struct mali_blend_packed *out)
{
   for (unsigned i = 0; i < MAX2(views->rt_count, 1); ++i) {
      const struct pan_image_view *rt_view = views->rts[i];
      uint64_t blend_shader = blend_shaders ? blend_shaders[i] : 0;

      pan_preload_emit_blend(i, rt_view, preload_shader, blend_shader,
                             &out[i]);
   }
}
#endif

#if PAN_ARCH <= 7
static void
pan_preload_emit_rsd(const struct pan_preload_shader_data *preload_shader,
                     struct pan_preload_views *views, uint64_t *blend_shaders,
                     struct mali_renderer_state_packed *out)
{
   UNUSED bool zs = (views->z || views->s);
   bool ms = pan_preload_is_ms(views);

   pan_pack(out, RENDERER_STATE, cfg) {
      assert(preload_shader->address);
      pan_shader_prepare_rsd(&preload_shader->info, preload_shader->address,
                             &cfg);

      cfg.multisample_misc.sample_mask = 0xFFFF;
      cfg.multisample_misc.multisample_enable = ms;
      cfg.multisample_misc.evaluate_per_sample = ms;
      cfg.multisample_misc.depth_write_mask = views->z != NULL;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;

      cfg.stencil_mask_misc.stencil_enable = views->s != NULL;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

#if PAN_ARCH >= 6
      if (zs) {
         /* Writing Z/S requires late updates */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      }

      /* However, while shaders writing Z/S can normally be killed, on v6
       * for frame shaders it can cause GPU timeouts, so only allow colour
       * preload shaders to be killed. */
      cfg.properties.allow_forward_pixel_to_kill = !zs;

      if (PAN_ARCH == 6)
         cfg.properties.allow_forward_pixel_to_be_killed = !zs;
#else

      uint64_t blend_shader =
         blend_shaders
            ? panfrost_last_nonnull(blend_shaders, MAX2(views->rt_count, 1))
            : 0;

      cfg.properties.work_register_count = 4;
      cfg.properties.force_early_z = !zs;
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;

      /* Set even on v5 for erratum workaround */
#if PAN_ARCH == 5
      cfg.legacy_blend_shader = blend_shader;
#else
      cfg.blend_shader = blend_shader;
      cfg.stencil_mask_misc.write_enable = true;
      cfg.stencil_mask_misc.dither_disable = true;
      cfg.multisample_misc.blend_shader = !!blend_shader;
      if (!cfg.multisample_misc.blend_shader) {
         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_constant = 0;

         if (views->rts[0] != NULL) {
            cfg.stencil_mask_misc.srgb =
               util_format_is_srgb(views->rts[0]->format);
            cfg.blend_equation.color_mask = 0xf;
         }
      }
#endif
#endif
   }

#if PAN_ARCH >= 5
   pan_preload_emit_blends(preload_shader, views, blend_shaders,
                           (void *)((uint8_t *)out + pan_size(RENDERER_STATE)));
#endif
}
#endif

#if PAN_ARCH <= 5
static void
pan_preload_get_blend_shaders(struct pan_fb_preload_cache *cache,
                              unsigned rt_count,
                              const struct pan_image_view **rts,
                              const struct pan_preload_shader_data *preload_shader,
                              uint64_t *blend_shaders)
{
   if (!rt_count)
      return;

   struct pan_blend_state blend_state = {
      .rt_count = rt_count,
   };

   for (unsigned i = 0; i < rt_count; i++) {
      if (!rts[i] || panfrost_blendable_formats_v7[rts[i]->format].internal)
         continue;

      struct pan_preload_blend_shader_key key = {
         .format = rts[i]->format,
         .rt = i,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .type = preload_shader->blend_types[i],
      };

      pthread_mutex_lock(&cache->shaders.lock);
      struct hash_entry *he =
         _mesa_hash_table_search(cache->shaders.blend, &key);
      struct pan_preload_blend_shader_data *blend_shader = he ? he->data : NULL;
      if (blend_shader) {
         blend_shaders[i] = blend_shader->address;
         pthread_mutex_unlock(&cache->shaders.lock);
         continue;
      }

      blend_shader =
         rzalloc(cache->shaders.blend, struct pan_preload_blend_shader_data);
      blend_shader->key = key;

      blend_state.rts[i] = (struct pan_blend_rt_state){
         .format = rts[i]->format,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .equation =
            {
               .blend_enable = false,
               .color_mask = 0xf,
            },
      };

      pthread_mutex_lock(&cache->blend_shader_cache->lock);
      struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)(
         cache->blend_shader_cache, &blend_state,
         preload_shader->blend_types[i], nir_type_float32, /* unused */
         i);

      assert(b->work_reg_count <= 4);
      struct panfrost_ptr bin =
         pan_pool_alloc_aligned(cache->shaders.pool, b->binary.size, 64);
      memcpy(bin.cpu, b->binary.data, b->binary.size);

      blend_shader->address = bin.gpu | b->first_tag;
      pthread_mutex_unlock(&cache->blend_shader_cache->lock);
      _mesa_hash_table_insert(cache->shaders.blend, &blend_shader->key,
                              blend_shader);
      pthread_mutex_unlock(&cache->shaders.lock);
      blend_shaders[i] = blend_shader->address;
   }
}
#endif

/*
 * Early Mali GPUs did not respect sampler LOD clamps or bias, so the Midgard
 * compiler inserts lowering code with a load_sampler_lod_parameters_pan
 * sysval that we need to lower. Our samplers do not use LOD clamps or bias,
 * so we lower to the identity settings and let constant folding get rid of
 * the unnecessary lowering.
 */
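
/* Illustrative sketch of what the pass below does in NIR terms: every
 *
 *    vec3 32 %params = @load_sampler_lod_parameters_pan (...)
 *
 * is rewritten to the identity constant (min_lod, max_lod, lod_bias) =
 * (0.0, +INF, 0.0), after which constant folding eliminates the clamp/bias
 * ALU that consumed it. */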
static bool
lower_sampler_parameters(nir_builder *b, nir_intrinsic_instr *intr,
                         UNUSED void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_sampler_lod_parameters_pan)
      return false;

   const nir_const_value constants[4] = {
      nir_const_value_for_float(0.0f, 32),     /* min_lod */
      nir_const_value_for_float(INFINITY, 32), /* max_lod */
      nir_const_value_for_float(0.0f, 32),     /* lod_bias */
   };

   b->cursor = nir_after_instr(&intr->instr);
   nir_def_rewrite_uses(&intr->def, nir_build_imm(b, 3, 32, constants));
   return true;
}

static uint32_t
sampler_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_SAMPLER, index) : index;
}

static uint32_t
tex_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_TEXTURE, index) : index;
}

static uint32_t
attr_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_ATTRIBUTE, index)
                        : index;
}

static const struct pan_preload_shader_data *
pan_preload_get_shader(struct pan_fb_preload_cache *cache,
                       const struct pan_preload_shader_key *key)
{
   pthread_mutex_lock(&cache->shaders.lock);
   struct hash_entry *he =
      _mesa_hash_table_search(cache->shaders.preload, key);
   struct pan_preload_shader_data *shader = he ? he->data : NULL;

   if (shader)
      goto out;

   unsigned coord_comps = 0;
   unsigned sig_offset = 0;
   char sig[256];
   bool first = true;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      const char *type_str, *dim_str;
      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      switch (key->surfaces[i].type) {
      case nir_type_float32:
         type_str = "float";
         break;
      case nir_type_uint32:
         type_str = "uint";
         break;
      case nir_type_int32:
         type_str = "int";
         break;
      default:
         unreachable("Invalid type");
      }

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_CUBE:
         dim_str = "cube";
         break;
      case MALI_TEXTURE_DIMENSION_1D:
         dim_str = "1D";
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         dim_str = "2D";
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         dim_str = "3D";
         break;
      default:
         unreachable("Invalid dim");
      }

      coord_comps = MAX2(coord_comps, (key->surfaces[i].dim ?: 3) +
                                         (key->surfaces[i].array ? 1 : 0));

      if (sig_offset >= sizeof(sig)) {
         first = false;
         continue;
      }

      sig_offset +=
         snprintf(sig + sig_offset, sizeof(sig) - sig_offset,
                  "%s[%s;%s;%s%s;samples=%d]",
                  first ? "" : ",", gl_frag_result_name(key->surfaces[i].loc),
                  type_str, dim_str, key->surfaces[i].array ? "[]" : "",
                  key->surfaces[i].samples);

      first = false;
   }

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(),
      "pan_preload(%s)", sig);

   nir_def *barycentric = nir_load_barycentric(
      &b, nir_intrinsic_load_barycentric_pixel, INTERP_MODE_SMOOTH);
   nir_def *coord = nir_load_interpolated_input(
      &b, coord_comps, 32, barycentric, nir_imm_int(&b, 0),
      .base = attr_hw_index(0), .dest_type = nir_type_float32,
      .io_semantics.location = VARYING_SLOT_VAR0, .io_semantics.num_slots = 1);

   unsigned active_count = 0;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      bool ms = key->surfaces[i].samples > 1;
      enum glsl_sampler_dim sampler_dim;

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_1D:
         sampler_dim = GLSL_SAMPLER_DIM_1D;
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         sampler_dim = ms ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D;
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         sampler_dim = GLSL_SAMPLER_DIM_3D;
         break;
      case MALI_TEXTURE_DIMENSION_CUBE:
         sampler_dim = GLSL_SAMPLER_DIM_CUBE;
         break;
      }

      nir_tex_instr *tex = nir_tex_instr_create(b.shader, ms ? 3 : 1);

      tex->dest_type = key->surfaces[i].type;
      tex->texture_index = tex_hw_index(active_count);
      tex->sampler_index = sampler_hw_index(0);
      tex->is_array = key->surfaces[i].array;
      tex->sampler_dim = sampler_dim;

      if (ms) {
         tex->op = nir_texop_txf_ms;

         tex->src[0] =
            nir_tex_src_for_ssa(nir_tex_src_coord, nir_f2i32(&b, coord));
         tex->coord_components = coord_comps;

         tex->src[1] =
            nir_tex_src_for_ssa(nir_tex_src_ms_index, nir_load_sample_id(&b));

         tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(&b, 0));
      } else {
         tex->op = nir_texop_txl;

         tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
         tex->coord_components = coord_comps;
      }

      nir_def_init(&tex->instr, &tex->def, 4, 32);
      nir_builder_instr_insert(&b, &tex->instr);

      nir_def *res = &tex->def;

      if (key->surfaces[i].loc >= FRAG_RESULT_DATA0) {
         nir_store_output(
            &b, res, nir_imm_int(&b, 0), .base = active_count,
            .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1,
            .write_mask = nir_component_mask(res->num_components));
      } else {
         unsigned c = key->surfaces[i].loc == FRAG_RESULT_STENCIL ? 1 : 0;
         nir_store_output(
            &b, nir_channel(&b, res, c), nir_imm_int(&b, 0),
            .base = active_count, .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1, .write_mask = nir_component_mask(1));
      }
      active_count++;
   }

   struct panfrost_compile_inputs inputs = {
      .gpu_id = cache->gpu_id,
      .is_blit = true,
      .no_idvs = true,
   };
   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);

   shader = rzalloc(cache->shaders.preload, struct pan_preload_shader_data);

   nir_shader_gather_info(b.shader, nir_shader_get_entrypoint(b.shader));

   for (unsigned i = 0; i < active_count; ++i)
      BITSET_SET(b.shader->info.textures_used, i);

   pan_shader_preprocess(b.shader, inputs.gpu_id);

   if (PAN_ARCH == 4) {
      NIR_PASS(_, b.shader, nir_shader_intrinsics_pass,
               lower_sampler_parameters, nir_metadata_control_flow, NULL);
   }

   GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader->info);

   shader->key = *key;
   shader->address =
      pan_pool_upload_aligned(cache->shaders.pool, binary.data,
                              binary.size, PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

#if PAN_ARCH >= 6
   for (unsigned i = 0; i < ARRAY_SIZE(shader->blend_ret_offsets); i++) {
      shader->blend_ret_offsets[i] =
         shader->info.bifrost.blend[i].return_offset;
      shader->blend_types[i] = shader->info.bifrost.blend[i].type;
   }
#endif

   _mesa_hash_table_insert(cache->shaders.preload, &shader->key, shader);

out:
   pthread_mutex_unlock(&cache->shaders.lock);
   return shader;
}

static struct pan_preload_shader_key
pan_preload_get_key(struct pan_preload_views *views)
{
   struct pan_preload_shader_key key = {0};

   if (views->z) {
      key.surfaces[0].loc = FRAG_RESULT_DEPTH;
      key.surfaces[0].type = nir_type_float32;
      key.surfaces[0].samples = pan_image_view_get_nr_samples(views->z);
      key.surfaces[0].dim = views->z->dim;
      key.surfaces[0].array = views->z->first_layer != views->z->last_layer;
   }

   if (views->s) {
      key.surfaces[1].loc = FRAG_RESULT_STENCIL;
      key.surfaces[1].type = nir_type_uint32;
      key.surfaces[1].samples = pan_image_view_get_nr_samples(views->s);
      key.surfaces[1].dim = views->s->dim;
      key.surfaces[1].array = views->s->first_layer != views->s->last_layer;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->rts[i])
         continue;

      key.surfaces[i].loc = FRAG_RESULT_DATA0 + i;
      key.surfaces[i].type =
         util_format_is_pure_uint(views->rts[i]->format) ? nir_type_uint32
         : util_format_is_pure_sint(views->rts[i]->format)
            ? nir_type_int32
            : nir_type_float32;
      key.surfaces[i].samples =
         pan_image_view_get_nr_samples(views->rts[i]);
      key.surfaces[i].dim = views->rts[i]->dim;
      key.surfaces[i].array =
         views->rts[i]->first_layer != views->rts[i]->last_layer;
   }

   return key;
}
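
/* For example (illustrative): preloading a single-sampled 2D depth buffer
 * produces
 *
 *    key.surfaces[0] = {
 *       .loc = FRAG_RESULT_DEPTH,
 *       .type = nir_type_float32,
 *       .dim = MALI_TEXTURE_DIMENSION_2D,
 *       .samples = 1,
 *    };
 *
 * which is exactly the first entry of the prefill table in
 * pan_preload_prefill_preload_shader_cache() below. */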

#if PAN_ARCH <= 7
static uint64_t
pan_preload_get_rsd(struct pan_fb_preload_cache *cache,
                    struct pan_preload_views *views)
{
   struct pan_preload_rsd_key rsd_key = {0};

   assert(!views->rt_count || (!views->z && !views->s));

   struct pan_preload_shader_key preload_key = pan_preload_get_key(views);

   if (views->z) {
      rsd_key.z.format = views->z->format;
      rsd_key.z.type = preload_key.surfaces[0].type;
      rsd_key.z.samples = preload_key.surfaces[0].samples;
      rsd_key.z.dim = preload_key.surfaces[0].dim;
      rsd_key.z.array = preload_key.surfaces[0].array;
   }

   if (views->s) {
      rsd_key.s.format = views->s->format;
      rsd_key.s.type = preload_key.surfaces[1].type;
      rsd_key.s.samples = preload_key.surfaces[1].samples;
      rsd_key.s.dim = preload_key.surfaces[1].dim;
      rsd_key.s.array = preload_key.surfaces[1].array;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->rts[i])
         continue;

      rsd_key.rts[i].format = views->rts[i]->format;
      rsd_key.rts[i].type = preload_key.surfaces[i].type;
      rsd_key.rts[i].samples = preload_key.surfaces[i].samples;
      rsd_key.rts[i].dim = preload_key.surfaces[i].dim;
      rsd_key.rts[i].array = preload_key.surfaces[i].array;
   }

   pthread_mutex_lock(&cache->rsds.lock);
   struct hash_entry *he =
      _mesa_hash_table_search(cache->rsds.rsds, &rsd_key);
   struct pan_preload_rsd_data *rsd = he ? he->data : NULL;
   if (rsd)
      goto out;

   rsd = rzalloc(cache->rsds.rsds, struct pan_preload_rsd_data);
   rsd->key = rsd_key;

#if PAN_ARCH == 4
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc(cache->rsds.pool, RENDERER_STATE);
#else
   unsigned bd_count = PAN_ARCH >= 5 ? MAX2(views->rt_count, 1) : 0;
   struct panfrost_ptr rsd_ptr = pan_pool_alloc_desc_aggregate(
      cache->rsds.pool, PAN_DESC(RENDERER_STATE),
      PAN_DESC_ARRAY(bd_count, BLEND));
#endif

   if (!rsd_ptr.cpu) {
      /* Don't leave the cache lock held on the error path */
      pthread_mutex_unlock(&cache->rsds.lock);
      return 0;
   }

   uint64_t blend_shaders[8] = {0};

   const struct pan_preload_shader_data *preload_shader =
      pan_preload_get_shader(cache, &preload_key);

#if PAN_ARCH <= 5
   pan_preload_get_blend_shaders(cache, views->rt_count, views->rts,
                                 preload_shader, blend_shaders);
#endif

   pan_preload_emit_rsd(preload_shader, views, blend_shaders, rsd_ptr.cpu);
   rsd->address = rsd_ptr.gpu;
   _mesa_hash_table_insert(cache->rsds.rsds, &rsd->key, rsd);

out:
   pthread_mutex_unlock(&cache->rsds.lock);
   return rsd->address;
}
#endif

static struct pan_preload_views
pan_preload_get_views(const struct pan_fb_info *fb, bool zs,
                      struct pan_image_view *patched_s)
{
   struct pan_preload_views views = {0};

   if (zs) {
      if (fb->zs.preload.z)
         views.z = fb->zs.view.zs;

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

         if (fmt != view->format) {
            *patched_s = *view;
            patched_s->format = fmt;
            views.s = patched_s;
         } else {
            views.s = view;
         }
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            views.rts[i] = fb->rts[i].view;
      }

      views.rt_count = fb->rt_count;
   }

   return views;
}

static bool
pan_preload_needed(const struct pan_fb_info *fb, bool zs)
{
   if (zs) {
      if (fb->zs.preload.z || fb->zs.preload.s)
         return true;
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            return true;
      }
   }

   return false;
}

static uint64_t
pan_preload_emit_varying(struct pan_pool *pool)
{
   struct panfrost_ptr varying = pan_pool_alloc_desc(pool, ATTRIBUTE);

   if (!varying.cpu)
      return 0;

   pan_cast_and_pack(varying.cpu, ATTRIBUTE, cfg) {
      cfg.buffer_index = 0;
      cfg.offset_enable = PAN_ARCH <= 5;
      cfg.format =
         GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32G32B32_FLOAT)->hw;

#if PAN_ARCH >= 9
      cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
      cfg.table = PAN_BLIT_TABLE_ATTRIBUTE_BUFFER;
      cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
      cfg.stride = 4 * sizeof(float);
#endif
   }

   return varying.gpu;
}

static uint64_t
pan_preload_emit_varying_buffer(struct pan_pool *pool, uint64_t coordinates)
{
#if PAN_ARCH >= 9
   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc(pool, BUFFER);

   if (!varying_buffer.cpu)
      return 0;

   pan_cast_and_pack(varying_buffer.cpu, BUFFER, cfg) {
      cfg.address = coordinates;
      cfg.size = 4 * sizeof(float) * 4;
   }
#else
   /* Bifrost needs an empty desc to mark end of prefetching */
   bool padding_buffer = PAN_ARCH >= 6;

   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc_array(
      pool, (padding_buffer ? 2 : 1), ATTRIBUTE_BUFFER);

   if (!varying_buffer.cpu)
      return 0;

   pan_cast_and_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
      cfg.pointer = coordinates;
      cfg.stride = 4 * sizeof(float);
      cfg.size = cfg.stride * 4;
   }

   if (padding_buffer) {
      pan_cast_and_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
                        ATTRIBUTE_BUFFER, cfg)
         ;
   }
#endif

   return varying_buffer.gpu;
}

static uint64_t
pan_preload_emit_sampler(struct pan_pool *pool, bool nearest_filter)
{
   struct panfrost_ptr sampler = pan_pool_alloc_desc(pool, SAMPLER);

   if (!sampler.cpu)
      return 0;

   pan_cast_and_pack(sampler.cpu, SAMPLER, cfg) {
      cfg.seamless_cube_map = false;
      cfg.normalized_coordinates = false;
      cfg.minify_nearest = nearest_filter;
      cfg.magnify_nearest = nearest_filter;
   }

   return sampler.gpu;
}

static uint64_t
pan_preload_emit_textures(struct pan_pool *pool, const struct pan_fb_info *fb,
                          bool zs, unsigned *tex_count_out)
{
   const struct pan_image_view *views[8];
   struct pan_image_view patched_views[8];
   unsigned tex_count = 0;
   unsigned patched_count = 0;

   if (zs) {
      if (fb->zs.preload.z) {
         const struct pan_image_view *view = fb->zs.view.zs;
#if PAN_ARCH >= 7
         struct pan_image_view *pview = &patched_views[patched_count++];
         *pview = *view;
         /* v7+ doesn't have an _RRRR component order. */
         GENX(panfrost_texture_swizzle_replicate_x)(pview);
         view = pview;
#endif
         views[tex_count++] = view;
      }

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

#if PAN_ARCH >= 7
         struct pan_image_view *pview = &patched_views[patched_count++];
         *pview = *view;
         pview->format = fmt;
         /* v7+ doesn't have an _RRRR component order. */
         GENX(panfrost_texture_swizzle_replicate_x)(pview);
         view = pview;
#else
         if (fmt != view->format) {
            struct pan_image_view *pview = &patched_views[patched_count++];
            *pview = *view;
            pview->format = fmt;
            view = pview;
         }
#endif
         views[tex_count++] = view;
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload) {
            const struct pan_image_view *view = fb->rts[i].view;
#if PAN_ARCH == 7
            /* v7 requires AFBC reswizzle. */
            if (!panfrost_format_is_yuv(view->format) &&
                panfrost_format_supports_afbc(PAN_ARCH, view->format)) {
               struct pan_image_view *pview = &patched_views[patched_count++];
               *pview = *view;
               GENX(panfrost_texture_afbc_reswizzle)(pview);
               view = pview;
            }
#endif
            views[tex_count++] = view;
         }
      }
   }

   *tex_count_out = tex_count;

#if PAN_ARCH >= 6
   struct panfrost_ptr textures =
      pan_pool_alloc_desc_array(pool, tex_count, TEXTURE);

   if (!textures.cpu)
      return 0;

   for (unsigned i = 0; i < tex_count; i++) {
      void *texture = textures.cpu + (pan_size(TEXTURE) * i);
      size_t payload_size =
         GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr surfaces =
         pan_pool_alloc_aligned(pool, payload_size, 64);

      GENX(panfrost_new_texture)(views[i], texture, &surfaces);
   }

   return textures.gpu;
#else
   uint64_t textures[8] = {0};

   for (unsigned i = 0; i < tex_count; i++) {
      size_t sz = pan_size(TEXTURE) +
                  GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr texture =
         pan_pool_alloc_aligned(pool, sz, pan_alignment(TEXTURE));
      struct panfrost_ptr surfaces = {
         .cpu = texture.cpu + pan_size(TEXTURE),
         .gpu = texture.gpu + pan_size(TEXTURE),
      };

      GENX(panfrost_new_texture)(views[i], texture.cpu, &surfaces);
      textures[i] = texture.gpu;
   }

   return pan_pool_upload_aligned(pool, textures, tex_count * sizeof(uint64_t),
                                  sizeof(uint64_t));
#endif
}

#if PAN_ARCH >= 8
/* TODO: cache */
static uint64_t
pan_preload_emit_zs(struct pan_pool *pool, bool z, bool s)
{
   struct panfrost_ptr zsd = pan_pool_alloc_desc(pool, DEPTH_STENCIL);

   if (!zsd.cpu)
      return 0;

   pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
      cfg.depth_function = MALI_FUNC_ALWAYS;
      cfg.depth_write_enable = z;

      if (z)
         cfg.depth_source = MALI_DEPTH_SOURCE_SHADER;

      cfg.stencil_test_enable = s;
      cfg.stencil_from_shader = s;

      cfg.front_compare_function = MALI_FUNC_ALWAYS;
      cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.front_write_mask = 0xFF;
      cfg.front_value_mask = 0xFF;

      cfg.back_compare_function = MALI_FUNC_ALWAYS;
      cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.back_write_mask = 0xFF;
      cfg.back_value_mask = 0xFF;

      cfg.depth_cull_enable = false;
   }

   return zsd.gpu;
}
#else
static uint64_t
pan_preload_emit_viewport(struct pan_pool *pool, uint16_t minx, uint16_t miny,
                          uint16_t maxx, uint16_t maxy)
{
   struct panfrost_ptr vp = pan_pool_alloc_desc(pool, VIEWPORT);

   if (!vp.cpu)
      return 0;

   pan_cast_and_pack(vp.cpu, VIEWPORT, cfg) {
      cfg.scissor_minimum_x = minx;
      cfg.scissor_minimum_y = miny;
      cfg.scissor_maximum_x = maxx;
      cfg.scissor_maximum_y = maxy;
   }

   return vp.gpu;
}
#endif

static void
pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                     struct pan_fb_info *fb, bool zs, uint64_t coordinates,
                     uint64_t tsd, struct mali_draw_packed *out,
                     bool always_write)
{
   unsigned tex_count = 0;
   uint64_t textures = pan_preload_emit_textures(pool, fb, zs, &tex_count);
   uint64_t samplers = pan_preload_emit_sampler(pool, true);
   uint64_t varyings = pan_preload_emit_varying(pool);
   uint64_t varying_buffers =
      pan_preload_emit_varying_buffer(pool, coordinates);

   /* Tiles updated by preload shaders are still considered clean (tracked
    * separately for colour and Z/S), allowing us to suppress unnecessary
    * writeback.
    */
   UNUSED bool clean_fragment_write = !always_write;

   /* Image view used when patching stencil formats for combined
    * depth/stencil preloads.
    */
   struct pan_image_view patched_s;

   struct pan_preload_views views = pan_preload_get_views(fb, zs, &patched_s);

#if PAN_ARCH <= 7
   pan_pack(out, DRAW, cfg) {
      uint16_t minx = 0, miny = 0, maxx, maxy;

      if (PAN_ARCH == 4) {
         maxx = fb->width - 1;
         maxy = fb->height - 1;
      } else {
         /* Align on 32x32 tiles */
         minx = fb->extent.minx & ~31;
         miny = fb->extent.miny & ~31;
         maxx = MIN2(ALIGN_POT(fb->extent.maxx + 1, 32), fb->width) - 1;
         maxy = MIN2(ALIGN_POT(fb->extent.maxy + 1, 32), fb->height) - 1;
      }

      cfg.thread_storage = tsd;
      cfg.state = pan_preload_get_rsd(cache, &views);

      cfg.position = coordinates;
      cfg.viewport = pan_preload_emit_viewport(pool, minx, miny, maxx, maxy);

      cfg.varyings = varyings;
      cfg.varying_buffers = varying_buffers;
      cfg.textures = textures;
      cfg.samplers = samplers;

#if PAN_ARCH >= 6
      cfg.clean_fragment_write = clean_fragment_write;
#endif
   }
#else
   struct panfrost_ptr T;
   unsigned nr_tables = PAN_BLIT_NUM_RESOURCE_TABLES;

   /* Although individual resources need only 16 byte alignment, the
    * resource table as a whole must be 64-byte aligned.
    */
   T = pan_pool_alloc_aligned(pool, nr_tables * pan_size(RESOURCE), 64);
   memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));

   panfrost_make_resource_table(T, PAN_BLIT_TABLE_TEXTURE, textures, tex_count);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_SAMPLER, samplers, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE, varyings, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
                                varying_buffers, 1);

   struct pan_preload_shader_key key = pan_preload_get_key(&views);
   const struct pan_preload_shader_data *preload_shader =
      pan_preload_get_shader(cache, &key);

   bool z = fb->zs.preload.z;
   bool s = fb->zs.preload.s;
   bool ms = pan_preload_is_ms(&views);

   struct panfrost_ptr spd = pan_pool_alloc_desc(pool, SHADER_PROGRAM);

   if (!spd.cpu) {
      mesa_loge("pan_pool_alloc_desc failed");
      return;
   }

   pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
      cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
      cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
      cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
      cfg.binary = preload_shader->address;
      cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
   }

   unsigned bd_count = views.rt_count;
   struct panfrost_ptr blend = pan_pool_alloc_desc_array(pool, bd_count, BLEND);

   if (!blend.cpu) {
      mesa_loge("pan_pool_alloc_desc_array failed");
      return;
   }

   if (!zs) {
      pan_preload_emit_blends(preload_shader, &views, NULL, blend.cpu);
   }

   pan_pack(out, DRAW, cfg) {
      if (zs) {
         /* ZS_EMIT requires late update/kill */
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.blend_count = 0;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         cfg.blend = blend.gpu;
         cfg.blend_count = bd_count;
         cfg.render_target_mask = 0x1;
      }

      cfg.allow_forward_pixel_to_kill = !zs;
      cfg.allow_forward_pixel_to_be_killed = true;
      cfg.depth_stencil = pan_preload_emit_zs(pool, z, s);
      cfg.sample_mask = 0xFFFF;
      cfg.multisample_enable = ms;
      cfg.evaluate_per_sample = ms;
      cfg.maximum_z = 1.0;
      cfg.clean_fragment_write = clean_fragment_write;
      cfg.shader.resources = T.gpu | nr_tables;
      cfg.shader.shader = spd.gpu;
      cfg.shader.thread_storage = tsd;
   }
#endif
}
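
/* Note on the Valhall path above (illustrative): the DCD's resource pointer
 * packs the table address and the table count into a single word,
 *
 *    cfg.shader.resources = T.gpu | nr_tables;
 *
 * relying on the 64-byte alignment of the table allocation to leave the low
 * bits free for the count. */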

#if PAN_ARCH >= 6
static void
pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool,
                                   struct pan_fb_info *fb)
{
   if (fb->bifrost.pre_post.dcds.gpu)
      return;

   fb->bifrost.pre_post.dcds = pan_pool_alloc_desc_array(desc_pool, 3, DRAW);
}

static void
pan_preload_emit_pre_frame_dcd(struct pan_fb_preload_cache *cache,
                               struct pan_pool *desc_pool,
                               struct pan_fb_info *fb, bool zs, uint64_t coords,
                               uint64_t tsd)
{
   unsigned dcd_idx = zs ? 1 : 0;
   pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb);
   if (!fb->bifrost.pre_post.dcds.cpu) {
      mesa_loge("pan_preload_fb_alloc_pre_post_dcds failed");
      return;
   }

   void *dcd = fb->bifrost.pre_post.dcds.cpu + (dcd_idx * pan_size(DRAW));

   /* We only use crc_rt to determine whether to force writes for updating
    * the CRCs, so use a conservative tile size (16x16).
    */
   int crc_rt = GENX(pan_select_crc_rt)(fb, 16 * 16);

   bool always_write = false;

   /* If CRC data is currently invalid and this batch will make it valid,
    * write even clean tiles to make sure CRC data is updated. */
   if (crc_rt >= 0) {
      bool *valid = fb->rts[crc_rt].crc_valid;
      bool full = !fb->extent.minx && !fb->extent.miny &&
                  fb->extent.maxx == (fb->width - 1) &&
                  fb->extent.maxy == (fb->height - 1);

      if (full && !(*valid))
         always_write = true;
   }

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd, dcd,
                        always_write);
   if (zs) {
      enum pipe_format fmt = fb->zs.view.zs
                                ? fb->zs.view.zs->planes[0]->layout.format
                                : fb->zs.view.s->planes[0]->layout.format;
      bool always = false;

      /* If we're dealing with a combined ZS resource and only one
       * component is cleared, we need to reload the whole surface
       * because the zs_clean_pixel_write_enable flag is set in that
       * case.
       */
      if (util_format_is_depth_and_stencil(fmt) &&
          fb->zs.clear.z != fb->zs.clear.s)
         always = true;

      /* We could use INTERSECT on Bifrost v7 too, but EARLY_ZS_ALWAYS
       * has the advantage of reloading the ZS tile buffer one or more
       * tiles ahead, making ZS data immediately available for any ZS
       * tests taking place in other shaders. Things haven't been
       * benchmarked to determine what's preferable (saving bandwidth
       * vs having ZS preloaded earlier), so let's leave it like that
       * for now.
       */
      fb->bifrost.pre_post.modes[dcd_idx] =
         PAN_ARCH > 6 ? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS
         : always     ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                      : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   } else {
      fb->bifrost.pre_post.modes[dcd_idx] =
         always_write ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                      : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   }
}
#else
static struct panfrost_ptr
pan_preload_emit_tiler_job(struct pan_fb_preload_cache *cache,
                           struct pan_pool *desc_pool, struct pan_fb_info *fb,
                           bool zs, uint64_t coords, uint64_t tsd)
{
   struct panfrost_ptr job = pan_pool_alloc_desc(desc_pool, TILER_JOB);

   if (!job.cpu)
      return (struct panfrost_ptr){0};

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd,
                        pan_section_ptr(job.cpu, TILER_JOB, DRAW), false);

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   void *invoc = pan_section_ptr(job.cpu, TILER_JOB, INVOCATION);
   panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false);

   return job;
}
#endif

static struct panfrost_ptr
pan_preload_fb_part(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                    struct pan_fb_info *fb, bool zs, uint64_t coords,
                    uint64_t tsd)
{
   struct panfrost_ptr job = {0};

#if PAN_ARCH >= 6
   pan_preload_emit_pre_frame_dcd(cache, pool, fb, zs, coords, tsd);
#else
   job = pan_preload_emit_tiler_job(cache, pool, fb, zs, coords, tsd);
#endif
   return job;
}

unsigned
GENX(pan_preload_fb)(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                     struct pan_fb_info *fb, uint64_t tsd,
                     struct panfrost_ptr *jobs)
{
   bool preload_zs = pan_preload_needed(fb, true);
   bool preload_rts = pan_preload_needed(fb, false);
   uint64_t coords;

   if (!preload_zs && !preload_rts)
      return 0;

   float rect[] = {
      0.0,       0.0,        0, 1.0,
      fb->width, 0.0,        0, 1.0,
      0.0,       fb->height, 0, 1.0,
      fb->width, fb->height, 0, 1.0,
   };

   coords = pan_pool_upload_aligned(pool, rect, sizeof(rect), 64);

   unsigned njobs = 0;
   if (preload_zs) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, fb, true, coords, tsd);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   if (preload_rts) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, fb, false, coords, tsd);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   return njobs;
}
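
/* Illustrative usage sketch (hypothetical caller, not part of this file):
 *
 *    struct panfrost_ptr jobs[2];
 *    unsigned njobs = GENX(pan_preload_fb)(cache, pool, fb, tsd, jobs);
 *
 * On v5 and earlier, up to two tiler jobs (Z/S and colour) are returned and
 * must be chained into the job stream by the caller. On v6+ njobs is 0: the
 * preload is recorded as pre-frame DCDs in fb->bifrost.pre_post and runs as
 * part of the fragment job instead. */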

DERIVE_HASH_TABLE(pan_preload_shader_key);
DERIVE_HASH_TABLE(pan_preload_blend_shader_key);
DERIVE_HASH_TABLE(pan_preload_rsd_key);

static void
pan_preload_prefill_preload_shader_cache(struct pan_fb_preload_cache *cache)
{
   static const struct pan_preload_shader_key prefill[] = {
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DEPTH,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
      {
         .surfaces[1] =
            {
               .loc = FRAG_RESULT_STENCIL,
               .type = nir_type_uint32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DATA0,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
   };

   for (unsigned i = 0; i < ARRAY_SIZE(prefill); i++)
      pan_preload_get_shader(cache, &prefill[i]);
}

void
GENX(pan_fb_preload_cache_init)(
   struct pan_fb_preload_cache *cache, unsigned gpu_id,
   struct pan_blend_shader_cache *blend_shader_cache, struct pan_pool *bin_pool,
   struct pan_pool *desc_pool)
{
   cache->gpu_id = gpu_id;
   cache->shaders.preload = pan_preload_shader_key_table_create(NULL);
   cache->shaders.blend = pan_preload_blend_shader_key_table_create(NULL);
   cache->shaders.pool = bin_pool;
   pthread_mutex_init(&cache->shaders.lock, NULL);
   pan_preload_prefill_preload_shader_cache(cache);

   cache->rsds.pool = desc_pool;
   cache->rsds.rsds = pan_preload_rsd_key_table_create(NULL);
   pthread_mutex_init(&cache->rsds.lock, NULL);
   cache->blend_shader_cache = blend_shader_cache;
}

void
GENX(pan_fb_preload_cache_cleanup)(struct pan_fb_preload_cache *cache)
{
   _mesa_hash_table_destroy(cache->shaders.preload, NULL);
   _mesa_hash_table_destroy(cache->shaders.blend, NULL);
   pthread_mutex_destroy(&cache->shaders.lock);
   _mesa_hash_table_destroy(cache->rsds.rsds, NULL);
   pthread_mutex_destroy(&cache->rsds.lock);
}