/*
 * Copyright (C) 2020-2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *   Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 *   Boris Brezillon <boris.brezillon@collabora.com>
 */

#include <math.h>
#include <stdio.h>
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "pan_blend.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_fb_preload.h"
#include "pan_jc.h"
#include "pan_pool.h"
#include "pan_shader.h"
#include "pan_texture.h"

#if PAN_ARCH >= 6
/* On Midgard, the native preload infrastructure (via MFBD preloads) is broken
 * or missing in many cases. We instead use software paths as fallbacks, which
 * are done as TILER jobs. No vertex shader is necessary since we can supply
 * screen-space coordinates directly.
 *
 * This is primarily designed as a fallback for preloads but could be extended
 * for other clears/blits if needed in the future. */

static enum mali_register_file_format
nir_type_to_reg_fmt(nir_alu_type in)
{
   switch (in) {
   case nir_type_float32:
      return MALI_REGISTER_FILE_FORMAT_F32;
   case nir_type_int32:
      return MALI_REGISTER_FILE_FORMAT_I32;
   case nir_type_uint32:
      return MALI_REGISTER_FILE_FORMAT_U32;
   default:
      unreachable("Invalid type");
   }
}
#endif

/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index of
 * the resource within the table. For simplicity, we put one type of resource
 * in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary.
 */
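/* For example, with the tables below, texture #2 on Valhall is addressed as
 * pan_res_handle(PAN_BLIT_TABLE_TEXTURE, 2); the *_hw_index() helpers further
 * down wrap this and fall back to flat indices on pre-Valhall (PAN_ARCH < 9)
 * GPUs.
 */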
enum pan_preload_resource_table {
   PAN_BLIT_TABLE_ATTRIBUTE = 0,
   PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
   PAN_BLIT_TABLE_SAMPLER,
   PAN_BLIT_TABLE_TEXTURE,

   PAN_BLIT_NUM_RESOURCE_TABLES
};

struct pan_preload_surface {
   gl_frag_result loc              : 4;
   nir_alu_type type               : 8;
   enum mali_texture_dimension dim : 2;
   bool array                      : 1;
   unsigned samples                : 5;
};

struct pan_preload_shader_key {
   struct pan_preload_surface surfaces[8];
};

struct pan_preload_shader_data {
   struct pan_preload_shader_key key;
   struct pan_shader_info info;
   uint64_t address;
   unsigned blend_ret_offsets[8];
   nir_alu_type blend_types[8];
};

struct pan_preload_blend_shader_key {
   enum pipe_format format;
   nir_alu_type type;
   unsigned rt         : 3;
   unsigned nr_samples : 5;
   unsigned pad        : 24;
};

struct pan_preload_blend_shader_data {
   struct pan_preload_blend_shader_key key;
   uint64_t address;
};

struct pan_preload_rsd_key {
   struct {
      enum pipe_format format;
      nir_alu_type type               : 8;
      unsigned samples                : 5;
      enum mali_texture_dimension dim : 2;
      bool array                      : 1;
   } rts[8], z, s;
};
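/* Note: the key structs above are hashed and compared as raw bytes by the
 * DERIVE_HASH_TABLE() instantiations at the end of this file, which is why
 * keys are always zero-initialized ({0} / rzalloc) and why
 * pan_preload_blend_shader_key carries an explicit pad field: padding bits
 * must be deterministic for memcmp-based lookups to work.
 */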

struct pan_preload_rsd_data {
   struct pan_preload_rsd_key key;
   uint64_t address;
};

#if PAN_ARCH >= 5
static void
pan_preload_emit_blend(unsigned rt,
                       const struct pan_image_view *iview,
                       const struct pan_preload_shader_data *preload_shader,
                       uint64_t blend_shader, struct mali_blend_packed *out)
{
   assert(blend_shader == 0 || PAN_ARCH <= 5);

   pan_pack(out, BLEND, cfg) {
      if (!iview) {
         cfg.enable = false;
#if PAN_ARCH >= 6
         cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
         continue;
      }

      cfg.round_to_fb_precision = true;
      cfg.srgb = util_format_is_srgb(iview->format);

#if PAN_ARCH >= 6
      cfg.internal.mode = MALI_BLEND_MODE_OPAQUE;
#endif

      if (!blend_shader) {
         cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.equation.color_mask = 0xf;

#if PAN_ARCH >= 6
         nir_alu_type type = preload_shader->key.surfaces[rt].type;

         cfg.internal.fixed_function.num_comps = 4;
         cfg.internal.fixed_function.conversion.memory_format = GENX(
            panfrost_dithered_format_from_pipe_format)(iview->format, false);
         cfg.internal.fixed_function.conversion.register_format =
            nir_type_to_reg_fmt(type);

         cfg.internal.fixed_function.rt = rt;
#endif
      } else {
#if PAN_ARCH <= 5
         cfg.blend_shader = true;
         cfg.shader_pc = blend_shader;
#endif
      }
   }
}
#endif

struct pan_preload_views {
   unsigned rt_count;
   const struct pan_image_view *rts[8];
   const struct pan_image_view *z;
   const struct pan_image_view *s;
};

static bool
pan_preload_is_ms(struct pan_preload_views *views)
{
   for (unsigned i = 0; i < views->rt_count; i++) {
      if (views->rts[i]) {
         if (pan_image_view_get_nr_samples(views->rts[i]) > 1)
            return true;
      }
   }

   if (views->z && pan_image_view_get_nr_samples(views->z) > 1)
      return true;

   if (views->s && pan_image_view_get_nr_samples(views->s) > 1)
      return true;

   return false;
}

#if PAN_ARCH >= 5
static void
pan_preload_emit_blends(const struct pan_preload_shader_data *preload_shader,
                        struct pan_preload_views *views,
                        uint64_t *blend_shaders, struct mali_blend_packed *out)
{
   for (unsigned i = 0; i < MAX2(views->rt_count, 1); ++i) {
      const struct pan_image_view *rt_view = views->rts[i];
      uint64_t blend_shader = blend_shaders ? blend_shaders[i] : 0;

      pan_preload_emit_blend(i, rt_view, preload_shader, blend_shader, &out[i]);
   }
}
#endif

#if PAN_ARCH <= 7
static void
pan_preload_emit_rsd(const struct pan_preload_shader_data *preload_shader,
                     struct pan_preload_views *views, uint64_t *blend_shaders,
                     struct mali_renderer_state_packed *out)
{
   UNUSED bool zs = (views->z || views->s);
   bool ms = pan_preload_is_ms(views);

   pan_pack(out, RENDERER_STATE, cfg) {
      assert(preload_shader->address);
      pan_shader_prepare_rsd(&preload_shader->info, preload_shader->address, &cfg);

      cfg.multisample_misc.sample_mask = 0xFFFF;
      cfg.multisample_misc.multisample_enable = ms;
      cfg.multisample_misc.evaluate_per_sample = ms;
      cfg.multisample_misc.depth_write_mask = views->z != NULL;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;

      cfg.stencil_mask_misc.stencil_enable = views->s != NULL;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

#if PAN_ARCH >= 6
      if (zs) {
         /* Writing Z/S requires late updates */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.properties.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
      }

      /* However, while shaders writing Z/S can normally be killed, on v6
       * for frame shaders it can cause GPU timeouts, so only allow colour
       * preload shaders to be killed. */
      cfg.properties.allow_forward_pixel_to_kill = !zs;

      if (PAN_ARCH == 6)
         cfg.properties.allow_forward_pixel_to_be_killed = !zs;
#else

      uint64_t blend_shader =
         blend_shaders
            ? panfrost_last_nonnull(blend_shaders, MAX2(views->rt_count, 1))
            : 0;

      cfg.properties.work_register_count = 4;
      cfg.properties.force_early_z = !zs;
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;

      /* Set even on v5 for erratum workaround */
#if PAN_ARCH == 5
      cfg.legacy_blend_shader = blend_shader;
#else
      cfg.stencil_mask_misc.write_enable = true;
      cfg.stencil_mask_misc.dither_disable = true;
      cfg.multisample_misc.blend_shader = !!blend_shader;
      cfg.blend_shader = blend_shader;
      if (!cfg.multisample_misc.blend_shader) {
         cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
         cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
         cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
         cfg.blend_constant = 0;

         if (views->rts[0] != NULL) {
            cfg.stencil_mask_misc.srgb =
               util_format_is_srgb(views->rts[0]->format);
            cfg.blend_equation.color_mask = 0xf;
         }
      }
#endif
#endif
   }

#if PAN_ARCH >= 5
   pan_preload_emit_blends(preload_shader, views, blend_shaders,
                           (void*)((uint8_t*)out + pan_size(RENDERER_STATE)));
#endif
}
#endif

#if PAN_ARCH <= 5
static void
pan_preload_get_blend_shaders(struct pan_fb_preload_cache *cache,
                              unsigned rt_count,
                              const struct pan_image_view **rts,
                              const struct pan_preload_shader_data *preload_shader,
                              uint64_t *blend_shaders)
{
   if (!rt_count)
      return;

   struct pan_blend_state blend_state = {
      .rt_count = rt_count,
   };

   for (unsigned i = 0; i < rt_count; i++) {
      if (!rts[i] || panfrost_blendable_formats_v7[rts[i]->format].internal)
         continue;

      struct pan_preload_blend_shader_key key = {
         .format = rts[i]->format,
         .rt = i,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .type = preload_shader->blend_types[i],
      };

      pthread_mutex_lock(&cache->shaders.lock);
      struct hash_entry *he =
         _mesa_hash_table_search(cache->shaders.blend, &key);
      struct pan_preload_blend_shader_data *blend_shader = he ? he->data : NULL;
      if (blend_shader) {
         blend_shaders[i] = blend_shader->address;
         pthread_mutex_unlock(&cache->shaders.lock);
         continue;
      }

      blend_shader =
         rzalloc(cache->shaders.blend, struct pan_preload_blend_shader_data);
      blend_shader->key = key;

      blend_state.rts[i] = (struct pan_blend_rt_state){
         .format = rts[i]->format,
         .nr_samples = pan_image_view_get_nr_samples(rts[i]),
         .equation =
            {
               .blend_enable = false,
               .color_mask = 0xf,
            },
      };

      pthread_mutex_lock(&cache->blend_shader_cache->lock);
      struct pan_blend_shader_variant *b = GENX(pan_blend_get_shader_locked)(
         cache->blend_shader_cache, &blend_state,
         preload_shader->blend_types[i], nir_type_float32, /* unused */
         i);

      assert(b->work_reg_count <= 4);
      struct panfrost_ptr bin =
         pan_pool_alloc_aligned(cache->shaders.pool, b->binary.size, 64);
      memcpy(bin.cpu, b->binary.data, b->binary.size);

      blend_shader->address = bin.gpu | b->first_tag;
      pthread_mutex_unlock(&cache->blend_shader_cache->lock);
      _mesa_hash_table_insert(cache->shaders.blend, &blend_shader->key,
                              blend_shader);
      pthread_mutex_unlock(&cache->shaders.lock);
      blend_shaders[i] = blend_shader->address;
   }
}
#endif

/*
 * Early Mali GPUs did not respect sampler LOD clamps or bias, so the Midgard
 * compiler inserts lowering code with a load_sampler_lod_parameters_pan sysval
 * that we need to lower. Our samplers do not use LOD clamps or bias, so we
 * lower to the identity settings and let constant folding get rid of the
 * unnecessary lowering.
 */
static bool
lower_sampler_parameters(nir_builder *b, nir_intrinsic_instr *intr,
                         UNUSED void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_sampler_lod_parameters_pan)
      return false;

   const nir_const_value constants[4] = {
      nir_const_value_for_float(0.0f, 32),     /* min_lod */
      nir_const_value_for_float(INFINITY, 32), /* max_lod */
      nir_const_value_for_float(0.0f, 32),     /* lod_bias */
   };

   b->cursor = nir_after_instr(&intr->instr);
   nir_def_rewrite_uses(&intr->def, nir_build_imm(b, 3, 32, constants));
   return true;
}
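/* Note this pass is only wired up for PAN_ARCH == 4 (see the NIR_PASS call in
 * pan_preload_get_shader() below); later architectures honour the sampler LOD
 * state directly, so the compiler never emits this sysval for them.
 */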

static uint32_t
sampler_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_SAMPLER, index) : index;
}

static uint32_t
tex_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_TEXTURE, index) : index;
}

static uint32_t
attr_hw_index(uint32_t index)
{
   return PAN_ARCH >= 9 ? pan_res_handle(PAN_BLIT_TABLE_ATTRIBUTE, index)
                        : index;
}

static const struct pan_preload_shader_data *
pan_preload_get_shader(struct pan_fb_preload_cache *cache,
                       const struct pan_preload_shader_key *key)
{
   pthread_mutex_lock(&cache->shaders.lock);
   struct hash_entry *he =
      _mesa_hash_table_search(cache->shaders.preload, key);
   struct pan_preload_shader_data *shader = he ? he->data : NULL;

   if (shader)
      goto out;

   unsigned coord_comps = 0;
   unsigned sig_offset = 0;
   char sig[256];
   bool first = true;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      const char *type_str, *dim_str;
      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      switch (key->surfaces[i].type) {
      case nir_type_float32:
         type_str = "float";
         break;
      case nir_type_uint32:
         type_str = "uint";
         break;
      case nir_type_int32:
         type_str = "int";
         break;
      default:
         unreachable("Invalid type\n");
      }

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_CUBE:
         dim_str = "cube";
         break;
      case MALI_TEXTURE_DIMENSION_1D:
         dim_str = "1D";
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         dim_str = "2D";
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         dim_str = "3D";
         break;
      default:
         unreachable("Invalid dim\n");
      }

      coord_comps = MAX2(coord_comps, (key->surfaces[i].dim ?: 3) +
                                         (key->surfaces[i].array ? 1 : 0));

      if (sig_offset >= sizeof(sig)) {
         first = false;
         continue;
      }

      sig_offset +=
         snprintf(sig + sig_offset, sizeof(sig) - sig_offset,
                  "%s[%s;%s;%s%s;samples=%d]",
                  first ? "" : ",", gl_frag_result_name(key->surfaces[i].loc),
                  type_str, dim_str, key->surfaces[i].array ? "[]" : "",
                  key->surfaces[i].samples);

      first = false;
   }

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_FRAGMENT, GENX(pan_shader_get_compiler_options)(),
      "pan_preload(%s)", sig);

   nir_def *barycentric = nir_load_barycentric(
      &b, nir_intrinsic_load_barycentric_pixel, INTERP_MODE_SMOOTH);
   nir_def *coord = nir_load_interpolated_input(
      &b, coord_comps, 32, barycentric, nir_imm_int(&b, 0),
      .base = attr_hw_index(0), .dest_type = nir_type_float32,
      .io_semantics.location = VARYING_SLOT_VAR0, .io_semantics.num_slots = 1);

   unsigned active_count = 0;
   for (unsigned i = 0; i < ARRAY_SIZE(key->surfaces); i++) {
      if (key->surfaces[i].type == nir_type_invalid)
         continue;

      bool ms = key->surfaces[i].samples > 1;
      enum glsl_sampler_dim sampler_dim;

      switch (key->surfaces[i].dim) {
      case MALI_TEXTURE_DIMENSION_1D:
         sampler_dim = GLSL_SAMPLER_DIM_1D;
         break;
      case MALI_TEXTURE_DIMENSION_2D:
         sampler_dim = ms ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D;
         break;
      case MALI_TEXTURE_DIMENSION_3D:
         sampler_dim = GLSL_SAMPLER_DIM_3D;
         break;
      case MALI_TEXTURE_DIMENSION_CUBE:
         sampler_dim = GLSL_SAMPLER_DIM_CUBE;
         break;
      }

      nir_tex_instr *tex = nir_tex_instr_create(b.shader, ms ? 3 : 1);

      tex->dest_type = key->surfaces[i].type;
      tex->texture_index = tex_hw_index(active_count);
      tex->sampler_index = sampler_hw_index(0);
      tex->is_array = key->surfaces[i].array;
      tex->sampler_dim = sampler_dim;

      if (ms) {
         tex->op = nir_texop_txf_ms;

         tex->src[0] =
            nir_tex_src_for_ssa(nir_tex_src_coord, nir_f2i32(&b, coord));
         tex->coord_components = coord_comps;

         tex->src[1] =
            nir_tex_src_for_ssa(nir_tex_src_ms_index, nir_load_sample_id(&b));

         tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(&b, 0));
      } else {
         tex->op = nir_texop_txl;

         tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
         tex->coord_components = coord_comps;
      }

      nir_def_init(&tex->instr, &tex->def, 4, 32);
      nir_builder_instr_insert(&b, &tex->instr);

      nir_def *res = &tex->def;

      if (key->surfaces[i].loc >= FRAG_RESULT_DATA0) {
         nir_store_output(
            &b, res, nir_imm_int(&b, 0), .base = active_count,
            .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1,
            .write_mask = nir_component_mask(res->num_components));
      } else {
         unsigned c = key->surfaces[i].loc == FRAG_RESULT_STENCIL ? 1 : 0;
         nir_store_output(
            &b, nir_channel(&b, res, c), nir_imm_int(&b, 0),
            .base = active_count, .src_type = key->surfaces[i].type,
            .io_semantics.location = key->surfaces[i].loc,
            .io_semantics.num_slots = 1, .write_mask = nir_component_mask(1));
      }
      active_count++;
   }

   struct panfrost_compile_inputs inputs = {
      .gpu_id = cache->gpu_id,
      .is_blit = true,
      .no_idvs = true,
   };
   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);

   shader = rzalloc(cache->shaders.preload, struct pan_preload_shader_data);

   nir_shader_gather_info(b.shader, nir_shader_get_entrypoint(b.shader));

   for (unsigned i = 0; i < active_count; ++i)
      BITSET_SET(b.shader->info.textures_used, i);

   pan_shader_preprocess(b.shader, inputs.gpu_id);

   if (PAN_ARCH == 4) {
      NIR_PASS(_, b.shader, nir_shader_intrinsics_pass,
               lower_sampler_parameters, nir_metadata_control_flow, NULL);
   }

   GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader->info);

   shader->key = *key;
   shader->address =
      pan_pool_upload_aligned(cache->shaders.pool, binary.data,
                              binary.size, PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

#if PAN_ARCH >= 6
   for (unsigned i = 0; i < ARRAY_SIZE(shader->blend_ret_offsets); i++) {
      shader->blend_ret_offsets[i] =
         shader->info.bifrost.blend[i].return_offset;
      shader->blend_types[i] = shader->info.bifrost.blend[i].type;
   }
#endif

   _mesa_hash_table_insert(cache->shaders.preload, &shader->key, shader);

out:
   pthread_mutex_unlock(&cache->shaders.lock);
   return shader;
}

static struct pan_preload_shader_key
pan_preload_get_key(struct pan_preload_views *views)
{
   struct pan_preload_shader_key key = {0};

   if (views->z) {
      key.surfaces[0].loc = FRAG_RESULT_DEPTH;
      key.surfaces[0].type = nir_type_float32;
      key.surfaces[0].samples = pan_image_view_get_nr_samples(views->z);
      key.surfaces[0].dim = views->z->dim;
      key.surfaces[0].array = views->z->first_layer != views->z->last_layer;
   }

   if (views->s) {
      key.surfaces[1].loc = FRAG_RESULT_STENCIL;
      key.surfaces[1].type = nir_type_uint32;
      key.surfaces[1].samples = pan_image_view_get_nr_samples(views->s);
      key.surfaces[1].dim = views->s->dim;
      key.surfaces[1].array = views->s->first_layer != views->s->last_layer;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->rts[i])
         continue;

      key.surfaces[i].loc = FRAG_RESULT_DATA0 + i;
      key.surfaces[i].type =
         util_format_is_pure_uint(views->rts[i]->format) ? nir_type_uint32
         : util_format_is_pure_sint(views->rts[i]->format)
            ? nir_type_int32
            : nir_type_float32;
      key.surfaces[i].samples =
         pan_image_view_get_nr_samples(views->rts[i]);
      key.surfaces[i].dim = views->rts[i]->dim;
      key.surfaces[i].array =
         views->rts[i]->first_layer != views->rts[i]->last_layer;
   }

   return key;
}
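/* Depth always occupies surfaces[0] and stencil surfaces[1], while colour
 * preloads use surfaces[0..rt_count-1]. The two never mix in one key (Z/S and
 * colour are preloaded by separate jobs; see the assert in
 * pan_preload_get_rsd() below), and texture unit indices are assigned in
 * surface order, matching the order pan_preload_emit_textures() emits the
 * descriptors.
 */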

#if PAN_ARCH <= 7
static uint64_t
pan_preload_get_rsd(struct pan_fb_preload_cache *cache,
                    struct pan_preload_views *views)
{
   struct pan_preload_rsd_key rsd_key = {0};

   assert(!views->rt_count || (!views->z && !views->s));

   struct pan_preload_shader_key preload_key = pan_preload_get_key(views);

   if (views->z) {
      rsd_key.z.format = views->z->format;
      rsd_key.z.type = preload_key.surfaces[0].type;
      rsd_key.z.samples = preload_key.surfaces[0].samples;
      rsd_key.z.dim = preload_key.surfaces[0].dim;
      rsd_key.z.array = preload_key.surfaces[0].array;
   }

   if (views->s) {
      rsd_key.s.format = views->s->format;
      rsd_key.s.type = preload_key.surfaces[1].type;
      rsd_key.s.samples = preload_key.surfaces[1].samples;
      rsd_key.s.dim = preload_key.surfaces[1].dim;
      rsd_key.s.array = preload_key.surfaces[1].array;
   }

   for (unsigned i = 0; i < views->rt_count; i++) {
      if (!views->rts[i])
         continue;

      rsd_key.rts[i].format = views->rts[i]->format;
      rsd_key.rts[i].type = preload_key.surfaces[i].type;
      rsd_key.rts[i].samples = preload_key.surfaces[i].samples;
      rsd_key.rts[i].dim = preload_key.surfaces[i].dim;
      rsd_key.rts[i].array = preload_key.surfaces[i].array;
   }

   pthread_mutex_lock(&cache->rsds.lock);
   struct hash_entry *he =
      _mesa_hash_table_search(cache->rsds.rsds, &rsd_key);
   struct pan_preload_rsd_data *rsd = he ? he->data : NULL;
   if (rsd)
      goto out;

   rsd = rzalloc(cache->rsds.rsds, struct pan_preload_rsd_data);
   rsd->key = rsd_key;

#if PAN_ARCH == 4
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc(cache->rsds.pool, RENDERER_STATE);
#else
   unsigned bd_count = PAN_ARCH >= 5 ? MAX2(views->rt_count, 1) : 0;
   struct panfrost_ptr rsd_ptr = pan_pool_alloc_desc_aggregate(
      cache->rsds.pool, PAN_DESC(RENDERER_STATE),
      PAN_DESC_ARRAY(bd_count, BLEND));
#endif

   if (!rsd_ptr.cpu) {
      /* Drop the lock before bailing so a failed allocation cannot
       * leave the cache permanently locked. */
      pthread_mutex_unlock(&cache->rsds.lock);
      return 0;
   }

   uint64_t blend_shaders[8] = {0};

   const struct pan_preload_shader_data *preload_shader =
      pan_preload_get_shader(cache, &preload_key);

#if PAN_ARCH <= 5
   pan_preload_get_blend_shaders(cache,
                                 views->rt_count, views->rts, preload_shader,
                                 blend_shaders);
#endif

   pan_preload_emit_rsd(preload_shader, views, blend_shaders, rsd_ptr.cpu);
   rsd->address = rsd_ptr.gpu;
   _mesa_hash_table_insert(cache->rsds.rsds, &rsd->key, rsd);

out:
   pthread_mutex_unlock(&cache->rsds.lock);
   return rsd->address;
}
#endif

static struct pan_preload_views
pan_preload_get_views(const struct pan_fb_info *fb, bool zs,
                      struct pan_image_view *patched_s)
{
   struct pan_preload_views views = {0};

   if (zs) {
      if (fb->zs.preload.z)
         views.z = fb->zs.view.zs;

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

         if (fmt != view->format) {
            *patched_s = *view;
            patched_s->format = fmt;
            views.s = patched_s;
         } else {
            views.s = view;
         }
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            views.rts[i] = fb->rts[i].view;
      }

      views.rt_count = fb->rt_count;
   }

   return views;
}

static bool
pan_preload_needed(const struct pan_fb_info *fb, bool zs)
{
   if (zs) {
      if (fb->zs.preload.z || fb->zs.preload.s)
         return true;
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload)
            return true;
      }
   }

   return false;
}

static uint64_t
pan_preload_emit_varying(struct pan_pool *pool)
{
   struct panfrost_ptr varying = pan_pool_alloc_desc(pool, ATTRIBUTE);

   if (!varying.cpu)
      return 0;

   pan_cast_and_pack(varying.cpu, ATTRIBUTE, cfg) {
      cfg.buffer_index = 0;
      cfg.offset_enable = PAN_ARCH <= 5;
      cfg.format =
         GENX(panfrost_format_from_pipe_format)(PIPE_FORMAT_R32G32B32_FLOAT)->hw;

#if PAN_ARCH >= 9
      cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
      cfg.table = PAN_BLIT_TABLE_ATTRIBUTE_BUFFER;
      cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
      cfg.stride = 4 * sizeof(float);
#endif
   }

   return varying.gpu;
}
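/* The single varying is the screen-space position of each corner of the
 * preload quad, uploaded as four vec4s by GENX(pan_preload_fb)() below;
 * R32G32B32_FLOAT reads the xyz of each 16-byte record.
 */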

static uint64_t
pan_preload_emit_varying_buffer(struct pan_pool *pool, uint64_t coordinates)
{
#if PAN_ARCH >= 9
   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc(pool, BUFFER);

   if (!varying_buffer.cpu)
      return 0;

   pan_cast_and_pack(varying_buffer.cpu, BUFFER, cfg) {
      cfg.address = coordinates;
      cfg.size = 4 * sizeof(float) * 4;
   }
#else
   /* Bifrost needs an empty desc to mark end of prefetching */
   bool padding_buffer = PAN_ARCH >= 6;

   struct panfrost_ptr varying_buffer = pan_pool_alloc_desc_array(
      pool, (padding_buffer ? 2 : 1), ATTRIBUTE_BUFFER);

   if (!varying_buffer.cpu)
      return 0;

   pan_cast_and_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
      cfg.pointer = coordinates;
      cfg.stride = 4 * sizeof(float);
      cfg.size = cfg.stride * 4;
   }

   if (padding_buffer) {
      pan_cast_and_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
                        ATTRIBUTE_BUFFER, cfg)
         ;
   }
#endif

   return varying_buffer.gpu;
}

static uint64_t
pan_preload_emit_sampler(struct pan_pool *pool, bool nearest_filter)
{
   struct panfrost_ptr sampler = pan_pool_alloc_desc(pool, SAMPLER);

   if (!sampler.cpu)
      return 0;

   pan_cast_and_pack(sampler.cpu, SAMPLER, cfg) {
      cfg.seamless_cube_map = false;
      cfg.normalized_coordinates = false;
      cfg.minify_nearest = nearest_filter;
      cfg.magnify_nearest = nearest_filter;
   }

   return sampler.gpu;
}
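/* Unnormalized coordinates pair with the pixel-space varying above: each
 * fragment reads the texel directly under it, and nearest filtering keeps
 * the preload a 1:1 copy.
 */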

static uint64_t
pan_preload_emit_textures(struct pan_pool *pool, const struct pan_fb_info *fb,
                          bool zs, unsigned *tex_count_out)
{
   const struct pan_image_view *views[8];
   struct pan_image_view patched_views[8];
   unsigned tex_count = 0;
   unsigned patched_count = 0;

   if (zs) {
      if (fb->zs.preload.z) {
         const struct pan_image_view *view = fb->zs.view.zs;
#if PAN_ARCH >= 7
         struct pan_image_view *pview = &patched_views[patched_count++];
         *pview = *view;
         /* v7+ doesn't have an _RRRR component order. */
         GENX(panfrost_texture_swizzle_replicate_x)(pview);
         view = pview;
#endif
         views[tex_count++] = view;
      }

      if (fb->zs.preload.s) {
         const struct pan_image_view *view = fb->zs.view.s ?: fb->zs.view.zs;
         enum pipe_format fmt = util_format_get_depth_only(view->format);

         switch (view->format) {
         case PIPE_FORMAT_Z24_UNORM_S8_UINT:
            fmt = PIPE_FORMAT_X24S8_UINT;
            break;
         case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
            fmt = PIPE_FORMAT_X32_S8X24_UINT;
            break;
         default:
            fmt = view->format;
            break;
         }

#if PAN_ARCH >= 7
         struct pan_image_view *pview = &patched_views[patched_count++];
         *pview = *view;
         pview->format = fmt;
         /* v7+ doesn't have an _RRRR component order. */
         GENX(panfrost_texture_swizzle_replicate_x)(pview);
         view = pview;
#else
         if (fmt != view->format) {
            struct pan_image_view *pview = &patched_views[patched_count++];
            *pview = *view;
            pview->format = fmt;
            view = pview;
         }
#endif
         views[tex_count++] = view;
      }
   } else {
      for (unsigned i = 0; i < fb->rt_count; i++) {
         if (fb->rts[i].preload) {
            const struct pan_image_view *view = fb->rts[i].view;
#if PAN_ARCH == 7
            /* v7 requires AFBC reswizzle. */
            if (!panfrost_format_is_yuv(view->format) &&
                panfrost_format_supports_afbc(PAN_ARCH, view->format)) {
               struct pan_image_view *pview = &patched_views[patched_count++];
               *pview = *view;
               GENX(panfrost_texture_afbc_reswizzle)(pview);
               view = pview;
            }
#endif
            views[tex_count++] = view;
         }
      }
   }

   *tex_count_out = tex_count;

#if PAN_ARCH >= 6
   struct panfrost_ptr textures =
      pan_pool_alloc_desc_array(pool, tex_count, TEXTURE);

   if (!textures.cpu)
      return 0;

   for (unsigned i = 0; i < tex_count; i++) {
      void *texture = textures.cpu + (pan_size(TEXTURE) * i);
      size_t payload_size =
         GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr surfaces =
         pan_pool_alloc_aligned(pool, payload_size, 64);

      GENX(panfrost_new_texture)(views[i], texture, &surfaces);
   }

   return textures.gpu;
#else
   uint64_t textures[8] = {0};

   for (unsigned i = 0; i < tex_count; i++) {
      size_t sz = pan_size(TEXTURE) +
                  GENX(panfrost_estimate_texture_payload_size)(views[i]);
      struct panfrost_ptr texture =
         pan_pool_alloc_aligned(pool, sz, pan_alignment(TEXTURE));
      struct panfrost_ptr surfaces = {
         .cpu = texture.cpu + pan_size(TEXTURE),
         .gpu = texture.gpu + pan_size(TEXTURE),
      };

      GENX(panfrost_new_texture)(views[i], texture.cpu, &surfaces);
      textures[i] = texture.gpu;
   }

   return pan_pool_upload_aligned(pool, textures, tex_count * sizeof(uint64_t),
                                  sizeof(uint64_t));
#endif
}
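/* Note the two layouts above: v6+ consumes a contiguous array of TEXTURE
 * descriptors with each payload allocated separately, while v4/v5 consume a
 * table of GPU pointers, each pointing at a TEXTURE descriptor immediately
 * followed by its payload.
 */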

#if PAN_ARCH >= 8
/* TODO: cache */
static uint64_t
pan_preload_emit_zs(struct pan_pool *pool, bool z, bool s)
{
   struct panfrost_ptr zsd = pan_pool_alloc_desc(pool, DEPTH_STENCIL);

   if (!zsd.cpu)
      return 0;

   pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
      cfg.depth_function = MALI_FUNC_ALWAYS;
      cfg.depth_write_enable = z;

      if (z)
         cfg.depth_source = MALI_DEPTH_SOURCE_SHADER;

      cfg.stencil_test_enable = s;
      cfg.stencil_from_shader = s;

      cfg.front_compare_function = MALI_FUNC_ALWAYS;
      cfg.front_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.front_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.front_write_mask = 0xFF;
      cfg.front_value_mask = 0xFF;

      cfg.back_compare_function = MALI_FUNC_ALWAYS;
      cfg.back_stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.back_depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.back_write_mask = 0xFF;
      cfg.back_value_mask = 0xFF;

      cfg.depth_cull_enable = false;
   }

   return zsd.gpu;
}
#else
static uint64_t
pan_preload_emit_viewport(struct pan_pool *pool, uint16_t minx, uint16_t miny,
                          uint16_t maxx, uint16_t maxy)
{
   struct panfrost_ptr vp = pan_pool_alloc_desc(pool, VIEWPORT);

   if (!vp.cpu)
      return 0;

   pan_cast_and_pack(vp.cpu, VIEWPORT, cfg) {
      cfg.scissor_minimum_x = minx;
      cfg.scissor_minimum_y = miny;
      cfg.scissor_maximum_x = maxx;
      cfg.scissor_maximum_y = maxy;
   }

   return vp.gpu;
}
#endif

static void
pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                     struct pan_fb_info *fb, bool zs, uint64_t coordinates,
                     uint64_t tsd, struct mali_draw_packed *out,
                     bool always_write)
{
   unsigned tex_count = 0;
   uint64_t textures = pan_preload_emit_textures(pool, fb, zs, &tex_count);
   uint64_t samplers = pan_preload_emit_sampler(pool, true);
   uint64_t varyings = pan_preload_emit_varying(pool);
   uint64_t varying_buffers =
      pan_preload_emit_varying_buffer(pool, coordinates);

   /* Tiles updated by preload shaders are still considered clean (separate
    * for colour and Z/S), allowing us to suppress unnecessary writeback.
    */
   UNUSED bool clean_fragment_write = !always_write;

   /* Image view used when patching stencil formats for combined
    * depth/stencil preloads.
    */
   struct pan_image_view patched_s;

   struct pan_preload_views views = pan_preload_get_views(fb, zs, &patched_s);

#if PAN_ARCH <= 7
   pan_pack(out, DRAW, cfg) {
      uint16_t minx = 0, miny = 0, maxx, maxy;

      if (PAN_ARCH == 4) {
         maxx = fb->width - 1;
         maxy = fb->height - 1;
      } else {
         /* Align on 32x32 tiles */
         minx = fb->extent.minx & ~31;
         miny = fb->extent.miny & ~31;
         maxx = MIN2(ALIGN_POT(fb->extent.maxx + 1, 32), fb->width) - 1;
         maxy = MIN2(ALIGN_POT(fb->extent.maxy + 1, 32), fb->height) - 1;
      }

      cfg.thread_storage = tsd;
      cfg.state = pan_preload_get_rsd(cache, &views);

      cfg.position = coordinates;
      cfg.viewport = pan_preload_emit_viewport(pool, minx, miny, maxx, maxy);

      cfg.varyings = varyings;
      cfg.varying_buffers = varying_buffers;
      cfg.textures = textures;
      cfg.samplers = samplers;

#if PAN_ARCH >= 6
      cfg.clean_fragment_write = clean_fragment_write;
#endif
   }
#else
   struct panfrost_ptr T;
   unsigned nr_tables = PAN_BLIT_NUM_RESOURCE_TABLES;

   /* Although individual resources need only 16 byte alignment, the
    * resource table as a whole must be 64-byte aligned.
    */
   T = pan_pool_alloc_aligned(pool, nr_tables * pan_size(RESOURCE), 64);
   memset(T.cpu, 0, nr_tables * pan_size(RESOURCE));

   panfrost_make_resource_table(T, PAN_BLIT_TABLE_TEXTURE, textures, tex_count);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_SAMPLER, samplers, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE, varyings, 1);
   panfrost_make_resource_table(T, PAN_BLIT_TABLE_ATTRIBUTE_BUFFER,
                                varying_buffers, 1);

   struct pan_preload_shader_key key = pan_preload_get_key(&views);
   const struct pan_preload_shader_data *preload_shader =
      pan_preload_get_shader(cache, &key);

   bool z = fb->zs.preload.z;
   bool s = fb->zs.preload.s;
   bool ms = pan_preload_is_ms(&views);

   struct panfrost_ptr spd = pan_pool_alloc_desc(pool, SHADER_PROGRAM);

   if (!spd.cpu) {
      mesa_loge("pan_pool_alloc_desc failed");
      return;
   }

   pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
      cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
      cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
      cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
      cfg.binary = preload_shader->address;
      cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
   }

   unsigned bd_count = views.rt_count;
   struct panfrost_ptr blend = pan_pool_alloc_desc_array(pool, bd_count, BLEND);

   if (!blend.cpu) {
      mesa_loge("pan_pool_alloc_desc_array failed");
      return;
   }

   if (!zs) {
      pan_preload_emit_blends(preload_shader, &views, NULL, blend.cpu);
   }

   pan_pack(out, DRAW, cfg) {
      if (zs) {
         /* ZS_EMIT requires late update/kill */
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_LATE;
         cfg.blend_count = 0;
      } else {
         /* Skipping ATEST requires forcing Z/S */
         cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         cfg.blend = blend.gpu;
         cfg.blend_count = bd_count;
         cfg.render_target_mask = 0x1;
      }

      cfg.allow_forward_pixel_to_kill = !zs;
      cfg.allow_forward_pixel_to_be_killed = true;
      cfg.depth_stencil = pan_preload_emit_zs(pool, z, s);
      cfg.sample_mask = 0xFFFF;
      cfg.multisample_enable = ms;
      cfg.evaluate_per_sample = ms;
      cfg.maximum_z = 1.0;
      cfg.clean_fragment_write = clean_fragment_write;
      cfg.shader.resources = T.gpu | nr_tables;
      cfg.shader.shader = spd.gpu;
      cfg.shader.thread_storage = tsd;
   }
#endif
}

#if PAN_ARCH >= 6
static void
pan_preload_fb_alloc_pre_post_dcds(struct pan_pool *desc_pool,
                                   struct pan_fb_info *fb)
{
   if (fb->bifrost.pre_post.dcds.gpu)
      return;

   fb->bifrost.pre_post.dcds = pan_pool_alloc_desc_array(desc_pool, 3, DRAW);
}

static void
pan_preload_emit_pre_frame_dcd(struct pan_fb_preload_cache *cache,
                               struct pan_pool *desc_pool,
                               struct pan_fb_info *fb, bool zs, uint64_t coords,
                               uint64_t tsd)
{
   unsigned dcd_idx = zs ? 1 : 0;
   pan_preload_fb_alloc_pre_post_dcds(desc_pool, fb);
   if (!fb->bifrost.pre_post.dcds.cpu) {
      mesa_loge("pan_preload_fb_alloc_pre_post_dcds failed");
      return;
   }

   void *dcd = fb->bifrost.pre_post.dcds.cpu + (dcd_idx * pan_size(DRAW));

   /* We only use crc_rt to determine whether to force writes for updating
    * the CRCs, so use a conservative tile size (16x16).
    */
   int crc_rt = GENX(pan_select_crc_rt)(fb, 16 * 16);

   bool always_write = false;

   /* If CRC data is currently invalid and this batch will make it valid,
    * write even clean tiles to make sure CRC data is updated. */
   if (crc_rt >= 0) {
      bool *valid = fb->rts[crc_rt].crc_valid;
      bool full = !fb->extent.minx && !fb->extent.miny &&
                  fb->extent.maxx == (fb->width - 1) &&
                  fb->extent.maxy == (fb->height - 1);

      if (full && !(*valid))
         always_write = true;
   }

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd, dcd,
                        always_write);
   if (zs) {
      enum pipe_format fmt = fb->zs.view.zs
                                ? fb->zs.view.zs->planes[0]->layout.format
                                : fb->zs.view.s->planes[0]->layout.format;
      bool always = false;

      /* If we're dealing with a combined ZS resource and only one
       * component is cleared, we need to reload the whole surface
       * because the zs_clean_pixel_write_enable flag is set in that
       * case.
       */
      if (util_format_is_depth_and_stencil(fmt) &&
          fb->zs.clear.z != fb->zs.clear.s)
         always = true;

      /* We could use INTERSECT on Bifrost v7 too, but
       * EARLY_ZS_ALWAYS has the advantage of reloading the ZS tile
       * buffer one or more tiles ahead, making ZS data immediately
       * available for any ZS tests taking place in other shaders.
       * Things haven't been benchmarked to determine what's
       * preferable (saving bandwidth vs having ZS preloaded
       * earlier), so let's leave it like that for now.
       */
      fb->bifrost.pre_post.modes[dcd_idx] =
         PAN_ARCH > 6
            ? MALI_PRE_POST_FRAME_SHADER_MODE_EARLY_ZS_ALWAYS
         : always ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                  : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   } else {
      fb->bifrost.pre_post.modes[dcd_idx] =
         always_write ? MALI_PRE_POST_FRAME_SHADER_MODE_ALWAYS
                      : MALI_PRE_POST_FRAME_SHADER_MODE_INTERSECT;
   }
}
#else
static struct panfrost_ptr
pan_preload_emit_tiler_job(struct pan_fb_preload_cache *cache, struct pan_pool *desc_pool,
                           struct pan_fb_info *fb, bool zs, uint64_t coords,
                           uint64_t tsd)
{
   struct panfrost_ptr job = pan_pool_alloc_desc(desc_pool, TILER_JOB);

   if (!job.cpu)
      return (struct panfrost_ptr){0};

   pan_preload_emit_dcd(cache, desc_pool, fb, zs, coords, tsd,
                        pan_section_ptr(job.cpu, TILER_JOB, DRAW), false);

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   void *invoc = pan_section_ptr(job.cpu, TILER_JOB, INVOCATION);
   panfrost_pack_work_groups_compute(invoc, 1, 4, 1, 1, 1, 1, true, false);

   return job;
}
#endif

static struct panfrost_ptr
pan_preload_fb_part(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                    struct pan_fb_info *fb, bool zs, uint64_t coords,
                    uint64_t tsd)
{
   struct panfrost_ptr job = {0};

#if PAN_ARCH >= 6
   pan_preload_emit_pre_frame_dcd(cache, pool, fb, zs, coords, tsd);
#else
   job = pan_preload_emit_tiler_job(cache, pool, fb, zs, coords, tsd);
#endif
   return job;
}

unsigned
GENX(pan_preload_fb)(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
                     struct pan_fb_info *fb, uint64_t tsd,
                     struct panfrost_ptr *jobs)
{
   bool preload_zs = pan_preload_needed(fb, true);
   bool preload_rts = pan_preload_needed(fb, false);
   uint64_t coords;

   if (!preload_zs && !preload_rts)
      return 0;

   float rect[] = {
      0.0,       0.0,        0, 1.0,
      fb->width, 0.0,        0, 1.0,
      0.0,       fb->height, 0, 1.0,
      fb->width, fb->height, 0, 1.0,
   };

   coords = pan_pool_upload_aligned(pool, rect, sizeof(rect), 64);

   unsigned njobs = 0;
   if (preload_zs) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, fb, true, coords, tsd);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   if (preload_rts) {
      struct panfrost_ptr job =
         pan_preload_fb_part(cache, pool, fb, false, coords, tsd);
      if (jobs && job.cpu)
         jobs[njobs++] = job;
   }

   return njobs;
}
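/* Typical caller flow (a sketch, not a fixed API contract): Z/S and colour
 * are preloaded by separate jobs, so callers reserve room for two. On v4/v5
 * the returned tiler jobs must be chained into the batch by the caller; on
 * v6+ nothing is returned and the pre-frame DCDs are patched into fb instead:
 *
 *    struct panfrost_ptr preload_jobs[2];
 *    unsigned n = GENX(pan_preload_fb)(cache, pool, fb, tsd, preload_jobs);
 *    for (unsigned i = 0; i < n; i++)
 *       add_to_job_chain(preload_jobs[i]);
 *
 * where add_to_job_chain() stands in for whatever job-chaining helper the
 * caller uses (hypothetical name, pre-v6 only).
 */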

DERIVE_HASH_TABLE(pan_preload_shader_key);
DERIVE_HASH_TABLE(pan_preload_blend_shader_key);
DERIVE_HASH_TABLE(pan_preload_rsd_key);

static void
pan_preload_prefill_preload_shader_cache(struct pan_fb_preload_cache *cache)
{
   static const struct pan_preload_shader_key prefill[] = {
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DEPTH,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
      {
         .surfaces[1] =
            {
               .loc = FRAG_RESULT_STENCIL,
               .type = nir_type_uint32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
      {
         .surfaces[0] =
            {
               .loc = FRAG_RESULT_DATA0,
               .type = nir_type_float32,
               .dim = MALI_TEXTURE_DIMENSION_2D,
               .samples = 1,
            },
      },
   };

   for (unsigned i = 0; i < ARRAY_SIZE(prefill); i++)
      pan_preload_get_shader(cache, &prefill[i]);
}

void
GENX(pan_fb_preload_cache_init)(
   struct pan_fb_preload_cache *cache, unsigned gpu_id,
   struct pan_blend_shader_cache *blend_shader_cache, struct pan_pool *bin_pool,
   struct pan_pool *desc_pool)
{
   cache->gpu_id = gpu_id;
   cache->shaders.preload = pan_preload_shader_key_table_create(NULL);
   cache->shaders.blend = pan_preload_blend_shader_key_table_create(NULL);
   cache->shaders.pool = bin_pool;
   pthread_mutex_init(&cache->shaders.lock, NULL);
   pan_preload_prefill_preload_shader_cache(cache);

   cache->rsds.pool = desc_pool;
   cache->rsds.rsds = pan_preload_rsd_key_table_create(NULL);
   pthread_mutex_init(&cache->rsds.lock, NULL);
   cache->blend_shader_cache = blend_shader_cache;
}

void
GENX(pan_fb_preload_cache_cleanup)(struct pan_fb_preload_cache *cache)
{
   _mesa_hash_table_destroy(cache->shaders.preload, NULL);
   _mesa_hash_table_destroy(cache->shaders.blend, NULL);
   pthread_mutex_destroy(&cache->shaders.lock);
   _mesa_hash_table_destroy(cache->rsds.rsds, NULL);
   pthread_mutex_destroy(&cache->rsds.lock);
}