1 /*
2  * Copyright 2019-2020 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Jonathan Marek <jonathan@marek.ca>
7  */
8 
9 #include "tu_clear_blit.h"
10 
11 #include "ir3/ir3_nir.h"
12 
13 #include "util/format_r11g11b10f.h"
14 #include "util/format_rgb9e5.h"
15 #include "util/format_srgb.h"
16 #include "util/half_float.h"
17 #include "compiler/nir/nir_builder.h"
18 
19 #include "tu_buffer.h"
20 #include "tu_cmd_buffer.h"
21 #include "tu_cs.h"
22 #include "tu_formats.h"
23 #include "tu_image.h"
24 #include "tu_tracepoints.h"
25 #include "tu_lrz.h"
26 
27 #include "common/freedreno_gpu_event.h"
28 #include "common/freedreno_lrz.h"
29 
30 static const VkOffset2D blt_no_coord = { ~0, ~0 };
31 
32 static uint32_t
33 tu_pack_float32_for_unorm(float val, int bits)
34 {
35    return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
36 }
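/* Editor's note: a worked example of the packing above, purely illustrative.
 * For a 24-bit unorm depth, packing 1.0f gives (1 << 24) - 1 = 0xffffff, and
 * packing 0.5f gives lroundeven(0.5f * 16777215.0f) = lroundeven(8388607.5)
 * = 0x800000, since round-half-to-even picks the even neighbour.
 */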
37 
38 /* r2d_ = BLIT_OP_SCALE operations */
39 
40 static enum a6xx_2d_ifmt
41 format_to_ifmt(enum pipe_format format)
42 {
43    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
44        format == PIPE_FORMAT_Z24X8_UNORM)
45       return R2D_UNORM8;
46 
47    /* get_component_bits doesn't work with depth/stencil formats: */
48    if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
49       return R2D_FLOAT32;
50    if (format == PIPE_FORMAT_S8_UINT)
51       return R2D_INT8;
52    if (format == PIPE_FORMAT_A8_UNORM)
53       return R2D_UNORM8;
54 
55    /* use the size of the red channel to find the corresponding "ifmt" */
56    bool is_int = util_format_is_pure_integer(format);
57    switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
58    case 4: case 5: case 8:
59       return is_int ? R2D_INT8 : R2D_UNORM8;
60    case 10: case 11:
61       return is_int ? R2D_INT16 : R2D_FLOAT16;
62    case 16:
63       if (util_format_is_float(format))
64          return R2D_FLOAT16;
65       return is_int ? R2D_INT16 : R2D_FLOAT32;
66    case 32:
67       return is_int ? R2D_INT32 : R2D_FLOAT32;
68    default:
69       unreachable("bad format");
70    }
71 }
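/* Editor's note: illustrative examples of the mapping above (not exhaustive):
 * PIPE_FORMAT_R8G8B8A8_UNORM has an 8-bit red channel       -> R2D_UNORM8
 * PIPE_FORMAT_R16G16B16A16_FLOAT is a 16-bit float format   -> R2D_FLOAT16
 * PIPE_FORMAT_R16G16B16A16_UINT is 16-bit pure integer      -> R2D_INT16
 * PIPE_FORMAT_R32G32B32A32_FLOAT has a 32-bit red channel   -> R2D_FLOAT32
 */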
72 
73 template <chip CHIP>
74 static struct tu_native_format
75 blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode, bool is_mutable, bool gmem)
76 {
77    struct tu_native_format fmt = tu6_format_texture(format, tile_mode, is_mutable);
78 
79    switch (format) {
80    case PIPE_FORMAT_Z24X8_UNORM:
81    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
82       /* Similar to fdl6_view_init, we want to use
83        * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
84        * src.  Since this is called when there is no image and thus no ubwc,
85        * we can always use FMT6_8_8_8_8_UNORM.
86        *
87        * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
88        * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
89        * instead.
90        */
91       fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
92       break;
93    default:
94       break;
95    }
96 
97    return fmt;
98 }
99 
100 static struct tu_native_format
101 blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
102 {
103    struct tu_native_format fmt = tu6_format_color(format, tile_mode, false);
104 
105    switch (format) {
106    case PIPE_FORMAT_Z24X8_UNORM:
107    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
108       /* similar to blit_format_texture but for blit dst */
109       fmt.fmt = FMT6_8_8_8_8_UNORM;
110       break;
111    default:
112       break;
113    }
114 
115    return fmt;
116 }
117 
118 template <chip CHIP>
119 static enum a6xx_format
120 blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
121 {
122    if (CHIP >= A7XX && gmem)
123       /* A7XX requires D24S8 in GMEM to always be treated as
124        * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of if the image
125        * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
126        * in misrendering around the edges of the destination image.
127        */
128       ubwc = true;
129 
130    if (ubwc) {
131       switch (format) {
132       case PIPE_FORMAT_Z24X8_UNORM:
133       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
134          /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
135          return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
136       default:
137          break;
138       }
139    }
140 
141    /* note: tu6_format_color doesn't care about tiling for .fmt field */
142    return blit_format_color(format, TILE6_LINEAR).fmt;
143 }
144 
145 static void
146 r2d_coords(struct tu_cmd_buffer *cmd,
147            struct tu_cs *cs,
148            const VkOffset2D dst,
149            const VkOffset2D src,
150            const VkExtent2D extent)
151 {
152    tu_cs_emit_regs(cs,
153       A6XX_GRAS_2D_DST_TL(.x = dst.x,                    .y = dst.y),
154       A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));
155 
156    if (src.x == blt_no_coord.x)
157       return;
158 
159    tu_cs_emit_regs(cs,
160                    A6XX_GRAS_2D_SRC_TL_X(src.x),
161                    A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
162                    A6XX_GRAS_2D_SRC_TL_Y(src.y),
163                    A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
164 }
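/* Editor's note, illustrative example of the register values above: copying a
 * 64x64 region from src (0, 0) to dst (16, 16) programs GRAS_2D_DST_TL =
 * (16, 16) and GRAS_2D_DST_BR = (79, 79), i.e. the bottom-right coordinates
 * are inclusive, while the SRC_TL/BR registers span (0, 0)..(63, 63).
 */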
165 
166 static void
167 r2d_clear_value(struct tu_cmd_buffer *cmd,
168                 struct tu_cs *cs,
169                 enum pipe_format format,
170                 const VkClearValue *val)
171 {
172    uint32_t clear_value[4] = {};
173 
174    switch (format) {
175    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
176    case PIPE_FORMAT_Z24X8_UNORM:
177       /* cleared as r8g8b8a8_unorm using special format */
178       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
179       clear_value[1] = clear_value[0] >> 8;
180       clear_value[2] = clear_value[0] >> 16;
181       clear_value[3] = val->depthStencil.stencil;
182       break;
183    case PIPE_FORMAT_Z16_UNORM:
184    case PIPE_FORMAT_Z32_FLOAT:
185       /* R2D_FLOAT32 */
186       clear_value[0] = fui(val->depthStencil.depth);
187       break;
188    case PIPE_FORMAT_S8_UINT:
189       clear_value[0] = val->depthStencil.stencil;
190       break;
191    case PIPE_FORMAT_R9G9B9E5_FLOAT:
192       /* cleared as UINT32 */
193       clear_value[0] = float3_to_rgb9e5(val->color.float32);
194       break;
195    default:
196       assert(!util_format_is_depth_or_stencil(format));
197       const struct util_format_description *desc = util_format_description(format);
198       enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
199 
200       assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
201              format == PIPE_FORMAT_R11G11B10_FLOAT);
202 
203       for (unsigned i = 0; i < 4; i++) {
204          if (desc->swizzle[i] > PIPE_SWIZZLE_W)
205             continue;
206 
207          const struct util_format_channel_description *ch =
208             &desc->channel[desc->swizzle[i]];
209          if (ifmt == R2D_UNORM8) {
210             float linear = val->color.float32[i];
211             if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
212                linear = util_format_linear_to_srgb_float(val->color.float32[i]);
213 
214             if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
215                clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
216             else
217                clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
218          } else if (ifmt == R2D_FLOAT16) {
219             clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
220          } else {
221             assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
222                    ifmt == R2D_INT16 || ifmt == R2D_INT8);
223             clear_value[i] = val->color.uint32[i];
224          }
225       }
226       break;
227    }
228 
229    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
230    tu_cs_emit_array(cs, clear_value, 4);
231 }
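/* Editor's note, worked example (illustrative): clearing D24S8 with depth =
 * 0.5f and stencil = 0xff gives tu_pack_float32_for_unorm(0.5f, 24) =
 * 0x800000, so clear_value[] = { 0x800000, 0x8000, 0x80, 0xff }; presumably
 * the low byte of each entry feeds the r/g/b/a channels of the special
 * r8g8b8a8 clear format mentioned above.
 */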
232 
233 static void
234 fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
235                  enum a6xx_format *fmt)
236 {
237    /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
238     * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
239     * We also have to move stencil, which is normally in the .w channel, into
240     * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
241     * problems, and avoids using a swap, which seems to sometimes not work
242     * with a D24S8 source, or a texture swizzle which is only supported with
243     * the 3d path. Sometimes this blit happens on already-constructed
244     * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
245     */
246    if (*src_format == PIPE_FORMAT_S8_UINT &&
247        (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
248         dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
249       *fmt = FMT6_A8_UNORM;
250       *src_format = PIPE_FORMAT_A8_UNORM;
251    }
252 }
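/* Editor's note, illustrative example: copying an S8_UINT image into the
 * stencil aspect of a D24S8 destination takes this path; reading the source
 * as A8_UNORM places the stencil byte in the .w/alpha channel, which is where
 * the Z24_UNORM_S8_UINT_AS_R8G8B8A8 destination keeps stencil.
 */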
253 
254 static void
255 fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
256                  enum a6xx_format *fmt)
257 {
258    if (*dst_format == PIPE_FORMAT_S8_UINT &&
259        (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
260         src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
261       *dst_format = PIPE_FORMAT_A8_UNORM;
262       *fmt = FMT6_A8_UNORM;
263    }
264 }
265 
266 template <chip CHIP>
267 static void
268 r2d_src(struct tu_cmd_buffer *cmd,
269         struct tu_cs *cs,
270         const struct fdl6_view *iview,
271         uint32_t layer,
272         VkFilter filter,
273         enum pipe_format dst_format)
274 {
275    uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
276    if (filter != VK_FILTER_NEAREST)
277       src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
278 
279    enum a6xx_format fmt = (enum a6xx_format)(
280       src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
281    enum pipe_format src_format = iview->format;
282    fixup_src_format(&src_format, dst_format, &fmt);
283 
284    src_info =
285       (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
286       A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);
287 
288    tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
289    tu_cs_emit(cs, src_info);
290    tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
291    tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);
292 
293    tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
294    tu_cs_image_flag_ref(cs, iview, layer);
295 }
296 
297 template <chip CHIP>
298 static void
299 r2d_src_depth(struct tu_cmd_buffer *cmd,
300                 struct tu_cs *cs,
301                 const struct tu_image_view *iview,
302                 uint32_t layer,
303                 VkFilter filter)
304 {
305    tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
306    tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
307    tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
308    tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
309    /* SP_PS_2D_SRC_PITCH has a shifted pitch field */
310    tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);
311 
312    tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
313    tu_cs_image_flag_ref(cs, &iview->view, layer);
314 }
315 
316 template <chip CHIP>
317 static void
318 r2d_src_stencil(struct tu_cmd_buffer *cmd,
319                 struct tu_cs *cs,
320                 const struct tu_image_view *iview,
321                 uint32_t layer,
322                 VkFilter filter)
323 {
324    tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
325    tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
326    tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
327    tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
328    tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
329 }
330 
331 template <chip CHIP>
332 static void
333 r2d_src_buffer(struct tu_cmd_buffer *cmd,
334                struct tu_cs *cs,
335                enum pipe_format format,
336                uint64_t va, uint32_t pitch,
337                uint32_t width, uint32_t height,
338                enum pipe_format dst_format)
339 {
340    struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
341    enum a6xx_format color_format = fmt.fmt;
342    fixup_src_format(&format, dst_format, &color_format);
343 
344    tu_cs_emit_regs(cs,
345                    SP_PS_2D_SRC_INFO(CHIP,
346                       .color_format = color_format,
347                       .color_swap = fmt.swap,
348                       .srgb = util_format_is_srgb(format),
349                       .unk20 = 1,
350                       .unk22 = 1),
351                    SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
352                    SP_PS_2D_SRC(CHIP, .qword = va),
353                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
354 }
355 
356 template <chip CHIP>
357 static void
358 r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
359                          struct tu_cs *cs,
360                          enum pipe_format format,
361                          uint64_t va,
362                          uint32_t pitch,
363                          uint32_t width,
364                          uint32_t height,
365                          enum pipe_format dst_format)
366 {
367    /* This functionality is only allowed on A7XX; this assertion statically
368     * disallows calling this function on prior generations by mistake.
369     */
370    static_assert(CHIP >= A7XX);
371 
372    struct tu_native_format fmt =
373       blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
374    enum a6xx_format color_format = fmt.fmt;
375    fixup_src_format(&format, dst_format, &color_format);
376 
377    uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
378    va &= ~0x3f;
379    tu_cs_emit_regs(cs,
380                    A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
381                                          .start_offset_texels = offset_texels,
382                                          .type = A6XX_TEX_IMG_BUFFER));
383 
384    tu_cs_emit_regs(cs,
385                    SP_PS_2D_SRC_INFO(CHIP, .color_format = color_format,
386                                      .color_swap = fmt.swap,
387                                      .srgb = util_format_is_srgb(format),
388                                      .unk20 = 1, .unk22 = 1),
389                    SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
390                    SP_PS_2D_SRC(CHIP, .qword = va),
391                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
392 }
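/* Editor's note, illustrative example: with a 4-byte-per-texel format and
 * va = base + 0x28, the code above rounds va down to the 64-byte boundary and
 * programs start_offset_texels = 0x28 / 4 = 10, presumably so the 2D engine
 * still begins reading at the original, unaligned texel.
 */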
393 
394 template <chip CHIP>
395 static void
396 r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
397         enum pipe_format src_format)
398 {
399    uint32_t dst_info = iview->RB_2D_DST_INFO;
400    enum a6xx_format fmt =
401       (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
402    enum pipe_format dst_format = iview->format;
403    fixup_dst_format(src_format, &dst_format, &fmt);
404 
405    dst_info =
406          (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
407    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
408    tu_cs_emit(cs, dst_info);
409    tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);
410 
411    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
412    tu_cs_image_flag_ref(cs, iview, layer);
413 }
414 
415 static void
416 r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
417 {
418    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
419    tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
420    tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
421    tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);
422 
423    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
424    tu_cs_image_flag_ref(cs, &iview->view, layer);
425 }
426 
427 static void
428 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
429 {
430    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
431    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
432    tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
433    tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
434 }
435 
436 static void
437 r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
438                enum pipe_format src_format)
439 {
440    struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
441    enum a6xx_format color_fmt = fmt.fmt;
442    fixup_dst_format(src_format, &format, &color_fmt);
443    fmt.fmt = color_fmt;
444 
445    tu_cs_emit_regs(cs,
446                    A6XX_RB_2D_DST_INFO(
447                       .color_format = fmt.fmt,
448                       .color_swap = fmt.swap,
449                       .srgb = util_format_is_srgb(format)),
450                    A6XX_RB_2D_DST(.qword = va),
451                    A6XX_RB_2D_DST_PITCH(pitch));
452 }
453 
454 template <chip CHIP>
455 static void
456 r2d_setup_common(struct tu_cmd_buffer *cmd,
457                  struct tu_cs *cs,
458                  enum pipe_format src_format,
459                  enum pipe_format dst_format,
460                  VkImageAspectFlags aspect_mask,
461                  unsigned blit_param,
462                  bool clear,
463                  bool ubwc,
464                  bool scissor)
465 {
466    if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
467       tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
468    }
469 
470    enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
471    fixup_dst_format(src_format, &dst_format, &fmt);
472    enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);
473 
474    uint32_t unknown_8c01 = 0;
475 
476    /* note: the only format with partial clearing is D24S8 */
477    if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
478       /* preserve stencil channel */
479       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
480          unknown_8c01 = 0x08000041;
481       /* preserve depth channels */
482       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
483          unknown_8c01 = 0x00084001;
484    }
485 
486    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
487    tu_cs_emit(cs, unknown_8c01);    // TODO: seems to always be 0 on A7XX
488 
489    uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
490          .rotate = (enum a6xx_rotation) blit_param,
491          .solid_color = clear,
492          .color_format = fmt,
493          .scissor = scissor,
494          .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
495          .mask = 0xf,
496          .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
497       ).value;
498 
499    tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
500    tu_cs_emit(cs, blit_cntl);
501 
502    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
503    tu_cs_emit(cs, blit_cntl);
504 
505    if (CHIP > A6XX) {
506       tu_cs_emit_regs(cs, A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
507                                                 .start_offset_texels = 0,
508                                                 .type = A6XX_TEX_2D));
509    }
510 
511    if (fmt == FMT6_10_10_10_2_UNORM_DEST)
512       fmt = FMT6_16_16_16_16_FLOAT;
513 
514    tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
515          .sint = util_format_is_pure_sint(dst_format),
516          .uint = util_format_is_pure_uint(dst_format),
517          .color_format = fmt,
518          .srgb = util_format_is_srgb(dst_format),
519          .mask = 0xf));
520 }
521 
522 template <chip CHIP>
523 static void
524 r2d_setup(struct tu_cmd_buffer *cmd,
525           struct tu_cs *cs,
526           enum pipe_format src_format,
527           enum pipe_format dst_format,
528           VkImageAspectFlags aspect_mask,
529           unsigned blit_param,
530           bool clear,
531           bool ubwc,
532           VkSampleCountFlagBits samples)
533 {
534    assert(samples == VK_SAMPLE_COUNT_1_BIT);
535 
536    if (!cmd->state.pass) {
537       tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
538    }
539 
540    r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
541 }
542 
543 static void
544 r2d_teardown(struct tu_cmd_buffer *cmd,
545              struct tu_cs *cs)
546 {
547    /* nothing to do here */
548 }
549 
550 static void
551 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
552 {
553    if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
554        cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
555       /* This is a non-context register, so we have to WFI before changing it. */
556       tu_cs_emit_wfi(cs);
557       tu_cs_emit_write_reg(
558          cs, REG_A6XX_RB_DBG_ECO_CNTL,
559          cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
560    }
561 
562    tu_cs_emit_pkt7(cs, CP_BLIT, 1);
563    tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
564 
565    if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
566        cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
567       tu_cs_emit_wfi(cs);
568       tu_cs_emit_write_reg(
569          cs, REG_A6XX_RB_DBG_ECO_CNTL,
570          cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
571    }
572 }
573 
574 /* r3d_ = shader path operations */
575 
576 static nir_def *
577 load_const(nir_builder *b, unsigned base, unsigned components)
578 {
579    return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
580                              .base = base);
581 }
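/* Editor's note: the .base offset appears to be in scalar (dword) components,
 * four per vec4. For example, build_clear_vs_shader's load_const(b, 2, 1)
 * reads c0.z (matching the "c0.z is used to clear depth" note below), and
 * build_blit_vs_shader's load_const(b, 16, 1) reads c4.x, matching the
 * vec4-granular upload done by r3d_coord_z further down.
 */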
582 
583 static nir_shader *
584 build_blit_vs_shader(void)
585 {
586    nir_builder _b =
587       nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
588    nir_builder *b = &_b;
589    b->shader->info.internal = true;
590 
591    nir_variable *out_pos =
592       nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
593                           "gl_Position");
594    out_pos->data.location = VARYING_SLOT_POS;
595 
596    nir_def *vert0_pos = load_const(b, 0, 2);
597    nir_def *vert1_pos = load_const(b, 4, 2);
598    nir_def *vertex = nir_load_vertex_id(b);
599 
600    nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
601    pos = nir_vec4(b, nir_channel(b, pos, 0),
602                      nir_channel(b, pos, 1),
603                      nir_imm_float(b, 0.0),
604                      nir_imm_float(b, 1.0));
605 
606    nir_store_var(b, out_pos, pos, 0xf);
607 
608    nir_variable *out_coords =
609       nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
610                           "coords");
611    out_coords->data.location = VARYING_SLOT_VAR0;
612 
613    nir_def *vert0_coords = load_const(b, 2, 2);
614    nir_def *vert1_coords = load_const(b, 6, 2);
615 
616    /* Only used with "z scale" blit path which uses a 3d texture */
617    nir_def *z_coord = load_const(b, 16, 1);
618 
619    nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
620    coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
621                      z_coord);
622 
623    nir_store_var(b, out_coords, coords, 0x7);
624 
625    return b->shader;
626 }
627 
628 static nir_shader *
629 build_clear_vs_shader(void)
630 {
631    nir_builder _b =
632       nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
633    nir_builder *b = &_b;
634    b->shader->info.internal = true;
635 
636    nir_variable *out_pos =
637       nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
638                           "gl_Position");
639    out_pos->data.location = VARYING_SLOT_POS;
640 
641    nir_def *vert0_pos = load_const(b, 0, 2);
642    nir_def *vert1_pos = load_const(b, 4, 2);
643    /* c0.z is used to clear depth */
644    nir_def *depth = load_const(b, 2, 1);
645    nir_def *vertex = nir_load_vertex_id(b);
646 
647    nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
648    pos = nir_vec4(b, nir_channel(b, pos, 0),
649                      nir_channel(b, pos, 1),
650                      depth, nir_imm_float(b, 1.0));
651 
652    nir_store_var(b, out_pos, pos, 0xf);
653 
654    nir_variable *out_layer =
655       nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
656                           "gl_Layer");
657    out_layer->data.location = VARYING_SLOT_LAYER;
658    nir_def *layer = load_const(b, 3, 1);
659    nir_store_var(b, out_layer, layer, 1);
660 
661    return b->shader;
662 }
663 
664 static nir_shader *
665 build_blit_fs_shader(bool zscale)
666 {
667    nir_builder _b =
668       nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
669                                      zscale ? "zscale blit fs" : "blit fs");
670    nir_builder *b = &_b;
671    b->shader->info.internal = true;
672 
673    nir_variable *out_color =
674       nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
675                           "color0");
676    out_color->data.location = FRAG_RESULT_DATA0;
677 
678    unsigned coord_components = zscale ? 3 : 2;
679    nir_variable *in_coords =
680       nir_variable_create(b->shader, nir_var_shader_in,
681                           glsl_vec_type(coord_components),
682                           "coords");
683    in_coords->data.location = VARYING_SLOT_VAR0;
684 
685    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
686    /* Note: since we're just copying data, we rely on the HW ignoring the
687     * dest_type.
688     */
689    tex->dest_type = nir_type_int32;
690    tex->is_array = false;
691    tex->is_shadow = false;
692    tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
693 
694    tex->texture_index = 0;
695    tex->sampler_index = 0;
696 
697    b->shader->info.num_textures = 1;
698    BITSET_SET(b->shader->info.textures_used, 0);
699 
700    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
701                                      nir_load_var(b, in_coords));
702    tex->coord_components = coord_components;
703 
704    nir_def_init(&tex->instr, &tex->def, 4, 32);
705    nir_builder_instr_insert(b, &tex->instr);
706 
707    nir_store_var(b, out_color, &tex->def, 0xf);
708 
709    return b->shader;
710 }
711 
712 /* We can only read multisample textures via txf_ms, so we need a separate
713  * variant for them.
714  */
715 static nir_shader *
716 build_ms_copy_fs_shader(bool half_float)
717 {
718    nir_builder _b =
719       nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
720                                      "multisample copy fs");
721    nir_builder *b = &_b;
722    b->shader->info.internal = true;
723 
724    nir_variable *out_color =
725       nir_variable_create(b->shader, nir_var_shader_out,
726                           half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
727                           "color0");
728    out_color->data.location = FRAG_RESULT_DATA0;
729 
730    nir_variable *in_coords =
731       nir_variable_create(b->shader, nir_var_shader_in,
732                           glsl_vec_type(2),
733                           "coords");
734    in_coords->data.location = VARYING_SLOT_VAR0;
735 
736    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
737 
738    tex->op = nir_texop_txf_ms;
739 
740    /* Note: since we're just copying data, we rely on the HW ignoring the
741     * dest_type.
742     */
743    tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
744    tex->is_array = false;
745    tex->is_shadow = false;
746    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
747 
748    tex->texture_index = 0;
749    tex->sampler_index = 0;
750 
751    b->shader->info.num_textures = 1;
752    BITSET_SET(b->shader->info.textures_used, 0);
753    BITSET_SET(b->shader->info.textures_used_by_txf, 0);
754 
755    nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));
756 
757    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
758    tex->coord_components = 2;
759 
760    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
761                                      nir_load_sample_id(b));
762 
763    nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
764    nir_builder_instr_insert(b, &tex->instr);
765 
766    nir_store_var(b, out_color, &tex->def, 0xf);
767 
768    return b->shader;
769 }
770 
771 static nir_shader *
772 build_clear_fs_shader(unsigned mrts)
773 {
774    nir_builder _b =
775       nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
776                                      "mrt%u clear fs", mrts);
777    nir_builder *b = &_b;
778    b->shader->info.internal = true;
779 
780    for (unsigned i = 0; i < mrts; i++) {
781       nir_variable *out_color =
782          nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
783                              "color");
784       out_color->data.location = FRAG_RESULT_DATA0 + i;
785 
786       nir_def *color = load_const(b, 4 * i, 4);
787       nir_store_var(b, out_color, color, 0xf);
788    }
789 
790    return b->shader;
791 }
792 
793 static void
794 compile_shader(struct tu_device *dev, struct nir_shader *nir,
795                unsigned consts, unsigned *offset, enum global_shader idx)
796 {
797    nir->options = ir3_get_compiler_options(dev->compiler);
798 
799    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
800    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
801 
802    struct ir3_const_allocations const_allocs = {};
803    if (consts > 0)
804       ir3_const_alloc(&const_allocs, IR3_CONST_ALLOC_UBO_RANGES, align(consts, 8), 1);
805 
806    const struct ir3_shader_options options = {
807       .api_wavesize = IR3_SINGLE_OR_DOUBLE,
808       .real_wavesize = IR3_SINGLE_OR_DOUBLE,
809       .const_allocs = const_allocs,
810    };
811 
812    ir3_finalize_nir(dev->compiler, &options.nir_options, nir);
813 
814    struct ir3_shader *sh =
815       ir3_shader_from_nir(dev->compiler, nir, &options, NULL);
816 
817    struct ir3_shader_key key = {};
818    bool created;
819    struct ir3_shader_variant *so =
820       ir3_shader_get_variant(sh, &key, false, false, &created);
821 
822    struct tu6_global *global = dev->global_bo_map;
823 
824    assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
825    dev->global_shaders[idx] = sh;
826    dev->global_shader_variants[idx] = so;
827    memcpy(&global->shaders[*offset], so->bin,
828           sizeof(uint32_t) * so->info.sizedwords);
829    dev->global_shader_va[idx] = dev->global_bo->iova +
830       offsetof_arr(struct tu6_global, shaders, *offset);
831    *offset += align(so->info.sizedwords, 32);
832 }
833 
834 void
835 tu_init_clear_blit_shaders(struct tu_device *dev)
836 {
837    unsigned offset = 0;
838    compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
839    compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
840    compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
841    compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
842    compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
843    compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);
844 
845    for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
846       compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
847                      (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
848    }
849 }
850 
851 void
852 tu_destroy_clear_blit_shaders(struct tu_device *dev)
853 {
854    for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
855       if (dev->global_shaders[i])
856          ir3_shader_destroy(dev->global_shaders[i]);
857    }
858 }
859 
860 enum r3d_type {
861    R3D_CLEAR,
862    R3D_BLIT,
863    R3D_COPY_HALF,
864 };
865 
866 template <chip CHIP>
867 static void
868 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
869            uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
870 {
871    enum global_shader vs_id =
872       type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;
873 
874    struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
875    uint64_t vs_iova = cmd->device->global_shader_va[vs_id];
876 
877    enum global_shader fs_id = GLOBAL_SH_FS_BLIT;
878 
879    if (z_scale) {
880       fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
881    } else if (type == R3D_COPY_HALF) {
882       /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
883        *
884        * TODO: Add a half-float blit shader that uses texture() but with half
885        * registers to avoid NaN canonicalization for the single-sampled case.
886        */
887       fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
888    } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
889       fs_id = GLOBAL_SH_FS_COPY_MS;
890    }
891 
892    unsigned num_rts = util_bitcount(rts_mask);
893    if (type == R3D_CLEAR)
894       fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);
895 
896    struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
897    uint64_t fs_iova = cmd->device->global_shader_va[fs_id];
898 
899    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
900          .vs_state = true,
901          .hs_state = true,
902          .ds_state = true,
903          .gs_state = true,
904          .fs_state = true,
905          .gfx_ibo = true,
906          .gfx_shared_const = true,
907          .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
908          .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
909 
910    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
911    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
912    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
913    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
914    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);
915 
916    struct tu_pvtmem_config pvtmem = {};
917    tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
918    tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
919 
920    tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
921    if (CHIP == A7XX) {
922       tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
923    }
924 
925    tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);
926 
927    if (CHIP >= A7XX) {
928       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
929 
930       tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
931    }
932 
933    /* REPL_MODE for varying with RECTLIST (2 vertices only) */
934    tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
935    tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
936 
937    tu6_emit_vs<CHIP>(cs, vs, 0);
938    tu6_emit_hs<CHIP>(cs, NULL);
939    tu6_emit_ds<CHIP>(cs, NULL);
940    tu6_emit_gs<CHIP>(cs, NULL);
941    tu6_emit_fs<CHIP>(cs, fs);
942 
943    tu_cs_emit_regs(cs,
944                    A6XX_GRAS_CL_CNTL(
945                       .clip_disable = 1,
946                       .vp_clip_code_ignore = 1,
947                       .vp_xform_disable = 1,
948                       .persp_division_disable = 1,));
949    tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
950 
951    tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
952    if (CHIP == A6XX) {
953       tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
954    } else {
955       tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
956    }
957 
958    tu_cs_emit_regs(cs,
959                    A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
960                    A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
961    tu_cs_emit_regs(cs,
962                    A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
963                    A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
964 
965    tu_cs_emit_regs(cs,
966                    A6XX_VFD_INDEX_OFFSET(),
967                    A6XX_VFD_INSTANCE_START_OFFSET());
968 
969    if (rts_mask) {
970       unsigned rts_count = util_last_bit(rts_mask);
971       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
972       unsigned rt = 0;
973       for (unsigned i = 0; i < rts_count; i++) {
974          unsigned regid = 0;
975          if (rts_mask & (1u << i))
976             regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
977          tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
978                         COND(regid & HALF_REG_ID,
979                              A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
980       }
981    }
982 
983    tu6_emit_msaa(cs, samples, false);
984 }
985 
986 static void
987 tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
988                           struct tu_cs *cs,
989                           uint32_t opcode,
990                           enum a6xx_state_block block,
991                           uint32_t offset,
992                           const void *consts,
993                           uint32_t size_vec4)
994 {
995    assert(offset % cmd->device->compiler->const_upload_unit == 0);
996 
997    struct tu_cs_memory mem = {};
998    VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
999    if (result != VK_SUCCESS) {
1000       vk_command_buffer_set_error(&cmd->vk, result);
1001       return;
1002    }
1003 
1004    memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));
1005 
1006    tu_cs_emit_pkt7(cs, opcode, 3);
1007    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
1008                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1009                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1010                   CP_LOAD_STATE6_0_STATE_BLOCK(block) |
1011                   CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
1012    tu_cs_emit_qw(cs, mem.iova);
1013 }
1014 
1015 static void
1016 r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
1017 {
1018    tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
1019 }
1020 
1021 /* z coordinate for "z scale" blit path which uses a 3d texture */
1022 static void
1023 r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
1024 {
1025    const uint32_t coord[] = {
1026       fui(z),
1027       0,
1028       0,
1029       0,
1030    };
1031 
1032    tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
1033 }
1034 
1035 static void
1036 r3d_coords(struct tu_cmd_buffer *cmd,
1037            struct tu_cs *cs,
1038            const VkOffset2D dst,
1039            const VkOffset2D src,
1040            const VkExtent2D extent)
1041 {
1042    const bool no_src = src.x != blt_no_coord.x;
1043    int32_t src_x1 = no_src ? src.x : 0;
1044    int32_t src_y1 = no_src ? src.y : 0;
1045 
1046    const float coords[] = {
1047       dst.x,
1048       dst.y,
1049       src_x1,
1050       src_y1,
1051       dst.x + extent.width,
1052       dst.y + extent.height,
1053       src_x1 + extent.width,
1054       src_y1 + extent.height,
1055    };
1056    r3d_coords_raw(cmd, cs, coords);
1057 }
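/* Editor's note, illustrative mapping of the upload above: the eight floats
 * land in VS consts c0.xyzw = (dst.x, dst.y, src.x, src.y) and c1.xyzw =
 * (dst.x + w, dst.y + h, src.x + w, src.y + h), which is what
 * build_blit_vs_shader reads via load_const at bases 0/2 and 4/6 for the two
 * RECTLIST vertices.
 */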
1058 
1059 static void
1060 r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
1061 {
1062    uint32_t coords[4] = {};
1063 
1064    switch (format) {
1065    case PIPE_FORMAT_Z24X8_UNORM:
1066    case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
1067       /* cleared as r8g8b8a8_unorm using special format */
1068       uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
1069       coords[0] = fui((tmp & 0xff) / 255.0f);
1070       coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
1071       coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
1072       coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
1073    } break;
1074    case PIPE_FORMAT_Z16_UNORM:
1075    case PIPE_FORMAT_Z32_FLOAT:
1076       coords[0] = fui(val->depthStencil.depth);
1077       coords[1] = 0;
1078       coords[2] = 0;
1079       coords[3] = 0;
1080       break;
1081    case PIPE_FORMAT_S8_UINT:
1082       coords[0] = val->depthStencil.stencil & 0xff;
1083       coords[1] = 0;
1084       coords[2] = 0;
1085       coords[3] = 0;
1086       break;
1087    default:
1088       /* for color formats, use the clear value as-is */
1089       assert(!util_format_is_depth_or_stencil(format));
1090       memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
1091       break;
1092    }
1093 
1094    tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
1095 }
1096 
1097 static void
1098 r3d_src_common(struct tu_cmd_buffer *cmd,
1099                struct tu_cs *cs,
1100                const uint32_t *tex_const,
1101                uint32_t offset_base,
1102                uint32_t offset_ubwc,
1103                VkFilter filter)
1104 {
1105    struct tu_cs_memory texture = { };
1106    VkResult result = tu_cs_alloc(&cmd->sub_cs,
1107                                  2, /* allocate space for a sampler too */
1108                                  A6XX_TEX_CONST_DWORDS, &texture);
1109    if (result != VK_SUCCESS) {
1110       vk_command_buffer_set_error(&cmd->vk, result);
1111       return;
1112    }
1113 
1114    memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
1115 
1116    /* patch addresses for layer offset */
1117    *(uint64_t*) (texture.map + 4) += offset_base;
1118    uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
1119    texture.map[7] = ubwc_addr;
1120    texture.map[8] = ubwc_addr >> 32;
1121 
1122    texture.map[A6XX_TEX_CONST_DWORDS + 0] =
1123       A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
1124       A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
1125       A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
1126       A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
1127       A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
1128       0x60000; /* XXX used by blob, doesn't seem necessary */
1129    texture.map[A6XX_TEX_CONST_DWORDS + 1] =
1130       A6XX_TEX_SAMP_1_UNNORM_COORDS |
1131       A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
1132    texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
1133    texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
1134 
1135    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
1136    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1137                CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1138                CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1139                CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1140                CP_LOAD_STATE6_0_NUM_UNIT(1));
1141    tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
1142 
1143    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
1144 
1145    tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
1146    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1147       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1148       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1149       CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1150       CP_LOAD_STATE6_0_NUM_UNIT(1));
1151    tu_cs_emit_qw(cs, texture.iova);
1152 
1153    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
1154    tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
1155 }
1156 
1157 static void
1158 r3d_src(struct tu_cmd_buffer *cmd,
1159         struct tu_cs *cs,
1160         const struct fdl6_view *iview,
1161         uint32_t layer,
1162         VkFilter filter,
1163         enum pipe_format dst_format)
1164 {
1165    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1166    memcpy(desc, iview->descriptor, sizeof(desc));
1167 
1168    enum a6xx_format fmt = (enum a6xx_format)(
1169       (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
1170    enum pipe_format src_format = iview->format;
1171    fixup_src_format(&src_format, dst_format, &fmt);
1172    desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
1173       A6XX_TEX_CONST_0_FMT(fmt);
1174 
1175    r3d_src_common(cmd, cs, desc,
1176                   iview->layer_size * layer,
1177                   iview->ubwc_layer_size * layer,
1178                   filter);
1179 }
1180 
1181 template <chip CHIP>
1182 static void
1183 r3d_src_buffer(struct tu_cmd_buffer *cmd,
1184                struct tu_cs *cs,
1185                enum pipe_format format,
1186                uint64_t va, uint32_t pitch,
1187                uint32_t width, uint32_t height,
1188                enum pipe_format dst_format)
1189 {
1190    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1191 
1192    struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
1193    enum a6xx_format color_format = fmt.fmt;
1194    fixup_src_format(&format, dst_format, &color_format);
1195 
1196    desc[0] =
1197       COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
1198       A6XX_TEX_CONST_0_FMT(color_format) |
1199       A6XX_TEX_CONST_0_SWAP(fmt.swap) |
1200       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1201       A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1202       A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1203       A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1204    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1205    desc[2] =
1206       A6XX_TEX_CONST_2_PITCH(pitch) |
1207       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1208    desc[3] = 0;
1209    desc[4] = va;
1210    desc[5] = va >> 32;
1211    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1212       desc[i] = 0;
1213 
1214    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1215 }
1216 
1217 static void
1218 r3d_src_depth(struct tu_cmd_buffer *cmd,
1219               struct tu_cs *cs,
1220               const struct tu_image_view *iview,
1221               uint32_t layer)
1222 {
1223    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1224 
1225    memcpy(desc, iview->view.descriptor, sizeof(desc));
1226    uint64_t va = iview->depth_base_addr;
1227 
1228    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1229                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1230                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1231                 A6XX_TEX_CONST_0_SWAP__MASK);
1232    desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
1233               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1234               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1235               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1236               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1237    desc[2] =
1238       A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
1239       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1240    desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
1241       (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
1242    desc[4] = va;
1243    desc[5] = va >> 32;
1244 
1245    r3d_src_common(cmd, cs, desc,
1246                   iview->depth_layer_size * layer,
1247                   iview->view.ubwc_layer_size * layer,
1248                   VK_FILTER_NEAREST);
1249 }
1250 
1251 static void
1252 r3d_src_stencil(struct tu_cmd_buffer *cmd,
1253                 struct tu_cs *cs,
1254                 const struct tu_image_view *iview,
1255                 uint32_t layer)
1256 {
1257    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1258 
1259    memcpy(desc, iview->view.descriptor, sizeof(desc));
1260    uint64_t va = iview->stencil_base_addr;
1261 
1262    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1263                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1264                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1265                 A6XX_TEX_CONST_0_SWAP__MASK);
1266    desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
1267               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1268               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1269               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1270               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1271    desc[2] =
1272       A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
1273       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1274    desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
1275    desc[4] = va;
1276    desc[5] = va >> 32;
1277    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1278       desc[i] = 0;
1279 
1280    r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
1281                   VK_FILTER_NEAREST);
1282 }
1283 
1284 static void
1285 r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
1286                   struct tu_cs *cs,
1287                   const struct tu_image_view *iview,
1288                   uint32_t layer)
1289 {
1290    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1291 
1292    memcpy(desc, iview->view.descriptor, sizeof(desc));
1293 
1294    /* Fixup D24 formats because we always load both depth and stencil. */
1295    enum pipe_format format = iview->view.format;
1296    if (format == PIPE_FORMAT_X24S8_UINT ||
1297        format == PIPE_FORMAT_Z24X8_UNORM ||
1298        format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1299       desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
1300       if (iview->view.ubwc_enabled)
1301          desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
1302       else
1303          desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
1304    }
1305 
1306    /* When loading/storing GMEM we always load the full image and don't do any
1307     * swizzling or swapping; that's done in the draw when reading/writing
1308     * GMEM, so we need to fix up the swizzle and swap here.
1309     */
1310    desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1311                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1312                 A6XX_TEX_CONST_0_SWAP__MASK);
1313    desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1314               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1315               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1316               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1317 
1318    r3d_src_common(cmd, cs, desc,
1319                   iview->view.layer_size * layer,
1320                   iview->view.ubwc_layer_size * layer,
1321                   VK_FILTER_NEAREST);
1322 }
1323 
1324 template <chip CHIP>
1325 static void
1326 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1327              struct tu_cs *cs,
1328              const struct tu_image_view *iview,
1329              enum pipe_format format,
1330              enum pipe_format dst_format,
1331              uint32_t gmem_offset,
1332              uint32_t cpp)
1333 {
1334    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1335    memcpy(desc, iview->view.descriptor, sizeof(desc));
1336 
1337    enum a6xx_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, true).fmt;
1338    fixup_src_format(&format, dst_format, &fmt);
1339 
1340    /* patch the format so that depth/stencil get the right format and swizzle */
1341    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1342                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1343                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1344    desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1345                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1346                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1347                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1348                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1349 
1350    /* patched for gmem */
1351    desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1352    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1353    desc[2] =
1354       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1355       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1356    desc[3] = 0;
1357    desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1358    desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1359    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1360       desc[i] = 0;
1361 
1362    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1363 }
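/* In r3d_src_gmem() above the texture descriptor is rewritten so the shader
 * path reads GMEM directly: the format and swizzle are replaced, the swap is
 * cleared, the tile mode becomes TILE6_2, the pitch is the GMEM tile width
 * times the attachment cpp (e.g. a 96-pixel-wide tile with cpp = 4 would give
 * a 384-byte pitch; numbers illustrative only), and the base address is
 * gmem_base + gmem_offset with a depth of 1.
 */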
1364 
1365 template <chip CHIP>
1366 static void
1367 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1368         enum pipe_format src_format)
1369 {
1370    uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1371 
1372    enum a6xx_format fmt = (enum a6xx_format)(
1373       mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1374    enum pipe_format dst_format = iview->format;
1375    fixup_dst_format(src_format, &dst_format, &fmt);
1376    mrt_buf_info =
1377       (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1378       A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1379 
1380    tu_cs_emit_regs(cs,
1381       RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
1382       A6XX_RB_MRT_PITCH(0, iview->pitch),
1383       A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
1384       A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
1385       A6XX_RB_MRT_BASE_GMEM(0),
1386    );
1387 
1388    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1389    tu_cs_image_flag_ref(cs, iview, layer);
1390 
1391    /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1392     * FMT6_NV12_Y.
1393     */
1394    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1395 
1396    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
1397    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1398 }
1399 
1400 template <chip CHIP>
1401 static void
1402 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1403 {
1404    tu_cs_emit_regs(cs,
1405       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
1406       A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
1407       A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
1408       A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
1409       A6XX_RB_MRT_BASE_GMEM(0),
1410    );
1411 
1412    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1413    tu_cs_image_flag_ref(cs, &iview->view, layer);
1414 
1415    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
1416    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1417 }
1418 
1419 template <chip CHIP>
1420 static void
1421 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1422 {
1423    tu_cs_emit_regs(cs,
1424       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
1425       A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
1426       A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
1427       A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
1428       A6XX_RB_MRT_BASE_GMEM(0),
1429    );
1430 
1431    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1432    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1433 }
1434 
1435 template <chip CHIP>
1436 static void
1437 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1438                enum pipe_format src_format)
1439 {
1440    struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
1441 
1442    enum a6xx_format color_fmt = fmt.fmt;
1443    fixup_dst_format(src_format, &format, &color_fmt);
1444 
1445    tu_cs_emit_regs(cs,
1446                    RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
1447                    A6XX_RB_MRT_PITCH(0, pitch),
1448                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1449                    A6XX_RB_MRT_BASE(0, .qword = va),
1450                    A6XX_RB_MRT_BASE_GMEM(0, 0));
1451 
1452    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1453    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1454 }
1455 
1456 template <chip CHIP>
1457 static void
1458 r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1459              const struct tu_image_view *iview,
1460              const struct tu_render_pass_attachment *att,
1461              bool separate_stencil, unsigned layer)
1462 {
1463    unsigned RB_MRT_BUF_INFO;
1464    unsigned gmem_offset;
1465 
1466    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1467       if (!separate_stencil) {
1468          RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
1469          gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1470       } else {
1471          RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
1472          gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
1473       }
1474    } else {
1475       RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
1476       gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1477    }
1478 
1479    tu_cs_emit_regs(cs,
1480                    RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
1481                    A6XX_RB_MRT_PITCH(0, 0),
1482                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1483                    A6XX_RB_MRT_BASE(0, 0),
1484                    A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));
1485 
1486    enum a6xx_format color_format =
1487       (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1488    tu_cs_emit_regs(cs,
1489                    A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));
1490 
1491    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1492    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1493 }
1494 
1495 static uint8_t
1496 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1497 {
1498    uint8_t mask = 0xf;
1499    assert(aspect_mask);
1500    /* note: the only format with partial writes is D24S8;
1501     * clear/blit uses the _AS_R8G8B8A8 format to access it
1502     */
1503    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1504       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1505          mask = 0x7;
1506       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1507          mask = 0x8;
1508    }
1509    return mask;
1510 }
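/* Worked example for the masks above (illustrative): clearing only the depth
 * aspect of a Z24_UNORM_S8_UINT attachment goes through the _AS_R8G8B8A8
 * view, where the 24 depth bits land in R/G/B and the stencil byte in A.
 * aspect_write_mask() therefore returns 0x7 (write R, G, B only) so the
 * stencil byte is preserved, and a stencil-only clear returns 0x8 (A only).
 * Any other format, or both aspects together, keeps the full 0xf mask.
 */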
1511 
1512 static uint8_t
1513 aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
1514 {
1515    uint8_t mask = 0xf;
1516    assert(aspect_mask);
1517    /* note: the only format with partial writes is D24S8 */
1518    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1519       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1520          mask = 0x1;
1521       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1522          mask = 0x2;
1523       if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
1524          mask = 0x3;
1525    }
1526    return mask;
1527 }
1528 
1529 enum r3d_blit_param {
1530    R3D_Z_SCALE = 1 << 0,
1531    R3D_DST_GMEM = 1 << 1,
1532    R3D_COPY = 1 << 2,
1533 };
1534 
1535 template <chip CHIP>
1536 static void
1537 r3d_setup(struct tu_cmd_buffer *cmd,
1538           struct tu_cs *cs,
1539           enum pipe_format src_format,
1540           enum pipe_format dst_format,
1541           VkImageAspectFlags aspect_mask,
1542           unsigned blit_param,
1543           bool clear,
1544           bool ubwc,
1545           VkSampleCountFlagBits samples)
1546 {
1547    if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1548       tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1549    }
1550 
1551    enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1552    fixup_dst_format(src_format, &dst_format, &fmt);
1553 
1554    if (!cmd->state.pass) {
1555       tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1556       tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1557    }
1558 
1559    if (!(blit_param & R3D_DST_GMEM)) {
1560       if (CHIP == A6XX) {
1561          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1562       } else {
1563          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1564       }
1565 
1566       tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1567 
1568       if (CHIP >= A7XX) {
1569          tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1570          tu_cs_emit_regs(cs,
1571             A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1572       }
1573    }
1574 
1575    enum r3d_type type;
1576    if (clear) {
1577       type = R3D_CLEAR;
1578    } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1579       /* Avoid canonicalizing NaNs in copies by using the special half-float
1580        * path that uses half regs.
1581        */
1582       type = R3D_COPY_HALF;
1583    } else {
1584       type = R3D_BLIT;
1585    }
1586 
1587    r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1588 
1589    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1590    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1591    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1592    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1593 
1594    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1595    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1596    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1597    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1598    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1599    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1600    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1601    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1602    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1603 
1604    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1605                         .color_format = fmt,
1606                         .color_sint = util_format_is_pure_sint(dst_format),
1607                         .color_uint = util_format_is_pure_uint(dst_format)));
1608 
1609    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1610       .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1611    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1612    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1613 
1614    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1615    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1616 
1617    if (CHIP >= A7XX) {
1618       tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1619 
1620       tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
1621       tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
1622       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
1623    }
1624 
1625    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1626                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1627 
1628    /* Disable sample counting in order to not affect occlusion query. */
1629    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1630 
1631    tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1632    if (CHIP >= A7XX) {
1633       tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1634    }
1635 
1636    if (cmd->state.prim_generated_query_running_before_rp) {
1637       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1638    }
1639 
1640    if (cmd->state.predication_active) {
1641       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1642       tu_cs_emit(cs, 0);
1643    }
1644 }
1645 
1646 static void
1647 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1648 {
1649    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1650    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1651                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1652                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1653    tu_cs_emit(cs, 1); /* instance count */
1654    tu_cs_emit(cs, 2); /* vertex count */
1655 }
1656 
1657 static void
1658 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1659 {
1660    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1661    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1662                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1663                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1664    tu_cs_emit(cs, 1); /* instance count */
1665    tu_cs_emit(cs, 2); /* vertex count */
1666 }
1667 
1668 template <chip CHIP>
1669 static void
1670 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1671 {
1672    if (cmd->state.predication_active) {
1673       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1674       tu_cs_emit(cs, 1);
1675    }
1676 
1677    /* Re-enable sample counting. */
1678    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1679 
1680    if (cmd->state.prim_generated_query_running_before_rp) {
1681       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1682    }
1683 }
1684 
1685 /* blit ops - common interface for 2d/shader paths */
1686 
1687 struct blit_ops {
1688    void (*coords)(struct tu_cmd_buffer *cmd,
1689                   struct tu_cs *cs,
1690                   const VkOffset2D dst,
1691                   const VkOffset2D src,
1692                   const VkExtent2D extent);
1693    void (*clear_value)(struct tu_cmd_buffer *cmd,
1694                        struct tu_cs *cs,
1695                        enum pipe_format format,
1696                        const VkClearValue *val);
1697    void (*src)(
1698         struct tu_cmd_buffer *cmd,
1699         struct tu_cs *cs,
1700         const struct fdl6_view *iview,
1701         uint32_t layer,
1702         VkFilter filter,
1703         enum pipe_format dst_format);
1704    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1705                       enum pipe_format format,
1706                       uint64_t va, uint32_t pitch,
1707                       uint32_t width, uint32_t height,
1708                       enum pipe_format dst_format);
1709    void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1710                enum pipe_format src_format);
1711    void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1712    void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1713    void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1714                       enum pipe_format src_format);
1715    void (*setup)(struct tu_cmd_buffer *cmd,
1716                  struct tu_cs *cs,
1717                  enum pipe_format src_format,
1718                  enum pipe_format dst_format,
1719                  VkImageAspectFlags aspect_mask,
1720                  unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1721                  bool clear,
1722                  bool ubwc,
1723                  VkSampleCountFlagBits samples);
1724    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1725    void (*teardown)(struct tu_cmd_buffer *cmd,
1726                     struct tu_cs *cs);
1727 };
1728 
1729 template <chip CHIP>
1730 static const struct blit_ops r2d_ops = {
1731    .coords = r2d_coords,
1732    .clear_value = r2d_clear_value,
1733    .src = r2d_src<CHIP>,
1734    .src_buffer = r2d_src_buffer<CHIP>,
1735    .dst = r2d_dst<CHIP>,
1736    .dst_depth = r2d_dst_depth,
1737    .dst_stencil = r2d_dst_stencil,
1738    .dst_buffer = r2d_dst_buffer,
1739    .setup = r2d_setup<CHIP>,
1740    .run = r2d_run,
1741    .teardown = r2d_teardown,
1742 };
1743 
1744 template <chip CHIP>
1745 static const struct blit_ops r3d_ops = {
1746    .coords = r3d_coords,
1747    .clear_value = r3d_clear_value,
1748    .src = r3d_src,
1749    .src_buffer = r3d_src_buffer<CHIP>,
1750    .dst = r3d_dst<CHIP>,
1751    .dst_depth = r3d_dst_depth<CHIP>,
1752    .dst_stencil = r3d_dst_stencil<CHIP>,
1753    .dst_buffer = r3d_dst_buffer<CHIP>,
1754    .setup = r3d_setup<CHIP>,
1755    .run = r3d_run,
1756    .teardown = r3d_teardown<CHIP>,
1757 };
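/* Minimal usage sketch for the blit_ops interface above, following how
 * tu6_clear_lrz() and tu6_blit_image() below drive it (illustrative only):
 *
 *    const struct blit_ops *ops = &r2d_ops<CHIP>;
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param,
 *               clear, ubwc, samples);
 *    ops->clear_value(...) or ops->src()/ops->src_buffer();  // source setup
 *    ops->dst()/ops->dst_buffer();                           // per layer
 *    ops->coords(cmd, cs, dst_offset, src_offset, extent);
 *    ops->run(cmd, cs);               // repeat dst/coords/run per layer
 *    ops->teardown(cmd, cs);
 */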
1758 
1759 /* passthrough: set 2D coords from 3D offsets/extent */
1760 static void
1761 coords(const struct blit_ops *ops,
1762        struct tu_cmd_buffer *cmd,
1763        struct tu_cs *cs,
1764        const VkOffset3D dst,
1765        const VkOffset3D src,
1766        const VkExtent3D extent)
1767 {
1768    ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1769                (VkExtent2D) {extent.width, extent.height});
1770 }
1771 
1772 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1773  * to be a bit careful because we have to pick a format with matching UBWC
1774  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1775  * everything (see the worked examples after this function).
1776  */
1777 static enum pipe_format
1778 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1779 {
1780    if (vk_format_is_compressed(vk_format)) {
1781       switch (vk_format_get_blocksize(vk_format)) {
1782       case 1: return PIPE_FORMAT_R8_UINT;
1783       case 2: return PIPE_FORMAT_R16_UINT;
1784       case 4: return PIPE_FORMAT_R32_UINT;
1785       case 8: return PIPE_FORMAT_R32G32_UINT;
1786       case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
1787       default:
1788          unreachable("unhandled format size");
1789       }
1790    }
1791 
1792    enum pipe_format format = vk_format_to_pipe_format(vk_format);
1793 
1794    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1795     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1796     * (also -1.0), when we're supposed to be memcpying the bits. See
1797     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1798     */
1799    format = util_format_snorm_to_unorm(format);
1800 
1801    if (vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1802       return PIPE_FORMAT_R32_UINT;
1803 
1804    /* For VK_FORMAT_D32_SFLOAT_S8_UINT and YCbCr formats use our existing helpers */
1805    if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1806        vk_format_get_ycbcr_info(vk_format))
1807       return tu_aspects_to_plane(vk_format, aspect_mask);
1808 
1809    /* Otherwise, simply return the pipe_format */
1810    return format;
1811 }
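/* Examples of the mapping above (illustrative): a BC1 copy has 8-byte blocks,
 * so it is treated as PIPE_FORMAT_R32G32_UINT; R8G8B8A8_SNORM is copied as the
 * equivalent UNORM format so the 0x80 bit pattern survives; E5B9G9R9 becomes
 * R32_UINT; and the stencil aspect of D32_SFLOAT_S8_UINT resolves to its own
 * plane format via tu_aspects_to_plane().
 */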
1812 
1813 static void
1814 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1815 {
1816    switch (format) {
1817    case PIPE_FORMAT_Z24X8_UNORM:
1818    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1819       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1820                        val->depthStencil.stencil << 24;
1821       return;
1822    case PIPE_FORMAT_Z16_UNORM:
1823       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1824       return;
1825    case PIPE_FORMAT_Z32_FLOAT:
1826       clear_value[0] = fui(val->depthStencil.depth);
1827       return;
1828    case PIPE_FORMAT_S8_UINT:
1829       clear_value[0] = val->depthStencil.stencil;
1830       return;
1831    default:
1832       break;
1833    }
1834 
1835    float tmp[4];
1836    memcpy(tmp, val->color.float32, 4 * sizeof(float));
1837    if (util_format_is_srgb(format)) {
1838       for (int i = 0; i < 3; i++)
1839          tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1840    }
1841 
1842 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1843    ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1844    switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1845    case 4:
1846       PACK_F(r4g4b4a4_unorm);
1847       break;
1848    case 5:
1849       if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1850          PACK_F(r5g6b5_unorm);
1851       else
1852          PACK_F(r5g5b5a1_unorm);
1853       break;
1854    case 8:
1855       if (util_format_is_snorm(format))
1856          PACK_F(r8g8b8a8_snorm);
1857       else if (util_format_is_unorm(format))
1858          PACK_F(r8g8b8a8_unorm);
1859       else
1860          pack_int8(clear_value, val->color.uint32);
1861       break;
1862    case 10:
1863       if (util_format_is_pure_integer(format))
1864          pack_int10_2(clear_value, val->color.uint32);
1865       else
1866          PACK_F(r10g10b10a2_unorm);
1867       break;
1868    case 11:
1869       clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1870       break;
1871    case 16:
1872       if (util_format_is_snorm(format))
1873          PACK_F(r16g16b16a16_snorm);
1874       else if (util_format_is_unorm(format))
1875          PACK_F(r16g16b16a16_unorm);
1876       else if (util_format_is_float(format))
1877          PACK_F(r16g16b16a16_float);
1878       else
1879          pack_int16(clear_value, val->color.uint32);
1880       break;
1881    case 32:
1882       memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1883       break;
1884    case 0:
1885       assert(format == PIPE_FORMAT_A8_UNORM);
1886       PACK_F(a8_unorm);
1887       break;
1888    default:
1889       unreachable("unexpected channel size");
1890    }
1891 #undef PACK_F
1892 }
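/* Worked example for the packing above (illustrative): clearing a
 * Z24_UNORM_S8_UINT attachment to depth 1.0 / stencil 0xff gives
 * tu_pack_float32_for_unorm(1.0, 24) = 0xffffff, OR'd with 0xff << 24,
 * i.e. clear_value[0] = 0xffffffff.  A Z16_UNORM clear to depth 1.0 packs
 * to 0xffff.
 */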
1893 
1894 static void
1895 event_blit_setup(struct tu_cs *cs,
1896                  uint32_t buffer_id,
1897                  const struct tu_render_pass_attachment *att,
1898                  enum a6xx_blit_event_type blit_event_type,
1899                  uint32_t clear_mask)
1900 {
1901    tu_cs_emit_regs(
1902       cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1903 
1904    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1905    tu_cs_emit(cs, 0);
1906 
1907    tu_cs_emit_regs(
1908       cs,
1909       A6XX_RB_BLIT_INFO(.type = blit_event_type,
1910                         .sample_0 =
1911                            vk_format_is_int(att->format) ||
1912                            vk_format_is_depth_or_stencil(att->format),
1913                         .depth = vk_format_is_depth_or_stencil(att->format),
1914                         .clear_mask = clear_mask,
1915                         .buffer_id = buffer_id));
1916 }
1917 
1918 struct event_blit_dst_view {
1919    const struct tu_image *image;
1920    const struct fdl6_view *view;
1921 
1922    uint32_t layer;
1923 
1924    uint64_t depth_addr;
1925    uint32_t depth_pitch;
1926 
1927    uint64_t stencil_addr;
1928    uint32_t stencil_pitch;
1929 };
1930 
1931 static event_blit_dst_view
1932 blt_view_from_tu_view(const struct tu_image_view *iview,
1933                       uint32_t layer)
1934 {
1935    struct event_blit_dst_view blt_view;
1936    blt_view.image = iview->image;
1937    blt_view.view = &iview->view;
1938    blt_view.layer = layer;
1939 
1940    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1941       blt_view.depth_addr =
1942          iview->depth_base_addr + iview->depth_layer_size * layer;
1943       blt_view.depth_pitch = iview->depth_pitch;
1944 
1945       blt_view.stencil_addr =
1946          iview->stencil_base_addr + iview->stencil_layer_size * layer;
1947       blt_view.stencil_pitch = iview->stencil_pitch;
1948    }
1949    return blt_view;
1950 }
1951 
1952 template <chip CHIP>
1953 static void
1954 event_blit_run(struct tu_cmd_buffer *cmd,
1955                struct tu_cs *cs,
1956                const struct tu_render_pass_attachment *att,
1957                const event_blit_dst_view *blt_view,
1958                bool separate_stencil)
1959 {
1960    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1961    if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1962       if (!separate_stencil) {
1963          tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1964          tu_cs_emit_qw(cs, blt_view->depth_addr);
1965          tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1966 
1967          tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1968          tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1969       } else {
1970          tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1971                            ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1972          tu_cs_emit_qw(cs, blt_view->stencil_addr);
1973          tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1974       }
1975    } else {
1976       tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1977       tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1978 
1979       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1980       tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1981    }
1982 
1983    if (att) {
1984       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1985          tu_cs_emit_regs(
1986             cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
1987                    cmd, att, blt_view->layer)));
1988       } else {
1989          tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
1990                                 cmd, att, blt_view->layer)));
1991       }
1992    }
1993 
1994    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
1995 }
1996 
1997 static void
1998 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
1999                         struct tu_cs *cs,
2000                         uint32_t buffer_id,
2001                         enum pipe_format format,
2002                         uint8_t clear_mask,
2003                         bool separate_stencil,
2004                         uint32_t layer,
2005                         const VkClearValue *value,
2006                         uint32_t a)
2007 {
2008    const struct tu_render_pass_attachment *att =
2009       &cmd->state.pass->attachments[a];
2010    const struct tu_image_view *iview = cmd->state.attachments[a];
2011 
2012    uint32_t clear_vals[4] = {};
2013    pack_blit_event_clear_value(value, format, clear_vals);
2014 
2015    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2016    tu_cs_emit_array(cs, clear_vals, 4);
2017 
2018    event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
2019 
2020    event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
2021    event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
2022 }
2023 
2024 
2025 
2026 /* Buffer copies/fills/updates go through the CCU but need additional
2027  * synchronization when the write range is not aligned to 64 bytes, because
2028  * dst buffer accesses use either R8_UNORM or R32_UINT and those are not
2029  * coherent with each other in the CCU, since the format seems to be part of
2030  * the cache key.
2031  *
2032  * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
2033  *
2034  * Synchronization with writes from UCHE (e.g. SSBO stores) is solved by the
2035  * fact that UCHE has byte-level dirtiness tracking and that a CCU flush
2036  * always happens before a UCHE flush in such cases (e.g. both renderpass
2037  * and dispatch flush pending CCU writes).
2038  *
2039  * Additionally see:
2040  * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2041  */
2042 template <chip CHIP>
2043 static void
2044 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2045                               uint64_t dst_va,
2046                               uint64_t size,
2047                               bool *unaligned_store)
2048 {
2049    if (*unaligned_store)
2050       return;
2051 
2052    if ((dst_va & 63) || (size & 63)) {
2053       tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2054                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2055       /* Wait for invalidations to land. */
2056       cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2057       tu_emit_cache_flush<CHIP>(cmd);
2058       *unaligned_store = true;
2059    }
2060 }
2061 
2062 template <chip CHIP>
2063 static void
2064 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2065                                     bool unaligned_store)
2066 {
2067    if (unaligned_store) {
2068       tu_flush_for_access(&cmd->state.cache,
2069                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2070                           TU_ACCESS_NONE);
2071    }
2072 }
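/* Usage sketch for the two helpers above (mirrors tu_CmdCopyImageToBuffer2
 * below): start with `bool unaligned_store = false;`, call
 * handle_buffer_unaligned_store() before each region that writes the buffer
 * (it only emits the flush once thanks to the early return), and call
 * after_buffer_unaligned_buffer_store() a single time after all regions.
 */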
2073 
2074 template <chip CHIP>
2075 void
2076 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2077               struct tu_cs *cs,
2078               struct tu_image *image,
2079               const VkClearValue *value)
2080 {
2081    const struct blit_ops *ops = &r2d_ops<CHIP>;
2082 
2083    /* It is assumed that the LRZ cache has been invalidated at this point so
2084     * that the writes here become visible to LRZ.
2085     *
2086     * LRZ writes go through the UCHE cache, so flush UCHE before changing LRZ
2087     * via the CCU. There is no need to invalidate the CCU since we are
2088     * presumably writing whole cache lines, which we assume to be 64 bytes.
2089     */
2090    tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2091 
2092    ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2093               VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2094               VK_SAMPLE_COUNT_1_BIT);
2095    ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2096    ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2097                    image->iova + image->lrz_offset,
2098                    image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2099    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2100                (VkExtent2D) { image->lrz_pitch, image->lrz_height });
2101    ops->run(cmd, cs);
2102    ops->teardown(cmd, cs);
2103 
2104    /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2105     * UCHE in the earlier GRAS stage.
2106     */
2107    cmd->state.cache.flush_bits |=
2108       TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2109       TU_CMD_FLAG_WAIT_FOR_IDLE;
2110 }
2111 TU_GENX(tu6_clear_lrz);
2112 
2113 template <chip CHIP>
2114 void
2115 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2116                  struct tu_cs *cs,
2117                  struct tu_image *image)
2118 {
2119    const struct blit_ops *ops = &r2d_ops<CHIP>;
2120    VkClearValue clear = {};
2121    clear.color.uint32[0] = 0xffffffff;
2122 
2123    using LRZFC = fd_lrzfc_layout<CHIP>;
2124    uint64_t lrz_fc_iova = image->iova + image->lrz_fc_offset;
2125    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2126               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2127               VK_SAMPLE_COUNT_1_BIT);
2128    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2129    ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2130                    lrz_fc_iova + offsetof(LRZFC, fc1),
2131                    sizeof(LRZFC::fc1),
2132                    PIPE_FORMAT_R32_UINT);
2133    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2134       sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2135    });
2136    ops->run(cmd, cs);
2137    if constexpr (LRZFC::HAS_BIDIR) {
2138       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2139                       lrz_fc_iova + offsetof(LRZFC, fc2),
2140                       sizeof(LRZFC::fc2),
2141                       PIPE_FORMAT_R32_UINT);
2142       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2143          sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2144       });
2145       ops->run(cmd, cs);
2146    }
2147    ops->teardown(cmd, cs);
2148 }
2149 TU_GENX(tu6_dirty_lrz_fc);
2150 
2151 template<chip CHIP>
2152 static void
2153 tu_image_view_copy_blit(struct fdl6_view *iview,
2154                         struct tu_image *image,
2155                         enum pipe_format format,
2156                         const VkImageSubresourceLayers *subres,
2157                         uint32_t layer,
2158                         bool z_scale)
2159 {
2160    VkImageAspectFlags aspect_mask = subres->aspectMask;
2161 
2162    /* always use the AS_R8G8B8A8 format for these */
2163    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2164        format == PIPE_FORMAT_Z24X8_UNORM) {
2165       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2166    }
2167 
2168    const struct fdl_layout *layout =
2169       &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2170 
2171    const struct fdl_view_args args = {
2172       .chip = CHIP,
2173       .iova = image->iova,
2174       .base_miplevel = subres->mipLevel,
2175       .level_count = 1,
2176       .base_array_layer = subres->baseArrayLayer + layer,
2177       .layer_count = 1,
2178       .swiz = {
2179          PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2180       },
2181       .format = tu_format_for_aspect(format, aspect_mask),
2182       .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2183    };
2184    fdl6_view_init(iview, &layout, &args, false);
2185 }
2186 
2187 template<chip CHIP>
2188 static void
2189 tu_image_view_copy(struct fdl6_view *iview,
2190                    struct tu_image *image,
2191                    enum pipe_format format,
2192                    const VkImageSubresourceLayers *subres,
2193                    uint32_t layer)
2194 {
2195    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2196 }
2197 
2198 template<chip CHIP>
2199 static void
2200 tu_image_view_blit(struct fdl6_view *iview,
2201                    struct tu_image *image,
2202                    const VkImageSubresourceLayers *subres,
2203                    uint32_t layer)
2204 {
2205    enum pipe_format format = tu_aspects_to_plane(image->vk.format, subres->aspectMask);
2206    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2207 }
2208 
2209 template <chip CHIP>
2210 static void
2211 tu6_blit_image(struct tu_cmd_buffer *cmd,
2212                struct tu_image *src_image,
2213                struct tu_image *dst_image,
2214                const VkImageBlit2 *info,
2215                VkFilter filter)
2216 {
2217    const struct blit_ops *ops = &r2d_ops<CHIP>;
2218    struct tu_cs *cs = &cmd->cs;
2219    bool z_scale = false;
2220    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2221 
2222    /* The 2D blit can't mirror from coordinates alone, so encode mirroring as a rotation/flip (see the worked example after this function) */
2223    static const enum a6xx_rotation rotate[2][2] = {
2224       {ROTATE_0, ROTATE_HFLIP},
2225       {ROTATE_VFLIP, ROTATE_180},
2226    };
2227 
2228    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2229                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
2230    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2231                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
2232 
2233    int32_t src0_z = info->srcOffsets[0].z;
2234    int32_t src1_z = info->srcOffsets[1].z;
2235 
2236    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2237         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2238        info->srcOffsets[1].z < info->srcOffsets[0].z) {
2239       z_scale = true;
2240    }
2241 
2242    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2243       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2244       src0_z = info->srcOffsets[1].z;
2245       src1_z = info->srcOffsets[0].z;
2246    }
2247 
2248    if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2249       assert(layers <= 1);
2250       layers = vk_image_subresource_layer_count(&dst_image->vk,
2251                                                 &info->dstSubresource);
2252    }
2253 
2254    /* BC1_RGB_* formats need to have their last component overridden with 1
2255     * when sampling, which is normally handled with the texture descriptor
2256     * swizzle. The 2d path can't handle that, so use the 3d path.
2257     *
2258     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2259     * the 2d path.
2260     */
2261 
2262    unsigned blit_param = rotate[mirror_y][mirror_x];
2263    if (dst_image->layout[0].nr_samples > 1 ||
2264        src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2265        src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2266        filter == VK_FILTER_CUBIC_EXT ||
2267        z_scale) {
2268       ops = &r3d_ops<CHIP>;
2269       blit_param = z_scale ? R3D_Z_SCALE : 0;
2270    }
2271 
2272    /* use the right format in setup() for D32_S8 */
2273    enum pipe_format src_format = tu_aspects_to_plane(
2274       src_image->vk.format, info->srcSubresource.aspectMask);
2275    enum pipe_format dst_format = tu_aspects_to_plane(
2276       dst_image->vk.format, info->dstSubresource.aspectMask);
2277    trace_start_blit(&cmd->trace, cs,
2278                   ops == &r3d_ops<CHIP>,
2279                   src_image->vk.format,
2280                   dst_image->vk.format,
2281                   layers);
2282 
2283    ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2284               blit_param, false, dst_image->layout[0].ubwc,
2285               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2286 
2287    if (ops == &r3d_ops<CHIP>) {
2288       const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2289                                info->srcOffsets[0].x, info->srcOffsets[0].y,
2290                                info->dstOffsets[1].x, info->dstOffsets[1].y,
2291                                info->srcOffsets[1].x, info->srcOffsets[1].y };
2292       r3d_coords_raw(cmd, cs, coords);
2293    } else {
2294       tu_cs_emit_regs(cs,
2295          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2296                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2297          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2298                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2299       tu_cs_emit_regs(cs,
2300          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2301          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2302          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2303          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2304    }
2305 
2306    struct fdl6_view dst, src;
2307    tu_image_view_blit<CHIP>(
2308       &dst, dst_image, &info->dstSubresource,
2309       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2310 
2311    if (z_scale) {
2312       tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2313                                     &info->srcSubresource, 0, true);
2314       ops->src(cmd, cs, &src, 0, filter, dst_format);
2315    } else {
2316       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2317    }
2318 
2319    for (uint32_t i = 0; i < layers; i++) {
2320       if (z_scale) {
2321          float t = ((float) i + 0.5f) / (float) layers;
2322          r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2323       } else {
2324          ops->src(cmd, cs, &src, i, filter, dst_format);
2325       }
2326       ops->dst(cs, &dst, i, src_format);
2327       ops->run(cmd, cs);
2328    }
2329 
2330    ops->teardown(cmd, cs);
2331 
2332    trace_end_blit(&cmd->trace, cs);
2333 }
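/* Worked example for the mirror handling in tu6_blit_image() above
 * (illustrative): with srcOffsets = {(100,0,0), (0,50,1)} and
 * dstOffsets = {(0,0,0), (100,50,1)}, the source x range is decreasing while
 * the dst x range is increasing, so mirror_x = true and mirror_y = false,
 * and the 2D path gets rotate[0][1] = ROTATE_HFLIP as its blit_param.
 */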
2334 
2335 template <chip CHIP>
2336 VKAPI_ATTR void VKAPI_CALL
2337 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2338                  const VkBlitImageInfo2 *pBlitImageInfo)
2339 
2340 {
2341    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2342    VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2343    VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2344 
2345    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2346       /* can't blit both depth and stencil at once with D32_S8
2347        * TODO: more advanced 3D blit path to support it instead?
2348        */
2349       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2350           dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2351          VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2352          u_foreach_bit(b, region.dstSubresource.aspectMask) {
2353             region.srcSubresource.aspectMask = BIT(b);
2354             region.dstSubresource.aspectMask = BIT(b);
2355             tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2356          }
2357          continue;
2358       }
2359       tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2360                      pBlitImageInfo->filter);
2361    }
2362 
2363    if (dst_image->lrz_height) {
2364       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2365    }
2366 }
2367 TU_GENX(tu_CmdBlitImage2);
2368 
2369 static void
2370 copy_compressed(VkFormat format,
2371                 VkOffset3D *offset,
2372                 VkExtent3D *extent,
2373                 uint32_t *width,
2374                 uint32_t *height)
2375 {
2376    if (!vk_format_is_compressed(format))
2377       return;
2378 
2379    uint32_t block_width = vk_format_get_blockwidth(format);
2380    uint32_t block_height = vk_format_get_blockheight(format);
2381 
2382    offset->x /= block_width;
2383    offset->y /= block_height;
2384 
2385    if (extent) {
2386       extent->width = DIV_ROUND_UP(extent->width, block_width);
2387       extent->height = DIV_ROUND_UP(extent->height, block_height);
2388    }
2389    if (width)
2390       *width = DIV_ROUND_UP(*width, block_width);
2391    if (height)
2392       *height = DIV_ROUND_UP(*height, block_height);
2393 }
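/* Worked example (illustrative): for a BC1 region (4x4 blocks) with
 * imageOffset (8,4), imageExtent 13x9 and a row length of 20 texels, the
 * values become offset (2,1), extent DIV_ROUND_UP(13,4) x DIV_ROUND_UP(9,4)
 * = 4x3 blocks, and width 20/4 = 5 blocks.
 */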
2394 
2395 template <chip CHIP>
2396 static void
2397 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2398                         struct tu_buffer *src_buffer,
2399                         struct tu_image *dst_image,
2400                         const VkBufferImageCopy2 *info)
2401 {
2402    struct tu_cs *cs = &cmd->cs;
2403    uint32_t layers = MAX2(info->imageExtent.depth,
2404                           vk_image_subresource_layer_count(&dst_image->vk,
2405                                                            &info->imageSubresource));
2406    enum pipe_format src_format =
2407       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2408    enum pipe_format dst_format =
2409       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2410    const struct blit_ops *ops = &r2d_ops<CHIP>;
2411 
2412    /* special case for buffer to stencil */
2413    if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2414        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2415       src_format = PIPE_FORMAT_S8_UINT;
2416    }
2417 
2418    /* note: could use "R8_UNORM" when no UBWC */
2419    bool has_unaligned = CHIP >= A7XX; /* If unaligned buffer copies are supported. */
2420    unsigned blit_param = 0;
2421    if (src_format == PIPE_FORMAT_Y8_UNORM ||
2422        tu_pipe_format_is_float16(src_format)) {
2423       ops = &r3d_ops<CHIP>;
2424       blit_param = R3D_COPY;
2425       has_unaligned = false;
2426    }
2427 
2428    VkOffset3D offset = info->imageOffset;
2429    VkExtent3D extent = info->imageExtent;
2430    uint32_t src_width = info->bufferRowLength ?: extent.width;
2431    uint32_t src_height = info->bufferImageHeight ?: extent.height;
2432 
2433    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2434 
2435    uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2436    uint32_t layer_size = src_height * pitch;
2437 
2438    ops->setup(cmd, cs, src_format, dst_format,
2439               info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2440               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2441 
2442    struct fdl6_view dst;
2443    tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2444                             &info->imageSubresource, offset.z);
2445 
2446    for (uint32_t i = 0; i < layers; i++) {
2447       ops->dst(cs, &dst, i, src_format);
2448 
2449       uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2450       bool unaligned = (src_va & 63) || (pitch & 63);
2451       if (!has_unaligned && unaligned) {
2452          for (uint32_t y = 0; y < extent.height; y++) {
2453             uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2454             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2455                             x + extent.width, 1, dst_format);
2456             ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y},  (VkOffset2D) {x},
2457                         (VkExtent2D) {extent.width, 1});
2458             ops->run(cmd, cs);
2459             src_va += pitch;
2460          }
2461       } else {
2462          if constexpr (CHIP >= A7XX) {
2463             /* Necessary to avoid triggering the static assertion in the A6XX variant. */
2464             if (has_unaligned) {
2465                r2d_src_buffer_unaligned<CHIP>(cmd, cs, src_format, src_va,
2466                                               pitch, extent.width,
2467                                               extent.height, dst_format);
2468             } else {
2469                ops->src_buffer(cmd, cs, src_format, src_va, pitch,
2470                                extent.width, extent.height, dst_format);
2471             }
2472          } else {
2473             ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width,
2474                             extent.height, dst_format);
2475          }
2476          coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2477          ops->run(cmd, cs);
2478       }
2479    }
2480 
2481    ops->teardown(cmd, cs);
2482 }
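/* Illustrative numbers for the unaligned path in tu_copy_buffer_to_image()
 * above: with a 4-byte format and a source address that is 36 bytes past a
 * 64-byte boundary, each row is blitted separately from the aligned address
 * (src_va & ~63) with a source x offset of 36 / 4 = 9 texels, and src_va
 * advances by one row pitch per iteration.
 */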
2483 
2484 template <chip CHIP>
2485 VKAPI_ATTR void VKAPI_CALL
2486 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2487                          const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2488 {
2489    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2490    VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2491    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2492 
2493    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2494       tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2495                               pCopyBufferToImageInfo->pRegions + i);
2496 
2497    if (dst_image->lrz_height) {
2498       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2499    }
2500 }
2501 TU_GENX(tu_CmdCopyBufferToImage2);
2502 
2503 static void
2504 tu_copy_memory_to_image(struct tu_device *device,
2505                         struct tu_image *dst_image,
2506                         const VkMemoryToImageCopyEXT *info,
2507                         bool copy_memcpy)
2508 {
2509    unsigned plane = tu6_plane_index(dst_image->vk.format,
2510                                     info->imageSubresource.aspectMask);
2511    const struct fdl_layout *layout = &dst_image->layout[plane];
2512 
2513    VkOffset3D offset = info->imageOffset;
2514    VkExtent3D extent = info->imageExtent;
2515    uint32_t src_width = info->memoryRowLength ?: extent.width;
2516    uint32_t src_height = info->memoryImageHeight ?: extent.height;
2517 
2518    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2519 
2520    uint32_t src_pitch = src_width * layout->cpp;
2521 
2522    unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2523       offset.z : info->imageSubresource.baseArrayLayer;
2524    uint32_t layers = MAX2(extent.depth,
2525                           vk_image_subresource_layer_count(&dst_image->vk,
2526                                                            &info->imageSubresource));
2527 
2528    uint32_t image_offset =
2529       fdl_surface_offset(layout,
2530                          info->imageSubresource.mipLevel,
2531                          start_layer);
2532 
2533    uint32_t dst_layer_stride =
2534       fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2535    uint32_t dst_layer_size =
2536       layout->slices[info->imageSubresource.mipLevel].size0;
2537    uint32_t src_layer_stride =
2538       copy_memcpy ? dst_layer_size :
2539       (src_width * src_height * layout->cpp);
2540    bool tiled =
2541       fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2542 
2543    const char *src = (const char *) info->pHostPointer;
2544    char *dst = (char *) dst_image->map + image_offset;
2545    for (unsigned layer = 0; layer < layers; layer++,
2546         src += src_layer_stride, dst += dst_layer_stride) {
2547       if (copy_memcpy) {
2548          memcpy(dst, src, src_layer_stride);
2549       } else if (!tiled) {
2550          uint32_t dst_pitch = fdl_pitch(layout,
2551                                         info->imageSubresource.mipLevel);
2552          for (unsigned y = 0; y < extent.height; y++) {
2553             memcpy(dst + dst_pitch * (y + offset.y) + offset.x * layout->cpp,
2554                    src + src_pitch * y,
2555                    extent.width * layout->cpp);
2556          }
2557       } else {
2558          fdl6_memcpy_linear_to_tiled(offset.x, offset.y,
2559                                      extent.width, extent.height,
2560                                      dst, src, layout,
2561                                      info->imageSubresource.mipLevel,
2562                                      src_pitch,
2563                                      &device->physical_device->ubwc_config);
2564       }
2565 
2566       if (dst_image->bo->cached_non_coherent) {
2567          tu_bo_sync_cache(device, dst_image->bo,
2568                           dst_image->bo_offset + image_offset,
2569                           dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
2570       }
2571    }
2572 }
2573 
2574 VKAPI_ATTR VkResult VKAPI_CALL
2575 tu_CopyMemoryToImageEXT(VkDevice _device,
2576                         const VkCopyMemoryToImageInfoEXT *info)
2577 {
2578    VK_FROM_HANDLE(tu_device, device, _device);
2579    VK_FROM_HANDLE(tu_image, dst_image, info->dstImage);
2580 
2581    for (unsigned i = 0; i < info->regionCount; i++) {
2582       tu_copy_memory_to_image(device, dst_image, &info->pRegions[i],
2583                               info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2584    }
2585 
2586    if (dst_image->lrz_height) {
2587       TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
2588    }
2589 
2590    return VK_SUCCESS;
2591 }
2592 
2593 template <chip CHIP>
2594 static void
2595 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2596                         struct tu_image *src_image,
2597                         struct tu_buffer *dst_buffer,
2598                         const VkBufferImageCopy2 *info,
2599                         bool *unaligned_store)
2600 {
2601    struct tu_cs *cs = &cmd->cs;
2602    uint32_t layers = MAX2(info->imageExtent.depth,
2603                           vk_image_subresource_layer_count(&src_image->vk,
2604                                                            &info->imageSubresource));
2605    enum pipe_format dst_format =
2606       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2607    enum pipe_format src_format =
2608       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2609    const struct blit_ops *ops = &r2d_ops<CHIP>;
2610 
2611    if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2612        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2613       dst_format = PIPE_FORMAT_S8_UINT;
2614    }
2615 
2616    /* note: could use "R8_UNORM" when no UBWC */
2617    unsigned blit_param = 0;
2618    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2619        tu_pipe_format_is_float16(src_format)) {
2620       ops = &r3d_ops<CHIP>;
2621       blit_param = R3D_COPY;
2622    }
2623 
2624    VkOffset3D offset = info->imageOffset;
2625    VkExtent3D extent = info->imageExtent;
2626    uint32_t dst_width = info->bufferRowLength ?: extent.width;
2627    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2628 
2629    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2630 
2631    uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2632    uint32_t layer_size = pitch * dst_height;
2633 
2634    handle_buffer_unaligned_store<CHIP>(cmd,
2635                                        dst_buffer->iova + info->bufferOffset,
2636                                        layer_size * layers, unaligned_store);
2637 
2638    ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2639               VK_SAMPLE_COUNT_1_BIT);
2640 
2641    struct fdl6_view src;
2642    tu_image_view_copy<CHIP>(&src, src_image, src_format,
2643                             &info->imageSubresource, offset.z);
2644 
2645    for (uint32_t i = 0; i < layers; i++) {
2646       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2647 
2648       uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
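      /* The 2D blitter needs a 64-byte-aligned destination base and pitch. If
       * either is unaligned, blit one row at a time: point the blit at the
       * aligned-down address and fold the remaining bytes into the x offset.
       */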
2649       if ((dst_va & 63) || (pitch & 63)) {
2650          for (uint32_t y = 0; y < extent.height; y++) {
2651             uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2652             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2653             ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2654                         (VkExtent2D) {extent.width, 1});
2655             ops->run(cmd, cs);
2656             dst_va += pitch;
2657          }
2658       } else {
2659          ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2660          coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2661          ops->run(cmd, cs);
2662       }
2663    }
2664 
2665    ops->teardown(cmd, cs);
2666 }
2667 
2668 template <chip CHIP>
2669 VKAPI_ATTR void VKAPI_CALL
2670 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2671                          const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2672 {
2673    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2674    VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2675    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2676 
2677    bool unaligned_store = false;
2678    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2679       tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2680                               pCopyImageToBufferInfo->pRegions + i,
2681                               &unaligned_store);
2682 
2683    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2684 }
2685 TU_GENX(tu_CmdCopyImageToBuffer2);
2686 
2687 static void
2688 tu_copy_image_to_memory(struct tu_device *device,
2689                         struct tu_image *src_image,
2690                         const VkImageToMemoryCopyEXT *info,
2691                         bool copy_memcpy)
2692 {
2693    unsigned plane = tu6_plane_index(src_image->vk.format,
2694                                     info->imageSubresource.aspectMask);
2695    const struct fdl_layout *layout = &src_image->layout[plane];
2696 
2697    VkOffset3D offset = info->imageOffset;
2698    VkExtent3D extent = info->imageExtent;
2699    uint32_t dst_width = info->memoryRowLength ?: extent.width;
2700    uint32_t dst_height = info->memoryImageHeight ?: extent.height;
2701 
2702    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2703 
2704    uint32_t dst_pitch = dst_width * layout->cpp;
2705 
2706    unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2707       offset.z : info->imageSubresource.baseArrayLayer;
2708    uint32_t layers = MAX2(extent.depth,
2709                           vk_image_subresource_layer_count(&src_image->vk,
2710                                                            &info->imageSubresource));
2711 
2712    uint32_t image_offset =
2713       fdl_surface_offset(layout,
2714                          info->imageSubresource.mipLevel,
2715                          start_layer);
2716 
2717    uint32_t src_layer_stride =
2718       fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2719    uint32_t src_layer_size =
2720       layout->slices[info->imageSubresource.mipLevel].size0;
2721    uint32_t dst_layer_stride =
2722       copy_memcpy ? src_layer_size : (dst_width * dst_height * layout->cpp);
2723    bool tiled =
2724       fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2725 
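   /* Per layer: sync CPU caches for non-coherent BOs, then either copy the
    * layer verbatim (VK_HOST_IMAGE_COPY_MEMCPY_EXT), copy row by row when the
    * level is linear, or de-tile it with fdl6_memcpy_tiled_to_linear().
    */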
2726    const char *src = (const char *) src_image->map + image_offset;
2727    char *dst = (char *) info->pHostPointer;
2728    for (unsigned layer = 0; layer < layers; layer++,
2729         src += src_layer_stride, dst += dst_layer_stride) {
2730       if (src_image->bo->cached_non_coherent) {
2731          tu_bo_sync_cache(device, src_image->bo,
2732                           src_image->bo_offset + image_offset,
2733                           src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
2734       }
2735 
2736       if (copy_memcpy) {
2737          memcpy(dst, src, dst_layer_stride);
2738       } else if (!tiled) {
2739          uint32_t src_pitch = fdl_pitch(layout,
2740                                         info->imageSubresource.mipLevel);
2741          for (unsigned y = 0; y < extent.height; y++) {
2742             memcpy(dst + dst_pitch * y,
2743                    src + src_pitch * (y + offset.y) + offset.x * layout->cpp,
2744                    extent.width * layout->cpp);
2745          }
2746       } else {
2747          fdl6_memcpy_tiled_to_linear(offset.x, offset.y,
2748                                      extent.width, extent.height,
2749                                      dst, src, layout,
2750                                      info->imageSubresource.mipLevel,
2751                                      dst_pitch,
2752                                      &device->physical_device->ubwc_config);
2753       }
2754    }
2755 }
2756 
2757 VKAPI_ATTR VkResult VKAPI_CALL
2758 tu_CopyImageToMemoryEXT(VkDevice _device,
2759                         const VkCopyImageToMemoryInfoEXT *info)
2760 {
2761    VK_FROM_HANDLE(tu_device, device, _device);
2762    VK_FROM_HANDLE(tu_image, image, info->srcImage);
2763 
2764    for (unsigned i = 0; i < info->regionCount; i++) {
2765       tu_copy_image_to_memory(device, image, &info->pRegions[i],
2766                               info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2767    }
2768 
2769    return VK_SUCCESS;
2770 }
2771 
2772 
2773 /* Tiled formats don't support swapping, which means that we can't support
2774  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2775  * formats like B5G5R5A1 have a separate linear-only format when sampling.
2776  * Currently we fake support for tiled swapped formats and use the unswapped
2777  * format instead, but this means that reinterpreting copies to and from
2778  * swapped formats can't be performed correctly unless we can swizzle the
2779  * components by reinterpreting the other image as the "correct" swapped
2780  * format, i.e. only when the other image is linear.
2781  */
2782 
2783 template <chip CHIP>
2784 static bool
2785 is_swapped_format(enum pipe_format format, bool is_mutable)
2786 {
2787    struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, is_mutable, false);
2788    struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, is_mutable, false);
2789    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2790 }
2791 
2792 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2793  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2794  * versa). This should mirror the logic in fdl6_layout.
2795  */
2796 static bool
2797 image_is_r8g8(struct tu_image *image)
2798 {
2799    return image->layout[0].cpp == 2 &&
2800       vk_format_get_nr_components(image->vk.format) == 2;
2801 }
2802 
2803 template <chip CHIP>
2804 static void
2805 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2806                        struct tu_image *src_image,
2807                        struct tu_image *dst_image,
2808                        const VkImageCopy2 *info)
2809 {
2810    const struct blit_ops *ops = &r2d_ops<CHIP>;
2811    struct tu_cs *cs = &cmd->cs;
2812 
2813    if (dst_image->layout[0].nr_samples > 1)
2814       ops = &r3d_ops<CHIP>;
2815 
2816    enum pipe_format format = PIPE_FORMAT_NONE;
2817    VkOffset3D src_offset = info->srcOffset;
2818    VkOffset3D dst_offset = info->dstOffset;
2819    VkExtent3D extent = info->extent;
2820    uint32_t layers_to_copy = MAX2(info->extent.depth,
2821                                   vk_image_subresource_layer_count(&src_image->vk,
2822                                                                    &info->srcSubresource));
2823 
2824    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2825     * Images":
2826     *
2827     *    When copying between compressed and uncompressed formats the extent
2828     *    members represent the texel dimensions of the source image and not
2829     *    the destination. When copying from a compressed image to an
2830     *    uncompressed image the image texel dimensions written to the
2831     *    uncompressed image will be source extent divided by the compressed
2832     *    texel block dimensions. When copying from an uncompressed image to a
2833     *    compressed image the image texel dimensions written to the compressed
2834     *    image will be the source extent multiplied by the compressed texel
2835     *    block dimensions.
2836     *
2837     * This means we only have to adjust the extent if the source image is
2838     * compressed.
2839     */
2840    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2841    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2842 
2843    enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2844    enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2845 
2846    /* note: could use "R8_UNORM" when no UBWC */
2847    unsigned blit_param = 0;
2848    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2849        src_format == PIPE_FORMAT_Y8_UNORM ||
2850        tu_pipe_format_is_float16(src_format) ||
2851        tu_pipe_format_is_float16(dst_format)) {
2852       ops = &r3d_ops<CHIP>;
2853       blit_param = R3D_COPY;
2854    }
2855 
2856    bool use_staging_blit = false;
2857 
2858    if (src_format == dst_format) {
2859       /* Images that share a format can always be copied directly because it's
2860        * the same as a blit.
2861        */
2862       format = src_format;
2863    } else if (!src_image->layout[0].tile_mode) {
2864       /* If an image is linear, we can always safely reinterpret it with the
2865        * other image's format and then do a regular blit.
2866        */
2867       format = dst_format;
2868    } else if (!dst_image->layout[0].tile_mode) {
2869       format = src_format;
2870    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2871       /* We can't currently copy r8g8 images to/from other cpp=2 images,
2872        * due to the different tile layout.
2873        */
2874       use_staging_blit = true;
2875    } else if (is_swapped_format<CHIP>(src_format,
2876                                       src_image->layout[0].is_mutable) ||
2877               is_swapped_format<CHIP>(dst_format,
2878                                       dst_image->layout[0].is_mutable)) {
2879       /* If either format has a non-identity swap, then we can't copy
2880        * to/from it.
2881        */
2882       use_staging_blit = true;
2883    } else if (!src_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2884       format = dst_format;
2885    } else if (!dst_image->layout[0].ubwc || dst_image->layout[0].is_mutable) {
2886       format = src_format;
2887    } else {
2888       /* Both formats use UBWC and so neither can be reinterpreted.
2889        * TODO: We could do an in-place decompression of the dst instead.
2890        */
2891       perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2892       use_staging_blit = true;
2893    }
2894 
2895    struct fdl6_view dst, src;
2896 
2897    if (use_staging_blit) {
2898       tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2899       tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2900 
2901       struct fdl_layout staging_layout = { 0 };
2902       VkOffset3D staging_offset = { 0 };
2903 
2904       staging_layout.tile_mode = TILE6_LINEAR;
2905       staging_layout.ubwc = false;
2906 
2907       uint32_t layer_count =
2908          vk_image_subresource_layer_count(&src_image->vk,
2909                                           &info->srcSubresource);
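      /* The staging image is linear and non-UBWC, so it can safely be viewed
       * in either format; it only needs to cover the copied extent and layers.
       */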
2910       fdl6_layout(&staging_layout,
2911                   &cmd->device->physical_device->dev_info,
2912                   src_format,
2913                   src_image->layout[0].nr_samples,
2914                   extent.width,
2915                   extent.height,
2916                   extent.depth,
2917                   1,
2918                   layer_count,
2919                   extent.depth > 1,
2920                   false,
2921                   NULL);
2922 
2923       struct tu_bo *staging_bo;
2924       VkResult result = tu_get_scratch_bo(cmd->device,
2925                                           staging_layout.size,
2926                                           &staging_bo);
2927       if (result != VK_SUCCESS) {
2928          vk_command_buffer_set_error(&cmd->vk, result);
2929          return;
2930       }
2931 
2932       struct fdl6_view staging;
2933       const struct fdl_layout *staging_layout_ptr = &staging_layout;
2934       const struct fdl_view_args copy_to_args = {
2935          .chip = CHIP,
2936          .iova = staging_bo->iova,
2937          .base_miplevel = 0,
2938          .level_count = 1,
2939          .base_array_layer = 0,
2940          .layer_count = layer_count,
2941          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2942          .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2943          .type = FDL_VIEW_TYPE_2D,
2944       };
2945       fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2946 
2947       ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2948                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2949       coords(ops, cmd, cs, staging_offset, src_offset, extent);
2950 
2951       for (uint32_t i = 0; i < layers_to_copy; i++) {
2952          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2953          ops->dst(cs, &staging, i, src_format);
2954          ops->run(cmd, cs);
2955       }
2956 
2957       /* The app would have to insert a pipeline barrier between these two blits;
2958        * since we chain them internally, flush and invalidate the caches ourselves.
2959        */
2960       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2961       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2962       tu_cs_emit_wfi(cs);
2963 
2964       const struct fdl_view_args copy_from_args = {
2965          .chip = CHIP,
2966          .iova = staging_bo->iova,
2967          .base_miplevel = 0,
2968          .level_count = 1,
2969          .base_array_layer = 0,
2970          .layer_count = layer_count,
2971          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2972          .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2973          .type = FDL_VIEW_TYPE_2D,
2974       };
2975       fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2976 
2977       ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2978                  blit_param, false, dst_image->layout[0].ubwc,
2979                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2980       coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2981 
2982       for (uint32_t i = 0; i < layers_to_copy; i++) {
2983          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2984          ops->dst(cs, &dst, i, dst_format);
2985          ops->run(cmd, cs);
2986       }
2987    } else {
2988       tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2989       tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
2990 
2991       ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2992                  blit_param, false, dst_image->layout[0].ubwc,
2993                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2994       coords(ops, cmd, cs, dst_offset, src_offset, extent);
2995 
2996       for (uint32_t i = 0; i < layers_to_copy; i++) {
2997          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2998          ops->dst(cs, &dst, i, format);
2999          ops->run(cmd, cs);
3000       }
3001    }
3002 
3003    ops->teardown(cmd, cs);
3004 }
3005 
3006 template <chip CHIP>
3007 VKAPI_ATTR void VKAPI_CALL
3008 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
3009                  const VkCopyImageInfo2 *pCopyImageInfo)
3010 {
3011    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3012    VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
3013    VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
3014 
3015    for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
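      /* D32_SFLOAT_S8_UINT is stored as separate depth and S8 planes, so copy
       * each requested aspect with its own blit.
       */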
3016       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3017          VkImageCopy2 info = pCopyImageInfo->pRegions[i];
3018          u_foreach_bit(b, info.dstSubresource.aspectMask) {
3019             info.srcSubresource.aspectMask = BIT(b);
3020             info.dstSubresource.aspectMask = BIT(b);
3021             tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
3022          }
3023          continue;
3024       }
3025 
3026       tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
3027                              pCopyImageInfo->pRegions + i);
3028    }
3029 
3030    if (dst_image->lrz_height) {
3031       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
3032    }
3033 }
3034 TU_GENX(tu_CmdCopyImage2);
3035 
3036 static void
3037 tu_copy_image_to_image_cpu(struct tu_device *device,
3038                            struct tu_image *src_image,
3039                            struct tu_image *dst_image,
3040                            const VkImageCopy2 *info,
3041                            bool copy_memcpy)
3042 {
3043    unsigned src_plane = tu6_plane_index(src_image->vk.format,
3044                                         info->srcSubresource.aspectMask);
3045    unsigned dst_plane = tu6_plane_index(dst_image->vk.format,
3046                                         info->dstSubresource.aspectMask);
3047 
3048    const struct fdl_layout *src_layout = &src_image->layout[src_plane];
3049    const struct fdl_layout *dst_layout = &dst_image->layout[dst_plane];
3050 
3051    VkOffset3D src_offset = info->srcOffset;
3052    VkOffset3D dst_offset = info->dstOffset;
3053    VkExtent3D extent = info->extent;
3054    uint32_t layers_to_copy = MAX2(info->extent.depth,
3055                                   vk_image_subresource_layer_count(&src_image->vk,
3056                                                                    &info->srcSubresource));
3057 
3058    /* See comment above. */
3059    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
3060    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
3061 
3062    unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3063       src_offset.z : info->srcSubresource.baseArrayLayer;
3064    unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3065       dst_offset.z : info->dstSubresource.baseArrayLayer;
3066 
3067    uint32_t src_layer_stride =
3068       fdl_layer_stride(src_layout, info->srcSubresource.mipLevel);
3069    uint32_t src_layer_size =
3070       src_layout->slices[info->srcSubresource.mipLevel].size0;
3071    uint32_t dst_layer_stride =
3072       fdl_layer_stride(dst_layout, info->dstSubresource.mipLevel);
3073    uint32_t dst_layer_size =
3074       dst_layout->slices[info->dstSubresource.mipLevel].size0;
3075 
3076    uint32_t src_image_offset =
3077       fdl_surface_offset(src_layout,
3078                          info->srcSubresource.mipLevel,
3079                          src_start_layer);
3080    uint32_t dst_image_offset =
3081       fdl_surface_offset(dst_layout,
3082                          info->dstSubresource.mipLevel,
3083                          dst_start_layer);
3084 
3085    bool src_tiled =
3086       fdl_tile_mode(src_layout, info->srcSubresource.mipLevel) != 0;
3087    bool dst_tiled =
3088       fdl_tile_mode(dst_layout, info->dstSubresource.mipLevel) != 0;
3089 
3090    const char *src = (const char *) src_image->map + src_image_offset;
3091    char *dst = (char *) dst_image->map + dst_image_offset;
3092    for (unsigned layer = 0; layer < layers_to_copy; layer++,
3093         src += src_layer_stride, dst += dst_layer_stride) {
3094       if (src_image->bo->cached_non_coherent) {
3095          tu_bo_sync_cache(device, src_image->bo,
3096                           src_image->bo_offset + src_image_offset,
3097                           src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
3098       }
3099 
3100       uint32_t src_pitch = fdl_pitch(src_layout,
3101                                      info->srcSubresource.mipLevel);
3102       uint32_t dst_pitch = fdl_pitch(dst_layout,
3103                                      info->dstSubresource.mipLevel);
3104 
3105       if (copy_memcpy) {
3106          assert(src_layer_size == dst_layer_size);
3107          memcpy(dst, src, src_layer_size);
3108       } else if (!src_tiled && !dst_tiled) {
3109          for (unsigned y = 0; y < extent.height; y++) {
3110             memcpy(dst + dst_pitch * (y + dst_offset.y) + dst_offset.x * dst_layout->cpp,
3111                    src + src_pitch * (y + src_offset.y) + src_offset.x * src_layout->cpp,
3112                    extent.width * src_layout->cpp);
3113          }
3114       } else if (!src_tiled) {
3115          fdl6_memcpy_linear_to_tiled(dst_offset.x, dst_offset.y,
3116                                      extent.width, extent.height,
3117                                      dst,
3118                                      src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp,
3119                                      dst_layout,
3120                                      info->dstSubresource.mipLevel,
3121                                      src_pitch,
3122                                      &device->physical_device->ubwc_config);
3123       } else if (!dst_tiled) {
3124          fdl6_memcpy_tiled_to_linear(src_offset.x, src_offset.y,
3125                                      extent.width, extent.height,
3126                                      dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp,
3127                                      src,
3128                                      src_layout,
3129                                      info->srcSubresource.mipLevel,
3130                                      dst_pitch,
3131                                      &device->physical_device->ubwc_config);
3132       } else {
3133          /* Work tile-by-tile, holding the unswizzled tile in a temporary
3134           * buffer.
3135           */
3136          char temp_tile[256];
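         /* 256 bytes should cover one full tile: block_width * block_height *
          * cpp works out to 256 for the block sizes fdl6_get_ubwc_blockwidth()
          * returns.
          */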
3137 
3138          uint32_t block_width, block_height;
3139          fdl6_get_ubwc_blockwidth(src_layout, &block_width, &block_height);
3140 
3141          uint32_t temp_pitch = block_width * src_layout->cpp;
3142 
3143          for (unsigned by = src_offset.y / block_height;
3144               by * block_height < src_offset.y + extent.height; by++) {
3145             uint32_t src_y_start = MAX2(src_offset.y, by * block_height);
3146             uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y;
3147             uint32_t height =
3148                MIN2((by + 1) * block_height, src_offset.y + extent.height) -
3149                src_y_start;
3150             for (unsigned bx = src_offset.x / block_width;
3151                  bx * block_width < src_offset.x + extent.width; bx++) {
3152                uint32_t src_x_start = MAX2(src_offset.x, bx * block_width);
3153                uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x;
3154                uint32_t width =
3155                   MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
3156                   src_x_start;
3157 
3158                fdl6_memcpy_tiled_to_linear(src_x_start, src_y_start,
3159                                            width, height,
3160                                            temp_tile, src, src_layout,
3161                                            info->srcSubresource.mipLevel,
3162                                            temp_pitch,
3163                                            &device->physical_device->ubwc_config);
3164                fdl6_memcpy_linear_to_tiled(dst_x_start, dst_y_start,
3165                                            width, height,
3166                                            dst, temp_tile, dst_layout,
3167                                            info->dstSubresource.mipLevel,
3168                                            temp_pitch,
3169                                            &device->physical_device->ubwc_config);
3170             }
3171          }
3172       }
3173 
3174       if (dst_image->bo->cached_non_coherent) {
3175          tu_bo_sync_cache(device, dst_image->bo,
3176                           dst_image->bo_offset + dst_image_offset,
3177                           dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
3178       }
3179    }
3180 }
3181 
3182 VKAPI_ATTR VkResult VKAPI_CALL
3183 tu_CopyImageToImageEXT(VkDevice _device,
3184                        const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo)
3185 {
3186    VK_FROM_HANDLE(tu_device, device, _device);
3187    VK_FROM_HANDLE(tu_image, src_image, pCopyImageToImageInfo->srcImage);
3188    VK_FROM_HANDLE(tu_image, dst_image, pCopyImageToImageInfo->dstImage);
3189    bool copy_memcpy = pCopyImageToImageInfo->flags &
3190       VK_HOST_IMAGE_COPY_MEMCPY_EXT;
3191 
3192    for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) {
3193       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3194          VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i];
3195          u_foreach_bit(b, info.dstSubresource.aspectMask) {
3196             info.srcSubresource.aspectMask = BIT(b);
3197             info.dstSubresource.aspectMask = BIT(b);
3198             tu_copy_image_to_image_cpu(device, src_image, dst_image, &info,
3199                                        copy_memcpy);
3200          }
3201          continue;
3202       }
3203 
3204       tu_copy_image_to_image_cpu(device, src_image, dst_image,
3205                                  pCopyImageToImageInfo->pRegions + i,
3206                                  copy_memcpy);
3207    }
3208 
3209    if (dst_image->lrz_height) {
3210       TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
3211    }
3212 
3213    return VK_SUCCESS;
3214 }
3215 
3216 VKAPI_ATTR VkResult VKAPI_CALL
3217 tu_TransitionImageLayoutEXT(VkDevice device,
3218                             uint32_t transitionCount,
3219                             const VkHostImageLayoutTransitionInfoEXT *transitions)
3220 {
3221    /* We don't do anything with layouts so this should be a no-op */
3222    return VK_SUCCESS;
3223 }
3224 
3225 template <chip CHIP>
3226 static void
3227 copy_buffer(struct tu_cmd_buffer *cmd,
3228             uint64_t dst_va,
3229             uint64_t src_va,
3230             uint64_t size,
3231             uint32_t block_size,
3232             bool *unaligned_store)
3233 {
3234    const struct blit_ops *ops = &r2d_ops<CHIP>;
3235    struct tu_cs *cs = &cmd->cs;
3236    enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
3237    uint64_t blocks = size / block_size;
3238 
3239    handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
3240 
3241    ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
3242               VK_SAMPLE_COUNT_1_BIT);
3243 
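   /* Copy up to 0x4000 blocks per blit of a single row; the low 6 bits of the
    * source/destination addresses become x offsets since the blit base
    * addresses must be 64-byte aligned.
    */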
3244    while (blocks) {
3245       uint32_t src_x = (src_va & 63) / block_size;
3246       uint32_t dst_x = (dst_va & 63) / block_size;
3247       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
3248 
3249       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
3250       ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
3251       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
3252       ops->run(cmd, cs);
3253 
3254       src_va += width * block_size;
3255       dst_va += width * block_size;
3256       blocks -= width;
3257    }
3258 
3259    ops->teardown(cmd, cs);
3260 }
3261 
3262 template <chip CHIP>
3263 VKAPI_ATTR void VKAPI_CALL
3264 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
3265                   const VkCopyBufferInfo2 *pCopyBufferInfo)
3266 {
3267    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3268    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
3269    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
3270 
3271    bool unaligned_store = false;
3272    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
3273       const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
3274       copy_buffer<CHIP>(cmd,
3275                   dst_buffer->iova + region->dstOffset,
3276                   src_buffer->iova + region->srcOffset,
3277                   region->size, 1, &unaligned_store);
3278    }
3279 
3280    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3281 }
3282 TU_GENX(tu_CmdCopyBuffer2);
3283 
3284 template <chip CHIP>
3285 VKAPI_ATTR void VKAPI_CALL
3286 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
3287                    VkBuffer dstBuffer,
3288                    VkDeviceSize dstOffset,
3289                    VkDeviceSize dataSize,
3290                    const void *pData)
3291 {
3292    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3293    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3294 
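   /* Stage the data in command-stream scratch memory so the GPU reads from a
    * stable copy after vkCmdUpdateBuffer returns, then reuse the buffer-copy
    * path with a 4-byte block size.
    */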
3295    struct tu_cs_memory tmp;
3296    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
3297    if (result != VK_SUCCESS) {
3298       vk_command_buffer_set_error(&cmd->vk, result);
3299       return;
3300    }
3301 
3302    bool unaligned_store = false;
3303    memcpy(tmp.map, pData, dataSize);
3304    copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
3305 
3306    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3307 }
3308 TU_GENX(tu_CmdUpdateBuffer);
3309 
3310 template <chip CHIP>
3311 VKAPI_ATTR void VKAPI_CALL
3312 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
3313                  VkBuffer dstBuffer,
3314                  VkDeviceSize dstOffset,
3315                  VkDeviceSize fillSize,
3316                  uint32_t data)
3317 {
3318    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3319    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3320    const struct blit_ops *ops = &r2d_ops<CHIP>;
3321    struct tu_cs *cs = &cmd->cs;
3322 
3323    fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
3324 
3325    uint64_t dst_va = buffer->iova + dstOffset;
3326    uint32_t blocks = fillSize / 4;
3327 
3328    bool unaligned_store = false;
3329    handle_buffer_unaligned_store<CHIP>(cmd, dst_va, fillSize, &unaligned_store);
3330 
3331    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
3332               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
3333               VK_SAMPLE_COUNT_1_BIT);
3334 
3335    VkClearValue clear_val = {};
3336    clear_val.color.uint32[0] = data;
3337    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
3338 
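   /* As in copy_buffer(): clear up to 0x4000 dwords per blit, folding the
    * unaligned low bits of the address into the x coordinate.
    */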
3339    while (blocks) {
3340       uint32_t dst_x = (dst_va & 63) / 4;
3341       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
3342 
3343       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
3344       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
3345       ops->run(cmd, cs);
3346 
3347       dst_va += width * 4;
3348       blocks -= width;
3349    }
3350 
3351    ops->teardown(cmd, cs);
3352 
3353    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3354 }
3355 TU_GENX(tu_CmdFillBuffer);
3356 
3357 template <chip CHIP>
3358 VKAPI_ATTR void VKAPI_CALL
3359 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
3360                     const VkResolveImageInfo2 *pResolveImageInfo)
3361 {
3362    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3363    VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
3364    VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
3365    const struct blit_ops *ops = &r2d_ops<CHIP>;
3366    struct tu_cs *cs = &cmd->cs;
3367 
3368    enum pipe_format src_format =
3369       vk_format_to_pipe_format(src_image->vk.format);
3370    enum pipe_format dst_format =
3371       vk_format_to_pipe_format(dst_image->vk.format);
3372    ops->setup(cmd, cs, src_format, dst_format,
3373               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
3374               VK_SAMPLE_COUNT_1_BIT);
3375 
3376    for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
3377       const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
3378       uint32_t layers = MAX2(info->extent.depth,
3379                              vk_image_subresource_layer_count(&dst_image->vk,
3380                                                               &info->dstSubresource));
3381 
3382       /* TODO: are aspect masks possible here? */
3383 
3384       coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
3385 
3386       struct fdl6_view dst, src;
3387       tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
3388       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
3389 
3390       for (uint32_t i = 0; i < layers; i++) {
3391          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
3392          ops->dst(cs, &dst, i, src_format);
3393          ops->run(cmd, cs);
3394       }
3395    }
3396 
3397    ops->teardown(cmd, cs);
3398 }
3399 TU_GENX(tu_CmdResolveImage2);
3400 
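/* Iterate over the set bits of layer_mask (multiview) when it is non-zero,
 * otherwise over [0, layers).
 */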
3401 #define for_each_layer(layer, layer_mask, layers) \
3402    for (uint32_t layer = 0; \
3403         layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
3404         layer++) \
3405       if (!layer_mask || (layer_mask & BIT(layer)))
3406 
3407 template <chip CHIP>
3408 static void
3409 resolve_sysmem(struct tu_cmd_buffer *cmd,
3410                struct tu_cs *cs,
3411                VkFormat vk_src_format,
3412                VkFormat vk_dst_format,
3413                const struct tu_image_view *src,
3414                const struct tu_image_view *dst,
3415                uint32_t layer_mask,
3416                uint32_t layers,
3417                const VkRect2D *rect,
3418                bool src_separate_ds,
3419                bool dst_separate_ds)
3420 {
3421    const struct blit_ops *ops = &r2d_ops<CHIP>;
3422 
3423    trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3424 
3425    enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3426    enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3427 
3428    ops->setup(cmd, cs, src_format, dst_format,
3429               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3430               VK_SAMPLE_COUNT_1_BIT);
3431    ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3432 
3433    for_each_layer(i, layer_mask, layers) {
3434       if (src_separate_ds) {
3435          if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3436             r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3437          } else {
3438             r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3439          }
3440       } else {
3441          ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3442       }
3443 
3444       if (dst_separate_ds) {
3445          if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3446             ops->dst_depth(cs, dst, i);
3447          } else {
3448             ops->dst_stencil(cs, dst, i);
3449          }
3450       } else {
3451          ops->dst(cs, &dst->view, i, src_format);
3452       }
3453 
3454       ops->run(cmd, cs);
3455    }
3456 
3457    ops->teardown(cmd, cs);
3458 
3459    trace_end_sysmem_resolve(&cmd->trace, cs);
3460 }
3461 
3462 template <chip CHIP>
3463 void
3464 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3465                   struct tu_cs *cs,
3466                   const struct tu_image_view *src,
3467                   const struct tu_image_view *dst,
3468                   uint32_t layer_mask,
3469                   uint32_t layers,
3470                   const VkRect2D *rect)
3471 {
3472    assert(src->image->vk.format == dst->image->vk.format ||
3473           (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3474            vk_format_is_depth_or_stencil(dst->image->vk.format)));
3475 
3476    bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3477    bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3478 
3479    if (dst_separate_ds) {
3480       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3481                      src, dst, layer_mask, layers, rect,
3482                      src_separate_ds, dst_separate_ds);
3483       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3484                      src, dst, layer_mask, layers, rect,
3485                      src_separate_ds, dst_separate_ds);
3486    } else {
3487       resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
3488                      src, dst, layer_mask, layers, rect,
3489                      src_separate_ds, dst_separate_ds);
3490    }
3491 }
3492 TU_GENX(tu_resolve_sysmem);
3493 
3494 enum tu_resolve_group_buffer_type {
3495    TU_RESOLVE_GROUP_COLOR_BUFFER,
3496    TU_RESOLVE_GROUP_DEPTH_BUFFER,
3497    TU_RESOLVE_GROUP_STENCIL_BUFFER,
3498 };
3499 
3500 template <chip CHIP>
3501 static uint32_t
3502 tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
3503                                 enum tu_resolve_group_buffer_type buffer_type)
3504 {
3505    /* Resolve groups are not usable on a6xx, so no pending resolve is
3506     * established. The default value of 0 is returned as the buffer ID.
3507     */
3508    if (CHIP == A6XX)
3509       return 0;
3510 
3511    resolve_group->pending_resolves = true;
3512 
3513    if (buffer_type == TU_RESOLVE_GROUP_DEPTH_BUFFER)
3514       return 0x8;
3515    if (buffer_type == TU_RESOLVE_GROUP_STENCIL_BUFFER)
3516       return 0x9;
3517 
3518    const uint32_t max_color_buffers = 8;
3519    uint32_t buffer_id = resolve_group->color_buffer_id++;
3520    return buffer_id % max_color_buffers;
3521 }
3522 
3523 template <chip CHIP>
3524 static uint32_t
3525 tu_resolve_group_include_buffer_for_format(struct tu_resolve_group *resolve_group,
3526                                            VkFormat format)
3527 {
3528    enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
3529 
3530    /* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
3531     * whether depth, stencil or both are being resolved.
3532     */
3533    if (format == VK_FORMAT_D24_UNORM_S8_UINT)
3534       buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
3535 
3536    return tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
3537 }
3538 
3539 template <chip CHIP>
3540 void
3541 tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
3542                           struct tu_cs *cs,
3543                           struct tu_resolve_group *resolve_group)
3544 {
3545    /* Resolve groups are not usable on A6XX, so that template instantiation
3546     * should behave as a no-op.
3547     */
3548    if (CHIP == A6XX || !resolve_group->pending_resolves)
3549       return;
3550 
3551    resolve_group->color_buffer_id = 0;
3552    resolve_group->pending_resolves = false;
3553 
3554    tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
3555 }
3556 TU_GENX(tu_emit_resolve_group);
3557 
3558 template <chip CHIP>
3559 static void
3560 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3561                     struct tu_image *image,
3562                     const VkClearValue *clear_value,
3563                     const VkImageSubresourceRange *range,
3564                     VkImageAspectFlags aspect_mask)
3565 {
3566    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3567    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3568    struct tu_cs *cs = &cmd->cs;
3569    enum pipe_format format;
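   /* E5B9G9R9 isn't directly renderable here, so clear it through an R32_UINT
    * view; the raw clear value is still packed as R9G9B9E5 below.
    */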
3570    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3571       format = PIPE_FORMAT_R32_UINT;
3572    } else {
3573       format = tu_aspects_to_plane(image->vk.format, aspect_mask);
3574    }
3575 
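   /* For 3D images the range describes a single "layer"; instead we clear
    * every depth slice, recomputing layer_count per level below.
    */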
3576    if (image->layout[0].depth0 > 1) {
3577       assert(layer_count == 1);
3578       assert(range->baseArrayLayer == 0);
3579    }
3580 
3581    const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3582 
3583    ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3584               (VkSampleCountFlagBits) image->layout[0].nr_samples);
3585    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3586       ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3587    else
3588       ops->clear_value(cmd, cs, format, clear_value);
3589 
3590    for (unsigned j = 0; j < level_count; j++) {
3591       if (image->layout[0].depth0 > 1)
3592          layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3593 
3594       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3595                      u_minify(image->layout[0].width0, range->baseMipLevel + j),
3596                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
3597                   });
3598 
3599       struct fdl6_view dst;
3600       const VkImageSubresourceLayers subresource = {
3601          .aspectMask = aspect_mask,
3602          .mipLevel = range->baseMipLevel + j,
3603          .baseArrayLayer = range->baseArrayLayer,
3604          .layerCount = 1,
3605       };
3606       tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3607 
3608       for (uint32_t i = 0; i < layer_count; i++) {
3609          ops->dst(cs, &dst, i, format);
3610          ops->run(cmd, cs);
3611       }
3612    }
3613 
3614    ops->teardown(cmd, cs);
3615 }
3616 
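/* A7XX generic-clear path: clears each layer of each level with a
 * BLIT_EVENT_CLEAR event rather than a 2D/3D draw.
 */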
3617 static void
3618 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3619                        struct tu_image *image,
3620                        uint32_t buffer_id,
3621                        const VkClearValue *clear_value,
3622                        const VkImageSubresourceRange *range,
3623                        VkImageAspectFlags aspect_mask)
3624 {
3625    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3626    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3627    VkFormat vk_format = image->vk.format;
3628    if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3629       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3630          vk_format = VK_FORMAT_S8_UINT;
3631       else
3632          vk_format = VK_FORMAT_D32_SFLOAT;
3633    }
3634 
3635    enum pipe_format format = vk_format_to_pipe_format(vk_format);
3636 
3637    if (image->layout[0].depth0 > 1) {
3638       assert(layer_count == 1);
3639       assert(range->baseArrayLayer == 0);
3640    }
3641 
3642    struct tu_cs *cs = &cmd->cs;
3643 
3644    tu_cs_emit_regs(cs,
3645                    A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3646 
3647    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3648    tu_cs_emit(cs, 0);
3649 
3650    tu_cs_emit_regs(
3651       cs, A6XX_RB_BLIT_INFO(
3652                 .type = BLIT_EVENT_CLEAR,
3653                 .sample_0 = vk_format_is_int(vk_format) ||
3654                             vk_format_is_depth_or_stencil(vk_format),
3655                 .depth = vk_format_is_depth_or_stencil(vk_format),
3656                 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
3657                 .buffer_id = buffer_id));
3658 
3659    uint32_t clear_vals[4] = {};
3660    pack_blit_event_clear_value(clear_value, format, clear_vals);
3661    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3662    tu_cs_emit_array(cs, clear_vals, 4);
3663 
3664    for (unsigned level = 0; level < level_count; level++) {
3665       if (image->layout[0].depth0 > 1)
3666          layer_count =
3667             u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3668 
3669       uint32_t width =
3670          u_minify(image->layout[0].width0, range->baseMipLevel + level);
3671       uint32_t height =
3672          u_minify(image->layout[0].height0, range->baseMipLevel + level);
3673       tu_cs_emit_regs(
3674          cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3675          A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3676 
3677       struct fdl6_view dst;
3678       const VkImageSubresourceLayers subresource = {
3679          .aspectMask = aspect_mask,
3680          .mipLevel = range->baseMipLevel + level,
3681          .baseArrayLayer = range->baseArrayLayer,
3682          .layerCount = 1,
3683       };
3684       tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3685 
3686       for (uint32_t layer = 0; layer < layer_count; layer++) {
3687 
3688          struct event_blit_dst_view blt_view = {
3689             .image = image,
3690             .view = &dst,
3691             .layer = layer,
3692          };
3693 
3694          if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3695             uint32_t real_level = range->baseMipLevel + level;
3696             uint32_t real_layer = range->baseArrayLayer + layer;
3697             if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3698                struct fdl_layout *layout = &image->layout[0];
3699                blt_view.depth_addr =
3700                   image->iova +
3701                   fdl_surface_offset(layout, real_level, real_layer);
3702                blt_view.depth_pitch = fdl_pitch(layout, real_level);
3703             } else {
3704                struct fdl_layout *layout = &image->layout[1];
3705                blt_view.stencil_addr =
3706                   image->iova +
3707                   fdl_surface_offset(layout, real_level, real_layer);
3708                blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3709             }
3710          }
3711 
3712          event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3713                               aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3714       }
3715    }
3716 }
3717 
3718 static bool
3719 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3720                                   struct tu_image *image)
3721 {
3722    const struct fd_dev_info *info = cmd->device->physical_device->info;
3723    return info->a7xx.has_generic_clear &&
3724           /* A7XX supports R9G9B9E5_FLOAT as color attachment and supports
3725            * generic clears for it. A7XX TODO: allow R9G9B9E5_FLOAT
3726            * attachments.
3727            */
3728           image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 &&
3729           /* Clearing VK_FORMAT_R8G8_* with fast-clear value, certain
3730            * dimensions (e.g. 960x540), and having GMEM renderpass afterwards
3731            * may lead to a GPU fault on A7XX.
3732            */
3733           !(info->a7xx.r8g8_faulty_fast_clear_quirk && image_is_r8g8(image));
3734 }
3735 
3736 template <chip CHIP>
3737 static void
3738 clear_image(struct tu_cmd_buffer *cmd,
3739             struct tu_image *image,
3740             uint32_t buffer_id,
3741             const VkClearValue *clear_value,
3742             const VkImageSubresourceRange *range,
3743             VkImageAspectFlags aspect_mask)
3744 {
3745    if (use_generic_clear_for_image_clear(cmd, image)) {
3746       clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
3747    } else {
3748       clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3749    }
3750 }
3751 
3752 template <chip CHIP>
3753 VKAPI_ATTR void VKAPI_CALL
3754 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3755                       VkImage image_h,
3756                       VkImageLayout imageLayout,
3757                       const VkClearColorValue *pColor,
3758                       uint32_t rangeCount,
3759                       const VkImageSubresourceRange *pRanges)
3760 {
3761    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3762    VK_FROM_HANDLE(tu_image, image, image_h);
3763 
3764    bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3765    if (use_generic_clear) {
3766       /* Generic clear doesn't go through CCU (or other caches). */
3767       cmd->state.cache.flush_bits |=
3768          TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3769       tu_emit_cache_flush<CHIP>(cmd);
3770    }
3771 
3772    struct tu_resolve_group resolve_group = {};
3773 
3774    for (unsigned i = 0; i < rangeCount; i++) {
3775       uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_COLOR_BUFFER);
3776       clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3777    }
3778 
3779    tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3780    if (use_generic_clear) {
3781       /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3782        * proceed only after the just-emitted generic clears are complete.
3783        */
3784       cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3785       tu_emit_cache_flush<CHIP>(cmd);
3786    }
3787 }
3788 TU_GENX(tu_CmdClearColorImage);
3789 
3790 template <chip CHIP>
3791 VKAPI_ATTR void VKAPI_CALL
3792 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3793                              VkImage image_h,
3794                              VkImageLayout imageLayout,
3795                              const VkClearDepthStencilValue *pDepthStencil,
3796                              uint32_t rangeCount,
3797                              const VkImageSubresourceRange *pRanges)
3798 {
3799    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3800    VK_FROM_HANDLE(tu_image, image, image_h);
3801 
3802    bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3803    if (use_generic_clear) {
3804       /* Generic clear doesn't go through CCU (or other caches). */
3805       cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3806                                      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3807                                      TU_CMD_FLAG_WAIT_FOR_IDLE;
3808       tu_emit_cache_flush<CHIP>(cmd);
3809    }
3810 
3811    struct tu_resolve_group resolve_group = {};
3812 
3813    for (unsigned i = 0; i < rangeCount; i++) {
3814       const VkImageSubresourceRange *range = &pRanges[i];
3815 
3816       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3817          /* can't clear both depth and stencil at once, split up the aspect mask */
3818          u_foreach_bit(b, range->aspectMask) {
3819             uint32_t buffer_id = 0;
3820             if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
3821                buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
3822             if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
3823                buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
3824 
3825             clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
3826          }
3827          continue;
3828       }
3829 
3830       uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(&resolve_group, image->vk.format);
3831       clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3832    }
3833 
3834    tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3835    if (use_generic_clear) {
3836       /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3837        * proceed only after the just-emitted generic clears are complete.
3838        */
3839       cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3840       tu_emit_cache_flush<CHIP>(cmd);
3841    }
3842 
3843    tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3844 }
3845 TU_GENX(tu_CmdClearDepthStencilImage);
3846 
3847 /* CmdClearAttachments uses the original color attachment index instead of the
3848  * remapped index used by the shader, and our MRTs use the remapped
3849  * indices, so we have to remap them. We should always be able to find a
3850  * shader attachment thanks to this VU:
3851  *
3852  *    VUID-vkCmdClearAttachments-colorAttachment-09503
3853  *    "The colorAttachment member of each element of pAttachments must not
3854  *    identify a color attachment that is currently mapped to
3855  *    VK_ATTACHMENT_UNUSED in commandBuffer via
3856  *    VkRenderingAttachmentLocationInfoKHR"
3857  */
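/* For illustration (a hypothetical mapping, not taken from the code below): if
 * VkRenderingAttachmentLocationInfoKHR maps color attachment 0 to MRT 1 and
 * attachment 1 to MRT 0, then clearing colorAttachment = 0 has to write MRT 1,
 * which is the index remap_attachment() returns.
 */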
3858 static unsigned
3859 remap_attachment(struct tu_cmd_buffer *cmd, unsigned a)
3860 {
3861    unsigned i = cmd->vk.dynamic_graphics_state.cal.color_map[a];
3862    assert(i != MESA_VK_ATTACHMENT_UNUSED &&
3863           "app violates VUID-vkCmdClearAttachments-colorAttachment-09503");
3864    return i;
3865 }
3866 
3867 template <chip CHIP>
3868 static void
3869 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3870                             uint32_t attachment_count,
3871                             const VkClearAttachment *attachments,
3872                             uint32_t rect_count,
3873                             const VkClearRect *rects)
3874 {
3875    /* the shader path here is special, it avoids changing MRT/etc state */
3876    const struct tu_subpass *subpass = cmd->state.subpass;
3877    const uint32_t mrt_count = subpass->color_count;
3878    struct tu_cs *cs = &cmd->draw_cs;
3879    uint32_t clear_value[MAX_RTS][4];
3880    float z_clear_val = 0.0f;
3881    uint8_t s_clear_val = 0;
3882    uint32_t clear_rts = 0, clear_components = 0;
3883    bool z_clear = false;
3884    bool s_clear = false;
3885 
3886    trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3887 
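   /* clear_rts collects a bitmask of (remapped) MRT slots to clear, while
    * clear_components packs a 4-bit RGBA write mask per MRT slot
    * (0xf << (mrt * 4)), matching the RB_RENDER_COMPONENTS layout.
    */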
3888    for (uint32_t i = 0; i < attachment_count; i++) {
3889       uint32_t a;
3890       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3891          uint32_t c = attachments[i].colorAttachment;
3892          a = subpass->color_attachments[c].attachment;
3893          if (a == VK_ATTACHMENT_UNUSED)
3894             continue;
3895 
3896          uint32_t remapped = remap_attachment(cmd, c);
3897          clear_rts |= 1 << remapped;
3898          clear_components |= 0xf << (remapped * 4);
3899          memcpy(clear_value[remapped], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3900       } else {
3901          a = subpass->depth_stencil_attachment.attachment;
3902          if (a == VK_ATTACHMENT_UNUSED)
3903             continue;
3904 
3905          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3906             z_clear = true;
3907             z_clear_val = attachments[i].clearValue.depthStencil.depth;
3908          }
3909 
3910          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3911             s_clear = true;
3912             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3913          }
3914       }
3915    }
3916 
3917    /* We may not know the multisample count if there are no attachments, so
3918     * just bail early to avoid corner cases later.
3919     */
3920    if (clear_rts == 0 && !z_clear && !s_clear)
3921       return;
3922 
3923    /* Disable all draw states so they don't interfere.
3924     * TODO: use and re-use draw states.
3925     * We have to disable draw states individually to preserve the
3926     * input attachment states, because a secondary command buffer
3927     * won't be able to restore them.
3928     */
3929    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3930    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3931       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3932           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3933          continue;
3934       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3935                      CP_SET_DRAW_STATE__0_DISABLE);
3936       tu_cs_emit_qw(cs, 0);
3937    }
3938    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3939 
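   /* 0xfc is the "invalid/unused register" encoding: the clear shader exports
    * neither depth nor sample mask, and the 0xfc000000 presumably fills the
    * remaining regid field of this register the same way.
    */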
3940    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3941    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3942                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3943                   0xfc000000);
3944    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3945 
3946    r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3947 
3948    /* Disable sample counting in order to not affect occlusion query. */
3949    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3950 
3951    if (cmd->state.prim_generated_query_running_before_rp) {
3952       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3953    }
3954 
3955    tu_cs_emit_regs(cs,
3956                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3957    tu_cs_emit_regs(cs,
3958                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3959 
3960    tu_cs_emit_regs(cs,
3961                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3962 
3963    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3964    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3965    for (uint32_t i = 0; i < mrt_count; i++) {
3966       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3967             .component_enable = COND(clear_rts & (1 << i), 0xf)));
3968    }
3969 
3970    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3971    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3972 
3973    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3974    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3975          .z_test_enable = z_clear,
3976          .z_write_enable = z_clear,
3977          .zfunc = FUNC_ALWAYS));
3978    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
3979    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
3980    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3981          .stencil_enable = s_clear,
3982          .func = FUNC_ALWAYS,
3983          .zpass = STENCIL_REPLACE));
3984    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
3985    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
3986    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
3987    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
3988 
3989    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
3990 
3991    unsigned num_rts = util_bitcount(clear_rts);
3992    uint32_t packed_clear_value[MAX_RTS][4];
3993 
3994    uint32_t idx = 0;
3995    u_foreach_bit(b, clear_rts) {
3996       memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
3997       idx++;
3998    }
3999 
4000    if (num_rts > 0)
4001       tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
4002                                 0, packed_clear_value, num_rts);
4003 
4004    for (uint32_t i = 0; i < rect_count; i++) {
4005       /* This should be true because of this valid usage for
4006        * vkCmdClearAttachments:
4007        *
4008        *    "If the render pass instance this is recorded in uses multiview,
4009        *    then baseArrayLayer must be zero and layerCount must be one"
4010        */
4011       assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
4012 
4013       /* a630 doesn't support multiview masks, which means that we can't use
4014        * the normal multiview path without potentially recompiling a shader
4015        * on-demand or using a more complicated variant that takes the mask as
4016        * a const. Just use the layered path instead, since it shouldn't be
4017        * much worse.
4018        */
4019       for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
4020       {
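         /* Two raw (x, y, z, w) vertices: top-left and bottom-right of the
          * clear rect, with z carrying the depth clear value and w carrying
          * the destination layer for the first vertex (1.0 for the second).
          */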
4021          const float coords[] = {
4022             rects[i].rect.offset.x,
4023             rects[i].rect.offset.y,
4024             z_clear_val,
4025             uif(rects[i].baseArrayLayer + layer),
4026             rects[i].rect.offset.x + rects[i].rect.extent.width,
4027             rects[i].rect.offset.y + rects[i].rect.extent.height,
4028             z_clear_val,
4029             1.0f,
4030          };
4031 
4032          r3d_coords_raw(cmd, cs, coords);
4033          r3d_run_vis(cmd, cs);
4034       }
4035    }
4036 
4037    /* Re-enable sample counting. */
4038    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
4039 
4040    if (cmd->state.prim_generated_query_running_before_rp) {
4041       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
4042    }
4043 
4044    trace_end_sysmem_clear_all(&cmd->trace, cs);
4045 }
4046 
4047 template <chip CHIP>
4048 static void
4049 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4050                       struct tu_cs *cs,
4051                       uint32_t buffer_id,
4052                       enum pipe_format format,
4053                       uint8_t clear_mask,
4054                       uint32_t gmem_offset,
4055                       const VkClearValue *value)
4056 {
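   /* A GMEM clear is done with the blit-event engine rather than a draw or a
    * 2D blit: RB_BLIT_INFO selects BLIT_EVENT_CLEAR, RB_BLIT_BASE_GMEM points
    * at the attachment's offset in the tile buffer, the packed clear value
    * goes into RB_BLIT_CLEAR_COLOR_DW0..3, and the FD_BLIT event kicks it off.
    */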
4057    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
4058    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
4059             blit_base_format<CHIP>(format, false, true)));
4060 
4061    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
4062                                          .clear_mask = clear_mask,
4063                                          .buffer_id = buffer_id));
4064 
4065    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
4066    tu_cs_emit(cs, gmem_offset);
4067 
4068    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
4069    tu_cs_emit(cs, 0);
4070 
4071    uint32_t clear_vals[4] = {};
4072    pack_blit_event_clear_value(value, format, clear_vals);
4073 
4074    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4075    tu_cs_emit_array(cs, clear_vals, 4);
4076 
4077    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
4078 }
4079 
4080 template <chip CHIP>
4081 static void
4082 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4083                               struct tu_cs *cs,
4084                               struct tu_resolve_group *resolve_group,
4085                               uint32_t attachment,
4086                               uint32_t base_layer,
4087                               uint32_t layers,
4088                               uint32_t layer_mask,
4089                               VkImageAspectFlags mask,
4090                               const VkClearValue *value)
4091 {
4092    const struct tu_render_pass_attachment *att =
4093       &cmd->state.pass->attachments[attachment];
4094 
4095    trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
4096 
4097    tu_cs_emit_regs(cs,
4098                    A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
4099 
4100    enum pipe_format format = vk_format_to_pipe_format(att->format);
4101    for_each_layer(i, layer_mask, layers) {
4102       uint32_t layer = i + base_layer;
4103       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4104          if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4105             uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4106             clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
4107                                   tu_attachment_gmem_offset(cmd, att, layer), value);
4108          }
4109          if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4110             uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4111             clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
4112                                   tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
4113          }
4114       } else {
4115          uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(resolve_group, att->format);
4116          clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, format, aspect_write_mask(format, mask),
4117                                tu_attachment_gmem_offset(cmd, att, layer), value);
4118       }
4119    }
4120 
4121    tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4122 
4123    trace_end_gmem_clear(&cmd->trace, cs);
4124 }
4125 
4126 template <chip CHIP>
4127 static void
4128 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
4129                           uint32_t attachment_count,
4130                           const VkClearAttachment *attachments,
4131                           uint32_t rect_count,
4132                           const VkClearRect *rects)
4133 {
4134    const struct tu_subpass *subpass = cmd->state.subpass;
4135    struct tu_cs *cs = &cmd->draw_cs;
4136 
4137    if (rect_count > 1)
4138       perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
4139 
4140    struct tu_resolve_group resolve_group = {};
4141 
4142    for (unsigned i = 0; i < rect_count; i++) {
4143       unsigned x1 = rects[i].rect.offset.x;
4144       unsigned y1 = rects[i].rect.offset.y;
4145       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
4146       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
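      /* RB_BLIT_SCISSOR_BR is inclusive, hence the -1 on both coordinates. */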
4147 
4148       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4149       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4150       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4151 
4152       for (unsigned j = 0; j < attachment_count; j++) {
4153          uint32_t a;
4154          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
4155             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
4156          else
4157             a = subpass->depth_stencil_attachment.attachment;
4158 
4159          if (a == VK_ATTACHMENT_UNUSED)
4160                continue;
4161 
4162          tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
4163                                        rects[i].baseArrayLayer,
4164                                        rects[i].layerCount,
4165                                        subpass->multiview_mask,
4166                                        attachments[j].aspectMask,
4167                                        &attachments[j].clearValue);
4168       }
4169    }
4170 
4171    tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
4172 }
4173 
4174 template <chip CHIP>
4175 static void
4176 tu_clear_attachments(struct tu_cmd_buffer *cmd,
4177                      uint32_t attachmentCount,
4178                      const VkClearAttachment *pAttachments,
4179                      uint32_t rectCount,
4180                      const VkClearRect *pRects)
4181 {
4182    struct tu_cs *cs = &cmd->draw_cs;
4183 
4184    /* The sysmem path behaves like a draw. Note we don't have a way of using
4185     * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
4186     */
4187    tu_emit_cache_flush_renderpass<CHIP>(cmd);
4188 
4189    /* vkCmdClearAttachments is supposed to respect the predicate if active. The
4190     * easiest way to do this is to always use the 3d path, which always works
4191     * even with GMEM because it's just a simple draw using the existing
4192     * attachment state.
4193     *
4194     * Similarly, we also use the 3D path when in a secondary command buffer that
4195     * doesn't know the GMEM layout that will be chosen by the primary.
4196     */
4197    if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
4198       tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4199       return;
4200    }
4201 
4202    /* If tile load/stores for an attachment could be skipped based on whether any
4203     * draws intersect the tile at binning time, then emit the clear as a 3D draw
4204     * so that it contributes to that visibility.
4205     */
4206    const struct tu_subpass *subpass = cmd->state.subpass;
4207    for (uint32_t i = 0; i < attachmentCount; i++) {
4208       uint32_t a;
4209       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4210          uint32_t c = pAttachments[i].colorAttachment;
4211          a = subpass->color_attachments[c].attachment;
4212       } else {
4213          a = subpass->depth_stencil_attachment.attachment;
4214       }
4215       if (a != VK_ATTACHMENT_UNUSED) {
4216          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4217          if (att->cond_load_allowed || att->cond_store_allowed) {
4218             tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4219             return;
4220          }
4221       }
4222    }
4223 
4224    /* Otherwise, emit 2D blits for gmem rendering. */
4225    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
4226    tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4227    tu_cond_exec_end(cs);
4228 
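   /* Note that both the GMEM and sysmem variants are recorded into the same
    * command stream; the CP_COND_EXEC ranges ensure that only the variant
    * matching the render mode chosen for this render pass actually executes.
    */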
4229    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4230    tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4231    tu_cond_exec_end(cs);
4232 }
4233 
4234 static void
4235 tu7_clear_attachment_generic_single_rect(
4236    struct tu_cmd_buffer *cmd,
4237    struct tu_cs *cs,
4238    struct tu_resolve_group *resolve_group,
4239    const struct tu_render_pass_attachment *att,
4240    const VkClearAttachment *clear_att,
4241    uint32_t a,
4242    const VkClearRect *rect)
4243 {
4244    const struct tu_subpass *subpass = cmd->state.subpass;
4245    unsigned x1 = rect->rect.offset.x;
4246    unsigned y1 = rect->rect.offset.y;
4247    unsigned x2 = x1 + rect->rect.extent.width - 1;
4248    unsigned y2 = y1 + rect->rect.extent.height - 1;
4249 
4250    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4251    tu_cs_emit(cs,
4252               A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4253    tu_cs_emit(cs,
4254               A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4255 
4256    auto value = &clear_att->clearValue;
4257 
4258    enum pipe_format format = vk_format_to_pipe_format(att->format);
4259    for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
4260       uint32_t layer = i + rect->baseArrayLayer;
4261       uint32_t mask =
4262          aspect_write_mask_generic_clear(format, clear_att->aspectMask);
4263 
4264       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4265          if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4266             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4267             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4268                                     false, layer, value, a);
4269          }
4270          if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4271             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4272             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4273                                     layer, value, a);
4274          }
4275       } else {
4276          uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
4277          tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4278       }
4279    }
4280 }
4281 
4282 static void
4283 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
4284                              uint32_t attachmentCount,
4285                              const VkClearAttachment *pAttachments,
4286                              uint32_t rectCount,
4287                              const VkClearRect *pRects)
4288 {
4289    struct tu_cs *cs = &cmd->draw_cs;
4290 
4291    uint32_t clear_aspects = 0;
4292    for (uint32_t i = 0; i < attachmentCount; i++) {
4293       clear_aspects |= pAttachments[i].aspectMask;
4294    }
4295 
4296    /* Generic clear doesn't go through CCU (or other caches),
4297     * so we have to flush (clean+invalidate) corresponding caches.
4298     */
4299    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4300    if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
4301       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4302       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
4303    }
4304    if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4305       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4306       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
4307    }
4308    tu_cs_emit_wfi(cs);
4309    tu_cond_exec_end(cs);
4310 
4311    struct tu_resolve_group resolve_group = {};
4312 
4313    const struct tu_subpass *subpass = cmd->state.subpass;
4314    for (uint32_t i = 0; i < attachmentCount; i++) {
4315       uint32_t a;
4316       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4317          uint32_t c = pAttachments[i].colorAttachment;
4318          a = subpass->color_attachments[c].attachment;
4319       } else {
4320          a = subpass->depth_stencil_attachment.attachment;
4321       }
4322       if (a != VK_ATTACHMENT_UNUSED) {
4323          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4324          const struct tu_image_view *iview = cmd->state.attachments[a];
4325          trace_start_generic_clear(&cmd->trace, cs, att->format,
4326                                    iview->view.ubwc_enabled, att->samples);
4327          for (unsigned j = 0; j < rectCount; j++) {
4328             tu7_clear_attachment_generic_single_rect(
4329                cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
4330          }
4331          trace_end_generic_clear(&cmd->trace, cs);
4332       }
4333    }
4334 
4335    tu_emit_resolve_group<A7XX>(cmd, cs, &resolve_group);
4336 }
4337 
4338 template <chip CHIP>
4339 VKAPI_ATTR void VKAPI_CALL
4340 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
4341                        uint32_t attachmentCount,
4342                        const VkClearAttachment *pAttachments,
4343                        uint32_t rectCount,
4344                        const VkClearRect *pRects)
4345 {
4346    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4347 
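   /* Clearing the depth aspect through this path bypasses LRZ, so LRZ is
    * conservatively disabled for the remainder of the render pass below.
    */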
4348    for (uint32_t j = 0; j < attachmentCount; j++) {
4349       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
4350          continue;
4351 
4352       tu_lrz_disable_during_renderpass<CHIP>(cmd);
4353    }
4354 
4355    if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
4356        /* Both having predication and not knowing the GMEM layout could be
4357         * solved by CS patching, which is exactly what the proprietary driver
4358         * does. We don't implement it because we don't expect a meaningful impact.
4359         */
4360        !(cmd->state.predication_active ||
4361          cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
4362       tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
4363    } else {
4364       tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
4365                                  rectCount, pRects);
4366    }
4367 }
4368 TU_GENX(tu_CmdClearAttachments);
4369 
4370 template <chip CHIP>
4371 static void
4372 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4373                         struct tu_cs *cs,
4374                         VkFormat vk_format,
4375                         VkImageAspectFlags clear_mask,
4376                         uint32_t a,
4377                         bool separate_ds)
4378 {
4379    enum pipe_format format = vk_format_to_pipe_format(vk_format);
4380    const struct tu_framebuffer *fb = cmd->state.framebuffer;
4381    const struct tu_image_view *iview = cmd->state.attachments[a];
4382    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
4383    const struct blit_ops *ops = &r2d_ops<CHIP>;
4384    const VkClearValue *value = &cmd->state.clear_values[a];
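   /* The 2D blit engine can't write multisampled destinations, so MSAA
    * attachments fall back to the 3D draw path below.
    */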
4385    if (cmd->state.pass->attachments[a].samples > 1)
4386       ops = &r3d_ops<CHIP>;
4387 
4388    trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
4389                             cmd->state.pass->attachments[a].samples);
4390 
4391    ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
4392               cmd->state.pass->attachments[a].samples);
4393    ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
4394                cmd->state.render_area.extent);
4395    ops->clear_value(cmd, cs, format, value);
4396 
4397    for_each_layer(i, clear_views, fb->layers) {
4398       if (separate_ds) {
4399          if (vk_format == VK_FORMAT_D32_SFLOAT) {
4400             ops->dst_depth(cs, iview, i);
4401          } else {
4402             ops->dst_stencil(cs, iview, i);
4403          }
4404       } else {
4405          ops->dst(cs, &iview->view, i, format);
4406       }
4407       ops->run(cmd, cs);
4408    }
4409 
4410    ops->teardown(cmd, cs);
4411 
4412    trace_end_sysmem_clear(&cmd->trace, cs);
4413 }
4414 
4415 template <chip CHIP>
4416 void
4417 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4418                            struct tu_cs *cs,
4419                            uint32_t a)
4420 {
4421    const struct tu_render_pass_attachment *attachment =
4422       &cmd->state.pass->attachments[a];
4423 
4424    if (!attachment->clear_mask)
4425       return;
4426 
4427    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4428       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4429          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
4430                                  a, true);
4431       }
4432       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4433          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
4434                                  a, true);
4435       }
4436    } else {
4437       clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
4438                               a, false);
4439    }
4440 
4441    /* The spec doesn't explicitly say, but presumably the initial renderpass
4442     * clear is considered part of the renderpass, and therefore barriers
4443     * aren't required inside the subpass/renderpass.  Therefore we need to
4444     * flush CCU color into CCU depth here, just like with
4445     * vkCmdClearAttachments(). Note that because this only happens at the
4446     * beginning of a renderpass, and renderpass writes are considered
4447     * "incoherent", we shouldn't have to worry about syncing depth into color
4448     * beforehand as depth should already be flushed.
4449     */
4450    if (vk_format_is_depth_or_stencil(attachment->format)) {
4451       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4452       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
4453       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
4454    } else {
4455       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4456       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
4457    }
4458 
4459    tu_cs_emit_wfi(cs);
4460 }
4461 TU_GENX(tu_clear_sysmem_attachment);
4462 
4463 template <chip CHIP>
4464 void
4465 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4466                          struct tu_cs *cs,
4467                          struct tu_resolve_group *resolve_group,
4468                          uint32_t a)
4469 {
4470    const struct tu_render_pass_attachment *attachment =
4471       &cmd->state.pass->attachments[a];
4472 
4473    if (!attachment->clear_mask)
4474       return;
4475 
4476    tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
4477                                  cmd->state.framebuffer->layers,
4478                                  attachment->clear_views,
4479                                  attachment->clear_mask,
4480                                  &cmd->state.clear_values[a]);
4481 }
4482 TU_GENX(tu_clear_gmem_attachment);
4483 
4484 void
4485 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
4486                              struct tu_cs *cs,
4487                              struct tu_resolve_group *resolve_group,
4488                              uint32_t a)
4489 {
4490    const struct tu_render_pass_attachment *att =
4491       &cmd->state.pass->attachments[a];
4492    const VkClearValue *value = &cmd->state.clear_values[a];
4493    const struct tu_image_view *iview = cmd->state.attachments[a];
4494 
4495    trace_start_generic_clear(&cmd->trace, cs, att->format,
4496                              iview->view.ubwc_enabled, att->samples);
4497 
4498    enum pipe_format format = vk_format_to_pipe_format(att->format);
4499    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4500       uint32_t layer = i + 0;
4501       uint32_t mask =
4502          aspect_write_mask_generic_clear(format, att->clear_mask);
4503       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4504          if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4505             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4506             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4507                                     false, layer, value, a);
4508          }
4509          if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4510             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4511             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4512                                     layer, value, a);
4513          }
4514       } else {
4515          uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
4516          tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4517       }
4518    }
4519 
4520    tu_flush_for_access(&cmd->state.renderpass_cache,
4521                        TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4522 
4523    trace_end_generic_clear(&cmd->trace, cs);
4524 }
4525 
4526 template <chip CHIP>
4527 static void
4528 tu_emit_blit(struct tu_cmd_buffer *cmd,
4529              struct tu_cs *cs,
4530              struct tu_resolve_group *resolve_group,
4531              const struct tu_image_view *iview,
4532              const struct tu_render_pass_attachment *attachment,
4533              const VkClearValue *clear_value,
4534              enum a6xx_blit_event_type blit_event_type,
4535              bool separate_stencil)
4536 {
4537    assert(blit_event_type != BLIT_EVENT_CLEAR);
4538    uint32_t clear_mask = 0;
4539 
4540    /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
4541     * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
4542     */
4543    if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
4544        attachment->clear_mask &&
4545        use_generic_clear_for_image_clear(cmd, iview->image)) {
4546       blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
4547 
4548       enum pipe_format format = vk_format_to_pipe_format(attachment->format);
4549       VkImageAspectFlags aspect_mask = attachment->clear_mask;
4550       if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
4551          if (separate_stencil)
4552             aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
4553          else
4554             aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
4555       }
4556       if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
4557          if (separate_stencil)
4558             format = PIPE_FORMAT_S8_UINT;
4559          else
4560             format = PIPE_FORMAT_Z32_FLOAT;
4561       }
4562 
4563       clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
4564 
4565       uint32_t clear_vals[4] = {};
4566       pack_blit_event_clear_value(clear_value, format, clear_vals);
4567 
4568       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4569       tu_cs_emit_array(cs, clear_vals, 4);
4570    }
4571 
4572    enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
4573    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4574       if (!separate_stencil)
4575          buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
4576       else
4577          buffer_type = TU_RESOLVE_GROUP_STENCIL_BUFFER;
4578    } else if (attachment->format == VK_FORMAT_D24_UNORM_S8_UINT) {
4579       buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
4580    }
4581 
4582    uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
4583    event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
4584 
4585    for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4586       event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4587       event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4588    }
4589 
4590    tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4591                        TU_ACCESS_NONE);
4592 }
4593 
4594 static bool
4595 blit_can_resolve(VkFormat format)
4596 {
4597    const struct util_format_description *desc = vk_format_description(format);
4598 
4599    /* The blit event can only do resolves for simple cases:
4600     * averaging samples as unsigned integers or choosing only one sample.
4601     * Note this is allowed for SRGB formats, but the results differ from a 2D draw resolve.
4602     */
4603    if (vk_format_is_snorm(format))
4604       return false;
4605 
4606    /* can't do formats with larger channel sizes
4607     * note: this includes all float formats
4608     * note2: single channel integer formats seem OK
4609     */
4610    if (desc->channel[0].size > 10 && vk_format_is_color(format))
4611       return false;
4612 
4613    switch (format) {
4614    /* For unknown reasons the blit event can't MSAA-resolve these formats when
4615     * tiled, likely because these formats have a different layout from other cpp=2 formats.
4616     */
4617    case VK_FORMAT_R8G8_UNORM:
4618    case VK_FORMAT_R8G8_UINT:
4619    case VK_FORMAT_R8G8_SINT:
4620    case VK_FORMAT_R8G8_SRGB:
4621       return false;
4622    default:
4623       break;
4624    }
4625 
4626    return true;
4627 }
4628 
4629 struct apply_load_coords_state {
4630    unsigned view;
4631 };
4632 
4633 static void
4634 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4635                       struct tu_cs *cs,
4636                       void *data,
4637                       VkRect2D bin,
4638                       unsigned views,
4639                       VkExtent2D *frag_areas)
4640 {
4641    const struct apply_load_coords_state *state =
4642       (const struct apply_load_coords_state *)data;
4643    assert(state->view < views);
4644    VkExtent2D frag_area = frag_areas[state->view];
4645 
4646    assert(bin.extent.width % frag_area.width == 0);
4647    assert(bin.extent.height % frag_area.height == 0);
4648    uint32_t scaled_width = bin.extent.width / frag_area.width;
4649    uint32_t scaled_height = bin.extent.height / frag_area.height;
4650 
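   /* Worked example (hypothetical numbers): a 32x32 bin with a 2x2 fragment
    * area yields a 16x16 scaled extent. The load then appears to read the
    * full-size bin from sysmem and write the minified region in GMEM, since
    * GMEM holds the reduced-resolution rendering.
    */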
4651    const float coords[] = {
4652       bin.offset.x,                    bin.offset.y,
4653       bin.offset.x,                    bin.offset.y,
4654       bin.offset.x + scaled_width,     bin.offset.y + scaled_height,
4655       bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4656    };
4657    r3d_coords_raw(cmd, cs, coords);
4658 }
4659 
4660 template <chip CHIP>
4661 static void
4662 load_3d_blit(struct tu_cmd_buffer *cmd,
4663              struct tu_cs *cs,
4664              const struct tu_image_view *iview,
4665              const struct tu_render_pass_attachment *att,
4666              bool separate_stencil)
4667 {
4668    const struct tu_framebuffer *fb = cmd->state.framebuffer;
4669    enum pipe_format format = iview->view.format;
4670    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4671       if (separate_stencil)
4672          format = PIPE_FORMAT_S8_UINT;
4673       else
4674          format = PIPE_FORMAT_Z32_FLOAT;
4675    }
4676    r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4677                    R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4678                    iview->image->vk.samples);
4679 
4680    if (!cmd->state.pass->has_fdm) {
4681       r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4682                  (VkExtent2D) { fb->width, fb->height });
4683    }
4684 
4685    /* Normal loads read directly from system memory, so we have to invalidate
4686     * UCHE in case it contains stale data.
4687     */
4688    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4689 
4690    /* Wait for CACHE_INVALIDATE to land */
4691    tu_cs_emit_wfi(cs);
4692 
4693    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4694       if (cmd->state.pass->has_fdm) {
4695          struct apply_load_coords_state state = {
4696             .view = att->clear_views ? i : 0,
4697          };
4698          tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4699       }
4700 
4701       r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4702 
4703       if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4704          if (separate_stencil)
4705             r3d_src_stencil(cmd, cs, iview, i);
4706          else
4707             r3d_src_depth(cmd, cs, iview, i);
4708       } else {
4709          r3d_src_gmem_load(cmd, cs, iview, i);
4710       }
4711 
4712       r3d_run(cmd, cs);
4713    }
4714 
4715    r3d_teardown<CHIP>(cmd, cs);
4716 
4717    /* It seems we need to WFI here for depth/stencil because color writes here
4718     * aren't synchronized with depth/stencil writes.
4719     *
4720     * Note: the blob also uses a WFI for color attachments but this hasn't
4721     * been seen to be necessary.
4722     */
4723    if (vk_format_is_depth_or_stencil(att->format))
4724       tu_cs_emit_wfi(cs);
4725 }
4726 
4727 static void
4728 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4729                               struct tu_cs *cs, bool load)
4730 {
4731    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4732 
4733    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4734       return;
4735 
4736    uint64_t result_iova;
4737    if (load)
4738       result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4739    else
4740       result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4741 
4742    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4743    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4744    tu_cs_emit_qw(cs, result_iova);
4745    tu_cs_emit_qw(cs, result_iova);
4746    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4747 }
4748 
4749 static void
4750 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4751                             struct tu_cs *cs, bool load)
4752 {
4753    tu_cond_exec_end(cs);
4754 
4755    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4756       return;
4757 
4758    uint64_t result_iova;
4759    if (load)
4760       result_iova = global_iova(cmd, dbg_gmem_total_loads);
4761    else
4762       result_iova = global_iova(cmd, dbg_gmem_total_stores);
4763 
4764    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4765    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4766    tu_cs_emit_qw(cs, result_iova);
4767    tu_cs_emit_qw(cs, result_iova);
4768    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4769 }
4770 
4771 template <chip CHIP>
4772 void
4773 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4774                         struct tu_cs *cs,
4775                         struct tu_resolve_group *resolve_group,
4776                         uint32_t a,
4777                         bool cond_exec_allowed,
4778                         bool force_load)
4779 {
4780    const struct tu_image_view *iview = cmd->state.attachments[a];
4781    const struct tu_render_pass_attachment *attachment =
4782       &cmd->state.pass->attachments[a];
4783 
4784    bool load_common = attachment->load || force_load;
4785    bool load_stencil =
4786       attachment->load_stencil ||
4787       (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4788 
4789    if (!load_common && !load_stencil)
4790       return;
4791 
4792    trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4793 
4794    /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4795     * that it will only be partially cleared, and since that clear is done with
4796     * a 2D blit it doesn't produce geometry, so we have to load unconditionally.
4797     *
4798     * To simplify the conditions, treat a partially cleared separate DS as fully
4799     * cleared and don't emit the cond_exec.
4800     */
4801    bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4802    if (cond_exec)
4803       tu_begin_load_store_cond_exec(cmd, cs, true);
4804 
4805    if (TU_DEBUG(3D_LOAD) ||
4806        cmd->state.pass->has_fdm) {
4807       if (load_common || load_stencil)
4808          tu_disable_draw_states(cmd, cs);
4809 
4810       if (load_common)
4811          load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4812 
4813       if (load_stencil)
4814          load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4815    } else {
4816       if (load_common)
4817          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4818 
4819       if (load_stencil)
4820          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4821    }
4822 
4823    if (cond_exec)
4824       tu_end_load_store_cond_exec(cmd, cs, true);
4825 
4826    trace_end_gmem_load(&cmd->trace, cs);
4827 }
4828 TU_GENX(tu_load_gmem_attachment);
4829 
4830 template <chip CHIP>
4831 static void
4832 store_cp_blit(struct tu_cmd_buffer *cmd,
4833               struct tu_cs *cs,
4834               const struct tu_image_view *iview,
4835               uint32_t samples,
4836               bool separate_stencil,
4837               enum pipe_format src_format,
4838               enum pipe_format dst_format,
4839               uint32_t layer,
4840               uint32_t gmem_offset,
4841               uint32_t cpp)
4842 {
4843    r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4844                           VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4845                           iview->view.ubwc_enabled, true);
4846 
4847    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4848       if (!separate_stencil) {
4849          r2d_dst_depth(cs, iview, layer);
4850       } else {
4851          r2d_dst_stencil(cs, iview, layer);
4852       }
4853    } else {
4854       r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
4855    }
4856 
4857    enum a6xx_format fmt = blit_format_texture<CHIP>(src_format, TILE6_2, false, true).fmt;
4858    fixup_src_format(&src_format, dst_format, &fmt);
4859 
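   /* The 2D source is pointed straight at the GMEM aperture
    * (gmem_base + gmem_offset); the pitch is one GMEM tile row,
    * i.e. tile0.width * cpp.
    */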
4860    tu_cs_emit_regs(cs,
4861                    SP_PS_2D_SRC_INFO(CHIP,
4862                       .color_format = fmt,
4863                       .tile_mode = TILE6_2,
4864                       .color_swap = WZYX,
4865                       .srgb = util_format_is_srgb(src_format),
4866                       .samples = tu_msaa_samples(samples),
4867                       .samples_average = !util_format_is_pure_integer(dst_format) &&
4868                                          !util_format_is_depth_or_stencil(dst_format),
4869                       .unk20 = 1,
4870                       .unk22 = 1),
4871                    SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
4872                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4873                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4874 
4875    /* sync GMEM writes with CACHE. */
4876    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4877    if (CHIP >= A7XX)
4878       /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4879        * arising from GMEM load/clears to land before we can continue.
4880        */
4881       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4882 
4883    /* Wait for cache event to land */
4884    tu_cs_emit_wfi(cs);
4885 
4886    r2d_run(cmd, cs);
4887 
4888    /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4889     * sysmem, and we generally assume that GMEM renderpasses leave their
4890     * results in sysmem, so we need to flush manually here.
4891     */
4892    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4893 }
4894 
4895 template <chip CHIP>
4896 static void
4897 store_3d_blit(struct tu_cmd_buffer *cmd,
4898               struct tu_cs *cs,
4899               const struct tu_image_view *iview,
4900               VkSampleCountFlagBits dst_samples,
4901               bool separate_stencil,
4902               enum pipe_format src_format,
4903               enum pipe_format dst_format,
4904               const VkRect2D *render_area,
4905               uint32_t layer,
4906               uint32_t gmem_offset,
4907               uint32_t cpp)
4908 {
4909    /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4910     * aren't set until we know whether we're HW binning or not, and we want to
4911     * avoid a dependence on that here to be able to store attachments before
4912     * the end of the renderpass in the future. Use the scratch space to
4913     * save/restore them dynamically.
4914     */
4915    tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4916    tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4917                   CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4918                   CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4919    if (CHIP >= A7XX) {
4920       tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4921       tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4922                      CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4923                      CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4924    }
4925 
4926    r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4927                    0, false, iview->view.ubwc_enabled, dst_samples);
4928 
4929    r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4930 
4931    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4932       if (!separate_stencil) {
4933          r3d_dst_depth<CHIP>(cs, iview, layer);
4934       } else {
4935          r3d_dst_stencil<CHIP>(cs, iview, layer);
4936       }
4937    } else {
4938       r3d_dst<CHIP>(cs, &iview->view, layer, src_format);
4939    }
4940 
4941    r3d_src_gmem<CHIP>(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
4942 
4943    /* sync GMEM writes with CACHE. */
4944    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4945 
4946    /* Wait for CACHE_INVALIDATE to land */
4947    tu_cs_emit_wfi(cs);
4948 
4949    r3d_run(cmd, cs);
4950 
4951    r3d_teardown<CHIP>(cmd, cs);
4952 
4953    /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4954     * sysmem, and we generally assume that GMEM renderpasses leave their
4955     * results in sysmem, so we need to flush manually here. The 3d blit path
4956     * writes to depth images as a color RT, so there's no need to flush depth.
4957     */
4958    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4959 
4960    /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
4961    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4962    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4963                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4964                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4965 
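   /* GRAS_BIN_CONTROL is restored from the same scratch slot as
    * RB_BIN_CONTROL; both registers are assumed to have been programmed with
    * identical values, so one saved copy is enough.
    */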
4966    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4967    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4968                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4969                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4970 
4971    if (CHIP >= A7XX) {
4972       tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4973       tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4974                         CP_SCRATCH_TO_REG_0_SCRATCH(1) |
4975                         CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4976    }
4977 }
4978 
4979 static bool
4980 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
4981 {
4982    struct tu_physical_device *phys_dev = cmd->device->physical_device;
4983    const struct tu_image_view *iview = cmd->state.attachments[a];
4984    const VkRect2D *render_area = &cmd->state.render_area;
4985 
4986    /* Unaligned stores are incredibly rare in CTS, so we have to force them in order to test this path. */
4987    if (TU_DEBUG(UNALIGNED_STORE))
4988       return true;
4989 
4990    /* We always use the unaligned store path when scaling rendering. */
4991    if (cmd->state.pass->has_fdm)
4992       return true;
4993 
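   /* The fast blit-event store path appears to require the render-area edges
    * to be aligned to gmem_align_w/h (unless an edge coincides with the image
    * edge); otherwise we fall back to the slower store_cp_blit()/store_3d_blit()
    * path.
    */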
4994    uint32_t x1 = render_area->offset.x;
4995    uint32_t y1 = render_area->offset.y;
4996    uint32_t x2 = x1 + render_area->extent.width;
4997    uint32_t y2 = y1 + render_area->extent.height;
4998    /* x2/y2 can be unaligned if equal to the size of the image, since it will
4999     * write into padding space. The one exception is linear levels which don't
5000     * have the required y padding in the layout (except for the last level)
5001     */
5002    bool need_y2_align =
5003       y2 != iview->view.height || iview->view.need_y2_align;
5004 
5005    return (x1 % phys_dev->info->gmem_align_w ||
5006            (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
5007            y1 % phys_dev->info->gmem_align_h ||
5008            (y2 % phys_dev->info->gmem_align_h && need_y2_align));
5009 }
5010 
5011 /* Choose the GMEM layout (use the CCU space or not) based on whether the
5012  * current attachments will need it.  This has to happen at vkBeginRenderPass()
5013  * time because tu_attachment_store_unaligned() looks at the image views, which
5014  * are only available at that point.  This should match the logic for the
5015  * !use_fast_path case in tu_store_gmem_attachment().
5016  */
5017 void
5018 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
5019 {
5020    cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
5021 
5022    for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
5023       if (!cmd->state.attachments[i])
5024          continue;
5025 
5026       struct tu_render_pass_attachment *att =
5027          &cmd->state.pass->attachments[i];
5028       if ((att->store || att->store_stencil) &&
5029           tu_attachment_store_unaligned(cmd, i))
5030          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5031       if (att->store && att->format == VK_FORMAT_S8_UINT)
5032          /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
5033           * blit events for the S8_UINT format.
5034           */
5035          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5036       if (att->will_be_resolved && !blit_can_resolve(att->format))
5037          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5038    }
5039 
5040    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
5041 }
5042 
5043 struct apply_store_coords_state {
5044    unsigned view;
5045 };
5046 
5047 static void
5048 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
5049                        struct tu_cs *cs,
5050                        void *data,
5051                        VkRect2D bin,
5052                        unsigned views,
5053                        VkExtent2D *frag_areas)
5054 {
5055    const struct apply_store_coords_state *state =
5056       (const struct apply_store_coords_state *)data;
5057    assert(state->view < views);
5058    VkExtent2D frag_area = frag_areas[state->view];
5059 
5060    /* The bin width/height must be a multiple of the frag_area to make sure
5061     * that the scaling happens correctly. This means there may be some
5062     * destination pixels jut out of the framebuffer, but they should be
5063     * clipped by the render area.
5064     */
5065    assert(bin.extent.width % frag_area.width == 0);
5066    assert(bin.extent.height % frag_area.height == 0);
5067    uint32_t scaled_width = bin.extent.width / frag_area.width;
5068    uint32_t scaled_height = bin.extent.height / frag_area.height;
5069 
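   /* The destination (sysmem) rect covers the full bin while the source (GMEM)
    * rect only covers the scaled-down region, so the 2D engine upscales the
    * reduced-resolution tile contents on store.
    */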
5070    tu_cs_emit_regs(cs,
5071       A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
5072                           .y = bin.offset.y),
5073       A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
5074                           .y = bin.offset.y + bin.extent.height - 1));
5075    tu_cs_emit_regs(cs,
5076                    A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
5077                    A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
5078                    A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
5079                    A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
5080 }
5081 
5082 template <chip CHIP>
5083 void
5084 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
5085                          struct tu_cs *cs,
5086                          struct tu_resolve_group *resolve_group,
5087                          uint32_t a,
5088                          uint32_t gmem_a,
5089                          uint32_t layers,
5090                          uint32_t layer_mask,
5091                          bool cond_exec_allowed)
5092 {
5093    const VkRect2D *render_area = &cmd->state.render_area;
5094    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
5095    const struct tu_image_view *iview = cmd->state.attachments[a];
5096    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
5097    const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
5098    bool resolve = a != gmem_a;
5099    if (resolve)
5100       clear_value = NULL;
5101 
5102    if (!dst->store && !dst->store_stencil)
5103       return;
5104 
5105    bool unaligned = tu_attachment_store_unaligned(cmd, a);
5106 
5107    /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes,
5108     * one for depth and the other for stencil. When resolving a MSAA
5109     * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
5110     */
5111    bool resolve_d32s8_s8 =
5112       src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
5113       dst->format == VK_FORMAT_S8_UINT;
5114 
5115    /* The fast path doesn't support picking out the last component of a D24S8
5116     * texture reinterpreted as RGBA8_UNORM.
5117     */
5118    bool resolve_d24s8_s8 =
5119       src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
5120       dst->format == VK_FORMAT_S8_UINT;
5121 
5122    bool store_common = dst->store && !resolve_d32s8_s8;
5123    bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
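   /* For a D32S8 -> S8 resolve this means only the stencil plane is written:
    * store_common ends up false and store_separate_stencil true, so the store
    * paths below emit just the separate-stencil store.
    */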
5124 
5125    bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
5126                         (a == gmem_a || blit_can_resolve(dst->format));
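   /* The fast path stores the attachment with a GMEM blit event
    * (BLIT_EVENT_STORE); otherwise we fall back to the 2D blit path below,
    * or to 3D draws when the destination is multisampled.
    */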
5127 
5128    trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
5129 
5130    /* Unconditional store should happen only if the attachment was cleared,
5131     * which could have happened either by load_op or via vkCmdClearAttachments.
5132     */
5133    bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
5134    if (cond_exec) {
5135       tu_begin_load_store_cond_exec(cmd, cs, false);
5136    }
5137 
5138    /* use fast path when render area is aligned, except for unsupported resolve cases */
5139    if (use_fast_path) {
5140       if (store_common)
5141          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, false);
5142       if (store_separate_stencil)
5143          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, true);
5144 
5145       if (cond_exec) {
5146          tu_end_load_store_cond_exec(cmd, cs, false);
5147       }
5148 
5149       trace_end_gmem_store(&cmd->trace, cs);
5150       return;
5151    }
5152 
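   /* From here on we are on the slow path, which requires the GMEM layout
    * that keeps the CCU space free; tu_choose_gmem_layout() is expected to
    * have detected all of the cases above and picked
    * TU_GMEM_LAYOUT_AVOID_CCU accordingly.
    */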
5153    assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
5154 
5155    enum pipe_format src_format = vk_format_to_pipe_format(src->format);
5156    if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5157       src_format = PIPE_FORMAT_Z32_FLOAT;
5158 
5159    enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
5160    if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5161       dst_format = PIPE_FORMAT_Z32_FLOAT;
5162 
5163    if (dst->samples > 1) {
5164       /* If we hit this path, we have to disable draw states after every tile
5165        * instead of once at the end of the renderpass, so that they aren't
5166        * executed when calling CP_DRAW.
5167        *
5168        * TODO: store a flag somewhere so we don't do this more than once and
5169        * don't do it after the renderpass when this happens.
5170        */
5171       if (store_common || store_separate_stencil)
5172          tu_disable_draw_states(cmd, cs);
5173 
5174       for_each_layer(i, layer_mask, layers) {
5175          if (store_common) {
5176             store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
5177                           dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5178          }
5179          if (store_separate_stencil) {
5180             store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
5181                           PIPE_FORMAT_S8_UINT, render_area, i,
5182                           tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5183          }
5184       }
5185    } else {
5186       if (!cmd->state.pass->has_fdm) {
5187          r2d_coords(cmd, cs, render_area->offset, render_area->offset,
5188                     render_area->extent);
5189       } else {
5190          /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
5191           * area and the coordinates span the entire render area, but for
5192           * FDM we need to scale the coordinates, so we take the opposite
5193           * approach, specifying the exact bin size in the destination
5194           * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the render
5195           * area.
5196           */
5197          tu_cs_emit_regs(cs,
5198                          A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
5199                                                      .y = render_area->offset.y,),
5200                          A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
5201                                                      .y = render_area->offset.y + render_area->extent.height - 1,));
5202       }
5203 
5204       for_each_layer (i, layer_mask, layers) {
5205          if (cmd->state.pass->has_fdm) {
5206             unsigned view = layer_mask ? i : 0;
5207             struct apply_store_coords_state state = {
5208                .view = view,
5209             };
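            /* fdm_apply_store_coords() above will fill in this patchpoint
             * with the per-bin destination and source rectangles once the
             * bin's fragment areas are known.
             */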
5210             tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
5211                                          state);
5212          }
5213          if (store_common) {
5214             store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
5215                           dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5216          }
5217          if (store_separate_stencil) {
5218             store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
5219                           PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5220          }
5221       }
5222    }
5223 
5224    if (cond_exec) {
5225       tu_end_load_store_cond_exec(cmd, cs, false);
5226    }
5227 
5228    trace_end_gmem_store(&cmd->trace, cs);
5229 }
5230 TU_GENX(tu_store_gmem_attachment);
5231