/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_buffer.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"
#include "tu_lrz.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

static const VkOffset2D blt_no_coord = { ~0, ~0 };

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}
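
/* Worked examples for tu_pack_float32_for_unorm(): with bits = 8, 0.5f
 * scales to 127.5f and rounds to 128 (round-half-to-even), while 1.0f gives
 * 255; with bits = 24, as used for D24 depth below, 1.0f gives 0xffffff.
 */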

/* r2d_ = BLIT_OP_SCALE operations */

static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
   }
}
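
/* Examples: R16G16B16A16_UINT has a 16-bit pure-integer red channel and so
 * maps to R2D_INT16, while R32G32B32A32_FLOAT maps to R2D_FLOAT32. Note
 * that 10- and 11-bit non-integer channels (e.g. R10G10B10A2_UNORM) go
 * through the R2D_FLOAT16 intermediate.
 */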

template <chip CHIP>
static struct tu_native_format
blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode,
                    bool is_mutable, bool gmem)
{
   struct tu_native_format fmt = tu6_format_texture(format, tile_mode, is_mutable);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* Similar to in fdl6_view_init, we want to use
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
       * src.  Since this is called when there is no image and thus no ubwc,
       * we can always use FMT6_8_8_8_8_UNORM.
       *
       * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
       * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
       * instead.
       */
      fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static struct tu_native_format
blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_color(format, tile_mode, false);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* similar to blit_format_texture but for blit dst */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

template <chip CHIP>
static enum a6xx_format
blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
{
   if (CHIP >= A7XX && gmem)
      /* A7XX requires D24S8 in GMEM to always be treated as
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of whether the image
       * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
       * in misrendering around the edges of the destination image.
       */
      ubwc = true;

   if (ubwc) {
      switch (format) {
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
         return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
      default:
         break;
      }
   }

   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return blit_format_color(format, TILE6_LINEAR).fmt;
}

static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst.x,                    .y = dst.y),
      A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}

static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}
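
/* For the D24S8 case above, depth 1.0 packs to 0xffffff, leaving 0xffffff,
 * 0xffff and 0xff in the first three channel words; the 2D engine presumably
 * consumes only the low byte of each R2D_UNORM8 channel, producing the
 * r8g8b8a8 pattern (0xff, 0xff, 0xff, stencil).
 */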

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}

template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}

template <chip CHIP>
static void
r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         enum pipe_format format,
                         uint64_t va,
                         uint32_t pitch,
                         uint32_t width,
                         uint32_t height,
                         enum pipe_format dst_format)
{
   /* This functionality is only allowed on A7XX; this assertion statically
    * disallows calling this function on prior generations by mistake.
    */
   static_assert(CHIP >= A7XX);

   struct tu_native_format fmt =
      blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
   va &= ~0x3f;
   tu_cs_emit_regs(cs,
                   A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
                                         .start_offset_texels = offset_texels,
                                         .type = A6XX_TEX_IMG_BUFFER));

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP, .color_format = color_format,
                                     .color_swap = fmt.swap,
                                     .srgb = util_format_is_srgb(format),
                                     .unk20 = 1, .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}
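
/* Example of the offset fixup above: for va = base + 0x10 with a 4-byte
 * format such as R8G8B8A8_UNORM, offset_texels = 0x10 / 4 = 4 and va is
 * rounded down to the previous 64-byte boundary, so the blit still starts
 * at the requested texel.
 */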

template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
         (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);    // TODO: seems to always be 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .rotate = (enum a6xx_rotation) blit_param,
         .solid_color = clear,
         .color_format = fmt,
         .scissor = scissor,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      tu_cs_emit_regs(cs, A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
                                                .start_offset_texels = 0,
                                                .type = A6XX_TEX_2D));
   }

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}

template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This is a non-context register, so we have to WFI before changing it. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}

/* r3d_ = shader path operations */

static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
                             .base = base);
}

static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "clear vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   struct ir3_const_allocations const_allocs = {};
   if (consts > 0)
      ir3_const_alloc(&const_allocs, IR3_CONST_ALLOC_UBO_RANGES, align(consts, 8), 1);

   const struct ir3_shader_options options = {
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
      .const_allocs = const_allocs,
      .fragdata_dynamic_remap =
         idx >= GLOBAL_SH_VS_CLEAR && idx <= GLOBAL_SH_FS_CLEAR_MAX,
   };

   ir3_finalize_nir(dev->compiler, &options.nir_options, nir);

   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}
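
/* Shader binaries live in the device's global BO: compile_shader() copies
 * each variant's binary into global->shaders[] and records its iova, with
 * *offset advancing in align(sizedwords, 32) steps, presumably to keep each
 * shader's start suitably aligned for instruction fetch.
 */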

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

enum r3d_type {
   R3D_CLEAR,
   R3D_BLIT,
   R3D_COPY_HALF,
};

template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicalization for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));

   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1,));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());

      tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
            .raster_mode = TYPE_TILED,
            .raster_direction = LR_TB));
      tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
      tu_cs_emit_regs(cs, A6XX_PC_DGEN_SU_CONSERVATIVE_RAS_CNTL());
      tu_cs_emit_regs(cs, A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL());
   }

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}

static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t opcode,
                          enum a6xx_state_block block,
                          uint32_t offset,
                          const void *consts,
                          uint32_t size_vec4)
{
   assert(offset % cmd->device->compiler->const_upload_unit == 0);

   struct tu_cs_memory mem = {};
   VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));

   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
   tu_cs_emit_qw(cs, mem.iova);
}
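
/* The constants are staged in sub_cs memory and loaded indirectly
 * (SS6_INDIRECT) rather than inlined in the command stream; the same helper
 * serves both vertex coordinates (r3d_coords_raw() below, 2 vec4s into the
 * VS const file) and clear colors (r3d_clear_value(), 1 vec4 into the FS
 * const file).
 */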

static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
{
   const uint32_t coord[] = {
      fui(z),
      0,
      0,
      0,
   };

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
}

static void
r3d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   const bool has_src = src.x != blt_no_coord.x;
   int32_t src_x1 = has_src ? src.x : 0;
   int32_t src_y1 = has_src ? src.y : 0;

   const float coords[] = {
      dst.x,
      dst.y,
      src_x1,
      src_y1,
      dst.x + extent.width,
      dst.y + extent.height,
      src_x1 + extent.width,
      src_y1 + extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}
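
/* The layout of coords[] above matches the load_const() offsets in
 * build_blit_vs_shader(): components 0-1 and 4-5 are the two vertices'
 * positions (bases 0 and 4), components 2-3 and 6-7 their texture
 * coordinates (bases 2 and 6).
 */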

static void
r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                enum pipe_format format, const VkClearValue *val)
{
   uint32_t coords[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      coords[0] = fui((tmp & 0xff) / 255.0f);
      coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
      coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
      coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      coords[0] = fui(val->depthStencil.depth);
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   case PIPE_FORMAT_S8_UINT:
      coords[0] = val->depthStencil.stencil & 0xff;
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
      break;
   }

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
}
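
/* Unlike the r2d path, these D24S8 clear values feed the shader path, so
 * each byte of the packed depth is re-expressed as a normalized float
 * (x / 255.0f) for the r8g8b8a8-style render target instead of being
 * passed as raw channel words.
 */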

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
      CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}
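
/* Note on the patching above: dwords 4-5 of the tex descriptor hold the
 * 64-bit base address and dwords 7-8 the UBWC flags address, which is why
 * those are the two spots adjusted by the layer offsets.
 */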
1165 
1166 static void
r3d_src(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct fdl6_view * iview,uint32_t layer,VkFilter filter,enum pipe_format dst_format)1167 r3d_src(struct tu_cmd_buffer *cmd,
1168         struct tu_cs *cs,
1169         const struct fdl6_view *iview,
1170         uint32_t layer,
1171         VkFilter filter,
1172         enum pipe_format dst_format)
1173 {
1174    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1175    memcpy(desc, iview->descriptor, sizeof(desc));
1176 
1177    enum a6xx_format fmt = (enum a6xx_format)(
1178       (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
1179    enum pipe_format src_format = iview->format;
1180    fixup_src_format(&src_format, dst_format, &fmt);
1181    desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
1182       A6XX_TEX_CONST_0_FMT(fmt);
1183 
1184    r3d_src_common(cmd, cs, desc,
1185                   iview->layer_size * layer,
1186                   iview->ubwc_layer_size * layer,
1187                   filter);
1188 }
1189 
1190 template <chip CHIP>
1191 static void
r3d_src_buffer(struct tu_cmd_buffer * cmd,struct tu_cs * cs,enum pipe_format format,uint64_t va,uint32_t pitch,uint32_t width,uint32_t height,enum pipe_format dst_format)1192 r3d_src_buffer(struct tu_cmd_buffer *cmd,
1193                struct tu_cs *cs,
1194                enum pipe_format format,
1195                uint64_t va, uint32_t pitch,
1196                uint32_t width, uint32_t height,
1197                enum pipe_format dst_format)
1198 {
1199    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1200 
1201    struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
1202    enum a6xx_format color_format = fmt.fmt;
1203    fixup_src_format(&format, dst_format, &color_format);
1204 
1205    desc[0] =
1206       COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
1207       A6XX_TEX_CONST_0_FMT(color_format) |
1208       A6XX_TEX_CONST_0_SWAP(fmt.swap) |
1209       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1210       A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1211       A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1212       A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1213    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1214    desc[2] =
1215       A6XX_TEX_CONST_2_PITCH(pitch) |
1216       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1217    desc[3] = 0;
1218    desc[4] = va;
1219    desc[5] = va >> 32;
1220    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1221       desc[i] = 0;
1222 
1223    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1224 }
1225 
1226 static void
r3d_src_depth(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1227 r3d_src_depth(struct tu_cmd_buffer *cmd,
1228               struct tu_cs *cs,
1229               const struct tu_image_view *iview,
1230               uint32_t layer)
1231 {
1232    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1233 
1234    memcpy(desc, iview->view.descriptor, sizeof(desc));
1235    uint64_t va = iview->depth_base_addr;
1236 
1237    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1238                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1239                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1240                 A6XX_TEX_CONST_0_SWAP__MASK);
1241    desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
1242               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1243               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1244               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1245               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1246    desc[2] =
1247       A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
1248       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1249    desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
1250       (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
1251    desc[4] = va;
1252    desc[5] = va >> 32;
1253 
1254    r3d_src_common(cmd, cs, desc,
1255                   iview->depth_layer_size * layer,
1256                   iview->view.ubwc_layer_size * layer,
1257                   VK_FILTER_NEAREST);
1258 }
1259 
1260 static void
r3d_src_stencil(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1261 r3d_src_stencil(struct tu_cmd_buffer *cmd,
1262                 struct tu_cs *cs,
1263                 const struct tu_image_view *iview,
1264                 uint32_t layer)
1265 {
1266    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1267 
1268    memcpy(desc, iview->view.descriptor, sizeof(desc));
1269    uint64_t va = iview->stencil_base_addr;
1270 
1271    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1272                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1273                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1274                 A6XX_TEX_CONST_0_SWAP__MASK);
1275    desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
1276               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1277               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1278               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1279               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1280    desc[2] =
1281       A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
1282       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1283    desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
1284    desc[4] = va;
1285    desc[5] = va >> 32;
1286    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1287       desc[i] = 0;
1288 
1289    r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
1290                   VK_FILTER_NEAREST);
1291 }
1292 
1293 static void
r3d_src_gmem_load(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1294 r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
1295                   struct tu_cs *cs,
1296                   const struct tu_image_view *iview,
1297                   uint32_t layer)
1298 {
1299    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1300 
1301    memcpy(desc, iview->view.descriptor, sizeof(desc));
1302 
1303    /* Fixup D24 formats because we always load both depth and stencil. */
1304    enum pipe_format format = iview->view.format;
1305    if (format == PIPE_FORMAT_X24S8_UINT ||
1306        format == PIPE_FORMAT_Z24X8_UNORM ||
1307        format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1308       desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
1309       if (iview->view.ubwc_enabled)
1310          desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
1311       else
1312          desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
1313    }
1314 
1315    /* When loading/storing GMEM we always load the full image and don't do any
1316     * swizzling or swapping, that's done in the draw when reading/writing
1317     * GMEM, so we need to fixup the swizzle and swap.
1318     */
1319    desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1320                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1321                 A6XX_TEX_CONST_0_SWAP__MASK);
1322    desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1323               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1324               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1325               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1326 
1327    r3d_src_common(cmd, cs, desc,
1328                   iview->view.layer_size * layer,
1329                   iview->view.ubwc_layer_size * layer,
1330                   VK_FILTER_NEAREST);
1331 }
1332 
1333 template <chip CHIP>
1334 static void
r3d_src_gmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,enum pipe_format format,enum pipe_format dst_format,uint32_t gmem_offset,uint32_t cpp)1335 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1336              struct tu_cs *cs,
1337              const struct tu_image_view *iview,
1338              enum pipe_format format,
1339              enum pipe_format dst_format,
1340              uint32_t gmem_offset,
1341              uint32_t cpp)
1342 {
1343    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1344    memcpy(desc, iview->view.descriptor, sizeof(desc));
1345 
1346    enum a6xx_format fmt =
1347       blit_format_texture<CHIP>(format, TILE6_2,
1348                                 iview->view.is_mutable, true).fmt;
1349    fixup_src_format(&format, dst_format, &fmt);
1350 
1351    /* patch the format so that depth/stencil get the right format and swizzle */
1352    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1353                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1354                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1355    desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1356                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1357                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1358                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1359                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1360 
1361    /* patched for gmem */
1362    desc[0] &= ~A6XX_TEX_CONST_0_TILE_MODE__MASK;
1363    if (!iview->view.is_mutable)
1364       desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
1365    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1366    desc[2] =
1367       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1368       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1369    desc[3] = 0;
1370    desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1371    desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1372    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1373       desc[i] = 0;
1374 
1375    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1376 }
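
/* Illustrative note (not part of the original source): the descriptor built
 * above treats GMEM as one tightly packed, tiled 2D slice. Assuming a
 * hypothetical 96-pixel-wide tile and a 4 cpp format, the pitch programmed
 * into TEX_CONST_2 would be:
 *
 *    pitch = tiling->tile0.width * cpp = 96 * 4 = 384 bytes
 *
 * and the base address is simply gmem_base + gmem_offset, with no mip
 * levels, extra layers, or UBWC metadata (desc[3] and desc[6..] are zeroed).
 */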
1377 
1378 template <chip CHIP>
1379 static void
1380 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1381         enum pipe_format src_format)
1382 {
1383    uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1384 
1385    enum a6xx_format fmt = (enum a6xx_format)(
1386       mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1387    enum pipe_format dst_format = iview->format;
1388    fixup_dst_format(src_format, &dst_format, &fmt);
1389    mrt_buf_info =
1390       (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1391       A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1392 
1393    tu_cs_emit_regs(cs,
1394       RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
1395       A6XX_RB_MRT_PITCH(0, iview->pitch),
1396       A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
1397       A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
1398       A6XX_RB_MRT_BASE_GMEM(0),
1399    );
1400 
1401    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1402    tu_cs_image_flag_ref(cs, iview, layer);
1403 
1404    /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1405     * FMT6_NV12_Y.
1406     */
1407    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1408 
1409    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
1410    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1411 }
1412 
1413 template <chip CHIP>
1414 static void
1415 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1416 {
1417    tu_cs_emit_regs(cs,
1418       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
1419       A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
1420       A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
1421       A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
1422       A6XX_RB_MRT_BASE_GMEM(0),
1423    );
1424 
1425    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1426    tu_cs_image_flag_ref(cs, &iview->view, layer);
1427 
1428    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
1429    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1430 }
1431 
1432 template <chip CHIP>
1433 static void
1434 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1435 {
1436    tu_cs_emit_regs(cs,
1437       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
1438       A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
1439       A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
1440       A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
1441       A6XX_RB_MRT_BASE_GMEM(0),
1442    );
1443 
1444    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1445    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1446 }
1447 
1448 template <chip CHIP>
1449 static void
1450 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1451                enum pipe_format src_format)
1452 {
1453    struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
1454 
1455    enum a6xx_format color_fmt = fmt.fmt;
1456    fixup_dst_format(src_format, &format, &color_fmt);
1457 
1458    tu_cs_emit_regs(cs,
1459                    RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
1460                    A6XX_RB_MRT_PITCH(0, pitch),
1461                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1462                    A6XX_RB_MRT_BASE(0, .qword = va),
1463                    A6XX_RB_MRT_BASE_GMEM(0, 0));
1464 
1465    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1466    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1467 }
1468 
1469 template <chip CHIP>
1470 static void
1471 r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1472              const struct tu_image_view *iview,
1473              const struct tu_render_pass_attachment *att,
1474              bool separate_stencil, unsigned layer)
1475 {
1476    unsigned RB_MRT_BUF_INFO;
1477    unsigned gmem_offset;
1478 
1479    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1480       if (!separate_stencil) {
1481          RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
1482          gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1483       } else {
1484          RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
1485          gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
1486       }
1487    } else {
1488       RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
1489       gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1490    }
1491 
1492    tu_cs_emit_regs(cs,
1493                    RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
1494                    A6XX_RB_MRT_PITCH(0, 0),
1495                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1496                    A6XX_RB_MRT_BASE(0, 0),
1497                    A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));
1498 
1499    enum a6xx_format color_format =
1500       (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1501    tu_cs_emit_regs(cs,
1502                    A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));
1503 
1504    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1505    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1506 }
1507 
1508 static uint8_t
1509 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1510 {
1511    uint8_t mask = 0xf;
1512    assert(aspect_mask);
1513    /* note: the only format with partial writes is D24S8;
1514     * clear/blit uses the _AS_R8G8B8A8 format to access it
1515     */
1516    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1517       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1518          mask = 0x7;
1519       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1520          mask = 0x8;
1521    }
1522    return mask;
1523 }
1524 
1525 static uint8_t
1526 aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
1527 {
1528    uint8_t mask = 0xf;
1529    assert(aspect_mask);
1530    /* note: the only format with partial writes is D24S8 */
1531    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1532       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1533          mask = 0x1;
1534       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1535          mask = 0x2;
1536       if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
1537          mask = 0x3;
1538    }
1539    return mask;
1540 }
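
/* Worked example (illustrative, not in the original source): for
 * PIPE_FORMAT_Z24_UNORM_S8_UINT the two helpers above answer different
 * questions. aspect_write_mask() yields a component mask for the
 * _AS_R8G8B8A8 view, where depth occupies the RGB channels and stencil
 * the alpha channel:
 *
 *    aspect_write_mask(Z24S8, DEPTH_BIT)   == 0x7   write RGB (depth)
 *    aspect_write_mask(Z24S8, STENCIL_BIT) == 0x8   write A (stencil)
 *
 * while aspect_write_mask_generic_clear() yields the per-aspect CLEAR_MASK
 * used by the generic blit-event clear:
 *
 *    aspect_write_mask_generic_clear(Z24S8, DEPTH_BIT)             == 0x1
 *    aspect_write_mask_generic_clear(Z24S8, STENCIL_BIT)           == 0x2
 *    aspect_write_mask_generic_clear(Z24S8, DEPTH | STENCIL bits)  == 0x3
 */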
1541 
1542 enum r3d_blit_param {
1543    R3D_Z_SCALE = 1 << 0,
1544    R3D_DST_GMEM = 1 << 1,
1545    R3D_COPY = 1 << 2,
1546 };
1547 
1548 template <chip CHIP>
1549 static void
1550 r3d_setup(struct tu_cmd_buffer *cmd,
1551           struct tu_cs *cs,
1552           enum pipe_format src_format,
1553           enum pipe_format dst_format,
1554           VkImageAspectFlags aspect_mask,
1555           unsigned blit_param,
1556           bool clear,
1557           bool ubwc,
1558           VkSampleCountFlagBits samples)
1559 {
1560    if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1561       tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1562    }
1563 
1564    enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1565    fixup_dst_format(src_format, &dst_format, &fmt);
1566 
1567    if (!cmd->state.pass) {
1568       tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1569       tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1570    }
1571 
1572    if (!(blit_param & R3D_DST_GMEM)) {
1573       if (CHIP == A6XX) {
1574          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1575       } else {
1576          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1577       }
1578 
1579       tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1580 
1581       if (CHIP >= A7XX) {
1582          tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1583          tu_cs_emit_regs(cs,
1584             A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1585       }
1586    }
1587 
1588    enum r3d_type type;
1589    if (clear) {
1590       type = R3D_CLEAR;
1591    } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1592       /* Avoid canonicalizing NaNs in copies by using the special half-float
1593        * path that uses half regs.
1594        */
1595       type = R3D_COPY_HALF;
1596    } else {
1597       type = R3D_BLIT;
1598    }
1599 
1600    r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1601 
1602    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1603    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1604    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1605    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1606 
1607    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1608    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1609    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1610    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1611    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1612    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1613    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1614    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1615    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1616 
1617    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1618                         .color_format = fmt,
1619                         .color_sint = util_format_is_pure_sint(dst_format),
1620                         .color_uint = util_format_is_pure_uint(dst_format)));
1621 
1622    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1623       .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1624    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1625    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1626 
1627    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1628    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1629 
1630    if (CHIP >= A7XX) {
1631       tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CNTL2(0));
1632       tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1633 
1634       tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
1635       tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
1636       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
1637    }
1638 
1639    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1640                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1641 
1642    /* Disable sample counting so as not to affect occlusion queries. */
1643    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1644 
1645    tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1646    if (CHIP >= A7XX) {
1647       tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1648    }
1649 
1650    if (cmd->state.prim_generated_query_running_before_rp) {
1651       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1652    }
1653 
1654    if (cmd->state.predication_active) {
1655       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1656       tu_cs_emit(cs, 0);
1657    }
1658 }
1659 
1660 static void
1661 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1662 {
1663    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1664    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1665                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1666                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1667    tu_cs_emit(cs, 1); /* instance count */
1668    tu_cs_emit(cs, 2); /* vertex count */
1669 }
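
/* Illustrative note on the draw above: the 3D path rasterizes the blit
 * rectangle as a RECTLIST primitive, which only needs two vertices for
 * opposite corners (the actual coordinates were emitted earlier by the
 * coords callback). r3d_run_vis() below is identical except that it uses
 * USE_VISIBILITY, letting the visibility stream cull the draw per tile
 * when it runs inside a tiled renderpass.
 */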
1670 
1671 static void
1672 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1673 {
1674    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1675    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1676                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1677                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1678    tu_cs_emit(cs, 1); /* instance count */
1679    tu_cs_emit(cs, 2); /* vertex count */
1680 }
1681 
1682 template <chip CHIP>
1683 static void
1684 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1685 {
1686    if (cmd->state.predication_active) {
1687       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1688       tu_cs_emit(cs, 1);
1689    }
1690 
1691    /* Re-enable sample counting. */
1692    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1693 
1694    if (cmd->state.prim_generated_query_running_before_rp) {
1695       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1696    }
1697 }
1698 
1699 /* blit ops - common interface for 2d/shader paths */
1700 
1701 struct blit_ops {
1702    void (*coords)(struct tu_cmd_buffer *cmd,
1703                   struct tu_cs *cs,
1704                   const VkOffset2D dst,
1705                   const VkOffset2D src,
1706                   const VkExtent2D extent);
1707    void (*clear_value)(struct tu_cmd_buffer *cmd,
1708                        struct tu_cs *cs,
1709                        enum pipe_format format,
1710                        const VkClearValue *val);
1711    void (*src)(
1712         struct tu_cmd_buffer *cmd,
1713         struct tu_cs *cs,
1714         const struct fdl6_view *iview,
1715         uint32_t layer,
1716         VkFilter filter,
1717         enum pipe_format dst_format);
1718    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1719                       enum pipe_format format,
1720                       uint64_t va, uint32_t pitch,
1721                       uint32_t width, uint32_t height,
1722                       enum pipe_format dst_format);
1723    void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1724                enum pipe_format src_format);
1725    void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1726    void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1727    void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1728                       enum pipe_format src_format);
1729    void (*setup)(struct tu_cmd_buffer *cmd,
1730                  struct tu_cs *cs,
1731                  enum pipe_format src_format,
1732                  enum pipe_format dst_format,
1733                  VkImageAspectFlags aspect_mask,
1734                  unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1735                  bool clear,
1736                  bool ubwc,
1737                  VkSampleCountFlagBits samples);
1738    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1739    void (*teardown)(struct tu_cmd_buffer *cmd,
1740                     struct tu_cs *cs);
1741 };
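
/* Sketch of the typical driving sequence for this interface, condensed from
 * the call sites below (illustrative only):
 *
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param,
 *               clear, ubwc, samples);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src_view, i, filter, dst_format);
 *       ops->dst(cs, &dst_view, i, src_format);
 *       ops->coords(cmd, cs, dst_offset, src_offset, extent);
 *       ops->run(cmd, cs);
 *    }
 *    ops->teardown(cmd, cs);
 *
 * Swapping between r2d_ops and r3d_ops below selects either the 2D
 * (BLIT_OP_SCALE) engine or the 3D (shader) path without changing the
 * caller.
 */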
1742 
1743 template <chip CHIP>
1744 static const struct blit_ops r2d_ops = {
1745    .coords = r2d_coords,
1746    .clear_value = r2d_clear_value,
1747    .src = r2d_src<CHIP>,
1748    .src_buffer = r2d_src_buffer<CHIP>,
1749    .dst = r2d_dst<CHIP>,
1750    .dst_depth = r2d_dst_depth,
1751    .dst_stencil = r2d_dst_stencil,
1752    .dst_buffer = r2d_dst_buffer,
1753    .setup = r2d_setup<CHIP>,
1754    .run = r2d_run,
1755    .teardown = r2d_teardown,
1756 };
1757 
1758 template <chip CHIP>
1759 static const struct blit_ops r3d_ops = {
1760    .coords = r3d_coords,
1761    .clear_value = r3d_clear_value,
1762    .src = r3d_src,
1763    .src_buffer = r3d_src_buffer<CHIP>,
1764    .dst = r3d_dst<CHIP>,
1765    .dst_depth = r3d_dst_depth<CHIP>,
1766    .dst_stencil = r3d_dst_stencil<CHIP>,
1767    .dst_buffer = r3d_dst_buffer<CHIP>,
1768    .setup = r3d_setup<CHIP>,
1769    .run = r3d_run,
1770    .teardown = r3d_teardown<CHIP>,
1771 };
1772 
1773 /* passthrough set coords from 3D extents */
1774 static void
1775 coords(const struct blit_ops *ops,
1776        struct tu_cmd_buffer *cmd,
1777        struct tu_cs *cs,
1778        const VkOffset3D dst,
1779        const VkOffset3D src,
1780        const VkExtent3D extent)
1781 {
1782    ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1783                (VkExtent2D) {extent.width, extent.height});
1784 }
1785 
1786 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1787  * to be a bit careful because we have to pick a format with matching UBWC
1788  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1789  * everything.
1790  */
1791 static enum pipe_format
1792 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1793 {
1794    if (vk_format_is_compressed(vk_format)) {
1795       switch (vk_format_get_blocksize(vk_format)) {
1796       case 1: return PIPE_FORMAT_R8_UINT;
1797       case 2: return PIPE_FORMAT_R16_UINT;
1798       case 4: return PIPE_FORMAT_R32_UINT;
1799       case 8: return PIPE_FORMAT_R32G32_UINT;
1800       case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1801       default:
1802          unreachable("unhandled format size");
1803       }
1804    }
1805 
1806    enum pipe_format format = vk_format_to_pipe_format(vk_format);
1807 
1808    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1809     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1810     * (also -1.0) when we're supposed to be memcpying the bits. See
1811     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1812     */
1813    format = util_format_snorm_to_unorm(format);
1814 
1815    if (vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1816       return PIPE_FORMAT_R32_UINT;
1817 
1818    /* For VK_FORMAT_D32_SFLOAT_S8_UINT and YCbCr formats use our existing helpers */
1819    if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1820        vk_format_get_ycbcr_info(vk_format))
1821       return tu_aspects_to_plane(vk_format, aspect_mask);
1822 
1823    /* Otherwise, simply return the pipe_format */
1824    return format;
1825 }
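
/* Example of the compressed-format mapping above (illustrative): BC1 has an
 * 8-byte block, so BC1 copies are treated as PIPE_FORMAT_R32G32_UINT with
 * one "pixel" per compressed block, and BC3's 16-byte block maps to
 * PIPE_FORMAT_R32G32B32A32_UINT the same way. Block-unit coordinates are
 * produced by copy_compressed() further below.
 */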
1826 
1827 static void
1828 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1829 {
1830    switch (format) {
1831    case PIPE_FORMAT_Z24X8_UNORM:
1832    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1833       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1834                        val->depthStencil.stencil << 24;
1835       return;
1836    case PIPE_FORMAT_Z16_UNORM:
1837       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1838       return;
1839    case PIPE_FORMAT_Z32_FLOAT:
1840       clear_value[0] = fui(val->depthStencil.depth);
1841       return;
1842    case PIPE_FORMAT_S8_UINT:
1843       clear_value[0] = val->depthStencil.stencil;
1844       return;
1845    default:
1846       break;
1847    }
1848 
1849    float tmp[4];
1850    memcpy(tmp, val->color.float32, 4 * sizeof(float));
1851    if (util_format_is_srgb(format)) {
1852       for (int i = 0; i < 3; i++)
1853          tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1854    }
1855 
1856 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1857    ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1858    switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1859    case 4:
1860       PACK_F(r4g4b4a4_unorm);
1861       break;
1862    case 5:
1863       if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1864          PACK_F(r5g6b5_unorm);
1865       else
1866          PACK_F(r5g5b5a1_unorm);
1867       break;
1868    case 8:
1869       if (util_format_is_snorm(format))
1870          PACK_F(r8g8b8a8_snorm);
1871       else if (util_format_is_unorm(format))
1872          PACK_F(r8g8b8a8_unorm);
1873       else
1874          pack_int8(clear_value, val->color.uint32);
1875       break;
1876    case 10:
1877       if (util_format_is_pure_integer(format))
1878          pack_int10_2(clear_value, val->color.uint32);
1879       else
1880          PACK_F(r10g10b10a2_unorm);
1881       break;
1882    case 11:
1883       clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1884       break;
1885    case 16:
1886       if (util_format_is_snorm(format))
1887          PACK_F(r16g16b16a16_snorm);
1888       else if (util_format_is_unorm(format))
1889          PACK_F(r16g16b16a16_unorm);
1890       else if (util_format_is_float(format))
1891          PACK_F(r16g16b16a16_float);
1892       else
1893          pack_int16(clear_value, val->color.uint32);
1894       break;
1895    case 32:
1896       memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1897       break;
1898    case 0:
1899       assert(format == PIPE_FORMAT_A8_UNORM);
1900       PACK_F(a8_unorm);
1901       break;
1902    default:
1903       unreachable("unexpected channel size");
1904    }
1905 #undef PACK_F
1906 }
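
/* Worked example for the depth/stencil packing above (illustrative):
 * clearing PIPE_FORMAT_Z24_UNORM_S8_UINT with depth = 0.5 and stencil = 0xff
 * packs both aspects into a single dword:
 *
 *    tu_pack_float32_for_unorm(0.5f, 24) = lroundeven(0.5 * 0xffffff)
 *                                        = 0x800000
 *    clear_value[0] = 0x800000 | (0xff << 24) = 0xff800000
 */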
1907 
1908 static void
1909 event_blit_setup(struct tu_cs *cs,
1910                  uint32_t buffer_id,
1911                  const struct tu_render_pass_attachment *att,
1912                  enum a6xx_blit_event_type blit_event_type,
1913                  uint32_t clear_mask)
1914 {
1915    tu_cs_emit_regs(
1916       cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1917 
1918    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1919    tu_cs_emit(cs, 0);
1920 
1921    tu_cs_emit_regs(
1922       cs,
1923       A6XX_RB_BLIT_INFO(.type = blit_event_type,
1924                         .sample_0 =
1925                            vk_format_is_int(att->format) ||
1926                            vk_format_is_depth_or_stencil(att->format),
1927                         .depth = vk_format_is_depth_or_stencil(att->format),
1928                         .clear_mask = clear_mask,
1929                         .buffer_id = buffer_id));
1930 }
1931 
1932 struct event_blit_dst_view {
1933    const struct tu_image *image;
1934    const struct fdl6_view *view;
1935 
1936    uint32_t layer;
1937 
1938    uint64_t depth_addr;
1939    uint32_t depth_pitch;
1940 
1941    uint64_t stencil_addr;
1942    uint32_t stencil_pitch;
1943 };
1944 
1945 static event_blit_dst_view
1946 blt_view_from_tu_view(const struct tu_image_view *iview,
1947                       uint32_t layer)
1948 {
1949    struct event_blit_dst_view blt_view;
1950    blt_view.image = iview->image;
1951    blt_view.view = &iview->view;
1952    blt_view.layer = layer;
1953 
1954    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1955       blt_view.depth_addr =
1956          iview->depth_base_addr + iview->depth_layer_size * layer;
1957       blt_view.depth_pitch = iview->depth_pitch;
1958 
1959       blt_view.stencil_addr =
1960          iview->stencil_base_addr + iview->stencil_layer_size * layer;
1961       blt_view.stencil_pitch = iview->stencil_pitch;
1962    }
1963    return blt_view;
1964 }
1965 
1966 template <chip CHIP>
1967 static void
1968 event_blit_run(struct tu_cmd_buffer *cmd,
1969                struct tu_cs *cs,
1970                const struct tu_render_pass_attachment *att,
1971                const event_blit_dst_view *blt_view,
1972                bool separate_stencil)
1973 {
1974    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1975    if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1976       if (!separate_stencil) {
1977          tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1978          tu_cs_emit_qw(cs, blt_view->depth_addr);
1979          tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1980 
1981          tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1982          tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1983       } else {
1984          tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1985                            ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1986          tu_cs_emit_qw(cs, blt_view->stencil_addr);
1987          tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1988       }
1989    } else {
1990       tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1991       tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1992 
1993       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1994       tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1995    }
1996 
1997    if (att) {
1998       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1999          tu_cs_emit_regs(
2000             cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
2001                    cmd, att, blt_view->layer)));
2002       } else {
2003          tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
2004                                 cmd, att, blt_view->layer)));
2005       }
2006    }
2007 
2008    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
2009 }
2010 
2011 static void
2012 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
2013                         struct tu_cs *cs,
2014                         uint32_t buffer_id,
2015                         enum pipe_format format,
2016                         uint8_t clear_mask,
2017                         bool separate_stencil,
2018                         uint32_t layer,
2019                         const VkClearValue *value,
2020                         uint32_t a)
2021 {
2022    const struct tu_render_pass_attachment *att =
2023       &cmd->state.pass->attachments[a];
2024    const struct tu_image_view *iview = cmd->state.attachments[a];
2025 
2026    uint32_t clear_vals[4] = {};
2027    pack_blit_event_clear_value(value, format, clear_vals);
2028 
2029    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2030    tu_cs_emit_array(cs, clear_vals, 4);
2031 
2032    event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
2033 
2034    event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
2035    event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
2036 }
2037 
2038 
2039 
2040 /* Copies/fills/updates for buffers go through the CCU but need additional
2041  * synchronization when the write range is not aligned to 64 bytes, because
2042  * dst buffer access uses either R8_UNORM or R32_UINT, and the two are not
2043  * coherent with each other in the CCU since the format seems to be part of
2044  * the cache key.
2045  *
2046  * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
2047  *
2048  * Synchronization with writes from UCHE (e.g. with SSBO stores) is solved
2049  * by the fact that UCHE has byte-level dirtiness tracking and that a CCU
2050  * flush always happens before a UCHE flush in such cases (e.g. both
2051  * renderpass and dispatch would flush a pending CCU write).
2052  *
2053  * Additionally see:
2054  * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2055  */
2056 template <chip CHIP>
2057 static void
2058 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2059                               uint64_t dst_va,
2060                               uint64_t size,
2061                               bool *unaligned_store)
2062 {
2063    if (*unaligned_store)
2064       return;
2065 
2066    if ((dst_va & 63) || (size & 63)) {
2067       tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2068                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2069       /* Wait for invalidations to land. */
2070       cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2071       tu_emit_cache_flush<CHIP>(cmd);
2072       *unaligned_store = true;
2073    }
2074 }
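
/* Example of the trigger condition above (illustrative): a 100-byte store to
 * dst_va = 0x1040 starts aligned (0x1040 & 63 == 0) but has an unaligned
 * size (100 & 63 == 36), so the CCU flush/invalidate and wait-for-idle are
 * emitted once before the store; a 128-byte store to the same address needs
 * no extra synchronization.
 */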
2075 
2076 template <chip CHIP>
2077 static void
2078 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2079                                     bool unaligned_store)
2080 {
2081    if (unaligned_store) {
2082       tu_flush_for_access(&cmd->state.cache,
2083                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2084                           TU_ACCESS_NONE);
2085    }
2086 }
2087 
2088 template <chip CHIP>
2089 void
2090 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2091               struct tu_cs *cs,
2092               struct tu_image *image,
2093               const VkClearValue *value)
2094 {
2095    const struct blit_ops *ops = &r2d_ops<CHIP>;
2096 
2097    /* It is assumed that the LRZ cache is invalidated at this point, so
2098     * that the writes here become visible to LRZ.
2099     *
2100     * LRZ writes go through the UCHE cache, so flush UCHE before changing
2101     * LRZ via the CCU. There is no need to invalidate the CCU since we are
2102     * presumably writing whole cache lines, which we assume to be 64 bytes.
2103     */
2104    tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2105 
2106    ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2107               VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2108               VK_SAMPLE_COUNT_1_BIT);
2109    ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2110    ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2111                    image->iova + image->lrz_layout.lrz_offset,
2112                    image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2113    uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
2114    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2115                (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
2116    ops->run(cmd, cs);
2117    ops->teardown(cmd, cs);
2118 
2119    /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2120     * UCHE in the earlier GRAS stage.
2121     */
2122    cmd->state.cache.flush_bits |=
2123       TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2124       TU_CMD_FLAG_WAIT_FOR_IDLE;
2125 }
2126 TU_GENX(tu6_clear_lrz);
2127 
2128 template <chip CHIP>
2129 void
2130 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2131                  struct tu_cs *cs,
2132                  struct tu_image *image)
2133 {
2134    const struct blit_ops *ops = &r2d_ops<CHIP>;
2135    VkClearValue clear = {};
2136    clear.color.uint32[0] = 0xffffffff;
2137 
2138    using LRZFC = fd_lrzfc_layout<CHIP>;
2139    uint64_t lrz_fc_iova = image->iova + image->lrz_layout.lrz_fc_offset;
2140    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2141               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2142               VK_SAMPLE_COUNT_1_BIT);
2143    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2144    ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2145                    lrz_fc_iova + offsetof(LRZFC, fc1),
2146                    sizeof(LRZFC::fc1),
2147                    PIPE_FORMAT_R32_UINT);
2148    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2149       sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2150    });
2151    ops->run(cmd, cs);
2152    if constexpr (LRZFC::HAS_BIDIR) {
2153       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2154                       lrz_fc_iova + offsetof(LRZFC, fc2),
2155                       sizeof(LRZFC::fc2),
2156                       PIPE_FORMAT_R32_UINT);
2157       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2158          sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2159       });
2160       ops->run(cmd, cs);
2161    }
2162    ops->teardown(cmd, cs);
2163 }
2164 TU_GENX(tu6_dirty_lrz_fc);
2165 
2166 template<chip CHIP>
2167 static void
2168 tu_image_view_copy_blit(struct fdl6_view *iview,
2169                         struct tu_image *image,
2170                         enum pipe_format format,
2171                         const VkImageSubresourceLayers *subres,
2172                         uint32_t layer,
2173                         bool z_scale)
2174 {
2175    VkImageAspectFlags aspect_mask = subres->aspectMask;
2176 
2177    /* always use the AS_R8G8B8A8 format for these */
2178    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2179        format == PIPE_FORMAT_Z24X8_UNORM) {
2180       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2181    }
2182 
2183    const struct fdl_layout *layout =
2184       &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2185 
2186    const struct fdl_view_args args = {
2187       .chip = CHIP,
2188       .iova = image->iova,
2189       .base_miplevel = subres->mipLevel,
2190       .level_count = 1,
2191       .base_array_layer = subres->baseArrayLayer + layer,
2192       .layer_count = 1,
2193       .swiz = {
2194          PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2195       },
2196       .format = tu_format_for_aspect(format, aspect_mask),
2197       .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2198    };
2199    fdl6_view_init(iview, &layout, &args, false);
2200 }
2201 
2202 template<chip CHIP>
2203 static void
2204 tu_image_view_copy(struct fdl6_view *iview,
2205                    struct tu_image *image,
2206                    enum pipe_format format,
2207                    const VkImageSubresourceLayers *subres,
2208                    uint32_t layer)
2209 {
2210    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2211 }
2212 
2213 template<chip CHIP>
2214 static void
2215 tu_image_view_blit(struct fdl6_view *iview,
2216                    struct tu_image *image,
2217                    const VkImageSubresourceLayers *subres,
2218                    uint32_t layer)
2219 {
2220    enum pipe_format format = tu_aspects_to_plane(image->vk.format, subres->aspectMask);
2221    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2222 }
2223 
2224 template <chip CHIP>
2225 static void
2226 tu6_blit_image(struct tu_cmd_buffer *cmd,
2227                struct tu_image *src_image,
2228                struct tu_image *dst_image,
2229                const VkImageBlit2 *info,
2230                VkFilter filter)
2231 {
2232    const struct blit_ops *ops = &r2d_ops<CHIP>;
2233    struct tu_cs *cs = &cmd->cs;
2234    bool z_scale = false;
2235    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2236 
2237    /* the 2D blit engine can't mirror from just coordinates; it needs an explicit rotation/flip mode */
2238    static const enum a6xx_rotation rotate[2][2] = {
2239       {ROTATE_0, ROTATE_HFLIP},
2240       {ROTATE_VFLIP, ROTATE_180},
2241    };
2242 
2243    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2244                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
2245    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2246                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
2247 
2248    int32_t src0_z = info->srcOffsets[0].z;
2249    int32_t src1_z = info->srcOffsets[1].z;
2250 
2251    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2252         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2253        info->srcOffsets[1].z < info->srcOffsets[0].z) {
2254       z_scale = true;
2255    }
2256 
2257    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2258       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2259       src0_z = info->srcOffsets[1].z;
2260       src1_z = info->srcOffsets[0].z;
2261    }
2262 
2263    if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2264       assert(layers <= 1);
2265       layers = vk_image_subresource_layer_count(&dst_image->vk,
2266                                                 &info->dstSubresource);
2267    }
2268 
2269    /* BC1_RGB_* formats need to have their last component overridden with 1
2270     * when sampling, which is normally handled with the texture descriptor
2271     * swizzle. The 2d path can't handle that, so use the 3d path.
2272     *
2273     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2274     * the 2d path.
2275     */
2276 
2277    unsigned blit_param = rotate[mirror_y][mirror_x];
2278    if (dst_image->layout[0].nr_samples > 1 ||
2279        src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2280        src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2281        filter == VK_FILTER_CUBIC_EXT ||
2282        z_scale) {
2283       ops = &r3d_ops<CHIP>;
2284       blit_param = z_scale ? R3D_Z_SCALE : 0;
2285    }
2286 
2287    /* use the right format in setup() for D32_S8 */
2288    enum pipe_format src_format = tu_aspects_to_plane(
2289       src_image->vk.format, info->srcSubresource.aspectMask);
2290    enum pipe_format dst_format = tu_aspects_to_plane(
2291       dst_image->vk.format, info->dstSubresource.aspectMask);
2292    trace_start_blit(&cmd->trace, cs,
2293                   ops == &r3d_ops<CHIP>,
2294                   src_image->vk.format,
2295                   dst_image->vk.format,
2296                   layers);
2297 
2298    ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2299               blit_param, false, dst_image->layout[0].ubwc,
2300               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2301 
2302    if (ops == &r3d_ops<CHIP>) {
2303       const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2304                                info->srcOffsets[0].x, info->srcOffsets[0].y,
2305                                info->dstOffsets[1].x, info->dstOffsets[1].y,
2306                                info->srcOffsets[1].x, info->srcOffsets[1].y };
2307       r3d_coords_raw(cmd, cs, coords);
2308    } else {
2309       tu_cs_emit_regs(cs,
2310          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2311                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2312          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2313                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2314       tu_cs_emit_regs(cs,
2315          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2316          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2317          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2318          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2319    }
2320 
2321    struct fdl6_view dst, src;
2322    tu_image_view_blit<CHIP>(
2323       &dst, dst_image, &info->dstSubresource,
2324       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2325 
2326    if (z_scale) {
2327       tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2328                                     &info->srcSubresource, 0, true);
2329       ops->src(cmd, cs, &src, 0, filter, dst_format);
2330    } else {
2331       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2332    }
2333 
2334    for (uint32_t i = 0; i < layers; i++) {
2335       if (z_scale) {
2336          float t = ((float) i + 0.5f) / (float) layers;
2337          r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2338       } else {
2339          ops->src(cmd, cs, &src, i, filter, dst_format);
2340       }
2341       ops->dst(cs, &dst, i, src_format);
2342       ops->run(cmd, cs);
2343    }
2344 
2345    ops->teardown(cmd, cs);
2346 
2347    trace_end_blit(&cmd->trace, cs);
2348 }
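
/* Illustrative note on the z-scale path above: destination layers sample the
 * source z range at their centers. E.g. blitting src z range [0, 4) onto
 * layers = 2 destination slices gives
 *
 *    i = 0: t = 0.5 / 2 = 0.25 -> z = 0.25 * (4 - 0) + 0 = 1.0
 *    i = 1: t = 1.5 / 2 = 0.75 -> z = 3.0
 *
 * i.e. layer centers are mapped linearly into the source depth range.
 */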
2349 
2350 template <chip CHIP>
2351 VKAPI_ATTR void VKAPI_CALL
2352 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2353                  const VkBlitImageInfo2 *pBlitImageInfo)
2354 
2355 {
2356    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2357    VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2358    VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2359 
2360    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2361       /* can't blit both depth and stencil at once with D32_S8
2362        * TODO: more advanced 3D blit path to support it instead?
2363        */
2364       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2365           dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2366          VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2367          u_foreach_bit(b, region.dstSubresource.aspectMask) {
2368             region.srcSubresource.aspectMask = BIT(b);
2369             region.dstSubresource.aspectMask = BIT(b);
2370             tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2371          }
2372          continue;
2373       }
2374       tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2375                      pBlitImageInfo->filter);
2376    }
2377 
2378    if (dst_image->lrz_layout.lrz_total_size) {
2379       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2380    }
2381 }
2382 TU_GENX(tu_CmdBlitImage2);
2383 
2384 static void
2385 copy_compressed(VkFormat format,
2386                 VkOffset3D *offset,
2387                 VkExtent3D *extent,
2388                 uint32_t *width,
2389                 uint32_t *height)
2390 {
2391    if (!vk_format_is_compressed(format))
2392       return;
2393 
2394    uint32_t block_width = vk_format_get_blockwidth(format);
2395    uint32_t block_height = vk_format_get_blockheight(format);
2396 
2397    offset->x /= block_width;
2398    offset->y /= block_height;
2399 
2400    if (extent) {
2401       extent->width = DIV_ROUND_UP(extent->width, block_width);
2402       extent->height = DIV_ROUND_UP(extent->height, block_height);
2403    }
2404    if (width)
2405       *width = DIV_ROUND_UP(*width, block_width);
2406    if (height)
2407       *height = DIV_ROUND_UP(*height, block_height);
2408 }
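
/* Worked example (illustrative): for a BC1 image with 4x4 blocks,
 * copy_compressed() converts pixel units into block units, so an offset of
 * (8, 4) becomes (2, 1) and a 10x6-pixel extent becomes
 * DIV_ROUND_UP(10, 4) x DIV_ROUND_UP(6, 4) = 3x2 blocks. Combined with
 * copy_format()'s R32G32_UINT mapping, the blit then moves whole 8-byte
 * blocks as if they were single pixels.
 */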
2409 
2410 template <chip CHIP>
2411 static void
2412 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2413                         struct tu_buffer *src_buffer,
2414                         struct tu_image *dst_image,
2415                         const VkBufferImageCopy2 *info)
2416 {
2417    struct tu_cs *cs = &cmd->cs;
2418    uint32_t layers = MAX2(info->imageExtent.depth,
2419                           vk_image_subresource_layer_count(&dst_image->vk,
2420                                                            &info->imageSubresource));
2421    enum pipe_format src_format =
2422       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2423    enum pipe_format dst_format =
2424       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2425    const struct blit_ops *ops = &r2d_ops<CHIP>;
2426 
2427    /* special case for buffer to stencil */
2428    if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2429        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2430       src_format = PIPE_FORMAT_S8_UINT;
2431    }
2432 
2433    /* note: could use "R8_UNORM" when no UBWC */
2434    bool has_unaligned = CHIP >= A7XX; /* whether unaligned buffer copies are supported */
2435    unsigned blit_param = 0;
2436    if (src_format == PIPE_FORMAT_Y8_UNORM ||
2437        tu_pipe_format_is_float16(src_format)) {
2438       ops = &r3d_ops<CHIP>;
2439       blit_param = R3D_COPY;
2440       has_unaligned = false;
2441    }
2442 
2443    VkOffset3D offset = info->imageOffset;
2444    VkExtent3D extent = info->imageExtent;
2445    uint32_t src_width = info->bufferRowLength ?: extent.width;
2446    uint32_t src_height = info->bufferImageHeight ?: extent.height;
2447 
2448    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2449 
2450    uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2451    uint32_t layer_size = src_height * pitch;
2452 
2453    ops->setup(cmd, cs, src_format, dst_format,
2454               info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2455               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2456 
2457    struct fdl6_view dst;
2458    tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2459                             &info->imageSubresource, offset.z);
2460 
2461    for (uint32_t i = 0; i < layers; i++) {
2462       ops->dst(cs, &dst, i, src_format);
2463 
2464       uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2465       bool unaligned = (src_va & 63) || (pitch & 63);
2466       if (!has_unaligned && unaligned) {
2467          for (uint32_t y = 0; y < extent.height; y++) {
2468             uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2469             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2470                             x + extent.width, 1, dst_format);
2471             ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y},  (VkOffset2D) {x},
2472                         (VkExtent2D) {extent.width, 1});
2473             ops->run(cmd, cs);
2474             src_va += pitch;
2475          }
2476       } else {
2477          if constexpr (CHIP >= A7XX) {
2478             /* Necessary to avoid triggering the static assertion in the A6XX variant. */
2479             if (has_unaligned) {
2480                r2d_src_buffer_unaligned<CHIP>(cmd, cs, src_format, src_va,
2481                                               pitch, extent.width,
2482                                               extent.height, dst_format);
2483             } else {
2484                ops->src_buffer(cmd, cs, src_format, src_va, pitch,
2485                                extent.width, extent.height, dst_format);
2486             }
2487          } else {
2488             ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width,
2489                             extent.height, dst_format);
2490          }
2491          coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2492          ops->run(cmd, cs);
2493       }
2494    }
2495 
2496    ops->teardown(cmd, cs);
2497 }
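
/* Worked example of the per-row fallback above (illustrative): with a 4 cpp
 * format and src_va = base + 8 (src_va & 63 == 8), each row is blitted from
 * the rounded-down, 64-byte-aligned address src_va & ~63, and the source x
 * offset is set to (src_va & 63) / blocksize = 2 texels to skip the bytes
 * before the real row start. This keeps the source base address aligned at
 * the cost of one blit per row.
 */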
2498 
2499 template <chip CHIP>
2500 VKAPI_ATTR void VKAPI_CALL
2501 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2502                          const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2503 {
2504    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2505    VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2506    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2507 
2508    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2509       tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2510                               pCopyBufferToImageInfo->pRegions + i);
2511 
2512    if (dst_image->lrz_layout.lrz_total_size) {
2513       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2514    }
2515 }
2516 TU_GENX(tu_CmdCopyBufferToImage2);
2517 
2518 static void
2519 tu_copy_memory_to_image(struct tu_device *device,
2520                         struct tu_image *dst_image,
2521                         const VkMemoryToImageCopyEXT *info,
2522                         bool copy_memcpy)
2523 {
2524    unsigned plane = tu6_plane_index(dst_image->vk.format,
2525                                     info->imageSubresource.aspectMask);
2526    const struct fdl_layout *layout = &dst_image->layout[plane];
2527 
2528    VkOffset3D offset = info->imageOffset;
2529    VkExtent3D extent = info->imageExtent;
2530    uint32_t src_width = info->memoryRowLength ?: extent.width;
2531    uint32_t src_height = info->memoryImageHeight ?: extent.height;
2532 
2533    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2534 
2535    uint32_t src_pitch = src_width * layout->cpp;
2536 
2537    unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2538       offset.z : info->imageSubresource.baseArrayLayer;
2539    uint32_t layers = MAX2(extent.depth,
2540                           vk_image_subresource_layer_count(&dst_image->vk,
2541                                                            &info->imageSubresource));
2542 
2543    uint32_t image_offset =
2544       fdl_surface_offset(layout,
2545                          info->imageSubresource.mipLevel,
2546                          start_layer);
2547 
2548    uint32_t dst_layer_stride =
2549       fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2550    uint32_t dst_layer_size =
2551       layout->slices[info->imageSubresource.mipLevel].size0;
2552    uint32_t src_layer_stride =
2553       copy_memcpy ? dst_layer_size :
2554       (src_width * src_height * layout->cpp);
2555    bool tiled =
2556       fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2557 
2558    const char *src = (const char *) info->pHostPointer;
2559    char *dst = (char *) dst_image->map + image_offset;
2560    for (unsigned layer = 0; layer < layers; layer++,
2561         src += src_layer_stride, dst += dst_layer_stride) {
2562       if (copy_memcpy) {
2563          memcpy(dst, src, src_layer_stride);
2564       } else if (!tiled) {
2565          uint32_t dst_pitch = fdl_pitch(layout,
2566                                         info->imageSubresource.mipLevel);
2567          for (unsigned y = 0; y < extent.height; y++) {
2568             memcpy(dst + dst_pitch * (y + offset.y) + offset.x * layout->cpp,
2569                    src + src_pitch * y,
2570                    extent.width * layout->cpp);
2571          }
2572       } else {
2573          fdl6_memcpy_linear_to_tiled(offset.x, offset.y,
2574                                      extent.width, extent.height,
2575                                      dst, src, layout,
2576                                      info->imageSubresource.mipLevel,
2577                                      src_pitch,
2578                                      &device->physical_device->ubwc_config);
2579       }
2580 
2581       if (dst_image->bo->cached_non_coherent) {
2582          tu_bo_sync_cache(device, dst_image->bo,
2583                           dst_image->bo_offset + image_offset,
2584                           dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
2585       }
2586    }
2587 }
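
/* Summary of the three host-copy paths above (illustrative): copy_memcpy
 * (VK_HOST_IMAGE_COPY_MEMCPY_EXT) copies each layer verbatim; the linear
 * path does one memcpy per row, offset by the destination pitch; the tiled
 * path uses fdl6_memcpy_linear_to_tiled() to swizzle rows into the hardware
 * tiling. Each layer ends with a CPU cache clean when the BO is cached and
 * non-coherent, so the GPU observes the writes.
 */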
2588 
2589 VKAPI_ATTR VkResult VKAPI_CALL
2590 tu_CopyMemoryToImageEXT(VkDevice _device,
2591                         const VkCopyMemoryToImageInfoEXT *info)
2592 {
2593    VK_FROM_HANDLE(tu_device, device, _device);
2594    VK_FROM_HANDLE(tu_image, dst_image, info->dstImage);
2595 
2596    for (unsigned i = 0; i < info->regionCount; i++) {
2597       tu_copy_memory_to_image(device, dst_image, &info->pRegions[i],
2598                               info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2599    }
2600 
2601    if (dst_image->lrz_layout.lrz_total_size) {
2602       TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
2603    }
2604 
2605    return VK_SUCCESS;
2606 }
2607 
2608 template <chip CHIP>
2609 static void
2610 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2611                         struct tu_image *src_image,
2612                         struct tu_buffer *dst_buffer,
2613                         const VkBufferImageCopy2 *info,
2614                         bool *unaligned_store)
2615 {
2616    struct tu_cs *cs = &cmd->cs;
2617    uint32_t layers = MAX2(info->imageExtent.depth,
2618                           vk_image_subresource_layer_count(&src_image->vk,
2619                                                            &info->imageSubresource));
2620    enum pipe_format dst_format =
2621       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2622    enum pipe_format src_format =
2623       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2624    const struct blit_ops *ops = &r2d_ops<CHIP>;
2625 
2626    if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2627        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2628       dst_format = PIPE_FORMAT_S8_UINT;
2629    }
2630 
2631    /* note: could use "R8_UNORM" when no UBWC */
2632    unsigned blit_param = 0;
2633    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2634        tu_pipe_format_is_float16(src_format)) {
2635       ops = &r3d_ops<CHIP>;
2636       blit_param = R3D_COPY;
2637    }
2638 
2639    VkOffset3D offset = info->imageOffset;
2640    VkExtent3D extent = info->imageExtent;
2641    uint32_t dst_width = info->bufferRowLength ?: extent.width;
2642    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2643 
2644    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2645 
2646    uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2647    uint32_t layer_size = pitch * dst_height;
2648 
2649    handle_buffer_unaligned_store<CHIP>(cmd,
2650                                        dst_buffer->iova + info->bufferOffset,
2651                                        layer_size * layers, unaligned_store);
2652 
2653    ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2654               VK_SAMPLE_COUNT_1_BIT);
2655 
2656    struct fdl6_view src;
2657    tu_image_view_copy<CHIP>(&src, src_image, src_format,
2658                             &info->imageSubresource, offset.z);
2659 
2660    for (uint32_t i = 0; i < layers; i++) {
2661       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2662 
2663       uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
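      /* The blit destination base address and pitch must be 64-byte aligned,
       * so an unaligned destination is written one row at a time: round the
       * address down to 64 bytes and shift the blit right by the remainder
       * (in texels) to compensate.
       */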
2664       if ((dst_va & 63) || (pitch & 63)) {
2665          for (uint32_t y = 0; y < extent.height; y++) {
2666             uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2667             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2668             ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2669                         (VkExtent2D) {extent.width, 1});
2670             ops->run(cmd, cs);
2671             dst_va += pitch;
2672          }
2673       } else {
2674          ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2675          coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2676          ops->run(cmd, cs);
2677       }
2678    }
2679 
2680    ops->teardown(cmd, cs);
2681 }
2682 
2683 template <chip CHIP>
2684 VKAPI_ATTR void VKAPI_CALL
2685 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2686                          const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2687 {
2688    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2689    VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2690    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2691 
2692    bool unaligned_store = false;
2693    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2694       tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2695                               pCopyImageToBufferInfo->pRegions + i,
2696                               &unaligned_store);
2697 
2698    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2699 }
2700 TU_GENX(tu_CmdCopyImageToBuffer2);
2701 
2702 static void
2703 tu_copy_image_to_memory(struct tu_device *device,
2704                         struct tu_image *src_image,
2705                         const VkImageToMemoryCopyEXT *info,
2706                         bool copy_memcpy)
2707 {
2708    unsigned plane = tu6_plane_index(src_image->vk.format,
2709                                     info->imageSubresource.aspectMask);
2710    const struct fdl_layout *layout = &src_image->layout[plane];
2711 
2712    VkOffset3D offset = info->imageOffset;
2713    VkExtent3D extent = info->imageExtent;
2714    uint32_t dst_width = info->memoryRowLength ?: extent.width;
2715    uint32_t dst_height = info->memoryImageHeight ?: extent.height;
2716 
2717    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2718 
2719    uint32_t dst_pitch = dst_width * layout->cpp;
2720 
2721    unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2722       offset.z : info->imageSubresource.baseArrayLayer;
2723    uint32_t layers = MAX2(extent.depth,
2724                           vk_image_subresource_layer_count(&src_image->vk,
2725                                                            &info->imageSubresource));
2726 
2727    uint32_t image_offset =
2728       fdl_surface_offset(layout,
2729                          info->imageSubresource.mipLevel,
2730                          start_layer);
2731 
2732    uint32_t src_layer_stride =
2733       fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2734    uint32_t src_layer_size =
2735       layout->slices[info->imageSubresource.mipLevel].size0;
2736    uint32_t dst_layer_stride =
2737       copy_memcpy ? src_layer_size : (dst_width * dst_height * layout->cpp);
2738    bool tiled =
2739       fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2740 
2741    const char *src = (const char *) src_image->map + image_offset;
2742    char *dst = (char *) info->pHostPointer;
2743    for (unsigned layer = 0; layer < layers; layer++,
2744         src += src_layer_stride, dst += dst_layer_stride) {
2745       if (src_image->bo->cached_non_coherent) {
2746          tu_bo_sync_cache(device, src_image->bo,
2747                           src_image->bo_offset + image_offset,
2748                           src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
2749       }
2750 
2751       if (copy_memcpy) {
2752          memcpy(dst, src, dst_layer_stride);
2753       } else if (!tiled) {
2754          uint32_t src_pitch = fdl_pitch(layout,
2755                                         info->imageSubresource.mipLevel);
2756          for (unsigned y = 0; y < extent.height; y++) {
2757             memcpy(dst + dst_pitch * y,
2758                    src + src_pitch * (y + offset.y) + offset.x * layout->cpp,
2759                    extent.width * layout->cpp);
2760          }
2761       } else {
2762          fdl6_memcpy_tiled_to_linear(offset.x, offset.y,
2763                                      extent.width, extent.height,
2764                                      dst, src, layout,
2765                                      info->imageSubresource.mipLevel,
2766                                      dst_pitch,
2767                                      &device->physical_device->ubwc_config);
2768       }
2769    }
2770 }
2771 
2772 VKAPI_ATTR VkResult VKAPI_CALL
2773 tu_CopyImageToMemoryEXT(VkDevice _device,
2774                         const VkCopyImageToMemoryInfoEXT *info)
2775 {
2776    VK_FROM_HANDLE(tu_device, device, _device);
2777    VK_FROM_HANDLE(tu_image, image, info->srcImage);
2778 
2779    for (unsigned i = 0; i < info->regionCount; i++) {
2780       tu_copy_image_to_memory(device, image, &info->pRegions[i],
2781                               info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2782    }
2783 
2784    return VK_SUCCESS;
2785 }
2786 
2787 
2788 /* Tiled formats don't support swapping, which means that we can't support
2789  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2790  * formats like B5G5R5A1 have a separate linear-only format when sampling.
2791  * Currently we fake support for tiled swapped formats and use the unswapped
2792  * format instead, but this means that reinterpreting copies to and from
2793  * swapped formats can't be performed correctly unless we can swizzle the
2794  * components by reinterpreting the other image as the "correct" swapped
2795  * format, i.e. only when the other image is linear.
2796  */
2797 
2798 template <chip CHIP>
2799 static bool
2800 is_swapped_format(enum pipe_format format, bool is_mutable)
2801 {
2802    struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, is_mutable, false);
2803    struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, is_mutable, false);
2804    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2805 }
2806 
2807 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2808  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2809  * versa). This should mirror the logic in fdl6_layout.
2810  */
2811 static bool
2812 image_is_r8g8(struct tu_image *image)
2813 {
2814    return image->layout[0].cpp == 2 &&
2815       vk_format_get_nr_components(image->vk.format) == 2;
2816 }
2817 
2818 template <chip CHIP>
2819 static void
2820 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2821                        struct tu_image *src_image,
2822                        struct tu_image *dst_image,
2823                        const VkImageCopy2 *info)
2824 {
2825    const struct blit_ops *ops = &r2d_ops<CHIP>;
2826    struct tu_cs *cs = &cmd->cs;
2827 
2828    if (dst_image->layout[0].nr_samples > 1)
2829       ops = &r3d_ops<CHIP>;
2830 
2831    enum pipe_format format = PIPE_FORMAT_NONE;
2832    VkOffset3D src_offset = info->srcOffset;
2833    VkOffset3D dst_offset = info->dstOffset;
2834    VkExtent3D extent = info->extent;
2835    uint32_t layers_to_copy = MAX2(info->extent.depth,
2836                                   vk_image_subresource_layer_count(&src_image->vk,
2837                                                                    &info->srcSubresource));
2838 
2839    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2840     * Images":
2841     *
2842     *    When copying between compressed and uncompressed formats the extent
2843     *    members represent the texel dimensions of the source image and not
2844     *    the destination. When copying from a compressed image to an
2845     *    uncompressed image the image texel dimensions written to the
2846     *    uncompressed image will be source extent divided by the compressed
2847     *    texel block dimensions. When copying from an uncompressed image to a
2848     *    compressed image the image texel dimensions written to the compressed
2849     *    image will be the source extent multiplied by the compressed texel
2850     *    block dimensions.
2851     *
2852     * This means we only have to adjust the extent if the source image is
2853     * compressed.
2854     */
2855    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2856    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2857 
2858    enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2859    enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2860 
2861    /* note: could use "R8_UNORM" when no UBWC */
2862    unsigned blit_param = 0;
2863    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2864        src_format == PIPE_FORMAT_Y8_UNORM ||
2865        tu_pipe_format_is_float16(src_format) ||
2866        tu_pipe_format_is_float16(dst_format)) {
2867       ops = &r3d_ops<CHIP>;
2868       blit_param = R3D_COPY;
2869    }
2870 
2871    bool use_staging_blit = false;
2872 
2873    if (src_format == dst_format) {
2874       /* Images that share a format can always be copied directly because it's
2875        * the same as a blit.
2876        */
2877       format = src_format;
2878    } else if (!src_image->layout[0].tile_mode) {
2879       /* If an image is linear, we can always safely reinterpret it with the
2880        * other image's format and then do a regular blit.
2881        */
2882       format = dst_format;
2883    } else if (!dst_image->layout[0].tile_mode) {
2884       format = src_format;
2885    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2886       /* We can't currently copy r8g8 images to/from other cpp=2 images,
2887        * due to the different tile layout.
2888        */
2889       use_staging_blit = true;
2890    } else if (is_swapped_format<CHIP>(src_format,
2891                                       src_image->layout[0].is_mutable) ||
2892               is_swapped_format<CHIP>(dst_format,
2893                                       dst_image->layout[0].is_mutable)) {
2894       /* If either format has a non-identity swap, then we can't copy
2895        * to/from it.
2896        */
2897       use_staging_blit = true;
2898    } else if (!src_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2899       format = dst_format;
2900    } else if (!dst_image->layout[0].ubwc || dst_image->layout[0].is_mutable) {
2901       format = src_format;
2902    } else {
2903       /* Both formats use UBWC and so neither can be reinterpreted.
2904        * TODO: We could do an in-place decompression of the dst instead.
2905        */
2906       perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2907       use_staging_blit = true;
2908    }
2909 
2910    struct fdl6_view dst, src;
2911 
2912    if (use_staging_blit) {
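      /* Copy through a linear, non-UBWC staging buffer in two passes: first
       * blit src into the staging buffer using src_format, then reinterpret
       * the staging buffer as dst_format and blit it into dst, flushing and
       * invalidating caches in between.
       */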
2913       tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2914       tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2915 
2916       struct fdl_layout staging_layout = { 0 };
2917       VkOffset3D staging_offset = { 0 };
2918 
2919       staging_layout.tile_mode = TILE6_LINEAR;
2920       staging_layout.ubwc = false;
2921 
2922       uint32_t layer_count =
2923          vk_image_subresource_layer_count(&src_image->vk,
2924                                           &info->srcSubresource);
2925       fdl6_layout(&staging_layout,
2926                   &cmd->device->physical_device->dev_info,
2927                   src_format,
2928                   src_image->layout[0].nr_samples,
2929                   extent.width,
2930                   extent.height,
2931                   extent.depth,
2932                   1,
2933                   layer_count,
2934                   extent.depth > 1,
2935                   false,
2936                   NULL);
2937 
2938       struct tu_bo *staging_bo;
2939       VkResult result = tu_get_scratch_bo(cmd->device,
2940                                           staging_layout.size,
2941                                           &staging_bo);
2942       if (result != VK_SUCCESS) {
2943          vk_command_buffer_set_error(&cmd->vk, result);
2944          return;
2945       }
2946 
2947       struct fdl6_view staging;
2948       const struct fdl_layout *staging_layout_ptr = &staging_layout;
2949       const struct fdl_view_args copy_to_args = {
2950          .chip = CHIP,
2951          .iova = staging_bo->iova,
2952          .base_miplevel = 0,
2953          .level_count = 1,
2954          .base_array_layer = 0,
2955          .layer_count = layer_count,
2956          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2957          .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2958          .type = FDL_VIEW_TYPE_2D,
2959       };
2960       fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2961 
2962       ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2963                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2964       coords(ops, cmd, cs, staging_offset, src_offset, extent);
2965 
2966       for (uint32_t i = 0; i < layers_to_copy; i++) {
2967          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2968          ops->dst(cs, &staging, i, src_format);
2969          ops->run(cmd, cs);
2970       }
2971 
2972       /* The app would need a pipeline barrier between these two blits; since
2973        * we perform both internally, we have to flush and invalidate the
2974        * caches ourselves. */
2975       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2976       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2977       tu_cs_emit_wfi(cs);
2978 
2979       const struct fdl_view_args copy_from_args = {
2980          .chip = CHIP,
2981          .iova = staging_bo->iova,
2982          .base_miplevel = 0,
2983          .level_count = 1,
2984          .base_array_layer = 0,
2985          .layer_count = layer_count,
2986          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2987          .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2988          .type = FDL_VIEW_TYPE_2D,
2989       };
2990       fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2991 
2992       ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2993                  blit_param, false, dst_image->layout[0].ubwc,
2994                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2995       coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2996 
2997       for (uint32_t i = 0; i < layers_to_copy; i++) {
2998          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2999          ops->dst(cs, &dst, i, dst_format);
3000          ops->run(cmd, cs);
3001       }
3002    } else {
3003       tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
3004       tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
3005 
3006       ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
3007                  blit_param, false, dst_image->layout[0].ubwc,
3008                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
3009       coords(ops, cmd, cs, dst_offset, src_offset, extent);
3010 
3011       for (uint32_t i = 0; i < layers_to_copy; i++) {
3012          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
3013          ops->dst(cs, &dst, i, format);
3014          ops->run(cmd, cs);
3015       }
3016    }
3017 
3018    ops->teardown(cmd, cs);
3019 }
3020 
3021 template <chip CHIP>
3022 VKAPI_ATTR void VKAPI_CALL
3023 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
3024                  const VkCopyImageInfo2 *pCopyImageInfo)
3025 {
3026    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3027    VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
3028    VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
3029 
3030    for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
3031       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3032          VkImageCopy2 info = pCopyImageInfo->pRegions[i];
3033          u_foreach_bit(b, info.dstSubresource.aspectMask) {
3034             info.srcSubresource.aspectMask = BIT(b);
3035             info.dstSubresource.aspectMask = BIT(b);
3036             tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
3037          }
3038          continue;
3039       }
3040 
3041       tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
3042                              pCopyImageInfo->pRegions + i);
3043    }
3044 
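   /* The copy may have written depth data directly, so any LRZ metadata for
    * the destination image is now stale and has to be disabled.
    */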
3045    if (dst_image->lrz_layout.lrz_total_size) {
3046       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
3047    }
3048 }
3049 TU_GENX(tu_CmdCopyImage2);
3050 
3051 static void
3052 tu_copy_image_to_image_cpu(struct tu_device *device,
3053                            struct tu_image *src_image,
3054                            struct tu_image *dst_image,
3055                            const VkImageCopy2 *info,
3056                            bool copy_memcpy)
3057 {
3058    unsigned src_plane = tu6_plane_index(src_image->vk.format,
3059                                         info->srcSubresource.aspectMask);
3060    unsigned dst_plane = tu6_plane_index(dst_image->vk.format,
3061                                         info->dstSubresource.aspectMask);
3062 
3063    const struct fdl_layout *src_layout = &src_image->layout[src_plane];
3064    const struct fdl_layout *dst_layout = &dst_image->layout[dst_plane];
3065 
3066    VkOffset3D src_offset = info->srcOffset;
3067    VkOffset3D dst_offset = info->dstOffset;
3068    VkExtent3D extent = info->extent;
3069    uint32_t layers_to_copy = MAX2(info->extent.depth,
3070                                   vk_image_subresource_layer_count(&src_image->vk,
3071                                                                    &info->srcSubresource));
3072 
3073    /* See comment above. */
3074    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
3075    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
3076 
3077    unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3078       src_offset.z : info->srcSubresource.baseArrayLayer;
3079    unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3080       dst_offset.z : info->dstSubresource.baseArrayLayer;
3081 
3082    uint32_t src_layer_stride =
3083       fdl_layer_stride(src_layout, info->srcSubresource.mipLevel);
3084    uint32_t src_layer_size =
3085       src_layout->slices[info->srcSubresource.mipLevel].size0;
3086    uint32_t dst_layer_stride =
3087       fdl_layer_stride(dst_layout, info->dstSubresource.mipLevel);
3088    uint32_t dst_layer_size =
3089       dst_layout->slices[info->dstSubresource.mipLevel].size0;
3090 
3091    uint32_t src_image_offset =
3092       fdl_surface_offset(src_layout,
3093                          info->srcSubresource.mipLevel,
3094                          src_start_layer);
3095    uint32_t dst_image_offset =
3096       fdl_surface_offset(dst_layout,
3097                          info->dstSubresource.mipLevel,
3098                          dst_start_layer);
3099 
3100    bool src_tiled =
3101       fdl_tile_mode(src_layout, info->srcSubresource.mipLevel) != 0;
3102    bool dst_tiled =
3103       fdl_tile_mode(dst_layout, info->dstSubresource.mipLevel) != 0;
3104 
3105    const char *src = (const char *) src_image->map + src_image_offset;
3106    char *dst = (char *) dst_image->map + dst_image_offset;
3107    for (unsigned layer = 0; layer < layers_to_copy; layer++,
3108         src += src_layer_stride, dst += dst_layer_stride) {
3109       if (src_image->bo->cached_non_coherent) {
3110          tu_bo_sync_cache(device, src_image->bo,
3111                           src_image->bo_offset + src_image_offset,
3112                           src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
3113       }
3114 
3115       uint32_t src_pitch = fdl_pitch(src_layout,
3116                                      info->srcSubresource.mipLevel);
3117       uint32_t dst_pitch = fdl_pitch(dst_layout,
3118                                      info->dstSubresource.mipLevel);
3119 
3120       if (copy_memcpy) {
3121          assert(src_layer_size == dst_layer_size);
3122          memcpy(dst, src, src_layer_size);
3123       } else if (!src_tiled && !dst_tiled) {
3124          for (unsigned y = 0; y < extent.height; y++) {
3125             memcpy(dst + dst_pitch * (y + dst_offset.y) + dst_offset.x * dst_layout->cpp,
3126                    src + src_pitch * (y + src_offset.y) + src_offset.x * src_layout->cpp,
3127                    extent.width * src_layout->cpp);
3128          }
3129       } else if (!src_tiled) {
3130          fdl6_memcpy_linear_to_tiled(dst_offset.x, dst_offset.y,
3131                                      extent.width, extent.height,
3132                                      dst,
3133                                      src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp,
3134                                      dst_layout,
3135                                      info->dstSubresource.mipLevel,
3136                                      src_pitch,
3137                                      &device->physical_device->ubwc_config);
3138       } else if (!dst_tiled) {
3139          fdl6_memcpy_tiled_to_linear(src_offset.x, src_offset.y,
3140                                      extent.width, extent.height,
3141                                      dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp,
3142                                      src,
3143                                      src_layout,
3144                                      info->srcSubresource.mipLevel,
3145                                      dst_pitch,
3146                                      &device->physical_device->ubwc_config);
3147       } else {
3148          /* Work tile-by-tile, holding the unswizzled tile in a temporary
3149           * buffer.
3150           */
3151          char temp_tile[256];
3152 
3153          uint32_t block_width, block_height;
3154          fdl6_get_ubwc_blockwidth(src_layout, &block_width, &block_height);
3155 
3156          uint32_t temp_pitch = block_width * src_layout->cpp;
3157 
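         /* temp_tile holds a single block's worth of texels in linear layout,
          * so its pitch is one block width; each iteration below detiles one
          * block from the source and retiles it into the destination.
          */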
3158          for (unsigned by = src_offset.y / block_height;
3159               by * block_height < src_offset.y + extent.height; by++) {
3160             uint32_t src_y_start = MAX2(src_offset.y, by * block_height);
3161             uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y;
3162             uint32_t height =
3163                MIN2((by + 1) * block_height, src_offset.y + extent.height) -
3164                src_y_start;
3165             for (unsigned bx = src_offset.x / block_width;
3166                  bx * block_width < src_offset.x + extent.width; bx++) {
3167                uint32_t src_x_start = MAX2(src_offset.x, bx * block_width);
3168                uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x;
3169                uint32_t width =
3170                   MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
3171                   src_x_start;
3172 
3173                fdl6_memcpy_tiled_to_linear(src_x_start, src_y_start,
3174                                            width, height,
3175                                            temp_tile, src, src_layout,
3176                                            info->srcSubresource.mipLevel,
3177                                            temp_pitch,
3178                                            &device->physical_device->ubwc_config);
3179                fdl6_memcpy_linear_to_tiled(dst_x_start, dst_y_start,
3180                                            width, height,
3181                                            dst, temp_tile, dst_layout,
3182                                            info->dstSubresource.mipLevel,
3183                                            temp_pitch,
3184                                            &device->physical_device->ubwc_config);
3185             }
3186          }
3187       }
3188 
3189       if (dst_image->bo->cached_non_coherent) {
3190          tu_bo_sync_cache(device, dst_image->bo,
3191                           dst_image->bo_offset + dst_image_offset,
3192                           dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
3193       }
3194    }
3195 }
3196 
3197 VKAPI_ATTR VkResult VKAPI_CALL
3198 tu_CopyImageToImageEXT(VkDevice _device,
3199                        const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo)
3200 {
3201    VK_FROM_HANDLE(tu_device, device, _device);
3202    VK_FROM_HANDLE(tu_image, src_image, pCopyImageToImageInfo->srcImage);
3203    VK_FROM_HANDLE(tu_image, dst_image, pCopyImageToImageInfo->dstImage);
3204    bool copy_memcpy = pCopyImageToImageInfo->flags &
3205       VK_HOST_IMAGE_COPY_MEMCPY_EXT;
3206 
3207    for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) {
3208       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3209          VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i];
3210          u_foreach_bit(b, info.dstSubresource.aspectMask) {
3211             info.srcSubresource.aspectMask = BIT(b);
3212             info.dstSubresource.aspectMask = BIT(b);
3213             tu_copy_image_to_image_cpu(device, src_image, dst_image, &info,
3214                                        copy_memcpy);
3215          }
3216          continue;
3217       }
3218 
3219       tu_copy_image_to_image_cpu(device, src_image, dst_image,
3220                                  pCopyImageToImageInfo->pRegions + i,
3221                                  copy_memcpy);
3222    }
3223 
3224    if (dst_image->lrz_layout.lrz_total_size) {
3225       TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
3226    }
3227 
3228    return VK_SUCCESS;
3229 }
3230 
3231 VKAPI_ATTR VkResult VKAPI_CALL
3232 tu_TransitionImageLayoutEXT(VkDevice device,
3233                             uint32_t transitionCount,
3234                             const VkHostImageLayoutTransitionInfoEXT *transitions)
3235 {
3236    /* We don't do anything with layouts so this should be a no-op */
3237    return VK_SUCCESS;
3238 }
3239 
3240 template <chip CHIP>
3241 static void
3242 copy_buffer(struct tu_cmd_buffer *cmd,
3243             uint64_t dst_va,
3244             uint64_t src_va,
3245             uint64_t size,
3246             uint32_t block_size,
3247             bool *unaligned_store)
3248 {
3249    const struct blit_ops *ops = &r2d_ops<CHIP>;
3250    struct tu_cs *cs = &cmd->cs;
3251    enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
3252    uint64_t blocks = size / block_size;
3253 
3254    handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
3255 
3256    ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
3257               VK_SAMPLE_COUNT_1_BIT);
3258 
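   /* Treat both buffers as one-texel-high images of block_size-sized texels.
    * The blit base addresses must be 64-byte aligned, so round them down and
    * offset the coordinates by the remainder; a single blit is limited to
    * 0x4000 texels, hence the loop.
    */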
3259    while (blocks) {
3260       uint32_t src_x = (src_va & 63) / block_size;
3261       uint32_t dst_x = (dst_va & 63) / block_size;
3262       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
3263 
3264       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
3265       ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
3266       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
3267       ops->run(cmd, cs);
3268 
3269       src_va += width * block_size;
3270       dst_va += width * block_size;
3271       blocks -= width;
3272    }
3273 
3274    ops->teardown(cmd, cs);
3275 }
3276 
3277 template <chip CHIP>
3278 VKAPI_ATTR void VKAPI_CALL
3279 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
3280                   const VkCopyBufferInfo2 *pCopyBufferInfo)
3281 {
3282    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3283    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
3284    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
3285 
3286    bool unaligned_store = false;
3287    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
3288       const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
3289       copy_buffer<CHIP>(cmd,
3290                   dst_buffer->iova + region->dstOffset,
3291                   src_buffer->iova + region->srcOffset,
3292                   region->size, 1, &unaligned_store);
3293    }
3294 
3295    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3296 }
3297 TU_GENX(tu_CmdCopyBuffer2);
3298 
3299 template <chip CHIP>
3300 VKAPI_ATTR void VKAPI_CALL
3301 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
3302                    VkBuffer dstBuffer,
3303                    VkDeviceSize dstOffset,
3304                    VkDeviceSize dataSize,
3305                    const void *pData)
3306 {
3307    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3308    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3309 
3310    struct tu_cs_memory tmp;
3311    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
3312    if (result != VK_SUCCESS) {
3313       vk_command_buffer_set_error(&cmd->vk, result);
3314       return;
3315    }
3316 
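   /* Stage the data in command stream memory, then reuse the buffer copy path
    * with a 4-byte block size (dstOffset and dataSize are 4-byte aligned).
    */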
3317    bool unaligned_store = false;
3318    memcpy(tmp.map, pData, dataSize);
3319    copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
3320 
3321    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3322 }
3323 TU_GENX(tu_CmdUpdateBuffer);
3324 
3325 template <chip CHIP>
3326 static void
3327 tu_cmd_fill_buffer(VkCommandBuffer commandBuffer,
3328                    VkDeviceAddress dstAddr,
3329                    VkDeviceSize fillSize,
3330                    uint32_t data)
3331 {
3332    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3333    const struct blit_ops *ops = &r2d_ops<CHIP>;
3334    struct tu_cs *cs = &cmd->cs;
3335 
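   /* Fills are done in 32-bit words; when fillSize comes from VK_WHOLE_SIZE it
    * is rounded down to a whole word by the integer division below.
    */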
3336    uint32_t blocks = fillSize / 4;
3337 
3338    bool unaligned_store = false;
3339    handle_buffer_unaligned_store<CHIP>(cmd, dstAddr, fillSize, &unaligned_store);
3340 
3341    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
3342               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
3343               VK_SAMPLE_COUNT_1_BIT);
3344 
3345    VkClearValue clear_val = {};
3346    clear_val.color.uint32[0] = data;
3347    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
3348 
3349    while (blocks) {
3350       uint32_t dst_x = (dstAddr & 63) / 4;
3351       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
3352 
3353       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dstAddr & ~63, 0, PIPE_FORMAT_R32_UINT);
3354       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
3355       ops->run(cmd, cs);
3356 
3357       dstAddr += width * 4;
3358       blocks -= width;
3359    }
3360 
3361    ops->teardown(cmd, cs);
3362 
3363    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3364 }
3365 
3366 void
3367 tu_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer,
3368                         VkDeviceAddress dstAddr,
3369                         VkDeviceSize fillSize,
3370                         uint32_t data)
3371 {
3372    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3373 
3374    TU_CALLX(cmd->device, tu_cmd_fill_buffer)(commandBuffer, dstAddr, fillSize,
3375                                              data);
3376 }
3377 
3378 template <chip CHIP>
3379 VKAPI_ATTR void VKAPI_CALL
3380 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
3381                  VkBuffer dstBuffer,
3382                  VkDeviceSize dstOffset,
3383                  VkDeviceSize fillSize,
3384                  uint32_t data)
3385 {
3386    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3387 
3388    fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
3389 
3390    VkDeviceAddress dst_va = buffer->iova + dstOffset;
3391 
3392    tu_cmd_fill_buffer<CHIP>(commandBuffer, dst_va, fillSize, data);
3393 }
3394 TU_GENX(tu_CmdFillBuffer);
3395 
3396 template <chip CHIP>
3397 VKAPI_ATTR void VKAPI_CALL
3398 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
3399                     const VkResolveImageInfo2 *pResolveImageInfo)
3400 {
3401    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3402    VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
3403    VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
3404    const struct blit_ops *ops = &r2d_ops<CHIP>;
3405    struct tu_cs *cs = &cmd->cs;
3406 
3407    enum pipe_format src_format =
3408       vk_format_to_pipe_format(src_image->vk.format);
3409    enum pipe_format dst_format =
3410       vk_format_to_pipe_format(dst_image->vk.format);
3411    ops->setup(cmd, cs, src_format, dst_format,
3412               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
3413               VK_SAMPLE_COUNT_1_BIT);
3414 
3415    for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
3416       const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
3417       uint32_t layers = MAX2(info->extent.depth,
3418                              vk_image_subresource_layer_count(&dst_image->vk,
3419                                                               &info->dstSubresource));
3420 
3421       /* TODO: are other aspect masks possible? */
3422 
3423       coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
3424 
3425       struct fdl6_view dst, src;
3426       tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
3427       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
3428 
3429       for (uint32_t i = 0; i < layers; i++) {
3430          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
3431          ops->dst(cs, &dst, i, src_format);
3432          ops->run(cmd, cs);
3433       }
3434    }
3435 
3436    ops->teardown(cmd, cs);
3437 }
3438 TU_GENX(tu_CmdResolveImage2);
3439 
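/* Iterate either the set bits of layer_mask (multiview) or 0..layers-1 when no
 * mask is given.
 */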
3440 #define for_each_layer(layer, layer_mask, layers) \
3441    for (uint32_t layer = 0; \
3442         layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
3443         layer++) \
3444       if (!layer_mask || (layer_mask & BIT(layer)))
3445 
3446 template <chip CHIP>
3447 static void
3448 resolve_sysmem(struct tu_cmd_buffer *cmd,
3449                struct tu_cs *cs,
3450                VkFormat vk_src_format,
3451                VkFormat vk_dst_format,
3452                const struct tu_image_view *src,
3453                const struct tu_image_view *dst,
3454                uint32_t layer_mask,
3455                uint32_t layers,
3456                const VkRect2D *rect,
3457                bool src_separate_ds,
3458                bool dst_separate_ds)
3459 {
3460    const struct blit_ops *ops = &r2d_ops<CHIP>;
3461 
3462    trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3463 
3464    enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3465    enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3466 
3467    ops->setup(cmd, cs, src_format, dst_format,
3468               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3469               VK_SAMPLE_COUNT_1_BIT);
3470    ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3471 
3472    for_each_layer(i, layer_mask, layers) {
3473       if (src_separate_ds) {
3474          if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3475             r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3476          } else {
3477             r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3478          }
3479       } else {
3480          ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3481       }
3482 
3483       if (dst_separate_ds) {
3484          if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3485             ops->dst_depth(cs, dst, i);
3486          } else {
3487             ops->dst_stencil(cs, dst, i);
3488          }
3489       } else {
3490          ops->dst(cs, &dst->view, i, src_format);
3491       }
3492 
3493       ops->run(cmd, cs);
3494    }
3495 
3496    ops->teardown(cmd, cs);
3497 
3498    trace_end_sysmem_resolve(&cmd->trace, cs);
3499 }
3500 
3501 template <chip CHIP>
3502 void
3503 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3504                   struct tu_cs *cs,
3505                   const struct tu_image_view *src,
3506                   const struct tu_image_view *dst,
3507                   uint32_t layer_mask,
3508                   uint32_t layers,
3509                   const VkRect2D *rect)
3510 {
3511    assert(src->vk.format == dst->vk.format ||
3512           (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3513            vk_format_is_depth_or_stencil(dst->image->vk.format)));
3514 
3515    bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3516    bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3517 
3518    if (dst_separate_ds) {
3519       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3520                      src, dst, layer_mask, layers, rect,
3521                      src_separate_ds, dst_separate_ds);
3522       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3523                      src, dst, layer_mask, layers, rect,
3524                      src_separate_ds, dst_separate_ds);
3525    } else {
3526       resolve_sysmem<CHIP>(cmd, cs, src->vk.format, dst->vk.format,
3527                      src, dst, layer_mask, layers, rect,
3528                      src_separate_ds, dst_separate_ds);
3529    }
3530 }
3531 TU_GENX(tu_resolve_sysmem);
3532 
3533 template <chip CHIP>
3534 static uint32_t
3535 tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
3536                                 VkFormat format)
3537 {
3538    /* Resolve groups are not usable on a6xx, so no pending resolve is
3539     * established. The default value of 0 is returned as the buffer ID.
3540     */
3541    if (CHIP == A6XX)
3542       return 0;
3543 
3544    resolve_group->pending_resolves = true;
3545 
3546    assert(format != VK_FORMAT_D32_SFLOAT_S8_UINT);
3547    /* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
3548     * whether depth, stencil or both are being resolved.
3549     */
3550    if (vk_format_has_depth(format))
3551       return 0x8;
3552    if (vk_format_has_stencil(format))
3553       return 0x9;
3554 
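   /* Color attachments cycle through the eight color buffer IDs; depth and
    * stencil use the dedicated IDs 0x8 and 0x9 above.
    */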
3555    const uint32_t max_color_buffers = 8;
3556    uint32_t buffer_id = resolve_group->color_buffer_id++;
3557    return buffer_id % max_color_buffers;
3558 }
3559 
3560 template <chip CHIP>
3561 void
3562 tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
3563                           struct tu_cs *cs,
3564                           struct tu_resolve_group *resolve_group)
3565 {
3566    /* Resolve groups are not usable on A6XX, so that template instantiation
3567     * should behave as a no-op.
3568     */
3569    if (CHIP == A6XX || !resolve_group->pending_resolves)
3570       return;
3571 
3572    resolve_group->color_buffer_id = 0;
3573    resolve_group->pending_resolves = false;
3574 
3575    tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
3576 }
3577 TU_GENX(tu_emit_resolve_group);
3578 
3579 template <chip CHIP>
3580 static void
3581 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3582                     struct tu_image *image,
3583                     const VkClearValue *clear_value,
3584                     const VkImageSubresourceRange *range,
3585                     VkImageAspectFlags aspect_mask)
3586 {
3587    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3588    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3589    struct tu_cs *cs = &cmd->cs;
3590    enum pipe_format format;
3591    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3592       format = PIPE_FORMAT_R32_UINT;
3593    } else {
3594       format = tu_aspects_to_plane(image->vk.format, aspect_mask);
3595    }
3596 
3597    if (image->layout[0].depth0 > 1) {
3598       assert(layer_count == 1);
3599       assert(range->baseArrayLayer == 0);
3600    }
3601 
3602    const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3603 
3604    ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3605               (VkSampleCountFlagBits) image->layout[0].nr_samples);
3606    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3607       ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3608    else
3609       ops->clear_value(cmd, cs, format, clear_value);
3610 
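   /* For 3D images the entire level is cleared, so layer_count is recomputed
    * per level from the minified depth rather than taken from the range.
    */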
3611    for (unsigned j = 0; j < level_count; j++) {
3612       if (image->layout[0].depth0 > 1)
3613          layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3614 
3615       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3616                      u_minify(image->layout[0].width0, range->baseMipLevel + j),
3617                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
3618                   });
3619 
3620       struct fdl6_view dst;
3621       const VkImageSubresourceLayers subresource = {
3622          .aspectMask = aspect_mask,
3623          .mipLevel = range->baseMipLevel + j,
3624          .baseArrayLayer = range->baseArrayLayer,
3625          .layerCount = 1,
3626       };
3627       tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3628 
3629       for (uint32_t i = 0; i < layer_count; i++) {
3630          ops->dst(cs, &dst, i, format);
3631          ops->run(cmd, cs);
3632       }
3633    }
3634 
3635    ops->teardown(cmd, cs);
3636 }
3637 
3638 static void
3639 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3640                        struct tu_image *image,
3641                        uint32_t buffer_id,
3642                        const VkClearValue *clear_value,
3643                        const VkImageSubresourceRange *range,
3644                        VkImageAspectFlags aspect_mask)
3645 {
3646    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3647    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3648    VkFormat vk_format = image->vk.format;
3649    if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3650       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3651          vk_format = VK_FORMAT_S8_UINT;
3652       else
3653          vk_format = VK_FORMAT_D32_SFLOAT;
3654    }
3655 
3656    enum pipe_format format = vk_format_to_pipe_format(vk_format);
3657 
3658    if (image->layout[0].depth0 > 1) {
3659       assert(layer_count == 1);
3660       assert(range->baseArrayLayer == 0);
3661    }
3662 
3663    struct tu_cs *cs = &cmd->cs;
3664 
3665    tu_cs_emit_regs(cs,
3666                    A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3667 
3668    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3669    tu_cs_emit(cs, 0);
3670 
3671    tu_cs_emit_regs(
3672       cs, A6XX_RB_BLIT_INFO(
3673                 .type = BLIT_EVENT_CLEAR,
3674                 .sample_0 = vk_format_is_int(vk_format) ||
3675                             vk_format_is_depth_or_stencil(vk_format),
3676                 .depth = vk_format_is_depth_or_stencil(vk_format),
3677                 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
3678                 .buffer_id = buffer_id));
3679 
3680    uint32_t clear_vals[4] = {};
3681    pack_blit_event_clear_value(clear_value, format, clear_vals);
3682    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3683    tu_cs_emit_array(cs, clear_vals, 4);
3684 
3685    for (unsigned level = 0; level < level_count; level++) {
3686       if (image->layout[0].depth0 > 1)
3687          layer_count =
3688             u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3689 
3690       uint32_t width =
3691          u_minify(image->layout[0].width0, range->baseMipLevel + level);
3692       uint32_t height =
3693          u_minify(image->layout[0].height0, range->baseMipLevel + level);
3694       tu_cs_emit_regs(
3695          cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3696          A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3697 
3698       struct fdl6_view dst;
3699       const VkImageSubresourceLayers subresource = {
3700          .aspectMask = aspect_mask,
3701          .mipLevel = range->baseMipLevel + level,
3702          .baseArrayLayer = range->baseArrayLayer,
3703          .layerCount = 1,
3704       };
3705       tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3706 
3707       for (uint32_t layer = 0; layer < layer_count; layer++) {
3708 
3709          struct event_blit_dst_view blt_view = {
3710             .image = image,
3711             .view = &dst,
3712             .layer = layer,
3713          };
3714 
3715          if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3716             uint32_t real_level = range->baseMipLevel + level;
3717             uint32_t real_layer = range->baseArrayLayer + layer;
3718             if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3719                struct fdl_layout *layout = &image->layout[0];
3720                blt_view.depth_addr =
3721                   image->iova +
3722                   fdl_surface_offset(layout, real_level, real_layer);
3723                blt_view.depth_pitch = fdl_pitch(layout, real_level);
3724             } else {
3725                struct fdl_layout *layout = &image->layout[1];
3726                blt_view.stencil_addr =
3727                   image->iova +
3728                   fdl_surface_offset(layout, real_level, real_layer);
3729                blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3730             }
3731          }
3732 
3733          event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3734                               aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3735       }
3736    }
3737 }
3738 
3739 static bool
3740 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3741                                   struct tu_image *image)
3742 {
3743    const struct fd_dev_info *info = cmd->device->physical_device->info;
3744    return info->a7xx.has_generic_clear &&
3745           /* A7XX supports R9G9B9E5_FLOAT as a color attachment and supports
3746            * generic clears for it. A7XX TODO: allow R9G9B9E5_FLOAT
3747            * attachments.
3748            */
3749           image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 &&
3750           /* Clearing VK_FORMAT_R8G8_* images with a fast-clear value at
3751            * certain dimensions (e.g. 960x540), followed by a GMEM renderpass,
3752            * may lead to a GPU fault on A7XX.
3753            */
3754           !(info->a7xx.r8g8_faulty_fast_clear_quirk && image_is_r8g8(image));
3755 }
3756 
3757 template <chip CHIP>
3758 static void
3759 clear_image(struct tu_cmd_buffer *cmd,
3760             struct tu_image *image,
3761             uint32_t buffer_id,
3762             const VkClearValue *clear_value,
3763             const VkImageSubresourceRange *range,
3764             VkImageAspectFlags aspect_mask)
3765 {
3766    if (use_generic_clear_for_image_clear(cmd, image)) {
3767       clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
3768    } else {
3769       clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3770    }
3771 }
3772 
3773 template <chip CHIP>
3774 VKAPI_ATTR void VKAPI_CALL
3775 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3776                       VkImage image_h,
3777                       VkImageLayout imageLayout,
3778                       const VkClearColorValue *pColor,
3779                       uint32_t rangeCount,
3780                       const VkImageSubresourceRange *pRanges)
3781 {
3782    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3783    VK_FROM_HANDLE(tu_image, image, image_h);
3784 
3785    bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3786    if (use_generic_clear) {
3787       /* Generic clear doesn't go through CCU (or other caches). */
3788       cmd->state.cache.flush_bits |=
3789          TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3790       tu_emit_cache_flush<CHIP>(cmd);
3791    }
3792 
3793    struct tu_resolve_group resolve_group = {};
3794 
3795    for (unsigned i = 0; i < rangeCount; i++) {
3796       uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, image->vk.format);
3797       clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3798    }
3799 
3800    tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3801    if (use_generic_clear) {
3802       /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3803        * proceed only after the just-emitted generic clears are complete.
3804        */
3805       cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3806       tu_emit_cache_flush<CHIP>(cmd);
3807    }
3808 }
3809 TU_GENX(tu_CmdClearColorImage);
3810 
3811 template <chip CHIP>
3812 VKAPI_ATTR void VKAPI_CALL
3813 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3814                              VkImage image_h,
3815                              VkImageLayout imageLayout,
3816                              const VkClearDepthStencilValue *pDepthStencil,
3817                              uint32_t rangeCount,
3818                              const VkImageSubresourceRange *pRanges)
3819 {
3820    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3821    VK_FROM_HANDLE(tu_image, image, image_h);
3822 
3823    bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3824    if (use_generic_clear) {
3825       /* Generic clear doesn't go through CCU (or other caches). */
3826       cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3827                                      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3828                                      TU_CMD_FLAG_WAIT_FOR_IDLE;
3829       tu_emit_cache_flush<CHIP>(cmd);
3830    }
3831 
3832    struct tu_resolve_group resolve_group = {};
3833 
3834    for (unsigned i = 0; i < rangeCount; i++) {
3835       const VkImageSubresourceRange *range = &pRanges[i];
3836 
3837       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3838          /* can't clear both depth and stencil at once, split up the aspect mask */
3839          u_foreach_bit(b, range->aspectMask) {
3840             uint32_t buffer_id = 0;
3841             if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
3842                buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, VK_FORMAT_D32_SFLOAT);
3843             if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
3844                buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, VK_FORMAT_S8_UINT);
3845 
3846             clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
3847          }
3848          continue;
3849       }
3850 
3851       uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, image->vk.format);
3852       clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3853    }
3854 
3855    tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3856    if (use_generic_clear) {
3857       /* This will emit CCU_RESOLVE_CLEAN, which ensures that any future resolves
3858        * proceed only after the just-emitted generic clears are complete.
3859        */
3860       cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3861       tu_emit_cache_flush<CHIP>(cmd);
3862    }
3863 
3864    tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3865 }
3866 TU_GENX(tu_CmdClearDepthStencilImage);
3867 
3868 /* CmdClearAttachments uses the original color attachment index instead of the
3869  * remapped index used by the shader, and our MRTs use the remapped
3870  * indices, so we have to remap them. We should always be able to find a
3871  * shader attachment thanks to this VU:
3872  *
3873  *    VUID-vkCmdClearAttachments-colorAttachment-09503
3874  *    "The colorAttachment member of each element of pAttachments must not
3875  *    identify a color attachment that is currently mapped to
3876  *    VK_ATTACHMENT_UNUSED in commandBuffer via
3877  *    VkRenderingAttachmentLocationInfoKHR"
3878  */
3879 static unsigned
3880 remap_attachment(struct tu_cmd_buffer *cmd, unsigned a)
3881 {
3882    unsigned i = cmd->vk.dynamic_graphics_state.cal.color_map[a];
3883    assert(i != MESA_VK_ATTACHMENT_UNUSED &&
3884           "app violates VUID-vkCmdClearAttachments-colorAttachment-09503");
3885    return i;
3886 }
3887 
3888 template <chip CHIP>
3889 static void
3890 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3891                             uint32_t attachment_count,
3892                             const VkClearAttachment *attachments,
3893                             uint32_t rect_count,
3894                             const VkClearRect *rects)
3895 {
3896    /* the shader path here is special: it avoids changing MRT/etc. state */
3897    const struct tu_subpass *subpass = cmd->state.subpass;
3898    const uint32_t mrt_count = subpass->color_count;
3899    struct tu_cs *cs = &cmd->draw_cs;
3900    uint32_t clear_value[MAX_RTS][4];
3901    float z_clear_val = 0.0f;
3902    uint8_t s_clear_val = 0;
3903    uint32_t clear_rts = 0, clear_components = 0;
3904    bool z_clear = false;
3905    bool s_clear = false;
3906 
3907    trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3908 
3909    for (uint32_t i = 0; i < attachment_count; i++) {
3910       uint32_t a;
3911       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3912          uint32_t c = attachments[i].colorAttachment;
3913          a = subpass->color_attachments[c].attachment;
3914          if (a == VK_ATTACHMENT_UNUSED)
3915             continue;
3916 
3917          uint32_t remapped = remap_attachment(cmd, c);
3918          clear_rts |= 1 << remapped;
3919          clear_components |= 0xf << (remapped * 4);
3920          memcpy(clear_value[remapped], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3921       } else {
3922          a = subpass->depth_stencil_attachment.attachment;
3923          if (a == VK_ATTACHMENT_UNUSED)
3924             continue;
3925 
3926          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3927             z_clear = true;
3928             z_clear_val = attachments[i].clearValue.depthStencil.depth;
3929          }
3930 
3931          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3932             s_clear = true;
3933             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3934          }
3935       }
3936    }
3937 
3938    /* We may not know the multisample count if there are no attachments, so
3939     * just bail early to avoid corner cases later.
3940     */
3941    if (clear_rts == 0 && !z_clear && !s_clear)
3942       return;
3943 
3944    /* disable all draw states so they don't interfere.
3945     * TODO: use and re-use draw states.
3946     * We have to disable draw states individually to preserve
3947     * input attachment states, because a secondary command buffer
3948     * won't be able to restore them.
3949     */
3950    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3951    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3952       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3953           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3954          continue;
3955       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3956                      CP_SET_DRAW_STATE__0_DISABLE);
3957       tu_cs_emit_qw(cs, 0);
3958    }
3959    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3960 
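   /* The clear shader only writes color: 0xfc is the "invalid regid" encoding,
    * so no depth or sample-mask output register is bound here.
    */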
3961    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3962    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3963                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3964                   0xfc000000);
3965    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3966 
3967    r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3968 
3969    /* Disable sample counting in order to not affect occlusion query. */
3970    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3971 
3972    if (cmd->state.prim_generated_query_running_before_rp) {
3973       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3974    }
3975 
3976    tu_cs_emit_regs(cs,
3977                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3978    tu_cs_emit_regs(cs,
3979                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3980 
3981    tu_cs_emit_regs(cs,
3982                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3983 
3984    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3985    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3986    for (uint32_t i = 0; i < mrt_count; i++) {
3987       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3988             .component_enable = COND(clear_rts & (1 << i), 0xf)));
3989    }
3990 
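   /* Disable LRZ testing/writing for the clear quad, which replaces depth
    * unconditionally (FUNC_ALWAYS) and so bypasses the normal LRZ path.
    */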
3991    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3992    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3993 
3994    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3995    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3996          .z_test_enable = z_clear,
3997          .z_write_enable = z_clear,
3998          .zfunc = FUNC_ALWAYS));
3999    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
4000    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
4001    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
4002          .stencil_enable = s_clear,
4003          .func = FUNC_ALWAYS,
4004          .zpass = STENCIL_REPLACE));
4005    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
4006    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
4007    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
4008    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
4009 
4010    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
4011 
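   /* Pack the clear colors contiguously in remapped-RT order; they are
    * uploaded below via tu6_emit_blit_consts_load() as FS constants for the
    * clear shader.
    */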
4012    unsigned num_rts = util_bitcount(clear_rts);
4013    uint32_t packed_clear_value[MAX_RTS][4];
4014 
4015    uint32_t idx = 0;
4016    u_foreach_bit(b, clear_rts) {
4017       memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
4018       idx++;
4019    }
4020 
4021    if (num_rts > 0)
4022       tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
4023                                 0, packed_clear_value, num_rts);
4024 
4025    for (uint32_t i = 0; i < rect_count; i++) {
4026       /* This should be true because of this valid usage for
4027        * vkCmdClearAttachments:
4028        *
4029        *    "If the render pass instance this is recorded in uses multiview,
4030        *    then baseArrayLayer must be zero and layerCount must be one"
4031        */
4032       assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
4033 
4034       /* a630 doesn't support multiview masks, which means that we can't use
4035        * the normal multiview path without potentially recompiling a shader
4036        * on-demand or using a more complicated variant that takes the mask as
4037        * a const. Just use the layered path instead, since it shouldn't be
4038        * much worse.
4039        */
4040       for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
4041       {
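         /* Quad coords: (x1, y1)..(x2, y2) with the clear depth in z; the
          * destination layer index is passed in w as raw float bits (uif()).
          */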
4042          const float coords[] = {
4043             rects[i].rect.offset.x,
4044             rects[i].rect.offset.y,
4045             z_clear_val,
4046             uif(rects[i].baseArrayLayer + layer),
4047             rects[i].rect.offset.x + rects[i].rect.extent.width,
4048             rects[i].rect.offset.y + rects[i].rect.extent.height,
4049             z_clear_val,
4050             1.0f,
4051          };
4052 
4053          r3d_coords_raw(cmd, cs, coords);
4054          r3d_run_vis(cmd, cs);
4055       }
4056    }
4057 
4058    /* Re-enable sample counting. */
4059    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
4060 
4061    if (cmd->state.prim_generated_query_running_before_rp) {
4062       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
4063    }
4064 
4065    trace_end_sysmem_clear_all(&cmd->trace, cs);
4066 }
4067 
4068 template <chip CHIP>
4069 static void
4070 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4071                       struct tu_cs *cs,
4072                       uint32_t buffer_id,
4073                       enum pipe_format format,
4074                       uint8_t clear_mask,
4075                       uint32_t gmem_offset,
4076                       const VkClearValue *value)
4077 {
4078    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
4079    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
4080             blit_base_format<CHIP>(format, false, true)));
4081 
4082    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
4083                                          .clear_mask = clear_mask,
4084                                          .buffer_id = buffer_id));
4085 
4086    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
4087    tu_cs_emit(cs, gmem_offset);
4088 
4089    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
4090    tu_cs_emit(cs, 0);
4091 
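   /* Pack the clear value in the attachment's format and kick off the clear
    * with an FD_BLIT event; the event writes into GMEM at gmem_offset.
    */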
4092    uint32_t clear_vals[4] = {};
4093    pack_blit_event_clear_value(value, format, clear_vals);
4094 
4095    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4096    tu_cs_emit_array(cs, clear_vals, 4);
4097 
4098    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
4099 }
4100 
4101 template <chip CHIP>
4102 static void
4103 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4104                               struct tu_cs *cs,
4105                               struct tu_resolve_group *resolve_group,
4106                               uint32_t attachment,
4107                               uint32_t base_layer,
4108                               uint32_t layers,
4109                               uint32_t layer_mask,
4110                               VkImageAspectFlags mask,
4111                               const VkClearValue *value)
4112 {
4113    const struct tu_render_pass_attachment *att =
4114       &cmd->state.pass->attachments[attachment];
4115 
4116    trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
4117 
4118    tu_cs_emit_regs(cs,
4119                    A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
4120 
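   /* D32_SFLOAT_S8_UINT lives in GMEM as two separate buffers (Z32 and S8),
    * so each requested aspect is cleared at its own GMEM offset.
    */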
4121    enum pipe_format format = vk_format_to_pipe_format(att->format);
4122    for_each_layer(i, layer_mask, layers) {
4123       uint32_t layer = i + base_layer;
4124       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4125          if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4126             uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, VK_FORMAT_D32_SFLOAT);
4127             clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
4128                                   tu_attachment_gmem_offset(cmd, att, layer), value);
4129          }
4130          if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4131             uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, VK_FORMAT_S8_UINT);
4132             clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
4133                                   tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
4134          }
4135       } else {
4136          uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, att->format);
4137          clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, format, aspect_write_mask(format, mask),
4138                                tu_attachment_gmem_offset(cmd, att, layer), value);
4139       }
4140    }
4141 
4142    tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4143 
4144    trace_end_gmem_clear(&cmd->trace, cs);
4145 }
4146 
4147 template <chip CHIP>
4148 static void
4149 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
4150                           uint32_t attachment_count,
4151                           const VkClearAttachment *attachments,
4152                           uint32_t rect_count,
4153                           const VkClearRect *rects)
4154 {
4155    const struct tu_subpass *subpass = cmd->state.subpass;
4156    struct tu_cs *cs = &cmd->draw_cs;
4157 
4158    if (rect_count > 1)
4159       perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
4160 
4161    struct tu_resolve_group resolve_group = {};
4162 
4163    for (unsigned i = 0; i < rect_count; i++) {
4164       unsigned x1 = rects[i].rect.offset.x;
4165       unsigned y1 = rects[i].rect.offset.y;
4166       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
4167       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
4168 
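      /* The blit scissor limits the GMEM clear events below to this rect
       * (coordinates are inclusive, hence the -1 above).
       */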
4169       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4170       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4171       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4172 
4173       for (unsigned j = 0; j < attachment_count; j++) {
4174          uint32_t a;
4175          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
4176             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
4177          else
4178             a = subpass->depth_stencil_attachment.attachment;
4179 
4180          if (a == VK_ATTACHMENT_UNUSED)
4181             continue;
4182 
4183          tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
4184                                        rects[i].baseArrayLayer,
4185                                        rects[i].layerCount,
4186                                        subpass->multiview_mask,
4187                                        attachments[j].aspectMask,
4188                                        &attachments[j].clearValue);
4189       }
4190    }
4191 
4192    tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
4193 }
4194 
4195 template <chip CHIP>
4196 static void
4197 tu_clear_attachments(struct tu_cmd_buffer *cmd,
4198                      uint32_t attachmentCount,
4199                      const VkClearAttachment *pAttachments,
4200                      uint32_t rectCount,
4201                      const VkClearRect *pRects)
4202 {
4203    struct tu_cs *cs = &cmd->draw_cs;
4204 
4205    /* The sysmem path behaves like a draw. Note that we don't have a way of using
4206     * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
4207     */
4208    tu_emit_cache_flush_renderpass<CHIP>(cmd);
4209 
4210    /* vkCmdClearAttachments is supposed to respect the predicate if active. The
4211     * easiest way to do this is to always use the 3d path, which always works
4212     * even with GMEM because it's just a simple draw using the existing
4213     * attachment state.
4214     *
4215     * Similarly, we also use the 3D path when in a secondary command buffer that
4216     * doesn't know the GMEM layout that will be chosen by the primary.
4217     */
4218    if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
4219       tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4220       return;
4221    }
4222 
4223    /* If we could skip tile loads/stores based on whether any draws intersect them
4224     * at binning time, then emit the clear as a 3D draw so that it contributes to
4225     * that visibility.
4226     */
4227    const struct tu_subpass *subpass = cmd->state.subpass;
4228    for (uint32_t i = 0; i < attachmentCount; i++) {
4229       uint32_t a;
4230       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4231          uint32_t c = pAttachments[i].colorAttachment;
4232          a = subpass->color_attachments[c].attachment;
4233       } else {
4234          a = subpass->depth_stencil_attachment.attachment;
4235       }
4236       if (a != VK_ATTACHMENT_UNUSED) {
4237          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4238          if (att->cond_load_allowed || att->cond_store_allowed) {
4239             tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4240             return;
4241          }
4242       }
4243    }
4244 
4245    /* Otherwise, emit 2D blits for gmem rendering. */
4246    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
4247    tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4248    tu_cond_exec_end(cs);
4249 
4250    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4251    tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4252    tu_cond_exec_end(cs);
4253 }
4254 
4255 static void
4256 tu7_clear_attachment_generic_single_rect(
4257    struct tu_cmd_buffer *cmd,
4258    struct tu_cs *cs,
4259    struct tu_resolve_group *resolve_group,
4260    const struct tu_render_pass_attachment *att,
4261    const VkClearAttachment *clear_att,
4262    uint32_t a,
4263    const VkClearRect *rect)
4264 {
4265    const struct tu_subpass *subpass = cmd->state.subpass;
4266    unsigned x1 = rect->rect.offset.x;
4267    unsigned y1 = rect->rect.offset.y;
4268    unsigned x2 = x1 + rect->rect.extent.width - 1;
4269    unsigned y2 = y1 + rect->rect.extent.height - 1;
4270 
4271    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4272    tu_cs_emit(cs,
4273               A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4274    tu_cs_emit(cs,
4275               A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4276 
4277    auto value = &clear_att->clearValue;
4278 
4279    enum pipe_format format = vk_format_to_pipe_format(att->format);
4280    for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
4281       uint32_t layer = i + rect->baseArrayLayer;
4282       uint32_t mask =
4283          aspect_write_mask_generic_clear(format, clear_att->aspectMask);
4284 
4285       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4286          if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4287             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_D32_SFLOAT);
4288             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4289                                     false, layer, value, a);
4290          }
4291          if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4292             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_S8_UINT);
4293             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4294                                     layer, value, a);
4295          }
4296       } else {
4297          uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, att->format);
4298          tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4299       }
4300    }
4301 }
4302 
4303 static void
4304 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
4305                              uint32_t attachmentCount,
4306                              const VkClearAttachment *pAttachments,
4307                              uint32_t rectCount,
4308                              const VkClearRect *pRects)
4309 {
4310    struct tu_cs *cs = &cmd->draw_cs;
4311 
4312    uint32_t clear_aspects = 0;
4313    for (uint32_t i = 0; i < attachmentCount; i++) {
4314       clear_aspects |= pAttachments[i].aspectMask;
4315    }
4316 
4317    /* Generic clear doesn't go through CCU (or other caches),
4318     * so we have to flush (clean+invalidate) corresponding caches.
4319     */
4320    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4321    if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
4322       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4323       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
4324    }
4325    if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4326       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4327       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
4328    }
4329    tu_cs_emit_wfi(cs);
4330    tu_cond_exec_end(cs);
4331 
4332    struct tu_resolve_group resolve_group = {};
4333 
4334    const struct tu_subpass *subpass = cmd->state.subpass;
4335    for (uint32_t i = 0; i < attachmentCount; i++) {
4336       uint32_t a;
4337       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4338          uint32_t c = pAttachments[i].colorAttachment;
4339          a = subpass->color_attachments[c].attachment;
4340       } else {
4341          a = subpass->depth_stencil_attachment.attachment;
4342       }
4343       if (a != VK_ATTACHMENT_UNUSED) {
4344          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4345          const struct tu_image_view *iview = cmd->state.attachments[a];
4346          trace_start_generic_clear(&cmd->trace, cs, att->format,
4347                                    iview->view.ubwc_enabled, att->samples);
4348          for (unsigned j = 0; j < rectCount; j++) {
4349             tu7_clear_attachment_generic_single_rect(
4350                cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
4351          }
4352          trace_end_generic_clear(&cmd->trace, cs);
4353       }
4354    }
4355 
4356    tu_emit_resolve_group<A7XX>(cmd, cs, &resolve_group);
4357 }
4358 
4359 template <chip CHIP>
4360 VKAPI_ATTR void VKAPI_CALL
4361 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
4362                        uint32_t attachmentCount,
4363                        const VkClearAttachment *pAttachments,
4364                        uint32_t rectCount,
4365                        const VkClearRect *pRects)
4366 {
4367    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4368 
4369    for (uint32_t j = 0; j < attachmentCount; j++) {
4370       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
4371          continue;
4372 
4373       tu_lrz_disable_during_renderpass<CHIP>(cmd, "CmdClearAttachments");
4374    }
4375 
4376    if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
4377        /* Both having predication and not knowing the layout could be solved
4378         * by CS patching, which is exactly what the proprietary driver does.
4379         * We don't implement it because we don't expect a meaningful impact.
4380         */
4381        !(cmd->state.predication_active ||
4382          cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
4383       tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
4384    } else {
4385       tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
4386                                  rectCount, pRects);
4387    }
4388 }
4389 TU_GENX(tu_CmdClearAttachments);
4390 
4391 template <chip CHIP>
4392 static void
4393 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4394                         struct tu_cs *cs,
4395                         VkFormat vk_format,
4396                         VkImageAspectFlags clear_mask,
4397                         uint32_t a,
4398                         bool separate_ds)
4399 {
4400    enum pipe_format format = vk_format_to_pipe_format(vk_format);
4401    const struct tu_framebuffer *fb = cmd->state.framebuffer;
4402    const struct tu_image_view *iview = cmd->state.attachments[a];
4403    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
4404    const struct blit_ops *ops = &r2d_ops<CHIP>;
4405    const VkClearValue *value = &cmd->state.clear_values[a];
4406    if (cmd->state.pass->attachments[a].samples > 1)
4407       ops = &r3d_ops<CHIP>;
4408 
4409    trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
4410                             cmd->state.pass->attachments[a].samples);
4411 
4412    ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
4413               cmd->state.pass->attachments[a].samples);
4414    ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
4415                cmd->state.render_area.extent);
4416    ops->clear_value(cmd, cs, format, value);
4417 
4418    for_each_layer(i, clear_views, fb->layers) {
4419       if (separate_ds) {
4420          if (vk_format == VK_FORMAT_D32_SFLOAT) {
4421             ops->dst_depth(cs, iview, i);
4422          } else {
4423             ops->dst_stencil(cs, iview, i);
4424          }
4425       } else {
4426          ops->dst(cs, &iview->view, i, format);
4427       }
4428       ops->run(cmd, cs);
4429    }
4430 
4431    ops->teardown(cmd, cs);
4432 
4433    trace_end_sysmem_clear(&cmd->trace, cs);
4434 }
4435 
4436 template <chip CHIP>
4437 void
4438 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4439                            struct tu_cs *cs,
4440                            uint32_t a)
4441 {
4442    const struct tu_render_pass_attachment *attachment =
4443       &cmd->state.pass->attachments[a];
4444 
4445    if (!attachment->clear_mask)
4446       return;
4447 
4448    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4449       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4450          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
4451                                  a, true);
4452       }
4453       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4454          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
4455                                  a, true);
4456       }
4457    } else {
4458       clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
4459                               a, false);
4460    }
4461 
4462    /* The spec doesn't explicitly say, but presumably the initial renderpass
4463     * clear is considered part of the renderpass, and therefore barriers
4464     * aren't required inside the subpass/renderpass.  Therefore we need to
4465     * flush CCU color into CCU depth here, just like with
4466     * vkCmdClearAttachments(). Note that because this only happens at the
4467     * beginning of a renderpass, and renderpass writes are considered
4468     * "incoherent", we shouldn't have to worry about syncing depth into color
4469     * beforehand as depth should already be flushed.
4470     */
4471    if (vk_format_is_depth_or_stencil(attachment->format)) {
4472       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4473       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
4474       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
4475    } else {
4476       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4477       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
4478    }
4479 
4480    tu_cs_emit_wfi(cs);
4481 }
4482 TU_GENX(tu_clear_sysmem_attachment);
4483 
4484 template <chip CHIP>
4485 void
4486 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4487                          struct tu_cs *cs,
4488                          struct tu_resolve_group *resolve_group,
4489                          uint32_t a)
4490 {
4491    const struct tu_render_pass_attachment *attachment =
4492       &cmd->state.pass->attachments[a];
4493 
4494    if (!attachment->clear_mask)
4495       return;
4496 
4497    tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
4498                                  cmd->state.framebuffer->layers,
4499                                  attachment->clear_views,
4500                                  attachment->clear_mask,
4501                                  &cmd->state.clear_values[a]);
4502 }
4503 TU_GENX(tu_clear_gmem_attachment);
4504 
4505 void
4506 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
4507                              struct tu_cs *cs,
4508                              struct tu_resolve_group *resolve_group,
4509                              uint32_t a)
4510 {
4511    const struct tu_render_pass_attachment *att =
4512       &cmd->state.pass->attachments[a];
4513    const VkClearValue *value = &cmd->state.clear_values[a];
4514    const struct tu_image_view *iview = cmd->state.attachments[a];
4515 
4516    trace_start_generic_clear(&cmd->trace, cs, att->format,
4517                              iview->view.ubwc_enabled, att->samples);
4518 
4519    enum pipe_format format = vk_format_to_pipe_format(att->format);
4520    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4521       uint32_t layer = i + 0;
4522       uint32_t mask =
4523          aspect_write_mask_generic_clear(format, att->clear_mask);
4524       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4525          if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4526             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_D32_SFLOAT);
4527             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4528                                     false, layer, value, a);
4529          }
4530          if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4531             uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_S8_UINT);
4532             tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4533                                     layer, value, a);
4534          }
4535       } else {
4536          uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, att->format);
4537          tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4538       }
4539    }
4540 
4541    tu_flush_for_access(&cmd->state.renderpass_cache,
4542                        TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4543 
4544    trace_end_generic_clear(&cmd->trace, cs);
4545 }
4546 
4547 template <chip CHIP>
4548 static void
4549 tu_emit_blit(struct tu_cmd_buffer *cmd,
4550              struct tu_cs *cs,
4551              struct tu_resolve_group *resolve_group,
4552              const struct tu_image_view *iview,
4553              const struct tu_render_pass_attachment *attachment,
4554              const VkClearValue *clear_value,
4555              enum a6xx_blit_event_type blit_event_type,
4556              bool separate_stencil)
4557 {
4558    assert(blit_event_type != BLIT_EVENT_CLEAR);
4559    uint32_t clear_mask = 0;
4560 
4561    /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
4562     * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
4563     */
4564    if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
4565        attachment->clear_mask &&
4566        use_generic_clear_for_image_clear(cmd, iview->image)) {
4567       blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
4568 
4569       enum pipe_format format = vk_format_to_pipe_format(attachment->format);
4570       VkImageAspectFlags aspect_mask = attachment->clear_mask;
4571       if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
4572          if (separate_stencil)
4573             aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
4574          else
4575             aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
4576       }
4577       if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
4578          if (separate_stencil)
4579             format = PIPE_FORMAT_S8_UINT;
4580          else
4581             format = PIPE_FORMAT_Z32_FLOAT;
4582       }
4583 
4584       clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
4585 
4586       uint32_t clear_vals[4] = {};
4587       pack_blit_event_clear_value(clear_value, format, clear_vals);
4588 
4589       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4590       tu_cs_emit_array(cs, clear_vals, 4);
4591    }
4592 
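   /* For separate-stencil D32S8, pick the per-aspect format so the resolve
    * group tracks depth and stencil as distinct GMEM buffers.
    */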
4593    VkFormat format = attachment->format;
4594    if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4595       format = separate_stencil ? VK_FORMAT_S8_UINT : VK_FORMAT_D32_SFLOAT;
4596    }
4597 
4598    uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, format);
4599    event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
4600 
4601    for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4602       event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4603       event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4604    }
4605 
4606    tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4607                        TU_ACCESS_NONE);
4608 }
4609 
4610 static bool
4611 blit_can_resolve(VkFormat format)
4612 {
4613    const struct util_format_description *desc = vk_format_description(format);
4614 
4615    /* The blit event can only do resolves for simple cases: averaging samples
4616     * as unsigned integers or choosing only one sample.
4617     * Note this is allowed for SRGB formats, but the results differ from a 2D draw resolve.
4618     */
4619    if (vk_format_is_snorm(format))
4620       return false;
4621 
4622    /* Can't do formats with channel sizes larger than 10 bits.
4623     * Note: this includes all float formats.
4624     * Note 2: single-channel integer formats seem OK.
4625     */
4626    if (desc->channel[0].size > 10 && vk_format_is_color(format))
4627       return false;
4628 
4629    switch (format) {
4630    /* For unknown reasons the blit event can't MSAA-resolve these formats when
4631     * tiled, likely because these formats have a different layout from other cpp=2 formats.
4632     */
4633    case VK_FORMAT_R8G8_UNORM:
4634    case VK_FORMAT_R8G8_UINT:
4635    case VK_FORMAT_R8G8_SINT:
4636    case VK_FORMAT_R8G8_SRGB:
4637       return false;
4638    default:
4639       break;
4640    }
4641 
4642    return true;
4643 }
4644 
4645 struct apply_load_coords_state {
4646    unsigned view;
4647 };
4648 
4649 static void
4650 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4651                       struct tu_cs *cs,
4652                       void *data,
4653                       VkRect2D bin,
4654                       unsigned views,
4655                       VkExtent2D *frag_areas)
4656 {
4657    const struct apply_load_coords_state *state =
4658       (const struct apply_load_coords_state *)data;
4659    assert(state->view < views);
4660    VkExtent2D frag_area = frag_areas[state->view];
4661 
4662    assert(bin.extent.width % frag_area.width == 0);
4663    assert(bin.extent.height % frag_area.height == 0);
4664    uint32_t scaled_width = bin.extent.width / frag_area.width;
4665    uint32_t scaled_height = bin.extent.height / frag_area.height;
4666 
4667    const float coords[] = {
4668       bin.offset.x,                    bin.offset.y,
4669       bin.offset.x,                    bin.offset.y,
4670       bin.offset.x + scaled_width,     bin.offset.y + scaled_height,
4671       bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4672    };
4673    r3d_coords_raw(cmd, cs, coords);
4674 }
4675 
4676 template <chip CHIP>
4677 static void
4678 load_3d_blit(struct tu_cmd_buffer *cmd,
4679              struct tu_cs *cs,
4680              const struct tu_image_view *iview,
4681              const struct tu_render_pass_attachment *att,
4682              bool separate_stencil)
4683 {
4684    const struct tu_framebuffer *fb = cmd->state.framebuffer;
4685    enum pipe_format format = iview->view.format;
4686    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4687       if (separate_stencil)
4688          format = PIPE_FORMAT_S8_UINT;
4689       else
4690          format = PIPE_FORMAT_Z32_FLOAT;
4691    }
4692    r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4693                    R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4694                    iview->image->vk.samples);
4695 
4696    if (!cmd->state.pass->has_fdm) {
4697       r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4698                  (VkExtent2D) { fb->width, fb->height });
4699    }
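   /* With FDM the coords depend on each bin's fragment area, so they are
    * emitted per-bin below via an FDM patchpoint (fdm_apply_load_coords).
    */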
4700 
4701    /* Normal loads read directly from system memory, so we have to invalidate
4702     * UCHE in case it contains stale data.
4703     */
4704    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4705 
4706    /* Wait for CACHE_INVALIDATE to land */
4707    tu_cs_emit_wfi(cs);
4708 
4709    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4710       if (cmd->state.pass->has_fdm) {
4711          struct apply_load_coords_state state = {
4712             .view = att->clear_views ? i : 0,
4713          };
4714          tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4715       }
4716 
4717       r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4718 
4719       if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4720          if (separate_stencil)
4721             r3d_src_stencil(cmd, cs, iview, i);
4722          else
4723             r3d_src_depth(cmd, cs, iview, i);
4724       } else {
4725          r3d_src_gmem_load(cmd, cs, iview, i);
4726       }
4727 
4728       r3d_run(cmd, cs);
4729    }
4730 
4731    r3d_teardown<CHIP>(cmd, cs);
4732 
4733    /* It seems we need to WFI here for depth/stencil because color writes here
4734     * aren't synchronized with depth/stencil writes.
4735     *
4736     * Note: the blob also uses a WFI for color attachments but this hasn't
4737     * been seen to be necessary.
4738     */
4739    if (vk_format_is_depth_or_stencil(att->format))
4740       tu_cs_emit_wfi(cs);
4741 }
4742 
4743 static void
4744 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4745                               struct tu_cs *cs, bool load)
4746 {
4747    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4748 
4749    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4750       return;
4751 
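   /* With TU_DEBUG(LOG_SKIP_GMEM_OPS), count how many conditional loads/stores
    * were actually executed (dbg_gmem_taken_*), to be compared against the
    * totals accumulated in tu_end_load_store_cond_exec().
    */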
4752    uint64_t result_iova;
4753    if (load)
4754       result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4755    else
4756       result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4757 
4758    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4759    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4760    tu_cs_emit_qw(cs, result_iova);
4761    tu_cs_emit_qw(cs, result_iova);
4762    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4763 }
4764 
4765 static void
4766 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4767                             struct tu_cs *cs, bool load)
4768 {
4769    tu_cond_exec_end(cs);
4770 
4771    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4772       return;
4773 
4774    uint64_t result_iova;
4775    if (load)
4776       result_iova = global_iova(cmd, dbg_gmem_total_loads);
4777    else
4778       result_iova = global_iova(cmd, dbg_gmem_total_stores);
4779 
4780    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4781    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4782    tu_cs_emit_qw(cs, result_iova);
4783    tu_cs_emit_qw(cs, result_iova);
4784    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4785 }
4786 
4787 template <chip CHIP>
4788 void
4789 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4790                         struct tu_cs *cs,
4791                         struct tu_resolve_group *resolve_group,
4792                         uint32_t a,
4793                         bool cond_exec_allowed,
4794                         bool force_load)
4795 {
4796    const struct tu_image_view *iview = cmd->state.attachments[a];
4797    const struct tu_render_pass_attachment *attachment =
4798       &cmd->state.pass->attachments[a];
4799 
4800    bool load_common = attachment->load || force_load;
4801    bool load_stencil =
4802       attachment->load_stencil ||
4803       (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4804 
4805    if (!load_common && !load_stencil)
4806       return;
4807 
4808    trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4809 
4810    /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4811     * to be only partially cleared, and since that clear is done by a 2D blit
4812     * it doesn't produce geometry, so we have to load unconditionally.
4813     *
4814     * To simplify the conditions, treat a partially cleared separate DS as
4815     * fully cleared and don't emit the cond_exec.
4816     */
4817    bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4818    if (cond_exec)
4819       tu_begin_load_store_cond_exec(cmd, cs, true);
4820 
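   /* FDM requires the 3D path because the load coords have to be patched
    * per-bin; otherwise prefer the cheaper BLIT_EVENT_LOAD event path.
    */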
4821    if (TU_DEBUG(3D_LOAD) ||
4822        cmd->state.pass->has_fdm) {
4823       if (load_common || load_stencil)
4824          tu_disable_draw_states(cmd, cs);
4825 
4826       if (load_common)
4827          load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4828 
4829       if (load_stencil)
4830          load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4831    } else {
4832       if (load_common)
4833          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4834 
4835       if (load_stencil)
4836          tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4837    }
4838 
4839    if (cond_exec)
4840       tu_end_load_store_cond_exec(cmd, cs, true);
4841 
4842    trace_end_gmem_load(&cmd->trace, cs);
4843 }
4844 TU_GENX(tu_load_gmem_attachment);
4845 
4846 template <chip CHIP>
4847 static void
4848 store_cp_blit(struct tu_cmd_buffer *cmd,
4849               struct tu_cs *cs,
4850               const struct tu_image_view *src_iview,
4851               const struct tu_image_view *dst_iview,
4852               uint32_t samples,
4853               bool separate_stencil,
4854               enum pipe_format src_format,
4855               enum pipe_format dst_format,
4856               uint32_t layer,
4857               uint32_t gmem_offset,
4858               uint32_t cpp)
4859 {
4860    r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4861                           VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4862                           dst_iview->view.ubwc_enabled, true);
4863 
4864    if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4865       if (!separate_stencil) {
4866          r2d_dst_depth(cs, dst_iview, layer);
4867       } else {
4868          r2d_dst_stencil(cs, dst_iview, layer);
4869       }
4870    } else {
4871       r2d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
4872    }
4873 
4874    /* Note: we compute the swap here instead of using the color_swap as
4875     * programmed when we set up the color attachment, because the attachment in
4876     * GMEM ignores the swap except when MUTABLEEN is enabled. If the
4877     * color attachment is linear, we need to use the identity swap even if the
4878     * original attachment has a non-identity swap.
4879     */
4880    struct tu_native_format fmt =
4881       blit_format_texture<CHIP>(src_format, TILE6_2,
4882                                 src_iview->view.is_mutable, true);
4883    enum a6xx_format format = fmt.fmt;
4884    fixup_src_format(&src_format, dst_format, &format);
4885 
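   /* The 2D blit source is the GMEM buffer itself: the base is gmem_base +
    * gmem_offset and the pitch is the tile width times the attachment cpp.
    */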
4886    tu_cs_emit_regs(cs,
4887                    SP_PS_2D_SRC_INFO(CHIP,
4888                       .color_format = format,
4889                       .tile_mode = TILE6_2,
4890                       .color_swap = fmt.swap,
4891                       .srgb = util_format_is_srgb(src_format),
4892                       .samples = tu_msaa_samples(samples),
4893                       .samples_average = !util_format_is_pure_integer(dst_format) &&
4894                                          !util_format_is_depth_or_stencil(dst_format),
4895                       .unk20 = 1,
4896                       .unk22 = 1,
4897                       .mutableen = src_iview->view.is_mutable),
4898                    SP_PS_2D_SRC_SIZE(CHIP,
4899                       .width = dst_iview->vk.extent.width,
4900                       .height = dst_iview->vk.extent.height),
4901                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4902                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4903 
4904    /* sync GMEM writes with CACHE. */
4905    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4906    if (CHIP >= A7XX)
4907       /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4908        * arising from GMEM load/clears to land before we can continue.
4909        */
4910       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4911 
4912    /* Wait for cache event to land */
4913    tu_cs_emit_wfi(cs);
4914 
4915    r2d_run(cmd, cs);
4916 
4917    /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4918     * sysmem, and we generally assume that GMEM renderpasses leave their
4919     * results in sysmem, so we need to flush manually here.
4920     */
4921    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4922 }
4923 
4924 template <chip CHIP>
4925 static void
4926 store_3d_blit(struct tu_cmd_buffer *cmd,
4927               struct tu_cs *cs,
4928               const struct tu_image_view *src_iview,
4929               const struct tu_image_view *dst_iview,
4930               VkSampleCountFlagBits dst_samples,
4931               bool separate_stencil,
4932               enum pipe_format src_format,
4933               enum pipe_format dst_format,
4934               const VkRect2D *render_area,
4935               uint32_t layer,
4936               uint32_t gmem_offset,
4937               uint32_t cpp)
4938 {
4939    /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4940     * aren't set until we know whether we're HW binning or not, and we want to
4941     * avoid a dependence on that here to be able to store attachments before
4942     * the end of the renderpass in the future. Use the scratch space to
4943     * save/restore them dynamically.
4944     */
4945    tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4946    tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4947                   CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4948                   CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4949    if (CHIP >= A7XX) {
4950       tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4951       tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4952                      CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4953                      CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4954    }
4955 
4956    r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4957                    0, false, dst_iview->view.ubwc_enabled, dst_samples);
4958 
4959    r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4960 
4961    if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4962       if (!separate_stencil) {
4963          r3d_dst_depth<CHIP>(cs, dst_iview, layer);
4964       } else {
4965          r3d_dst_stencil<CHIP>(cs, dst_iview, layer);
4966       }
4967    } else {
4968       r3d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
4969    }
4970 
4971    r3d_src_gmem<CHIP>(cmd, cs, src_iview, src_format, dst_format, gmem_offset, cpp);
4972 
4973    /* sync GMEM writes with CACHE. */
4974    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4975 
4976    /* Wait for CACHE_INVALIDATE to land */
4977    tu_cs_emit_wfi(cs);
4978 
4979    r3d_run(cmd, cs);
4980 
4981    r3d_teardown<CHIP>(cmd, cs);
4982 
4983    /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4984     * sysmem, and we generally assume that GMEM renderpasses leave their
4985     * results in sysmem, so we need to flush manually here. The 3d blit path
4986     * writes to depth images as a color RT, so there's no need to flush depth.
4987     */
4988    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4989 
4990    /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
4991    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4992    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4993                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4994                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4995 
4996    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4997    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4998                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4999                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
5000 
5001    if (CHIP >= A7XX) {
5002       tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
5003       tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
5004                         CP_SCRATCH_TO_REG_0_SCRATCH(1) |
5005                         CP_SCRATCH_TO_REG_0_CNT(1 - 1));
5006    }
5007 }
5008 
5009 static bool
5010 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
5011 {
5012    struct tu_physical_device *phys_dev = cmd->device->physical_device;
5013    const struct tu_image_view *iview = cmd->state.attachments[a];
5014    const VkRect2D *render_area = &cmd->state.render_area;
5015 
5016    /* An unaligned store is incredibly rare in CTS, so we have to force it in order to test. */
5017    if (TU_DEBUG(UNALIGNED_STORE))
5018       return true;
5019 
5020    /* We always use the unaligned store path when scaling rendering. */
5021    if (cmd->state.pass->has_fdm)
5022       return true;
5023 
5024    uint32_t x1 = render_area->offset.x;
5025    uint32_t y1 = render_area->offset.y;
5026    uint32_t x2 = x1 + render_area->extent.width;
5027    uint32_t y2 = y1 + render_area->extent.height;
5028    /* x2/y2 can be unaligned if equal to the size of the image, since it will
5029     * write into padding space. The one exception is linear levels, which don't
5030     * have the required y padding in the layout (except for the last level).
5031     */
5032    bool need_y2_align =
5033       y2 != iview->view.height || iview->view.need_y2_align;
5034 
5035    return (x1 % phys_dev->info->gmem_align_w ||
5036            (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
5037            y1 % phys_dev->info->gmem_align_h ||
5038            (y2 % phys_dev->info->gmem_align_h && need_y2_align));
5039 }
5040 
5041 /* The fast path cannot handle mismatched mutability. */
5042 static bool
5043 tu_attachment_store_mismatched_mutability(struct tu_cmd_buffer *cmd, uint32_t a,
5044                                           uint32_t gmem_a)
5045 {
5046    if (a == gmem_a)
5047       return false;
5048 
5049    const struct tu_image_view *dst_iview = cmd->state.attachments[a];
5050    const struct tu_image_view *src_iview = cmd->state.attachments[gmem_a];
5051 
5052    return dst_iview->view.is_mutable != src_iview->view.is_mutable;
5053 }
5054 
5055 /* Choose the GMEM layout (use the CCU space or not) based on what the
5056  * current attachments will need.  This has to happen at vkBeginRenderPass()
5057  * time because tu_attachment_store_unaligned() looks at the image views, which
5058  * are only available at that point.  This should match the logic for the
5059  * !use_fast_path case in tu_store_gmem_attachment().
5060  */
5061 void
5062 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
5063 {
5064    cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
5065 
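   /* Start from the full-GMEM layout and fall back to TU_GMEM_LAYOUT_AVOID_CCU
    * whenever some attachment store or resolve can't use the blit-event fast
    * path (see the comment above).
    */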
5066    for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
5067       if (!cmd->state.attachments[i])
5068          continue;
5069 
5070       struct tu_render_pass_attachment *att =
5071          &cmd->state.pass->attachments[i];
5072       if ((att->store || att->store_stencil) &&
5073           tu_attachment_store_unaligned(cmd, i))
5074          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5075       if (att->store && att->format == VK_FORMAT_S8_UINT)
5076          /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
5077           * blit events for the S8_UINT format.
5078           */
5079          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5080       if (att->will_be_resolved && !blit_can_resolve(att->format))
5081          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5082    }
5083 
5084    for (unsigned i = 0; i < cmd->state.pass->subpass_count; i++) {
5085       const struct tu_subpass *subpass = &cmd->state.pass->subpasses[i];
5086       for (unsigned j = 0; j < subpass->resolve_count; j++) {
5087          uint32_t a = subpass->resolve_attachments[j].attachment;
5088          if (a == VK_ATTACHMENT_UNUSED)
5089             continue;
5090          uint32_t gmem_a =
5091             j == subpass->color_count ?
5092                subpass->depth_stencil_attachment.attachment :
5093                subpass->color_attachments[j].attachment;
5094          if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a))
5095             cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5096       }
5097    }
5098 
5099    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
5100 }
5101 
5102 struct apply_store_coords_state {
5103    unsigned view;
5104 };
5105 
5106 static void
5107 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
5108                        struct tu_cs *cs,
5109                        void *data,
5110                        VkRect2D bin,
5111                        unsigned views,
5112                        VkExtent2D *frag_areas)
5113 {
5114    const struct apply_store_coords_state *state =
5115       (const struct apply_store_coords_state *)data;
5116    assert(state->view < views);
5117    VkExtent2D frag_area = frag_areas[state->view];
5118 
5119    /* The bin width/height must be a multiple of the frag_area to make sure
5120     * that the scaling happens correctly. This means there may be some
5121     * destination pixels that jut out of the framebuffer, but they should be
5122     * clipped by the render area.
5123     */
5124    assert(bin.extent.width % frag_area.width == 0);
5125    assert(bin.extent.height % frag_area.height == 0);
5126    uint32_t scaled_width = bin.extent.width / frag_area.width;
5127    uint32_t scaled_height = bin.extent.height / frag_area.height;
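   /* Worked example with hypothetical numbers: a 64x64 bin with a 2x2
    * fragment area reads a 32x32 source window from GMEM (the SRC registers
    * below) and stretches it over the full 64x64 destination rectangle.
    */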
5128 
5129    tu_cs_emit_regs(cs,
5130       A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
5131                           .y = bin.offset.y),
5132       A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
5133                           .y = bin.offset.y + bin.extent.height - 1));
5134    tu_cs_emit_regs(cs,
5135                    A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
5136                    A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
5137                    A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
5138                    A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
5139 }
5140 
5141 template <chip CHIP>
5142 void
5143 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
5144                          struct tu_cs *cs,
5145                          struct tu_resolve_group *resolve_group,
5146                          uint32_t a,
5147                          uint32_t gmem_a,
5148                          uint32_t layers,
5149                          uint32_t layer_mask,
5150                          bool cond_exec_allowed)
5151 {
5152    const VkRect2D *render_area = &cmd->state.render_area;
5153    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
5154    const struct tu_image_view *dst_iview = cmd->state.attachments[a];
5155    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
5156    const struct tu_image_view *src_iview = cmd->state.attachments[gmem_a];
5157    const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
5158    bool resolve = a != gmem_a;
5159    if (resolve)
5160       clear_value = NULL;
5161 
5162    if (!dst->store && !dst->store_stencil)
5163       return;
5164 
5165    bool unaligned = tu_attachment_store_unaligned(cmd, a);
5166    bool mismatched_mutability =
5167       tu_attachment_store_mismatched_mutability(cmd, a, gmem_a);
5168 
5169    /* D32_SFLOAT_S8_UINT is a rather special format: it has two planes,
5170     * one for depth and the other for stencil. When resolving an MSAA
5171     * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
5172     */
5173    bool resolve_d32s8_s8 =
5174       src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
5175       dst->format == VK_FORMAT_S8_UINT;
5176 
5177    /* The fast path doesn't support picking out the last component of a D24S8
5178     * texture reinterpreted as RGBA8_UNORM.
5179     */
5180    bool resolve_d24s8_s8 =
5181       src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
5182       dst->format == VK_FORMAT_S8_UINT;
5183 
5184    bool store_common = dst->store && !resolve_d32s8_s8;
5185    bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
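   /* For instance, in a hypothetical D32_SFLOAT_S8_UINT -> S8_UINT resolve,
    * resolve_d32s8_s8 is set, so store_common ends up false and
    * store_separate_stencil ends up true: only the stencil plane is copied
    * out of GMEM.
    */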
5186 
5187    bool use_fast_path = !unaligned && !mismatched_mutability &&
5188                         !resolve_d24s8_s8 &&
5189                         (a == gmem_a || blit_can_resolve(dst->format));
5190 
5191    trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
5192 
5193    /* The store may only be made conditional if the attachment was cleared,
5194     * which could have happened either by load_op or via vkCmdClearAttachments.
5195     */
5196    bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
5197    if (cond_exec) {
5198       tu_begin_load_store_cond_exec(cmd, cs, false);
5199    }
5200 
5201    /* Use the fast path when the render area is aligned, except for unsupported resolve cases. */
5202    if (use_fast_path) {
5203       if (store_common)
5204          tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, false);
5205       if (store_separate_stencil)
5206          tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, true);
5207 
5208       if (cond_exec) {
5209          tu_end_load_store_cond_exec(cmd, cs, false);
5210       }
5211 
5212       trace_end_gmem_store(&cmd->trace, cs);
5213       return;
5214    }
5215 
5216    assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
5217 
5218    enum pipe_format src_format = vk_format_to_pipe_format(src->format);
5219    if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5220       src_format = PIPE_FORMAT_Z32_FLOAT;
5221 
5222    enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
5223    if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5224       dst_format = PIPE_FORMAT_Z32_FLOAT;
5225 
5226    if (dst->samples > 1) {
5227       /* If we hit this path, we have to disable draw states after every tile
5228        * instead of once at the end of the renderpass, so that they aren't
5229        * executed when calling CP_DRAW.
5230        *
5231        * TODO: store a flag somewhere so we don't do this more than once and
5232        * don't do it after the renderpass when this happens.
5233        */
5234       if (store_common || store_separate_stencil)
5235          tu_disable_draw_states(cmd, cs);
5236 
5237       for_each_layer(i, layer_mask, layers) {
5238          if (store_common) {
5239             store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, false, src_format,
5240                           dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5241          }
5242          if (store_separate_stencil) {
5243             store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
5244                           PIPE_FORMAT_S8_UINT, render_area, i,
5245                           tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5246          }
5247       }
5248    } else {
5249       if (!cmd->state.pass->has_fdm) {
5250          r2d_coords(cmd, cs, render_area->offset, render_area->offset,
5251                     render_area->extent);
5252       } else {
5253          /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
5254           * area and the coordinates span the entire render area, but for
5255           * FDM we need to scale the coordinates, so we take the opposite
5256           * approach, specifying the exact bin size in the destination
5257           * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the render
5258           * area.
5259           */
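         /* Sketch with hypothetical numbers: for a 1920x1080 render area the
          * resolve window below is programmed once to (0,0)..(1919,1079),
          * while the per-bin patchpoints emitted further down rewrite the
          * DST/SRC windows to each bin's (possibly scaled) rectangle.
          */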
5260          tu_cs_emit_regs(cs,
5261                          A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
5262                                                      .y = render_area->offset.y,),
5263                          A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
5264                                                      .y = render_area->offset.y + render_area->extent.height - 1,));
5265       }
5266 
5267       for_each_layer (i, layer_mask, layers) {
5268          if (cmd->state.pass->has_fdm) {
5269             unsigned view = layer_mask ? i : 0;
5270             struct apply_store_coords_state state = {
5271                .view = view,
5272             };
5273             tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
5274                                          state);
5275          }
5276          if (store_common) {
5277             store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, false, src_format,
5278                           dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5279          }
5280          if (store_separate_stencil) {
5281             store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, true, PIPE_FORMAT_S8_UINT,
5282                           PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5283          }
5284       }
5285    }
5286 
5287    if (cond_exec) {
5288       tu_end_load_store_cond_exec(cmd, cs, false);
5289    }
5290 
5291    trace_end_gmem_store(&cmd->trace, cs);
5292 }
5293 TU_GENX(tu_store_gmem_attachment);
5294