• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019-2020 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Jonathan Marek <jonathan@marek.ca>
7  */
8 
9 #include "tu_clear_blit.h"
10 
11 #include "ir3/ir3_nir.h"
12 
13 #include "util/format_r11g11b10f.h"
14 #include "util/format_rgb9e5.h"
15 #include "util/format_srgb.h"
16 #include "util/half_float.h"
17 #include "compiler/nir/nir_builder.h"
18 
19 #include "tu_cmd_buffer.h"
20 #include "tu_cs.h"
21 #include "tu_formats.h"
22 #include "tu_image.h"
23 #include "tu_tracepoints.h"
24 
25 #include "common/freedreno_gpu_event.h"
26 
27 static const VkOffset2D blt_no_coord = { ~0, ~0 };
28 
29 static uint32_t
tu_pack_float32_for_unorm(float val,int bits)30 tu_pack_float32_for_unorm(float val, int bits)
31 {
32    return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
33 }
34 
35 /* r2d_ = BLIT_OP_SCALE operations */
36 
37 static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)38 format_to_ifmt(enum pipe_format format)
39 {
40    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
41        format == PIPE_FORMAT_Z24X8_UNORM)
42       return R2D_UNORM8;
43 
44    /* get_component_bits doesn't work with depth/stencil formats: */
45    if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
46       return R2D_FLOAT32;
47    if (format == PIPE_FORMAT_S8_UINT)
48       return R2D_INT8;
49    if (format == PIPE_FORMAT_A8_UNORM)
50       return R2D_UNORM8;
51 
52    /* use the size of the red channel to find the corresponding "ifmt" */
53    bool is_int = util_format_is_pure_integer(format);
54    switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
55    case 4: case 5: case 8:
56       return is_int ? R2D_INT8 : R2D_UNORM8;
57    case 10: case 11:
58       return is_int ? R2D_INT16 : R2D_FLOAT16;
59    case 16:
60       if (util_format_is_float(format))
61          return R2D_FLOAT16;
62       return is_int ? R2D_INT16 : R2D_FLOAT32;
63    case 32:
64       return is_int ? R2D_INT32 : R2D_FLOAT32;
65     default:
66       unreachable("bad format");
67    }
68 }
69 
70 static struct tu_native_format
blit_format_texture(enum pipe_format format,enum a6xx_tile_mode tile_mode)71 blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode)
72 {
73    struct tu_native_format fmt = tu6_format_texture(format, tile_mode);
74 
75    switch (format) {
76    case PIPE_FORMAT_Z24X8_UNORM:
77    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
78       /* Similar to in fdl6_view_init, we want to use
79        * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
80        * src.  Since this is called when there is no image and thus no ubwc,
81        * we can always use FMT6_8_8_8_8_UNORM.
82        */
83       fmt.fmt = FMT6_8_8_8_8_UNORM;
84       break;
85    default:
86       break;
87    }
88 
89    return fmt;
90 }
91 
92 static struct tu_native_format
blit_format_color(enum pipe_format format,enum a6xx_tile_mode tile_mode)93 blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
94 {
95    struct tu_native_format fmt = tu6_format_color(format, tile_mode);
96 
97    switch (format) {
98    case PIPE_FORMAT_Z24X8_UNORM:
99    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
100       /* similar to blit_format_texture but for blit dst */
101       fmt.fmt = FMT6_8_8_8_8_UNORM;
102       break;
103    default:
104       break;
105    }
106 
107    return fmt;
108 }
109 
110 static enum a6xx_format
blit_base_format(enum pipe_format format,bool ubwc)111 blit_base_format(enum pipe_format format, bool ubwc)
112 {
113    if (ubwc) {
114       switch (format) {
115       case PIPE_FORMAT_Z24X8_UNORM:
116       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
117          /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
118          return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
119       default:
120          break;
121       }
122    }
123 
124    /* note: tu6_format_color doesn't care about tiling for .fmt field */
125    return blit_format_color(format, TILE6_LINEAR).fmt;
126 }
127 
/* Emit the destination and source rectangles for a 2D blit.  Both rects
 * use inclusive bottom-right coordinates (hence the "- 1").  When src is
 * the blt_no_coord sentinel (e.g. for clears) the source registers are
 * left untouched.
 */
static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst.x,                    .y = dst.y),
      A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   /* no source coords for solid-color (clear) blits */
   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}
148 
/* Pack a VkClearValue into the four RB_2D_SRC_SOLID_C0..C3 registers used
 * by solid-color 2D blits.  How each channel is packed depends on the
 * intermediate format (ifmt) the 2D engine uses for this dst format.
 */
static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      /* spread the 24-bit depth across three 8-bit channels */
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         /* skip channels not present in the format (NONE/0/1 swizzles) */
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            /* sRGB-encode color channels (not alpha, i == 3) */
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            /* remaining ifmts take the raw 32-bit value per channel */
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}
215 
216 static void
fixup_src_format(enum pipe_format * src_format,enum pipe_format dst_format,enum a6xx_format * fmt)217 fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
218                  enum a6xx_format *fmt)
219 {
220    /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
221     * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
222     * We also have to move stencil, which is normally in the .w channel, into
223     * the right channel. Reintepreting the S8 texture as A8_UNORM solves both
224     * problems, and avoids using a swap, which seems to sometimes not work
225     * with a D24S8 source, or a texture swizzle which is only supported with
226     * the 3d path. Sometimes this blit happens on already-constructed
227     * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
228     */
229    if (*src_format == PIPE_FORMAT_S8_UINT &&
230        (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
231         dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
232       *fmt = FMT6_A8_UNORM;
233       *src_format = PIPE_FORMAT_A8_UNORM;
234    }
235 }
236 
237 static void
fixup_dst_format(enum pipe_format src_format,enum pipe_format * dst_format,enum a6xx_format * fmt)238 fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
239                  enum a6xx_format *fmt)
240 {
241    if (*dst_format == PIPE_FORMAT_S8_UINT &&
242        (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
243         src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
244       *dst_format = PIPE_FORMAT_A8_UNORM;
245       *fmt = FMT6_A8_UNORM;
246    }
247 }
248 
/* Program the 2D blit source registers from an image view, applying the
 * blit filter and the S8->A8 source-format fixup on top of the
 * precomputed SP_PS_2D_SRC_INFO value.
 */
template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   /* extract the format so fixup_src_format can patch it in place */
   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   /* UBWC flags buffer reference for the source */
   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}
279 
/* Program the 2D blit source from the depth plane of a depth/stencil
 * image view (separate base address, layer size and pitch).
 */
template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}
298 
/* Program the 2D blit source from the stencil plane of a depth/stencil
 * image view.  The FLAGS (UBWC) bit is masked off and, unlike the depth
 * path, no flags buffer reference is emitted.
 */
template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}
313 
/* Program the 2D blit source from a linear buffer (no image view), with
 * the S8->A8 format fixup applied.
 */
template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture(format, TILE6_LINEAR);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,   /* NOTE(review): purpose of unk20/unk22 not documented */
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}
338 
/* Program the 2D blit destination registers from an image view, applying
 * the D24S8->S8 destination-format fixup on the precomputed
 * RB_2D_DST_INFO value.
 */
template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
         (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   /* UBWC flags buffer reference for the destination */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}
359 
/* Program the 2D blit destination as the depth plane of a depth/stencil
 * image view.
 */
static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}
371 
/* Program the 2D blit destination as the stencil plane of a depth/stencil
 * image view.  FLAGS (UBWC) is masked off and no flags reference is
 * emitted, matching r2d_src_stencil.
 */
static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}
380 
/* Program the 2D blit destination as a linear buffer, with the
 * D24S8->S8 destination-format fixup applied.
 */
static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}
398 
/* Emit the state shared by all 2D (BLIT_OP_SCALE) operations: destination
 * format selection (with fixups), the D24S8 partial-clear channel mask,
 * the RB/GRAS blit control words, and SP_2D_DST_FORMAT.
 */
template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   /* debug aid: stomp render-pass state when blitting outside a pass */
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format(dst_format, ubwc);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   /* magic values above are undocumented channel-preservation masks --
    * meaning of individual bits unknown */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);    // TODO: seem to be always 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .rotate = (enum a6xx_rotation) blit_param,
         .solid_color = clear,
         .color_format = fmt,
         .scissor = scissor,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   /* the same control word goes to both the RB and GRAS copies */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      /* NOTE(review): A7XX-only write, register and value undocumented */
      tu_cs_emit_pkt4(cs, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1);
      tu_cs_emit(cs, 0x20000000);
   }

   /* 10_10_10_2 dest is substituted in SP_2D_DST_FORMAT only */
   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}
465 
/* Full setup for a single-sampled 2D blit: flush/invalidate CCU for
 * sysmem access when outside a render pass, then emit the common state.
 */
template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   /* the 2D engine path only handles single-sampled destinations */
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}
486 
/* Teardown hook for the 2D path; the 2D engine needs no state restored. */
static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}
493 
/* Execute the programmed 2D operation with CP_BLIT, temporarily switching
 * RB_DBG_ECO_CNTL to its blit-specific value on GPUs where that differs
 * from the normal value, and restoring it afterwards.
 */
static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This a non-context register, so we have to WFI before changing. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* restore the normal value (same WFI requirement as above) */
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}
517 
518 /* r3d_ = shader path operations */
519 
/* Load `components` consecutive 32-bit values from the uniform (const)
 * file starting at constant slot `base`.
 */
static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
                           .base = base);
}
526 
/* Build the vertex shader for the 3d blit path.  Positions and texture
 * coordinates for the two provoking vertices come from uniforms (slots
 * 0/4 for positions, 2/6 for coords, 16 for the z coordinate), selected
 * by the vertex id.
 */
static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   /* vertex 0 gets vert0_pos, any other vertex gets vert1_pos */
   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}
571 
/* Build the vertex shader for the 3d clear path.  Like the blit VS, but
 * instead of texture coordinates it outputs a uniform-supplied clear
 * depth (c0.z) and gl_Layer (c0.w).
 */
static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}
607 
/* Build the fragment shader for the 3d blit path: sample a 2D (or, for
 * the z-scale variant, 3D) texture at the interpolated coordinates and
 * write the texel to color output 0 unmodified.
 */
static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   /* z-scale samples a 3D texture, so it needs a third coordinate */
   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}
655 
/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   /* with half_float the output (and fetch result) is f16vec4 instead of
    * vec4, to avoid NaN canonicalization on the copied bits */
   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   /* txf_ms takes integer texel coordinates */
   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   /* fetch the sample matching the sample currently being shaded */
   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}
714 
/* Build the fragment shader for the 3d clear path: write a uniform-
 * supplied clear color (4 consts per target) to each of `mrts` render
 * targets.
 */
static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      /* clear color for target i lives at const slots 4*i .. 4*i+3 */
      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}
736 
/* Compile a blit/clear NIR shader to an ir3 variant, copy its binary into
 * the device's global BO at dword offset *offset, and record the shader,
 * variant, and GPU address in the device's global-shader tables.  *offset
 * is advanced (32-dword aligned) past the binary.  `consts` is the number
 * of user constant vec4 slots to reserve; `idx` selects the table entry.
 */
static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   const struct ir3_shader_options options = {
      .num_reserved_user_consts = align(consts, 8),
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
   };
   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   /* the binary must fit in the fixed-size shaders area of the global BO */
   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}
772 
/* Compile and upload every global clear/blit shader used by the 3d path.
 * The per-shader const counts match what each shader reads via
 * load_const; the clear FS is built once per possible MRT count
 * (0..MAX_RTS inclusive).
 */
void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}
789 
790 void
tu_destroy_clear_blit_shaders(struct tu_device * dev)791 tu_destroy_clear_blit_shaders(struct tu_device *dev)
792 {
793    for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
794       if (dev->global_shaders[i])
795          ir3_shader_destroy(dev->global_shaders[i]);
796    }
797 }
798 
/* Kind of operation the r3d (draw-based) path is performing. */
enum r3d_type {
   R3D_CLEAR,     /* clear render target(s) to a constant value */
   R3D_BLIT,      /* sample from a source texture and write the result */
   R3D_COPY_HALF, /* copy via half registers to preserve f16 NaN bit patterns */
};
804 
/* Emit the state shared by all r3d (draw-based) clear/blit operations:
 * shader selection and binding, stage configuration, rasterizer/clip
 * disables, scissor setup, FS output register mapping and MSAA state.
 * rts_mask is a bitmask of the MRT slots being written.
 */
template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   /* Clears use a VS that only produces positions from consts; blits use
    * one that also produces texture coordinates.
    */
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicaliztion for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   /* Clear FS variants are compiled per MRT count; this overrides any of
    * the blit FS choices above.
    */
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   /* Invalidate any previously-bound shader/resource state before
    * rebinding everything for the internal draw.
    */
   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .cs_ibo = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));

   /* Only VS and FS are active; tess and geometry stages are disabled. */
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   /* No private memory is needed for these fixed internal shaders. */
   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   /* Coordinates are already in screen space, so bypass clipping and the
    * viewport transform / perspective division entirely.
    */
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1,));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
   }

   /* Open up the scissors fully; the blit rect itself bounds the draw. */
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      /* Map each enabled MRT slot to the FS output register that feeds it;
       * disabled slots in the mask get regid 0.
       */
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}
926 
927 static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t opcode,enum a6xx_state_block block,uint32_t offset,const void * consts,uint32_t size_vec4)928 tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
929                           struct tu_cs *cs,
930                           uint32_t opcode,
931                           enum a6xx_state_block block,
932                           uint32_t offset,
933                           const void *consts,
934                           uint32_t size_vec4)
935 {
936    assert(offset % cmd->device->compiler->const_upload_unit == 0);
937 
938    struct tu_cs_memory mem = {};
939    VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
940    if (result != VK_SUCCESS) {
941       vk_command_buffer_set_error(&cmd->vk, result);
942       return;
943    }
944 
945    memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));
946 
947    tu_cs_emit_pkt7(cs, opcode, 3);
948    tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
949                   CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
950                   CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
951                   CP_LOAD_STATE6_0_STATE_BLOCK(block) |
952                   CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
953    tu_cs_emit_qw(cs, mem.iova);
954 }
955 
/* Upload 8 raw floats (two vec4s: top-left then bottom-right dst/src
 * coordinate pairs) to the blit VS const file at c0.
 */
static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}
961 
962 /* z coordinate for "z scale" blit path which uses a 3d texture */
963 static void
r3d_coord_z(struct tu_cmd_buffer * cmd,struct tu_cs * cs,float z)964 r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
965 {
966    const uint32_t coord[] = {
967       fui(z),
968       0,
969       0,
970       0,
971    };
972 
973    tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
974 }
975 
976 static void
r3d_coords(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const VkOffset2D dst,const VkOffset2D src,const VkExtent2D extent)977 r3d_coords(struct tu_cmd_buffer *cmd,
978            struct tu_cs *cs,
979            const VkOffset2D dst,
980            const VkOffset2D src,
981            const VkExtent2D extent)
982 {
983    const bool no_src = src.x != blt_no_coord.x;
984    int32_t src_x1 = no_src ? src.x : 0;
985    int32_t src_y1 = no_src ? src.y : 0;
986 
987    const float coords[] = {
988       dst.x,
989       dst.y,
990       src_x1,
991       src_y1,
992       dst.x + extent.width,
993       dst.y + extent.height,
994       src_x1 + extent.width,
995       src_y1 + extent.height,
996    };
997    r3d_coords_raw(cmd, cs, coords);
998 }
999 
1000 static void
r3d_clear_value(struct tu_cmd_buffer * cmd,struct tu_cs * cs,enum pipe_format format,const VkClearValue * val)1001 r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
1002 {
1003    uint32_t coords[4] = {};
1004 
1005    switch (format) {
1006    case PIPE_FORMAT_Z24X8_UNORM:
1007    case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
1008       /* cleared as r8g8b8a8_unorm using special format */
1009       uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
1010       coords[0] = fui((tmp & 0xff) / 255.0f);
1011       coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
1012       coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
1013       coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
1014    } break;
1015    case PIPE_FORMAT_Z16_UNORM:
1016    case PIPE_FORMAT_Z32_FLOAT:
1017       coords[0] = fui(val->depthStencil.depth);
1018       coords[1] = 0;
1019       coords[2] = 0;
1020       coords[3] = 0;
1021       break;
1022    case PIPE_FORMAT_S8_UINT:
1023       coords[0] = val->depthStencil.stencil & 0xff;
1024       coords[1] = 0;
1025       coords[2] = 0;
1026       coords[3] = 0;
1027       break;
1028    default:
1029       /* as color formats use clear value as-is */
1030       assert(!util_format_is_depth_or_stencil(format));
1031       memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
1032       break;
1033    }
1034 
1035    tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
1036 }
1037 
/* Bind the given texture descriptor, plus an internally-built sampler, as
 * FS texture 0 for the r3d path.  offset_base/offset_ubwc are byte offsets
 * added to the descriptor's base and UBWC flag-buffer addresses to select
 * the layer being blitted.
 */
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   /* dwords 4-5 hold the 64-bit base address, dwords 7-8 the UBWC address. */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   /* The sampler state lives right after the descriptor: clamp-to-edge,
    * unnormalized coordinates, mag/min filtering per "filter".
    */
   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   /* Load the sampler state (ST6_SHADER) from the trailing dwords. */
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   /* Load the texture descriptor itself (ST6_CONSTANTS). */
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
      CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}
1097 
1098 static void
r3d_src(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct fdl6_view * iview,uint32_t layer,VkFilter filter,enum pipe_format dst_format)1099 r3d_src(struct tu_cmd_buffer *cmd,
1100         struct tu_cs *cs,
1101         const struct fdl6_view *iview,
1102         uint32_t layer,
1103         VkFilter filter,
1104         enum pipe_format dst_format)
1105 {
1106    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1107    memcpy(desc, iview->descriptor, sizeof(desc));
1108 
1109    enum a6xx_format fmt = (enum a6xx_format)(
1110       (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
1111    enum pipe_format src_format = iview->format;
1112    fixup_src_format(&src_format, dst_format, &fmt);
1113    desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
1114       A6XX_TEX_CONST_0_FMT(fmt);
1115 
1116    r3d_src_common(cmd, cs, desc,
1117                   iview->layer_size * layer,
1118                   iview->ubwc_layer_size * layer,
1119                   filter);
1120 }
1121 
1122 static void
r3d_src_buffer(struct tu_cmd_buffer * cmd,struct tu_cs * cs,enum pipe_format format,uint64_t va,uint32_t pitch,uint32_t width,uint32_t height,enum pipe_format dst_format)1123 r3d_src_buffer(struct tu_cmd_buffer *cmd,
1124                struct tu_cs *cs,
1125                enum pipe_format format,
1126                uint64_t va, uint32_t pitch,
1127                uint32_t width, uint32_t height,
1128                enum pipe_format dst_format)
1129 {
1130    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1131 
1132    struct tu_native_format fmt = blit_format_texture(format, TILE6_LINEAR);
1133    enum a6xx_format color_format = fmt.fmt;
1134    fixup_src_format(&format, dst_format, &color_format);
1135 
1136    desc[0] =
1137       COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
1138       A6XX_TEX_CONST_0_FMT(color_format) |
1139       A6XX_TEX_CONST_0_SWAP(fmt.swap) |
1140       A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1141       A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1142       A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1143       A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1144    desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1145    desc[2] =
1146       A6XX_TEX_CONST_2_PITCH(pitch) |
1147       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1148    desc[3] = 0;
1149    desc[4] = va;
1150    desc[5] = va >> 32;
1151    for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1152       desc[i] = 0;
1153 
1154    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1155 }
1156 
/* Bind the depth plane of a depth/stencil image view as the r3d source,
 * reinterpreted as a single-channel 32-bit float texture.
 */
static void
r3d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->depth_base_addr;

   /* Replace the format with 32-bit float and force an identity swizzle
    * with no component swap.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   /* Pitch and array pitch come from the separate depth plane layout,
    * not from the combined view descriptor.
    */
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
      (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
   desc[4] = va;
   desc[5] = va >> 32;

   /* Dwords 6+ (including the UBWC address) are kept from the original
    * descriptor, so the UBWC layer offset is still applied.
    */
   r3d_src_common(cmd, cs, desc,
                  iview->depth_layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}
1190 
/* Bind the separate stencil plane of a depth/stencil image view as the
 * r3d source, reinterpreted as an 8-bit uint texture.
 */
static void
r3d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->stencil_base_addr;

   /* Replace the format with 8-bit uint and force an identity swizzle
    * with no component swap.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
   desc[4] = va;
   desc[5] = va >> 32;
   /* Unlike the depth plane, dwords 6+ are cleared: the stencil plane has
    * no UBWC metadata, hence the 0 ubwc offset below.
    */
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
                  VK_FILTER_NEAREST);
}
1223 
/* Bind an attachment as the r3d source for a sysmem -> GMEM load,
 * patching the descriptor so the full (unswizzled, unswapped) image is
 * read.
 */
static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));

   /* Fixup D24 formats because we always load both depth and stencil. */
   enum pipe_format format = iview->view.format;
   if (format == PIPE_FORMAT_X24S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM ||
       format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
      /* The UBWC-aware raw view of D24S8 needs the special _AS_R8G8B8A8
       * format; the linear case can use plain rgba8.
       */
      if (iview->view.ubwc_enabled)
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
      else
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
   }

   /* When loading/storing GMEM we always load the full image and don't do any
    * swizzling or swapping, that's done in the draw when reading/writing
    * GMEM, so we need to fixup the swizzle and swap.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   r3d_src_common(cmd, cs, desc,
                  iview->view.layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}
1263 
/* Bind GMEM contents as the r3d source (for GMEM -> sysmem stores),
 * building a linear tile-mode descriptor over the GMEM aperture at
 * gmem_offset with the tile width as its pitch.
 */
static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             enum pipe_format format,
             enum pipe_format dst_format,
             uint32_t gmem_offset,
             uint32_t cpp)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->view.descriptor, sizeof(desc));

   enum a6xx_format fmt = blit_format_texture(format, TILE6_LINEAR).fmt;
   fixup_src_format(&format, dst_format, &fmt);

   /* patch the format so that depth/stencil get the right format and swizzle */
   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   /* patched for gmem */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
   /* GMEM is addressed as a single 2D slice: pitch is one tile row. */
   desc[2] =
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
      A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
   desc[3] = 0;
   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   /* No layer or UBWC offsets: GMEM holds a single tile's worth of data. */
   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}
1303 
/* Set up MRT0 to render into the given view layer, patching the color
 * format when the src/dst format pair requires a fixup.
 */
static void
r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;

   enum a6xx_format fmt = (enum a6xx_format)(
      mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);
   mrt_buf_info =
      (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
      A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, mrt_buf_info);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
    * FMT6_NV12_Y.
    */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));

   /* Enable flag (UBWC) MRT writes only when the view is UBWC-compressed. */
   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}
1333 
/* Set up MRT0 to render into the depth plane of a depth/stencil image. */
static void
r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
   tu_cs_image_depth_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);

   /* The depth plane may carry UBWC metadata, so flag writes follow the view. */
   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}
1348 
/* Set up MRT0 to render into the separate stencil plane of an image.
 * No flag buffer is bound: the stencil plane has no UBWC metadata.
 */
static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}
1360 
/* Set up MRT0 to render into a linear buffer at "va" with the given pitch,
 * patching the color format when the src/dst format pair needs a fixup.
 */
static void
r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);

   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);

   /* Single-layer linear target: zero array pitch and no GMEM base. */
   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, .qword = va),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}
1380 
/* Set up MRT0 to render into the GMEM copy of an attachment (used when
 * loading attachments with the 3D path).  For D32S8, separate_stencil
 * selects which plane's GMEM allocation is targeted.
 */
static void
r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *att,
             bool separate_stencil, unsigned layer)
{
   unsigned RB_MRT_BUF_INFO;
   unsigned gmem_offset;

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
      } else {
         RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
      }
   } else {
      RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

   /* GMEM destination: no sysmem base/pitch, only the GMEM offset. */
   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, 0),
                   A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));

   enum a6xx_format color_format =
      (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}
1418 
1419 static uint8_t
aspect_write_mask(enum pipe_format format,VkImageAspectFlags aspect_mask)1420 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1421 {
1422    uint8_t mask = 0xf;
1423    assert(aspect_mask);
1424    /* note: the only format with partial writing is D24S8,
1425     * clear/blit uses the _AS_R8G8B8A8 format to access it
1426     */
1427    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1428       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1429          mask = 0x7;
1430       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1431          mask = 0x8;
1432    }
1433    return mask;
1434 }
1435 
/* Flag bits passed to r3d_setup() via blit_param. */
enum r3d_blit_param {
   R3D_Z_SCALE = 1 << 0,  /* blit scales in z (3D texture, z-scale FS) */
   R3D_DST_GMEM = 1 << 1, /* destination is GMEM rather than sysmem */
   R3D_COPY = 1 << 2,     /* image copy: may take the f16 NaN-preserving path */
};
1441 
/* Emit all static state for an r3d (draw-based) clear/blit: cache flushes,
 * bin control, shader binding via r3d_common(), blend/depth/stencil
 * disables, MRT0 format/write-mask, and occlusion/predication handling.
 * blit_param is a mask of r3d_blit_param flags.
 */
template <chip CHIP>
static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   /* Debug aid: stomp renderpass state first so stale state is caught. */
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format(dst_format, ubwc);
   fixup_dst_format(src_format, &dst_format, &fmt);

   /* Outside a renderpass we render directly to sysmem. */
   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   if (!(blit_param & R3D_DST_GMEM)) {
      if (CHIP == A6XX) {
         tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
      } else {
         tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
      }

      tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));

      if (CHIP >= A7XX) {
         tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
         tu_cs_emit_regs(cs,
            A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
      }
   }

   enum r3d_type type;
   if (clear) {
      type = R3D_CLEAR;
   } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
      /* Avoid canonicalizing NaNs in copies by using the special half-float
       * path that uses half regs.
       */
      type = R3D_COPY_HALF;
   } else {
      type = R3D_BLIT;
   }

   r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);

   /* Exactly one MRT; blending, depth and stencil tests all disabled. */
   tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
   tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                        .color_format = fmt,
                        .color_sint = util_format_is_pure_sint(dst_format),
                        .color_uint = util_format_is_pure_uint(dst_format)));

   /* Restrict component writes to the requested aspect(s) (D24S8 only). */
   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
      .component_enable = aspect_write_mask(dst_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));

   /* LRZ must be off for these internal draws. */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   /* Disable sample counting in order to not affect occlusion query. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));

   /* Likewise pause primitive counters so queries don't see this draw. */
   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
   }

   /* Internal blits must execute even under conditional rendering. */
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}
1539 
1540 static void
r3d_run(struct tu_cmd_buffer * cmd,struct tu_cs * cs)1541 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1542 {
1543    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1544    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1545                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1546                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1547    tu_cs_emit(cs, 1); /* instance count */
1548    tu_cs_emit(cs, 2); /* vertex count */
1549 }
1550 
1551 static void
r3d_run_vis(struct tu_cmd_buffer * cmd,struct tu_cs * cs)1552 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1553 {
1554    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1555    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1556                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1557                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1558    tu_cs_emit(cs, 1); /* instance count */
1559    tu_cs_emit(cs, 2); /* vertex count */
1560 }
1561 
/* Undo the state changes made by the 3D blit setup: re-enable predication
 * if it was active, re-enable sample counting (setup disabled it to avoid
 * perturbing occlusion queries), and restart primitive counters if a
 * primitives-generated query was running before the render pass.
 */
template <chip CHIP>
static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }

   /* Re-enable sample counting. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
   }
}
1578 
1579 /* blit ops - common interface for 2d/shader paths */
1580 
struct blit_ops {
   /* Set the destination/source rectangle used by the next run(). */
   void (*coords)(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const VkOffset2D dst,
                  const VkOffset2D src,
                  const VkExtent2D extent);
   /* Set the clear value for a setup() with clear == true. */
   void (*clear_value)(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       enum pipe_format format,
                       const VkClearValue *val);
   /* Bind one layer of an image view as the source. */
   void (*src)(
        struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format);
   /* Bind a linear buffer region (iova + pitch) as the source. */
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      enum pipe_format format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height,
                      enum pipe_format dst_format);
   /* Bind one layer of an image view as the destination. */
   void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
               enum pipe_format src_format);
   /* Bind the depth / stencil plane of an image view as the destination. */
   void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   /* Bind a linear buffer as the destination. */
   void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
                      enum pipe_format src_format);
   /* Emit state shared by all following run() calls; must be paired with a
    * matching teardown() once the last run() has been emitted.
    */
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
                 bool clear,
                 bool ubwc,
                 VkSampleCountFlagBits samples);
   /* Perform one blit/clear with the coords/src/dst bound above. */
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   /* Restore state changed by setup(). */
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};
1622 
/* blit_ops implementation backed by the dedicated 2D blitter
 * (BLIT_OP_SCALE).
 */
template <chip CHIP>
static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src<CHIP>,
   .src_buffer = r2d_src_buffer<CHIP>,
   .dst = r2d_dst<CHIP>,
   .dst_depth = r2d_dst_depth,
   .dst_stencil = r2d_dst_stencil,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup<CHIP>,
   .run = r2d_run,
   .teardown = r2d_teardown,
};
1637 
/* blit_ops implementation backed by shader-based draws, used where the 2D
 * blitter falls short (e.g. MSAA destinations or z-scaled blits — see the
 * fallback checks in the callers below).
 */
template <chip CHIP>
static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_depth = r3d_dst_depth,
   .dst_stencil = r3d_dst_stencil,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup<CHIP>,
   .run = r3d_run,
   .teardown = r3d_teardown<CHIP>,
};
1652 
1653 /* passthrough set coords from 3D extents */
1654 static void
coords(const struct blit_ops * ops,struct tu_cmd_buffer * cmd,struct tu_cs * cs,const VkOffset3D dst,const VkOffset3D src,const VkExtent3D extent)1655 coords(const struct blit_ops *ops,
1656        struct tu_cmd_buffer *cmd,
1657        struct tu_cs *cs,
1658        const VkOffset3D dst,
1659        const VkOffset3D src,
1660        const VkExtent3D extent)
1661 {
1662    ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1663                (VkExtent2D) {extent.width, extent.height});
1664 }
1665 
1666 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1667  * to be a bit careful because we have to pick a format with matching UBWC
 * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1669  * everything.
1670  */
1671 static enum pipe_format
copy_format(VkFormat vk_format,VkImageAspectFlags aspect_mask)1672 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1673 {
1674    if (vk_format_is_compressed(vk_format)) {
1675       switch (vk_format_get_blocksize(vk_format)) {
1676       case 1: return PIPE_FORMAT_R8_UINT;
1677       case 2: return PIPE_FORMAT_R16_UINT;
1678       case 4: return PIPE_FORMAT_R32_UINT;
1679       case 8: return PIPE_FORMAT_R32G32_UINT;
1680       case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1681       default:
1682          unreachable("unhandled format size");
1683       }
1684    }
1685 
1686    enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
1687 
1688    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1689     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1690     * (also -1.0), when we're supposed to be memcpying the bits. See
1691     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1692     */
1693    format = util_format_snorm_to_unorm(format);
1694 
1695    switch (format) {
1696    case PIPE_FORMAT_R9G9B9E5_FLOAT:
1697       return PIPE_FORMAT_R32_UINT;
1698 
1699    case PIPE_FORMAT_G8_B8R8_420_UNORM:
1700       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1701          return PIPE_FORMAT_R8G8_UNORM;
1702       else
1703          return PIPE_FORMAT_Y8_UNORM;
1704    case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1705       return PIPE_FORMAT_R8_UNORM;
1706 
1707    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1708       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1709          return PIPE_FORMAT_S8_UINT;
1710       assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1711       return PIPE_FORMAT_Z32_FLOAT;
1712 
1713    default:
1714       return format;
1715    }
1716 }
1717 
/* Clear an image's LRZ (low-resolution Z) buffer to the given depth clear
 * value, writing it as a Z16 linear "buffer" destination through the 2D
 * blit path.
 */
template <chip CHIP>
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image *image,
              const VkClearValue *value)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   /* It is assumed that LRZ cache is invalidated at this point for
    * the writes here to become visible to LRZ.
    *
    * LRZ writes are going through UCHE cache, flush UCHE before changing
    * LRZ via CCU. Don't need to invalidate CCU since we are presumably
    * writing whole cache lines we assume to be 64 bytes.
    */
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_FLUSH);

   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
              VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
   /* lrz_pitch appears to be in Z16 texels (it is also used as the width
    * below), hence the byte pitch of lrz_pitch * 2.
    */
   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
                   image->iova + image->lrz_offset,
                   image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
   ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
               (VkExtent2D) { image->lrz_pitch, image->lrz_height });
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);

   /* Clearing writes via CCU color in the PS stage, and LRZ is read via
    * UCHE in the earlier GRAS stage.
    */
   cmd->state.cache.flush_bits |=
      TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_WAIT_FOR_IDLE;
}
TU_GENX(tu6_clear_lrz);
1756 
1757 template <chip CHIP>
1758 void
tu6_dirty_lrz_fc(struct tu_cmd_buffer * cmd,struct tu_cs * cs,struct tu_image * image)1759 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
1760                  struct tu_cs *cs,
1761                  struct tu_image *image)
1762 {
1763    const struct blit_ops *ops = &r2d_ops<CHIP>;
1764    VkClearValue clear = {};
1765    clear.color.uint32[0] = 0xffffffff;
1766 
1767    /* LRZ fast-clear buffer is always allocated with 512 bytes size. */
1768    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
1769               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
1770               VK_SAMPLE_COUNT_1_BIT);
1771    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
1772    ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
1773                    image->iova + image->lrz_fc_offset, 512,
1774                    PIPE_FORMAT_R32_UINT);
1775    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {128, 1});
1776    ops->run(cmd, cs);
1777    ops->teardown(cmd, cs);
1778 }
1779 TU_GENX(tu6_dirty_lrz_fc);
1780 
/* Initialize a single-level, single-layer fdl6 view of an image for the
 * copy/blit paths.
 *
 * format is the (possibly reinterpreted) copy format, not necessarily the
 * image's own format. z_scale selects a 3D view type, used by the
 * z-scaling blit path so the source z coordinate can be filtered.
 */
template<chip CHIP>
static void
tu_image_view_copy_blit(struct fdl6_view *iview,
                        struct tu_image *image,
                        enum pipe_format format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool z_scale)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   /* Pick the layout of the plane selected by the (possibly overridden)
    * aspect.
    */
   const struct fdl_layout *layout =
      &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];

   const struct fdl_view_args args = {
      .chip = CHIP,
      .iova = image->iova,
      .base_miplevel = subres->mipLevel,
      .level_count = 1,
      .base_array_layer = subres->baseArrayLayer + layer,
      .layer_count = 1,
      .swiz = {
         PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
      },
      .format = tu_format_for_aspect(format, aspect_mask),
      .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
   };
   fdl6_view_init(iview, &layout, &args, false);
}
1816 
/* Initialize a non-z-scaled (2D) copy view; thin wrapper around
 * tu_image_view_copy_blit.
 */
template<chip CHIP>
static void
tu_image_view_copy(struct fdl6_view *iview,
                   struct tu_image *image,
                   enum pipe_format format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
}
1827 
1828 template<chip CHIP>
1829 static void
tu_image_view_blit(struct fdl6_view * iview,struct tu_image * image,const VkImageSubresourceLayers * subres,uint32_t layer)1830 tu_image_view_blit(struct fdl6_view *iview,
1831                    struct tu_image *image,
1832                    const VkImageSubresourceLayers *subres,
1833                    uint32_t layer)
1834 {
1835    enum pipe_format format =
1836       tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
1837                                                          subres->aspectMask));
1838    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
1839 }
1840 
1841 template <chip CHIP>
1842 static void
tu6_blit_image(struct tu_cmd_buffer * cmd,struct tu_image * src_image,struct tu_image * dst_image,const VkImageBlit2 * info,VkFilter filter)1843 tu6_blit_image(struct tu_cmd_buffer *cmd,
1844                struct tu_image *src_image,
1845                struct tu_image *dst_image,
1846                const VkImageBlit2 *info,
1847                VkFilter filter)
1848 {
1849    const struct blit_ops *ops = &r2d_ops<CHIP>;
1850    struct tu_cs *cs = &cmd->cs;
1851    bool z_scale = false;
1852    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
1853 
1854    /* 2D blit can't do rotation mirroring from just coordinates */
1855    static const enum a6xx_rotation rotate[2][2] = {
1856       {ROTATE_0, ROTATE_HFLIP},
1857       {ROTATE_VFLIP, ROTATE_180},
1858    };
1859 
1860    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
1861                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
1862    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
1863                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
1864 
1865    int32_t src0_z = info->srcOffsets[0].z;
1866    int32_t src1_z = info->srcOffsets[1].z;
1867 
1868    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
1869         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
1870        info->srcOffsets[1].z < info->srcOffsets[0].z) {
1871       z_scale = true;
1872    }
1873 
1874    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
1875       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
1876       src0_z = info->srcOffsets[1].z;
1877       src1_z = info->srcOffsets[0].z;
1878    }
1879 
1880    if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
1881       assert(layers <= 1);
1882       layers = vk_image_subresource_layer_count(&dst_image->vk,
1883                                                 &info->dstSubresource);
1884    }
1885 
1886    /* BC1_RGB_* formats need to have their last components overriden with 1
1887     * when sampling, which is normally handled with the texture descriptor
1888     * swizzle. The 2d path can't handle that, so use the 3d path.
1889     *
1890     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
1891     * the 2d path.
1892     */
1893 
1894    unsigned blit_param = rotate[mirror_y][mirror_x];
1895    if (dst_image->layout[0].nr_samples > 1 ||
1896        src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
1897        src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
1898        filter == VK_FILTER_CUBIC_EXT ||
1899        z_scale) {
1900       ops = &r3d_ops<CHIP>;
1901       blit_param = z_scale ? R3D_Z_SCALE : 0;
1902    }
1903 
1904    /* use the right format in setup() for D32_S8
1905     * TODO: this probably should use a helper
1906     */
1907    enum pipe_format src_format =
1908       tu6_plane_format(src_image->vk.format,
1909                        tu6_plane_index(src_image->vk.format,
1910                                        info->srcSubresource.aspectMask));
1911    enum pipe_format dst_format =
1912       tu6_plane_format(dst_image->vk.format,
1913                        tu6_plane_index(src_image->vk.format,
1914                                        info->srcSubresource.aspectMask));
1915    trace_start_blit(&cmd->trace, cs,
1916                   ops == &r3d_ops<CHIP>,
1917                   src_image->vk.format,
1918                   dst_image->vk.format,
1919                   layers);
1920 
1921    ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
1922               blit_param, false, dst_image->layout[0].ubwc,
1923               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
1924 
1925    if (ops == &r3d_ops<CHIP>) {
1926       const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
1927                                info->srcOffsets[0].x, info->srcOffsets[0].y,
1928                                info->dstOffsets[1].x, info->dstOffsets[1].y,
1929                                info->srcOffsets[1].x, info->srcOffsets[1].y };
1930       r3d_coords_raw(cmd, cs, coords);
1931    } else {
1932       tu_cs_emit_regs(cs,
1933          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
1934                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
1935          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
1936                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
1937       tu_cs_emit_regs(cs,
1938          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
1939          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
1940          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
1941          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
1942    }
1943 
1944    struct fdl6_view dst, src;
1945    tu_image_view_blit<CHIP>(
1946       &dst, dst_image, &info->dstSubresource,
1947       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
1948 
1949    if (z_scale) {
1950       tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
1951                                     &info->srcSubresource, 0, true);
1952       ops->src(cmd, cs, &src, 0, filter, dst_format);
1953    } else {
1954       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
1955    }
1956 
1957    for (uint32_t i = 0; i < layers; i++) {
1958       if (z_scale) {
1959          float t = ((float) i + 0.5f) / (float) layers;
1960          r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
1961       } else {
1962          ops->src(cmd, cs, &src, i, filter, dst_format);
1963       }
1964       ops->dst(cs, &dst, i, src_format);
1965       ops->run(cmd, cs);
1966    }
1967 
1968    ops->teardown(cmd, cs);
1969 
1970    trace_end_blit(&cmd->trace, cs);
1971 }
1972 
1973 template <chip CHIP>
1974 VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage2(VkCommandBuffer commandBuffer,const VkBlitImageInfo2 * pBlitImageInfo)1975 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
1976                  const VkBlitImageInfo2 *pBlitImageInfo)
1977 
1978 {
1979    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1980    TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
1981    TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
1982 
1983    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
1984       /* can't blit both depth and stencil at once with D32_S8
1985        * TODO: more advanced 3D blit path to support it instead?
1986        */
1987       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1988           dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1989          VkImageBlit2 region = pBlitImageInfo->pRegions[i];
1990          u_foreach_bit(b, region.dstSubresource.aspectMask) {
1991             region.srcSubresource.aspectMask = BIT(b);
1992             region.dstSubresource.aspectMask = BIT(b);
1993             tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
1994          }
1995          continue;
1996       }
1997       tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
1998                      pBlitImageInfo->filter);
1999    }
2000 
2001    if (dst_image->lrz_height) {
2002       tu_disable_lrz(cmd, &cmd->cs, dst_image);
2003    }
2004 }
2005 TU_GENX(tu_CmdBlitImage2);
2006 
2007 static void
copy_compressed(VkFormat format,VkOffset3D * offset,VkExtent3D * extent,uint32_t * width,uint32_t * height)2008 copy_compressed(VkFormat format,
2009                 VkOffset3D *offset,
2010                 VkExtent3D *extent,
2011                 uint32_t *width,
2012                 uint32_t *height)
2013 {
2014    if (!vk_format_is_compressed(format))
2015       return;
2016 
2017    uint32_t block_width = vk_format_get_blockwidth(format);
2018    uint32_t block_height = vk_format_get_blockheight(format);
2019 
2020    offset->x /= block_width;
2021    offset->y /= block_height;
2022 
2023    if (extent) {
2024       extent->width = DIV_ROUND_UP(extent->width, block_width);
2025       extent->height = DIV_ROUND_UP(extent->height, block_height);
2026    }
2027    if (width)
2028       *width = DIV_ROUND_UP(*width, block_width);
2029    if (height)
2030       *height = DIV_ROUND_UP(*height, block_height);
2031 }
2032 
/* Implement one VkBufferImageCopy2 region of vkCmdCopyBufferToImage2. */
template <chip CHIP>
static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy2 *info)
{
   struct tu_cs *cs = &cmd->cs;
   /* For 3D images depth plays the role of the layer count; otherwise use
    * the subresource's layer count.
    */
   uint32_t layers = MAX2(info->imageExtent.depth,
                          vk_image_subresource_layer_count(&dst_image->vk,
                                                           &info->imageSubresource));
   enum pipe_format src_format =
      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
   enum pipe_format dst_format =
      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   /* special case for buffer to stencil */
   if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      src_format = PIPE_FORMAT_S8_UINT;
   }

   /* note: could use "R8_UNORM" when no UBWC */
   unsigned blit_param = 0;
   if (src_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format)) {
      /* Fall back to the shader path in copy mode for Y8 and half-float
       * formats (the latter to avoid NaN canonicalization in copies — see
       * R3D_COPY_HALF in the 3D setup path).
       */
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   /* Convert texel units to block units for compressed formats. */
   copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * util_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;

   ops->setup(cmd, cs, src_format, dst_format,
              info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
              (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);

   struct fdl6_view dst;
   tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
                            &info->imageSubresource, offset.z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i, src_format);

      uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
      if ((src_va & 63) || (pitch & 63)) {
         /* Source address or pitch not 64-byte aligned: copy row by row,
          * aligning the address down and compensating with an x offset
          * into the (widened) source row.
          */
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1, dst_format);
            ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y},  (VkOffset2D) {x},
                        (VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         /* Aligned: copy the whole layer in one blit. */
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
         coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}
2105 
2106 template <chip CHIP>
2107 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,const VkCopyBufferToImageInfo2 * pCopyBufferToImageInfo)2108 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2109                          const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2110 {
2111    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2112    TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2113    TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2114 
2115    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2116       tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2117                               pCopyBufferToImageInfo->pRegions + i);
2118 
2119    if (dst_image->lrz_height) {
2120       tu_disable_lrz(cmd, &cmd->cs, dst_image);
2121    }
2122 }
2123 TU_GENX(tu_CmdCopyBufferToImage2);
2124 
/* Implement one VkBufferImageCopy2 region of vkCmdCopyImageToBuffer2. */
template <chip CHIP>
static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy2 *info)
{
   struct tu_cs *cs = &cmd->cs;
   /* For 3D images depth plays the role of the layer count; otherwise use
    * the subresource's layer count.
    */
   uint32_t layers = MAX2(info->imageExtent.depth,
                          vk_image_subresource_layer_count(&src_image->vk,
                                                           &info->imageSubresource));
   enum pipe_format dst_format =
      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
   enum pipe_format src_format =
      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   /* special case for stencil-only reads from D24S8 */
   if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      dst_format = PIPE_FORMAT_S8_UINT;
   }

   /* note: could use "R8_UNORM" when no UBWC */
   unsigned blit_param = 0;
   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format)) {
      /* Fall back to the shader path in copy mode for Y8 and half-float
       * formats (the latter to avoid NaN canonicalization in copies — see
       * R3D_COPY_HALF in the 3D setup path).
       */
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   /* Convert texel units to block units for compressed formats. */
   copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
              VK_SAMPLE_COUNT_1_BIT);

   struct fdl6_view src;
   tu_image_view_copy<CHIP>(&src, src_image, src_format,
                            &info->imageSubresource, offset.z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);

      uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
      if ((dst_va & 63) || (pitch & 63)) {
         /* Destination address or pitch not 64-byte aligned: copy row by
          * row (pitch 0, height 1), aligning the address down and
          * compensating with an x offset into the destination row.
          */
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
            ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
                        (VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         /* Aligned: copy the whole layer in one blit. */
         ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
         coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}
2194 
2195 template <chip CHIP>
2196 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,const VkCopyImageToBufferInfo2 * pCopyImageToBufferInfo)2197 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2198                          const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2199 {
2200    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2201    TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2202    TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2203 
2204    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2205       tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2206                               pCopyImageToBufferInfo->pRegions + i);
2207 }
2208 TU_GENX(tu_CmdCopyImageToBuffer2);
2209 
2210 /* Tiled formats don't support swapping, which means that we can't support
2211  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2212  * formats like B5G5R5A1 have a separate linear-only format when sampling.
2213  * Currently we fake support for tiled swapped formats and use the unswapped
2214  * format instead, but this means that reinterpreting copies to and from
2215  * swapped formats can't be performed correctly unless we can swizzle the
2216  * components by reinterpreting the other image as the "correct" swapped
2217  * format, i.e. only when the other image is linear.
2218  */
2219 
2220 static bool
is_swapped_format(enum pipe_format format)2221 is_swapped_format(enum pipe_format format)
2222 {
2223    struct tu_native_format linear = blit_format_texture(format, TILE6_LINEAR);
2224    struct tu_native_format tiled = blit_format_texture(format, TILE6_3);
2225    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2226 }
2227 
2228 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2229  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2230  * versa). This should mirror the logic in fdl6_layout.
2231  */
2232 static bool
image_is_r8g8(struct tu_image * image)2233 image_is_r8g8(struct tu_image *image)
2234 {
2235    return image->layout[0].cpp == 2 &&
2236       vk_format_get_nr_components(image->vk.format) == 2;
2237 }
2238 
/* Implements a single vkCmdCopyImage2 region. The copy is performed as a
 * blit with both views using one common format; when the two images'
 * formats can't be reconciled (see the selection ladder below), the copy is
 * done in two passes through a linear, non-UBWC staging buffer.
 */
template <chip CHIP>
static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy2 *info)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;

   /* Multisampled destinations take the 3D (draw-based) path. */
   if (dst_image->layout[0].nr_samples > 1)
      ops = &r3d_ops<CHIP>;

   enum pipe_format format = PIPE_FORMAT_NONE;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;
   /* 3D images carry the slice count in extent.depth; array images carry it
    * in the subresource's layer count. One of the two is always 1.
    */
   uint32_t layers_to_copy = MAX2(info->extent.depth,
                                  vk_image_subresource_layer_count(&src_image->vk,
                                                                   &info->srcSubresource));

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);

   enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
   enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);

   /* note: could use "R8_UNORM" when no UBWC */
   /* Y8 and float16 formats are forced onto the 3D path in raw-copy mode. */
   unsigned blit_param = 0;
   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
       src_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format) ||
       tu_pipe_format_is_float16(dst_format)) {
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      /* src is tiled but not UBWC (linear handled above), so it can be
       * reinterpreted with the destination's format. */
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
      use_staging_blit = true;
   }

   struct fdl6_view dst, src;

   if (use_staging_blit) {
      /* Two-pass copy: blit src (viewed as src_format) into a linear,
       * non-UBWC staging buffer, then blit the same staging memory back out
       * reinterpreted as dst_format.
       */
      tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
      tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);

      struct fdl_layout staging_layout = { 0 };
      VkOffset3D staging_offset = { 0 };

      staging_layout.tile_mode = TILE6_LINEAR;
      staging_layout.ubwc = false;

      uint32_t layer_count =
         vk_image_subresource_layer_count(&src_image->vk,
                                          &info->srcSubresource);
      /* Size the staging buffer exactly to the copy extent (one mip level). */
      fdl6_layout(&staging_layout,
                  src_format,
                  src_image->layout[0].nr_samples,
                  extent.width,
                  extent.height,
                  extent.depth,
                  1,
                  layer_count,
                  extent.depth > 1,
                  NULL);

      struct tu_bo *staging_bo;
      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_layout.size,
                                          &staging_bo);
      if (result != VK_SUCCESS) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }

      struct fdl6_view staging;
      const struct fdl_layout *staging_layout_ptr = &staging_layout;
      const struct fdl_view_args copy_to_args = {
         .chip = CHIP,
         .iova = staging_bo->iova,
         .base_miplevel = 0,
         .level_count = 1,
         .base_array_layer = 0,
         .layer_count = layer_count,
         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
         .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
         .type = FDL_VIEW_TYPE_2D,
      };
      fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);

      /* Pass 1: src -> staging, in src_format. */
      ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, staging_offset, src_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
         ops->dst(cs, &staging, i, src_format);
         ops->run(cmd, cs);
      }

      /* When executed by the user there has to be a pipeline barrier here,
       * but since we're doing it manually we'll have to flush ourselves.
       */
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
      tu_cs_emit_wfi(cs);

      /* Re-create the staging view with dst_format for the second pass. */
      const struct fdl_view_args copy_from_args = {
         .chip = CHIP,
         .iova = staging_bo->iova,
         .base_miplevel = 0,
         .level_count = 1,
         .base_array_layer = 0,
         .layer_count = layer_count,
         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
         .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
         .type = FDL_VIEW_TYPE_2D,
      };
      fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);

      /* Pass 2: staging -> dst, in dst_format. */
      ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
                 blit_param, false, dst_image->layout[0].ubwc,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, dst_offset, staging_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
         ops->dst(cs, &dst, i, dst_format);
         ops->run(cmd, cs);
      }
   } else {
      /* Direct blit: both views use the common `format` chosen above. */
      tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
      tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);

      ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
                 blit_param, false, dst_image->layout[0].ubwc,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, dst_offset, src_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
         ops->dst(cs, &dst, i, format);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}
2437 
2438 template <chip CHIP>
2439 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage2(VkCommandBuffer commandBuffer,const VkCopyImageInfo2 * pCopyImageInfo)2440 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
2441                  const VkCopyImageInfo2 *pCopyImageInfo)
2442 {
2443    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2444    TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2445    TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2446 
2447    for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
2448       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2449          VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2450          u_foreach_bit(b, info.dstSubresource.aspectMask) {
2451             info.srcSubresource.aspectMask = BIT(b);
2452             info.dstSubresource.aspectMask = BIT(b);
2453             tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
2454          }
2455          continue;
2456       }
2457 
2458       tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
2459                              pCopyImageInfo->pRegions + i);
2460    }
2461 
2462    if (dst_image->lrz_height) {
2463       tu_disable_lrz(cmd, &cmd->cs, dst_image);
2464    }
2465 }
2466 TU_GENX(tu_CmdCopyImage2);
2467 
2468 template <chip CHIP>
2469 static void
copy_buffer(struct tu_cmd_buffer * cmd,uint64_t dst_va,uint64_t src_va,uint64_t size,uint32_t block_size)2470 copy_buffer(struct tu_cmd_buffer *cmd,
2471             uint64_t dst_va,
2472             uint64_t src_va,
2473             uint64_t size,
2474             uint32_t block_size)
2475 {
2476    const struct blit_ops *ops = &r2d_ops<CHIP>;
2477    struct tu_cs *cs = &cmd->cs;
2478    enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2479    uint64_t blocks = size / block_size;
2480 
2481    ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2482               VK_SAMPLE_COUNT_1_BIT);
2483 
2484    while (blocks) {
2485       uint32_t src_x = (src_va & 63) / block_size;
2486       uint32_t dst_x = (dst_va & 63) / block_size;
2487       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2488 
2489       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2490       ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
2491       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
2492       ops->run(cmd, cs);
2493 
2494       src_va += width * block_size;
2495       dst_va += width * block_size;
2496       blocks -= width;
2497    }
2498 
2499    ops->teardown(cmd, cs);
2500 }
2501 
2502 template <chip CHIP>
2503 VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,const VkCopyBufferInfo2 * pCopyBufferInfo)2504 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
2505                   const VkCopyBufferInfo2 *pCopyBufferInfo)
2506 {
2507    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2508    TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2509    TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2510 
2511    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2512       const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2513       copy_buffer<CHIP>(cmd,
2514                   dst_buffer->iova + region->dstOffset,
2515                   src_buffer->iova + region->srcOffset,
2516                   region->size, 1);
2517    }
2518 }
2519 TU_GENX(tu_CmdCopyBuffer2);
2520 
2521 template <chip CHIP>
2522 VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,VkBuffer dstBuffer,VkDeviceSize dstOffset,VkDeviceSize dataSize,const void * pData)2523 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2524                    VkBuffer dstBuffer,
2525                    VkDeviceSize dstOffset,
2526                    VkDeviceSize dataSize,
2527                    const void *pData)
2528 {
2529    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2530    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2531 
2532    struct tu_cs_memory tmp;
2533    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2534    if (result != VK_SUCCESS) {
2535       vk_command_buffer_set_error(&cmd->vk, result);
2536       return;
2537    }
2538 
2539    memcpy(tmp.map, pData, dataSize);
2540    copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
2541 }
2542 TU_GENX(tu_CmdUpdateBuffer);
2543 
2544 template <chip CHIP>
2545 VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,VkBuffer dstBuffer,VkDeviceSize dstOffset,VkDeviceSize fillSize,uint32_t data)2546 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2547                  VkBuffer dstBuffer,
2548                  VkDeviceSize dstOffset,
2549                  VkDeviceSize fillSize,
2550                  uint32_t data)
2551 {
2552    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2553    TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2554    const struct blit_ops *ops = &r2d_ops<CHIP>;
2555    struct tu_cs *cs = &cmd->cs;
2556 
2557    fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
2558 
2559    uint64_t dst_va = buffer->iova + dstOffset;
2560    uint32_t blocks = fillSize / 4;
2561 
2562    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2563               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2564               VK_SAMPLE_COUNT_1_BIT);
2565 
2566    VkClearValue clear_val = {};
2567    clear_val.color.uint32[0] = data;
2568    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
2569 
2570    while (blocks) {
2571       uint32_t dst_x = (dst_va & 63) / 4;
2572       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2573 
2574       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2575       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
2576       ops->run(cmd, cs);
2577 
2578       dst_va += width * 4;
2579       blocks -= width;
2580    }
2581 
2582    ops->teardown(cmd, cs);
2583 }
2584 TU_GENX(tu_CmdFillBuffer);
2585 
2586 template <chip CHIP>
2587 VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage2(VkCommandBuffer commandBuffer,const VkResolveImageInfo2 * pResolveImageInfo)2588 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
2589                     const VkResolveImageInfo2 *pResolveImageInfo)
2590 {
2591    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2592    TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2593    TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2594    const struct blit_ops *ops = &r2d_ops<CHIP>;
2595    struct tu_cs *cs = &cmd->cs;
2596 
2597    enum pipe_format src_format =
2598       tu_vk_format_to_pipe_format(src_image->vk.format);
2599    enum pipe_format dst_format =
2600       tu_vk_format_to_pipe_format(dst_image->vk.format);
2601    ops->setup(cmd, cs, src_format, dst_format,
2602               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2603               VK_SAMPLE_COUNT_1_BIT);
2604 
2605    for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2606       const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2607       uint32_t layers = MAX2(info->extent.depth,
2608                              vk_image_subresource_layer_count(&dst_image->vk,
2609                                                               &info->dstSubresource));
2610 
2611       /* TODO: aspect masks possible ? */
2612 
2613       coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
2614 
2615       struct fdl6_view dst, src;
2616       tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2617       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2618 
2619       for (uint32_t i = 0; i < layers; i++) {
2620          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2621          ops->dst(cs, &dst, i, src_format);
2622          ops->run(cmd, cs);
2623       }
2624    }
2625 
2626    ops->teardown(cmd, cs);
2627 }
2628 TU_GENX(tu_CmdResolveImage2);
2629 
/* Iterate `layer` over the bits set in a multiview layer_mask, or over
 * [0, layers) when layer_mask is 0 (the non-multiview case).
 */
#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
        layer++) \
      if (!layer_mask || (layer_mask & BIT(layer)))
2635 
/* Emits a sysmem resolve blit from src to dst over `rect`, one layer at a
 * time. For separate-plane D32_S8 images, the caller passes the
 * single-aspect formats and the *_separate_ds flags so the right plane is
 * selected on each side.
 */
template <chip CHIP>
static void
resolve_sysmem(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_src_format,
               VkFormat vk_dst_format,
               const struct tu_image_view *src,
               const struct tu_image_view *dst,
               uint32_t layer_mask,
               uint32_t layers,
               const VkRect2D *rect,
               bool src_separate_ds,
               bool dst_separate_ds)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);

   enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format);
   enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format);

   ops->setup(cmd, cs, src_format, dst_format,
              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
              VK_SAMPLE_COUNT_1_BIT);
   /* Resolve is 1:1 — same offset on both sides. */
   ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);

   for_each_layer(i, layer_mask, layers) {
      if (src_separate_ds) {
         /* Pick the source plane; depth is used when either side of the
          * resolve is D32_SFLOAT. */
         if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
            r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
         } else {
            r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
         }
      } else {
         /* NOTE(review): src() receives dst_format and dst() receives
          * src_format — the same crossed pattern as tu_CmdResolveImage2;
          * confirm against the blit_ops contract before "fixing". */
         ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
      }

      if (dst_separate_ds) {
         if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
            ops->dst_depth(cs, dst, i);
         } else {
            ops->dst_stencil(cs, dst, i);
         }
      } else {
         ops->dst(cs, &dst->view, i, src_format);
      }

      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_resolve(&cmd->trace, cs);
}
2690 
2691 template <chip CHIP>
2692 void
tu_resolve_sysmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * src,const struct tu_image_view * dst,uint32_t layer_mask,uint32_t layers,const VkRect2D * rect)2693 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
2694                   struct tu_cs *cs,
2695                   const struct tu_image_view *src,
2696                   const struct tu_image_view *dst,
2697                   uint32_t layer_mask,
2698                   uint32_t layers,
2699                   const VkRect2D *rect)
2700 {
2701    assert(src->image->vk.format == dst->image->vk.format ||
2702           (vk_format_is_depth_or_stencil(src->image->vk.format) &&
2703            vk_format_is_depth_or_stencil(dst->image->vk.format)));
2704 
2705    bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2706    bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
2707 
2708    if (dst_separate_ds) {
2709       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
2710                      src, dst, layer_mask, layers, rect,
2711                      src_separate_ds, dst_separate_ds);
2712       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
2713                      src, dst, layer_mask, layers, rect,
2714                      src_separate_ds, dst_separate_ds);
2715    } else {
2716       resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
2717                      src, dst, layer_mask, layers, rect,
2718                      src_separate_ds, dst_separate_ds);
2719    }
2720 }
2721 TU_GENX(tu_resolve_sysmem);
2722 
/* Clears the given mip/layer range of an image to clear_value. aspect_mask
 * selects a single plane for multi-planar formats; callers split combined
 * D32_S8 masks (see tu_CmdClearDepthStencilImage).
 */
template <chip CHIP>
static void
clear_image(struct tu_cmd_buffer *cmd,
            struct tu_image *image,
            const VkClearValue *clear_value,
            const VkImageSubresourceRange *range,
            VkImageAspectFlags aspect_mask)
{
   uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
   uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
   struct tu_cs *cs = &cmd->cs;
   enum pipe_format format;
   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
      /* E5B9G9R9 is cleared through an R32_UINT view — presumably because
       * it can't be rendered directly; the clear value is still packed as
       * rgb9e5 (see the clear_value call below). */
      format = PIPE_FORMAT_R32_UINT;
   } else {
      format = tu6_plane_format(image->vk.format,
                                tu6_plane_index(image->vk.format,
                                                aspect_mask));
   }

   /* 3D images are cleared slice-by-slice below, so the range must cover
    * the whole (single) array layer. */
   if (image->layout[0].depth0 > 1) {
      assert(layer_count == 1);
      assert(range->baseArrayLayer == 0);
   }

   /* MSAA images need the draw-based 3D path. */
   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;

   ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
              (VkSampleCountFlagBits) image->layout[0].nr_samples);
   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
   else
      ops->clear_value(cmd, cs, format, clear_value);

   for (unsigned j = 0; j < level_count; j++) {
      /* For 3D images the number of slices shrinks with each mip level. */
      if (image->layout[0].depth0 > 1)
         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);

      ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
                  });

      struct fdl6_view dst;
      const VkImageSubresourceLayers subresource = {
         .aspectMask = aspect_mask,
         .mipLevel = range->baseMipLevel + j,
         .baseArrayLayer = range->baseArrayLayer,
         .layerCount = 1,
      };
      tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);

      /* One blit per layer (or per 3D slice). */
      for (uint32_t i = 0; i < layer_count; i++) {
         ops->dst(cs, &dst, i, format);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}
2783 
2784 template <chip CHIP>
2785 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,VkImage image_h,VkImageLayout imageLayout,const VkClearColorValue * pColor,uint32_t rangeCount,const VkImageSubresourceRange * pRanges)2786 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
2787                       VkImage image_h,
2788                       VkImageLayout imageLayout,
2789                       const VkClearColorValue *pColor,
2790                       uint32_t rangeCount,
2791                       const VkImageSubresourceRange *pRanges)
2792 {
2793    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2794    TU_FROM_HANDLE(tu_image, image, image_h);
2795 
2796    for (unsigned i = 0; i < rangeCount; i++)
2797       clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
2798 }
2799 TU_GENX(tu_CmdClearColorImage);
2800 
2801 template <chip CHIP>
2802 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,VkImage image_h,VkImageLayout imageLayout,const VkClearDepthStencilValue * pDepthStencil,uint32_t rangeCount,const VkImageSubresourceRange * pRanges)2803 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
2804                              VkImage image_h,
2805                              VkImageLayout imageLayout,
2806                              const VkClearDepthStencilValue *pDepthStencil,
2807                              uint32_t rangeCount,
2808                              const VkImageSubresourceRange *pRanges)
2809 {
2810    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2811    TU_FROM_HANDLE(tu_image, image, image_h);
2812 
2813    for (unsigned i = 0; i < rangeCount; i++) {
2814       const VkImageSubresourceRange *range = &pRanges[i];
2815 
2816       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2817          /* can't clear both depth and stencil at once, split up the aspect mask */
2818          u_foreach_bit(b, range->aspectMask)
2819             clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
2820          continue;
2821       }
2822 
2823       clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
2824    }
2825 
2826    tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
2827 }
2828 TU_GENX(tu_CmdClearDepthStencilImage);
2829 
2830 template <chip CHIP>
2831 static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer * cmd,uint32_t attachment_count,const VkClearAttachment * attachments,uint32_t rect_count,const VkClearRect * rects)2832 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
2833                             uint32_t attachment_count,
2834                             const VkClearAttachment *attachments,
2835                             uint32_t rect_count,
2836                             const VkClearRect *rects)
2837 {
2838    /* the shader path here is special, it avoids changing MRT/etc state */
2839    const struct tu_subpass *subpass = cmd->state.subpass;
2840    const uint32_t mrt_count = subpass->color_count;
2841    struct tu_cs *cs = &cmd->draw_cs;
2842    uint32_t clear_value[MAX_RTS][4];
2843    float z_clear_val = 0.0f;
2844    uint8_t s_clear_val = 0;
2845    uint32_t clear_rts = 0, clear_components = 0;
2846    bool z_clear = false;
2847    bool s_clear = false;
2848 
2849    trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
2850 
2851    for (uint32_t i = 0; i < attachment_count; i++) {
2852       uint32_t a;
2853       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
2854          uint32_t c = attachments[i].colorAttachment;
2855          a = subpass->color_attachments[c].attachment;
2856          if (a == VK_ATTACHMENT_UNUSED)
2857             continue;
2858 
2859          clear_rts |= 1 << c;
2860          clear_components |= 0xf << (c * 4);
2861          memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
2862       } else {
2863          a = subpass->depth_stencil_attachment.attachment;
2864          if (a == VK_ATTACHMENT_UNUSED)
2865             continue;
2866 
2867          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2868             z_clear = true;
2869             z_clear_val = attachments[i].clearValue.depthStencil.depth;
2870          }
2871 
2872          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
2873             s_clear = true;
2874             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
2875          }
2876       }
2877    }
2878 
2879    /* We may not know the multisample count if there are no attachments, so
2880     * just bail early to avoid corner cases later.
2881     */
2882    if (clear_rts == 0 && !z_clear && !s_clear)
2883       return;
2884 
2885    /* disable all draw states so they don't interfere
2886     * TODO: use and re-use draw states
2887     * we have to disable draw states individually to preserve
2888     * input attachment states, because a secondary command buffer
2889     * won't be able to restore them
2890     */
2891    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
2892    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
2893       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
2894           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
2895          continue;
2896       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
2897                      CP_SET_DRAW_STATE__0_DISABLE);
2898       tu_cs_emit_qw(cs, 0);
2899    }
2900    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
2901 
2902    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
2903    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
2904                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
2905                   0xfc000000);
2906    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
2907 
2908    r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
2909 
2910    /* Disable sample counting in order to not affect occlusion query. */
2911    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
2912 
2913    if (cmd->state.prim_generated_query_running_before_rp) {
2914       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
2915    }
2916 
2917    tu_cs_emit_regs(cs,
2918                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
2919    tu_cs_emit_regs(cs,
2920                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
2921 
2922    tu_cs_emit_regs(cs,
2923                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
2924 
2925    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
2926    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
2927    for (uint32_t i = 0; i < mrt_count; i++) {
2928       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
2929             .component_enable = COND(clear_rts & (1 << i), 0xf)));
2930    }
2931 
2932    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
2933    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
2934 
2935    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
2936    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
2937          .z_test_enable = z_clear,
2938          .z_write_enable = z_clear,
2939          .zfunc = FUNC_ALWAYS));
2940    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
2941    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
2942    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
2943          .stencil_enable = s_clear,
2944          .func = FUNC_ALWAYS,
2945          .zpass = STENCIL_REPLACE));
2946    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
2947    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
2948    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
2949    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
2950 
2951    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
2952 
2953    unsigned num_rts = util_bitcount(clear_rts);
2954    uint32_t packed_clear_value[MAX_RTS][4];
2955 
2956    uint32_t idx = 0;
2957    u_foreach_bit(b, clear_rts) {
2958       memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
2959       idx++;
2960    }
2961 
2962    if (num_rts > 0)
2963       tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
2964                                 0, packed_clear_value, num_rts);
2965 
2966    for (uint32_t i = 0; i < rect_count; i++) {
2967       /* This should be true because of this valid usage for
2968        * vkCmdClearAttachments:
2969        *
2970        *    "If the render pass instance this is recorded in uses multiview,
2971        *    then baseArrayLayer must be zero and layerCount must be one"
2972        */
2973       assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
2974 
2975       /* a630 doesn't support multiview masks, which means that we can't use
2976        * the normal multiview path without potentially recompiling a shader
2977        * on-demand or using a more complicated variant that takes the mask as
2978        * a const. Just use the layered path instead, since it shouldn't be
2979        * much worse.
2980        */
2981       for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
2982       {
2983          const float coords[] = {
2984             rects[i].rect.offset.x,
2985             rects[i].rect.offset.y,
2986             z_clear_val,
2987             uif(rects[i].baseArrayLayer + layer),
2988             rects[i].rect.offset.x + rects[i].rect.extent.width,
2989             rects[i].rect.offset.y + rects[i].rect.extent.height,
2990             z_clear_val,
2991             1.0f,
2992          };
2993 
2994          r3d_coords_raw(cmd, cs, coords);
2995          r3d_run_vis(cmd, cs);
2996       }
2997    }
2998 
2999    /* Re-enable sample counting. */
3000    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
3001 
3002    if (cmd->state.prim_generated_query_running_before_rp) {
3003       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
3004    }
3005 
3006    trace_end_sysmem_clear_all(&cmd->trace, cs);
3007 }
3008 
/* Pack a VkClearValue into the raw dword layout the GMEM blit/clear hardware
 * consumes (RB_BLIT_CLEAR_COLOR_DW0..3). Depth/stencil formats are packed
 * explicitly; color formats are dispatched to util_format packers keyed off
 * the bit size of the red channel (the same trick format_to_ifmt uses).
 * Only as many dwords as the format needs are written; callers must
 * zero-initialize clear_value.
 */
static void
pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
{
   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* 24-bit unorm depth in the low bits, stencil in the top byte. */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case PIPE_FORMAT_Z16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case PIPE_FORMAT_Z32_FLOAT:
      /* Raw IEEE float bits. */
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   /* Color path: convert RGB (not alpha) to sRGB space first if needed,
    * since the packers below expect values in the destination space. */
   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (util_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

/* Pack one pixel of tmp into clear_value using the named util_format packer. */
#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      /* Distinguish 565 from 5551 by the green channel's width. */
      if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (util_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         /* Pure-integer 8-bit formats come straight from the uint32 value. */
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (util_format_is_pure_integer(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      /* Shared-exponent packed float; alpha is dropped. */
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (util_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (util_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      /* 32-bit channels are stored verbatim (float or integer bits). */
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   case 0:
      /* A8 has no RGB channels, so get_component_bits reports 0. */
      assert(format == PIPE_FORMAT_A8_UNORM);
      PACK_F(a8_unorm);
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}
3089 
3090 template <chip CHIP>
3091 static void
clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,enum pipe_format format,uint8_t clear_mask,uint32_t gmem_offset,const VkClearValue * value)3092 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3093                       struct tu_cs *cs,
3094                       enum pipe_format format,
3095                       uint8_t clear_mask,
3096                       uint32_t gmem_offset,
3097                       const VkClearValue *value)
3098 {
3099    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
3100    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
3101             blit_base_format(format, false)));
3102 
3103    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));
3104 
3105    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
3106    tu_cs_emit(cs, gmem_offset);
3107 
3108    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3109    tu_cs_emit(cs, 0);
3110 
3111    uint32_t clear_vals[4] = {};
3112    pack_gmem_clear_value(value, format, clear_vals);
3113 
3114    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3115    tu_cs_emit_array(cs, clear_vals, 4);
3116 
3117    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
3118 }
3119 
3120 template <chip CHIP>
3121 static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t attachment,uint32_t base_layer,uint32_t layers,uint32_t layer_mask,VkImageAspectFlags mask,const VkClearValue * value)3122 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3123                               struct tu_cs *cs,
3124                               uint32_t attachment,
3125                               uint32_t base_layer,
3126                               uint32_t layers,
3127                               uint32_t layer_mask,
3128                               VkImageAspectFlags mask,
3129                               const VkClearValue *value)
3130 {
3131    const struct tu_render_pass_attachment *att =
3132       &cmd->state.pass->attachments[attachment];
3133 
3134    trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
3135 
3136    tu_cs_emit_regs(cs,
3137                    A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
3138 
3139    enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
3140    for_each_layer(i, layer_mask, layers) {
3141       uint32_t layer = i + base_layer;
3142       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3143          if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3144             clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
3145                                   tu_attachment_gmem_offset(cmd, att, layer), value);
3146          }
3147          if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3148             clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
3149                                   tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
3150          }
3151       } else {
3152          clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
3153                                tu_attachment_gmem_offset(cmd, att, layer), value);
3154       }
3155    }
3156 
3157    trace_end_gmem_clear(&cmd->trace, cs);
3158 }
3159 
3160 template <chip CHIP>
3161 static void
tu_clear_gmem_attachments(struct tu_cmd_buffer * cmd,uint32_t attachment_count,const VkClearAttachment * attachments,uint32_t rect_count,const VkClearRect * rects)3162 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
3163                           uint32_t attachment_count,
3164                           const VkClearAttachment *attachments,
3165                           uint32_t rect_count,
3166                           const VkClearRect *rects)
3167 {
3168    const struct tu_subpass *subpass = cmd->state.subpass;
3169    struct tu_cs *cs = &cmd->draw_cs;
3170 
3171    if (rect_count > 1)
3172       perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
3173 
3174    for (unsigned i = 0; i < rect_count; i++) {
3175       unsigned x1 = rects[i].rect.offset.x;
3176       unsigned y1 = rects[i].rect.offset.y;
3177       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
3178       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
3179 
3180       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
3181       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
3182       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
3183 
3184       for (unsigned j = 0; j < attachment_count; j++) {
3185          uint32_t a;
3186          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
3187             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
3188          else
3189             a = subpass->depth_stencil_attachment.attachment;
3190 
3191          if (a == VK_ATTACHMENT_UNUSED)
3192                continue;
3193 
3194          tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
3195                                        rects[i].layerCount,
3196                                        subpass->multiview_mask,
3197                                        attachments[j].aspectMask,
3198                                        &attachments[j].clearValue);
3199       }
3200    }
3201 }
3202 
3203 template <chip CHIP>
3204 VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,uint32_t attachmentCount,const VkClearAttachment * pAttachments,uint32_t rectCount,const VkClearRect * pRects)3205 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
3206                        uint32_t attachmentCount,
3207                        const VkClearAttachment *pAttachments,
3208                        uint32_t rectCount,
3209                        const VkClearRect *pRects)
3210 {
3211    TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3212    struct tu_cs *cs = &cmd->draw_cs;
3213 
3214    /* sysmem path behaves like a draw, note we don't have a way of using different
3215     * flushes for sysmem/gmem, so this needs to be outside of the cond_exec
3216     */
3217    tu_emit_cache_flush_renderpass<CHIP>(cmd);
3218 
3219    for (uint32_t j = 0; j < attachmentCount; j++) {
3220       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
3221          continue;
3222 
3223       tu_lrz_disable_during_renderpass(cmd);
3224    }
3225 
3226    /* vkCmdClearAttachments is supposed to respect the predicate if active. The
3227     * easiest way to do this is to always use the 3d path, which always works
3228     * even with GMEM because it's just a simple draw using the existing
3229     * attachment state.
3230     *
3231     * Similarly, we also use the 3D path when in a secondary command buffer that
3232     * doesn't know the GMEM layout that will be chosen by the primary.
3233     */
3234    if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
3235       tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3236       return;
3237    }
3238 
3239    /* If we could skip tile load/stores based on any draws intersecting them at
3240     * binning time, then emit the clear as a 3D draw so that it contributes to
3241     * that visibility.
3242    */
3243    const struct tu_subpass *subpass = cmd->state.subpass;
3244    for (uint32_t i = 0; i < attachmentCount; i++) {
3245       uint32_t a;
3246       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3247          uint32_t c = pAttachments[i].colorAttachment;
3248          a = subpass->color_attachments[c].attachment;
3249       } else {
3250          a = subpass->depth_stencil_attachment.attachment;
3251       }
3252       if (a != VK_ATTACHMENT_UNUSED) {
3253          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3254          if (att->cond_load_allowed || att->cond_store_allowed) {
3255             tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3256             return;
3257          }
3258       }
3259    }
3260 
3261    /* Otherwise, emit 2D blits for gmem rendering. */
3262    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
3263    tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3264    tu_cond_exec_end(cs);
3265 
3266    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3267    tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3268    tu_cond_exec_end(cs);
3269 }
3270 TU_GENX(tu_CmdClearAttachments);
3271 
3272 template <chip CHIP>
3273 static void
clear_sysmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,VkFormat vk_format,VkImageAspectFlags clear_mask,uint32_t a,bool separate_ds)3274 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3275                         struct tu_cs *cs,
3276                         VkFormat vk_format,
3277                         VkImageAspectFlags clear_mask,
3278                         uint32_t a,
3279                         bool separate_ds)
3280 {
3281    enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
3282    const struct tu_framebuffer *fb = cmd->state.framebuffer;
3283    const struct tu_image_view *iview = cmd->state.attachments[a];
3284    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
3285    const struct blit_ops *ops = &r2d_ops<CHIP>;
3286    const VkClearValue *value = &cmd->state.clear_values[a];
3287    if (cmd->state.pass->attachments[a].samples > 1)
3288       ops = &r3d_ops<CHIP>;
3289 
3290    trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
3291                             cmd->state.pass->attachments[a].samples);
3292 
3293    ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
3294               cmd->state.pass->attachments[a].samples);
3295    ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
3296                cmd->state.render_area.extent);
3297    ops->clear_value(cmd, cs, format, value);
3298 
3299    for_each_layer(i, clear_views, fb->layers) {
3300       if (separate_ds) {
3301          if (vk_format == VK_FORMAT_D32_SFLOAT) {
3302             ops->dst_depth(cs, iview, i);
3303          } else {
3304             ops->dst_stencil(cs, iview, i);
3305          }
3306       } else {
3307          ops->dst(cs, &iview->view, i, format);
3308       }
3309       ops->run(cmd, cs);
3310    }
3311 
3312    ops->teardown(cmd, cs);
3313 
3314    trace_end_sysmem_clear(&cmd->trace, cs);
3315 }
3316 
3317 template <chip CHIP>
3318 void
tu_clear_sysmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a)3319 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3320                            struct tu_cs *cs,
3321                            uint32_t a)
3322 {
3323    const struct tu_render_pass_attachment *attachment =
3324       &cmd->state.pass->attachments[a];
3325 
3326    if (!attachment->clear_mask)
3327       return;
3328 
3329    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3330       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3331          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
3332                                  a, true);
3333       }
3334       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3335          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
3336                                  a, true);
3337       }
3338    } else {
3339       clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
3340                               a, false);
3341    }
3342 
3343    /* The spec doesn't explicitly say, but presumably the initial renderpass
3344     * clear is considered part of the renderpass, and therefore barriers
3345     * aren't required inside the subpass/renderpass.  Therefore we need to
3346     * flush CCU color into CCU depth here, just like with
3347     * vkCmdClearAttachments(). Note that because this only happens at the
3348     * beginning of a renderpass, and renderpass writes are considered
3349     * "incoherent", we shouldn't have to worry about syncing depth into color
3350     * beforehand as depth should already be flushed.
3351     */
3352    if (vk_format_is_depth_or_stencil(attachment->format)) {
3353       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
3354       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_DEPTH);
3355       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
3356    } else {
3357       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
3358       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
3359    }
3360 
3361    tu_cs_emit_wfi(cs);
3362 }
3363 TU_GENX(tu_clear_sysmem_attachment);
3364 
3365 template <chip CHIP>
3366 void
tu_clear_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a)3367 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3368                          struct tu_cs *cs,
3369                          uint32_t a)
3370 {
3371    const struct tu_render_pass_attachment *attachment =
3372       &cmd->state.pass->attachments[a];
3373 
3374    if (!attachment->clear_mask)
3375       return;
3376 
3377    tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
3378                                  attachment->clear_views,
3379                                  attachment->clear_mask,
3380                                  &cmd->state.clear_values[a]);
3381 }
3382 TU_GENX(tu_clear_gmem_attachment);
3383 
/* Emit a BLIT event per layer to move an attachment between sysmem and GMEM:
 * load (resolve=false, sysmem -> gmem) or store/resolve (resolve=true,
 * gmem -> sysmem). For D32_S8 images, separate_stencil selects whether the
 * depth plane or the separate stencil plane is transferred.
 */
template <chip CHIP>
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   /* Loads (.gmem=1) read sysmem into gmem; integer and depth/stencil
    * resolves pick sample 0 instead of averaging. */
   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      .sample_0 = vk_format_is_int(attachment->format) ||
         vk_format_is_depth_or_stencil(attachment->format),
      .depth = vk_format_is_depth_or_stencil(attachment->format),));

   for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
      /* RB_BLIT_DST_INFO, RB_BLIT_DST lo/hi, RB_BLIT_DST_PITCH. */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
      if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         if (!separate_stencil) {
            tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
            tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * i);
            /* NOTE(review): uses the A6XX_RB_2D_DST_PITCH encoder where the
             * stencil branch uses A6XX_RB_BLIT_DST_PITCH — presumably the two
             * registers share a bit layout; confirm against the register
             * definitions. */
            tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

            tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
            tu_cs_image_flag_ref(cs, &iview->view, i);
         } else {
            /* The separate stencil plane carries no UBWC flags. */
            tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
            tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * i);
            tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(iview->stencil_pitch).value);
         }
      } else {
         tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
         tu_cs_image_ref_2d<CHIP>(cs, &iview->view, i, false);

         tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
         tu_cs_image_flag_ref(cs, &iview->view, i);
      }

      /* gmem-side address for this layer; stencil has its own offset. */
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
            tu_cs_emit_regs(cs,
                           A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment, i)));
      } else {
         tu_cs_emit_regs(cs,
                        A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment, i)));
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
      tu_cs_emit(cs, 0);

      tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
   }
}
3440 
3441 static bool
blit_can_resolve(VkFormat format)3442 blit_can_resolve(VkFormat format)
3443 {
3444    const struct util_format_description *desc = vk_format_description(format);
3445 
3446    /* blit event can only do resolve for simple cases:
3447     * averaging samples as unsigned integers or choosing only one sample
3448     * Note this is allowed for SRGB formats, but results differ from 2D draw resolve
3449     */
3450    if (vk_format_is_snorm(format))
3451       return false;
3452 
3453    /* can't do formats with larger channel sizes
3454     * note: this includes all float formats
3455     * note2: single channel integer formats seem OK
3456     */
3457    if (desc->channel[0].size > 10)
3458       return false;
3459 
3460    switch (format) {
3461    /* for unknown reasons blit event can't msaa resolve these formats when tiled
3462     * likely related to these formats having different layout from other cpp=2 formats
3463     */
3464    case VK_FORMAT_R8G8_UNORM:
3465    case VK_FORMAT_R8G8_UINT:
3466    case VK_FORMAT_R8G8_SINT:
3467    case VK_FORMAT_R8G8_SRGB:
3468    /* TODO: this one should be able to work? */
3469    case VK_FORMAT_D24_UNORM_S8_UINT:
3470       return false;
3471    default:
3472       break;
3473    }
3474 
3475    return true;
3476 }
3477 
/* Payload for the FDM bin patchpoint used by gmem loads: carries the view
 * index that fdm_apply_load_coords reads to pick its fragment area. */
struct apply_load_coords_state {
   unsigned view;   /* index into the patchpoint's frag_areas array */
};
3481 
3482 static void
fdm_apply_load_coords(struct tu_cmd_buffer * cmd,struct tu_cs * cs,void * data,VkRect2D bin,unsigned views,VkExtent2D * frag_areas)3483 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
3484                       struct tu_cs *cs,
3485                       void *data,
3486                       VkRect2D bin,
3487                       unsigned views,
3488                       VkExtent2D *frag_areas)
3489 {
3490    const struct apply_load_coords_state *state =
3491       (const struct apply_load_coords_state *)data;
3492    assert(state->view < views);
3493    VkExtent2D frag_area = frag_areas[state->view];
3494 
3495    assert(bin.extent.width % frag_area.width == 0);
3496    assert(bin.extent.height % frag_area.height == 0);
3497    uint32_t scaled_width = bin.extent.width / frag_area.width;
3498    uint32_t scaled_height = bin.extent.height / frag_area.height;
3499 
3500    const float coords[] = {
3501       bin.offset.x,                    bin.offset.y,
3502       bin.offset.x,                    bin.offset.y,
3503       bin.offset.x + scaled_width,     bin.offset.y + scaled_height,
3504       bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
3505    };
3506    r3d_coords_raw(cmd, cs, coords);
3507 }
3508 
/* Load one gmem attachment from sysmem with a 3D draw instead of the BLIT
 * event (used for TU_DEBUG=3d_load and for FDM render passes, where per-bin
 * coordinates must be patched in). For D32_S8 images, separate_stencil picks
 * which plane to load.
 */
template <chip CHIP>
static void
load_3d_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *att,
             bool separate_stencil)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   enum pipe_format format = iview->view.format;
   /* Separate depth/stencil planes are loaded as their plane format. */
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (separate_stencil)
         format = PIPE_FORMAT_S8_UINT;
      else
         format = PIPE_FORMAT_Z32_FLOAT;
   }
   r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
                   R3D_DST_GMEM, false, iview->view.ubwc_enabled,
                   iview->image->vk.samples);

   /* With FDM the coordinates are emitted per bin via the patchpoint below;
    * otherwise a single full-framebuffer rect suffices. */
   if (!cmd->state.pass->has_fdm) {
      r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
                 (VkExtent2D) { fb->width, fb->height });
   }

   /* Normal loads read directly from system memory, so we have to invalidate
    * UCHE in case it contains stale data.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
      if (cmd->state.pass->has_fdm) {
         struct apply_load_coords_state state = {
            .view = att->clear_views ? i : 0,
         };
         /* 1 + 3 + 8 dwords reserved for the coords emitted by the
          * callback — presumably packet header + register writes; confirm
          * against r3d_coords_raw's emission size. */
         tu_create_fdm_bin_patchpoint(cmd, cs, 1 + 3 + 8, fdm_apply_load_coords, state);
      }

      r3d_dst_gmem(cmd, cs, iview, att, separate_stencil, i);

      /* Source is the sysmem image; D32_S8 has per-plane source helpers. */
      if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         if (separate_stencil)
            r3d_src_stencil(cmd, cs, iview, i);
         else
            r3d_src_depth(cmd, cs, iview, i);
      } else {
         r3d_src_gmem_load(cmd, cs, iview, i);
      }

      r3d_run(cmd, cs);
   }

   r3d_teardown<CHIP>(cmd, cs);

   /* It seems we need to WFI here for depth/stencil because color writes here
    * aren't synchronized with depth/stencil writes.
    *
    * Note: the blob also uses a WFI for color attachments but this hasn't
    * been seen to be necessary.
    */
   if (vk_format_is_depth_or_stencil(att->format))
      tu_cs_emit_wfi(cs);
}
3575 
3576 static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer * cmd,struct tu_cs * cs,bool load)3577 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
3578                               struct tu_cs *cs, bool load)
3579 {
3580    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
3581 
3582    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
3583       return;
3584 
3585    uint64_t result_iova;
3586    if (load)
3587       result_iova = global_iova(cmd, dbg_gmem_taken_loads);
3588    else
3589       result_iova = global_iova(cmd, dbg_gmem_taken_stores);
3590 
3591    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
3592    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
3593    tu_cs_emit_qw(cs, result_iova);
3594    tu_cs_emit_qw(cs, result_iova);
3595    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
3596 }
3597 
3598 static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer * cmd,struct tu_cs * cs,bool load)3599 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
3600                             struct tu_cs *cs, bool load)
3601 {
3602    tu_cond_exec_end(cs);
3603 
3604    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
3605       return;
3606 
3607    uint64_t result_iova;
3608    if (load)
3609       result_iova = global_iova(cmd, dbg_gmem_total_loads);
3610    else
3611       result_iova = global_iova(cmd, dbg_gmem_total_stores);
3612 
3613    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
3614    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
3615    tu_cs_emit_qw(cs, result_iova);
3616    tu_cs_emit_qw(cs, result_iova);
3617    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
3618 }
3619 
3620 template <chip CHIP>
3621 void
tu_load_gmem_attachment(struct tu_cmd_buffer * cmd,struct tu_cs * cs,uint32_t a,bool cond_exec_allowed,bool force_load)3622 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
3623                         struct tu_cs *cs,
3624                         uint32_t a,
3625                         bool cond_exec_allowed,
3626                         bool force_load)
3627 {
3628    const struct tu_image_view *iview = cmd->state.attachments[a];
3629    const struct tu_render_pass_attachment *attachment =
3630       &cmd->state.pass->attachments[a];
3631 
3632    bool load_common = attachment->load || force_load;
3633    bool load_stencil =
3634       attachment->load_stencil ||
3635       (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
3636 
3637    if (!load_common && !load_stencil)
3638       return;
3639 
3640    trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
3641 
3642    /* If attachment will be cleared by vkCmdClearAttachments - it is likely
3643     * that it would be partially cleared, and since it is done by 2d blit
3644     * it doesn't produce geometry, so we have to unconditionally load.
3645     *
3646     * To simplify conditions treat partially cleared separate DS as fully
3647     * cleared and don't emit cond_exec.
3648     */
3649    bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
3650    if (cond_exec)
3651       tu_begin_load_store_cond_exec(cmd, cs, true);
3652 
3653    if (TU_DEBUG(3D_LOAD) ||
3654        cmd->state.pass->has_fdm) {
3655       if (load_common || load_stencil)
3656          tu_disable_draw_states(cmd, cs);
3657 
3658       if (load_common)
3659          load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
3660 
3661       if (load_stencil)
3662          load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
3663    } else {
3664       if (load_common)
3665          tu_emit_blit<CHIP>(cmd, cs, iview, attachment, false, false);
3666 
3667       if (load_stencil)
3668          tu_emit_blit<CHIP>(cmd, cs, iview, attachment, false, true);
3669    }
3670 
3671    if (cond_exec)
3672       tu_end_load_store_cond_exec(cmd, cs, true);
3673 
3674    trace_end_gmem_load(&cmd->trace, cs);
3675 }
3676 TU_GENX(tu_load_gmem_attachment);
3677 
/* Store one layer of a GMEM attachment to sysmem using the 2D engine
 * (BLIT_OP_SCALE), reading the pixels directly out of GMEM. This is the
 * single-sampled slow-path counterpart to the CP_EVENT_WRITE::BLIT fast
 * path used by tu_store_gmem_attachment().
 *
 * samples   - source sample count (averaged on resolve for non-integer,
 *             non-depth/stencil formats, see samples_average below)
 * gmem_offset - byte offset of this layer's data within GMEM
 * cpp       - bytes per pixel of the GMEM data, used to derive the pitch
 */
template <chip CHIP>
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              uint32_t layer,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
                          VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                          iview->view.ubwc_enabled, true);

   /* D32_SFLOAT_S8_UINT images have separate depth and stencil planes, so
    * the destination has to be set up per-plane rather than via the
    * generic image-view path.
    */
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r2d_dst_depth(cs, iview, layer);
      } else {
         r2d_dst_stencil(cs, iview, layer);
      }
   } else {
      r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
   }

   enum a6xx_format fmt = blit_format_texture(src_format, TILE6_2).fmt;
   fixup_src_format(&src_format, dst_format, &fmt);

   /* Source is the tile buffer in GMEM: TILE6_2 layout, addressed from the
    * GMEM base, with a pitch of one tile row of pixels.
    */
   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = fmt,
                      .tile_mode = TILE6_2,
                      .color_swap = WZYX,
                      .srgb = util_format_is_srgb(src_format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !util_format_is_pure_integer(dst_format) &&
                                         !util_format_is_depth_or_stencil(dst_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
                   SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r2d_run(cmd, cs);

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
}
3737 
/* Store one layer of a GMEM attachment to sysmem with a 3D draw instead of
 * the 2D engine. Used for multisampled destinations, where the 2D blitter
 * can't produce the required per-sample output (see the dst->samples > 1
 * branch in tu_store_gmem_attachment()).
 *
 * dst_samples - destination sample count passed through to r3d_setup
 * gmem_offset - byte offset of this layer's data within GMEM
 * cpp         - bytes per pixel of the GMEM data
 */
template <chip CHIP>
static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              VkSampleCountFlagBits dst_samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              const VkRect2D *render_area,
              uint32_t layer,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
    * aren't set until we know whether we're HW binning or not, and we want to
    * avoid a dependence on that here to be able to store attachments before
    * the end of the renderpass in the future. Use the scratch space to
    * save/restore them dynamically.
    */
   tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
   tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_REG_TO_SCRATCH_0_SCRATCH(0) |
                  CP_REG_TO_SCRATCH_0_CNT(1 - 1));
   if (CHIP >= A7XX) {
      /* A7XX additionally needs RB_UNKNOWN_8812 preserved across the blit. */
      tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
      tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
                     CP_REG_TO_SCRATCH_0_SCRATCH(1) |
                     CP_REG_TO_SCRATCH_0_CNT(1 - 1));
   }

   r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
                   0, false, iview->view.ubwc_enabled, dst_samples);

   r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);

   /* D32_SFLOAT_S8_UINT has separate depth/stencil planes; pick the
    * plane-specific destination setup.
    */
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r3d_dst_depth(cs, iview, layer);
      } else {
         r3d_dst_stencil(cs, iview, layer);
      }
   } else {
      r3d_dst(cs, &iview->view, layer, src_format);
   }

   /* Source the draw's texture reads from the tile buffer in GMEM. */
   r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);

   /* sync GMEM writes with CACHE. */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r3d_run(cmd, cs);

   r3d_teardown<CHIP>(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);

   /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   /* NOTE(review): both BIN_CONTROL registers are restored from the same
    * scratch slot 0 — presumably RB and GRAS bin control hold identical
    * values here; confirm against where they're programmed.
    */
   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
      tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
                        CP_SCRATCH_TO_REG_0_SCRATCH(1) |
                        CP_SCRATCH_TO_REG_0_CNT(1 - 1));
   }
}
3821 
3822 static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer * cmd,uint32_t a)3823 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
3824 {
3825    struct tu_physical_device *phys_dev = cmd->device->physical_device;
3826    const struct tu_image_view *iview = cmd->state.attachments[a];
3827    const VkRect2D *render_area = &cmd->state.render_area;
3828 
3829    /* Unaligned store is incredibly rare in CTS, we have to force it to test. */
3830    if (TU_DEBUG(UNALIGNED_STORE))
3831       return true;
3832 
3833    /* We always use the unaligned store path when scaling rendering. */
3834    if (cmd->state.pass->has_fdm)
3835       return true;
3836 
3837    uint32_t x1 = render_area->offset.x;
3838    uint32_t y1 = render_area->offset.y;
3839    uint32_t x2 = x1 + render_area->extent.width;
3840    uint32_t y2 = y1 + render_area->extent.height;
3841    /* x2/y2 can be unaligned if equal to the size of the image, since it will
3842     * write into padding space. The one exception is linear levels which don't
3843     * have the required y padding in the layout (except for the last level)
3844     */
3845    bool need_y2_align =
3846       y2 != iview->view.height || iview->view.need_y2_align;
3847 
3848    return (x1 % phys_dev->info->gmem_align_w ||
3849            (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
3850            y1 % phys_dev->info->gmem_align_h ||
3851            (y2 % phys_dev->info->gmem_align_h && need_y2_align));
3852 }
3853 
3854 /* Choose the GMEM layout (use the CCU space or not) based on whether the
3855  * current attachments will need.  This has to happen at vkBeginRenderPass()
3856  * time because tu_attachment_store_unaligned() looks at the image views, which
3857  * are only available at that point.  This should match the logic for the
3858  * !unaligned case in tu_store_gmem_attachment().
3859  */
3860 void
tu_choose_gmem_layout(struct tu_cmd_buffer * cmd)3861 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
3862 {
3863    cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
3864 
3865    for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
3866       if (!cmd->state.attachments[i])
3867          continue;
3868 
3869       struct tu_render_pass_attachment *att =
3870          &cmd->state.pass->attachments[i];
3871       if ((att->store || att->store_stencil) &&
3872           tu_attachment_store_unaligned(cmd, i))
3873          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
3874       if (att->will_be_resolved && !blit_can_resolve(att->format))
3875          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
3876    }
3877 
3878    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
3879 }
3880 
/* Payload for fdm_apply_store_coords() patchpoints: which view's fragment
 * area to apply when rewriting the store coordinates for a bin.
 */
struct apply_store_coords_state {
   unsigned view;
};
3884 
/* FDM patchpoint callback: rewrite the 2D blit src/dst coordinates for one
 * bin so the store upscales the fragment-density-scaled bin contents back
 * to the full-resolution destination (dst covers the whole bin, src covers
 * the smaller scaled region).
 */
static void
fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       void *data,
                       VkRect2D bin,
                       unsigned views,
                       VkExtent2D *frag_areas)
{
   const struct apply_store_coords_state *state =
      (const struct apply_store_coords_state *)data;
   assert(state->view < views);
   VkExtent2D frag_area = frag_areas[state->view];

   /* The bin width/height must be a multiple of the frag_area to make sure
    * that the scaling happens correctly. This means there may be some
    * destination pixels jut out of the framebuffer, but they should be
    * clipped by the render area.
    */
   assert(bin.extent.width % frag_area.width == 0);
   assert(bin.extent.height % frag_area.height == 0);
   uint32_t scaled_width = bin.extent.width / frag_area.width;
   uint32_t scaled_height = bin.extent.height / frag_area.height;

   /* Destination: the full bin rectangle in the image. */
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
                          .y = bin.offset.y),
      A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
                          .y = bin.offset.y + bin.extent.height - 1));
   /* Source: the scaled-down region actually rendered in GMEM. */
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
                   A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
                   A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
}
3919 
/* Emit the commands that store GMEM attachment 'gmem_a' out to the sysmem
 * image of attachment 'a' (they differ when this store doubles as a
 * resolve). Picks between the fast CP_EVENT_WRITE::BLIT path and the
 * slower 2D (store_cp_blit) / 3D (store_3d_blit) paths based on render
 * area alignment, sample count, and format constraints.
 *
 * layers/layer_mask  - which layers/views to store
 * cond_exec_allowed  - whether the store may be wrapped in CP_COND_REG_EXEC
 *                      so it's skipped for untouched tiles
 */
template <chip CHIP>
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a,
                         uint32_t layers,
                         uint32_t layer_mask,
                         bool cond_exec_allowed)
{
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   /* Nothing to do if neither aspect is stored. */
   if (!dst->store && !dst->store_stencil)
      return;

   bool unaligned = tu_attachment_store_unaligned(cmd, a);

   /* D32_SFLOAT_S8_UINT is quite special format: it has two planes,
    * one for depth and other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   /* The fast path doesn't support picking out the last component of a D24S8
    * texture reinterpreted as RGBA8_UNORM.
    */
   bool resolve_d24s8_s8 =
      src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   bool store_common = dst->store && !resolve_d32s8_s8;
   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;

   bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
                        (a == gmem_a || blit_can_resolve(dst->format));

   trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);

   /* Unconditional store should happen only if attachment was cleared,
    * which could have happened either by load_op or via vkCmdClearAttachments.
    */
   bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
   if (cond_exec) {
      tu_begin_load_store_cond_exec(cmd, cs, false);
   }

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (use_fast_path) {
      if (store_common)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, true, false);
      if (store_separate_stencil)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, true, true);

      if (cond_exec) {
         tu_end_load_store_cond_exec(cmd, cs, false);
      }

      trace_end_gmem_store(&cmd->trace, cs);
      return;
   }

   /* The slow paths require the CCU-avoiding GMEM layout chosen in
    * tu_choose_gmem_layout().
    */
   assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);

   /* Z32_S8X24 is stored as its depth plane; stencil goes separately. */
   enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
   if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      src_format = PIPE_FORMAT_Z32_FLOAT;

   enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format);
   if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      dst_format = PIPE_FORMAT_Z32_FLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (store_common || store_separate_stencil)
         tu_disable_draw_states(cmd, cs);

      for_each_layer(i, layer_mask, layers) {
         if (store_common) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
                          dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
         }
         if (store_separate_stencil) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                          PIPE_FORMAT_S8_UINT, render_area, i,
                          tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
         }
      }
   } else {
      if (!cmd->state.pass->has_fdm) {
         r2d_coords(cmd, cs, render_area->offset, render_area->offset,
                    render_area->extent);
      } else {
         /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
          * area and the coordinates span the entire render area, but for
          * FDM we need to scale the coordinates so we need to take the
          * opposite aproach, specifying the exact bin size in the destination
          * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the render
          * area.
          */
         tu_cs_emit_regs(cs,
                         A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
                                                     .y = render_area->offset.y,),
                         A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
                                                     .y = render_area->offset.y + render_area->extent.height - 1,));
      }

      for_each_layer (i, layer_mask, layers) {
         if (cmd->state.pass->has_fdm) {
            /* Patch the per-bin coordinates at bin time; the coordinates
             * depend on each bin's fragment area.
             */
            unsigned view = layer_mask ? i : 0;
            struct apply_store_coords_state state = {
               .view = view,
            };
            tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
                                         state);
         }
         if (store_common) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
                          dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
         }
         if (store_separate_stencil) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                          PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
         }
      }
   }

   if (cond_exec) {
      tu_end_load_store_cond_exec(cmd, cs, false);
   }

   trace_end_gmem_store(&cmd->trace, cs);
}
4063 TU_GENX(tu_store_gmem_attachment);
4064