/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_buffer.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"
#include "tu_lrz.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

static const VkOffset2D blt_no_coord = { ~0, ~0 };

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}
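
/* Illustrative note (not part of the original code): the helper clamps to
 * [0, 1], scales to the maximum N-bit unorm code and rounds to nearest-even,
 * so e.g. tu_pack_float32_for_unorm(1.0f, 8) yields 255 and
 * tu_pack_float32_for_unorm(0.5f, 24) yields 8388608 (0x800000).
 */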

/* r2d_ = BLIT_OP_SCALE operations */

static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
   }
}
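
/* For instance, under this mapping PIPE_FORMAT_R10G10B10A2_UNORM (10-bit,
 * non-integer red channel) would resolve to R2D_FLOAT16, while
 * PIPE_FORMAT_R32G32B32A32_UINT (32-bit pure-integer red channel) would
 * resolve to R2D_INT32. This is just an illustration of the switch above.
 */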

template <chip CHIP>
static struct tu_native_format
blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode, bool is_mutable, bool gmem)
{
   struct tu_native_format fmt = tu6_format_texture(format, tile_mode, is_mutable);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* As in fdl6_view_init, we want to use
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
       * src. Since this is called when there is no image and thus no ubwc,
       * we can always use FMT6_8_8_8_8_UNORM.
       *
       * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
       * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
       * instead.
       */
      fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static struct tu_native_format
blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_color(format, tile_mode, false);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* similar to blit_format_texture but for blit dst */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

template <chip CHIP>
static enum a6xx_format
blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
{
   if (CHIP >= A7XX && gmem)
      /* A7XX requires D24S8 in GMEM to always be treated as
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of whether the image
       * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
       * in misrendering around the edges of the destination image.
       */
      ubwc = true;

   if (ubwc) {
      switch (format) {
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
         return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
      default:
         break;
      }
   }

   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return blit_format_color(format, TILE6_LINEAR).fmt;
}

static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_DST_TL(.x = dst.x, .y = dst.y),
                   A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}
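
/* The TL/BR rectangle registers are inclusive on both ends, hence the "- 1"
 * above.  As an example, copying a 32x32 region to dst (0, 0) programs
 * DST_TL = (0, 0) and DST_BR = (31, 31).
 */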

static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}
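
/* Worked example (illustration only): clearing D24S8 with depth = 0.5 and
 * stencil = 0x42 packs the 24-bit depth code 0x800000, so the code above
 * produces clear_value = { 0x800000, 0x8000, 0x80, 0x42 }; each successive
 * element carries the next-higher depth byte in its low bits, with stencil
 * in the last element.
 */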

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}

template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}

template <chip CHIP>
static void
r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         enum pipe_format format,
                         uint64_t va,
                         uint32_t pitch,
                         uint32_t width,
                         uint32_t height,
                         enum pipe_format dst_format)
{
   /* This functionality is only allowed on A7XX; the assertion statically
    * prevents calling this function on earlier generations by mistake.
    */
   static_assert(CHIP >= A7XX);

   struct tu_native_format fmt =
      blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
   va &= ~0x3f;
   tu_cs_emit_regs(cs,
                   A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
                                         .start_offset_texels = offset_texels,
                                         .type = A6XX_TEX_IMG_BUFFER));

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP, .color_format = color_format,
                                     .color_swap = fmt.swap,
                                     .srgb = util_format_is_srgb(format),
                                     .unk20 = 1, .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}
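
/* Example of the alignment fixup above (illustration only): for an R8G8B8A8
 * source (4-byte texels) at va = ...1008, the low six bits are 0x08, so
 * offset_texels becomes 2 and va is rounded down to the 64-byte boundary at
 * ...1000, with start_offset_texels pointing the 2D engine at where the data
 * actually begins.
 */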

template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
      (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01); // TODO: seems to always be 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
      .rotate = (enum a6xx_rotation) blit_param,
      .solid_color = clear,
      .color_format = fmt,
      .scissor = scissor,
      .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
      .mask = 0xf,
      .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
   ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      tu_cs_emit_regs(cs, A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
                                                .start_offset_texels = 0,
                                                .type = A6XX_TEX_2D));
   }

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
                          .sint = util_format_is_pure_sint(dst_format),
                          .uint = util_format_is_pure_uint(dst_format),
                          .color_format = fmt,
                          .srgb = util_format_is_srgb(dst_format),
                          .mask = 0xf));
}

template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This is a non-context register, so we have to WFI before changing. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}

/* r3d_ = shader path operations */

static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
                             .base = base);
}

static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                  nir_channel(b, pos, 1),
                  nir_imm_float(b, 0.0),
                  nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}
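
/* Sketch of the const layout consumed by the blit VS above, derived from the
 * load_const() bases (given in scalar components of vec4 consts):
 *   c0.xy = vertex 0 position    (base 0)
 *   c0.zw = vertex 0 texcoords   (base 2)
 *   c1.xy = vertex 1 position    (base 4)
 *   c1.zw = vertex 1 texcoords   (base 6)
 *   c4.x  = z texcoord, z-scale path only (base 16)
 * r3d_coords_raw()/r3d_coord_z() below upload data in this layout.
 */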

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                  nir_channel(b, pos, 1),
                  depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}
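
/* Corresponding const layout for the clear VS, again derived from the
 * load_const() bases: c0 = (x0, y0, depth, layer) and c1.xy = (x1, y1), so
 * callers are expected to pack the depth clear value and destination layer
 * into the z/w components of the first vec4.
 */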

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}
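
/* In other words, the clear FS expects one vec4 clear color per render
 * target, with MRT i read from const c[i] (base 4 * i); r3d_clear_value()
 * below uploads a single vec4 at c0 for the one-MRT case used here.
 */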

static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   struct ir3_const_allocations const_allocs = {};
   if (consts > 0)
      ir3_const_alloc(&const_allocs, IR3_CONST_ALLOC_UBO_RANGES, align(consts, 8), 1);

   const struct ir3_shader_options options = {
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
      .const_allocs = const_allocs,
      .fragdata_dynamic_remap =
         idx >= GLOBAL_SH_VS_CLEAR && idx <= GLOBAL_SH_FS_CLEAR_MAX,
   };

   ir3_finalize_nir(dev->compiler, &options.nir_options, nir);

   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}
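
/* Rough picture of what compile_shader() builds up: each blit/clear shader
 * binary is appended to the tu6_global::shaders array in the global BO, its
 * GPU address is remembered in dev->global_shader_va[idx], and the running
 * offset is padded to a 32-dword boundary (presumably to keep each shader's
 * start suitably aligned for instruction fetch).
 */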

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}
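
/* The "consts" arguments above line up with the const layouts sketched
 * earlier: 3 vec4 for the blit VS (two vertices of position + texcoords; the
 * z-scale slot at c4 still fits because compile_shader() aligns the
 * allocation up to 8 vec4), 2 vec4 for the clear VS, and one vec4 per render
 * target for each clear FS variant.
 */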

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

enum r3d_type {
   R3D_CLEAR,
   R3D_BLIT,
   R3D_COPY_HALF,
};

template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicalization for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));

   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1,));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());

      tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
            .raster_mode = TYPE_TILED,
            .raster_direction = LR_TB));
      tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
      tu_cs_emit_regs(cs, A6XX_PC_DGEN_SU_CONSERVATIVE_RAS_CNTL());
      tu_cs_emit_regs(cs, A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL());
   }

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}

static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t opcode,
                          enum a6xx_state_block block,
                          uint32_t offset,
                          const void *consts,
                          uint32_t size_vec4)
{
   assert(offset % cmd->device->compiler->const_upload_unit == 0);

   struct tu_cs_memory mem = {};
   VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));

   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
   tu_cs_emit_qw(cs, mem.iova);
}
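
/* Both offset and size_vec4 are in units of vec4 constants: the payload is
 * staged in sub_cs memory and loaded indirectly by the CP, so e.g. the eight
 * floats uploaded by r3d_coords_raw() below go out as size_vec4 = 2 starting
 * at const c0.
 */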

static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
{
   const uint32_t coord[] = {
      fui(z),
      0,
      0,
      0,
   };

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
}

static void
r3d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   const bool has_src = src.x != blt_no_coord.x;
   int32_t src_x1 = has_src ? src.x : 0;
   int32_t src_y1 = has_src ? src.y : 0;

   const float coords[] = {
      dst.x,
      dst.y,
      src_x1,
      src_y1,
      dst.x + extent.width,
      dst.y + extent.height,
      src_x1 + extent.width,
      src_y1 + extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}
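
/* These eight floats map onto the blit VS consts described above: c0 holds
 * (dst.x, dst.y, src.x, src.y) for the first rectlist vertex and c1 the
 * corresponding values for the opposite corner; the rectlist primitive only
 * needs those two vertices.
 */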

static void
r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   uint32_t coords[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      coords[0] = fui((tmp & 0xff) / 255.0f);
      coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
      coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
      coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      coords[0] = fui(val->depthStencil.depth);
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   case PIPE_FORMAT_S8_UINT:
      coords[0] = val->depthStencil.stencil & 0xff;
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   default:
      /* color formats use the clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
      break;
   }

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
}

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}
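
/* Layout of the sub_cs allocation used above: the first A6XX_TEX_CONST_DWORDS
 * dwords hold the (patched) texture descriptor and the second
 * A6XX_TEX_CONST_DWORDS hold the sampler, which is why the sampler loads use
 * texture.iova + A6XX_TEX_CONST_DWORDS * 4 while the descriptor loads use
 * texture.iova directly.
 */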

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   enum a6xx_format fmt = (enum a6xx_format)(
      (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);
   desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
      A6XX_TEX_CONST_0_FMT(fmt);

   r3d_src_common(cmd, cs, desc,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

template <chip CHIP>
static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   desc[0] =
      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(color_format) |
      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->depth_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
      (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
   desc[4] = va;
   desc[5] = va >> 32;

   r3d_src_common(cmd, cs, desc,
                  iview->depth_layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->stencil_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
   desc[4] = va;
   desc[5] = va >> 32;
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));

   /* Fixup D24 formats because we always load both depth and stencil. */
   enum pipe_format format = iview->view.format;
   if (format == PIPE_FORMAT_X24S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM ||
       format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
      if (iview->view.ubwc_enabled)
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
      else
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
   }

   /* When loading/storing GMEM we always load the full image without any
    * swizzling or swapping; that is done in the draw when reading/writing
    * GMEM. So we need to fix up the swizzle and swap here.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   r3d_src_common(cmd, cs, desc,
                  iview->view.layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

template <chip CHIP>
static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             enum pipe_format format,
             enum pipe_format dst_format,
             uint32_t gmem_offset,
             uint32_t cpp)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->view.descriptor, sizeof(desc));

   enum a6xx_format fmt =
      blit_format_texture<CHIP>(format, TILE6_2,
                                iview->view.is_mutable, true).fmt;
   fixup_src_format(&format, dst_format, &fmt);

   /* patch the format so that depth/stencil get the right format and swizzle */
   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   /* patched for gmem */
   desc[0] &= ~A6XX_TEX_CONST_0_TILE_MODE__MASK;
   if (!iview->view.is_mutable)
      desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
   desc[2] =
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
      A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
   desc[3] = 0;
   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

template <chip CHIP>
static void
r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;

   enum a6xx_format fmt = (enum a6xx_format)(
      mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);
   mrt_buf_info =
      (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
      A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
                   A6XX_RB_MRT_PITCH(0, iview->pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
                   A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
                   A6XX_RB_MRT_BASE_GMEM(0),
   );

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
    * FMT6_NV12_Y.
    */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));

   tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

template <chip CHIP>
static void
r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
                   A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
                   A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
                   A6XX_RB_MRT_BASE_GMEM(0),
   );

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);

   tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

template <chip CHIP>
static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
                   A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
                   A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
                   A6XX_RB_MRT_BASE_GMEM(0),
   );

   tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

template <chip CHIP>
static void
r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);

   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);

   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, .qword = va),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

template <chip CHIP>
static void
r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *att,
             bool separate_stencil, unsigned layer)
{
   unsigned RB_MRT_BUF_INFO;
   unsigned gmem_offset;

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
      } else {
         RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
      }
   } else {
      RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, 0),
                   A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));

   enum a6xx_format color_format =
      (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));

   tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}
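
/* In the _AS_R8G8B8A8 view the three depth bytes live in the r/g/b channels
 * and stencil in alpha, so 0x7 writes only depth and 0x8 writes only stencil;
 * this matches how r2d_clear_value()/r3d_clear_value() pack the clear color.
 */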

static uint8_t
aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8 */
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x1;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x2;
      if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
         mask = 0x3;
   }
   return mask;
}
1541
1542 enum r3d_blit_param {
1543 R3D_Z_SCALE = 1 << 0,
1544 R3D_DST_GMEM = 1 << 1,
1545 R3D_COPY = 1 << 2,
1546 };
1547
1548 template <chip CHIP>
1549 static void
1550 r3d_setup(struct tu_cmd_buffer *cmd,
1551 struct tu_cs *cs,
1552 enum pipe_format src_format,
1553 enum pipe_format dst_format,
1554 VkImageAspectFlags aspect_mask,
1555 unsigned blit_param,
1556 bool clear,
1557 bool ubwc,
1558 VkSampleCountFlagBits samples)
1559 {
1560 if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1561 tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1562 }
1563
1564 enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1565 fixup_dst_format(src_format, &dst_format, &fmt);
1566
1567 if (!cmd->state.pass) {
1568 tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1569 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1570 }
1571
1572 if (!(blit_param & R3D_DST_GMEM)) {
1573 if (CHIP == A6XX) {
1574 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1575 } else {
1576 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1577 }
1578
1579 tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1580
1581 if (CHIP >= A7XX) {
1582 tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1583 tu_cs_emit_regs(cs,
1584 A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1585 }
1586 }
1587
1588 enum r3d_type type;
1589 if (clear) {
1590 type = R3D_CLEAR;
1591 } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1592 /* Avoid canonicalizing NaNs in copies by using the special half-float
1593 * path that uses half regs.
1594 */
1595 type = R3D_COPY_HALF;
1596 } else {
1597 type = R3D_BLIT;
1598 }
1599
1600 r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1601
1602 tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1603 tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1604 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1605 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1606
1607 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1608 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1609 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1610 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1611 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1612 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1613 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1614 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1615 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1616
1617 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1618 .color_format = fmt,
1619 .color_sint = util_format_is_pure_sint(dst_format),
1620 .color_uint = util_format_is_pure_uint(dst_format)));
1621
1622 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1623 .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1624 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1625 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1626
1627 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1628 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1629
1630 if (CHIP >= A7XX) {
1631 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CNTL2(0));
1632 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1633
1634 tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
1635 tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
1636 tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
1637 }
1638
1639 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1640 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1641
1642 /* Disable sample counting in order to not affect occlusion query. */
1643 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1644
1645 tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1646 if (CHIP >= A7XX) {
1647 tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1648 }
1649
1650 if (cmd->state.prim_generated_query_running_before_rp) {
1651 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1652 }
1653
1654 if (cmd->state.predication_active) {
1655 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1656 tu_cs_emit(cs, 0);
1657 }
1658 }
1659
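/* Emit the draw for the 3D path: a two-vertex RECTLIST with the visibility
 * stream ignored.
 */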
1660 static void
1661 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1662 {
1663 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1664 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1665 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1666 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1667 tu_cs_emit(cs, 1); /* instance count */
1668 tu_cs_emit(cs, 2); /* vertex count */
1669 }
1670
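/* Same RECTLIST draw, but honoring the visibility stream. */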
1671 static void
1672 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1673 {
1674 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1675 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1676 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1677 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1678 tu_cs_emit(cs, 1); /* instance count */
1679 tu_cs_emit(cs, 2); /* vertex count */
1680 }
1681
1682 template <chip CHIP>
1683 static void
1684 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1685 {
1686 if (cmd->state.predication_active) {
1687 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1688 tu_cs_emit(cs, 1);
1689 }
1690
1691 /* Re-enable sample counting. */
1692 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1693
1694 if (cmd->state.prim_generated_query_running_before_rp) {
1695 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1696 }
1697 }
1698
1699 /* blit ops - common interface for 2d/shader paths */
1700
1701 struct blit_ops {
1702 void (*coords)(struct tu_cmd_buffer *cmd,
1703 struct tu_cs *cs,
1704 const VkOffset2D dst,
1705 const VkOffset2D src,
1706 const VkExtent2D extent);
1707 void (*clear_value)(struct tu_cmd_buffer *cmd,
1708 struct tu_cs *cs,
1709 enum pipe_format format,
1710 const VkClearValue *val);
1711 void (*src)(
1712 struct tu_cmd_buffer *cmd,
1713 struct tu_cs *cs,
1714 const struct fdl6_view *iview,
1715 uint32_t layer,
1716 VkFilter filter,
1717 enum pipe_format dst_format);
1718 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1719 enum pipe_format format,
1720 uint64_t va, uint32_t pitch,
1721 uint32_t width, uint32_t height,
1722 enum pipe_format dst_format);
1723 void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1724 enum pipe_format src_format);
1725 void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1726 void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1727 void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1728 enum pipe_format src_format);
1729 void (*setup)(struct tu_cmd_buffer *cmd,
1730 struct tu_cs *cs,
1731 enum pipe_format src_format,
1732 enum pipe_format dst_format,
1733 VkImageAspectFlags aspect_mask,
1734 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1735 bool clear,
1736 bool ubwc,
1737 VkSampleCountFlagBits samples);
1738 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1739 void (*teardown)(struct tu_cmd_buffer *cmd,
1740 struct tu_cs *cs);
1741 };
1742
1743 template <chip CHIP>
1744 static const struct blit_ops r2d_ops = {
1745 .coords = r2d_coords,
1746 .clear_value = r2d_clear_value,
1747 .src = r2d_src<CHIP>,
1748 .src_buffer = r2d_src_buffer<CHIP>,
1749 .dst = r2d_dst<CHIP>,
1750 .dst_depth = r2d_dst_depth,
1751 .dst_stencil = r2d_dst_stencil,
1752 .dst_buffer = r2d_dst_buffer,
1753 .setup = r2d_setup<CHIP>,
1754 .run = r2d_run,
1755 .teardown = r2d_teardown,
1756 };
1757
1758 template <chip CHIP>
1759 static const struct blit_ops r3d_ops = {
1760 .coords = r3d_coords,
1761 .clear_value = r3d_clear_value,
1762 .src = r3d_src,
1763 .src_buffer = r3d_src_buffer<CHIP>,
1764 .dst = r3d_dst<CHIP>,
1765 .dst_depth = r3d_dst_depth<CHIP>,
1766 .dst_stencil = r3d_dst_stencil<CHIP>,
1767 .dst_buffer = r3d_dst_buffer<CHIP>,
1768 .setup = r3d_setup<CHIP>,
1769 .run = r3d_run,
1770 .teardown = r3d_teardown<CHIP>,
1771 };
1772
1773 /* passthrough set coords from 3D extents */
1774 static void
1775 coords(const struct blit_ops *ops,
1776 struct tu_cmd_buffer *cmd,
1777 struct tu_cs *cs,
1778 const VkOffset3D dst,
1779 const VkOffset3D src,
1780 const VkExtent3D extent)
1781 {
1782 ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1783 (VkExtent2D) {extent.width, extent.height});
1784 }
1785
1786 /* Decides the format to treat our data as for a memcpy-style blit. We have
1787  * to be a bit careful because we have to pick a format with matching UBWC
1788  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT
1789  * for everything.
1790 */
1791 static enum pipe_format
1792 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1793 {
1794 if (vk_format_is_compressed(vk_format)) {
1795 switch (vk_format_get_blocksize(vk_format)) {
1796 case 1: return PIPE_FORMAT_R8_UINT;
1797 case 2: return PIPE_FORMAT_R16_UINT;
1798 case 4: return PIPE_FORMAT_R32_UINT;
1799 case 8: return PIPE_FORMAT_R32G32_UINT;
1800 case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1801 default:
1802 unreachable("unhandled format size");
1803 }
1804 }
1805
1806 enum pipe_format format = vk_format_to_pipe_format(vk_format);
1807
1808 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
1809 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1810 * (also -1.0), when we're supposed to be memcpying the bits. See
1811 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1812 */
1813 format = util_format_snorm_to_unorm(format);
1814
1815 if (vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1816 return PIPE_FORMAT_R32_UINT;
1817
1818 /* For VK_FORMAT_D32_SFLOAT_S8_UINT and YCbCr formats use our existing helpers */
1819 if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1820 vk_format_get_ycbcr_info(vk_format))
1821 return tu_aspects_to_plane(vk_format, aspect_mask);
1822
1823 /* Otherwise, simply return the pipe_format */
1824 return format;
1825 }
1826
1827 static void
1828 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1829 {
1830 switch (format) {
1831 case PIPE_FORMAT_Z24X8_UNORM:
1832 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1833 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1834 val->depthStencil.stencil << 24;
1835 return;
1836 case PIPE_FORMAT_Z16_UNORM:
1837 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1838 return;
1839 case PIPE_FORMAT_Z32_FLOAT:
1840 clear_value[0] = fui(val->depthStencil.depth);
1841 return;
1842 case PIPE_FORMAT_S8_UINT:
1843 clear_value[0] = val->depthStencil.stencil;
1844 return;
1845 default:
1846 break;
1847 }
1848
1849 float tmp[4];
1850 memcpy(tmp, val->color.float32, 4 * sizeof(float));
1851 if (util_format_is_srgb(format)) {
1852 for (int i = 0; i < 3; i++)
1853 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1854 }
1855
1856 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1857 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1858 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1859 case 4:
1860 PACK_F(r4g4b4a4_unorm);
1861 break;
1862 case 5:
1863 if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1864 PACK_F(r5g6b5_unorm);
1865 else
1866 PACK_F(r5g5b5a1_unorm);
1867 break;
1868 case 8:
1869 if (util_format_is_snorm(format))
1870 PACK_F(r8g8b8a8_snorm);
1871 else if (util_format_is_unorm(format))
1872 PACK_F(r8g8b8a8_unorm);
1873 else
1874 pack_int8(clear_value, val->color.uint32);
1875 break;
1876 case 10:
1877 if (util_format_is_pure_integer(format))
1878 pack_int10_2(clear_value, val->color.uint32);
1879 else
1880 PACK_F(r10g10b10a2_unorm);
1881 break;
1882 case 11:
1883 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1884 break;
1885 case 16:
1886 if (util_format_is_snorm(format))
1887 PACK_F(r16g16b16a16_snorm);
1888 else if (util_format_is_unorm(format))
1889 PACK_F(r16g16b16a16_unorm);
1890 else if (util_format_is_float(format))
1891 PACK_F(r16g16b16a16_float);
1892 else
1893 pack_int16(clear_value, val->color.uint32);
1894 break;
1895 case 32:
1896 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1897 break;
1898 case 0:
1899 assert(format == PIPE_FORMAT_A8_UNORM);
1900 PACK_F(a8_unorm);
1901 break;
1902 default:
1903 unreachable("unexpected channel size");
1904 }
1905 #undef PACK_F
1906 }
1907
1908 static void
1909 event_blit_setup(struct tu_cs *cs,
1910 uint32_t buffer_id,
1911 const struct tu_render_pass_attachment *att,
1912 enum a6xx_blit_event_type blit_event_type,
1913 uint32_t clear_mask)
1914 {
1915 tu_cs_emit_regs(
1916 cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1917
1918 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1919 tu_cs_emit(cs, 0);
1920
1921 tu_cs_emit_regs(
1922 cs,
1923 A6XX_RB_BLIT_INFO(.type = blit_event_type,
1924 .sample_0 =
1925 vk_format_is_int(att->format) ||
1926 vk_format_is_depth_or_stencil(att->format),
1927 .depth = vk_format_is_depth_or_stencil(att->format),
1928 .clear_mask = clear_mask,
1929 .buffer_id = buffer_id));
1930 }
1931
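/* Destination description for an event blit: the image view plus, for
 * D32_S8, explicit per-layer depth/stencil addresses and pitches.
 */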
1932 struct event_blit_dst_view {
1933 const struct tu_image *image;
1934 const struct fdl6_view *view;
1935
1936 uint32_t layer;
1937
1938 uint64_t depth_addr;
1939 uint32_t depth_pitch;
1940
1941 uint64_t stencil_addr;
1942 uint32_t stencil_pitch;
1943 };
1944
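/* Wrap a tu_image_view as an event_blit_dst_view for a single layer. */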
1945 static event_blit_dst_view
1946 blt_view_from_tu_view(const struct tu_image_view *iview,
1947 uint32_t layer)
1948 {
1949 struct event_blit_dst_view blt_view;
1950 blt_view.image = iview->image;
1951 blt_view.view = &iview->view;
1952 blt_view.layer = layer;
1953
1954 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1955 blt_view.depth_addr =
1956 iview->depth_base_addr + iview->depth_layer_size * layer;
1957 blt_view.depth_pitch = iview->depth_pitch;
1958
1959 blt_view.stencil_addr =
1960 iview->stencil_base_addr + iview->stencil_layer_size * layer;
1961 blt_view.stencil_pitch = iview->stencil_pitch;
1962 }
1963 return blt_view;
1964 }
1965
1966 template <chip CHIP>
1967 static void
1968 event_blit_run(struct tu_cmd_buffer *cmd,
1969 struct tu_cs *cs,
1970 const struct tu_render_pass_attachment *att,
1971 const event_blit_dst_view *blt_view,
1972 bool separate_stencil)
1973 {
1974 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1975 if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1976 if (!separate_stencil) {
1977 tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1978 tu_cs_emit_qw(cs, blt_view->depth_addr);
1979 tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1980
1981 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1982 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1983 } else {
1984 tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1985 ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1986 tu_cs_emit_qw(cs, blt_view->stencil_addr);
1987 tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1988 }
1989 } else {
1990 tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1991 tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1992
1993 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1994 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1995 }
1996
1997 if (att) {
1998 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1999 tu_cs_emit_regs(
2000 cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
2001 cmd, att, blt_view->layer)));
2002 } else {
2003 tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
2004 cmd, att, blt_view->layer)));
2005 }
2006 }
2007
2008 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
2009 }
2010
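/* A7XX generic clear path: pack the clear value, then run a BLIT_EVENT_CLEAR
 * event blit on one layer of the given attachment.
 */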
2011 static void
2012 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
2013 struct tu_cs *cs,
2014 uint32_t buffer_id,
2015 enum pipe_format format,
2016 uint8_t clear_mask,
2017 bool separate_stencil,
2018 uint32_t layer,
2019 const VkClearValue *value,
2020 uint32_t a)
2021 {
2022 const struct tu_render_pass_attachment *att =
2023 &cmd->state.pass->attachments[a];
2024 const struct tu_image_view *iview = cmd->state.attachments[a];
2025
2026 uint32_t clear_vals[4] = {};
2027 pack_blit_event_clear_value(value, format, clear_vals);
2028
2029 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2030 tu_cs_emit_array(cs, clear_vals, 4);
2031
2032 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
2033
2034 event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
2035 event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
2036 }
2037
2038
2039
2040 /* Buffer copies/fills/updates go through the CCU but need additional
2041  * synchronization when the write range is not aligned to 64 bytes, because
2042  * dst buffer accesses use either R8_UNORM or R32_UINT and those are not
2043  * coherent with each other in the CCU, since the format seems to be part of
2044  * the cache key.
2045 *
2046 * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
2047 *
2048  * Synchronization with writes from UCHE (e.g. SSBO stores) is handled by
2049  * the fact that UCHE has byte-level dirtiness tracking and that a CCU flush
2050  * always happens before the UCHE flush in such cases (e.g. both renderpass
2051  * and dispatch would flush a pending CCU write).
2052 *
2053 * Additionally see:
2054 * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2055 */
2056 template <chip CHIP>
2057 static void
2058 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2059 uint64_t dst_va,
2060 uint64_t size,
2061 bool *unaligned_store)
2062 {
2063 if (*unaligned_store)
2064 return;
2065
2066 if ((dst_va & 63) || (size & 63)) {
2067 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2068 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2069 /* Wait for invalidations to land. */
2070 cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2071 tu_emit_cache_flush<CHIP>(cmd);
2072 *unaligned_store = true;
2073 }
2074 }
2075
2076 template <chip CHIP>
2077 static void
2078 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2079 bool unaligned_store)
2080 {
2081 if (unaligned_store) {
2082 tu_flush_for_access(&cmd->state.cache,
2083 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2084 TU_ACCESS_NONE);
2085 }
2086 }
2087
2088 template <chip CHIP>
2089 void
2090 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2091 struct tu_cs *cs,
2092 struct tu_image *image,
2093 const VkClearValue *value)
2094 {
2095 const struct blit_ops *ops = &r2d_ops<CHIP>;
2096
2097 /* It is assumed that the LRZ cache has been invalidated at this point, so
2098  * that the writes done here become visible to LRZ.
2099  *
2100  * LRZ writes go through the UCHE cache, so flush UCHE before changing LRZ
2101  * via the CCU. There is no need to invalidate the CCU since we are
2102  * presumably writing whole cache lines, which we assume to be 64 bytes.
2103 */
2104 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2105
2106 ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2107 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2108 VK_SAMPLE_COUNT_1_BIT);
2109 ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2110 ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2111 image->iova + image->lrz_layout.lrz_offset,
2112 image->lrz_layout.lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2113 uint32_t lrz_height = image->lrz_layout.lrz_height * image->vk.array_layers;
2114 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2115 (VkExtent2D) { image->lrz_layout.lrz_pitch, lrz_height });
2116 ops->run(cmd, cs);
2117 ops->teardown(cmd, cs);
2118
2119 /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2120 * UCHE in the earlier GRAS stage.
2121 */
2122 cmd->state.cache.flush_bits |=
2123 TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2124 TU_CMD_FLAG_WAIT_FOR_IDLE;
2125 }
2126 TU_GENX(tu6_clear_lrz);
2127
2128 template <chip CHIP>
2129 void
2130 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2131 struct tu_cs *cs,
2132 struct tu_image *image)
2133 {
2134 const struct blit_ops *ops = &r2d_ops<CHIP>;
2135 VkClearValue clear = {};
2136 clear.color.uint32[0] = 0xffffffff;
2137
2138 using LRZFC = fd_lrzfc_layout<CHIP>;
2139 uint64_t lrz_fc_iova = image->iova + image->lrz_layout.lrz_fc_offset;
2140 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2141 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2142 VK_SAMPLE_COUNT_1_BIT);
2143 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2144 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2145 lrz_fc_iova + offsetof(LRZFC, fc1),
2146 sizeof(LRZFC::fc1),
2147 PIPE_FORMAT_R32_UINT);
2148 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2149 sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2150 });
2151 ops->run(cmd, cs);
2152 if constexpr (LRZFC::HAS_BIDIR) {
2153 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2154 lrz_fc_iova + offsetof(LRZFC, fc2),
2155 sizeof(LRZFC::fc2),
2156 PIPE_FORMAT_R32_UINT);
2157 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2158 sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2159 });
2160 ops->run(cmd, cs);
2161 }
2162 ops->teardown(cmd, cs);
2163 }
2164 TU_GENX(tu6_dirty_lrz_fc);
2165
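/* Build an fdl6_view for use as a copy/blit source or destination,
 * reinterpreting Z24 formats as R8G8B8A8 and using a 3D view type when
 * z-scaling.
 */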
2166 template<chip CHIP>
2167 static void
2168 tu_image_view_copy_blit(struct fdl6_view *iview,
2169 struct tu_image *image,
2170 enum pipe_format format,
2171 const VkImageSubresourceLayers *subres,
2172 uint32_t layer,
2173 bool z_scale)
2174 {
2175 VkImageAspectFlags aspect_mask = subres->aspectMask;
2176
2177 /* always use the AS_R8G8B8A8 format for these */
2178 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2179 format == PIPE_FORMAT_Z24X8_UNORM) {
2180 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2181 }
2182
2183 const struct fdl_layout *layout =
2184 &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2185
2186 const struct fdl_view_args args = {
2187 .chip = CHIP,
2188 .iova = image->iova,
2189 .base_miplevel = subres->mipLevel,
2190 .level_count = 1,
2191 .base_array_layer = subres->baseArrayLayer + layer,
2192 .layer_count = 1,
2193 .swiz = {
2194 PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2195 },
2196 .format = tu_format_for_aspect(format, aspect_mask),
2197 .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2198 };
2199 fdl6_view_init(iview, &layout, &args, false);
2200 }
2201
2202 template<chip CHIP>
2203 static void
2204 tu_image_view_copy(struct fdl6_view *iview,
2205 struct tu_image *image,
2206 enum pipe_format format,
2207 const VkImageSubresourceLayers *subres,
2208 uint32_t layer)
2209 {
2210 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2211 }
2212
2213 template<chip CHIP>
2214 static void
2215 tu_image_view_blit(struct fdl6_view *iview,
2216 struct tu_image *image,
2217 const VkImageSubresourceLayers *subres,
2218 uint32_t layer)
2219 {
2220 enum pipe_format format = tu_aspects_to_plane(image->vk.format, subres->aspectMask);
2221 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2222 }
2223
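/* Implement one vkCmdBlitImage2 region: pick the 2D or 3D path, express
 * mirroring as a rotation for the 2D path, and handle Z scaling as a 3D blit.
 */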
2224 template <chip CHIP>
2225 static void
2226 tu6_blit_image(struct tu_cmd_buffer *cmd,
2227 struct tu_image *src_image,
2228 struct tu_image *dst_image,
2229 const VkImageBlit2 *info,
2230 VkFilter filter)
2231 {
2232 const struct blit_ops *ops = &r2d_ops<CHIP>;
2233 struct tu_cs *cs = &cmd->cs;
2234 bool z_scale = false;
2235 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2236
2237 /* The 2D blitter can't mirror based on coordinates alone, so mirroring is expressed via rotation */
2238 static const enum a6xx_rotation rotate[2][2] = {
2239 {ROTATE_0, ROTATE_HFLIP},
2240 {ROTATE_VFLIP, ROTATE_180},
2241 };
2242
2243 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2244 (info->dstOffsets[1].x < info->dstOffsets[0].x);
2245 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2246 (info->dstOffsets[1].y < info->dstOffsets[0].y);
2247
2248 int32_t src0_z = info->srcOffsets[0].z;
2249 int32_t src1_z = info->srcOffsets[1].z;
2250
2251 if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2252 info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2253 info->srcOffsets[1].z < info->srcOffsets[0].z) {
2254 z_scale = true;
2255 }
2256
2257 if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2258 layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2259 src0_z = info->srcOffsets[1].z;
2260 src1_z = info->srcOffsets[0].z;
2261 }
2262
2263 if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2264 assert(layers <= 1);
2265 layers = vk_image_subresource_layer_count(&dst_image->vk,
2266 &info->dstSubresource);
2267 }
2268
2269 /* BC1_RGB_* formats need to have their last components overridden with 1
2270 * when sampling, which is normally handled with the texture descriptor
2271 * swizzle. The 2d path can't handle that, so use the 3d path.
2272 *
2273 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2274 * the 2d path.
2275 */
2276
2277 unsigned blit_param = rotate[mirror_y][mirror_x];
2278 if (dst_image->layout[0].nr_samples > 1 ||
2279 src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2280 src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2281 filter == VK_FILTER_CUBIC_EXT ||
2282 z_scale) {
2283 ops = &r3d_ops<CHIP>;
2284 blit_param = z_scale ? R3D_Z_SCALE : 0;
2285 }
2286
2287 /* use the right format in setup() for D32_S8 */
2288 enum pipe_format src_format = tu_aspects_to_plane(
2289 src_image->vk.format, info->srcSubresource.aspectMask);
2290 enum pipe_format dst_format = tu_aspects_to_plane(
2291 dst_image->vk.format, info->dstSubresource.aspectMask);
2292 trace_start_blit(&cmd->trace, cs,
2293 ops == &r3d_ops<CHIP>,
2294 src_image->vk.format,
2295 dst_image->vk.format,
2296 layers);
2297
2298 ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2299 blit_param, false, dst_image->layout[0].ubwc,
2300 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2301
2302 if (ops == &r3d_ops<CHIP>) {
2303 const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2304 info->srcOffsets[0].x, info->srcOffsets[0].y,
2305 info->dstOffsets[1].x, info->dstOffsets[1].y,
2306 info->srcOffsets[1].x, info->srcOffsets[1].y };
2307 r3d_coords_raw(cmd, cs, coords);
2308 } else {
2309 tu_cs_emit_regs(cs,
2310 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2311 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2312 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2313 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2314 tu_cs_emit_regs(cs,
2315 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2316 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2317 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2318 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2319 }
2320
2321 struct fdl6_view dst, src;
2322 tu_image_view_blit<CHIP>(
2323 &dst, dst_image, &info->dstSubresource,
2324 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2325
2326 if (z_scale) {
2327 tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2328 &info->srcSubresource, 0, true);
2329 ops->src(cmd, cs, &src, 0, filter, dst_format);
2330 } else {
2331 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2332 }
2333
2334 for (uint32_t i = 0; i < layers; i++) {
2335 if (z_scale) {
2336 float t = ((float) i + 0.5f) / (float) layers;
2337 r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2338 } else {
2339 ops->src(cmd, cs, &src, i, filter, dst_format);
2340 }
2341 ops->dst(cs, &dst, i, src_format);
2342 ops->run(cmd, cs);
2343 }
2344
2345 ops->teardown(cmd, cs);
2346
2347 trace_end_blit(&cmd->trace, cs);
2348 }
2349
2350 template <chip CHIP>
2351 VKAPI_ATTR void VKAPI_CALL
2352 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2353 const VkBlitImageInfo2 *pBlitImageInfo)
2354
2355 {
2356 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2357 VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2358 VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2359
2360 for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2361 /* can't blit both depth and stencil at once with D32_S8
2362 * TODO: more advanced 3D blit path to support it instead?
2363 */
2364 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2365 dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2366 VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2367 u_foreach_bit(b, region.dstSubresource.aspectMask) {
2368 region.srcSubresource.aspectMask = BIT(b);
2369 region.dstSubresource.aspectMask = BIT(b);
2370 tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2371 }
2372 continue;
2373 }
2374 tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2375 pBlitImageInfo->filter);
2376 }
2377
2378 if (dst_image->lrz_layout.lrz_total_size) {
2379 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2380 }
2381 }
2382 TU_GENX(tu_CmdBlitImage2);
2383
2384 static void
2385 copy_compressed(VkFormat format,
2386 VkOffset3D *offset,
2387 VkExtent3D *extent,
2388 uint32_t *width,
2389 uint32_t *height)
2390 {
2391 if (!vk_format_is_compressed(format))
2392 return;
2393
2394 uint32_t block_width = vk_format_get_blockwidth(format);
2395 uint32_t block_height = vk_format_get_blockheight(format);
2396
2397 offset->x /= block_width;
2398 offset->y /= block_height;
2399
2400 if (extent) {
2401 extent->width = DIV_ROUND_UP(extent->width, block_width);
2402 extent->height = DIV_ROUND_UP(extent->height, block_height);
2403 }
2404 if (width)
2405 *width = DIV_ROUND_UP(*width, block_width);
2406 if (height)
2407 *height = DIV_ROUND_UP(*height, block_height);
2408 }
2409
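/* Copy buffer data into an image one layer (or 3D slice) at a time. When the
 * source address or pitch is not 64-byte aligned and the hardware can't
 * handle it, fall back to blitting one row at a time.
 */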
2410 template <chip CHIP>
2411 static void
2412 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2413 struct tu_buffer *src_buffer,
2414 struct tu_image *dst_image,
2415 const VkBufferImageCopy2 *info)
2416 {
2417 struct tu_cs *cs = &cmd->cs;
2418 uint32_t layers = MAX2(info->imageExtent.depth,
2419 vk_image_subresource_layer_count(&dst_image->vk,
2420 &info->imageSubresource));
2421 enum pipe_format src_format =
2422 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2423 enum pipe_format dst_format =
2424 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2425 const struct blit_ops *ops = &r2d_ops<CHIP>;
2426
2427 /* special case for buffer to stencil */
2428 if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2429 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2430 src_format = PIPE_FORMAT_S8_UINT;
2431 }
2432
2433 /* note: could use "R8_UNORM" when no UBWC */
2434 bool has_unaligned = CHIP >= A7XX; /* Whether the hardware supports unaligned buffer copies. */
2435 unsigned blit_param = 0;
2436 if (src_format == PIPE_FORMAT_Y8_UNORM ||
2437 tu_pipe_format_is_float16(src_format)) {
2438 ops = &r3d_ops<CHIP>;
2439 blit_param = R3D_COPY;
2440 has_unaligned = false;
2441 }
2442
2443 VkOffset3D offset = info->imageOffset;
2444 VkExtent3D extent = info->imageExtent;
2445 uint32_t src_width = info->bufferRowLength ?: extent.width;
2446 uint32_t src_height = info->bufferImageHeight ?: extent.height;
2447
2448 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2449
2450 uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2451 uint32_t layer_size = src_height * pitch;
2452
2453 ops->setup(cmd, cs, src_format, dst_format,
2454 info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2455 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2456
2457 struct fdl6_view dst;
2458 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2459 &info->imageSubresource, offset.z);
2460
2461 for (uint32_t i = 0; i < layers; i++) {
2462 ops->dst(cs, &dst, i, src_format);
2463
2464 uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2465 bool unaligned = (src_va & 63) || (pitch & 63);
2466 if (!has_unaligned && unaligned) {
2467 for (uint32_t y = 0; y < extent.height; y++) {
2468 uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2469 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2470 x + extent.width, 1, dst_format);
2471 ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y}, (VkOffset2D) {x},
2472 (VkExtent2D) {extent.width, 1});
2473 ops->run(cmd, cs);
2474 src_va += pitch;
2475 }
2476 } else {
2477 if constexpr (CHIP >= A7XX) {
2478 /* Necessary to not trigger static assertion from A6XX variant. */
2479 if (has_unaligned) {
2480 r2d_src_buffer_unaligned<CHIP>(cmd, cs, src_format, src_va,
2481 pitch, extent.width,
2482 extent.height, dst_format);
2483 } else {
2484 ops->src_buffer(cmd, cs, src_format, src_va, pitch,
2485 extent.width, extent.height, dst_format);
2486 }
2487 } else {
2488 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width,
2489 extent.height, dst_format);
2490 }
2491 coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2492 ops->run(cmd, cs);
2493 }
2494 }
2495
2496 ops->teardown(cmd, cs);
2497 }
2498
2499 template <chip CHIP>
2500 VKAPI_ATTR void VKAPI_CALL
2501 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2502 const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2503 {
2504 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2505 VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2506 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2507
2508 for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2509 tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2510 pCopyBufferToImageInfo->pRegions + i);
2511
2512 if (dst_image->lrz_layout.lrz_total_size) {
2513 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2514 }
2515 }
2516 TU_GENX(tu_CmdCopyBufferToImage2);
2517
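/* Host-side path for VK_EXT_host_image_copy: copy from host memory into a
 * mapped image, either as a raw memcpy, a linear row-by-row copy, or a
 * linear-to-tiled swizzle.
 */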
2518 static void
2519 tu_copy_memory_to_image(struct tu_device *device,
2520 struct tu_image *dst_image,
2521 const VkMemoryToImageCopyEXT *info,
2522 bool copy_memcpy)
2523 {
2524 unsigned plane = tu6_plane_index(dst_image->vk.format,
2525 info->imageSubresource.aspectMask);
2526 const struct fdl_layout *layout = &dst_image->layout[plane];
2527
2528 VkOffset3D offset = info->imageOffset;
2529 VkExtent3D extent = info->imageExtent;
2530 uint32_t src_width = info->memoryRowLength ?: extent.width;
2531 uint32_t src_height = info->memoryImageHeight ?: extent.height;
2532
2533 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2534
2535 uint32_t src_pitch = src_width * layout->cpp;
2536
2537 unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2538 offset.z : info->imageSubresource.baseArrayLayer;
2539 uint32_t layers = MAX2(extent.depth,
2540 vk_image_subresource_layer_count(&dst_image->vk,
2541 &info->imageSubresource));
2542
2543 uint32_t image_offset =
2544 fdl_surface_offset(layout,
2545 info->imageSubresource.mipLevel,
2546 start_layer);
2547
2548 uint32_t dst_layer_stride =
2549 fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2550 uint32_t dst_layer_size =
2551 layout->slices[info->imageSubresource.mipLevel].size0;
2552 uint32_t src_layer_stride =
2553 copy_memcpy ? dst_layer_size :
2554 (src_width * src_height * layout->cpp);
2555 bool tiled =
2556 fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2557
2558 const char *src = (const char *) info->pHostPointer;
2559 char *dst = (char *) dst_image->map + image_offset;
2560 for (unsigned layer = 0; layer < layers; layer++,
2561 src += src_layer_stride, dst += dst_layer_stride) {
2562 if (copy_memcpy) {
2563 memcpy(dst, src, src_layer_stride);
2564 } else if (!tiled) {
2565 uint32_t dst_pitch = fdl_pitch(layout,
2566 info->imageSubresource.mipLevel);
2567 for (unsigned y = 0; y < extent.height; y++) {
2568 memcpy(dst + dst_pitch * (y + offset.y) + offset.x * layout->cpp,
2569 src + src_pitch * y,
2570 extent.width * layout->cpp);
2571 }
2572 } else {
2573 fdl6_memcpy_linear_to_tiled(offset.x, offset.y,
2574 extent.width, extent.height,
2575 dst, src, layout,
2576 info->imageSubresource.mipLevel,
2577 src_pitch,
2578 &device->physical_device->ubwc_config);
2579 }
2580
2581 if (dst_image->bo->cached_non_coherent) {
2582 tu_bo_sync_cache(device, dst_image->bo,
2583 dst_image->bo_offset + image_offset,
2584 dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
2585 }
2586 }
2587 }
2588
2589 VKAPI_ATTR VkResult VKAPI_CALL
2590 tu_CopyMemoryToImageEXT(VkDevice _device,
2591 const VkCopyMemoryToImageInfoEXT *info)
2592 {
2593 VK_FROM_HANDLE(tu_device, device, _device);
2594 VK_FROM_HANDLE(tu_image, dst_image, info->dstImage);
2595
2596 for (unsigned i = 0; i < info->regionCount; i++) {
2597 tu_copy_memory_to_image(device, dst_image, &info->pRegions[i],
2598 info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2599 }
2600
2601 if (dst_image->lrz_layout.lrz_total_size) {
2602 TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
2603 }
2604
2605 return VK_SUCCESS;
2606 }
2607
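/* GPU path for image-to-buffer copies. Unaligned destination rows fall back
 * to per-row blits and trigger the CCU workaround described above.
 */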
2608 template <chip CHIP>
2609 static void
2610 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2611 struct tu_image *src_image,
2612 struct tu_buffer *dst_buffer,
2613 const VkBufferImageCopy2 *info,
2614 bool *unaligned_store)
2615 {
2616 struct tu_cs *cs = &cmd->cs;
2617 uint32_t layers = MAX2(info->imageExtent.depth,
2618 vk_image_subresource_layer_count(&src_image->vk,
2619 &info->imageSubresource));
2620 enum pipe_format dst_format =
2621 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2622 enum pipe_format src_format =
2623 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2624 const struct blit_ops *ops = &r2d_ops<CHIP>;
2625
2626 if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2627 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2628 dst_format = PIPE_FORMAT_S8_UINT;
2629 }
2630
2631 /* note: could use "R8_UNORM" when no UBWC */
2632 unsigned blit_param = 0;
2633 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2634 tu_pipe_format_is_float16(src_format)) {
2635 ops = &r3d_ops<CHIP>;
2636 blit_param = R3D_COPY;
2637 }
2638
2639 VkOffset3D offset = info->imageOffset;
2640 VkExtent3D extent = info->imageExtent;
2641 uint32_t dst_width = info->bufferRowLength ?: extent.width;
2642 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2643
2644 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2645
2646 uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2647 uint32_t layer_size = pitch * dst_height;
2648
2649 handle_buffer_unaligned_store<CHIP>(cmd,
2650 dst_buffer->iova + info->bufferOffset,
2651 layer_size * layers, unaligned_store);
2652
2653 ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2654 VK_SAMPLE_COUNT_1_BIT);
2655
2656 struct fdl6_view src;
2657 tu_image_view_copy<CHIP>(&src, src_image, src_format,
2658 &info->imageSubresource, offset.z);
2659
2660 for (uint32_t i = 0; i < layers; i++) {
2661 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2662
2663 uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
2664 if ((dst_va & 63) || (pitch & 63)) {
2665 for (uint32_t y = 0; y < extent.height; y++) {
2666 uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2667 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2668 ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2669 (VkExtent2D) {extent.width, 1});
2670 ops->run(cmd, cs);
2671 dst_va += pitch;
2672 }
2673 } else {
2674 ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2675 coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2676 ops->run(cmd, cs);
2677 }
2678 }
2679
2680 ops->teardown(cmd, cs);
2681 }
2682
2683 template <chip CHIP>
2684 VKAPI_ATTR void VKAPI_CALL
2685 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2686 const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2687 {
2688 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2689 VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2690 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2691
2692 bool unaligned_store = false;
2693 for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2694 tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2695 pCopyImageToBufferInfo->pRegions + i,
2696 &unaligned_store);
2697
2698 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2699 }
2700 TU_GENX(tu_CmdCopyImageToBuffer2);
2701
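/* Host-side image-to-memory copy: the inverse of tu_copy_memory_to_image. */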
2702 static void
2703 tu_copy_image_to_memory(struct tu_device *device,
2704 struct tu_image *src_image,
2705 const VkImageToMemoryCopyEXT *info,
2706 bool copy_memcpy)
2707 {
2708 unsigned plane = tu6_plane_index(src_image->vk.format,
2709 info->imageSubresource.aspectMask);
2710 const struct fdl_layout *layout = &src_image->layout[plane];
2711
2712 VkOffset3D offset = info->imageOffset;
2713 VkExtent3D extent = info->imageExtent;
2714 uint32_t dst_width = info->memoryRowLength ?: extent.width;
2715 uint32_t dst_height = info->memoryImageHeight ?: extent.height;
2716
2717 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2718
2719 uint32_t dst_pitch = dst_width * layout->cpp;
2720
2721 unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2722 offset.z : info->imageSubresource.baseArrayLayer;
2723 uint32_t layers = MAX2(extent.depth,
2724 vk_image_subresource_layer_count(&src_image->vk,
2725 &info->imageSubresource));
2726
2727 uint32_t image_offset =
2728 fdl_surface_offset(layout,
2729 info->imageSubresource.mipLevel,
2730 start_layer);
2731
2732 uint32_t src_layer_stride =
2733 fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2734 uint32_t src_layer_size =
2735 layout->slices[info->imageSubresource.mipLevel].size0;
2736 uint32_t dst_layer_stride =
2737 copy_memcpy ? src_layer_size : (dst_width * dst_height * layout->cpp);
2738 bool tiled =
2739 fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2740
2741 const char *src = (const char *) src_image->map + image_offset;
2742 char *dst = (char *) info->pHostPointer;
2743 for (unsigned layer = 0; layer < layers; layer++,
2744 src += src_layer_stride, dst += dst_layer_stride) {
2745 if (src_image->bo->cached_non_coherent) {
2746 tu_bo_sync_cache(device, src_image->bo,
2747 src_image->bo_offset + image_offset,
2748 src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
2749 }
2750
2751 if (copy_memcpy) {
2752 memcpy(dst, src, dst_layer_stride);
2753 } else if (!tiled) {
2754 uint32_t src_pitch = fdl_pitch(layout,
2755 info->imageSubresource.mipLevel);
2756 for (unsigned y = 0; y < extent.height; y++) {
2757 memcpy(dst + dst_pitch * y,
2758 src + src_pitch * (y + offset.y) + offset.x * layout->cpp,
2759 extent.width * layout->cpp);
2760 }
2761 } else {
2762 fdl6_memcpy_tiled_to_linear(offset.x, offset.y,
2763 extent.width, extent.height,
2764 dst, src, layout,
2765 info->imageSubresource.mipLevel,
2766 dst_pitch,
2767 &device->physical_device->ubwc_config);
2768 }
2769 }
2770 }
2771
2772 VKAPI_ATTR VkResult VKAPI_CALL
2773 tu_CopyImageToMemoryEXT(VkDevice _device,
2774 const VkCopyImageToMemoryInfoEXT *info)
2775 {
2776 VK_FROM_HANDLE(tu_device, device, _device);
2777 VK_FROM_HANDLE(tu_image, image, info->srcImage);
2778
2779 for (unsigned i = 0; i < info->regionCount; i++) {
2780 tu_copy_image_to_memory(device, image, &info->pRegions[i],
2781 info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2782 }
2783
2784 return VK_SUCCESS;
2785 }
2786
2787
2788 /* Tiled formats don't support swapping, which means that we can't support
2789 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2790 * formats like B5G5R5A1 have a separate linear-only format when sampling.
2791 * Currently we fake support for tiled swapped formats and use the unswapped
2792 * format instead, but this means that reinterpreting copies to and from
2793 * swapped formats can't be performed correctly unless we can swizzle the
2794 * components by reinterpreting the other image as the "correct" swapped
2795 * format, i.e. only when the other image is linear.
2796 */
2797
2798 template <chip CHIP>
2799 static bool
2800 is_swapped_format(enum pipe_format format, bool is_mutable)
2801 {
2802 struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, is_mutable, false);
2803 struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, is_mutable, false);
2804 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2805 }
2806
2807 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2808 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2809 * versa). This should mirror the logic in fdl6_layout.
2810 */
2811 static bool
2812 image_is_r8g8(struct tu_image *image)
2813 {
2814 return image->layout[0].cpp == 2 &&
2815 vk_format_get_nr_components(image->vk.format) == 2;
2816 }
2817
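/* GPU image-to-image copy: pick a single format both images can safely be
 * reinterpreted as, or fall back to a staging blit through a linear,
 * non-UBWC temporary when no such format exists.
 */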
2818 template <chip CHIP>
2819 static void
2820 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2821 struct tu_image *src_image,
2822 struct tu_image *dst_image,
2823 const VkImageCopy2 *info)
2824 {
2825 const struct blit_ops *ops = &r2d_ops<CHIP>;
2826 struct tu_cs *cs = &cmd->cs;
2827
2828 if (dst_image->layout[0].nr_samples > 1)
2829 ops = &r3d_ops<CHIP>;
2830
2831 enum pipe_format format = PIPE_FORMAT_NONE;
2832 VkOffset3D src_offset = info->srcOffset;
2833 VkOffset3D dst_offset = info->dstOffset;
2834 VkExtent3D extent = info->extent;
2835 uint32_t layers_to_copy = MAX2(info->extent.depth,
2836 vk_image_subresource_layer_count(&src_image->vk,
2837 &info->srcSubresource));
2838
2839 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2840 * Images":
2841 *
2842 * When copying between compressed and uncompressed formats the extent
2843 * members represent the texel dimensions of the source image and not
2844 * the destination. When copying from a compressed image to an
2845 * uncompressed image the image texel dimensions written to the
2846 * uncompressed image will be source extent divided by the compressed
2847 * texel block dimensions. When copying from an uncompressed image to a
2848 * compressed image the image texel dimensions written to the compressed
2849 * image will be the source extent multiplied by the compressed texel
2850 * block dimensions.
2851 *
2852 * This means we only have to adjust the extent if the source image is
2853 * compressed.
2854 */
2855 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2856 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2857
2858 enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2859 enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2860
2861 /* note: could use "R8_UNORM" when no UBWC */
2862 unsigned blit_param = 0;
2863 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2864 src_format == PIPE_FORMAT_Y8_UNORM ||
2865 tu_pipe_format_is_float16(src_format) ||
2866 tu_pipe_format_is_float16(dst_format)) {
2867 ops = &r3d_ops<CHIP>;
2868 blit_param = R3D_COPY;
2869 }
2870
2871 bool use_staging_blit = false;
2872
2873 if (src_format == dst_format) {
2874 /* Images that share a format can always be copied directly because it's
2875 * the same as a blit.
2876 */
2877 format = src_format;
2878 } else if (!src_image->layout[0].tile_mode) {
2879 /* If an image is linear, we can always safely reinterpret it with the
2880 * other image's format and then do a regular blit.
2881 */
2882 format = dst_format;
2883 } else if (!dst_image->layout[0].tile_mode) {
2884 format = src_format;
2885 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2886 /* We can't currently copy r8g8 images to/from other cpp=2 images,
2887 * due to the different tile layout.
2888 */
2889 use_staging_blit = true;
2890 } else if (is_swapped_format<CHIP>(src_format,
2891 src_image->layout[0].is_mutable) ||
2892 is_swapped_format<CHIP>(dst_format,
2893 src_image->layout[0].is_mutable)) {
2894 /* If either format has a non-identity swap, then we can't copy
2895 * to/from it.
2896 */
2897 use_staging_blit = true;
2898 } else if (!src_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2899 format = dst_format;
2900 } else if (!dst_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2901 format = src_format;
2902 } else {
2903 /* Both formats use UBWC and so neither can be reinterpreted.
2904 * TODO: We could do an in-place decompression of the dst instead.
2905 */
2906 perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2907 use_staging_blit = true;
2908 }
2909
2910 struct fdl6_view dst, src;
2911
2912 if (use_staging_blit) {
2913 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2914 tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2915
2916 struct fdl_layout staging_layout = { 0 };
2917 VkOffset3D staging_offset = { 0 };
2918
2919 staging_layout.tile_mode = TILE6_LINEAR;
2920 staging_layout.ubwc = false;
2921
2922 uint32_t layer_count =
2923 vk_image_subresource_layer_count(&src_image->vk,
2924 &info->srcSubresource);
2925 fdl6_layout(&staging_layout,
2926 &cmd->device->physical_device->dev_info,
2927 src_format,
2928 src_image->layout[0].nr_samples,
2929 extent.width,
2930 extent.height,
2931 extent.depth,
2932 1,
2933 layer_count,
2934 extent.depth > 1,
2935 false,
2936 NULL);
2937
2938 struct tu_bo *staging_bo;
2939 VkResult result = tu_get_scratch_bo(cmd->device,
2940 staging_layout.size,
2941 &staging_bo);
2942 if (result != VK_SUCCESS) {
2943 vk_command_buffer_set_error(&cmd->vk, result);
2944 return;
2945 }
2946
2947 struct fdl6_view staging;
2948 const struct fdl_layout *staging_layout_ptr = &staging_layout;
2949 const struct fdl_view_args copy_to_args = {
2950 .chip = CHIP,
2951 .iova = staging_bo->iova,
2952 .base_miplevel = 0,
2953 .level_count = 1,
2954 .base_array_layer = 0,
2955 .layer_count = layer_count,
2956 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2957 .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2958 .type = FDL_VIEW_TYPE_2D,
2959 };
2960 fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2961
2962 ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2963 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2964 coords(ops, cmd, cs, staging_offset, src_offset, extent);
2965
2966 for (uint32_t i = 0; i < layers_to_copy; i++) {
2967 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2968 ops->dst(cs, &staging, i, src_format);
2969 ops->run(cmd, cs);
2970 }
2971
2972 /* If the application performed this copy itself, a pipeline barrier would
2973  * be required here; since we do it internally, we have to flush ourselves.
2974 */
2975 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2976 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2977 tu_cs_emit_wfi(cs);
2978
2979 const struct fdl_view_args copy_from_args = {
2980 .chip = CHIP,
2981 .iova = staging_bo->iova,
2982 .base_miplevel = 0,
2983 .level_count = 1,
2984 .base_array_layer = 0,
2985 .layer_count = layer_count,
2986 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2987 .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2988 .type = FDL_VIEW_TYPE_2D,
2989 };
2990 fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2991
2992 ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2993 blit_param, false, dst_image->layout[0].ubwc,
2994 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2995 coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2996
2997 for (uint32_t i = 0; i < layers_to_copy; i++) {
2998 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2999 ops->dst(cs, &dst, i, dst_format);
3000 ops->run(cmd, cs);
3001 }
3002 } else {
3003 tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
3004 tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
3005
3006 ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
3007 blit_param, false, dst_image->layout[0].ubwc,
3008 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
3009 coords(ops, cmd, cs, dst_offset, src_offset, extent);
3010
3011 for (uint32_t i = 0; i < layers_to_copy; i++) {
3012 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
3013 ops->dst(cs, &dst, i, format);
3014 ops->run(cmd, cs);
3015 }
3016 }
3017
3018 ops->teardown(cmd, cs);
3019 }
3020
3021 template <chip CHIP>
3022 VKAPI_ATTR void VKAPI_CALL
3023 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
3024 const VkCopyImageInfo2 *pCopyImageInfo)
3025 {
3026 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3027 VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
3028 VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
3029
3030 for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
3031 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
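         /* Depth and stencil of D32_SFLOAT_S8_UINT are stored as separate
          * planes, so copy each requested aspect on its own.
          */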
3032 VkImageCopy2 info = pCopyImageInfo->pRegions[i];
3033 u_foreach_bit(b, info.dstSubresource.aspectMask) {
3034 info.srcSubresource.aspectMask = BIT(b);
3035 info.dstSubresource.aspectMask = BIT(b);
3036 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
3037 }
3038 continue;
3039 }
3040
3041 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
3042 pCopyImageInfo->pRegions + i);
3043 }
3044
3045 if (dst_image->lrz_layout.lrz_total_size) {
3046 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
3047 }
3048 }
3049 TU_GENX(tu_CmdCopyImage2);
3050
3051 static void
3052 tu_copy_image_to_image_cpu(struct tu_device *device,
3053 struct tu_image *src_image,
3054 struct tu_image *dst_image,
3055 const VkImageCopy2 *info,
3056 bool copy_memcpy)
3057 {
3058 unsigned src_plane = tu6_plane_index(src_image->vk.format,
3059 info->srcSubresource.aspectMask);
3060 unsigned dst_plane = tu6_plane_index(dst_image->vk.format,
3061 info->dstSubresource.aspectMask);
3062
3063 const struct fdl_layout *src_layout = &src_image->layout[src_plane];
3064 const struct fdl_layout *dst_layout = &dst_image->layout[dst_plane];
3065
3066 VkOffset3D src_offset = info->srcOffset;
3067 VkOffset3D dst_offset = info->dstOffset;
3068 VkExtent3D extent = info->extent;
3069 uint32_t layers_to_copy = MAX2(info->extent.depth,
3070 vk_image_subresource_layer_count(&src_image->vk,
3071 &info->srcSubresource));
3072
3073 /* See comment above. */
3074 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
3075 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
3076
3077 unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3078 src_offset.z : info->srcSubresource.baseArrayLayer;
3079 unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3080 dst_offset.z : info->dstSubresource.baseArrayLayer;
3081
3082 uint32_t src_layer_stride =
3083 fdl_layer_stride(src_layout, info->srcSubresource.mipLevel);
3084 uint32_t src_layer_size =
3085 src_layout->slices[info->srcSubresource.mipLevel].size0;
3086 uint32_t dst_layer_stride =
3087 fdl_layer_stride(dst_layout, info->dstSubresource.mipLevel);
3088 uint32_t dst_layer_size =
3089 dst_layout->slices[info->dstSubresource.mipLevel].size0;
3090
3091 uint32_t src_image_offset =
3092 fdl_surface_offset(src_layout,
3093 info->srcSubresource.mipLevel,
3094 src_start_layer);
3095 uint32_t dst_image_offset =
3096 fdl_surface_offset(dst_layout,
3097 info->dstSubresource.mipLevel,
3098 dst_start_layer);
3099
3100 bool src_tiled =
3101 fdl_tile_mode(src_layout, info->srcSubresource.mipLevel) != 0;
3102 bool dst_tiled =
3103 fdl_tile_mode(dst_layout, info->dstSubresource.mipLevel) != 0;
3104
3105 const char *src = (const char *) src_image->map + src_image_offset;
3106 char *dst = (char *) dst_image->map + dst_image_offset;
3107 for (unsigned layer = 0; layer < layers_to_copy; layer++,
3108 src += src_layer_stride, dst += dst_layer_stride) {
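      /* For cached, non-coherent BOs, sync the CPU cache with the GPU before
       * reading the source layer and after writing the destination layer.
       */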
3109 if (src_image->bo->cached_non_coherent) {
3110 tu_bo_sync_cache(device, src_image->bo,
3111 src_image->bo_offset + src_image_offset,
3112 src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
3113 }
3114
3115 uint32_t src_pitch = fdl_pitch(src_layout,
3116 info->srcSubresource.mipLevel);
3117 uint32_t dst_pitch = fdl_pitch(dst_layout,
3118 info->dstSubresource.mipLevel);
3119
3120 if (copy_memcpy) {
3121 assert(src_layer_size == dst_layer_size);
3122 memcpy(dst, src, src_layer_size);
3123 } else if (!src_tiled && !dst_tiled) {
3124 for (unsigned y = 0; y < extent.height; y++) {
3125 memcpy(dst + dst_pitch * (y + dst_offset.y) + dst_offset.x * dst_layout->cpp,
3126 src + src_pitch * (y + src_offset.y) + src_offset.x * src_layout->cpp,
3127 extent.width * src_layout->cpp);
3128 }
3129 } else if (!src_tiled) {
3130 fdl6_memcpy_linear_to_tiled(dst_offset.x, dst_offset.y,
3131 extent.width, extent.height,
3132 dst,
3133 src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp,
3134 dst_layout,
3135 info->dstSubresource.mipLevel,
3136 src_pitch,
3137 &device->physical_device->ubwc_config);
3138 } else if (!dst_tiled) {
3139 fdl6_memcpy_tiled_to_linear(src_offset.x, src_offset.y,
3140 extent.width, extent.height,
3141 dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp,
3142 src,
3143 src_layout,
3144 info->srcSubresource.mipLevel,
3145 dst_pitch,
3146 &device->physical_device->ubwc_config);
3147 } else {
3148 /* Work tile-by-tile, holding the unswizzled tile in a temporary
3149 * buffer.
3150 */
3151 char temp_tile[256];
3152
3153 uint32_t block_width, block_height;
3154 fdl6_get_ubwc_blockwidth(src_layout, &block_width, &block_height);
3155
3156 uint32_t temp_pitch = block_width * src_layout->cpp;
3157
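         /* Walk the source rectangle block by block (block size from
          * fdl6_get_ubwc_blockwidth): each block is detiled into temp_tile
          * and then retiled into the destination.
          */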
3158 for (unsigned by = src_offset.y / block_height;
3159 by * block_height < src_offset.y + extent.height; by++) {
3160 uint32_t src_y_start = MAX2(src_offset.y, by * block_height);
3161 uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y;
3162 uint32_t height =
3163 MIN2((by + 1) * block_height, src_offset.y + extent.height) -
3164 src_y_start;
3165 for (unsigned bx = src_offset.x / block_width;
3166 bx * block_width < src_offset.x + extent.width; bx++) {
3167 uint32_t src_x_start = MAX2(src_offset.x, bx * block_width);
3168 uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x;
3169 uint32_t width =
3170 MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
3171 src_x_start;
3172
3173 fdl6_memcpy_tiled_to_linear(src_x_start, src_y_start,
3174 width, height,
3175 temp_tile, src, src_layout,
3176 info->srcSubresource.mipLevel,
3177 temp_pitch,
3178 &device->physical_device->ubwc_config);
3179 fdl6_memcpy_linear_to_tiled(dst_x_start, dst_y_start,
3180 width, height,
3181 dst, temp_tile, dst_layout,
3182 info->dstSubresource.mipLevel,
3183 temp_pitch,
3184 &device->physical_device->ubwc_config);
3185 }
3186 }
3187 }
3188
3189 if (dst_image->bo->cached_non_coherent) {
3190 tu_bo_sync_cache(device, dst_image->bo,
3191 dst_image->bo_offset + dst_image_offset,
3192 dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
3193 }
3194 }
3195 }
3196
3197 VKAPI_ATTR VkResult VKAPI_CALL
3198 tu_CopyImageToImageEXT(VkDevice _device,
3199 const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo)
3200 {
3201 VK_FROM_HANDLE(tu_device, device, _device);
3202 VK_FROM_HANDLE(tu_image, src_image, pCopyImageToImageInfo->srcImage);
3203 VK_FROM_HANDLE(tu_image, dst_image, pCopyImageToImageInfo->dstImage);
3204 bool copy_memcpy = pCopyImageToImageInfo->flags &
3205 VK_HOST_IMAGE_COPY_MEMCPY_EXT;
3206
3207 for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) {
3208 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3209 VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i];
3210 u_foreach_bit(b, info.dstSubresource.aspectMask) {
3211 info.srcSubresource.aspectMask = BIT(b);
3212 info.dstSubresource.aspectMask = BIT(b);
3213 tu_copy_image_to_image_cpu(device, src_image, dst_image, &info,
3214 copy_memcpy);
3215 }
3216 continue;
3217 }
3218
3219 tu_copy_image_to_image_cpu(device, src_image, dst_image,
3220 pCopyImageToImageInfo->pRegions + i,
3221 copy_memcpy);
3222 }
3223
3224 if (dst_image->lrz_layout.lrz_total_size) {
3225 TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
3226 }
3227
3228 return VK_SUCCESS;
3229 }
3230
3231 VKAPI_ATTR VkResult VKAPI_CALL
3232 tu_TransitionImageLayoutEXT(VkDevice device,
3233 uint32_t transitionCount,
3234 const VkHostImageLayoutTransitionInfoEXT *transitions)
3235 {
3236 /* We don't do anything with layouts so this should be a no-op */
3237 return VK_SUCCESS;
3238 }
3239
3240 template <chip CHIP>
3241 static void
3242 copy_buffer(struct tu_cmd_buffer *cmd,
3243 uint64_t dst_va,
3244 uint64_t src_va,
3245 uint64_t size,
3246 uint32_t block_size,
3247 bool *unaligned_store)
3248 {
3249 const struct blit_ops *ops = &r2d_ops<CHIP>;
3250 struct tu_cs *cs = &cmd->cs;
3251 enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
3252 uint64_t blocks = size / block_size;
3253
3254 handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
3255
3256 ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
3257 VK_SAMPLE_COUNT_1_BIT);
3258
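   /* Split the copy into chunks: each blit uses a 64-byte-aligned base
    * address, the low bits of each VA become the x offset, and the width of
    * a single blit is capped at 0x4000 blocks.
    */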
3259 while (blocks) {
3260 uint32_t src_x = (src_va & 63) / block_size;
3261 uint32_t dst_x = (dst_va & 63) / block_size;
3262 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
3263
3264 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
3265 ops->dst_buffer( cs, format, dst_va & ~63, 0, format);
3266 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
3267 ops->run(cmd, cs);
3268
3269 src_va += width * block_size;
3270 dst_va += width * block_size;
3271 blocks -= width;
3272 }
3273
3274 ops->teardown(cmd, cs);
3275 }
3276
3277 template <chip CHIP>
3278 VKAPI_ATTR void VKAPI_CALL
3279 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
3280 const VkCopyBufferInfo2 *pCopyBufferInfo)
3281 {
3282 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3283 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
3284 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
3285
3286 bool unaligned_store = false;
3287 for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
3288 const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
3289 copy_buffer<CHIP>(cmd,
3290 dst_buffer->iova + region->dstOffset,
3291 src_buffer->iova + region->srcOffset,
3292 region->size, 1, &unaligned_store);
3293 }
3294
3295 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3296 }
3297 TU_GENX(tu_CmdCopyBuffer2);
3298
3299 template <chip CHIP>
3300 VKAPI_ATTR void VKAPI_CALL
3301 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
3302 VkBuffer dstBuffer,
3303 VkDeviceSize dstOffset,
3304 VkDeviceSize dataSize,
3305 const void *pData)
3306 {
3307 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3308 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3309
3310 struct tu_cs_memory tmp;
3311 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
3312 if (result != VK_SUCCESS) {
3313 vk_command_buffer_set_error(&cmd->vk, result);
3314 return;
3315 }
3316
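   /* Stage the update data in the command-stream memory allocated above, then
    * blit it into the destination buffer via the same path as vkCmdCopyBuffer2.
    */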
3317 bool unaligned_store = false;
3318 memcpy(tmp.map, pData, dataSize);
3319 copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
3320
3321 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3322 }
3323 TU_GENX(tu_CmdUpdateBuffer);
3324
3325 template <chip CHIP>
3326 static void
3327 tu_cmd_fill_buffer(VkCommandBuffer commandBuffer,
3328 VkDeviceAddress dstAddr,
3329 VkDeviceSize fillSize,
3330 uint32_t data)
3331 {
3332 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3333 const struct blit_ops *ops = &r2d_ops<CHIP>;
3334 struct tu_cs *cs = &cmd->cs;
3335
3336 uint32_t blocks = fillSize / 4;
3337
3338 bool unaligned_store = false;
3339 handle_buffer_unaligned_store<CHIP>(cmd, dstAddr, fillSize, &unaligned_store);
3340
3341 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
3342 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
3343 VK_SAMPLE_COUNT_1_BIT);
3344
3345 VkClearValue clear_val = {};
3346 clear_val.color.uint32[0] = data;
3347 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
3348
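   /* As in copy_buffer(), split the fill into chunks with a 64-byte-aligned
    * base address and a capped width per blit.
    */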
3349 while (blocks) {
3350 uint32_t dst_x = (dstAddr & 63) / 4;
3351 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
3352
3353 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dstAddr & ~63, 0, PIPE_FORMAT_R32_UINT);
3354 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
3355 ops->run(cmd, cs);
3356
3357 dstAddr += width * 4;
3358 blocks -= width;
3359 }
3360
3361 ops->teardown(cmd, cs);
3362
3363 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3364 }
3365
3366 void
3367 tu_cmd_fill_buffer_addr(VkCommandBuffer commandBuffer,
3368 VkDeviceAddress dstAddr,
3369 VkDeviceSize fillSize,
3370 uint32_t data)
3371 {
3372 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3373
3374 TU_CALLX(cmd->device, tu_cmd_fill_buffer)(commandBuffer, dstAddr, fillSize,
3375 data);
3376 }
3377
3378 template <chip CHIP>
3379 VKAPI_ATTR void VKAPI_CALL
3380 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
3381 VkBuffer dstBuffer,
3382 VkDeviceSize dstOffset,
3383 VkDeviceSize fillSize,
3384 uint32_t data)
3385 {
3386 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3387
3388 fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
3389
3390 VkDeviceAddress dst_va = buffer->iova + dstOffset;
3391
3392 tu_cmd_fill_buffer<CHIP>(commandBuffer, dst_va, fillSize, data);
3393 }
3394 TU_GENX(tu_CmdFillBuffer);
3395
3396 template <chip CHIP>
3397 VKAPI_ATTR void VKAPI_CALL
3398 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
3399 const VkResolveImageInfo2 *pResolveImageInfo)
3400 {
3401 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3402 VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
3403 VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
3404 const struct blit_ops *ops = &r2d_ops<CHIP>;
3405 struct tu_cs *cs = &cmd->cs;
3406
3407 enum pipe_format src_format =
3408 vk_format_to_pipe_format(src_image->vk.format);
3409 enum pipe_format dst_format =
3410 vk_format_to_pipe_format(dst_image->vk.format);
3411 ops->setup(cmd, cs, src_format, dst_format,
3412 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
3413 VK_SAMPLE_COUNT_1_BIT);
3414
3415 for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
3416 const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
3417 uint32_t layers = MAX2(info->extent.depth,
3418 vk_image_subresource_layer_count(&dst_image->vk,
3419 &info->dstSubresource));
3420
3421 /* TODO: aspect masks possible ? */
3422
3423 coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
3424
3425 struct fdl6_view dst, src;
3426 tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
3427 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
3428
3429 for (uint32_t i = 0; i < layers; i++) {
3430 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
3431 ops->dst(cs, &dst, i, src_format);
3432 ops->run(cmd, cs);
3433 }
3434 }
3435
3436 ops->teardown(cmd, cs);
3437 }
3438 TU_GENX(tu_CmdResolveImage2);
3439
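/* Iterate over the layers selected by layer_mask, or over the first `layers`
 * layers when no mask is given. For example, layer_mask = 0b101 runs the body
 * for layers 0 and 2; layer_mask = 0 with layers = 3 runs it for 0, 1 and 2.
 */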
3440 #define for_each_layer(layer, layer_mask, layers) \
3441 for (uint32_t layer = 0; \
3442 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
3443 layer++) \
3444 if (!layer_mask || (layer_mask & BIT(layer)))
3445
3446 template <chip CHIP>
3447 static void
3448 resolve_sysmem(struct tu_cmd_buffer *cmd,
3449 struct tu_cs *cs,
3450 VkFormat vk_src_format,
3451 VkFormat vk_dst_format,
3452 const struct tu_image_view *src,
3453 const struct tu_image_view *dst,
3454 uint32_t layer_mask,
3455 uint32_t layers,
3456 const VkRect2D *rect,
3457 bool src_separate_ds,
3458 bool dst_separate_ds)
3459 {
3460 const struct blit_ops *ops = &r2d_ops<CHIP>;
3461
3462 trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3463
3464 enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3465 enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3466
3467 ops->setup(cmd, cs, src_format, dst_format,
3468 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3469 VK_SAMPLE_COUNT_1_BIT);
3470 ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3471
3472 for_each_layer(i, layer_mask, layers) {
3473 if (src_separate_ds) {
3474 if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3475 r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3476 } else {
3477 r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3478 }
3479 } else {
3480 ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3481 }
3482
3483 if (dst_separate_ds) {
3484 if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3485 ops->dst_depth(cs, dst, i);
3486 } else {
3487 ops->dst_stencil(cs, dst, i);
3488 }
3489 } else {
3490 ops->dst(cs, &dst->view, i, src_format);
3491 }
3492
3493 ops->run(cmd, cs);
3494 }
3495
3496 ops->teardown(cmd, cs);
3497
3498 trace_end_sysmem_resolve(&cmd->trace, cs);
3499 }
3500
3501 template <chip CHIP>
3502 void
3503 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3504 struct tu_cs *cs,
3505 const struct tu_image_view *src,
3506 const struct tu_image_view *dst,
3507 uint32_t layer_mask,
3508 uint32_t layers,
3509 const VkRect2D *rect)
3510 {
3511 assert(src->vk.format == dst->vk.format ||
3512 (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3513 vk_format_is_depth_or_stencil(dst->image->vk.format)));
3514
3515 bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3516 bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3517
3518 if (dst_separate_ds) {
3519 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3520 src, dst, layer_mask, layers, rect,
3521 src_separate_ds, dst_separate_ds);
3522 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3523 src, dst, layer_mask, layers, rect,
3524 src_separate_ds, dst_separate_ds);
3525 } else {
3526 resolve_sysmem<CHIP>(cmd, cs, src->vk.format, dst->vk.format,
3527 src, dst, layer_mask, layers, rect,
3528 src_separate_ds, dst_separate_ds);
3529 }
3530 }
3531 TU_GENX(tu_resolve_sysmem);
3532
3533 template <chip CHIP>
3534 static uint32_t
3535 tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
3536 VkFormat format)
3537 {
3538 /* Resolve groups are not usable on a6xx, so no pending resolve is
3539 * established. The default value of 0 is returned as the buffer ID.
3540 */
3541 if (CHIP == A6XX)
3542 return 0;
3543
3544 resolve_group->pending_resolves = true;
3545
3546 assert(format != VK_FORMAT_D32_SFLOAT_S8_UINT);
3547 /* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
3548 * whether depth, stencil or both are being resolved.
3549 */
3550 if (vk_format_has_depth(format))
3551 return 0x8;
3552 if (vk_format_has_stencil(format))
3553 return 0x9;
3554
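   /* Color resolves cycle through the eight color buffer IDs (0..7). */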
3555 const uint32_t max_color_buffers = 8;
3556 uint32_t buffer_id = resolve_group->color_buffer_id++;
3557 return buffer_id % max_color_buffers;
3558 }
3559
3560 template <chip CHIP>
3561 void
3562 tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
3563 struct tu_cs *cs,
3564 struct tu_resolve_group *resolve_group)
3565 {
3566 /* Resolve groups are not usable on A6XX, so that template instantiation
3567 * should behave as a no-op.
3568 */
3569 if (CHIP == A6XX || !resolve_group->pending_resolves)
3570 return;
3571
3572 resolve_group->color_buffer_id = 0;
3573 resolve_group->pending_resolves = false;
3574
3575 tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
3576 }
3577 TU_GENX(tu_emit_resolve_group);
3578
3579 template <chip CHIP>
3580 static void
3581 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3582 struct tu_image *image,
3583 const VkClearValue *clear_value,
3584 const VkImageSubresourceRange *range,
3585 VkImageAspectFlags aspect_mask)
3586 {
3587 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3588 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3589 struct tu_cs *cs = &cmd->cs;
3590 enum pipe_format format;
3591 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3592 format = PIPE_FORMAT_R32_UINT;
3593 } else {
3594 format = tu_aspects_to_plane(image->vk.format, aspect_mask);
3595 }
3596
3597 if (image->layout[0].depth0 > 1) {
3598 assert(layer_count == 1);
3599 assert(range->baseArrayLayer == 0);
3600 }
3601
3602 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3603
3604 ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3605 (VkSampleCountFlagBits) image->layout[0].nr_samples);
3606 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3607 ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3608 else
3609 ops->clear_value(cmd, cs, format, clear_value);
3610
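   /* For 3D images, the effective layer count at each level is the number of
    * depth slices at that level, so it is recomputed per level.
    */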
3611 for (unsigned j = 0; j < level_count; j++) {
3612 if (image->layout[0].depth0 > 1)
3613 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3614
3615 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3616 u_minify(image->layout[0].width0, range->baseMipLevel + j),
3617 u_minify(image->layout[0].height0, range->baseMipLevel + j)
3618 });
3619
3620 struct fdl6_view dst;
3621 const VkImageSubresourceLayers subresource = {
3622 .aspectMask = aspect_mask,
3623 .mipLevel = range->baseMipLevel + j,
3624 .baseArrayLayer = range->baseArrayLayer,
3625 .layerCount = 1,
3626 };
3627 tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3628
3629 for (uint32_t i = 0; i < layer_count; i++) {
3630 ops->dst(cs, &dst, i, format);
3631 ops->run(cmd, cs);
3632 }
3633 }
3634
3635 ops->teardown(cmd, cs);
3636 }
3637
3638 static void
3639 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3640 struct tu_image *image,
3641 uint32_t buffer_id,
3642 const VkClearValue *clear_value,
3643 const VkImageSubresourceRange *range,
3644 VkImageAspectFlags aspect_mask)
3645 {
3646 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3647 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3648 VkFormat vk_format = image->vk.format;
3649 if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3650 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3651 vk_format = VK_FORMAT_S8_UINT;
3652 else
3653 vk_format = VK_FORMAT_D32_SFLOAT;
3654 }
3655
3656 enum pipe_format format = vk_format_to_pipe_format(vk_format);
3657
3658 if (image->layout[0].depth0 > 1) {
3659 assert(layer_count == 1);
3660 assert(range->baseArrayLayer == 0);
3661 }
3662
3663 struct tu_cs *cs = &cmd->cs;
3664
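   /* Program a generic sysmem clear: RB_BLIT_INFO selects BLIT_EVENT_CLEAR
    * and the RB_BLIT_CLEAR_COLOR registers below supply the clear value.
    */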
3665 tu_cs_emit_regs(cs,
3666 A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3667
3668 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3669 tu_cs_emit(cs, 0);
3670
3671 tu_cs_emit_regs(
3672 cs, A6XX_RB_BLIT_INFO(
3673 .type = BLIT_EVENT_CLEAR,
3674 .sample_0 = vk_format_is_int(vk_format) ||
3675 vk_format_is_depth_or_stencil(vk_format),
3676 .depth = vk_format_is_depth_or_stencil(vk_format),
3677 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
3678 .buffer_id = buffer_id));
3679
3680 uint32_t clear_vals[4] = {};
3681 pack_blit_event_clear_value(clear_value, format, clear_vals);
3682 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3683 tu_cs_emit_array(cs, clear_vals, 4);
3684
3685 for (unsigned level = 0; level < level_count; level++) {
3686 if (image->layout[0].depth0 > 1)
3687 layer_count =
3688 u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3689
3690 uint32_t width =
3691 u_minify(image->layout[0].width0, range->baseMipLevel + level);
3692 uint32_t height =
3693 u_minify(image->layout[0].height0, range->baseMipLevel + level);
3694 tu_cs_emit_regs(
3695 cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3696 A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3697
3698 struct fdl6_view dst;
3699 const VkImageSubresourceLayers subresource = {
3700 .aspectMask = aspect_mask,
3701 .mipLevel = range->baseMipLevel + level,
3702 .baseArrayLayer = range->baseArrayLayer,
3703 .layerCount = 1,
3704 };
3705 tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3706
3707 for (uint32_t layer = 0; layer < layer_count; layer++) {
3708
3709 struct event_blit_dst_view blt_view = {
3710 .image = image,
3711 .view = &dst,
3712 .layer = layer,
3713 };
3714
3715 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3716 uint32_t real_level = range->baseMipLevel + level;
3717 uint32_t real_layer = range->baseArrayLayer + layer;
3718 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3719 struct fdl_layout *layout = &image->layout[0];
3720 blt_view.depth_addr =
3721 image->iova +
3722 fdl_surface_offset(layout, real_level, real_layer);
3723 blt_view.depth_pitch = fdl_pitch(layout, real_level);
3724 } else {
3725 struct fdl_layout *layout = &image->layout[1];
3726 blt_view.stencil_addr =
3727 image->iova +
3728 fdl_surface_offset(layout, real_level, real_layer);
3729 blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3730 }
3731 }
3732
3733 event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3734 aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3735 }
3736 }
3737 }
3738
3739 static bool
3740 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3741 struct tu_image *image)
3742 {
3743 const struct fd_dev_info *info = cmd->device->physical_device->info;
3744 return info->a7xx.has_generic_clear &&
3745 /* A7XX supports R9G9B9E5_FLOAT as color attachment and supports
3746 * generic clears for it. A7XX TODO: allow R9G9B9E5_FLOAT
3747 * attachments.
3748 */
3749 image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 &&
3750 /* Clearing a VK_FORMAT_R8G8_* image with a fast-clear value at certain
3751 * dimensions (e.g. 960x540), followed by a GMEM renderpass, may lead
3752 * to a GPU fault on A7XX.
3753 */
3754 !(info->a7xx.r8g8_faulty_fast_clear_quirk && image_is_r8g8(image));
3755 }
3756
3757 template <chip CHIP>
3758 static void
3759 clear_image(struct tu_cmd_buffer *cmd,
3760 struct tu_image *image,
3761 uint32_t buffer_id,
3762 const VkClearValue *clear_value,
3763 const VkImageSubresourceRange *range,
3764 VkImageAspectFlags aspect_mask)
3765 {
3766 if (use_generic_clear_for_image_clear(cmd, image)) {
3767 clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
3768 } else {
3769 clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3770 }
3771 }
3772
3773 template <chip CHIP>
3774 VKAPI_ATTR void VKAPI_CALL
3775 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3776 VkImage image_h,
3777 VkImageLayout imageLayout,
3778 const VkClearColorValue *pColor,
3779 uint32_t rangeCount,
3780 const VkImageSubresourceRange *pRanges)
3781 {
3782 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3783 VK_FROM_HANDLE(tu_image, image, image_h);
3784
3785 bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3786 if (use_generic_clear) {
3787 /* Generic clear doesn't go through CCU (or other caches). */
3788 cmd->state.cache.flush_bits |=
3789 TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3790 tu_emit_cache_flush<CHIP>(cmd);
3791 }
3792
3793 struct tu_resolve_group resolve_group = {};
3794
3795 for (unsigned i = 0; i < rangeCount; i++) {
3796 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, image->vk.format);
3797 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3798 }
3799
3800 tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3801 if (use_generic_clear) {
3802 /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3803 * proceed only after the just-emitted generic clears are complete.
3804 */
3805 cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3806 tu_emit_cache_flush<CHIP>(cmd);
3807 }
3808 }
3809 TU_GENX(tu_CmdClearColorImage);
3810
3811 template <chip CHIP>
3812 VKAPI_ATTR void VKAPI_CALL
3813 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3814 VkImage image_h,
3815 VkImageLayout imageLayout,
3816 const VkClearDepthStencilValue *pDepthStencil,
3817 uint32_t rangeCount,
3818 const VkImageSubresourceRange *pRanges)
3819 {
3820 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3821 VK_FROM_HANDLE(tu_image, image, image_h);
3822
3823 bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3824 if (use_generic_clear) {
3825 /* Generic clear doesn't go through CCU (or other caches). */
3826 cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3827 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3828 TU_CMD_FLAG_WAIT_FOR_IDLE;
3829 tu_emit_cache_flush<CHIP>(cmd);
3830 }
3831
3832 struct tu_resolve_group resolve_group = {};
3833
3834 for (unsigned i = 0; i < rangeCount; i++) {
3835 const VkImageSubresourceRange *range = &pRanges[i];
3836
3837 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3838 /* can't clear both depth and stencil at once, split up the aspect mask */
3839 u_foreach_bit(b, range->aspectMask) {
3840 uint32_t buffer_id = 0;
3841 if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
3842 buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, VK_FORMAT_D32_SFLOAT);
3843 if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
3844 buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, VK_FORMAT_S8_UINT);
3845
3846 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
3847 }
3848 continue;
3849 }
3850
3851 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, image->vk.format);
3852 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3853 }
3854
3855 tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3856 if (use_generic_clear) {
3857 /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3858 * proceed only after the just-emitted generic clears are complete.
3859 */
3860 cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3861 tu_emit_cache_flush<CHIP>(cmd);
3862 }
3863
3864 tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3865 }
3866 TU_GENX(tu_CmdClearDepthStencilImage);
3867
3868 /* CmdClearAttachments uses the original color attachment index instead of the
3869 * remapped index used by the shader, and our MRTs use the remapped
3870 * indices, so we have to remap them. We should always be able to find a
3871 * shader attachment thanks to this VU:
3872 *
3873 * VUID-vkCmdClearAttachments-colorAttachment-09503
3874 * "The colorAttachment member of each element of pAttachments must not
3875 * identify a color attachment that is currently mapped to
3876 * VK_ATTACHMENT_UNUSED in commandBuffer via
3877 * VkRenderingAttachmentLocationInfoKHR"
3878 */
3879 static unsigned
3880 remap_attachment(struct tu_cmd_buffer *cmd, unsigned a)
3881 {
3882 unsigned i = cmd->vk.dynamic_graphics_state.cal.color_map[a];
3883 assert(i != MESA_VK_ATTACHMENT_UNUSED &&
3884 "app violates VUID-vkCmdClearAttachments-colorAttachment-09503");
3885 return i;
3886 }
3887
3888 template <chip CHIP>
3889 static void
3890 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3891 uint32_t attachment_count,
3892 const VkClearAttachment *attachments,
3893 uint32_t rect_count,
3894 const VkClearRect *rects)
3895 {
3896 /* the shader path here is special, it avoids changing MRT/etc state */
3897 const struct tu_subpass *subpass = cmd->state.subpass;
3898 const uint32_t mrt_count = subpass->color_count;
3899 struct tu_cs *cs = &cmd->draw_cs;
3900 uint32_t clear_value[MAX_RTS][4];
3901 float z_clear_val = 0.0f;
3902 uint8_t s_clear_val = 0;
3903 uint32_t clear_rts = 0, clear_components = 0;
3904 bool z_clear = false;
3905 bool s_clear = false;
3906
3907 trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3908
3909 for (uint32_t i = 0; i < attachment_count; i++) {
3910 uint32_t a;
3911 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3912 uint32_t c = attachments[i].colorAttachment;
3913 a = subpass->color_attachments[c].attachment;
3914 if (a == VK_ATTACHMENT_UNUSED)
3915 continue;
3916
3917 uint32_t remapped = remap_attachment(cmd, c);
3918 clear_rts |= 1 << remapped;
3919 clear_components |= 0xf << (remapped * 4);
3920 memcpy(clear_value[remapped], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3921 } else {
3922 a = subpass->depth_stencil_attachment.attachment;
3923 if (a == VK_ATTACHMENT_UNUSED)
3924 continue;
3925
3926 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3927 z_clear = true;
3928 z_clear_val = attachments[i].clearValue.depthStencil.depth;
3929 }
3930
3931 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3932 s_clear = true;
3933 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3934 }
3935 }
3936 }
3937
3938 /* We may not know the multisample count if there are no attachments, so
3939 * just bail early to avoid corner cases later.
3940 */
3941 if (clear_rts == 0 && !z_clear && !s_clear)
3942 return;
3943
3944 /* disable all draw states so they don't interfere
3945 * TODO: use and re-use draw states
3946 * we have to disable draw states individually to preserve
3947 * input attachment states, because a secondary command buffer
3948 * won't be able to restore them
3949 */
3950 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3951 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3952 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3953 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3954 continue;
3955 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3956 CP_SET_DRAW_STATE__0_DISABLE);
3957 tu_cs_emit_qw(cs, 0);
3958 }
3959 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3960
3961 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3962 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3963 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3964 0xfc000000);
3965 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3966
3967 r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3968
3969 /* Disable sample counting in order to not affect occlusion query. */
3970 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3971
3972 if (cmd->state.prim_generated_query_running_before_rp) {
3973 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3974 }
3975
3976 tu_cs_emit_regs(cs,
3977 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3978 tu_cs_emit_regs(cs,
3979 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3980
3981 tu_cs_emit_regs(cs,
3982 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3983
3984 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3985 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3986 for (uint32_t i = 0; i < mrt_count; i++) {
3987 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3988 .component_enable = COND(clear_rts & (1 << i), 0xf)));
3989 }
3990
3991 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3992 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3993
3994 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3995 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3996 .z_test_enable = z_clear,
3997 .z_write_enable = z_clear,
3998 .zfunc = FUNC_ALWAYS));
3999 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
4000 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
4001 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
4002 .stencil_enable = s_clear,
4003 .func = FUNC_ALWAYS,
4004 .zpass = STENCIL_REPLACE));
4005 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
4006 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
4007 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
4008 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
4009
4010 tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
4011
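   /* Pack the clear colors of the enabled RTs contiguously and upload them
    * as fragment shader constants.
    */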
4012 unsigned num_rts = util_bitcount(clear_rts);
4013 uint32_t packed_clear_value[MAX_RTS][4];
4014
4015 uint32_t idx = 0;
4016 u_foreach_bit(b, clear_rts) {
4017 memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
4018 idx++;
4019 }
4020
4021 if (num_rts > 0)
4022 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
4023 0, packed_clear_value, num_rts);
4024
4025 for (uint32_t i = 0; i < rect_count; i++) {
4026 /* This should be true because of this valid usage for
4027 * vkCmdClearAttachments:
4028 *
4029 * "If the render pass instance this is recorded in uses multiview,
4030 * then baseArrayLayer must be zero and layerCount must be one"
4031 */
4032 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
4033
4034 /* a630 doesn't support multiview masks, which means that we can't use
4035 * the normal multiview path without potentially recompiling a shader
4036 * on-demand or using a more complicated variant that takes the mask as
4037 * a const. Just use the layered path instead, since it shouldn't be
4038 * much worse.
4039 */
4040 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
4041 {
4042 const float coords[] = {
4043 rects[i].rect.offset.x,
4044 rects[i].rect.offset.y,
4045 z_clear_val,
4046 uif(rects[i].baseArrayLayer + layer),
4047 rects[i].rect.offset.x + rects[i].rect.extent.width,
4048 rects[i].rect.offset.y + rects[i].rect.extent.height,
4049 z_clear_val,
4050 1.0f,
4051 };
4052
4053 r3d_coords_raw(cmd, cs, coords);
4054 r3d_run_vis(cmd, cs);
4055 }
4056 }
4057
4058 /* Re-enable sample counting. */
4059 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
4060
4061 if (cmd->state.prim_generated_query_running_before_rp) {
4062 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
4063 }
4064
4065 trace_end_sysmem_clear_all(&cmd->trace, cs);
4066 }
4067
4068 template <chip CHIP>
4069 static void
4070 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4071 struct tu_cs *cs,
4072 uint32_t buffer_id,
4073 enum pipe_format format,
4074 uint8_t clear_mask,
4075 uint32_t gmem_offset,
4076 const VkClearValue *value)
4077 {
4078 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
4079 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
4080 blit_base_format<CHIP>(format, false, true)));
4081
4082 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
4083 .clear_mask = clear_mask,
4084 .buffer_id = buffer_id));
4085
4086 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
4087 tu_cs_emit(cs, gmem_offset);
4088
4089 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
4090 tu_cs_emit(cs, 0);
4091
4092 uint32_t clear_vals[4] = {};
4093 pack_blit_event_clear_value(value, format, clear_vals);
4094
4095 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4096 tu_cs_emit_array(cs, clear_vals, 4);
4097
4098 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
4099 }
4100
4101 template <chip CHIP>
4102 static void
4103 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4104 struct tu_cs *cs,
4105 struct tu_resolve_group *resolve_group,
4106 uint32_t attachment,
4107 uint32_t base_layer,
4108 uint32_t layers,
4109 uint32_t layer_mask,
4110 VkImageAspectFlags mask,
4111 const VkClearValue *value)
4112 {
4113 const struct tu_render_pass_attachment *att =
4114 &cmd->state.pass->attachments[attachment];
4115
4116 trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
4117
4118 tu_cs_emit_regs(cs,
4119 A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
4120
4121 enum pipe_format format = vk_format_to_pipe_format(att->format);
4122 for_each_layer(i, layer_mask, layers) {
4123 uint32_t layer = i + base_layer;
4124 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4125 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4126 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, VK_FORMAT_D32_SFLOAT);
4127 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
4128 tu_attachment_gmem_offset(cmd, att, layer), value);
4129 }
4130 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4131 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, VK_FORMAT_S8_UINT);
4132 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
4133 tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
4134 }
4135 } else {
4136 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, att->format);
4137 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, format, aspect_write_mask(format, mask),
4138 tu_attachment_gmem_offset(cmd, att, layer), value);
4139 }
4140 }
4141
4142 tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4143
4144 trace_end_gmem_clear(&cmd->trace, cs);
4145 }
4146
4147 template <chip CHIP>
4148 static void
4149 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
4150 uint32_t attachment_count,
4151 const VkClearAttachment *attachments,
4152 uint32_t rect_count,
4153 const VkClearRect *rects)
4154 {
4155 const struct tu_subpass *subpass = cmd->state.subpass;
4156 struct tu_cs *cs = &cmd->draw_cs;
4157
4158 if (rect_count > 1)
4159 perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
4160
4161 struct tu_resolve_group resolve_group = {};
4162
4163 for (unsigned i = 0; i < rect_count; i++) {
4164 unsigned x1 = rects[i].rect.offset.x;
4165 unsigned y1 = rects[i].rect.offset.y;
4166 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
4167 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
4168
4169 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4170 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4171 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4172
4173 for (unsigned j = 0; j < attachment_count; j++) {
4174 uint32_t a;
4175 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
4176 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
4177 else
4178 a = subpass->depth_stencil_attachment.attachment;
4179
4180 if (a == VK_ATTACHMENT_UNUSED)
4181 continue;
4182
4183 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
4184 rects[i].baseArrayLayer,
4185 rects[i].layerCount,
4186 subpass->multiview_mask,
4187 attachments[j].aspectMask,
4188 &attachments[j].clearValue);
4189 }
4190 }
4191
4192 tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
4193 }
4194
4195 template <chip CHIP>
4196 static void
4197 tu_clear_attachments(struct tu_cmd_buffer *cmd,
4198 uint32_t attachmentCount,
4199 const VkClearAttachment *pAttachments,
4200 uint32_t rectCount,
4201 const VkClearRect *pRects)
4202 {
4203 struct tu_cs *cs = &cmd->draw_cs;
4204
4205 /* The sysmem path behaves like a draw. Note that we don't have a way of using
4206 * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
4207 */
4208 tu_emit_cache_flush_renderpass<CHIP>(cmd);
4209
4210 /* vkCmdClearAttachments is supposed to respect the predicate if active. The
4211 * easiest way to do this is to always use the 3d path, which always works
4212 * even with GMEM because it's just a simple draw using the existing
4213 * attachment state.
4214 *
4215 * Similarly, we also use the 3D path when in a secondary command buffer that
4216 * doesn't know the GMEM layout that will be chosen by the primary.
4217 */
4218 if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
4219 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4220 return;
4221 }
4222
4223 /* If we could skip tile load/stores based on any draws intersecting them at
4224 * binning time, then emit the clear as a 3D draw so that it contributes to
4225 * that visibility.
4226 */
4227 const struct tu_subpass *subpass = cmd->state.subpass;
4228 for (uint32_t i = 0; i < attachmentCount; i++) {
4229 uint32_t a;
4230 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4231 uint32_t c = pAttachments[i].colorAttachment;
4232 a = subpass->color_attachments[c].attachment;
4233 } else {
4234 a = subpass->depth_stencil_attachment.attachment;
4235 }
4236 if (a != VK_ATTACHMENT_UNUSED) {
4237 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4238 if (att->cond_load_allowed || att->cond_store_allowed) {
4239 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4240 return;
4241 }
4242 }
4243 }
4244
4245 /* Otherwise, emit 2D blits for gmem rendering. */
4246 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
4247 tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4248 tu_cond_exec_end(cs);
4249
4250 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4251 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4252 tu_cond_exec_end(cs);
4253 }
4254
4255 static void
4256 tu7_clear_attachment_generic_single_rect(
4257 struct tu_cmd_buffer *cmd,
4258 struct tu_cs *cs,
4259 struct tu_resolve_group *resolve_group,
4260 const struct tu_render_pass_attachment *att,
4261 const VkClearAttachment *clear_att,
4262 uint32_t a,
4263 const VkClearRect *rect)
4264 {
4265 const struct tu_subpass *subpass = cmd->state.subpass;
4266 unsigned x1 = rect->rect.offset.x;
4267 unsigned y1 = rect->rect.offset.y;
4268 unsigned x2 = x1 + rect->rect.extent.width - 1;
4269 unsigned y2 = y1 + rect->rect.extent.height - 1;
4270
4271 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4272 tu_cs_emit(cs,
4273 A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4274 tu_cs_emit(cs,
4275 A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4276
4277 auto value = &clear_att->clearValue;
4278
4279 enum pipe_format format = vk_format_to_pipe_format(att->format);
4280 for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
4281 uint32_t layer = i + rect->baseArrayLayer;
4282 uint32_t mask =
4283 aspect_write_mask_generic_clear(format, clear_att->aspectMask);
4284
4285 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4286 if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4287 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_D32_SFLOAT);
4288 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4289 false, layer, value, a);
4290 }
4291 if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4292 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_S8_UINT);
4293 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4294 layer, value, a);
4295 }
4296 } else {
4297 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, att->format);
4298 tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4299 }
4300 }
4301 }
4302
4303 static void
4304 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
4305 uint32_t attachmentCount,
4306 const VkClearAttachment *pAttachments,
4307 uint32_t rectCount,
4308 const VkClearRect *pRects)
4309 {
4310 struct tu_cs *cs = &cmd->draw_cs;
4311
4312 uint32_t clear_aspects = 0;
4313 for (uint32_t i = 0; i < attachmentCount; i++) {
4314 clear_aspects |= pAttachments[i].aspectMask;
4315 }
4316
4317 /* Generic clear doesn't go through CCU (or other caches),
4318 * so we have to flush (clean+invalidate) corresponding caches.
4319 */
4320 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4321 if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
4322 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4323 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
4324 }
4325 if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4326 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4327 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
4328 }
4329 tu_cs_emit_wfi(cs);
4330 tu_cond_exec_end(cs);
4331
4332 struct tu_resolve_group resolve_group = {};
4333
4334 const struct tu_subpass *subpass = cmd->state.subpass;
4335 for (uint32_t i = 0; i < attachmentCount; i++) {
4336 uint32_t a;
4337 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4338 uint32_t c = pAttachments[i].colorAttachment;
4339 a = subpass->color_attachments[c].attachment;
4340 } else {
4341 a = subpass->depth_stencil_attachment.attachment;
4342 }
4343 if (a != VK_ATTACHMENT_UNUSED) {
4344 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4345 const struct tu_image_view *iview = cmd->state.attachments[a];
4346 trace_start_generic_clear(&cmd->trace, cs, att->format,
4347 iview->view.ubwc_enabled, att->samples);
4348 for (unsigned j = 0; j < rectCount; j++) {
4349 tu7_clear_attachment_generic_single_rect(
4350 cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
4351 }
4352 trace_end_generic_clear(&cmd->trace, cs);
4353 }
4354 }
4355
4356 tu_emit_resolve_group<A7XX>(cmd, cs, &resolve_group);
4357 }
4358
4359 template <chip CHIP>
4360 VKAPI_ATTR void VKAPI_CALL
4361 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
4362 uint32_t attachmentCount,
4363 const VkClearAttachment *pAttachments,
4364 uint32_t rectCount,
4365 const VkClearRect *pRects)
4366 {
4367 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4368
4369 for (uint32_t j = 0; j < attachmentCount; j++) {
4370 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
4371 continue;
4372
4373 tu_lrz_disable_during_renderpass<CHIP>(cmd, "CmdClearAttachments");
4374 }
4375
4376 if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
4377 /* Both active predication and an unknown GMEM layout could be handled
4378 * by CS patching, which is exactly what the proprietary driver does.
4379 * We don't implement it because we don't expect a meaningful impact.
4380 */
4381 !(cmd->state.predication_active ||
4382 cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
4383 tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
4384 } else {
4385 tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
4386 rectCount, pRects);
4387 }
4388 }
4389 TU_GENX(tu_CmdClearAttachments);
4390
4391 template <chip CHIP>
4392 static void
4393 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4394 struct tu_cs *cs,
4395 VkFormat vk_format,
4396 VkImageAspectFlags clear_mask,
4397 uint32_t a,
4398 bool separate_ds)
4399 {
4400 enum pipe_format format = vk_format_to_pipe_format(vk_format);
4401 const struct tu_framebuffer *fb = cmd->state.framebuffer;
4402 const struct tu_image_view *iview = cmd->state.attachments[a];
4403 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
4404 const struct blit_ops *ops = &r2d_ops<CHIP>;
4405 const VkClearValue *value = &cmd->state.clear_values[a];
4406 if (cmd->state.pass->attachments[a].samples > 1)
4407 ops = &r3d_ops<CHIP>;
4408
4409 trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
4410 cmd->state.pass->attachments[a].samples);
4411
4412 ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
4413 cmd->state.pass->attachments[a].samples);
4414 ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
4415 cmd->state.render_area.extent);
4416 ops->clear_value(cmd, cs, format, value);
4417
4418 for_each_layer(i, clear_views, fb->layers) {
4419 if (separate_ds) {
4420 if (vk_format == VK_FORMAT_D32_SFLOAT) {
4421 ops->dst_depth(cs, iview, i);
4422 } else {
4423 ops->dst_stencil(cs, iview, i);
4424 }
4425 } else {
4426 ops->dst(cs, &iview->view, i, format);
4427 }
4428 ops->run(cmd, cs);
4429 }
4430
4431 ops->teardown(cmd, cs);
4432
4433 trace_end_sysmem_clear(&cmd->trace, cs);
4434 }
4435
4436 template <chip CHIP>
4437 void
4438 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4439 struct tu_cs *cs,
4440 uint32_t a)
4441 {
4442 const struct tu_render_pass_attachment *attachment =
4443 &cmd->state.pass->attachments[a];
4444
4445 if (!attachment->clear_mask)
4446 return;
4447
4448 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4449 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4450 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
4451 a, true);
4452 }
4453 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4454 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
4455 a, true);
4456 }
4457 } else {
4458 clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
4459 a, false);
4460 }
4461
4462 /* The spec doesn't explicitly say, but presumably the initial renderpass
4463 * clear is considered part of the renderpass, and therefore barriers
4464 * aren't required inside the subpass/renderpass. Therefore we need to
4465 * flush CCU color into CCU depth here, just like with
4466 * vkCmdClearAttachments(). Note that because this only happens at the
4467 * beginning of a renderpass, and renderpass writes are considered
4468 * "incoherent", we shouldn't have to worry about syncing depth into color
4469 * beforehand as depth should already be flushed.
4470 */
4471 if (vk_format_is_depth_or_stencil(attachment->format)) {
4472 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4473 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
4474 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
4475 } else {
4476 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4477 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
4478 }
4479
4480 tu_cs_emit_wfi(cs);
4481 }
4482 TU_GENX(tu_clear_sysmem_attachment);
4483
4484 template <chip CHIP>
4485 void
4486 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4487 struct tu_cs *cs,
4488 struct tu_resolve_group *resolve_group,
4489 uint32_t a)
4490 {
4491 const struct tu_render_pass_attachment *attachment =
4492 &cmd->state.pass->attachments[a];
4493
4494 if (!attachment->clear_mask)
4495 return;
4496
4497 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
4498 cmd->state.framebuffer->layers,
4499 attachment->clear_views,
4500 attachment->clear_mask,
4501 &cmd->state.clear_values[a]);
4502 }
4503 TU_GENX(tu_clear_gmem_attachment);
4504
4505 void
4506 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
4507 struct tu_cs *cs,
4508 struct tu_resolve_group *resolve_group,
4509 uint32_t a)
4510 {
4511 const struct tu_render_pass_attachment *att =
4512 &cmd->state.pass->attachments[a];
4513 const VkClearValue *value = &cmd->state.clear_values[a];
4514 const struct tu_image_view *iview = cmd->state.attachments[a];
4515
4516 trace_start_generic_clear(&cmd->trace, cs, att->format,
4517 iview->view.ubwc_enabled, att->samples);
4518
4519 enum pipe_format format = vk_format_to_pipe_format(att->format);
4520 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4521 uint32_t layer = i + 0;
4522 uint32_t mask =
4523 aspect_write_mask_generic_clear(format, att->clear_mask);
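/* D32_SFLOAT_S8_UINT lives in GMEM as two separate buffers (Z32 and S8),
 * so each plane is cleared individually.
 */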
4524 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4525 if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4526 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_D32_SFLOAT);
4527 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4528 false, layer, value, a);
4529 }
4530 if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4531 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, VK_FORMAT_S8_UINT);
4532 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4533 layer, value, a);
4534 }
4535 } else {
4536 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, att->format);
4537 tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4538 }
4539 }
4540
4541 tu_flush_for_access(&cmd->state.renderpass_cache,
4542 TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4543
4544 trace_end_generic_clear(&cmd->trace, cs);
4545 }
4546
4547 template <chip CHIP>
4548 static void
4549 tu_emit_blit(struct tu_cmd_buffer *cmd,
4550 struct tu_cs *cs,
4551 struct tu_resolve_group *resolve_group,
4552 const struct tu_image_view *iview,
4553 const struct tu_render_pass_attachment *attachment,
4554 const VkClearValue *clear_value,
4555 enum a6xx_blit_event_type blit_event_type,
4556 bool separate_stencil)
4557 {
4558 assert(blit_event_type != BLIT_EVENT_CLEAR);
4559 uint32_t clear_mask = 0;
4560
4561 /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
4562 * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
4563 */
4564 if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
4565 attachment->clear_mask &&
4566 use_generic_clear_for_image_clear(cmd, iview->image)) {
4567 blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
4568
4569 enum pipe_format format = vk_format_to_pipe_format(attachment->format);
4570 VkImageAspectFlags aspect_mask = attachment->clear_mask;
4571 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
4572 if (separate_stencil)
4573 aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
4574 else
4575 aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
4576 }
4577 if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
4578 if (separate_stencil)
4579 format = PIPE_FORMAT_S8_UINT;
4580 else
4581 format = PIPE_FORMAT_Z32_FLOAT;
4582 }
4583
4584 clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
4585
4586 uint32_t clear_vals[4] = {};
4587 pack_blit_event_clear_value(clear_value, format, clear_vals);
4588
4589 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4590 tu_cs_emit_array(cs, clear_vals, 4);
4591 }
4592
4593 VkFormat format = attachment->format;
4594 if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4595 format = separate_stencil ? VK_FORMAT_S8_UINT : VK_FORMAT_D32_SFLOAT;
4596 }
4597
4598 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, format);
4599 event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
4600
4601 for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4602 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4603 event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4604 }
4605
4606 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4607 TU_ACCESS_NONE);
4608 }
4609
4610 static bool
4611 blit_can_resolve(VkFormat format)
4612 {
4613 const struct util_format_description *desc = vk_format_description(format);
4614
4615 /* The blit event can only resolve in simple cases: averaging samples as
4616 * unsigned integers or choosing only one sample.
4617 * Note that this is allowed for sRGB formats, but the results differ from a 2D draw resolve.
4618 */
4619 if (vk_format_is_snorm(format))
4620 return false;
4621
4622 /* Can't do formats with channel sizes larger than 10 bits.
4623 * Note: this includes all float formats.
4624 * Note 2: single-channel integer formats seem OK.
4625 */
4626 if (desc->channel[0].size > 10 && vk_format_is_color(format))
4627 return false;
4628
4629 switch (format) {
4630 /* For unknown reasons the blit event can't MSAA-resolve these formats when tiled,
4631 * likely because these formats have a different layout from other cpp=2 formats.
4632 */
4633 case VK_FORMAT_R8G8_UNORM:
4634 case VK_FORMAT_R8G8_UINT:
4635 case VK_FORMAT_R8G8_SINT:
4636 case VK_FORMAT_R8G8_SRGB:
4637 return false;
4638 default:
4639 break;
4640 }
4641
4642 return true;
4643 }
4644
4645 struct apply_load_coords_state {
4646 unsigned view;
4647 };
4648
4649 static void
4650 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4651 struct tu_cs *cs,
4652 void *data,
4653 VkRect2D bin,
4654 unsigned views,
4655 VkExtent2D *frag_areas)
4656 {
4657 const struct apply_load_coords_state *state =
4658 (const struct apply_load_coords_state *)data;
4659 assert(state->view < views);
4660 VkExtent2D frag_area = frag_areas[state->view];
4661
4662 assert(bin.extent.width % frag_area.width == 0);
4663 assert(bin.extent.height % frag_area.height == 0);
4664 uint32_t scaled_width = bin.extent.width / frag_area.width;
4665 uint32_t scaled_height = bin.extent.height / frag_area.height;
4666
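/* The raw coords are laid out as { dst_x1, dst_y1, src_x1, src_y1,
 * dst_x2, dst_y2, src_x2, src_y2 } (see r3d_coords): the destination rect
 * covers the scaled bin in GMEM while the source rect covers the
 * full-resolution bin, so the load minifies the sysmem contents.
 */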
4667 const float coords[] = {
4668 bin.offset.x, bin.offset.y,
4669 bin.offset.x, bin.offset.y,
4670 bin.offset.x + scaled_width, bin.offset.y + scaled_height,
4671 bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4672 };
4673 r3d_coords_raw(cmd, cs, coords);
4674 }
4675
4676 template <chip CHIP>
4677 static void
4678 load_3d_blit(struct tu_cmd_buffer *cmd,
4679 struct tu_cs *cs,
4680 const struct tu_image_view *iview,
4681 const struct tu_render_pass_attachment *att,
4682 bool separate_stencil)
4683 {
4684 const struct tu_framebuffer *fb = cmd->state.framebuffer;
4685 enum pipe_format format = iview->view.format;
4686 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4687 if (separate_stencil)
4688 format = PIPE_FORMAT_S8_UINT;
4689 else
4690 format = PIPE_FORMAT_Z32_FLOAT;
4691 }
4692 r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4693 R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4694 iview->image->vk.samples);
4695
4696 if (!cmd->state.pass->has_fdm) {
4697 r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4698 (VkExtent2D) { fb->width, fb->height });
4699 }
4700
4701 /* Normal loads read directly from system memory, so we have to invalidate
4702 * UCHE in case it contains stale data.
4703 */
4704 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4705
4706 /* Wait for CACHE_INVALIDATE to land */
4707 tu_cs_emit_wfi(cs);
4708
4709 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4710 if (cmd->state.pass->has_fdm) {
4711 struct apply_load_coords_state state = {
4712 .view = att->clear_views ? i : 0,
4713 };
4714 tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4715 }
4716
4717 r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4718
4719 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4720 if (separate_stencil)
4721 r3d_src_stencil(cmd, cs, iview, i);
4722 else
4723 r3d_src_depth(cmd, cs, iview, i);
4724 } else {
4725 r3d_src_gmem_load(cmd, cs, iview, i);
4726 }
4727
4728 r3d_run(cmd, cs);
4729 }
4730
4731 r3d_teardown<CHIP>(cmd, cs);
4732
4733 /* It seems we need to WFI here for depth/stencil because color writes here
4734 * aren't synchronized with depth/stencil writes.
4735 *
4736 * Note: the blob also uses a WFI for color attachments but this hasn't
4737 * been seen to be necessary.
4738 */
4739 if (vk_format_is_depth_or_stencil(att->format))
4740 tu_cs_emit_wfi(cs);
4741 }
4742
4743 static void
4744 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4745 struct tu_cs *cs, bool load)
4746 {
4747 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4748
4749 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4750 return;
4751
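/* When TU_DEBUG(LOG_SKIP_GMEM_OPS) is set, also track how many loads/stores
 * were actually taken by updating the corresponding counter in the global BO
 * with a CP_MEM_TO_MEM.
 */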
4752 uint64_t result_iova;
4753 if (load)
4754 result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4755 else
4756 result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4757
4758 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4759 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4760 tu_cs_emit_qw(cs, result_iova);
4761 tu_cs_emit_qw(cs, result_iova);
4762 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4763 }
4764
4765 static void
4766 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4767 struct tu_cs *cs, bool load)
4768 {
4769 tu_cond_exec_end(cs);
4770
4771 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4772 return;
4773
4774 uint64_t result_iova;
4775 if (load)
4776 result_iova = global_iova(cmd, dbg_gmem_total_loads);
4777 else
4778 result_iova = global_iova(cmd, dbg_gmem_total_stores);
4779
4780 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4781 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4782 tu_cs_emit_qw(cs, result_iova);
4783 tu_cs_emit_qw(cs, result_iova);
4784 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4785 }
4786
4787 template <chip CHIP>
4788 void
4789 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4790 struct tu_cs *cs,
4791 struct tu_resolve_group *resolve_group,
4792 uint32_t a,
4793 bool cond_exec_allowed,
4794 bool force_load)
4795 {
4796 const struct tu_image_view *iview = cmd->state.attachments[a];
4797 const struct tu_render_pass_attachment *attachment =
4798 &cmd->state.pass->attachments[a];
4799
4800 bool load_common = attachment->load || force_load;
4801 bool load_stencil =
4802 attachment->load_stencil ||
4803 (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4804
4805 if (!load_common && !load_stencil)
4806 return;
4807
4808 trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4809
4810 /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4811 * to be only partially cleared, and since that clear is done with a 2D blit
4812 * it doesn't produce geometry, so we have to load unconditionally.
4813 *
4814 * To simplify the conditions, treat a partially cleared separate DS as fully
4815 * cleared and don't emit the cond_exec.
4816 */
4817 bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4818 if (cond_exec)
4819 tu_begin_load_store_cond_exec(cmd, cs, true);
4820
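/* The 3D path is used when forced via TU_DEBUG(3D_LOAD) or when FDM is
 * enabled, since FDM needs per-bin coordinate patching (see
 * fdm_apply_load_coords) which the BLIT event path can't do.
 */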
4821 if (TU_DEBUG(3D_LOAD) ||
4822 cmd->state.pass->has_fdm) {
4823 if (load_common || load_stencil)
4824 tu_disable_draw_states(cmd, cs);
4825
4826 if (load_common)
4827 load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4828
4829 if (load_stencil)
4830 load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4831 } else {
4832 if (load_common)
4833 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4834
4835 if (load_stencil)
4836 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4837 }
4838
4839 if (cond_exec)
4840 tu_end_load_store_cond_exec(cmd, cs, true);
4841
4842 trace_end_gmem_load(&cmd->trace, cs);
4843 }
4844 TU_GENX(tu_load_gmem_attachment);
4845
4846 template <chip CHIP>
4847 static void
4848 store_cp_blit(struct tu_cmd_buffer *cmd,
4849 struct tu_cs *cs,
4850 const struct tu_image_view *src_iview,
4851 const struct tu_image_view *dst_iview,
4852 uint32_t samples,
4853 bool separate_stencil,
4854 enum pipe_format src_format,
4855 enum pipe_format dst_format,
4856 uint32_t layer,
4857 uint32_t gmem_offset,
4858 uint32_t cpp)
4859 {
4860 r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4861 VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4862 dst_iview->view.ubwc_enabled, true);
4863
4864 if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4865 if (!separate_stencil) {
4866 r2d_dst_depth(cs, dst_iview, layer);
4867 } else {
4868 r2d_dst_stencil(cs, dst_iview, layer);
4869 }
4870 } else {
4871 r2d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
4872 }
4873
4874 /* Note: we compute the swap here instead of using the color_swap as
4875 * programmed when we set up the color attachment because the attachment in
4876 * GMEM ignores the swap except when MUTABLEEN is enabled. If the
4877 * color attachment is linear, we need to use the identity swap even if the
4878 * original attachment has a non-identity swap.
4879 */
4880 struct tu_native_format fmt =
4881 blit_format_texture<CHIP>(src_format, TILE6_2,
4882 src_iview->view.is_mutable, true);
4883 enum a6xx_format format = fmt.fmt;
4884 fixup_src_format(&src_format, dst_format, &format);
4885
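/* The source of the 2D blit is the attachment's contents in GMEM
 * (gmem_base + gmem_offset), stored tiled (TILE6_2) with a pitch of one
 * GMEM tile row.
 */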
4886 tu_cs_emit_regs(cs,
4887 SP_PS_2D_SRC_INFO(CHIP,
4888 .color_format = format,
4889 .tile_mode = TILE6_2,
4890 .color_swap = fmt.swap,
4891 .srgb = util_format_is_srgb(src_format),
4892 .samples = tu_msaa_samples(samples),
4893 .samples_average = !util_format_is_pure_integer(dst_format) &&
4894 !util_format_is_depth_or_stencil(dst_format),
4895 .unk20 = 1,
4896 .unk22 = 1,
4897 .mutableen = src_iview->view.is_mutable),
4898 SP_PS_2D_SRC_SIZE(CHIP,
4899 .width = dst_iview->vk.extent.width,
4900 .height = dst_iview->vk.extent.height),
4901 SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4902 SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4903
4904 /* sync GMEM writes with CACHE. */
4905 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4906 if (CHIP >= A7XX)
4907 /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4908 * arising from GMEM load/clears to land before we can continue.
4909 */
4910 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4911
4912 /* Wait for cache event to land */
4913 tu_cs_emit_wfi(cs);
4914
4915 r2d_run(cmd, cs);
4916
4917 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4918 * sysmem, and we generally assume that GMEM renderpasses leave their
4919 * results in sysmem, so we need to flush manually here.
4920 */
4921 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4922 }
4923
4924 template <chip CHIP>
4925 static void
4926 store_3d_blit(struct tu_cmd_buffer *cmd,
4927 struct tu_cs *cs,
4928 const struct tu_image_view *src_iview,
4929 const struct tu_image_view *dst_iview,
4930 VkSampleCountFlagBits dst_samples,
4931 bool separate_stencil,
4932 enum pipe_format src_format,
4933 enum pipe_format dst_format,
4934 const VkRect2D *render_area,
4935 uint32_t layer,
4936 uint32_t gmem_offset,
4937 uint32_t cpp)
4938 {
4939 /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4940 * aren't set until we know whether we're HW binning or not, and we want to
4941 * avoid a dependence on that here to be able to store attachments before
4942 * the end of the renderpass in the future. Use the scratch space to
4943 * save/restore them dynamically.
4944 */
4945 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4946 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4947 CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4948 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4949 if (CHIP >= A7XX) {
4950 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4951 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4952 CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4953 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4954 }
4955
4956 r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4957 0, false, dst_iview->view.ubwc_enabled, dst_samples);
4958
4959 r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4960
4961 if (dst_iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4962 if (!separate_stencil) {
4963 r3d_dst_depth<CHIP>(cs, dst_iview, layer);
4964 } else {
4965 r3d_dst_stencil<CHIP>(cs, dst_iview, layer);
4966 }
4967 } else {
4968 r3d_dst<CHIP>(cs, &dst_iview->view, layer, src_format);
4969 }
4970
4971 r3d_src_gmem<CHIP>(cmd, cs, src_iview, src_format, dst_format, gmem_offset, cpp);
4972
4973 /* sync GMEM writes with CACHE. */
4974 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4975
4976 /* Wait for CACHE_INVALIDATE to land */
4977 tu_cs_emit_wfi(cs);
4978
4979 r3d_run(cmd, cs);
4980
4981 r3d_teardown<CHIP>(cmd, cs);
4982
4983 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4984 * sysmem, and we generally assume that GMEM renderpasses leave their
4985 * results in sysmem, so we need to flush manually here. The 3d blit path
4986 * writes to depth images as a color RT, so there's no need to flush depth.
4987 */
4988 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4989
4990 /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
4991 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4992 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4993 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4994 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4995
4996 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4997 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4998 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4999 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
5000
5001 if (CHIP >= A7XX) {
5002 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
5003 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
5004 CP_SCRATCH_TO_REG_0_SCRATCH(1) |
5005 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
5006 }
5007 }
5008
5009 static bool
5010 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
5011 {
5012 struct tu_physical_device *phys_dev = cmd->device->physical_device;
5013 const struct tu_image_view *iview = cmd->state.attachments[a];
5014 const VkRect2D *render_area = &cmd->state.render_area;
5015
5016 /* Unaligned stores are incredibly rare in the CTS, so we have to force them in order to test this path. */
5017 if (TU_DEBUG(UNALIGNED_STORE))
5018 return true;
5019
5020 /* We always use the unaligned store path when scaling rendering. */
5021 if (cmd->state.pass->has_fdm)
5022 return true;
5023
5024 uint32_t x1 = render_area->offset.x;
5025 uint32_t y1 = render_area->offset.y;
5026 uint32_t x2 = x1 + render_area->extent.width;
5027 uint32_t y2 = y1 + render_area->extent.height;
5028 /* x2/y2 can be unaligned if equal to the size of the image, since it will
5029 * write into padding space. The one exception is linear levels which don't
5030 * have the required y padding in the layout (except for the last level)
5031 */
5032 bool need_y2_align =
5033 y2 != iview->view.height || iview->view.need_y2_align;
5034
5035 return (x1 % phys_dev->info->gmem_align_w ||
5036 (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
5037 y1 % phys_dev->info->gmem_align_h ||
5038 (y2 % phys_dev->info->gmem_align_h && need_y2_align));
5039 }
5040
5041 /* The fast path cannot handle mismatched mutability. */
5042 static bool
5043 tu_attachment_store_mismatched_mutability(struct tu_cmd_buffer *cmd, uint32_t a,
5044 uint32_t gmem_a)
5045 {
5046 if (a == gmem_a)
5047 return false;
5048
5049 const struct tu_image_view *dst_iview = cmd->state.attachments[a];
5050 const struct tu_image_view *src_iview = cmd->state.attachments[gmem_a];
5051
5052 return dst_iview->view.is_mutable != src_iview->view.is_mutable;
5053 }
5054
5055 /* Choose the GMEM layout (use the CCU space or not) based on whether the
5056 * current attachments will need it. This has to happen at vkBeginRenderPass()
5057 * time because tu_attachment_store_unaligned() looks at the image views, which
5058 * are only available at that point. This should match the logic for the
5059 * !use_fast_path case in tu_store_gmem_attachment().
5060 */
5061 void
5062 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
5063 {
5064 cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
5065
5066 for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
5067 if (!cmd->state.attachments[i])
5068 continue;
5069
5070 struct tu_render_pass_attachment *att =
5071 &cmd->state.pass->attachments[i];
5072 if ((att->store || att->store_stencil) &&
5073 tu_attachment_store_unaligned(cmd, i))
5074 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5075 if (att->store && att->format == VK_FORMAT_S8_UINT)
5076 /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
5077 * blit events for the S8_UINT format.
5078 */
5079 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5080 if (att->will_be_resolved && !blit_can_resolve(att->format))
5081 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5082 }
5083
5084 for (unsigned i = 0; i < cmd->state.pass->subpass_count; i++) {
5085 const struct tu_subpass *subpass = &cmd->state.pass->subpasses[i];
5086 for (unsigned j = 0; j < subpass->resolve_count; j++) {
5087 uint32_t a = subpass->resolve_attachments[j].attachment;
5088 if (a == VK_ATTACHMENT_UNUSED)
5089 continue;
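/* gmem_a is the resolve source that lives in GMEM: the depth/stencil
 * attachment for the depth/stencil resolve slot, otherwise the
 * corresponding color attachment.
 */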
5090 uint32_t gmem_a =
5091 j == subpass->color_count ?
5092 subpass->depth_stencil_attachment.attachment :
5093 subpass->color_attachments[j].attachment;
5094 if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a))
5095 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5096 }
5097 }
5098
5099 cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
5100 }
5101
5102 struct apply_store_coords_state {
5103 unsigned view;
5104 };
5105
5106 static void
5107 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
5108 struct tu_cs *cs,
5109 void *data,
5110 VkRect2D bin,
5111 unsigned views,
5112 VkExtent2D *frag_areas)
5113 {
5114 const struct apply_store_coords_state *state =
5115 (const struct apply_store_coords_state *)data;
5116 assert(state->view < views);
5117 VkExtent2D frag_area = frag_areas[state->view];
5118
5119 /* The bin width/height must be a multiple of the frag_area to make sure
5120 * that the scaling happens correctly. This means some destination
5121 * pixels may jut out of the framebuffer, but they should be
5122 * clipped by the render area.
5123 */
5124 assert(bin.extent.width % frag_area.width == 0);
5125 assert(bin.extent.height % frag_area.height == 0);
5126 uint32_t scaled_width = bin.extent.width / frag_area.width;
5127 uint32_t scaled_height = bin.extent.height / frag_area.height;
5128
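/* Mirror of the load path: the destination rect covers the full-resolution
 * bin in sysmem while the source rect covers the scaled bin in GMEM, so the
 * store magnifies the GMEM contents back to full resolution.
 */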
5129 tu_cs_emit_regs(cs,
5130 A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
5131 .y = bin.offset.y),
5132 A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
5133 .y = bin.offset.y + bin.extent.height - 1));
5134 tu_cs_emit_regs(cs,
5135 A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
5136 A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
5137 A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
5138 A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
5139 }
5140
5141 template <chip CHIP>
5142 void
5143 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
5144 struct tu_cs *cs,
5145 struct tu_resolve_group *resolve_group,
5146 uint32_t a,
5147 uint32_t gmem_a,
5148 uint32_t layers,
5149 uint32_t layer_mask,
5150 bool cond_exec_allowed)
5151 {
5152 const VkRect2D *render_area = &cmd->state.render_area;
5153 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
5154 const struct tu_image_view *dst_iview = cmd->state.attachments[a];
5155 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
5156 const struct tu_image_view *src_iview = cmd->state.attachments[gmem_a];
5157 const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
5158 bool resolve = a != gmem_a;
5159 if (resolve)
5160 clear_value = NULL;
5161
5162 if (!dst->store && !dst->store_stencil)
5163 return;
5164
5165 bool unaligned = tu_attachment_store_unaligned(cmd, a);
5166 bool mismatched_mutability =
5167 tu_attachment_store_mismatched_mutability(cmd, a, gmem_a);
5168
5169 /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes,
5170 * one for depth and the other for stencil. When resolving an MSAA
5171 * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
5172 */
5173 bool resolve_d32s8_s8 =
5174 src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
5175 dst->format == VK_FORMAT_S8_UINT;
5176
5177 /* The fast path doesn't support picking out the last component of a D24S8
5178 * texture reinterpreted as RGBA8_UNORM.
5179 */
5180 bool resolve_d24s8_s8 =
5181 src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
5182 dst->format == VK_FORMAT_S8_UINT;
5183
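/* When resolving D32S8 to S8, only the separately-stored stencil plane is
 * written, so the store goes through the separate-stencil path.
 */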
5184 bool store_common = dst->store && !resolve_d32s8_s8;
5185 bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
5186
5187 bool use_fast_path = !unaligned && !mismatched_mutability &&
5188 !resolve_d24s8_s8 &&
5189 (a == gmem_a || blit_can_resolve(dst->format));
5190
5191 trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
5192
5193 /* Unconditional store should happen only if the attachment was cleared,
5194 * which could have happened either by load_op or via vkCmdClearAttachments.
5195 */
5196 bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
5197 if (cond_exec) {
5198 tu_begin_load_store_cond_exec(cmd, cs, false);
5199 }
5200
5201 /* use fast path when render area is aligned, except for unsupported resolve cases */
5202 if (use_fast_path) {
5203 if (store_common)
5204 tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, false);
5205 if (store_separate_stencil)
5206 tu_emit_blit<CHIP>(cmd, cs, resolve_group, dst_iview, src, clear_value, BLIT_EVENT_STORE, true);
5207
5208 if (cond_exec) {
5209 tu_end_load_store_cond_exec(cmd, cs, false);
5210 }
5211
5212 trace_end_gmem_store(&cmd->trace, cs);
5213 return;
5214 }
5215
5216 assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
5217
5218 enum pipe_format src_format = vk_format_to_pipe_format(src->format);
5219 if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5220 src_format = PIPE_FORMAT_Z32_FLOAT;
5221
5222 enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
5223 if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5224 dst_format = PIPE_FORMAT_Z32_FLOAT;
5225
5226 if (dst->samples > 1) {
5227 /* If we hit this path, we have to disable draw states after every tile
5228 * instead of once at the end of the renderpass, so that they aren't
5229 * executed when calling CP_DRAW.
5230 *
5231 * TODO: store a flag somewhere so we don't do this more than once and
5232 * don't do it after the renderpass when this happens.
5233 */
5234 if (store_common || store_separate_stencil)
5235 tu_disable_draw_states(cmd, cs);
5236
5237 for_each_layer(i, layer_mask, layers) {
5238 if (store_common) {
5239 store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, false, src_format,
5240 dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5241 }
5242 if (store_separate_stencil) {
5243 store_3d_blit<CHIP>(cmd, cs, src_iview, dst_iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
5244 PIPE_FORMAT_S8_UINT, render_area, i,
5245 tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5246 }
5247 }
5248 } else {
5249 if (!cmd->state.pass->has_fdm) {
5250 r2d_coords(cmd, cs, render_area->offset, render_area->offset,
5251 render_area->extent);
5252 } else {
5253 /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
5254 * area and the coordinates span the entire render area, but for
5255 * FDM we need to scale the coordinates, so we take the opposite
5256 * approach: specify the exact bin size in the destination
5257 * coordinates and use GRAS_2D_RESOLVE_CNTL_* to clip to the render
5258 * area.
5259 */
5260 tu_cs_emit_regs(cs,
5261 A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
5262 .y = render_area->offset.y,),
5263 A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
5264 .y = render_area->offset.y + render_area->extent.height - 1,));
5265 }
5266
5267 for_each_layer (i, layer_mask, layers) {
5268 if (cmd->state.pass->has_fdm) {
5269 unsigned view = layer_mask ? i : 0;
5270 struct apply_store_coords_state state = {
5271 .view = view,
5272 };
5273 tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
5274 state);
5275 }
5276 if (store_common) {
5277 store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, false, src_format,
5278 dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5279 }
5280 if (store_separate_stencil) {
5281 store_cp_blit<CHIP>(cmd, cs, src_iview, dst_iview, src->samples, true, PIPE_FORMAT_S8_UINT,
5282 PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5283 }
5284 }
5285 }
5286
5287 if (cond_exec) {
5288 tu_end_load_store_cond_exec(cmd, cs, false);
5289 }
5290
5291 trace_end_gmem_store(&cmd->trace, cs);
5292 }
5293 TU_GENX(tu_store_gmem_attachment);
5294