1 /*
2 * Copyright 2019-2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Jonathan Marek <jonathan@marek.ca>
7 */
8
9 #include "tu_clear_blit.h"
10
11 #include "ir3/ir3_nir.h"
12
13 #include "util/format_r11g11b10f.h"
14 #include "util/format_rgb9e5.h"
15 #include "util/format_srgb.h"
16 #include "util/half_float.h"
17 #include "compiler/nir/nir_builder.h"
18
19 #include "tu_buffer.h"
20 #include "tu_cmd_buffer.h"
21 #include "tu_cs.h"
22 #include "tu_formats.h"
23 #include "tu_image.h"
24 #include "tu_tracepoints.h"
25 #include "tu_lrz.h"
26
27 #include "common/freedreno_gpu_event.h"
28 #include "common/freedreno_lrz.h"
29
30 static const VkOffset2D blt_no_coord = { ~0, ~0 };
31
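/* Round a [0,1] float to an N-bit unorm value, e.g. tu_pack_float32_for_unorm(0.5f, 8)
 * gives 128 (127.5 rounded to nearest even). Used below when packing clear values.
 */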
32 static uint32_t
33 tu_pack_float32_for_unorm(float val, int bits)
34 {
35 return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
36 }
37
38 /* r2d_ = BLIT_OP_SCALE operations */
39
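/* Map a pipe format to the internal format (ifmt) the 2D blitter converts
 * through, chosen from the width of the red channel: e.g. an 8-bit unorm
 * channel maps to R2D_UNORM8 and a 10- or 11-bit float channel to R2D_FLOAT16.
 */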
40 static enum a6xx_2d_ifmt
41 format_to_ifmt(enum pipe_format format)
42 {
43 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
44 format == PIPE_FORMAT_Z24X8_UNORM)
45 return R2D_UNORM8;
46
47 /* get_component_bits doesn't work with depth/stencil formats: */
48 if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
49 return R2D_FLOAT32;
50 if (format == PIPE_FORMAT_S8_UINT)
51 return R2D_INT8;
52 if (format == PIPE_FORMAT_A8_UNORM)
53 return R2D_UNORM8;
54
55 /* use the size of the red channel to find the corresponding "ifmt" */
56 bool is_int = util_format_is_pure_integer(format);
57 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
58 case 4: case 5: case 8:
59 return is_int ? R2D_INT8 : R2D_UNORM8;
60 case 10: case 11:
61 return is_int ? R2D_INT16 : R2D_FLOAT16;
62 case 16:
63 if (util_format_is_float(format))
64 return R2D_FLOAT16;
65 return is_int ? R2D_INT16 : R2D_FLOAT32;
66 case 32:
67 return is_int ? R2D_INT32 : R2D_FLOAT32;
68 default:
69 unreachable("bad format");
70 }
71 }
72
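/* Texture (blit source) format for BLIT_OP_SCALE. D24S8 is read back as raw
 * 8888, or as the UBWC-compatible Z24_UNORM_S8_UINT_AS_R8G8B8A8 for A7XX GMEM
 * (see the comment below).
 */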
73 template <chip CHIP>
74 static struct tu_native_format
75 blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode, bool is_mutable, bool gmem)
76 {
77 struct tu_native_format fmt = tu6_format_texture(format, tile_mode, is_mutable);
78
79 switch (format) {
80 case PIPE_FORMAT_Z24X8_UNORM:
81 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
82 * As in fdl6_view_init, we want to use
83 * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
84 * src. Since this is called when there is no image and thus no ubwc,
85 * we can always use FMT6_8_8_8_8_UNORM.
86 *
87 * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
88 * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
89 * instead.
90 */
91 fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
92 break;
93 default:
94 break;
95 }
96
97 return fmt;
98 }
99
100 static struct tu_native_format
101 blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
102 {
103 struct tu_native_format fmt = tu6_format_color(format, tile_mode, false);
104
105 switch (format) {
106 case PIPE_FORMAT_Z24X8_UNORM:
107 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
108 /* similar to blit_format_texture but for blit dst */
109 fmt.fmt = FMT6_8_8_8_8_UNORM;
110 break;
111 default:
112 break;
113 }
114
115 return fmt;
116 }
117
118 template <chip CHIP>
119 static enum a6xx_format
120 blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
121 {
122 if (CHIP >= A7XX && gmem)
123 /* A7XX requires D24S8 in GMEM to always be treated as
124 * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of whether the image
125 * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
126 * in misrendering around the edges of the destination image.
127 */
128 ubwc = true;
129
130 if (ubwc) {
131 switch (format) {
132 case PIPE_FORMAT_Z24X8_UNORM:
133 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
134 /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
135 return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
136 default:
137 break;
138 }
139 }
140
141 /* note: tu6_format_color doesn't care about tiling for .fmt field */
142 return blit_format_color(format, TILE6_LINEAR).fmt;
143 }
144
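/* Emit the destination rectangle (and the source rectangle, unless the caller
 * passed blt_no_coord, i.e. for a clear) for a BLIT_OP_SCALE operation.
 */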
145 static void
146 r2d_coords(struct tu_cmd_buffer *cmd,
147 struct tu_cs *cs,
148 const VkOffset2D dst,
149 const VkOffset2D src,
150 const VkExtent2D extent)
151 {
152 tu_cs_emit_regs(cs,
153 A6XX_GRAS_2D_DST_TL(.x = dst.x, .y = dst.y),
154 A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));
155
156 if (src.x == blt_no_coord.x)
157 return;
158
159 tu_cs_emit_regs(cs,
160 A6XX_GRAS_2D_SRC_TL_X(src.x),
161 A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
162 A6XX_GRAS_2D_SRC_TL_Y(src.y),
163 A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
164 }
165
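/* Pack a VkClearValue into RB_2D_SRC_SOLID_C0..C3 according to the internal
 * format the blitter uses for this pipe format.
 */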
166 static void
167 r2d_clear_value(struct tu_cmd_buffer *cmd,
168 struct tu_cs *cs,
169 enum pipe_format format,
170 const VkClearValue *val)
171 {
172 uint32_t clear_value[4] = {};
173
174 switch (format) {
175 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
176 case PIPE_FORMAT_Z24X8_UNORM:
177 /* cleared as r8g8b8a8_unorm using special format */
178 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
179 clear_value[1] = clear_value[0] >> 8;
180 clear_value[2] = clear_value[0] >> 16;
181 clear_value[3] = val->depthStencil.stencil;
182 break;
183 case PIPE_FORMAT_Z16_UNORM:
184 case PIPE_FORMAT_Z32_FLOAT:
185 /* R2D_FLOAT32 */
186 clear_value[0] = fui(val->depthStencil.depth);
187 break;
188 case PIPE_FORMAT_S8_UINT:
189 clear_value[0] = val->depthStencil.stencil;
190 break;
191 case PIPE_FORMAT_R9G9B9E5_FLOAT:
192 /* cleared as UINT32 */
193 clear_value[0] = float3_to_rgb9e5(val->color.float32);
194 break;
195 default:
196 assert(!util_format_is_depth_or_stencil(format));
197 const struct util_format_description *desc = util_format_description(format);
198 enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);
199
200 assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
201 format == PIPE_FORMAT_R11G11B10_FLOAT);
202
203 for (unsigned i = 0; i < 4; i++) {
204 if (desc->swizzle[i] > PIPE_SWIZZLE_W)
205 continue;
206
207 const struct util_format_channel_description *ch =
208 &desc->channel[desc->swizzle[i]];
209 if (ifmt == R2D_UNORM8) {
210 float linear = val->color.float32[i];
211 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
212 linear = util_format_linear_to_srgb_float(val->color.float32[i]);
213
214 if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
215 clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
216 else
217 clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
218 } else if (ifmt == R2D_FLOAT16) {
219 clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
220 } else {
221 assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
222 ifmt == R2D_INT16 || ifmt == R2D_INT8);
223 clear_value[i] = val->color.uint32[i];
224 }
225 }
226 break;
227 }
228
229 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
230 tu_cs_emit_array(cs, clear_value, 4);
231 }
232
233 static void
234 fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
235 enum a6xx_format *fmt)
236 {
237 /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
238 * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
239 * We also have to move stencil, which is normally in the .w channel, into
240 * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
241 * problems, and avoids using a swap, which seems to sometimes not work
242 * with a D24S8 source, or a texture swizzle which is only supported with
243 * the 3d path. Sometimes this blit happens on already-constructed
244 * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
245 */
246 if (*src_format == PIPE_FORMAT_S8_UINT &&
247 (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
248 dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
249 *fmt = FMT6_A8_UNORM;
250 *src_format = PIPE_FORMAT_A8_UNORM;
251 }
252 }
253
254 static void
255 fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
256 enum a6xx_format *fmt)
257 {
258 if (*dst_format == PIPE_FORMAT_S8_UINT &&
259 (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
260 src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
261 *dst_format = PIPE_FORMAT_A8_UNORM;
262 *fmt = FMT6_A8_UNORM;
263 }
264 }
265
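/* Program the 2D-path source registers from an existing fdl6_view. The color
 * format is patched via fixup_src_format when blitting S8 into D24S8, and the
 * bilinear filter bit is set for non-nearest filters.
 */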
266 template <chip CHIP>
267 static void
268 r2d_src(struct tu_cmd_buffer *cmd,
269 struct tu_cs *cs,
270 const struct fdl6_view *iview,
271 uint32_t layer,
272 VkFilter filter,
273 enum pipe_format dst_format)
274 {
275 uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
276 if (filter != VK_FILTER_NEAREST)
277 src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;
278
279 enum a6xx_format fmt = (enum a6xx_format)(
280 src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
281 enum pipe_format src_format = iview->format;
282 fixup_src_format(&src_format, dst_format, &fmt);
283
284 src_info =
285 (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
286 A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);
287
288 tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
289 tu_cs_emit(cs, src_info);
290 tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
291 tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);
292
293 tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
294 tu_cs_image_flag_ref(cs, iview, layer);
295 }
296
297 template <chip CHIP>
298 static void
299 r2d_src_depth(struct tu_cmd_buffer *cmd,
300 struct tu_cs *cs,
301 const struct tu_image_view *iview,
302 uint32_t layer,
303 VkFilter filter)
304 {
305 tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
306 tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
307 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
308 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
309 /* SP_PS_2D_SRC_PITCH has shifted pitch field */
310 tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);
311
312 tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
313 tu_cs_image_flag_ref(cs, &iview->view, layer);
314 }
315
316 template <chip CHIP>
317 static void
318 r2d_src_stencil(struct tu_cmd_buffer *cmd,
319 struct tu_cs *cs,
320 const struct tu_image_view *iview,
321 uint32_t layer,
322 VkFilter filter)
323 {
324 tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
325 tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
326 tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
327 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
328 tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
329 }
330
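/* 2D-path source from a linear buffer (va/pitch). The _unaligned variant
 * below handles A7XX source addresses that are not 64-byte aligned by masking
 * the address and programming a texel start offset instead.
 */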
331 template <chip CHIP>
332 static void
333 r2d_src_buffer(struct tu_cmd_buffer *cmd,
334 struct tu_cs *cs,
335 enum pipe_format format,
336 uint64_t va, uint32_t pitch,
337 uint32_t width, uint32_t height,
338 enum pipe_format dst_format)
339 {
340 struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
341 enum a6xx_format color_format = fmt.fmt;
342 fixup_src_format(&format, dst_format, &color_format);
343
344 tu_cs_emit_regs(cs,
345 SP_PS_2D_SRC_INFO(CHIP,
346 .color_format = color_format,
347 .color_swap = fmt.swap,
348 .srgb = util_format_is_srgb(format),
349 .unk20 = 1,
350 .unk22 = 1),
351 SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
352 SP_PS_2D_SRC(CHIP, .qword = va),
353 SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
354 }
355
356 template <chip CHIP>
357 static void
358 r2d_src_buffer_unaligned(struct tu_cmd_buffer *cmd,
359 struct tu_cs *cs,
360 enum pipe_format format,
361 uint64_t va,
362 uint32_t pitch,
363 uint32_t width,
364 uint32_t height,
365 enum pipe_format dst_format)
366 {
367 /* This functionality is only allowed on A7XX, this assertion statically
368 * disallows calling this function on prior generations by mistake.
369 */
370 static_assert(CHIP >= A7XX);
371
372 struct tu_native_format fmt =
373 blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
374 enum a6xx_format color_format = fmt.fmt;
375 fixup_src_format(&format, dst_format, &color_format);
376
377 uint32_t offset_texels = ((va & 0x3f) / util_format_get_blocksize(format));
378 va &= ~0x3f;
379 tu_cs_emit_regs(cs,
380 A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
381 .start_offset_texels = offset_texels,
382 .type = A6XX_TEX_IMG_BUFFER));
383
384 tu_cs_emit_regs(cs,
385 SP_PS_2D_SRC_INFO(CHIP, .color_format = color_format,
386 .color_swap = fmt.swap,
387 .srgb = util_format_is_srgb(format),
388 .unk20 = 1, .unk22 = 1),
389 SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
390 SP_PS_2D_SRC(CHIP, .qword = va),
391 SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
392 }
393
394 template <chip CHIP>
395 static void
396 r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
397 enum pipe_format src_format)
398 {
399 uint32_t dst_info = iview->RB_2D_DST_INFO;
400 enum a6xx_format fmt =
401 (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
402 enum pipe_format dst_format = iview->format;
403 fixup_dst_format(src_format, &dst_format, &fmt);
404
405 dst_info =
406 (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
407 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
408 tu_cs_emit(cs, dst_info);
409 tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);
410
411 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
412 tu_cs_image_flag_ref(cs, iview, layer);
413 }
414
415 static void
416 r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
417 {
418 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
419 tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
420 tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
421 tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);
422
423 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
424 tu_cs_image_flag_ref(cs, &iview->view, layer);
425 }
426
427 static void
428 r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
429 {
430 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
431 tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
432 tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
433 tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
434 }
435
436 static void
437 r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
438 enum pipe_format src_format)
439 {
440 struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
441 enum a6xx_format color_fmt = fmt.fmt;
442 fixup_dst_format(src_format, &format, &color_fmt);
443 fmt.fmt = color_fmt;
444
445 tu_cs_emit_regs(cs,
446 A6XX_RB_2D_DST_INFO(
447 .color_format = fmt.fmt,
448 .color_swap = fmt.swap,
449 .srgb = util_format_is_srgb(format)),
450 A6XX_RB_2D_DST(.qword = va),
451 A6XX_RB_2D_DST_PITCH(pitch));
452 }
453
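/* Common 2D-path setup: program RB_2D_BLIT_CNTL/GRAS_2D_BLIT_CNTL and
 * SP_2D_DST_FORMAT for the destination. For single-aspect D24S8 clears the
 * RB_2D_UNKNOWN_8C01 value selects whether the depth or stencil channels are
 * preserved.
 */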
454 template <chip CHIP>
455 static void
456 r2d_setup_common(struct tu_cmd_buffer *cmd,
457 struct tu_cs *cs,
458 enum pipe_format src_format,
459 enum pipe_format dst_format,
460 VkImageAspectFlags aspect_mask,
461 unsigned blit_param,
462 bool clear,
463 bool ubwc,
464 bool scissor)
465 {
466 if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
467 tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
468 }
469
470 enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
471 fixup_dst_format(src_format, &dst_format, &fmt);
472 enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);
473
474 uint32_t unknown_8c01 = 0;
475
476 /* note: the only format with partial clearing is D24S8 */
477 if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
478 /* preserve stencil channel */
479 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
480 unknown_8c01 = 0x08000041;
481 /* preserve depth channels */
482 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
483 unknown_8c01 = 0x00084001;
484 }
485
486 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
487 tu_cs_emit(cs, unknown_8c01); // TODO: seems to always be 0 on A7XX
488
489 uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
490 .rotate = (enum a6xx_rotation) blit_param,
491 .solid_color = clear,
492 .color_format = fmt,
493 .scissor = scissor,
494 .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
495 .mask = 0xf,
496 .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
497 ).value;
498
499 tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
500 tu_cs_emit(cs, blit_cntl);
501
502 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
503 tu_cs_emit(cs, blit_cntl);
504
505 if (CHIP > A6XX) {
506 tu_cs_emit_regs(cs, A7XX_TPL1_2D_SRC_CNTL(.raw_copy = false,
507 .start_offset_texels = 0,
508 .type = A6XX_TEX_2D));
509 }
510
511 if (fmt == FMT6_10_10_10_2_UNORM_DEST)
512 fmt = FMT6_16_16_16_16_FLOAT;
513
514 tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
515 .sint = util_format_is_pure_sint(dst_format),
516 .uint = util_format_is_pure_uint(dst_format),
517 .color_format = fmt,
518 .srgb = util_format_is_srgb(dst_format),
519 .mask = 0xf));
520 }
521
522 template <chip CHIP>
523 static void
524 r2d_setup(struct tu_cmd_buffer *cmd,
525 struct tu_cs *cs,
526 enum pipe_format src_format,
527 enum pipe_format dst_format,
528 VkImageAspectFlags aspect_mask,
529 unsigned blit_param,
530 bool clear,
531 bool ubwc,
532 VkSampleCountFlagBits samples)
533 {
534 assert(samples == VK_SAMPLE_COUNT_1_BIT);
535
536 if (!cmd->state.pass) {
537 tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
538 }
539
540 r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
541 }
542
543 static void
544 r2d_teardown(struct tu_cmd_buffer *cmd,
545 struct tu_cs *cs)
546 {
547 /* nothing to do here */
548 }
549
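/* Kick off the BLIT_OP_SCALE operation, temporarily switching
 * RB_DBG_ECO_CNTL to the blit-specific value when the two values differ.
 */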
550 static void
551 r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
552 {
553 if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
554 cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
555 /* This is a non-context register, so we have to WFI before changing. */
556 tu_cs_emit_wfi(cs);
557 tu_cs_emit_write_reg(
558 cs, REG_A6XX_RB_DBG_ECO_CNTL,
559 cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
560 }
561
562 tu_cs_emit_pkt7(cs, CP_BLIT, 1);
563 tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
564
565 if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
566 cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
567 tu_cs_emit_wfi(cs);
568 tu_cs_emit_write_reg(
569 cs, REG_A6XX_RB_DBG_ECO_CNTL,
570 cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
571 }
572 }
573
574 /* r3d_ = shader path operations */
575
576 static nir_def *
577 load_const(nir_builder *b, unsigned base, unsigned components)
578 {
579 return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
580 .base = base);
581 }
582
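/* Constant layout consumed by the shaders below (base offsets passed to
 * load_const, in dwords):
 *   0..1 / 4..5 - vertex 0/1 position (blit and clear VS)
 *   2..3 / 6..7 - vertex 0/1 texture coords (blit VS)
 *   2, 3        - clear depth and layer (clear VS)
 *   16          - z coordinate for the z-scale blit path
 *   4*i..4*i+3  - clear color for MRT i (clear FS)
 */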
583 static nir_shader *
584 build_blit_vs_shader(void)
585 {
586 nir_builder _b =
587 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
588 nir_builder *b = &_b;
589 b->shader->info.internal = true;
590
591 nir_variable *out_pos =
592 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
593 "gl_Position");
594 out_pos->data.location = VARYING_SLOT_POS;
595
596 nir_def *vert0_pos = load_const(b, 0, 2);
597 nir_def *vert1_pos = load_const(b, 4, 2);
598 nir_def *vertex = nir_load_vertex_id(b);
599
600 nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
601 pos = nir_vec4(b, nir_channel(b, pos, 0),
602 nir_channel(b, pos, 1),
603 nir_imm_float(b, 0.0),
604 nir_imm_float(b, 1.0));
605
606 nir_store_var(b, out_pos, pos, 0xf);
607
608 nir_variable *out_coords =
609 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
610 "coords");
611 out_coords->data.location = VARYING_SLOT_VAR0;
612
613 nir_def *vert0_coords = load_const(b, 2, 2);
614 nir_def *vert1_coords = load_const(b, 6, 2);
615
616 /* Only used with "z scale" blit path which uses a 3d texture */
617 nir_def *z_coord = load_const(b, 16, 1);
618
619 nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
620 coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
621 z_coord);
622
623 nir_store_var(b, out_coords, coords, 0x7);
624
625 return b->shader;
626 }
627
628 static nir_shader *
629 build_clear_vs_shader(void)
630 {
631 nir_builder _b =
632 nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
633 nir_builder *b = &_b;
634 b->shader->info.internal = true;
635
636 nir_variable *out_pos =
637 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
638 "gl_Position");
639 out_pos->data.location = VARYING_SLOT_POS;
640
641 nir_def *vert0_pos = load_const(b, 0, 2);
642 nir_def *vert1_pos = load_const(b, 4, 2);
643 /* c0.z is used to clear depth */
644 nir_def *depth = load_const(b, 2, 1);
645 nir_def *vertex = nir_load_vertex_id(b);
646
647 nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
648 pos = nir_vec4(b, nir_channel(b, pos, 0),
649 nir_channel(b, pos, 1),
650 depth, nir_imm_float(b, 1.0));
651
652 nir_store_var(b, out_pos, pos, 0xf);
653
654 nir_variable *out_layer =
655 nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
656 "gl_Layer");
657 out_layer->data.location = VARYING_SLOT_LAYER;
658 nir_def *layer = load_const(b, 3, 1);
659 nir_store_var(b, out_layer, layer, 1);
660
661 return b->shader;
662 }
663
664 static nir_shader *
665 build_blit_fs_shader(bool zscale)
666 {
667 nir_builder _b =
668 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
669 zscale ? "zscale blit fs" : "blit fs");
670 nir_builder *b = &_b;
671 b->shader->info.internal = true;
672
673 nir_variable *out_color =
674 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
675 "color0");
676 out_color->data.location = FRAG_RESULT_DATA0;
677
678 unsigned coord_components = zscale ? 3 : 2;
679 nir_variable *in_coords =
680 nir_variable_create(b->shader, nir_var_shader_in,
681 glsl_vec_type(coord_components),
682 "coords");
683 in_coords->data.location = VARYING_SLOT_VAR0;
684
685 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
686 /* Note: since we're just copying data, we rely on the HW ignoring the
687 * dest_type.
688 */
689 tex->dest_type = nir_type_int32;
690 tex->is_array = false;
691 tex->is_shadow = false;
692 tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;
693
694 tex->texture_index = 0;
695 tex->sampler_index = 0;
696
697 b->shader->info.num_textures = 1;
698 BITSET_SET(b->shader->info.textures_used, 0);
699
700 tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
701 nir_load_var(b, in_coords));
702 tex->coord_components = coord_components;
703
704 nir_def_init(&tex->instr, &tex->def, 4, 32);
705 nir_builder_instr_insert(b, &tex->instr);
706
707 nir_store_var(b, out_color, &tex->def, 0xf);
708
709 return b->shader;
710 }
711
712 /* We can only read multisample textures via txf_ms, so we need a separate
713 * variant for them.
714 */
715 static nir_shader *
716 build_ms_copy_fs_shader(bool half_float)
717 {
718 nir_builder _b =
719 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
720 "multisample copy fs");
721 nir_builder *b = &_b;
722 b->shader->info.internal = true;
723
724 nir_variable *out_color =
725 nir_variable_create(b->shader, nir_var_shader_out,
726 half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
727 "color0");
728 out_color->data.location = FRAG_RESULT_DATA0;
729
730 nir_variable *in_coords =
731 nir_variable_create(b->shader, nir_var_shader_in,
732 glsl_vec_type(2),
733 "coords");
734 in_coords->data.location = VARYING_SLOT_VAR0;
735
736 nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
737
738 tex->op = nir_texop_txf_ms;
739
740 /* Note: since we're just copying data, we rely on the HW ignoring the
741 * dest_type.
742 */
743 tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
744 tex->is_array = false;
745 tex->is_shadow = false;
746 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
747
748 tex->texture_index = 0;
749 tex->sampler_index = 0;
750
751 b->shader->info.num_textures = 1;
752 BITSET_SET(b->shader->info.textures_used, 0);
753 BITSET_SET(b->shader->info.textures_used_by_txf, 0);
754
755 nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));
756
757 tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
758 tex->coord_components = 2;
759
760 tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
761 nir_load_sample_id(b));
762
763 nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
764 nir_builder_instr_insert(b, &tex->instr);
765
766 nir_store_var(b, out_color, &tex->def, 0xf);
767
768 return b->shader;
769 }
770
771 static nir_shader *
772 build_clear_fs_shader(unsigned mrts)
773 {
774 nir_builder _b =
775 nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
776 "mrt%u clear fs", mrts);
777 nir_builder *b = &_b;
778 b->shader->info.internal = true;
779
780 for (unsigned i = 0; i < mrts; i++) {
781 nir_variable *out_color =
782 nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
783 "color");
784 out_color->data.location = FRAG_RESULT_DATA0 + i;
785
786 nir_def *color = load_const(b, 4 * i, 4);
787 nir_store_var(b, out_color, color, 0xf);
788 }
789
790 return b->shader;
791 }
792
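/* Compile one of the internal blit/clear shaders with ir3, copy its binary
 * into the global BO, and record the shader, variant and iova so r3d_common()
 * can bind it later.
 */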
793 static void
794 compile_shader(struct tu_device *dev, struct nir_shader *nir,
795 unsigned consts, unsigned *offset, enum global_shader idx)
796 {
797 nir->options = ir3_get_compiler_options(dev->compiler);
798
799 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
800 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
801
802 struct ir3_const_allocations const_allocs = {};
803 if (consts > 0)
804 ir3_const_alloc(&const_allocs, IR3_CONST_ALLOC_UBO_RANGES, align(consts, 8), 1);
805
806 const struct ir3_shader_options options = {
807 .api_wavesize = IR3_SINGLE_OR_DOUBLE,
808 .real_wavesize = IR3_SINGLE_OR_DOUBLE,
809 .const_allocs = const_allocs,
810 };
811
812 ir3_finalize_nir(dev->compiler, &options.nir_options, nir);
813
814 struct ir3_shader *sh =
815 ir3_shader_from_nir(dev->compiler, nir, &options, NULL);
816
817 struct ir3_shader_key key = {};
818 bool created;
819 struct ir3_shader_variant *so =
820 ir3_shader_get_variant(sh, &key, false, false, &created);
821
822 struct tu6_global *global = dev->global_bo_map;
823
824 assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
825 dev->global_shaders[idx] = sh;
826 dev->global_shader_variants[idx] = so;
827 memcpy(&global->shaders[*offset], so->bin,
828 sizeof(uint32_t) * so->info.sizedwords);
829 dev->global_shader_va[idx] = dev->global_bo->iova +
830 offsetof_arr(struct tu6_global, shaders, *offset);
831 *offset += align(so->info.sizedwords, 32);
832 }
833
834 void
835 tu_init_clear_blit_shaders(struct tu_device *dev)
836 {
837 unsigned offset = 0;
838 compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
839 compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
840 compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
841 compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
842 compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
843 compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);
844
845 for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
846 compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
847 (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
848 }
849 }
850
851 void
852 tu_destroy_clear_blit_shaders(struct tu_device *dev)
853 {
854 for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
855 if (dev->global_shaders[i])
856 ir3_shader_destroy(dev->global_shaders[i]);
857 }
858 }
859
860 enum r3d_type {
861 R3D_CLEAR,
862 R3D_BLIT,
863 R3D_COPY_HALF,
864 };
865
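/* Bind and configure the internal VS/FS pair for the 3D (draw-based) path:
 * clear, blit, z-scale blit, multisample copy, or the half-float copy used to
 * avoid NaN canonicalization, then emit the fixed rasterizer/viewport state a
 * RECTLIST draw needs.
 */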
866 template <chip CHIP>
867 static void
868 r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
869 uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
870 {
871 enum global_shader vs_id =
872 type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;
873
874 struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
875 uint64_t vs_iova = cmd->device->global_shader_va[vs_id];
876
877 enum global_shader fs_id = GLOBAL_SH_FS_BLIT;
878
879 if (z_scale) {
880 fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
881 } else if (type == R3D_COPY_HALF) {
882 /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
883 *
884 * TODO: Add a half-float blit shader that uses texture() but with half
885 * registers to avoid NaN canonicalization for the single-sampled case.
886 */
887 fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
888 } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
889 fs_id = GLOBAL_SH_FS_COPY_MS;
890 }
891
892 unsigned num_rts = util_bitcount(rts_mask);
893 if (type == R3D_CLEAR)
894 fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);
895
896 struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
897 uint64_t fs_iova = cmd->device->global_shader_va[fs_id];
898
899 tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
900 .vs_state = true,
901 .hs_state = true,
902 .ds_state = true,
903 .gs_state = true,
904 .fs_state = true,
905 .gfx_ibo = true,
906 .gfx_shared_const = true,
907 .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
908 .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));
909
910 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
911 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
912 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
913 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
914 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);
915
916 struct tu_pvtmem_config pvtmem = {};
917 tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
918 tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);
919
920 tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
921 if (CHIP == A7XX) {
922 tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
923 }
924
925 tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);
926
927 if (CHIP >= A7XX) {
928 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
929
930 tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
931 }
932
933 /* REPL_MODE for varying with RECTLIST (2 vertices only) */
934 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
935 tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));
936
937 tu6_emit_vs<CHIP>(cs, vs, 0);
938 tu6_emit_hs<CHIP>(cs, NULL);
939 tu6_emit_ds<CHIP>(cs, NULL);
940 tu6_emit_gs<CHIP>(cs, NULL);
941 tu6_emit_fs<CHIP>(cs, fs);
942
943 tu_cs_emit_regs(cs,
944 A6XX_GRAS_CL_CNTL(
945 .clip_disable = 1,
946 .vp_clip_code_ignore = 1,
947 .vp_xform_disable = 1,
948 .persp_division_disable = 1,));
949 tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?
950
951 tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
952 if (CHIP == A6XX) {
953 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
954 } else {
955 tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
956 }
957
958 tu_cs_emit_regs(cs,
959 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
960 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
961 tu_cs_emit_regs(cs,
962 A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
963 A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
964
965 tu_cs_emit_regs(cs,
966 A6XX_VFD_INDEX_OFFSET(),
967 A6XX_VFD_INSTANCE_START_OFFSET());
968
969 if (rts_mask) {
970 unsigned rts_count = util_last_bit(rts_mask);
971 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
972 unsigned rt = 0;
973 for (unsigned i = 0; i < rts_count; i++) {
974 unsigned regid = 0;
975 if (rts_mask & (1u << i))
976 regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
977 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
978 COND(regid & HALF_REG_ID,
979 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
980 }
981 }
982
983 tu6_emit_msaa(cs, samples, false);
984 }
985
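/* Upload blit constants indirectly: allocate sub_cs memory for the values and
 * emit a CP_LOAD_STATE6 that points at it.
 */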
986 static void
987 tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
988 struct tu_cs *cs,
989 uint32_t opcode,
990 enum a6xx_state_block block,
991 uint32_t offset,
992 const void *consts,
993 uint32_t size_vec4)
994 {
995 assert(offset % cmd->device->compiler->const_upload_unit == 0);
996
997 struct tu_cs_memory mem = {};
998 VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
999 if (result != VK_SUCCESS) {
1000 vk_command_buffer_set_error(&cmd->vk, result);
1001 return;
1002 }
1003
1004 memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));
1005
1006 tu_cs_emit_pkt7(cs, opcode, 3);
1007 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
1008 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1009 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1010 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
1011 CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
1012 tu_cs_emit_qw(cs, mem.iova);
1013 }
1014
1015 static void
1016 r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
1017 {
1018 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
1019 }
1020
1021 /* z coordinate for "z scale" blit path which uses a 3d texture */
1022 static void
1023 r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
1024 {
1025 const uint32_t coord[] = {
1026 fui(z),
1027 0,
1028 0,
1029 0,
1030 };
1031
1032 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
1033 }
1034
1035 static void
1036 r3d_coords(struct tu_cmd_buffer *cmd,
1037 struct tu_cs *cs,
1038 const VkOffset2D dst,
1039 const VkOffset2D src,
1040 const VkExtent2D extent)
1041 {
1042 const bool has_src = src.x != blt_no_coord.x;
1043 int32_t src_x1 = has_src ? src.x : 0;
1044 int32_t src_y1 = has_src ? src.y : 0;
1045
1046 const float coords[] = {
1047 dst.x,
1048 dst.y,
1049 src_x1,
1050 src_y1,
1051 dst.x + extent.width,
1052 dst.y + extent.height,
1053 src_x1 + extent.width,
1054 src_y1 + extent.height,
1055 };
1056 r3d_coords_raw(cmd, cs, coords);
1057 }
1058
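/* Pack a clear value into the vec4 constant consumed by the clear FS. For
 * D24S8, e.g. depth = 1.0 and stencil = 0xff pack to (1.0, 1.0, 1.0, 1.0)
 * over the r8g8b8a8 view of the attachment.
 */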
1059 static void
1060 r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
1061 {
1062 uint32_t coords[4] = {};
1063
1064 switch (format) {
1065 case PIPE_FORMAT_Z24X8_UNORM:
1066 case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
1067 /* cleared as r8g8b8a8_unorm using special format */
1068 uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
1069 coords[0] = fui((tmp & 0xff) / 255.0f);
1070 coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
1071 coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
1072 coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
1073 } break;
1074 case PIPE_FORMAT_Z16_UNORM:
1075 case PIPE_FORMAT_Z32_FLOAT:
1076 coords[0] = fui(val->depthStencil.depth);
1077 coords[1] = 0;
1078 coords[2] = 0;
1079 coords[3] = 0;
1080 break;
1081 case PIPE_FORMAT_S8_UINT:
1082 coords[0] = val->depthStencil.stencil & 0xff;
1083 coords[1] = 0;
1084 coords[2] = 0;
1085 coords[3] = 0;
1086 break;
1087 default:
1088 /* as color formats use clear value as-is */
1089 assert(!util_format_is_depth_or_stencil(format));
1090 memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
1091 break;
1092 }
1093
1094 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
1095 }
1096
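/* Emit a texture descriptor plus a clamp-to-edge sampler for the 3D blit
 * path, patching the base and UBWC addresses by the layer offsets, and bind
 * them as FS texture 0.
 */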
1097 static void
1098 r3d_src_common(struct tu_cmd_buffer *cmd,
1099 struct tu_cs *cs,
1100 const uint32_t *tex_const,
1101 uint32_t offset_base,
1102 uint32_t offset_ubwc,
1103 VkFilter filter)
1104 {
1105 struct tu_cs_memory texture = { };
1106 VkResult result = tu_cs_alloc(&cmd->sub_cs,
1107 2, /* allocate space for a sampler too */
1108 A6XX_TEX_CONST_DWORDS, &texture);
1109 if (result != VK_SUCCESS) {
1110 vk_command_buffer_set_error(&cmd->vk, result);
1111 return;
1112 }
1113
1114 memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);
1115
1116 /* patch addresses for layer offset */
1117 *(uint64_t*) (texture.map + 4) += offset_base;
1118 uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
1119 texture.map[7] = ubwc_addr;
1120 texture.map[8] = ubwc_addr >> 32;
1121
1122 texture.map[A6XX_TEX_CONST_DWORDS + 0] =
1123 A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
1124 A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
1125 A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
1126 A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
1127 A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
1128 0x60000; /* XXX used by blob, doesn't seem necessary */
1129 texture.map[A6XX_TEX_CONST_DWORDS + 1] =
1130 A6XX_TEX_SAMP_1_UNNORM_COORDS |
1131 A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
1132 texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
1133 texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;
1134
1135 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
1136 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1137 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1138 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1139 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1140 CP_LOAD_STATE6_0_NUM_UNIT(1));
1141 tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);
1142
1143 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));
1144
1145 tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
1146 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1147 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1148 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1149 CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
1150 CP_LOAD_STATE6_0_NUM_UNIT(1));
1151 tu_cs_emit_qw(cs, texture.iova);
1152
1153 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
1154 tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
1155 }
1156
1157 static void
1158 r3d_src(struct tu_cmd_buffer *cmd,
1159 struct tu_cs *cs,
1160 const struct fdl6_view *iview,
1161 uint32_t layer,
1162 VkFilter filter,
1163 enum pipe_format dst_format)
1164 {
1165 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1166 memcpy(desc, iview->descriptor, sizeof(desc));
1167
1168 enum a6xx_format fmt = (enum a6xx_format)(
1169 (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
1170 enum pipe_format src_format = iview->format;
1171 fixup_src_format(&src_format, dst_format, &fmt);
1172 desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
1173 A6XX_TEX_CONST_0_FMT(fmt);
1174
1175 r3d_src_common(cmd, cs, desc,
1176 iview->layer_size * layer,
1177 iview->ubwc_layer_size * layer,
1178 filter);
1179 }
1180
1181 template <chip CHIP>
1182 static void
1183 r3d_src_buffer(struct tu_cmd_buffer *cmd,
1184 struct tu_cs *cs,
1185 enum pipe_format format,
1186 uint64_t va, uint32_t pitch,
1187 uint32_t width, uint32_t height,
1188 enum pipe_format dst_format)
1189 {
1190 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1191
1192 struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, false);
1193 enum a6xx_format color_format = fmt.fmt;
1194 fixup_src_format(&format, dst_format, &color_format);
1195
1196 desc[0] =
1197 COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
1198 A6XX_TEX_CONST_0_FMT(color_format) |
1199 A6XX_TEX_CONST_0_SWAP(fmt.swap) |
1200 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1201 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1202 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1203 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1204 desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
1205 desc[2] =
1206 A6XX_TEX_CONST_2_PITCH(pitch) |
1207 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1208 desc[3] = 0;
1209 desc[4] = va;
1210 desc[5] = va >> 32;
1211 for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1212 desc[i] = 0;
1213
1214 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1215 }
1216
1217 static void
1218 r3d_src_depth(struct tu_cmd_buffer *cmd,
1219 struct tu_cs *cs,
1220 const struct tu_image_view *iview,
1221 uint32_t layer)
1222 {
1223 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1224
1225 memcpy(desc, iview->view.descriptor, sizeof(desc));
1226 uint64_t va = iview->depth_base_addr;
1227
1228 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1229 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1230 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1231 A6XX_TEX_CONST_0_SWAP__MASK);
1232 desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
1233 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1234 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1235 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1236 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1237 desc[2] =
1238 A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
1239 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1240 desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
1241 (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
1242 desc[4] = va;
1243 desc[5] = va >> 32;
1244
1245 r3d_src_common(cmd, cs, desc,
1246 iview->depth_layer_size * layer,
1247 iview->view.ubwc_layer_size * layer,
1248 VK_FILTER_NEAREST);
1249 }
1250
1251 static void
1252 r3d_src_stencil(struct tu_cmd_buffer *cmd,
1253 struct tu_cs *cs,
1254 const struct tu_image_view *iview,
1255 uint32_t layer)
1256 {
1257 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1258
1259 memcpy(desc, iview->view.descriptor, sizeof(desc));
1260 uint64_t va = iview->stencil_base_addr;
1261
1262 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1263 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1264 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1265 A6XX_TEX_CONST_0_SWAP__MASK);
1266 desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
1267 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1268 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1269 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1270 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1271 desc[2] =
1272 A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
1273 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
1274 desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
1275 desc[4] = va;
1276 desc[5] = va >> 32;
1277 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1278 desc[i] = 0;
1279
1280 r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
1281 VK_FILTER_NEAREST);
1282 }
1283
1284 static void
1285 r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
1286 struct tu_cs *cs,
1287 const struct tu_image_view *iview,
1288 uint32_t layer)
1289 {
1290 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1291
1292 memcpy(desc, iview->view.descriptor, sizeof(desc));
1293
1294 /* Fixup D24 formats because we always load both depth and stencil. */
1295 enum pipe_format format = iview->view.format;
1296 if (format == PIPE_FORMAT_X24S8_UINT ||
1297 format == PIPE_FORMAT_Z24X8_UNORM ||
1298 format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1299 desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
1300 if (iview->view.ubwc_enabled)
1301 desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
1302 else
1303 desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
1304 }
1305
1306 /* When loading/storing GMEM we always load the full image and don't do any
1307 * swizzling or swapping; that's done in the draw when reading/writing
1308 * GMEM, so we need to fix up the swizzle and swap here.
1309 */
1310 desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1311 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1312 A6XX_TEX_CONST_0_SWAP__MASK);
1313 desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1314 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1315 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1316 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1317
1318 r3d_src_common(cmd, cs, desc,
1319 iview->view.layer_size * layer,
1320 iview->view.ubwc_layer_size * layer,
1321 VK_FILTER_NEAREST);
1322 }
1323
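/* Source setup for resolving from GMEM with the 3D path: the descriptor is
 * rewritten to point at the tile buffer (TILE6_2 layout at gmem_base +
 * gmem_offset) with an identity swizzle.
 */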
1324 template <chip CHIP>
1325 static void
1326 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1327 struct tu_cs *cs,
1328 const struct tu_image_view *iview,
1329 enum pipe_format format,
1330 enum pipe_format dst_format,
1331 uint32_t gmem_offset,
1332 uint32_t cpp)
1333 {
1334 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1335 memcpy(desc, iview->view.descriptor, sizeof(desc));
1336
1337 enum a6xx_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false, true).fmt;
1338 fixup_src_format(&format, dst_format, &fmt);
1339
1340 /* patch the format so that depth/stencil get the right format and swizzle */
1341 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1342 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1343 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1344 desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1345 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1346 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1347 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1348 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1349
1350 /* patched for gmem */
1351 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1352 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1353 desc[2] =
1354 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1355 A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1356 desc[3] = 0;
1357 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1358 desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1359 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1360 desc[i] = 0;
1361
1362 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1363 }
1364
1365 template <chip CHIP>
1366 static void
1367 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1368 enum pipe_format src_format)
1369 {
1370 uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1371
1372 enum a6xx_format fmt = (enum a6xx_format)(
1373 mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1374 enum pipe_format dst_format = iview->format;
1375 fixup_dst_format(src_format, &dst_format, &fmt);
1376 mrt_buf_info =
1377 (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1378 A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1379
1380 tu_cs_emit_regs(cs,
1381 RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
1382 A6XX_RB_MRT_PITCH(0, iview->pitch),
1383 A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
1384 A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
1385 A6XX_RB_MRT_BASE_GMEM(0),
1386 );
1387
1388 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1389 tu_cs_image_flag_ref(cs, iview, layer);
1390
1391 /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1392 * FMT6_NV12_Y.
1393 */
1394 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1395
1396 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
1397 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1398 }
1399
1400 template <chip CHIP>
1401 static void
1402 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1403 {
1404 tu_cs_emit_regs(cs,
1405 RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
1406 A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
1407 A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
1408 A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
1409 A6XX_RB_MRT_BASE_GMEM(0),
1410 );
1411
1412 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1413 tu_cs_image_flag_ref(cs, &iview->view, layer);
1414
1415 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
1416 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1417 }
1418
1419 template <chip CHIP>
1420 static void
1421 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1422 {
1423 tu_cs_emit_regs(cs,
1424 RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
1425 A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
1426 A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
1427 A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
1428 A6XX_RB_MRT_BASE_GMEM(0),
1429 );
1430
1431 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1432 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1433 }
1434
1435 template <chip CHIP>
1436 static void
1437 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1438 enum pipe_format src_format)
1439 {
1440 struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
1441
1442 enum a6xx_format color_fmt = fmt.fmt;
1443 fixup_dst_format(src_format, &format, &color_fmt);
1444
1445 tu_cs_emit_regs(cs,
1446 RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
1447 A6XX_RB_MRT_PITCH(0, pitch),
1448 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1449 A6XX_RB_MRT_BASE(0, .qword = va),
1450 A6XX_RB_MRT_BASE_GMEM(0, 0));
1451
1452 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1453 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1454 }
1455
1456 template <chip CHIP>
1457 static void
1458 r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1459 const struct tu_image_view *iview,
1460 const struct tu_render_pass_attachment *att,
1461 bool separate_stencil, unsigned layer)
1462 {
1463 unsigned RB_MRT_BUF_INFO;
1464 unsigned gmem_offset;
1465
1466 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1467 if (!separate_stencil) {
1468 RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
1469 gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1470 } else {
1471 RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
1472 gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
1473 }
1474 } else {
1475 RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
1476 gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1477 }
1478
1479 tu_cs_emit_regs(cs,
1480 RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
1481 A6XX_RB_MRT_PITCH(0, 0),
1482 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1483 A6XX_RB_MRT_BASE(0, 0),
1484 A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));
1485
1486 enum a6xx_format color_format =
1487 (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1488 tu_cs_emit_regs(cs,
1489 A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));
1490
1491 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1492 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1493 }
1494
1495 static uint8_t
1496 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1497 {
1498 uint8_t mask = 0xf;
1499 assert(aspect_mask);
1500 /* note: the only format with partial writing is D24S8,
1501 * clear/blit uses the _AS_R8G8B8A8 format to access it
1502 */
1503 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1504 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1505 mask = 0x7;
1506 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1507 mask = 0x8;
1508 }
1509 return mask;
1510 }
1511
1512 static uint8_t
1513 aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
1514 {
1515 uint8_t mask = 0xf;
1516 assert(aspect_mask);
1517 /* note: the only format with partial writing is D24S8 */
1518 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1519 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1520 mask = 0x1;
1521 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1522 mask = 0x2;
1523 if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
1524 mask = 0x3;
1525 }
1526 return mask;
1527 }
1528
1529 enum r3d_blit_param {
1530 R3D_Z_SCALE = 1 << 0,
1531 R3D_DST_GMEM = 1 << 1,
1532 R3D_COPY = 1 << 2,
1533 };
1534
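/* Common 3D-path setup: bind the internal shaders via r3d_common(), disable
 * depth/stencil, LRZ and sample counting, and program MRT0's format, sRGB and
 * write mask for the destination.
 */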
1535 template <chip CHIP>
1536 static void
1537 r3d_setup(struct tu_cmd_buffer *cmd,
1538 struct tu_cs *cs,
1539 enum pipe_format src_format,
1540 enum pipe_format dst_format,
1541 VkImageAspectFlags aspect_mask,
1542 unsigned blit_param,
1543 bool clear,
1544 bool ubwc,
1545 VkSampleCountFlagBits samples)
1546 {
1547 if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1548 tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1549 }
1550
1551 enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1552 fixup_dst_format(src_format, &dst_format, &fmt);
1553
1554 if (!cmd->state.pass) {
1555 tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1556 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1557 }
1558
1559 if (!(blit_param & R3D_DST_GMEM)) {
1560 if (CHIP == A6XX) {
1561 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1562 } else {
1563 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1564 }
1565
1566 tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1567
1568 if (CHIP >= A7XX) {
1569 tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1570 tu_cs_emit_regs(cs,
1571 A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1572 }
1573 }
1574
1575 enum r3d_type type;
1576 if (clear) {
1577 type = R3D_CLEAR;
1578 } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1579 /* Avoid canonicalizing NaNs in copies by using the special half-float
1580 * path that uses half regs.
1581 */
1582 type = R3D_COPY_HALF;
1583 } else {
1584 type = R3D_BLIT;
1585 }
1586
1587 r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1588
1589 tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1590 tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1591 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1592 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1593
1594 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1595 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1596 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1597 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1598 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1599 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1600 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1601 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1602 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1603
1604 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1605 .color_format = fmt,
1606 .color_sint = util_format_is_pure_sint(dst_format),
1607 .color_uint = util_format_is_pure_uint(dst_format)));
1608
1609 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1610 .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1611 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1612 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1613
1614 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1615 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1616
1617 if (CHIP >= A7XX) {
1618 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1619
1620 tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
1621 tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
1622 tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
1623 }
1624
1625 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1626 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1627
1628 /* Disable sample counting so as not to affect occlusion queries. */
1629 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1630
1631 tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1632 if (CHIP >= A7XX) {
1633 tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1634 }
1635
1636 if (cmd->state.prim_generated_query_running_before_rp) {
1637 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1638 }
1639
1640 if (cmd->state.predication_active) {
1641 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1642 tu_cs_emit(cs, 0);
1643 }
1644 }
1645
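/* Emit the draw for the 3D path: a two-vertex rectlist with auto-generated
 * indices. r3d_run() ignores the visibility stream; r3d_run_vis() is the same
 * draw but uses it.
 */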
1646 static void
1647 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1648 {
1649 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1650 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1651 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1652 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1653 tu_cs_emit(cs, 1); /* instance count */
1654 tu_cs_emit(cs, 2); /* vertex count */
1655 }
1656
1657 static void
1658 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1659 {
1660 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1661 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1662 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1663 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1664 tu_cs_emit(cs, 1); /* instance count */
1665 tu_cs_emit(cs, 2); /* vertex count */
1666 }
1667
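/* Undo the query- and predication-related state changes made in r3d_setup(). */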
1668 template <chip CHIP>
1669 static void
1670 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1671 {
1672 if (cmd->state.predication_active) {
1673 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1674 tu_cs_emit(cs, 1);
1675 }
1676
1677 /* Re-enable sample counting. */
1678 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1679
1680 if (cmd->state.prim_generated_query_running_before_rp) {
1681 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1682 }
1683 }
1684
1685 /* blit ops - common interface for 2d/shader paths */
1686
1687 struct blit_ops {
1688 void (*coords)(struct tu_cmd_buffer *cmd,
1689 struct tu_cs *cs,
1690 const VkOffset2D dst,
1691 const VkOffset2D src,
1692 const VkExtent2D extent);
1693 void (*clear_value)(struct tu_cmd_buffer *cmd,
1694 struct tu_cs *cs,
1695 enum pipe_format format,
1696 const VkClearValue *val);
1697 void (*src)(
1698 struct tu_cmd_buffer *cmd,
1699 struct tu_cs *cs,
1700 const struct fdl6_view *iview,
1701 uint32_t layer,
1702 VkFilter filter,
1703 enum pipe_format dst_format);
1704 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1705 enum pipe_format format,
1706 uint64_t va, uint32_t pitch,
1707 uint32_t width, uint32_t height,
1708 enum pipe_format dst_format);
1709 void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1710 enum pipe_format src_format);
1711 void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1712 void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1713 void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1714 enum pipe_format src_format);
1715 void (*setup)(struct tu_cmd_buffer *cmd,
1716 struct tu_cs *cs,
1717 enum pipe_format src_format,
1718 enum pipe_format dst_format,
1719 VkImageAspectFlags aspect_mask,
1720 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1721 bool clear,
1722 bool ubwc,
1723 VkSampleCountFlagBits samples);
1724 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1725 void (*teardown)(struct tu_cmd_buffer *cmd,
1726 struct tu_cs *cs);
1727 };
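
/* Typical usage of the ops interface, as in tu6_blit_image() and the copy
 * helpers below (a rough sketch; not every path uses every hook):
 *
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param, ...);
 *    ops->coords(cmd, cs, dst_offset, src_offset, extent);  // or clear_value()
 *    for each layer:
 *       ops->src(...) or ops->src_buffer(...)
 *       ops->dst(...) or ops->dst_buffer(...)
 *       ops->run(cmd, cs);
 *    ops->teardown(cmd, cs);
 */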
1728
1729 template <chip CHIP>
1730 static const struct blit_ops r2d_ops = {
1731 .coords = r2d_coords,
1732 .clear_value = r2d_clear_value,
1733 .src = r2d_src<CHIP>,
1734 .src_buffer = r2d_src_buffer<CHIP>,
1735 .dst = r2d_dst<CHIP>,
1736 .dst_depth = r2d_dst_depth,
1737 .dst_stencil = r2d_dst_stencil,
1738 .dst_buffer = r2d_dst_buffer,
1739 .setup = r2d_setup<CHIP>,
1740 .run = r2d_run,
1741 .teardown = r2d_teardown,
1742 };
1743
1744 template <chip CHIP>
1745 static const struct blit_ops r3d_ops = {
1746 .coords = r3d_coords,
1747 .clear_value = r3d_clear_value,
1748 .src = r3d_src,
1749 .src_buffer = r3d_src_buffer<CHIP>,
1750 .dst = r3d_dst<CHIP>,
1751 .dst_depth = r3d_dst_depth<CHIP>,
1752 .dst_stencil = r3d_dst_stencil<CHIP>,
1753 .dst_buffer = r3d_dst_buffer<CHIP>,
1754 .setup = r3d_setup<CHIP>,
1755 .run = r3d_run,
1756 .teardown = r3d_teardown<CHIP>,
1757 };
1758
1759 /* passthrough set coords from 3D extents */
1760 static void
1761 coords(const struct blit_ops *ops,
1762 struct tu_cmd_buffer *cmd,
1763 struct tu_cs *cs,
1764 const VkOffset3D dst,
1765 const VkOffset3D src,
1766 const VkExtent3D extent)
1767 {
1768 ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1769 (VkExtent2D) {extent.width, extent.height});
1770 }
1771
1772 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1773 * to be a bit careful because we have to pick a format with matching UBWC
1774  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1775 * everything.
1776 */
1777 static enum pipe_format
1778 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1779 {
1780 if (vk_format_is_compressed(vk_format)) {
1781 switch (vk_format_get_blocksize(vk_format)) {
1782 case 1: return PIPE_FORMAT_R8_UINT;
1783 case 2: return PIPE_FORMAT_R16_UINT;
1784 case 4: return PIPE_FORMAT_R32_UINT;
1785 case 8: return PIPE_FORMAT_R32G32_UINT;
1786 case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1787 default:
1788 unreachable("unhandled format size");
1789 }
1790 }
1791
1792 enum pipe_format format = vk_format_to_pipe_format(vk_format);
1793
1794 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
1795 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1796 * (also -1.0), when we're supposed to be memcpying the bits. See
1797 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1798 */
1799 format = util_format_snorm_to_unorm(format);
1800
1801 if (vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
1802 return PIPE_FORMAT_R32_UINT;
1803
1804 /* For VK_FORMAT_D32_SFLOAT_S8_UINT and YCbCr formats use our existing helpers */
1805 if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
1806 vk_format_get_ycbcr_info(vk_format))
1807 return tu_aspects_to_plane(vk_format, aspect_mask);
1808
1809 /* Otherwise, simply return the pipe_format */
1810 return format;
1811 }
1812
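/* Pack a VkClearValue into the raw RB_BLIT_CLEAR_COLOR_DW0..3 layout for the
 * given format: depth/stencil formats use their dedicated packing, sRGB colors
 * are converted to the sRGB encoding first, and other colors are packed
 * according to the per-channel bit size.
 */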
1813 static void
1814 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1815 {
1816 switch (format) {
1817 case PIPE_FORMAT_Z24X8_UNORM:
1818 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1819 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1820 val->depthStencil.stencil << 24;
1821 return;
1822 case PIPE_FORMAT_Z16_UNORM:
1823 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1824 return;
1825 case PIPE_FORMAT_Z32_FLOAT:
1826 clear_value[0] = fui(val->depthStencil.depth);
1827 return;
1828 case PIPE_FORMAT_S8_UINT:
1829 clear_value[0] = val->depthStencil.stencil;
1830 return;
1831 default:
1832 break;
1833 }
1834
1835 float tmp[4];
1836 memcpy(tmp, val->color.float32, 4 * sizeof(float));
1837 if (util_format_is_srgb(format)) {
1838 for (int i = 0; i < 3; i++)
1839 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1840 }
1841
1842 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1843 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1844 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1845 case 4:
1846 PACK_F(r4g4b4a4_unorm);
1847 break;
1848 case 5:
1849 if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1850 PACK_F(r5g6b5_unorm);
1851 else
1852 PACK_F(r5g5b5a1_unorm);
1853 break;
1854 case 8:
1855 if (util_format_is_snorm(format))
1856 PACK_F(r8g8b8a8_snorm);
1857 else if (util_format_is_unorm(format))
1858 PACK_F(r8g8b8a8_unorm);
1859 else
1860 pack_int8(clear_value, val->color.uint32);
1861 break;
1862 case 10:
1863 if (util_format_is_pure_integer(format))
1864 pack_int10_2(clear_value, val->color.uint32);
1865 else
1866 PACK_F(r10g10b10a2_unorm);
1867 break;
1868 case 11:
1869 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1870 break;
1871 case 16:
1872 if (util_format_is_snorm(format))
1873 PACK_F(r16g16b16a16_snorm);
1874 else if (util_format_is_unorm(format))
1875 PACK_F(r16g16b16a16_unorm);
1876 else if (util_format_is_float(format))
1877 PACK_F(r16g16b16a16_float);
1878 else
1879 pack_int16(clear_value, val->color.uint32);
1880 break;
1881 case 32:
1882 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1883 break;
1884 case 0:
1885 assert(format == PIPE_FORMAT_A8_UNORM);
1886 PACK_F(a8_unorm);
1887 break;
1888 default:
1889 unreachable("unexpected channel size");
1890 }
1891 #undef PACK_F
1892 }
1893
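/* Program the common RB_BLIT_* state for an event blit (the BLIT event used
 * for GMEM clears/loads/stores): MSAA sample count, plus the blit type, clear
 * mask and buffer id in RB_BLIT_INFO.
 */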
1894 static void
1895 event_blit_setup(struct tu_cs *cs,
1896 uint32_t buffer_id,
1897 const struct tu_render_pass_attachment *att,
1898 enum a6xx_blit_event_type blit_event_type,
1899 uint32_t clear_mask)
1900 {
1901 tu_cs_emit_regs(
1902 cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1903
1904 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1905 tu_cs_emit(cs, 0);
1906
1907 tu_cs_emit_regs(
1908 cs,
1909 A6XX_RB_BLIT_INFO(.type = blit_event_type,
1910 .sample_0 =
1911 vk_format_is_int(att->format) ||
1912 vk_format_is_depth_or_stencil(att->format),
1913 .depth = vk_format_is_depth_or_stencil(att->format),
1914 .clear_mask = clear_mask,
1915 .buffer_id = buffer_id));
1916 }
1917
1918 struct event_blit_dst_view {
1919 const struct tu_image *image;
1920 const struct fdl6_view *view;
1921
1922 uint32_t layer;
1923
1924 uint64_t depth_addr;
1925 uint32_t depth_pitch;
1926
1927 uint64_t stencil_addr;
1928 uint32_t stencil_pitch;
1929 };
1930
1931 static event_blit_dst_view
1932 blt_view_from_tu_view(const struct tu_image_view *iview,
1933 uint32_t layer)
1934 {
1935 struct event_blit_dst_view blt_view;
1936 blt_view.image = iview->image;
1937 blt_view.view = &iview->view;
1938 blt_view.layer = layer;
1939
1940 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1941 blt_view.depth_addr =
1942 iview->depth_base_addr + iview->depth_layer_size * layer;
1943 blt_view.depth_pitch = iview->depth_pitch;
1944
1945 blt_view.stencil_addr =
1946 iview->stencil_base_addr + iview->stencil_layer_size * layer;
1947 blt_view.stencil_pitch = iview->stencil_pitch;
1948 }
1949 return blt_view;
1950 }
1951
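/* Program the event blit destination (sysmem address, pitch and flag buffer,
 * with separate depth/stencil planes for D32_S8), plus the GMEM base when a
 * render pass attachment is given, then trigger the blit with an FD_BLIT event.
 */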
1952 template <chip CHIP>
1953 static void
1954 event_blit_run(struct tu_cmd_buffer *cmd,
1955 struct tu_cs *cs,
1956 const struct tu_render_pass_attachment *att,
1957 const event_blit_dst_view *blt_view,
1958 bool separate_stencil)
1959 {
1960 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1961 if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1962 if (!separate_stencil) {
1963 tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1964 tu_cs_emit_qw(cs, blt_view->depth_addr);
1965 tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1966
1967 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1968 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1969 } else {
1970 tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1971 ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1972 tu_cs_emit_qw(cs, blt_view->stencil_addr);
1973 tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1974 }
1975 } else {
1976 tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1977 tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1978
1979 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1980 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1981 }
1982
1983 if (att) {
1984 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1985 tu_cs_emit_regs(
1986 cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
1987 cmd, att, blt_view->layer)));
1988 } else {
1989 tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
1990 cmd, att, blt_view->layer)));
1991 }
1992 }
1993
1994 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
1995 }
1996
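/* A7XX generic clear path: clear a single layer of a GMEM attachment with a
 * BLIT_EVENT_CLEAR event blit instead of a draw.
 */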
1997 static void
1998 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
1999 struct tu_cs *cs,
2000 uint32_t buffer_id,
2001 enum pipe_format format,
2002 uint8_t clear_mask,
2003 bool separate_stencil,
2004 uint32_t layer,
2005 const VkClearValue *value,
2006 uint32_t a)
2007 {
2008 const struct tu_render_pass_attachment *att =
2009 &cmd->state.pass->attachments[a];
2010 const struct tu_image_view *iview = cmd->state.attachments[a];
2011
2012 uint32_t clear_vals[4] = {};
2013 pack_blit_event_clear_value(value, format, clear_vals);
2014
2015 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
2016 tu_cs_emit_array(cs, clear_vals, 4);
2017
2018 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
2019
2020 event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
2021 event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
2022 }
2023
2024
2025
2026 /* Buffer copies/fills/updates go through the CCU but need additional
2027  * synchronization when the write range is not aligned to 64 bytes, because
2028  * dst buffer accesses use either R8_UNORM or R32_UINT and these are not
2029  * coherent with each other in the CCU, since the format seems to be part of
2030  * the cache key.
2031 *
2032 * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
2033 *
2034  * Synchronization with writes from UCHE (e.g. SSBO stores) is solved by the
2035  * fact that UCHE has byte-level dirtiness tracking and that a CCU flush
2036  * always happens before a UCHE flush in such cases (e.g. both renderpass and
2037  * dispatch flush pending CCU writes).
2038 *
2039 * Additionally see:
2040 * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2041 */
2042 template <chip CHIP>
2043 static void
2044 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2045 uint64_t dst_va,
2046 uint64_t size,
2047 bool *unaligned_store)
2048 {
2049 if (*unaligned_store)
2050 return;
2051
2052 if ((dst_va & 63) || (size & 63)) {
2053 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2054 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2055 /* Wait for invalidations to land. */
2056 cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2057 tu_emit_cache_flush<CHIP>(cmd);
2058 *unaligned_store = true;
2059 }
2060 }
2061
2062 template <chip CHIP>
2063 static void
2064 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2065 bool unaligned_store)
2066 {
2067 if (unaligned_store) {
2068 tu_flush_for_access(&cmd->state.cache,
2069 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2070 TU_ACCESS_NONE);
2071 }
2072 }
2073
2074 template <chip CHIP>
2075 void
2076 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2077 struct tu_cs *cs,
2078 struct tu_image *image,
2079 const VkClearValue *value)
2080 {
2081 const struct blit_ops *ops = &r2d_ops<CHIP>;
2082
2083 /* It is assumed that the LRZ cache has been invalidated at this point, so
2084  * that the writes done here become visible to LRZ.
2085  *
2086  * LRZ writes go through the UCHE cache, so flush UCHE before changing LRZ
2087  * via the CCU. There is no need to invalidate the CCU since we are
2088  * presumably writing whole cache lines, which we assume to be 64 bytes.
2089 */
2090 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2091
2092 ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2093 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2094 VK_SAMPLE_COUNT_1_BIT);
2095 ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2096 ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2097 image->iova + image->lrz_offset,
2098 image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2099 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2100 (VkExtent2D) { image->lrz_pitch, image->lrz_height });
2101 ops->run(cmd, cs);
2102 ops->teardown(cmd, cs);
2103
2104 /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2105 * UCHE in the earlier GRAS stage.
2106 */
2107 cmd->state.cache.flush_bits |=
2108 TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2109 TU_CMD_FLAG_WAIT_FOR_IDLE;
2110 }
2111 TU_GENX(tu6_clear_lrz);
2112
2113 template <chip CHIP>
2114 void
2115 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2116 struct tu_cs *cs,
2117 struct tu_image *image)
2118 {
2119 const struct blit_ops *ops = &r2d_ops<CHIP>;
2120 VkClearValue clear = {};
2121 clear.color.uint32[0] = 0xffffffff;
2122
2123 using LRZFC = fd_lrzfc_layout<CHIP>;
2124 uint64_t lrz_fc_iova = image->iova + image->lrz_fc_offset;
2125 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2126 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2127 VK_SAMPLE_COUNT_1_BIT);
2128 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2129 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2130 lrz_fc_iova + offsetof(LRZFC, fc1),
2131 sizeof(LRZFC::fc1),
2132 PIPE_FORMAT_R32_UINT);
2133 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2134 sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2135 });
2136 ops->run(cmd, cs);
2137 if constexpr (LRZFC::HAS_BIDIR) {
2138 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2139 lrz_fc_iova + offsetof(LRZFC, fc2),
2140 sizeof(LRZFC::fc2),
2141 PIPE_FORMAT_R32_UINT);
2142 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2143 sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2144 });
2145 ops->run(cmd, cs);
2146 }
2147 ops->teardown(cmd, cs);
2148 }
2149 TU_GENX(tu6_dirty_lrz_fc);
2150
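/* Build an fdl6_view of a single mip level/layer for the copy and blit paths,
 * forcing the AS_R8G8B8A8 interpretation for Z24 formats and a 3D view type
 * when z-scaling.
 */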
2151 template<chip CHIP>
2152 static void
2153 tu_image_view_copy_blit(struct fdl6_view *iview,
2154 struct tu_image *image,
2155 enum pipe_format format,
2156 const VkImageSubresourceLayers *subres,
2157 uint32_t layer,
2158 bool z_scale)
2159 {
2160 VkImageAspectFlags aspect_mask = subres->aspectMask;
2161
2162 /* always use the AS_R8G8B8A8 format for these */
2163 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2164 format == PIPE_FORMAT_Z24X8_UNORM) {
2165 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2166 }
2167
2168 const struct fdl_layout *layout =
2169 &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2170
2171 const struct fdl_view_args args = {
2172 .chip = CHIP,
2173 .iova = image->iova,
2174 .base_miplevel = subres->mipLevel,
2175 .level_count = 1,
2176 .base_array_layer = subres->baseArrayLayer + layer,
2177 .layer_count = 1,
2178 .swiz = {
2179 PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2180 },
2181 .format = tu_format_for_aspect(format, aspect_mask),
2182 .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2183 };
2184 fdl6_view_init(iview, &layout, &args, false);
2185 }
2186
2187 template<chip CHIP>
2188 static void
2189 tu_image_view_copy(struct fdl6_view *iview,
2190 struct tu_image *image,
2191 enum pipe_format format,
2192 const VkImageSubresourceLayers *subres,
2193 uint32_t layer)
2194 {
2195 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2196 }
2197
2198 template<chip CHIP>
2199 static void
2200 tu_image_view_blit(struct fdl6_view *iview,
2201 struct tu_image *image,
2202 const VkImageSubresourceLayers *subres,
2203 uint32_t layer)
2204 {
2205 enum pipe_format format = tu_aspects_to_plane(image->vk.format, subres->aspectMask);
2206 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2207 }
2208
2209 template <chip CHIP>
2210 static void
2211 tu6_blit_image(struct tu_cmd_buffer *cmd,
2212 struct tu_image *src_image,
2213 struct tu_image *dst_image,
2214 const VkImageBlit2 *info,
2215 VkFilter filter)
2216 {
2217 const struct blit_ops *ops = &r2d_ops<CHIP>;
2218 struct tu_cs *cs = &cmd->cs;
2219 bool z_scale = false;
2220 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2221
2222 /* 2D blit can't do rotation mirroring from just coordinates */
2223 static const enum a6xx_rotation rotate[2][2] = {
2224 {ROTATE_0, ROTATE_HFLIP},
2225 {ROTATE_VFLIP, ROTATE_180},
2226 };
2227
2228 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2229 (info->dstOffsets[1].x < info->dstOffsets[0].x);
2230 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2231 (info->dstOffsets[1].y < info->dstOffsets[0].y);
2232
2233 int32_t src0_z = info->srcOffsets[0].z;
2234 int32_t src1_z = info->srcOffsets[1].z;
2235
2236 if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2237 info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2238 info->srcOffsets[1].z < info->srcOffsets[0].z) {
2239 z_scale = true;
2240 }
2241
2242 if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2243 layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2244 src0_z = info->srcOffsets[1].z;
2245 src1_z = info->srcOffsets[0].z;
2246 }
2247
2248 if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2249 assert(layers <= 1);
2250 layers = vk_image_subresource_layer_count(&dst_image->vk,
2251 &info->dstSubresource);
2252 }
2253
2254 /* BC1_RGB_* formats need to have their last component overridden with 1
2255 * when sampling, which is normally handled with the texture descriptor
2256 * swizzle. The 2d path can't handle that, so use the 3d path.
2257 *
2258 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2259 * the 2d path.
2260 */
2261
2262 unsigned blit_param = rotate[mirror_y][mirror_x];
2263 if (dst_image->layout[0].nr_samples > 1 ||
2264 src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2265 src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2266 filter == VK_FILTER_CUBIC_EXT ||
2267 z_scale) {
2268 ops = &r3d_ops<CHIP>;
2269 blit_param = z_scale ? R3D_Z_SCALE : 0;
2270 }
2271
2272 /* use the right format in setup() for D32_S8 */
2273 enum pipe_format src_format = tu_aspects_to_plane(
2274 src_image->vk.format, info->srcSubresource.aspectMask);
2275 enum pipe_format dst_format = tu_aspects_to_plane(
2276 dst_image->vk.format, info->dstSubresource.aspectMask);
2277 trace_start_blit(&cmd->trace, cs,
2278 ops == &r3d_ops<CHIP>,
2279 src_image->vk.format,
2280 dst_image->vk.format,
2281 layers);
2282
2283 ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2284 blit_param, false, dst_image->layout[0].ubwc,
2285 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2286
2287 if (ops == &r3d_ops<CHIP>) {
2288 const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2289 info->srcOffsets[0].x, info->srcOffsets[0].y,
2290 info->dstOffsets[1].x, info->dstOffsets[1].y,
2291 info->srcOffsets[1].x, info->srcOffsets[1].y };
2292 r3d_coords_raw(cmd, cs, coords);
2293 } else {
2294 tu_cs_emit_regs(cs,
2295 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2296 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2297 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2298 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2299 tu_cs_emit_regs(cs,
2300 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2301 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2302 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2303 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2304 }
2305
2306 struct fdl6_view dst, src;
2307 tu_image_view_blit<CHIP>(
2308 &dst, dst_image, &info->dstSubresource,
2309 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2310
2311 if (z_scale) {
2312 tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2313 &info->srcSubresource, 0, true);
2314 ops->src(cmd, cs, &src, 0, filter, dst_format);
2315 } else {
2316 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2317 }
2318
2319 for (uint32_t i = 0; i < layers; i++) {
2320 if (z_scale) {
2321 float t = ((float) i + 0.5f) / (float) layers;
2322 r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2323 } else {
2324 ops->src(cmd, cs, &src, i, filter, dst_format);
2325 }
2326 ops->dst(cs, &dst, i, src_format);
2327 ops->run(cmd, cs);
2328 }
2329
2330 ops->teardown(cmd, cs);
2331
2332 trace_end_blit(&cmd->trace, cs);
2333 }
2334
2335 template <chip CHIP>
2336 VKAPI_ATTR void VKAPI_CALL
2337 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2338 const VkBlitImageInfo2 *pBlitImageInfo)
2339
2340 {
2341 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2342 VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2343 VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2344
2345 for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2346 /* can't blit both depth and stencil at once with D32_S8
2347 * TODO: more advanced 3D blit path to support it instead?
2348 */
2349 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2350 dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2351 VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2352 u_foreach_bit(b, region.dstSubresource.aspectMask) {
2353 region.srcSubresource.aspectMask = BIT(b);
2354 region.dstSubresource.aspectMask = BIT(b);
2355 tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2356 }
2357 continue;
2358 }
2359 tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2360 pBlitImageInfo->filter);
2361 }
2362
2363 if (dst_image->lrz_height) {
2364 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2365 }
2366 }
2367 TU_GENX(tu_CmdBlitImage2);
2368
2369 static void
2370 copy_compressed(VkFormat format,
2371 VkOffset3D *offset,
2372 VkExtent3D *extent,
2373 uint32_t *width,
2374 uint32_t *height)
2375 {
2376 if (!vk_format_is_compressed(format))
2377 return;
2378
2379 uint32_t block_width = vk_format_get_blockwidth(format);
2380 uint32_t block_height = vk_format_get_blockheight(format);
2381
2382 offset->x /= block_width;
2383 offset->y /= block_height;
2384
2385 if (extent) {
2386 extent->width = DIV_ROUND_UP(extent->width, block_width);
2387 extent->height = DIV_ROUND_UP(extent->height, block_height);
2388 }
2389 if (width)
2390 *width = DIV_ROUND_UP(*width, block_width);
2391 if (height)
2392 *height = DIV_ROUND_UP(*height, block_height);
2393 }
2394
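/* Copy one VkBufferImageCopy2 region from a buffer to an image. Uses the 2D
 * blitter by default, the 3D path for Y8/half-float formats, and falls back to
 * a row-at-a-time copy when the source address or pitch isn't 64-byte aligned
 * and the hardware can't handle unaligned 2D sources.
 */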
2395 template <chip CHIP>
2396 static void
2397 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2398 struct tu_buffer *src_buffer,
2399 struct tu_image *dst_image,
2400 const VkBufferImageCopy2 *info)
2401 {
2402 struct tu_cs *cs = &cmd->cs;
2403 uint32_t layers = MAX2(info->imageExtent.depth,
2404 vk_image_subresource_layer_count(&dst_image->vk,
2405 &info->imageSubresource));
2406 enum pipe_format src_format =
2407 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2408 enum pipe_format dst_format =
2409 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2410 const struct blit_ops *ops = &r2d_ops<CHIP>;
2411
2412 /* special case for buffer to stencil */
2413 if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2414 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2415 src_format = PIPE_FORMAT_S8_UINT;
2416 }
2417
2418 /* note: could use "R8_UNORM" when no UBWC */
2419 bool has_unaligned = CHIP >= A7XX; /* Whether unaligned buffer copies are supported. */
2420 unsigned blit_param = 0;
2421 if (src_format == PIPE_FORMAT_Y8_UNORM ||
2422 tu_pipe_format_is_float16(src_format)) {
2423 ops = &r3d_ops<CHIP>;
2424 blit_param = R3D_COPY;
2425 has_unaligned = false;
2426 }
2427
2428 VkOffset3D offset = info->imageOffset;
2429 VkExtent3D extent = info->imageExtent;
2430 uint32_t src_width = info->bufferRowLength ?: extent.width;
2431 uint32_t src_height = info->bufferImageHeight ?: extent.height;
2432
2433 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2434
2435 uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2436 uint32_t layer_size = src_height * pitch;
2437
2438 ops->setup(cmd, cs, src_format, dst_format,
2439 info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2440 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2441
2442 struct fdl6_view dst;
2443 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2444 &info->imageSubresource, offset.z);
2445
2446 for (uint32_t i = 0; i < layers; i++) {
2447 ops->dst(cs, &dst, i, src_format);
2448
2449 uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2450 bool unaligned = (src_va & 63) || (pitch & 63);
2451 if (!has_unaligned && unaligned) {
2452 for (uint32_t y = 0; y < extent.height; y++) {
2453 uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2454 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2455 x + extent.width, 1, dst_format);
2456 ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y}, (VkOffset2D) {x},
2457 (VkExtent2D) {extent.width, 1});
2458 ops->run(cmd, cs);
2459 src_va += pitch;
2460 }
2461 } else {
2462 if constexpr (CHIP >= A7XX) {
2463 /* Necessary to not trigger static assertion from A6XX variant. */
2464 if (has_unaligned) {
2465 r2d_src_buffer_unaligned<CHIP>(cmd, cs, src_format, src_va,
2466 pitch, extent.width,
2467 extent.height, dst_format);
2468 } else {
2469 ops->src_buffer(cmd, cs, src_format, src_va, pitch,
2470 extent.width, extent.height, dst_format);
2471 }
2472 } else {
2473 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width,
2474 extent.height, dst_format);
2475 }
2476 coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2477 ops->run(cmd, cs);
2478 }
2479 }
2480
2481 ops->teardown(cmd, cs);
2482 }
2483
2484 template <chip CHIP>
2485 VKAPI_ATTR void VKAPI_CALL
2486 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2487 const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2488 {
2489 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2490 VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2491 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2492
2493 for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2494 tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2495 pCopyBufferToImageInfo->pRegions + i);
2496
2497 if (dst_image->lrz_height) {
2498 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2499 }
2500 }
2501 TU_GENX(tu_CmdCopyBufferToImage2);
2502
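/* Host image copy (VK_EXT_host_image_copy): copy from host memory into an
 * image on the CPU, either memcpy'ing whole layers, copying row by row for
 * linear layouts, or swizzling into the tiled layout.
 */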
2503 static void
2504 tu_copy_memory_to_image(struct tu_device *device,
2505 struct tu_image *dst_image,
2506 const VkMemoryToImageCopyEXT *info,
2507 bool copy_memcpy)
2508 {
2509 unsigned plane = tu6_plane_index(dst_image->vk.format,
2510 info->imageSubresource.aspectMask);
2511 const struct fdl_layout *layout = &dst_image->layout[plane];
2512
2513 VkOffset3D offset = info->imageOffset;
2514 VkExtent3D extent = info->imageExtent;
2515 uint32_t src_width = info->memoryRowLength ?: extent.width;
2516 uint32_t src_height = info->memoryImageHeight ?: extent.height;
2517
2518 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2519
2520 uint32_t src_pitch = src_width * layout->cpp;
2521
2522 unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2523 offset.z : info->imageSubresource.baseArrayLayer;
2524 uint32_t layers = MAX2(extent.depth,
2525 vk_image_subresource_layer_count(&dst_image->vk,
2526 &info->imageSubresource));
2527
2528 uint32_t image_offset =
2529 fdl_surface_offset(layout,
2530 info->imageSubresource.mipLevel,
2531 start_layer);
2532
2533 uint32_t dst_layer_stride =
2534 fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2535 uint32_t dst_layer_size =
2536 layout->slices[info->imageSubresource.mipLevel].size0;
2537 uint32_t src_layer_stride =
2538 copy_memcpy ? dst_layer_size :
2539 (src_width * src_height * layout->cpp);
2540 bool tiled =
2541 fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2542
2543 const char *src = (const char *) info->pHostPointer;
2544 char *dst = (char *) dst_image->map + image_offset;
2545 for (unsigned layer = 0; layer < layers; layer++,
2546 src += src_layer_stride, dst += dst_layer_stride) {
2547 if (copy_memcpy) {
2548 memcpy(dst, src, src_layer_stride);
2549 } else if (!tiled) {
2550 uint32_t dst_pitch = fdl_pitch(layout,
2551 info->imageSubresource.mipLevel);
2552 for (unsigned y = 0; y < extent.height; y++) {
2553 memcpy(dst + dst_pitch * (y + offset.y) + offset.x * layout->cpp,
2554 src + src_pitch * y,
2555 extent.width * layout->cpp);
2556 }
2557 } else {
2558 fdl6_memcpy_linear_to_tiled(offset.x, offset.y,
2559 extent.width, extent.height,
2560 dst, src, layout,
2561 info->imageSubresource.mipLevel,
2562 src_pitch,
2563 &device->physical_device->ubwc_config);
2564 }
2565
2566 if (dst_image->bo->cached_non_coherent) {
2567 tu_bo_sync_cache(device, dst_image->bo,
2568 dst_image->bo_offset + image_offset,
2569 dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
2570 }
2571 }
2572 }
2573
2574 VKAPI_ATTR VkResult VKAPI_CALL
2575 tu_CopyMemoryToImageEXT(VkDevice _device,
2576 const VkCopyMemoryToImageInfoEXT *info)
2577 {
2578 VK_FROM_HANDLE(tu_device, device, _device);
2579 VK_FROM_HANDLE(tu_image, dst_image, info->dstImage);
2580
2581 for (unsigned i = 0; i < info->regionCount; i++) {
2582 tu_copy_memory_to_image(device, dst_image, &info->pRegions[i],
2583 info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2584 }
2585
2586 if (dst_image->lrz_height) {
2587 TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
2588 }
2589
2590 return VK_SUCCESS;
2591 }
2592
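/* Copy one VkBufferImageCopy2 region from an image to a buffer, mirroring
 * tu_copy_buffer_to_image(), including the row-at-a-time fallback for
 * destinations that aren't 64-byte aligned and the extra CCU synchronization
 * for unaligned stores.
 */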
2593 template <chip CHIP>
2594 static void
2595 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2596 struct tu_image *src_image,
2597 struct tu_buffer *dst_buffer,
2598 const VkBufferImageCopy2 *info,
2599 bool *unaligned_store)
2600 {
2601 struct tu_cs *cs = &cmd->cs;
2602 uint32_t layers = MAX2(info->imageExtent.depth,
2603 vk_image_subresource_layer_count(&src_image->vk,
2604 &info->imageSubresource));
2605 enum pipe_format dst_format =
2606 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2607 enum pipe_format src_format =
2608 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2609 const struct blit_ops *ops = &r2d_ops<CHIP>;
2610
2611 if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2612 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2613 dst_format = PIPE_FORMAT_S8_UINT;
2614 }
2615
2616 /* note: could use "R8_UNORM" when no UBWC */
2617 unsigned blit_param = 0;
2618 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2619 tu_pipe_format_is_float16(src_format)) {
2620 ops = &r3d_ops<CHIP>;
2621 blit_param = R3D_COPY;
2622 }
2623
2624 VkOffset3D offset = info->imageOffset;
2625 VkExtent3D extent = info->imageExtent;
2626 uint32_t dst_width = info->bufferRowLength ?: extent.width;
2627 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2628
2629 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2630
2631 uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2632 uint32_t layer_size = pitch * dst_height;
2633
2634 handle_buffer_unaligned_store<CHIP>(cmd,
2635 dst_buffer->iova + info->bufferOffset,
2636 layer_size * layers, unaligned_store);
2637
2638 ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2639 VK_SAMPLE_COUNT_1_BIT);
2640
2641 struct fdl6_view src;
2642 tu_image_view_copy<CHIP>(&src, src_image, src_format,
2643 &info->imageSubresource, offset.z);
2644
2645 for (uint32_t i = 0; i < layers; i++) {
2646 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2647
2648 uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
2649 if ((dst_va & 63) || (pitch & 63)) {
2650 for (uint32_t y = 0; y < extent.height; y++) {
2651 uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2652 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2653 ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2654 (VkExtent2D) {extent.width, 1});
2655 ops->run(cmd, cs);
2656 dst_va += pitch;
2657 }
2658 } else {
2659 ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2660 coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2661 ops->run(cmd, cs);
2662 }
2663 }
2664
2665 ops->teardown(cmd, cs);
2666 }
2667
2668 template <chip CHIP>
2669 VKAPI_ATTR void VKAPI_CALL
2670 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2671 const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2672 {
2673 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2674 VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2675 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2676
2677 bool unaligned_store = false;
2678 for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2679 tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2680 pCopyImageToBufferInfo->pRegions + i,
2681 &unaligned_store);
2682
2683 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2684 }
2685 TU_GENX(tu_CmdCopyImageToBuffer2);
2686
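/* Host image copy: the image-to-memory counterpart of
 * tu_copy_memory_to_image(), reading through the CPU mapping and un-tiling as
 * needed.
 */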
2687 static void
2688 tu_copy_image_to_memory(struct tu_device *device,
2689 struct tu_image *src_image,
2690 const VkImageToMemoryCopyEXT *info,
2691 bool copy_memcpy)
2692 {
2693 unsigned plane = tu6_plane_index(src_image->vk.format,
2694 info->imageSubresource.aspectMask);
2695 const struct fdl_layout *layout = &src_image->layout[plane];
2696
2697 VkOffset3D offset = info->imageOffset;
2698 VkExtent3D extent = info->imageExtent;
2699 uint32_t dst_width = info->memoryRowLength ?: extent.width;
2700 uint32_t dst_height = info->memoryImageHeight ?: extent.height;
2701
2702 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2703
2704 uint32_t dst_pitch = dst_width * layout->cpp;
2705
2706 unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
2707 offset.z : info->imageSubresource.baseArrayLayer;
2708 uint32_t layers = MAX2(extent.depth,
2709 vk_image_subresource_layer_count(&src_image->vk,
2710 &info->imageSubresource));
2711
2712 uint32_t image_offset =
2713 fdl_surface_offset(layout,
2714 info->imageSubresource.mipLevel,
2715 start_layer);
2716
2717 uint32_t src_layer_stride =
2718 fdl_layer_stride(layout, info->imageSubresource.mipLevel);
2719 uint32_t src_layer_size =
2720 layout->slices[info->imageSubresource.mipLevel].size0;
2721 uint32_t dst_layer_stride =
2722 copy_memcpy ? src_layer_size : (dst_width * dst_height * layout->cpp);
2723 bool tiled =
2724 fdl_tile_mode(layout, info->imageSubresource.mipLevel) != 0;
2725
2726 const char *src = (const char *) src_image->map + image_offset;
2727 char *dst = (char *) info->pHostPointer;
2728 for (unsigned layer = 0; layer < layers; layer++,
2729 src += src_layer_stride, dst += dst_layer_stride) {
2730 if (src_image->bo->cached_non_coherent) {
2731 tu_bo_sync_cache(device, src_image->bo,
2732 src_image->bo_offset + image_offset,
2733 src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
2734 }
2735
2736 if (copy_memcpy) {
2737 memcpy(dst, src, dst_layer_stride);
2738 } else if (!tiled) {
2739 uint32_t src_pitch = fdl_pitch(layout,
2740 info->imageSubresource.mipLevel);
2741 for (unsigned y = 0; y < extent.height; y++) {
2742 memcpy(dst + dst_pitch * y,
2743 src + src_pitch * (y + offset.y) + offset.x * layout->cpp,
2744 extent.width * layout->cpp);
2745 }
2746 } else {
2747 fdl6_memcpy_tiled_to_linear(offset.x, offset.y,
2748 extent.width, extent.height,
2749 dst, src, layout,
2750 info->imageSubresource.mipLevel,
2751 dst_pitch,
2752 &device->physical_device->ubwc_config);
2753 }
2754 }
2755 }
2756
2757 VKAPI_ATTR VkResult VKAPI_CALL
2758 tu_CopyImageToMemoryEXT(VkDevice _device,
2759 const VkCopyImageToMemoryInfoEXT *info)
2760 {
2761 VK_FROM_HANDLE(tu_device, device, _device);
2762 VK_FROM_HANDLE(tu_image, image, info->srcImage);
2763
2764 for (unsigned i = 0; i < info->regionCount; i++) {
2765 tu_copy_image_to_memory(device, image, &info->pRegions[i],
2766 info->flags & VK_HOST_IMAGE_COPY_MEMCPY_EXT);
2767 }
2768
2769 return VK_SUCCESS;
2770 }
2771
2772
2773 /* Tiled formats don't support swapping, which means that we can't support
2774 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2775 * formats like B5G5R5A1 have a separate linear-only format when sampling.
2776 * Currently we fake support for tiled swapped formats and use the unswapped
2777 * format instead, but this means that reinterpreting copies to and from
2778 * swapped formats can't be performed correctly unless we can swizzle the
2779 * components by reinterpreting the other image as the "correct" swapped
2780 * format, i.e. only when the other image is linear.
2781 */
2782
2783 template <chip CHIP>
2784 static bool
2785 is_swapped_format(enum pipe_format format, bool is_mutable)
2786 {
2787 struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, is_mutable, false);
2788 struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, is_mutable, false);
2789 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2790 }
2791
2792 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2793 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2794 * versa). This should mirror the logic in fdl6_layout.
2795 */
2796 static bool
2797 image_is_r8g8(struct tu_image *image)
2798 {
2799 return image->layout[0].cpp == 2 &&
2800 vk_format_get_nr_components(image->vk.format) == 2;
2801 }
2802
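/* Copy one VkImageCopy2 region. The copy is done as a blit using a single
 * format that both images can be reinterpreted as; when no such format exists
 * (r8g8 vs. non-r8g8 tiling, swapped formats, or UBWC on both sides), the data
 * is bounced through a linear staging buffer with two blits.
 */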
2803 template <chip CHIP>
2804 static void
2805 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2806 struct tu_image *src_image,
2807 struct tu_image *dst_image,
2808 const VkImageCopy2 *info)
2809 {
2810 const struct blit_ops *ops = &r2d_ops<CHIP>;
2811 struct tu_cs *cs = &cmd->cs;
2812
2813 if (dst_image->layout[0].nr_samples > 1)
2814 ops = &r3d_ops<CHIP>;
2815
2816 enum pipe_format format = PIPE_FORMAT_NONE;
2817 VkOffset3D src_offset = info->srcOffset;
2818 VkOffset3D dst_offset = info->dstOffset;
2819 VkExtent3D extent = info->extent;
2820 uint32_t layers_to_copy = MAX2(info->extent.depth,
2821 vk_image_subresource_layer_count(&src_image->vk,
2822 &info->srcSubresource));
2823
2824 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2825 * Images":
2826 *
2827 * When copying between compressed and uncompressed formats the extent
2828 * members represent the texel dimensions of the source image and not
2829 * the destination. When copying from a compressed image to an
2830 * uncompressed image the image texel dimensions written to the
2831 * uncompressed image will be source extent divided by the compressed
2832 * texel block dimensions. When copying from an uncompressed image to a
2833 * compressed image the image texel dimensions written to the compressed
2834 * image will be the source extent multiplied by the compressed texel
2835 * block dimensions.
2836 *
2837 * This means we only have to adjust the extent if the source image is
2838 * compressed.
2839 */
2840 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2841 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2842
2843 enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2844 enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2845
2846 /* note: could use "R8_UNORM" when no UBWC */
2847 unsigned blit_param = 0;
2848 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2849 src_format == PIPE_FORMAT_Y8_UNORM ||
2850 tu_pipe_format_is_float16(src_format) ||
2851 tu_pipe_format_is_float16(dst_format)) {
2852 ops = &r3d_ops<CHIP>;
2853 blit_param = R3D_COPY;
2854 }
2855
2856 bool use_staging_blit = false;
2857
2858 if (src_format == dst_format) {
2859 /* Images that share a format can always be copied directly because it's
2860 * the same as a blit.
2861 */
2862 format = src_format;
2863 } else if (!src_image->layout[0].tile_mode) {
2864 /* If an image is linear, we can always safely reinterpret it with the
2865 * other image's format and then do a regular blit.
2866 */
2867 format = dst_format;
2868 } else if (!dst_image->layout[0].tile_mode) {
2869 format = src_format;
2870 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2871 /* We can't currently copy r8g8 images to/from other cpp=2 images,
2872 * due to the different tile layout.
2873 */
2874 use_staging_blit = true;
2875 } else if (is_swapped_format<CHIP>(src_format,
2876 src_image->layout[0].is_mutable) ||
2877 is_swapped_format<CHIP>(dst_format,
2878 src_image->layout[0].is_mutable)) {
2879 /* If either format has a non-identity swap, then we can't copy
2880 * to/from it.
2881 */
2882 use_staging_blit = true;
2883 } else if (!src_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2884 format = dst_format;
2885 } else if (!dst_image->layout[0].ubwc || src_image->layout[0].is_mutable) {
2886 format = src_format;
2887 } else {
2888 /* Both formats use UBWC and so neither can be reinterpreted.
2889 * TODO: We could do an in-place decompression of the dst instead.
2890 */
2891 perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2892 use_staging_blit = true;
2893 }
2894
2895 struct fdl6_view dst, src;
2896
2897 if (use_staging_blit) {
2898 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2899 tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2900
2901 struct fdl_layout staging_layout = { 0 };
2902 VkOffset3D staging_offset = { 0 };
2903
2904 staging_layout.tile_mode = TILE6_LINEAR;
2905 staging_layout.ubwc = false;
2906
2907 uint32_t layer_count =
2908 vk_image_subresource_layer_count(&src_image->vk,
2909 &info->srcSubresource);
2910 fdl6_layout(&staging_layout,
2911 &cmd->device->physical_device->dev_info,
2912 src_format,
2913 src_image->layout[0].nr_samples,
2914 extent.width,
2915 extent.height,
2916 extent.depth,
2917 1,
2918 layer_count,
2919 extent.depth > 1,
2920 false,
2921 NULL);
2922
2923 struct tu_bo *staging_bo;
2924 VkResult result = tu_get_scratch_bo(cmd->device,
2925 staging_layout.size,
2926 &staging_bo);
2927 if (result != VK_SUCCESS) {
2928 vk_command_buffer_set_error(&cmd->vk, result);
2929 return;
2930 }
2931
2932 struct fdl6_view staging;
2933 const struct fdl_layout *staging_layout_ptr = &staging_layout;
2934 const struct fdl_view_args copy_to_args = {
2935 .chip = CHIP,
2936 .iova = staging_bo->iova,
2937 .base_miplevel = 0,
2938 .level_count = 1,
2939 .base_array_layer = 0,
2940 .layer_count = layer_count,
2941 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2942 .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2943 .type = FDL_VIEW_TYPE_2D,
2944 };
2945 fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2946
2947 ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2948 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2949 coords(ops, cmd, cs, staging_offset, src_offset, extent);
2950
2951 for (uint32_t i = 0; i < layers_to_copy; i++) {
2952 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2953 ops->dst(cs, &staging, i, src_format);
2954 ops->run(cmd, cs);
2955 }
2956
2957 /* When executed by the user there has to be a pipeline barrier here,
2958 * but since we're doing it manually we'll have to flush ourselves.
2959 */
2960 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2961 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2962 tu_cs_emit_wfi(cs);
2963
2964 const struct fdl_view_args copy_from_args = {
2965 .chip = CHIP,
2966 .iova = staging_bo->iova,
2967 .base_miplevel = 0,
2968 .level_count = 1,
2969 .base_array_layer = 0,
2970 .layer_count = layer_count,
2971 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2972 .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2973 .type = FDL_VIEW_TYPE_2D,
2974 };
2975 fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2976
2977 ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2978 blit_param, false, dst_image->layout[0].ubwc,
2979 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2980 coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2981
2982 for (uint32_t i = 0; i < layers_to_copy; i++) {
2983 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2984 ops->dst(cs, &dst, i, dst_format);
2985 ops->run(cmd, cs);
2986 }
2987 } else {
2988 tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2989 tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
2990
2991 ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2992 blit_param, false, dst_image->layout[0].ubwc,
2993 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2994 coords(ops, cmd, cs, dst_offset, src_offset, extent);
2995
2996 for (uint32_t i = 0; i < layers_to_copy; i++) {
2997 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2998 ops->dst(cs, &dst, i, format);
2999 ops->run(cmd, cs);
3000 }
3001 }
3002
3003 ops->teardown(cmd, cs);
3004 }
3005
3006 template <chip CHIP>
3007 VKAPI_ATTR void VKAPI_CALL
3008 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
3009 const VkCopyImageInfo2 *pCopyImageInfo)
3010 {
3011 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3012 VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
3013 VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
3014
3015 for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
3016 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3017 VkImageCopy2 info = pCopyImageInfo->pRegions[i];
3018 u_foreach_bit(b, info.dstSubresource.aspectMask) {
3019 info.srcSubresource.aspectMask = BIT(b);
3020 info.dstSubresource.aspectMask = BIT(b);
3021 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
3022 }
3023 continue;
3024 }
3025
3026 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
3027 pCopyImageInfo->pRegions + i);
3028 }
3029
3030 if (dst_image->lrz_height) {
3031 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
3032 }
3033 }
3034 TU_GENX(tu_CmdCopyImage2);
3035
3036 static void
3037 tu_copy_image_to_image_cpu(struct tu_device *device,
3038 struct tu_image *src_image,
3039 struct tu_image *dst_image,
3040 const VkImageCopy2 *info,
3041 bool copy_memcpy)
3042 {
3043 unsigned src_plane = tu6_plane_index(src_image->vk.format,
3044 info->srcSubresource.aspectMask);
3045 unsigned dst_plane = tu6_plane_index(dst_image->vk.format,
3046 info->dstSubresource.aspectMask);
3047
3048 const struct fdl_layout *src_layout = &src_image->layout[src_plane];
3049 const struct fdl_layout *dst_layout = &dst_image->layout[dst_plane];
3050
3051 VkOffset3D src_offset = info->srcOffset;
3052 VkOffset3D dst_offset = info->dstOffset;
3053 VkExtent3D extent = info->extent;
3054 uint32_t layers_to_copy = MAX2(info->extent.depth,
3055 vk_image_subresource_layer_count(&src_image->vk,
3056 &info->srcSubresource));
3057
3058 /* See comment above. */
3059 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
3060 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
3061
3062 unsigned src_start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3063 src_offset.z : info->srcSubresource.baseArrayLayer;
3064 unsigned dst_start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D) ?
3065 dst_offset.z : info->dstSubresource.baseArrayLayer;
3066
3067 uint32_t src_layer_stride =
3068 fdl_layer_stride(src_layout, info->srcSubresource.mipLevel);
3069 uint32_t src_layer_size =
3070 src_layout->slices[info->srcSubresource.mipLevel].size0;
3071 uint32_t dst_layer_stride =
3072 fdl_layer_stride(dst_layout, info->dstSubresource.mipLevel);
3073 uint32_t dst_layer_size =
3074 dst_layout->slices[info->dstSubresource.mipLevel].size0;
3075
3076 uint32_t src_image_offset =
3077 fdl_surface_offset(src_layout,
3078 info->srcSubresource.mipLevel,
3079 src_start_layer);
3080 uint32_t dst_image_offset =
3081 fdl_surface_offset(dst_layout,
3082 info->dstSubresource.mipLevel,
3083 dst_start_layer);
3084
3085 bool src_tiled =
3086 fdl_tile_mode(src_layout, info->srcSubresource.mipLevel) != 0;
3087 bool dst_tiled =
3088 fdl_tile_mode(dst_layout, info->dstSubresource.mipLevel) != 0;
3089
3090 const char *src = (const char *) src_image->map + src_image_offset;
3091 char *dst = (char *) dst_image->map + dst_image_offset;
3092 for (unsigned layer = 0; layer < layers_to_copy; layer++,
3093 src += src_layer_stride, dst += dst_layer_stride) {
3094 if (src_image->bo->cached_non_coherent) {
3095 tu_bo_sync_cache(device, src_image->bo,
3096 src_image->bo_offset + src_image_offset,
3097 src_layer_size, TU_MEM_SYNC_CACHE_FROM_GPU);
3098 }
3099
3100 uint32_t src_pitch = fdl_pitch(src_layout,
3101 info->srcSubresource.mipLevel);
3102 uint32_t dst_pitch = fdl_pitch(dst_layout,
3103 info->dstSubresource.mipLevel);
3104
3105 if (copy_memcpy) {
3106 assert(src_layer_size == dst_layer_size);
3107 memcpy(dst, src, src_layer_size);
3108 } else if (!src_tiled && !dst_tiled) {
3109 for (unsigned y = 0; y < extent.height; y++) {
3110 memcpy(dst + dst_pitch * (y + dst_offset.y) + dst_offset.x * dst_layout->cpp,
3111 src + src_pitch * (y + src_offset.y) + src_offset.x * src_layout->cpp,
3112 extent.width * src_layout->cpp);
3113 }
3114 } else if (!src_tiled) {
3115 fdl6_memcpy_linear_to_tiled(dst_offset.x, dst_offset.y,
3116 extent.width, extent.height,
3117 dst,
3118 src + src_pitch * src_offset.y + src_offset.x * src_layout->cpp,
3119 dst_layout,
3120 info->dstSubresource.mipLevel,
3121 src_pitch,
3122 &device->physical_device->ubwc_config);
3123 } else if (!dst_tiled) {
3124 fdl6_memcpy_tiled_to_linear(src_offset.x, src_offset.y,
3125 extent.width, extent.height,
3126 dst + dst_pitch * dst_offset.y + dst_offset.x * dst_layout->cpp,
3127 src,
3128 src_layout,
3129 info->srcSubresource.mipLevel,
3130 dst_pitch,
3131 &device->physical_device->ubwc_config);
3132 } else {
3133 /* Work tile-by-tile, holding the unswizzled tile in a temporary
3134 * buffer.
3135 */
3136 char temp_tile[256];
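/* Note: 256 bytes appears to cover one full tile for the supported UBWC
 * block sizes (block_width * block_height * cpp works out to 256).
 */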
3137
3138 uint32_t block_width, block_height;
3139 fdl6_get_ubwc_blockwidth(src_layout, &block_width, &block_height);
3140
3141 uint32_t temp_pitch = block_width * src_layout->cpp;
3142
3143 for (unsigned by = src_offset.y / block_height;
3144 by * block_height < src_offset.y + extent.height; by++) {
3145 uint32_t src_y_start = MAX2(src_offset.y, by * block_height);
3146 uint32_t dst_y_start = src_y_start - src_offset.y + dst_offset.y;
3147 uint32_t height =
3148 MIN2((by + 1) * block_height, src_offset.y + extent.height) -
3149 src_y_start;
3150 for (unsigned bx = src_offset.x / block_width;
3151 bx * block_width < src_offset.x + extent.width; bx++) {
3152 uint32_t src_x_start = MAX2(src_offset.x, bx * block_width);
3153 uint32_t dst_x_start = src_x_start - src_offset.x + dst_offset.x;
3154 uint32_t width =
3155 MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
3156 src_x_start;
3157
3158 fdl6_memcpy_tiled_to_linear(src_x_start, src_y_start,
3159 width, height,
3160 temp_tile, src, src_layout,
3161 info->srcSubresource.mipLevel,
3162 temp_pitch,
3163 &device->physical_device->ubwc_config);
3164 fdl6_memcpy_linear_to_tiled(dst_x_start, dst_y_start,
3165 width, height,
3166 dst, temp_tile, dst_layout,
3167 info->dstSubresource.mipLevel,
3168 temp_pitch,
3169 &device->physical_device->ubwc_config);
3170 }
3171 }
3172 }
3173
3174 if (dst_image->bo->cached_non_coherent) {
3175 tu_bo_sync_cache(device, dst_image->bo,
3176 dst_image->bo_offset + dst_image_offset,
3177 dst_layer_size, TU_MEM_SYNC_CACHE_TO_GPU);
3178 }
3179 }
3180 }
3181
3182 VKAPI_ATTR VkResult VKAPI_CALL
3183 tu_CopyImageToImageEXT(VkDevice _device,
3184 const VkCopyImageToImageInfoEXT *pCopyImageToImageInfo)
3185 {
3186 VK_FROM_HANDLE(tu_device, device, _device);
3187 VK_FROM_HANDLE(tu_image, src_image, pCopyImageToImageInfo->srcImage);
3188 VK_FROM_HANDLE(tu_image, dst_image, pCopyImageToImageInfo->dstImage);
3189 bool copy_memcpy = pCopyImageToImageInfo->flags &
3190 VK_HOST_IMAGE_COPY_MEMCPY_EXT;
3191
3192 for (uint32_t i = 0; i < pCopyImageToImageInfo->regionCount; ++i) {
3193 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3194 VkImageCopy2 info = pCopyImageToImageInfo->pRegions[i];
3195 u_foreach_bit(b, info.dstSubresource.aspectMask) {
3196 info.srcSubresource.aspectMask = BIT(b);
3197 info.dstSubresource.aspectMask = BIT(b);
3198 tu_copy_image_to_image_cpu(device, src_image, dst_image, &info,
3199 copy_memcpy);
3200 }
3201 continue;
3202 }
3203
3204 tu_copy_image_to_image_cpu(device, src_image, dst_image,
3205 pCopyImageToImageInfo->pRegions + i,
3206 copy_memcpy);
3207 }
3208
3209 if (dst_image->lrz_height) {
3210 TU_CALLX(device, tu_disable_lrz_cpu)(device, dst_image);
3211 }
3212
3213 return VK_SUCCESS;
3214 }
3215
3216 VKAPI_ATTR VkResult VKAPI_CALL
3217 tu_TransitionImageLayoutEXT(VkDevice device,
3218 uint32_t transitionCount,
3219 const VkHostImageLayoutTransitionInfoEXT *transitions)
3220 {
3221 /* We don't do anything with layouts so this should be a no-op */
3222 return VK_SUCCESS;
3223 }
3224
3225 template <chip CHIP>
3226 static void
3227 copy_buffer(struct tu_cmd_buffer *cmd,
3228 uint64_t dst_va,
3229 uint64_t src_va,
3230 uint64_t size,
3231 uint32_t block_size,
3232 bool *unaligned_store)
3233 {
3234 const struct blit_ops *ops = &r2d_ops<CHIP>;
3235 struct tu_cs *cs = &cmd->cs;
3236 enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
3237 uint64_t blocks = size / block_size;
3238
3239 handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
3240
3241 ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
3242 VK_SAMPLE_COUNT_1_BIT);
3243
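/* The blit source/destination addresses are programmed 64-byte aligned
 * (va & ~63) with the remainder folded into the x coordinate, and each
 * pass is capped at 0x4000 texels of width (presumably the 2D engine's
 * maximum), so large copies are split into multiple blits.
 */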
3244 while (blocks) {
3245 uint32_t src_x = (src_va & 63) / block_size;
3246 uint32_t dst_x = (dst_va & 63) / block_size;
3247 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
3248
3249 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
3250 ops->dst_buffer( cs, format, dst_va & ~63, 0, format);
3251 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
3252 ops->run(cmd, cs);
3253
3254 src_va += width * block_size;
3255 dst_va += width * block_size;
3256 blocks -= width;
3257 }
3258
3259 ops->teardown(cmd, cs);
3260 }
3261
3262 template <chip CHIP>
3263 VKAPI_ATTR void VKAPI_CALL
3264 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
3265 const VkCopyBufferInfo2 *pCopyBufferInfo)
3266 {
3267 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3268 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
3269 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
3270
3271 bool unaligned_store = false;
3272 for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
3273 const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
3274 copy_buffer<CHIP>(cmd,
3275 dst_buffer->iova + region->dstOffset,
3276 src_buffer->iova + region->srcOffset,
3277 region->size, 1, &unaligned_store);
3278 }
3279
3280 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3281 }
3282 TU_GENX(tu_CmdCopyBuffer2);
3283
3284 template <chip CHIP>
3285 VKAPI_ATTR void VKAPI_CALL
3286 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
3287 VkBuffer dstBuffer,
3288 VkDeviceSize dstOffset,
3289 VkDeviceSize dataSize,
3290 const void *pData)
3291 {
3292 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3293 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3294
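/* Stage the data in the command stream's sub-allocator (rounded up to whole
 * 64-byte chunks of 16 dwords), then blit it into the destination buffer
 * using the 4-byte block path of copy_buffer().
 */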
3295 struct tu_cs_memory tmp;
3296 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
3297 if (result != VK_SUCCESS) {
3298 vk_command_buffer_set_error(&cmd->vk, result);
3299 return;
3300 }
3301
3302 bool unaligned_store = false;
3303 memcpy(tmp.map, pData, dataSize);
3304 copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
3305
3306 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3307 }
3308 TU_GENX(tu_CmdUpdateBuffer);
3309
3310 template <chip CHIP>
3311 VKAPI_ATTR void VKAPI_CALL
3312 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
3313 VkBuffer dstBuffer,
3314 VkDeviceSize dstOffset,
3315 VkDeviceSize fillSize,
3316 uint32_t data)
3317 {
3318 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3319 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
3320 const struct blit_ops *ops = &r2d_ops<CHIP>;
3321 struct tu_cs *cs = &cmd->cs;
3322
3323 fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
3324
3325 uint64_t dst_va = buffer->iova + dstOffset;
3326 uint32_t blocks = fillSize / 4;
3327
3328 bool unaligned_store = false;
3329 handle_buffer_unaligned_store<CHIP>(cmd, dst_va, fillSize, &unaligned_store);
3330
3331 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
3332 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
3333 VK_SAMPLE_COUNT_1_BIT);
3334
3335 VkClearValue clear_val = {};
3336 clear_val.color.uint32[0] = data;
3337 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
3338
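/* Same 64-byte alignment and per-pass width limit as copy_buffer() above. */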
3339 while (blocks) {
3340 uint32_t dst_x = (dst_va & 63) / 4;
3341 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
3342
3343 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
3344 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
3345 ops->run(cmd, cs);
3346
3347 dst_va += width * 4;
3348 blocks -= width;
3349 }
3350
3351 ops->teardown(cmd, cs);
3352
3353 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
3354 }
3355 TU_GENX(tu_CmdFillBuffer);
3356
3357 template <chip CHIP>
3358 VKAPI_ATTR void VKAPI_CALL
3359 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
3360 const VkResolveImageInfo2 *pResolveImageInfo)
3361 {
3362 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3363 VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
3364 VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
3365 const struct blit_ops *ops = &r2d_ops<CHIP>;
3366 struct tu_cs *cs = &cmd->cs;
3367
3368 enum pipe_format src_format =
3369 vk_format_to_pipe_format(src_image->vk.format);
3370 enum pipe_format dst_format =
3371 vk_format_to_pipe_format(dst_image->vk.format);
3372 ops->setup(cmd, cs, src_format, dst_format,
3373 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
3374 VK_SAMPLE_COUNT_1_BIT);
3375
3376 for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
3377 const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
3378 uint32_t layers = MAX2(info->extent.depth,
3379 vk_image_subresource_layer_count(&dst_image->vk,
3380 &info->dstSubresource));
3381
3382 /* TODO: aspect masks possible? */
3383
3384 coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
3385
3386 struct fdl6_view dst, src;
3387 tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
3388 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
3389
3390 for (uint32_t i = 0; i < layers; i++) {
3391 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
3392 ops->dst(cs, &dst, i, src_format);
3393 ops->run(cmd, cs);
3394 }
3395 }
3396
3397 ops->teardown(cmd, cs);
3398 }
3399 TU_GENX(tu_CmdResolveImage2);
3400
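/* Iterate the layers touched by a clear/resolve: with a multiview layer_mask,
 * visit only the layers whose bit is set; otherwise visit layers [0, layers).
 */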
3401 #define for_each_layer(layer, layer_mask, layers) \
3402 for (uint32_t layer = 0; \
3403 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
3404 layer++) \
3405 if (!layer_mask || (layer_mask & BIT(layer)))
3406
3407 template <chip CHIP>
3408 static void
3409 resolve_sysmem(struct tu_cmd_buffer *cmd,
3410 struct tu_cs *cs,
3411 VkFormat vk_src_format,
3412 VkFormat vk_dst_format,
3413 const struct tu_image_view *src,
3414 const struct tu_image_view *dst,
3415 uint32_t layer_mask,
3416 uint32_t layers,
3417 const VkRect2D *rect,
3418 bool src_separate_ds,
3419 bool dst_separate_ds)
3420 {
3421 const struct blit_ops *ops = &r2d_ops<CHIP>;
3422
3423 trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3424
3425 enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3426 enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3427
3428 ops->setup(cmd, cs, src_format, dst_format,
3429 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3430 VK_SAMPLE_COUNT_1_BIT);
3431 ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3432
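/* D32_S8 images keep depth and stencil in separate planes, so separate-plane
 * sources and destinations need their own per-aspect src/dst setup.
 */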
3433 for_each_layer(i, layer_mask, layers) {
3434 if (src_separate_ds) {
3435 if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3436 r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3437 } else {
3438 r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3439 }
3440 } else {
3441 ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3442 }
3443
3444 if (dst_separate_ds) {
3445 if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3446 ops->dst_depth(cs, dst, i);
3447 } else {
3448 ops->dst_stencil(cs, dst, i);
3449 }
3450 } else {
3451 ops->dst(cs, &dst->view, i, src_format);
3452 }
3453
3454 ops->run(cmd, cs);
3455 }
3456
3457 ops->teardown(cmd, cs);
3458
3459 trace_end_sysmem_resolve(&cmd->trace, cs);
3460 }
3461
3462 template <chip CHIP>
3463 void
3464 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3465 struct tu_cs *cs,
3466 const struct tu_image_view *src,
3467 const struct tu_image_view *dst,
3468 uint32_t layer_mask,
3469 uint32_t layers,
3470 const VkRect2D *rect)
3471 {
3472 assert(src->image->vk.format == dst->image->vk.format ||
3473 (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3474 vk_format_is_depth_or_stencil(dst->image->vk.format)));
3475
3476 bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3477 bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3478
3479 if (dst_separate_ds) {
3480 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3481 src, dst, layer_mask, layers, rect,
3482 src_separate_ds, dst_separate_ds);
3483 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3484 src, dst, layer_mask, layers, rect,
3485 src_separate_ds, dst_separate_ds);
3486 } else {
3487 resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
3488 src, dst, layer_mask, layers, rect,
3489 src_separate_ds, dst_separate_ds);
3490 }
3491 }
3492 TU_GENX(tu_resolve_sysmem);
3493
3494 enum tu_resolve_group_buffer_type {
3495 TU_RESOLVE_GROUP_COLOR_BUFFER,
3496 TU_RESOLVE_GROUP_DEPTH_BUFFER,
3497 TU_RESOLVE_GROUP_STENCIL_BUFFER,
3498 };
3499
3500 template <chip CHIP>
3501 static uint32_t
3502 tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
3503 enum tu_resolve_group_buffer_type buffer_type)
3504 {
3505 /* Resolve groups are not usable on a6xx, so no pending resolve is
3506 * established. The default value of 0 is returned as the buffer ID.
3507 */
3508 if (CHIP == A6XX)
3509 return 0;
3510
3511 resolve_group->pending_resolves = true;
3512
3513 if (buffer_type == TU_RESOLVE_GROUP_DEPTH_BUFFER)
3514 return 0x8;
3515 if (buffer_type == TU_RESOLVE_GROUP_STENCIL_BUFFER)
3516 return 0x9;
3517
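/* Color buffers cycle through IDs 0..7; depth and stencil use the fixed IDs
 * 0x8 and 0x9 returned above.
 */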
3518 const uint32_t max_color_buffers = 8;
3519 uint32_t buffer_id = resolve_group->color_buffer_id++;
3520 return buffer_id % max_color_buffers;
3521 }
3522
3523 template <chip CHIP>
3524 static uint32_t
3525 tu_resolve_group_include_buffer_for_format(struct tu_resolve_group *resolve_group,
3526 VkFormat format)
3527 {
3528 enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
3529
3530 /* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
3531 * whether depth, stencil or both are being resolved.
3532 */
3533 if (format == VK_FORMAT_D24_UNORM_S8_UINT)
3534 buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
3535
3536 return tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
3537 }
3538
3539 template <chip CHIP>
3540 void
3541 tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
3542 struct tu_cs *cs,
3543 struct tu_resolve_group *resolve_group)
3544 {
3545 /* Resolve groups are not usable on A6XX, so that template instantiation
3546 * should behave as a no-op.
3547 */
3548 if (CHIP == A6XX || !resolve_group->pending_resolves)
3549 return;
3550
3551 resolve_group->color_buffer_id = 0;
3552 resolve_group->pending_resolves = false;
3553
3554 tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
3555 }
3556 TU_GENX(tu_emit_resolve_group);
3557
3558 template <chip CHIP>
3559 static void
3560 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3561 struct tu_image *image,
3562 const VkClearValue *clear_value,
3563 const VkImageSubresourceRange *range,
3564 VkImageAspectFlags aspect_mask)
3565 {
3566 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3567 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3568 struct tu_cs *cs = &cmd->cs;
3569 enum pipe_format format;
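/* E5B9G9R9 is cleared as raw R32_UINT (presumably because the blitter can't
 * render it directly); the clear value itself is still packed as RGB9E5
 * below.
 */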
3570 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3571 format = PIPE_FORMAT_R32_UINT;
3572 } else {
3573 format = tu_aspects_to_plane(image->vk.format, aspect_mask);
3574 }
3575
3576 if (image->layout[0].depth0 > 1) {
3577 assert(layer_count == 1);
3578 assert(range->baseArrayLayer == 0);
3579 }
3580
3581 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3582
3583 ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3584 (VkSampleCountFlagBits) image->layout[0].nr_samples);
3585 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3586 ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3587 else
3588 ops->clear_value(cmd, cs, format, clear_value);
3589
3590 for (unsigned j = 0; j < level_count; j++) {
3591 if (image->layout[0].depth0 > 1)
3592 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3593
3594 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3595 u_minify(image->layout[0].width0, range->baseMipLevel + j),
3596 u_minify(image->layout[0].height0, range->baseMipLevel + j)
3597 });
3598
3599 struct fdl6_view dst;
3600 const VkImageSubresourceLayers subresource = {
3601 .aspectMask = aspect_mask,
3602 .mipLevel = range->baseMipLevel + j,
3603 .baseArrayLayer = range->baseArrayLayer,
3604 .layerCount = 1,
3605 };
3606 tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3607
3608 for (uint32_t i = 0; i < layer_count; i++) {
3609 ops->dst(cs, &dst, i, format);
3610 ops->run(cmd, cs);
3611 }
3612 }
3613
3614 ops->teardown(cmd, cs);
3615 }
3616
3617 static void
3618 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3619 struct tu_image *image,
3620 uint32_t buffer_id,
3621 const VkClearValue *clear_value,
3622 const VkImageSubresourceRange *range,
3623 VkImageAspectFlags aspect_mask)
3624 {
3625 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3626 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3627 VkFormat vk_format = image->vk.format;
3628 if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3629 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3630 vk_format = VK_FORMAT_S8_UINT;
3631 else
3632 vk_format = VK_FORMAT_D32_SFLOAT;
3633 }
3634
3635 enum pipe_format format = vk_format_to_pipe_format(vk_format);
3636
3637 if (image->layout[0].depth0 > 1) {
3638 assert(layer_count == 1);
3639 assert(range->baseArrayLayer == 0);
3640 }
3641
3642 struct tu_cs *cs = &cmd->cs;
3643
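/* Generic (event) clear of sysmem; this path is only reached on A7XX, see
 * use_generic_clear_for_image_clear().
 */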
3644 tu_cs_emit_regs(cs,
3645 A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3646
3647 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3648 tu_cs_emit(cs, 0);
3649
3650 tu_cs_emit_regs(
3651 cs, A6XX_RB_BLIT_INFO(
3652 .type = BLIT_EVENT_CLEAR,
3653 .sample_0 = vk_format_is_int(vk_format) ||
3654 vk_format_is_depth_or_stencil(vk_format),
3655 .depth = vk_format_is_depth_or_stencil(vk_format),
3656 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
3657 .buffer_id = buffer_id));
3658
3659 uint32_t clear_vals[4] = {};
3660 pack_blit_event_clear_value(clear_value, format, clear_vals);
3661 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3662 tu_cs_emit_array(cs, clear_vals, 4);
3663
3664 for (unsigned level = 0; level < level_count; level++) {
3665 if (image->layout[0].depth0 > 1)
3666 layer_count =
3667 u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3668
3669 uint32_t width =
3670 u_minify(image->layout[0].width0, range->baseMipLevel + level);
3671 uint32_t height =
3672 u_minify(image->layout[0].height0, range->baseMipLevel + level);
3673 tu_cs_emit_regs(
3674 cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3675 A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3676
3677 struct fdl6_view dst;
3678 const VkImageSubresourceLayers subresource = {
3679 .aspectMask = aspect_mask,
3680 .mipLevel = range->baseMipLevel + level,
3681 .baseArrayLayer = range->baseArrayLayer,
3682 .layerCount = 1,
3683 };
3684 tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3685
3686 for (uint32_t layer = 0; layer < layer_count; layer++) {
3687
3688 struct event_blit_dst_view blt_view = {
3689 .image = image,
3690 .view = &dst,
3691 .layer = layer,
3692 };
3693
3694 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3695 uint32_t real_level = range->baseMipLevel + level;
3696 uint32_t real_layer = range->baseArrayLayer + layer;
3697 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3698 struct fdl_layout *layout = &image->layout[0];
3699 blt_view.depth_addr =
3700 image->iova +
3701 fdl_surface_offset(layout, real_level, real_layer);
3702 blt_view.depth_pitch = fdl_pitch(layout, real_level);
3703 } else {
3704 struct fdl_layout *layout = &image->layout[1];
3705 blt_view.stencil_addr =
3706 image->iova +
3707 fdl_surface_offset(layout, real_level, real_layer);
3708 blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3709 }
3710 }
3711
3712 event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3713 aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3714 }
3715 }
3716 }
3717
3718 static bool
3719 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3720 struct tu_image *image)
3721 {
3722 const struct fd_dev_info *info = cmd->device->physical_device->info;
3723 return info->a7xx.has_generic_clear &&
3724 /* A7XX hardware supports R9G9B9E5_FLOAT color attachments and generic
3725  * clears of them, but we don't expose such attachments yet.
3726  * A7XX TODO: allow R9G9B9E5_FLOAT attachments.
3727  */
3728 image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32 &&
3729 /* Clearing VK_FORMAT_R8G8_* with fast-clear value, certain
3730 * dimensions (e.g. 960x540), and having GMEM renderpass afterwards
3731 * may lead to a GPU fault on A7XX.
3732 */
3733 !(info->a7xx.r8g8_faulty_fast_clear_quirk && image_is_r8g8(image));
3734 }
3735
3736 template <chip CHIP>
3737 static void
3738 clear_image(struct tu_cmd_buffer *cmd,
3739 struct tu_image *image,
3740 uint32_t buffer_id,
3741 const VkClearValue *clear_value,
3742 const VkImageSubresourceRange *range,
3743 VkImageAspectFlags aspect_mask)
3744 {
3745 if (use_generic_clear_for_image_clear(cmd, image)) {
3746 clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
3747 } else {
3748 clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3749 }
3750 }
3751
3752 template <chip CHIP>
3753 VKAPI_ATTR void VKAPI_CALL
3754 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3755 VkImage image_h,
3756 VkImageLayout imageLayout,
3757 const VkClearColorValue *pColor,
3758 uint32_t rangeCount,
3759 const VkImageSubresourceRange *pRanges)
3760 {
3761 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3762 VK_FROM_HANDLE(tu_image, image, image_h);
3763
3764 bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3765 if (use_generic_clear) {
3766 /* Generic clear doesn't go through CCU (or other caches). */
3767 cmd->state.cache.flush_bits |=
3768 TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3769 tu_emit_cache_flush<CHIP>(cmd);
3770 }
3771
3772 struct tu_resolve_group resolve_group = {};
3773
3774 for (unsigned i = 0; i < rangeCount; i++) {
3775 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_COLOR_BUFFER);
3776 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3777 }
3778
3779 tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3780 if (use_generic_clear) {
3781 /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3782 * proceed only after the just-emitted generic clears are complete.
3783 */
3784 cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3785 tu_emit_cache_flush<CHIP>(cmd);
3786 }
3787 }
3788 TU_GENX(tu_CmdClearColorImage);
3789
3790 template <chip CHIP>
3791 VKAPI_ATTR void VKAPI_CALL
3792 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3793 VkImage image_h,
3794 VkImageLayout imageLayout,
3795 const VkClearDepthStencilValue *pDepthStencil,
3796 uint32_t rangeCount,
3797 const VkImageSubresourceRange *pRanges)
3798 {
3799 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3800 VK_FROM_HANDLE(tu_image, image, image_h);
3801
3802 bool use_generic_clear = use_generic_clear_for_image_clear(cmd, image);
3803 if (use_generic_clear) {
3804 /* Generic clear doesn't go through CCU (or other caches). */
3805 cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3806 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3807 TU_CMD_FLAG_WAIT_FOR_IDLE;
3808 tu_emit_cache_flush<CHIP>(cmd);
3809 }
3810
3811 struct tu_resolve_group resolve_group = {};
3812
3813 for (unsigned i = 0; i < rangeCount; i++) {
3814 const VkImageSubresourceRange *range = &pRanges[i];
3815
3816 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3817 /* can't clear both depth and stencil at once, split up the aspect mask */
3818 u_foreach_bit(b, range->aspectMask) {
3819 uint32_t buffer_id = 0;
3820 if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
3821 buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
3822 if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
3823 buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
3824
3825 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
3826 }
3827 continue;
3828 }
3829
3830 uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(&resolve_group, image->vk.format);
3831 clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3832 }
3833
3834 tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
3835 if (use_generic_clear) {
3836 /* This will emit CCU_RESOLVE_CLEAN which will ensure any future resolves
3837 * proceed only after the just-emitted generic clears are complete.
3838 */
3839 cmd->state.cache.flush_bits |= TU_CMD_FLAG_BLIT_CACHE_CLEAN;
3840 tu_emit_cache_flush<CHIP>(cmd);
3841 }
3842
3843 tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3844 }
3845 TU_GENX(tu_CmdClearDepthStencilImage);
3846
3847 /* CmdClearAttachments uses the original color attachment index instead of the
3848 * remapped index used by the shader, and our MRTs use the remapped
3849 * indices, so we have to remap them. We should always be able to find a
3850 * shader attachment thanks to this VU:
3851 *
3852 * VUID-vkCmdClearAttachments-colorAttachment-09503
3853 * "The colorAttachment member of each element of pAttachments must not
3854 * identify a color attachment that is currently mapped to
3855 * VK_ATTACHMENT_UNUSED in commandBuffer via
3856 * VkRenderingAttachmentLocationInfoKHR"
3857 */
3858 static unsigned
3859 remap_attachment(struct tu_cmd_buffer *cmd, unsigned a)
3860 {
3861 unsigned i = cmd->vk.dynamic_graphics_state.cal.color_map[a];
3862 assert(i != MESA_VK_ATTACHMENT_UNUSED &&
3863 "app violates VUID-vkCmdClearAttachments-colorAttachment-09503");
3864 return i;
3865 }
3866
3867 template <chip CHIP>
3868 static void
3869 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3870 uint32_t attachment_count,
3871 const VkClearAttachment *attachments,
3872 uint32_t rect_count,
3873 const VkClearRect *rects)
3874 {
3875 /* the shader path here is special, it avoids changing MRT/etc state */
3876 const struct tu_subpass *subpass = cmd->state.subpass;
3877 const uint32_t mrt_count = subpass->color_count;
3878 struct tu_cs *cs = &cmd->draw_cs;
3879 uint32_t clear_value[MAX_RTS][4];
3880 float z_clear_val = 0.0f;
3881 uint8_t s_clear_val = 0;
3882 uint32_t clear_rts = 0, clear_components = 0;
3883 bool z_clear = false;
3884 bool s_clear = false;
3885
3886 trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3887
3888 for (uint32_t i = 0; i < attachment_count; i++) {
3889 uint32_t a;
3890 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3891 uint32_t c = attachments[i].colorAttachment;
3892 a = subpass->color_attachments[c].attachment;
3893 if (a == VK_ATTACHMENT_UNUSED)
3894 continue;
3895
3896 uint32_t remapped = remap_attachment(cmd, c);
3897 clear_rts |= 1 << remapped;
3898 clear_components |= 0xf << (remapped * 4);
3899 memcpy(clear_value[remapped], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3900 } else {
3901 a = subpass->depth_stencil_attachment.attachment;
3902 if (a == VK_ATTACHMENT_UNUSED)
3903 continue;
3904
3905 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3906 z_clear = true;
3907 z_clear_val = attachments[i].clearValue.depthStencil.depth;
3908 }
3909
3910 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3911 s_clear = true;
3912 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3913 }
3914 }
3915 }
3916
3917 /* We may not know the multisample count if there are no attachments, so
3918 * just bail early to avoid corner cases later.
3919 */
3920 if (clear_rts == 0 && !z_clear && !s_clear)
3921 return;
3922
3923 /* Disable all draw states so they don't interfere.
3924  * TODO: use and re-use draw states.
3925  * We have to disable draw states individually to preserve
3926  * input attachment states, because a secondary command buffer
3927  * won't be able to restore them.
3928  */
3929 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3930 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3931 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3932 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3933 continue;
3934 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3935 CP_SET_DRAW_STATE__0_DISABLE);
3936 tu_cs_emit_qw(cs, 0);
3937 }
3938 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3939
3940 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3941 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3942 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3943 0xfc000000);
3944 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3945
3946 r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3947
3948 /* Disable sample counting so we don't affect occlusion queries. */
3949 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3950
3951 if (cmd->state.prim_generated_query_running_before_rp) {
3952 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3953 }
3954
3955 tu_cs_emit_regs(cs,
3956 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3957 tu_cs_emit_regs(cs,
3958 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3959
3960 tu_cs_emit_regs(cs,
3961 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3962
3963 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3964 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3965 for (uint32_t i = 0; i < mrt_count; i++) {
3966 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3967 .component_enable = COND(clear_rts & (1 << i), 0xf)));
3968 }
3969
3970 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3971 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3972
3973 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3974 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3975 .z_test_enable = z_clear,
3976 .z_write_enable = z_clear,
3977 .zfunc = FUNC_ALWAYS));
3978 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
3979 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
3980 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3981 .stencil_enable = s_clear,
3982 .func = FUNC_ALWAYS,
3983 .zpass = STENCIL_REPLACE));
3984 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
3985 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
3986 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
3987 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
3988
3989 tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
3990
3991 unsigned num_rts = util_bitcount(clear_rts);
3992 uint32_t packed_clear_value[MAX_RTS][4];
3993
3994 uint32_t idx = 0;
3995 u_foreach_bit(b, clear_rts) {
3996 memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
3997 idx++;
3998 }
3999
4000 if (num_rts > 0)
4001 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
4002 0, packed_clear_value, num_rts);
4003
4004 for (uint32_t i = 0; i < rect_count; i++) {
4005 /* This should be true because of this valid usage for
4006 * vkCmdClearAttachments:
4007 *
4008 * "If the render pass instance this is recorded in uses multiview,
4009 * then baseArrayLayer must be zero and layerCount must be one"
4010 */
4011 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
4012
4013 /* a630 doesn't support multiview masks, which means that we can't use
4014 * the normal multiview path without potentially recompiling a shader
4015 * on-demand or using a more complicated variant that takes the mask as
4016 * a const. Just use the layered path instead, since it shouldn't be
4017 * much worse.
4018 */
4019 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
4020 {
4021 const float coords[] = {
4022 rects[i].rect.offset.x,
4023 rects[i].rect.offset.y,
4024 z_clear_val,
4025 uif(rects[i].baseArrayLayer + layer),
4026 rects[i].rect.offset.x + rects[i].rect.extent.width,
4027 rects[i].rect.offset.y + rects[i].rect.extent.height,
4028 z_clear_val,
4029 1.0f,
4030 };
4031
4032 r3d_coords_raw(cmd, cs, coords);
4033 r3d_run_vis(cmd, cs);
4034 }
4035 }
4036
4037 /* Re-enable sample counting. */
4038 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
4039
4040 if (cmd->state.prim_generated_query_running_before_rp) {
4041 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
4042 }
4043
4044 trace_end_sysmem_clear_all(&cmd->trace, cs);
4045 }
4046
4047 template <chip CHIP>
4048 static void
4049 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4050 struct tu_cs *cs,
4051 uint32_t buffer_id,
4052 enum pipe_format format,
4053 uint8_t clear_mask,
4054 uint32_t gmem_offset,
4055 const VkClearValue *value)
4056 {
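/* Emit a BLIT_EVENT_CLEAR targeting GMEM at the attachment's gmem_offset and
 * kick it off with an FD_BLIT event.
 */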
4057 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
4058 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
4059 blit_base_format<CHIP>(format, false, true)));
4060
4061 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
4062 .clear_mask = clear_mask,
4063 .buffer_id = buffer_id));
4064
4065 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
4066 tu_cs_emit(cs, gmem_offset);
4067
4068 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
4069 tu_cs_emit(cs, 0);
4070
4071 uint32_t clear_vals[4] = {};
4072 pack_blit_event_clear_value(value, format, clear_vals);
4073
4074 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4075 tu_cs_emit_array(cs, clear_vals, 4);
4076
4077 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
4078 }
4079
4080 template <chip CHIP>
4081 static void
4082 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4083 struct tu_cs *cs,
4084 struct tu_resolve_group *resolve_group,
4085 uint32_t attachment,
4086 uint32_t base_layer,
4087 uint32_t layers,
4088 uint32_t layer_mask,
4089 VkImageAspectFlags mask,
4090 const VkClearValue *value)
4091 {
4092 const struct tu_render_pass_attachment *att =
4093 &cmd->state.pass->attachments[attachment];
4094
4095 trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
4096
4097 tu_cs_emit_regs(cs,
4098 A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
4099
4100 enum pipe_format format = vk_format_to_pipe_format(att->format);
4101 for_each_layer(i, layer_mask, layers) {
4102 uint32_t layer = i + base_layer;
4103 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4104 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4105 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4106 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
4107 tu_attachment_gmem_offset(cmd, att, layer), value);
4108 }
4109 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4110 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4111 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
4112 tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
4113 }
4114 } else {
4115 uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(resolve_group, att->format);
4116 clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, format, aspect_write_mask(format, mask),
4117 tu_attachment_gmem_offset(cmd, att, layer), value);
4118 }
4119 }
4120
4121 tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4122
4123 trace_end_gmem_clear(&cmd->trace, cs);
4124 }
4125
4126 template <chip CHIP>
4127 static void
4128 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
4129 uint32_t attachment_count,
4130 const VkClearAttachment *attachments,
4131 uint32_t rect_count,
4132 const VkClearRect *rects)
4133 {
4134 const struct tu_subpass *subpass = cmd->state.subpass;
4135 struct tu_cs *cs = &cmd->draw_cs;
4136
4137 if (rect_count > 1)
4138 perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
4139
4140 struct tu_resolve_group resolve_group = {};
4141
4142 for (unsigned i = 0; i < rect_count; i++) {
4143 unsigned x1 = rects[i].rect.offset.x;
4144 unsigned y1 = rects[i].rect.offset.y;
4145 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
4146 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
4147
4148 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4149 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4150 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4151
4152 for (unsigned j = 0; j < attachment_count; j++) {
4153 uint32_t a;
4154 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
4155 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
4156 else
4157 a = subpass->depth_stencil_attachment.attachment;
4158
4159 if (a == VK_ATTACHMENT_UNUSED)
4160 continue;
4161
4162 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
4163 rects[i].baseArrayLayer,
4164 rects[i].layerCount,
4165 subpass->multiview_mask,
4166 attachments[j].aspectMask,
4167 &attachments[j].clearValue);
4168 }
4169 }
4170
4171 tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
4172 }
4173
4174 template <chip CHIP>
4175 static void
4176 tu_clear_attachments(struct tu_cmd_buffer *cmd,
4177 uint32_t attachmentCount,
4178 const VkClearAttachment *pAttachments,
4179 uint32_t rectCount,
4180 const VkClearRect *pRects)
4181 {
4182 struct tu_cs *cs = &cmd->draw_cs;
4183
4184 /* The sysmem path behaves like a draw. Since we don't have a way to use
4185  * different flushes for sysmem vs. gmem, this must be emitted outside of
4186  * the cond_exec. */
4187 tu_emit_cache_flush_renderpass<CHIP>(cmd);
4188
4189 /* vkCmdClearAttachments is supposed to respect the predicate if active. The
4190 * easiest way to do this is to always use the 3d path, which always works
4191 * even with GMEM because it's just a simple draw using the existing
4192 * attachment state.
4193 *
4194 * Similarly, we also use the 3D path when in a secondary command buffer that
4195 * doesn't know the GMEM layout that will be chosen by the primary.
4196 */
4197 if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
4198 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4199 return;
4200 }
4201
4202 /* If we could skip tile load/stores based on any draws intersecting them at
4203 * binning time, then emit the clear as a 3D draw so that it contributes to
4204 * that visibility.
4205 */
4206 const struct tu_subpass *subpass = cmd->state.subpass;
4207 for (uint32_t i = 0; i < attachmentCount; i++) {
4208 uint32_t a;
4209 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4210 uint32_t c = pAttachments[i].colorAttachment;
4211 a = subpass->color_attachments[c].attachment;
4212 } else {
4213 a = subpass->depth_stencil_attachment.attachment;
4214 }
4215 if (a != VK_ATTACHMENT_UNUSED) {
4216 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4217 if (att->cond_load_allowed || att->cond_store_allowed) {
4218 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4219 return;
4220 }
4221 }
4222 }
4223
4224 /* Otherwise, emit 2D blits for gmem rendering. */
4225 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
4226 tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4227 tu_cond_exec_end(cs);
4228
4229 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4230 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
4231 tu_cond_exec_end(cs);
4232 }
4233
4234 static void
4235 tu7_clear_attachment_generic_single_rect(
4236 struct tu_cmd_buffer *cmd,
4237 struct tu_cs *cs,
4238 struct tu_resolve_group *resolve_group,
4239 const struct tu_render_pass_attachment *att,
4240 const VkClearAttachment *clear_att,
4241 uint32_t a,
4242 const VkClearRect *rect)
4243 {
4244 const struct tu_subpass *subpass = cmd->state.subpass;
4245 unsigned x1 = rect->rect.offset.x;
4246 unsigned y1 = rect->rect.offset.y;
4247 unsigned x2 = x1 + rect->rect.extent.width - 1;
4248 unsigned y2 = y1 + rect->rect.extent.height - 1;
4249
4250 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
4251 tu_cs_emit(cs,
4252 A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
4253 tu_cs_emit(cs,
4254 A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
4255
4256 auto value = &clear_att->clearValue;
4257
4258 enum pipe_format format = vk_format_to_pipe_format(att->format);
4259 for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
4260 uint32_t layer = i + rect->baseArrayLayer;
4261 uint32_t mask =
4262 aspect_write_mask_generic_clear(format, clear_att->aspectMask);
4263
4264 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4265 if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4266 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4267 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4268 false, layer, value, a);
4269 }
4270 if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4271 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4272 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4273 layer, value, a);
4274 }
4275 } else {
4276 uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
4277 tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4278 }
4279 }
4280 }
4281
4282 static void
4283 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
4284 uint32_t attachmentCount,
4285 const VkClearAttachment *pAttachments,
4286 uint32_t rectCount,
4287 const VkClearRect *pRects)
4288 {
4289 struct tu_cs *cs = &cmd->draw_cs;
4290
4291 uint32_t clear_aspects = 0;
4292 for (uint32_t i = 0; i < attachmentCount; i++) {
4293 clear_aspects |= pAttachments[i].aspectMask;
4294 }
4295
4296 /* Generic clear doesn't go through CCU (or other caches),
4297 * so we have to flush (clean+invalidate) corresponding caches.
4298 */
4299 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
4300 if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
4301 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4302 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
4303 }
4304 if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4305 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
4306 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
4307 }
4308 tu_cs_emit_wfi(cs);
4309 tu_cond_exec_end(cs);
4310
4311 struct tu_resolve_group resolve_group = {};
4312
4313 const struct tu_subpass *subpass = cmd->state.subpass;
4314 for (uint32_t i = 0; i < attachmentCount; i++) {
4315 uint32_t a;
4316 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
4317 uint32_t c = pAttachments[i].colorAttachment;
4318 a = subpass->color_attachments[c].attachment;
4319 } else {
4320 a = subpass->depth_stencil_attachment.attachment;
4321 }
4322 if (a != VK_ATTACHMENT_UNUSED) {
4323 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
4324 const struct tu_image_view *iview = cmd->state.attachments[a];
4325 trace_start_generic_clear(&cmd->trace, cs, att->format,
4326 iview->view.ubwc_enabled, att->samples);
4327 for (unsigned j = 0; j < rectCount; j++) {
4328 tu7_clear_attachment_generic_single_rect(
4329 cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
4330 }
4331 trace_end_generic_clear(&cmd->trace, cs);
4332 }
4333 }
4334
4335 tu_emit_resolve_group<A7XX>(cmd, cs, &resolve_group);
4336 }
4337
4338 template <chip CHIP>
4339 VKAPI_ATTR void VKAPI_CALL
4340 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
4341 uint32_t attachmentCount,
4342 const VkClearAttachment *pAttachments,
4343 uint32_t rectCount,
4344 const VkClearRect *pRects)
4345 {
4346 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4347
4348 for (uint32_t j = 0; j < attachmentCount; j++) {
4349 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
4350 continue;
4351
4352 tu_lrz_disable_during_renderpass<CHIP>(cmd);
4353 }
4354
4355 if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
4356 /* Both active predication and an unknown GMEM layout could be handled by
4357  * patching the command stream, which is exactly what the proprietary driver
4358  * does. We don't implement that since we don't expect a meaningful benefit.
4359  */
4360 !(cmd->state.predication_active ||
4361 cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
4362 tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
4363 } else {
4364 tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
4365 rectCount, pRects);
4366 }
4367 }
4368 TU_GENX(tu_CmdClearAttachments);
4369
4370 template <chip CHIP>
4371 static void
4372 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4373 struct tu_cs *cs,
4374 VkFormat vk_format,
4375 VkImageAspectFlags clear_mask,
4376 uint32_t a,
4377 bool separate_ds)
4378 {
4379 enum pipe_format format = vk_format_to_pipe_format(vk_format);
4380 const struct tu_framebuffer *fb = cmd->state.framebuffer;
4381 const struct tu_image_view *iview = cmd->state.attachments[a];
4382 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
4383 const struct blit_ops *ops = &r2d_ops<CHIP>;
4384 const VkClearValue *value = &cmd->state.clear_values[a];
4385 if (cmd->state.pass->attachments[a].samples > 1)
4386 ops = &r3d_ops<CHIP>;
4387
4388 trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
4389 cmd->state.pass->attachments[a].samples);
4390
4391 ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
4392 cmd->state.pass->attachments[a].samples);
4393 ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
4394 cmd->state.render_area.extent);
4395 ops->clear_value(cmd, cs, format, value);
4396
4397 for_each_layer(i, clear_views, fb->layers) {
4398 if (separate_ds) {
4399 if (vk_format == VK_FORMAT_D32_SFLOAT) {
4400 ops->dst_depth(cs, iview, i);
4401 } else {
4402 ops->dst_stencil(cs, iview, i);
4403 }
4404 } else {
4405 ops->dst(cs, &iview->view, i, format);
4406 }
4407 ops->run(cmd, cs);
4408 }
4409
4410 ops->teardown(cmd, cs);
4411
4412 trace_end_sysmem_clear(&cmd->trace, cs);
4413 }
4414
4415 template <chip CHIP>
4416 void
4417 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
4418 struct tu_cs *cs,
4419 uint32_t a)
4420 {
4421 const struct tu_render_pass_attachment *attachment =
4422 &cmd->state.pass->attachments[a];
4423
4424 if (!attachment->clear_mask)
4425 return;
4426
4427 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4428 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4429 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
4430 a, true);
4431 }
4432 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4433 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
4434 a, true);
4435 }
4436 } else {
4437 clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
4438 a, false);
4439 }
4440
4441 /* The spec doesn't explicitly say, but presumably the initial renderpass
4442 * clear is considered part of the renderpass, and therefore barriers
4443 * aren't required inside the subpass/renderpass. Therefore we need to
4444 * flush CCU color into CCU depth here, just like with
4445 * vkCmdClearAttachments(). Note that because this only happens at the
4446 * beginning of a renderpass, and renderpass writes are considered
4447 * "incoherent", we shouldn't have to worry about syncing depth into color
4448 * beforehand as depth should already be flushed.
4449 */
4450 if (vk_format_is_depth_or_stencil(attachment->format)) {
4451 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4452 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
4453 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
4454 } else {
4455 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4456 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
4457 }
4458
4459 tu_cs_emit_wfi(cs);
4460 }
4461 TU_GENX(tu_clear_sysmem_attachment);
4462
4463 template <chip CHIP>
4464 void
4465 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
4466 struct tu_cs *cs,
4467 struct tu_resolve_group *resolve_group,
4468 uint32_t a)
4469 {
4470 const struct tu_render_pass_attachment *attachment =
4471 &cmd->state.pass->attachments[a];
4472
4473 if (!attachment->clear_mask)
4474 return;
4475
4476 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
4477 cmd->state.framebuffer->layers,
4478 attachment->clear_views,
4479 attachment->clear_mask,
4480 &cmd->state.clear_values[a]);
4481 }
4482 TU_GENX(tu_clear_gmem_attachment);
4483
4484 void
4485 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
4486 struct tu_cs *cs,
4487 struct tu_resolve_group *resolve_group,
4488 uint32_t a)
4489 {
4490 const struct tu_render_pass_attachment *att =
4491 &cmd->state.pass->attachments[a];
4492 const VkClearValue *value = &cmd->state.clear_values[a];
4493 const struct tu_image_view *iview = cmd->state.attachments[a];
4494
4495 trace_start_generic_clear(&cmd->trace, cs, att->format,
4496 iview->view.ubwc_enabled, att->samples);
4497
4498 enum pipe_format format = vk_format_to_pipe_format(att->format);
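/* Generic (A7XX) clears are emitted per layer; D32S8 clears its depth and
 * stencil planes separately with single-plane formats.
 */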
4499 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4500 uint32_t layer = i;
4501 uint32_t mask =
4502 aspect_write_mask_generic_clear(format, att->clear_mask);
4503 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4504 if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4505 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
4506 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
4507 false, layer, value, a);
4508 }
4509 if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4510 uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
4511 tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
4512 layer, value, a);
4513 }
4514 } else {
4515 uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
4516 tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
4517 }
4518 }
4519
4520 tu_flush_for_access(&cmd->state.renderpass_cache,
4521 TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
4522
4523 trace_end_generic_clear(&cmd->trace, cs);
4524 }
4525
4526 template <chip CHIP>
4527 static void
4528 tu_emit_blit(struct tu_cmd_buffer *cmd,
4529 struct tu_cs *cs,
4530 struct tu_resolve_group *resolve_group,
4531 const struct tu_image_view *iview,
4532 const struct tu_render_pass_attachment *attachment,
4533 const VkClearValue *clear_value,
4534 enum a6xx_blit_event_type blit_event_type,
4535 bool separate_stencil)
4536 {
4537 assert(blit_event_type != BLIT_EVENT_CLEAR);
4538 uint32_t clear_mask = 0;
4539
4540 /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
4541 * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
4542 */
4543 if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
4544 attachment->clear_mask &&
4545 use_generic_clear_for_image_clear(cmd, iview->image)) {
4546 blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
4547
4548 enum pipe_format format = vk_format_to_pipe_format(attachment->format);
4549 VkImageAspectFlags aspect_mask = attachment->clear_mask;
4550 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
4551 if (separate_stencil)
4552 aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
4553 else
4554 aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
4555 }
4556 if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
4557 if (separate_stencil)
4558 format = PIPE_FORMAT_S8_UINT;
4559 else
4560 format = PIPE_FORMAT_Z32_FLOAT;
4561 }
4562
4563 clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
4564
4565 uint32_t clear_vals[4] = {};
4566 pack_blit_event_clear_value(clear_value, format, clear_vals);
4567
4568 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4569 tu_cs_emit_array(cs, clear_vals, 4);
4570 }
4571
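/* Classify the attachment for the resolve group: separate depth and stencil
 * planes get their own buffer types, everything else is treated as color.
 */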
4572 enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
4573 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4574 if (!separate_stencil)
4575 buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
4576 else
4577 buffer_type = TU_RESOLVE_GROUP_STENCIL_BUFFER;
4578 } else if (attachment->format == VK_FORMAT_D24_UNORM_S8_UINT) {
4579 buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
4580 }
4581
4582 uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
4583 event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
4584
4585 for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4586 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4587 event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4588 }
4589
4590 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4591 TU_ACCESS_NONE);
4592 }
4593
4594 static bool
4595 blit_can_resolve(VkFormat format)
4596 {
4597 const struct util_format_description *desc = vk_format_description(format);
4598
4599 /* The blit event can only resolve simple cases: averaging samples as
4600  * unsigned integers or choosing a single sample. Note that this is allowed
4601  * for SRGB formats, but the results differ from the 2D draw resolve.
4602  */
4603 if (vk_format_is_snorm(format))
4604 return false;
4605
4606 /* Can't handle formats with channel sizes larger than 10 bits.
4607  * Note: this includes all float formats.
4608  * Note 2: single-channel integer formats seem OK.
4609  */
4610 if (desc->channel[0].size > 10 && vk_format_is_color(format))
4611 return false;
4612
4613 switch (format) {
4614 /* For unknown reasons the blit event can't MSAA-resolve these formats when
4615  * tiled, likely because they have a different layout from other cpp=2 formats.
4616  */
4617 case VK_FORMAT_R8G8_UNORM:
4618 case VK_FORMAT_R8G8_UINT:
4619 case VK_FORMAT_R8G8_SINT:
4620 case VK_FORMAT_R8G8_SRGB:
4621 return false;
4622 default:
4623 break;
4624 }
4625
4626 return true;
4627 }
4628
4629 struct apply_load_coords_state {
4630 unsigned view;
4631 };
4632
4633 static void
4634 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4635 struct tu_cs *cs,
4636 void *data,
4637 VkRect2D bin,
4638 unsigned views,
4639 VkExtent2D *frag_areas)
4640 {
4641 const struct apply_load_coords_state *state =
4642 (const struct apply_load_coords_state *)data;
4643 assert(state->view < views);
4644 VkExtent2D frag_area = frag_areas[state->view];
4645
4646 assert(bin.extent.width % frag_area.width == 0);
4647 assert(bin.extent.height % frag_area.height == 0);
4648 uint32_t scaled_width = bin.extent.width / frag_area.width;
4649 uint32_t scaled_height = bin.extent.height / frag_area.height;
4650
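/* With FDM the bin lives in GMEM at reduced resolution, so the scaled extent
 * is used on the GMEM side of the blit while the sysmem coordinates cover
 * the full bin.
 */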
4651 const float coords[] = {
4652 bin.offset.x, bin.offset.y,
4653 bin.offset.x, bin.offset.y,
4654 bin.offset.x + scaled_width, bin.offset.y + scaled_height,
4655 bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4656 };
4657 r3d_coords_raw(cmd, cs, coords);
4658 }
4659
4660 template <chip CHIP>
4661 static void
4662 load_3d_blit(struct tu_cmd_buffer *cmd,
4663 struct tu_cs *cs,
4664 const struct tu_image_view *iview,
4665 const struct tu_render_pass_attachment *att,
4666 bool separate_stencil)
4667 {
4668 const struct tu_framebuffer *fb = cmd->state.framebuffer;
4669 enum pipe_format format = iview->view.format;
4670 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4671 if (separate_stencil)
4672 format = PIPE_FORMAT_S8_UINT;
4673 else
4674 format = PIPE_FORMAT_Z32_FLOAT;
4675 }
4676 r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4677 R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4678 iview->image->vk.samples);
4679
4680 if (!cmd->state.pass->has_fdm) {
4681 r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4682 (VkExtent2D) { fb->width, fb->height });
4683 }
4684
4685 /* Normal loads read directly from system memory, so we have to invalidate
4686 * UCHE in case it contains stale data.
4687 */
4688 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4689
4690 /* Wait for CACHE_INVALIDATE to land */
4691 tu_cs_emit_wfi(cs);
4692
4693 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4694 if (cmd->state.pass->has_fdm) {
4695 struct apply_load_coords_state state = {
4696 .view = att->clear_views ? i : 0,
4697 };
4698 tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4699 }
4700
4701 r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4702
4703 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4704 if (separate_stencil)
4705 r3d_src_stencil(cmd, cs, iview, i);
4706 else
4707 r3d_src_depth(cmd, cs, iview, i);
4708 } else {
4709 r3d_src_gmem_load(cmd, cs, iview, i);
4710 }
4711
4712 r3d_run(cmd, cs);
4713 }
4714
4715 r3d_teardown<CHIP>(cmd, cs);
4716
4717 /* It seems we need to WFI here for depth/stencil because color writes here
4718 * aren't synchronized with depth/stencil writes.
4719 *
4720 * Note: the blob also uses a WFI for color attachments but this hasn't
4721 * been seen to be necessary.
4722 */
4723 if (vk_format_is_depth_or_stencil(att->format))
4724 tu_cs_emit_wfi(cs);
4725 }
4726
4727 static void
4728 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4729 struct tu_cs *cs, bool load)
4730 {
4731 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4732
4733 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4734 return;
4735
4736 uint64_t result_iova;
4737 if (load)
4738 result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4739 else
4740 result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4741
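/* TU_DEBUG(LOG_SKIP_GMEM_OPS) bookkeeping: adjust the "taken" loads/stores
 * counter by dbg_one so it can later be compared against the totals updated
 * in tu_end_load_store_cond_exec().
 */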
4742 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4743 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4744 tu_cs_emit_qw(cs, result_iova);
4745 tu_cs_emit_qw(cs, result_iova);
4746 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4747 }
4748
4749 static void
4750 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4751 struct tu_cs *cs, bool load)
4752 {
4753 tu_cond_exec_end(cs);
4754
4755 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4756 return;
4757
4758 uint64_t result_iova;
4759 if (load)
4760 result_iova = global_iova(cmd, dbg_gmem_total_loads);
4761 else
4762 result_iova = global_iova(cmd, dbg_gmem_total_stores);
4763
4764 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4765 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4766 tu_cs_emit_qw(cs, result_iova);
4767 tu_cs_emit_qw(cs, result_iova);
4768 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4769 }
4770
4771 template <chip CHIP>
4772 void
4773 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4774 struct tu_cs *cs,
4775 struct tu_resolve_group *resolve_group,
4776 uint32_t a,
4777 bool cond_exec_allowed,
4778 bool force_load)
4779 {
4780 const struct tu_image_view *iview = cmd->state.attachments[a];
4781 const struct tu_render_pass_attachment *attachment =
4782 &cmd->state.pass->attachments[a];
4783
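/* force_load requests a GMEM load even when the attachment's load op
 * wouldn't; D32S8 then also needs its separate stencil plane loaded.
 */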
4784 bool load_common = attachment->load || force_load;
4785 bool load_stencil =
4786 attachment->load_stencil ||
4787 (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4788
4789 if (!load_common && !load_stencil)
4790 return;
4791
4792 trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4793
4794 /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4795  * to be only partially cleared, and since that clear is done with a 2D blit
4796  * it doesn't produce geometry, so we have to load unconditionally.
4797  *
4798  * To simplify the conditions, treat a partially cleared separate DS as
4799  * fully cleared and don't emit the cond_exec.
4800  */
4801 bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4802 if (cond_exec)
4803 tu_begin_load_store_cond_exec(cmd, cs, true);
4804
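/* BLIT_EVENT_LOAD does a 1:1 copy and can't scale, so FDM (which keeps bins
 * in GMEM at reduced resolution) and the TU_DEBUG(3D_LOAD) option load via a
 * 3D draw instead of the blit event.
 */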
4805 if (TU_DEBUG(3D_LOAD) ||
4806 cmd->state.pass->has_fdm) {
4807 if (load_common || load_stencil)
4808 tu_disable_draw_states(cmd, cs);
4809
4810 if (load_common)
4811 load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4812
4813 if (load_stencil)
4814 load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4815 } else {
4816 if (load_common)
4817 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4818
4819 if (load_stencil)
4820 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4821 }
4822
4823 if (cond_exec)
4824 tu_end_load_store_cond_exec(cmd, cs, true);
4825
4826 trace_end_gmem_load(&cmd->trace, cs);
4827 }
4828 TU_GENX(tu_load_gmem_attachment);
4829
4830 template <chip CHIP>
4831 static void
4832 store_cp_blit(struct tu_cmd_buffer *cmd,
4833 struct tu_cs *cs,
4834 const struct tu_image_view *iview,
4835 uint32_t samples,
4836 bool separate_stencil,
4837 enum pipe_format src_format,
4838 enum pipe_format dst_format,
4839 uint32_t layer,
4840 uint32_t gmem_offset,
4841 uint32_t cpp)
4842 {
4843 r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4844 VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4845 iview->view.ubwc_enabled, true);
4846
4847 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4848 if (!separate_stencil) {
4849 r2d_dst_depth(cs, iview, layer);
4850 } else {
4851 r2d_dst_stencil(cs, iview, layer);
4852 }
4853 } else {
4854 r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
4855 }
4856
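/* Describe the GMEM source: GMEM contents are always tiled (TILE6_2), and
 * the texture format is derived from the source pipe format with the
 * GMEM-specific fixups applied.
 */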
4857 enum a6xx_format fmt = blit_format_texture<CHIP>(src_format, TILE6_2, false, true).fmt;
4858 fixup_src_format(&src_format, dst_format, &fmt);
4859
4860 tu_cs_emit_regs(cs,
4861 SP_PS_2D_SRC_INFO(CHIP,
4862 .color_format = fmt,
4863 .tile_mode = TILE6_2,
4864 .color_swap = WZYX,
4865 .srgb = util_format_is_srgb(src_format),
4866 .samples = tu_msaa_samples(samples),
4867 .samples_average = !util_format_is_pure_integer(dst_format) &&
4868 !util_format_is_depth_or_stencil(dst_format),
4869 .unk20 = 1,
4870 .unk22 = 1),
4871 SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
4872 SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4873 SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4874
4875 /* sync GMEM writes with CACHE. */
4876 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4877 if (CHIP >= A7XX)
4878 /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4879 * arising from GMEM load/clears to land before we can continue.
4880 */
4881 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4882
4883 /* Wait for cache event to land */
4884 tu_cs_emit_wfi(cs);
4885
4886 r2d_run(cmd, cs);
4887
4888 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4889 * sysmem, and we generally assume that GMEM renderpasses leave their
4890 * results in sysmem, so we need to flush manually here.
4891 */
4892 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4893 }
4894
4895 template <chip CHIP>
4896 static void
4897 store_3d_blit(struct tu_cmd_buffer *cmd,
4898 struct tu_cs *cs,
4899 const struct tu_image_view *iview,
4900 VkSampleCountFlagBits dst_samples,
4901 bool separate_stencil,
4902 enum pipe_format src_format,
4903 enum pipe_format dst_format,
4904 const VkRect2D *render_area,
4905 uint32_t layer,
4906 uint32_t gmem_offset,
4907 uint32_t cpp)
4908 {
4909 /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4910 * aren't set until we know whether we're HW binning or not, and we want to
4911 * avoid a dependence on that here to be able to store attachments before
4912 * the end of the renderpass in the future. Use the scratch space to
4913 * save/restore them dynamically.
4914 */
4915 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4916 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4917 CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4918 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4919 if (CHIP >= A7XX) {
4920 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4921 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4922 CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4923 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4924 }
4925
4926 r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4927 0, false, iview->view.ubwc_enabled, dst_samples);
4928
4929 r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4930
4931 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4932 if (!separate_stencil) {
4933 r3d_dst_depth<CHIP>(cs, iview, layer);
4934 } else {
4935 r3d_dst_stencil<CHIP>(cs, iview, layer);
4936 }
4937 } else {
4938 r3d_dst<CHIP>(cs, &iview->view, layer, src_format);
4939 }
4940
4941 r3d_src_gmem<CHIP>(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
4942
4943 /* sync GMEM writes with CACHE. */
4944 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4945
4946 /* Wait for CACHE_INVALIDATE to land */
4947 tu_cs_emit_wfi(cs);
4948
4949 r3d_run(cmd, cs);
4950
4951 r3d_teardown<CHIP>(cmd, cs);
4952
4953 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4954 * sysmem, and we generally assume that GMEM renderpasses leave their
4955 * results in sysmem, so we need to flush manually here. The 3d blit path
4956 * writes to depth images as a color RT, so there's no need to flush depth.
4957 */
4958 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4959
4960 /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
4961 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4962 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4963 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4964 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4965
4966 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4967 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4968 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4969 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4970
4971 if (CHIP >= A7XX) {
4972 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4973 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4974 CP_SCRATCH_TO_REG_0_SCRATCH(1) |
4975 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4976 }
4977 }
4978
4979 static bool
4980 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
4981 {
4982 struct tu_physical_device *phys_dev = cmd->device->physical_device;
4983 const struct tu_image_view *iview = cmd->state.attachments[a];
4984 const VkRect2D *render_area = &cmd->state.render_area;
4985
4986 /* Unaligned stores are incredibly rare in the CTS, so we have to force them in order to test this path. */
4987 if (TU_DEBUG(UNALIGNED_STORE))
4988 return true;
4989
4990 /* We always use the unaligned store path when scaling rendering. */
4991 if (cmd->state.pass->has_fdm)
4992 return true;
4993
4994 uint32_t x1 = render_area->offset.x;
4995 uint32_t y1 = render_area->offset.y;
4996 uint32_t x2 = x1 + render_area->extent.width;
4997 uint32_t y2 = y1 + render_area->extent.height;
4998 /* x2/y2 can be unaligned if equal to the size of the image, since it will
4999 * write into padding space. The one exception is linear levels which don't
5000 * have the required y padding in the layout (except for the last level)
5001 */
5002 bool need_y2_align =
5003 y2 != iview->view.height || iview->view.need_y2_align;
5004
5005 return (x1 % phys_dev->info->gmem_align_w ||
5006 (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
5007 y1 % phys_dev->info->gmem_align_h ||
5008 (y2 % phys_dev->info->gmem_align_h && need_y2_align));
5009 }
5010
5011 /* Choose the GMEM layout (use the CCU space or not) based on what the
5012  * current attachments will need. This has to happen at vkBeginRenderPass()
5013 * time because tu_attachment_store_unaligned() looks at the image views, which
5014 * are only available at that point. This should match the logic for the
5015 * !use_fast_path case in tu_store_gmem_attachment().
5016 */
5017 void
5018 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
5019 {
5020 cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
5021
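/* Start with the full GMEM layout and fall back to TU_GMEM_LAYOUT_AVOID_CCU
 * whenever some attachment must be stored via CP_BLIT or draws rather than
 * the blit event, presumably because those paths go through the CCU, whose
 * storage overlaps that part of GMEM.
 */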
5022 for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
5023 if (!cmd->state.attachments[i])
5024 continue;
5025
5026 struct tu_render_pass_attachment *att =
5027 &cmd->state.pass->attachments[i];
5028 if ((att->store || att->store_stencil) &&
5029 tu_attachment_store_unaligned(cmd, i))
5030 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5031 if (att->store && att->format == VK_FORMAT_S8_UINT)
5032 /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
5033 * blit events for the S8_UINT format.
5034 */
5035 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5036 if (att->will_be_resolved && !blit_can_resolve(att->format))
5037 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
5038 }
5039
5040 cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
5041 }
5042
5043 struct apply_store_coords_state {
5044 unsigned view;
5045 };
5046
5047 static void
5048 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
5049 struct tu_cs *cs,
5050 void *data,
5051 VkRect2D bin,
5052 unsigned views,
5053 VkExtent2D *frag_areas)
5054 {
5055 const struct apply_store_coords_state *state =
5056 (const struct apply_store_coords_state *)data;
5057 assert(state->view < views);
5058 VkExtent2D frag_area = frag_areas[state->view];
5059
5060 /* The bin width/height must be a multiple of the frag_area to make sure
5061  * that the scaling happens correctly. This means some destination pixels
5062  * may jut out of the framebuffer, but they should be
5063 * clipped by the render area.
5064 */
5065 assert(bin.extent.width % frag_area.width == 0);
5066 assert(bin.extent.height % frag_area.height == 0);
5067 uint32_t scaled_width = bin.extent.width / frag_area.width;
5068 uint32_t scaled_height = bin.extent.height / frag_area.height;
5069
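/* The destination rectangle covers the full bin in the image while the
 * source rectangle is the scaled bin in GMEM.
 */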
5070 tu_cs_emit_regs(cs,
5071 A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
5072 .y = bin.offset.y),
5073 A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
5074 .y = bin.offset.y + bin.extent.height - 1));
5075 tu_cs_emit_regs(cs,
5076 A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
5077 A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
5078 A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
5079 A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
5080 }
5081
5082 template <chip CHIP>
5083 void
5084 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
5085 struct tu_cs *cs,
5086 struct tu_resolve_group *resolve_group,
5087 uint32_t a,
5088 uint32_t gmem_a,
5089 uint32_t layers,
5090 uint32_t layer_mask,
5091 bool cond_exec_allowed)
5092 {
5093 const VkRect2D *render_area = &cmd->state.render_area;
5094 struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
5095 const struct tu_image_view *iview = cmd->state.attachments[a];
5096 struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
5097 const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
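/* When resolving (a != gmem_a) the store must not be merged with a clear, so
 * drop the clear value.
 */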
5098 bool resolve = a != gmem_a;
5099 if (resolve)
5100 clear_value = NULL;
5101
5102 if (!dst->store && !dst->store_stencil)
5103 return;
5104
5105 bool unaligned = tu_attachment_store_unaligned(cmd, a);
5106
5107 /* D32_SFLOAT_S8_UINT is a rather special format: it has two planes,
5108  * one for depth and the other for stencil. When resolving an MSAA
5109 * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
5110 */
5111 bool resolve_d32s8_s8 =
5112 src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
5113 dst->format == VK_FORMAT_S8_UINT;
5114
5115 /* The fast path doesn't support picking out the last component of a D24S8
5116 * texture reinterpreted as RGBA8_UNORM.
5117 */
5118 bool resolve_d24s8_s8 =
5119 src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
5120 dst->format == VK_FORMAT_S8_UINT;
5121
5122 bool store_common = dst->store && !resolve_d32s8_s8;
5123 bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
5124
5125 bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
5126 (a == gmem_a || blit_can_resolve(dst->format));
5127
5128 trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
5129
5130 /* Unconditional store should happen only if the attachment was cleared,
5131  * which could have happened either via its load op or via vkCmdClearAttachments.
5132 */
5133 bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
5134 if (cond_exec) {
5135 tu_begin_load_store_cond_exec(cmd, cs, false);
5136 }
5137
5138 /* use fast path when render area is aligned, except for unsupported resolve cases */
5139 if (use_fast_path) {
5140 if (store_common)
5141 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, false);
5142 if (store_separate_stencil)
5143 tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, true);
5144
5145 if (cond_exec) {
5146 tu_end_load_store_cond_exec(cmd, cs, false);
5147 }
5148
5149 trace_end_gmem_store(&cmd->trace, cs);
5150 return;
5151 }
5152
5153 assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
5154
5155 enum pipe_format src_format = vk_format_to_pipe_format(src->format);
5156 if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5157 src_format = PIPE_FORMAT_Z32_FLOAT;
5158
5159 enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
5160 if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
5161 dst_format = PIPE_FORMAT_Z32_FLOAT;
5162
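/* The 2D engine presumably can't write multisampled destinations, so MSAA
 * stores on this path use one 3D draw per layer; single-sampled stores go
 * through CP_BLIT.
 */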
5163 if (dst->samples > 1) {
5164 /* If we hit this path, we have to disable draw states after every tile
5165 * instead of once at the end of the renderpass, so that they aren't
5166 * executed when calling CP_DRAW.
5167 *
5168 * TODO: store a flag somewhere so we don't do this more than once and
5169 * don't do it after the renderpass when this happens.
5170 */
5171 if (store_common || store_separate_stencil)
5172 tu_disable_draw_states(cmd, cs);
5173
5174 for_each_layer(i, layer_mask, layers) {
5175 if (store_common) {
5176 store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
5177 dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5178 }
5179 if (store_separate_stencil) {
5180 store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
5181 PIPE_FORMAT_S8_UINT, render_area, i,
5182 tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5183 }
5184 }
5185 } else {
5186 if (!cmd->state.pass->has_fdm) {
5187 r2d_coords(cmd, cs, render_area->offset, render_area->offset,
5188 render_area->extent);
5189 } else {
5190 /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
5191 * area and the coordinates span the entire render area, but for
5192  * FDM we need to scale the coordinates, so we take the opposite
5193  * approach, specifying the exact bin size in the destination
5194 * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the render
5195 * area.
5196 */
5197 tu_cs_emit_regs(cs,
5198 A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
5199 .y = render_area->offset.y,),
5200 A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
5201 .y = render_area->offset.y + render_area->extent.height - 1,));
5202 }
5203
5204 for_each_layer (i, layer_mask, layers) {
5205 if (cmd->state.pass->has_fdm) {
5206 unsigned view = layer_mask ? i : 0;
5207 struct apply_store_coords_state state = {
5208 .view = view,
5209 };
5210 tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
5211 state);
5212 }
5213 if (store_common) {
5214 store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
5215 dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
5216 }
5217 if (store_separate_stencil) {
5218 store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
5219 PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
5220 }
5221 }
5222 }
5223
5224 if (cond_exec) {
5225 tu_end_load_store_cond_exec(cmd, cs, false);
5226 }
5227
5228 trace_end_gmem_store(&cmd->trace, cs);
5229 }
5230 TU_GENX(tu_store_gmem_attachment);
5231