/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"

#include "common/freedreno_gpu_event.h"

static const VkOffset2D blt_no_coord = { ~0, ~0 };

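/* Convert a float in [0, 1] to an n-bit unorm value, rounding to nearest
 * even: e.g. tu_pack_float32_for_unorm(0.5f, 8) == 128 and
 * tu_pack_float32_for_unorm(1.0f, 24) == 0xffffff.
 */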
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

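/* Map a pipe format to the intermediate format the 2D engine uses while
 * converting between source and destination formats. For plain color
 * formats the ifmt only has to be wide enough for the largest channel,
 * found here via the size of the red channel: e.g. 5-bit channels expand
 * as R2D_UNORM8 and 10- or 11-bit non-integer channels as R2D_FLOAT16.
 */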
static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
   }
}

static struct tu_native_format
blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_texture(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* As in fdl6_view_init, we want to use
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
       * src. Since this is called when there is no image and thus no ubwc,
       * we can always use FMT6_8_8_8_8_UNORM.
       */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static struct tu_native_format
blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_color(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* similar to blit_format_texture but for blit dst */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static enum a6xx_format
blit_base_format(enum pipe_format format, bool ubwc)
{
   if (ubwc) {
      switch (format) {
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
         return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
      default:
         break;
      }
   }

   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return blit_format_color(format, TILE6_LINEAR).fmt;
}

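/* Emit the destination (and optionally source) rectangle for a 2D blit.
 * The BR coordinates are inclusive, hence the "- 1"; passing blt_no_coord
 * as src skips the source rectangle, e.g. for clears.
 */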
static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst.x, .y = dst.y),
      A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}

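/* Pack a VkClearValue into the RB_2D_SRC_SOLID_C0..C3 registers according
 * to the destination's intermediate format. E.g. clearing D24S8 with depth
 * 1.0 and stencil 0x55 packs the 24-bit depth value 0xffffff one byte per
 * r/g/b channel (presumably only the low byte of each register is
 * consumed) and the stencil value into alpha, matching the r8g8b8a8
 * layout used for that format.
 */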
static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}

template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture(format, TILE6_LINEAR);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}

template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
      (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format(dst_format, ubwc);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01); // TODO: seems to always be 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .rotate = (enum a6xx_rotation) blit_param,
         .solid_color = clear,
         .color_format = fmt,
         .scissor = scissor,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      tu_cs_emit_pkt4(cs, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1);
      tu_cs_emit(cs, 0x20000000);
   }

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}

template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This is a non-context register, so we have to WFI before changing. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}

/* r3d_ = shader path operations */

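/* Load "components" consecutive 32-bit values from the constant file,
 * starting at scalar offset "base" (in dwords, so base 4 is c1.x).
 */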
static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_uniform(b, components, 32, nir_imm_int(b, 0),
                           .base = base);
}

static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "clear vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

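/* Compile one of the global clear/blit shaders and copy its binary into the
 * global BO at *offset (in dwords). "consts" is the number of user const
 * vec4s the shader reads, which must be reserved so the compiler doesn't
 * reuse that const space for its own purposes.
 */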
static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   const struct ir3_shader_options options = {
      .num_reserved_user_consts = align(consts, 8),
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
   };
   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

enum r3d_type {
   R3D_CLEAR,
   R3D_BLIT,
   R3D_COPY_HALF,
};

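/* Emit the shader state shared by all r3d_ operations: pick the global
 * VS/FS pair for the requested operation, invalidate any stale shader
 * state, and program the fixed-function stages for a two-vertex RECTLIST
 * draw with rasterization reduced to a bare minimum.
 */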
template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicalization for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .cs_ibo = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));

   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1,));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
   }

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}

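/* Upload blit constants through scratch memory and emit an indirect
 * CP_LOAD_STATE6 pointing at it; the values are only known at record time,
 * so they can't live in the static global BO. size_vec4 is in units of
 * vec4s, and offset is a vec4 offset into the const file that must be
 * aligned to the compiler's const upload unit.
 */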
static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t opcode,
                          enum a6xx_state_block block,
                          uint32_t offset,
                          const void *consts,
                          uint32_t size_vec4)
{
   assert(offset % cmd->device->compiler->const_upload_unit == 0);

   struct tu_cs_memory mem = {};
   VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));

   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
   tu_cs_emit_qw(cs, mem.iova);
}

static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
{
   const uint32_t coord[] = {
      fui(z),
      0,
      0,
      0,
   };

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
}

static void
r3d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   const bool has_src = src.x != blt_no_coord.x;
   int32_t src_x1 = has_src ? src.x : 0;
   int32_t src_y1 = has_src ? src.y : 0;

   const float coords[] = {
      dst.x,
      dst.y,
      src_x1,
      src_y1,
      dst.x + extent.width,
      dst.y + extent.height,
      src_x1 + extent.width,
      src_y1 + extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}

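/* The r3d clear path writes the clear value from the FS (see
 * build_clear_fs_shader), so pack it into FS consts c0.xyzw. D24S8 is
 * again handled as r8g8b8a8_unorm, with each byte re-expressed as a float
 * in [0, 1].
 */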
static void
r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   uint32_t coords[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      coords[0] = fui((tmp & 0xff) / 255.0f);
      coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
      coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
      coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      coords[0] = fui(val->depthStencil.depth);
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   case PIPE_FORMAT_S8_UINT:
      coords[0] = val->depthStencil.stencil & 0xff;
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   default:
      /* color formats use the clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
      break;
   }

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
}

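/* Emit the source texture + sampler for the shader path. The caller's
 * descriptor is copied into scratch memory so the base and UBWC addresses
 * can be patched for the selected layer, and a clamp-to-edge sampler with
 * unnormalized coordinates is placed right after it.
 */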
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   enum a6xx_format fmt = (enum a6xx_format)(
      (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);
   desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
      A6XX_TEX_CONST_0_FMT(fmt);

   r3d_src_common(cmd, cs, desc,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format fmt = blit_format_texture(format, TILE6_LINEAR);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   desc[0] =
      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(color_format) |
      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->depth_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
      (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
   desc[4] = va;
   desc[5] = va >> 32;

   r3d_src_common(cmd, cs, desc,
                  iview->depth_layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->stencil_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
   desc[4] = va;
   desc[5] = va >> 32;
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));

   /* Fixup D24 formats because we always load both depth and stencil. */
   enum pipe_format format = iview->view.format;
   if (format == PIPE_FORMAT_X24S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM ||
       format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
      if (iview->view.ubwc_enabled)
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
      else
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
   }

   /* When loading/storing GMEM we always load the full image and don't do
    * any swizzling or swapping; that's done in the draw when reading/writing
    * GMEM, so we need to fix up the swizzle and swap here.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   r3d_src_common(cmd, cs, desc,
                  iview->view.layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             enum pipe_format format,
             enum pipe_format dst_format,
             uint32_t gmem_offset,
             uint32_t cpp)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->view.descriptor, sizeof(desc));

   enum a6xx_format fmt = blit_format_texture(format, TILE6_LINEAR).fmt;
   fixup_src_format(&format, dst_format, &fmt);

   /* patch the format so that depth/stencil get the right format and swizzle */
   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   /* patched for gmem */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
   desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
   desc[2] =
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
      A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
   desc[3] = 0;
   desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
   desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;

   enum a6xx_format fmt = (enum a6xx_format)(
      mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);
   mrt_buf_info =
      (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
      A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, mrt_buf_info);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
    * FMT6_NV12_Y.
    */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static void
r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_MRT_BUF_INFO));
   tu_cs_image_depth_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->view.ubwc_enabled));
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static void
r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);

   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = color_fmt, .color_swap = fmt.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, .qword = va),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static void
r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *att,
             bool separate_stencil, unsigned layer)
{
   unsigned RB_MRT_BUF_INFO;
   unsigned gmem_offset;

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
      } else {
         RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
         gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
      }
   } else {
      RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE(0, 0),
                   A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));

   enum a6xx_format color_format =
      (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
   tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}

enum r3d_blit_param {
   R3D_Z_SCALE = 1 << 0,
   R3D_DST_GMEM = 1 << 1,
   R3D_COPY = 1 << 2,
};

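/* Set up the 3D (draw-based) blit path. Unlike the 2D engine this programs
 * a full, if minimal, rasterization pipeline, so it also has to neutralize
 * any state the meta operation must not be affected by: depth/stencil
 * tests, LRZ, sample counting for occlusion queries, and conditional
 * rendering.
 */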
template <chip CHIP>
static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format(dst_format, ubwc);
   fixup_dst_format(src_format, &dst_format, &fmt);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   if (!(blit_param & R3D_DST_GMEM)) {
      if (CHIP == A6XX) {
         tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
      } else {
         tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
      }

      tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));

      if (CHIP >= A7XX) {
         tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
         tu_cs_emit_regs(cs,
            A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
      }
   }

   enum r3d_type type;
   if (clear) {
      type = R3D_CLEAR;
   } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
      /* Avoid canonicalizing NaNs in copies by using the special half-float
       * path that uses half regs.
       */
      type = R3D_COPY_HALF;
   } else {
      type = R3D_BLIT;
   }

   r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);

   tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
   tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
         .color_format = fmt,
         .color_sint = util_format_is_pure_sint(dst_format),
         .color_uint = util_format_is_pure_uint(dst_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
         .component_enable = aspect_write_mask(dst_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   /* Disable sample counting in order to not affect occlusion query. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
   }

   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}

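/* Emit the draw itself: a two-vertex RECTLIST that the hardware expands to
 * a screen-aligned rectangle. r3d_run ignores visibility stream results,
 * while r3d_run_vis uses them so the draw can be skipped in bins where the
 * rectangle isn't visible.
 */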
1540 static void
r3d_run(struct tu_cmd_buffer * cmd,struct tu_cs * cs)1541 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1542 {
1543 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1544 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1545 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1546 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1547 tu_cs_emit(cs, 1); /* instance count */
1548 tu_cs_emit(cs, 2); /* vertex count */
1549 }
1550
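
/* Same as r3d_run(), but with visibility-stream culling enabled so the
 * draw can be skipped for bins it does not cover.
 */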
static void
r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

template <chip CHIP>
static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }

   /* Re-enable sample counting. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
   }
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const VkOffset2D dst,
                  const VkOffset2D src,
                  const VkExtent2D extent);
   void (*clear_value)(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       enum pipe_format format,
                       const VkClearValue *val);
   void (*src)(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const struct fdl6_view *iview,
               uint32_t layer,
               VkFilter filter,
               enum pipe_format dst_format);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      enum pipe_format format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height,
                      enum pipe_format dst_format);
   void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
               enum pipe_format src_format);
   void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
                      enum pipe_format src_format);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
                 bool clear,
                 bool ubwc,
                 VkSampleCountFlagBits samples);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};
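
/* A typical single operation through this interface looks roughly like the
 * following sketch (the exact calls vary per caller below):
 *
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param,
 *               clear, ubwc, samples);
 *    ops->clear_value(cmd, cs, format, &val);        // clears only
 *    ops->coords(cmd, cs, dst_offset, src_offset, extent);
 *    ops->src(...) or ops->src_buffer(...);          // blits/copies only
 *    ops->dst(...) or ops->dst_buffer(...);
 *    ops->run(cmd, cs);                              // once per layer
 *    ops->teardown(cmd, cs);
 */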

template <chip CHIP>
static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src<CHIP>,
   .src_buffer = r2d_src_buffer<CHIP>,
   .dst = r2d_dst<CHIP>,
   .dst_depth = r2d_dst_depth,
   .dst_stencil = r2d_dst_stencil,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup<CHIP>,
   .run = r2d_run,
   .teardown = r2d_teardown,
};

template <chip CHIP>
static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_depth = r3d_dst_depth,
   .dst_stencil = r3d_dst_stencil,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup<CHIP>,
   .run = r3d_run,
   .teardown = r3d_teardown<CHIP>,
};

/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
       struct tu_cmd_buffer *cmd,
       struct tu_cs *cs,
       const VkOffset3D dst,
       const VkOffset3D src,
       const VkExtent3D extent)
{
   ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
               (VkExtent2D) {extent.width, extent.height});
}

/* Decides the VK format to treat our data as for a memcpy-style blit. We
 * have to be a bit careful because we have to pick a format with matching
 * UBWC compression behavior, so we can't just return R8_UINT/R16_UINT/
 * R32_UINT for everything.
 */
static enum pipe_format
copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   if (vk_format_is_compressed(vk_format)) {
      switch (vk_format_get_blocksize(vk_format)) {
      case 1: return PIPE_FORMAT_R8_UINT;
      case 2: return PIPE_FORMAT_R16_UINT;
      case 4: return PIPE_FORMAT_R32_UINT;
      case 8: return PIPE_FORMAT_R32G32_UINT;
      case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);

   /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
    * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
    * (also -1.0), when we're supposed to be memcpying the bits. See
    * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
    */
   format = util_format_snorm_to_unorm(format);

   switch (format) {
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      return PIPE_FORMAT_R32_UINT;

   case PIPE_FORMAT_G8_B8R8_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return PIPE_FORMAT_R8G8_UNORM;
      else
         return PIPE_FORMAT_Y8_UNORM;
   case PIPE_FORMAT_G8_B8_R8_420_UNORM:
      return PIPE_FORMAT_R8_UNORM;

   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         return PIPE_FORMAT_S8_UINT;
      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
      return PIPE_FORMAT_Z32_FLOAT;

   default:
      return format;
   }
}

template <chip CHIP>
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image *image,
              const VkClearValue *value)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   /* It is assumed that the LRZ cache is invalidated at this point for
    * the writes here to become visible to LRZ.
    *
    * LRZ writes go through the UCHE cache, so flush UCHE before changing
    * LRZ via CCU. There is no need to invalidate the CCU since we are
    * presumably writing whole cache lines, which we assume to be 64 bytes.
    */
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_FLUSH);

   ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
              VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
   ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
                   image->iova + image->lrz_offset,
                   image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
   ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
               (VkExtent2D) { image->lrz_pitch, image->lrz_height });
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);

   /* Clearing writes via CCU color in the PS stage, and LRZ is read via
    * UCHE in the earlier GRAS stage.
    */
   cmd->state.cache.flush_bits |=
      TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_WAIT_FOR_IDLE;
}
TU_GENX(tu6_clear_lrz);

template <chip CHIP>
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 struct tu_image *image)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   VkClearValue clear = {};
   clear.color.uint32[0] = 0xffffffff;

   /* The LRZ fast-clear buffer is always allocated with a size of 512 bytes. */
   ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
              VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);
   ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
   ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
                   image->iova + image->lrz_fc_offset, 512,
                   PIPE_FORMAT_R32_UINT);
   ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {128, 1});
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);
}
TU_GENX(tu6_dirty_lrz_fc);

template<chip CHIP>
static void
tu_image_view_copy_blit(struct fdl6_view *iview,
                        struct tu_image *image,
                        enum pipe_format format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool z_scale)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   const struct fdl_layout *layout =
      &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];

   const struct fdl_view_args args = {
      .chip = CHIP,
      .iova = image->iova,
      .base_miplevel = subres->mipLevel,
      .level_count = 1,
      .base_array_layer = subres->baseArrayLayer + layer,
      .layer_count = 1,
      .swiz = {
         PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
      },
      .format = tu_format_for_aspect(format, aspect_mask),
      .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
   };
   fdl6_view_init(iview, &layout, &args, false);
}

template<chip CHIP>
static void
tu_image_view_copy(struct fdl6_view *iview,
                   struct tu_image *image,
                   enum pipe_format format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
}

template<chip CHIP>
static void
tu_image_view_blit(struct fdl6_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   enum pipe_format format =
      tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
                                                         subres->aspectMask));
   tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
}

template <chip CHIP>
static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
               struct tu_image *src_image,
               struct tu_image *dst_image,
               const VkImageBlit2 *info,
               VkFilter filter)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;
   bool z_scale = false;
   uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;

   /* 2D blit can't do rotation mirroring from just coordinates */
   static const enum a6xx_rotation rotate[2][2] = {
      {ROTATE_0, ROTATE_HFLIP},
      {ROTATE_VFLIP, ROTATE_180},
   };

   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
                   (info->dstOffsets[1].y < info->dstOffsets[0].y);

   int32_t src0_z = info->srcOffsets[0].z;
   int32_t src1_z = info->srcOffsets[1].z;

   if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
        info->dstOffsets[1].z - info->dstOffsets[0].z) ||
       info->srcOffsets[1].z < info->srcOffsets[0].z) {
      z_scale = true;
   }

   if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
      layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
      src0_z = info->srcOffsets[1].z;
      src1_z = info->srcOffsets[0].z;
   }

   if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
      assert(layers <= 1);
      layers = vk_image_subresource_layer_count(&dst_image->vk,
                                                &info->dstSubresource);
   }

   /* BC1_RGB_* formats need to have their last component overridden with 1
    * when sampling, which is normally handled with the texture descriptor
    * swizzle. The 2d path can't handle that, so use the 3d path.
    *
    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
    * the 2d path.
    */

   unsigned blit_param = rotate[mirror_y][mirror_x];
   if (dst_image->layout[0].nr_samples > 1 ||
       src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT ||
       z_scale) {
      ops = &r3d_ops<CHIP>;
      blit_param = z_scale ? R3D_Z_SCALE : 0;
   }

   /* use the right format in setup() for D32_S8
    * TODO: this probably should use a helper
    */
   enum pipe_format src_format =
      tu6_plane_format(src_image->vk.format,
                       tu6_plane_index(src_image->vk.format,
                                       info->srcSubresource.aspectMask));
   enum pipe_format dst_format =
      tu6_plane_format(dst_image->vk.format,
                       tu6_plane_index(src_image->vk.format,
                                       info->srcSubresource.aspectMask));
   trace_start_blit(&cmd->trace, cs,
                    ops == &r3d_ops<CHIP>,
                    src_image->vk.format,
                    dst_image->vk.format,
                    layers);

   ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
              blit_param, false, dst_image->layout[0].ubwc,
              (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);

   if (ops == &r3d_ops<CHIP>) {
      const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
                               info->srcOffsets[0].x, info->srcOffsets[0].y,
                               info->dstOffsets[1].x, info->dstOffsets[1].y,
                               info->srcOffsets[1].x, info->srcOffsets[1].y };
      r3d_coords_raw(cmd, cs, coords);
   } else {
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
         A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
         A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
         A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct fdl6_view dst, src;
   tu_image_view_blit<CHIP>(
      &dst, dst_image, &info->dstSubresource,
      MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));

   if (z_scale) {
      tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
                                    &info->srcSubresource, 0, true);
      ops->src(cmd, cs, &src, 0, filter, dst_format);
   } else {
      tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
   }

   for (uint32_t i = 0; i < layers; i++) {
      if (z_scale) {
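         /* Sample the source volume at the center of each destination
          * slice: t maps slice i to a fractional Z in [src0_z, src1_z).
          */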
         float t = ((float) i + 0.5f) / (float) layers;
         r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
      } else {
         ops->src(cmd, cs, &src, i, filter, dst_format);
      }
      ops->dst(cs, &dst, i, src_format);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_blit(&cmd->trace, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
                 const VkBlitImageInfo2 *pBlitImageInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);

   for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
      /* can't blit both depth and stencil at once with D32_S8
       * TODO: more advanced 3D blit path to support it instead?
       */
      if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
          dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageBlit2 region = pBlitImageInfo->pRegions[i];
         u_foreach_bit(b, region.dstSubresource.aspectMask) {
            region.srcSubresource.aspectMask = BIT(b);
            region.dstSubresource.aspectMask = BIT(b);
            tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
         }
         continue;
      }
      tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
                           pBlitImageInfo->filter);
   }

   if (dst_image->lrz_height) {
      tu_disable_lrz(cmd, &cmd->cs, dst_image);
   }
}
TU_GENX(tu_CmdBlitImage2);
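
/* Convert texel offsets/extents to block offsets/extents for compressed
 * formats. For example (hypothetical values), a BC1 region (4x4 texel
 * blocks) with offset (8, 4) and extent 13x13 becomes block offset (2, 1)
 * and block extent 4x4, since DIV_ROUND_UP(13, 4) == 4.
 */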
static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}

template <chip CHIP>
static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy2 *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth,
                          vk_image_subresource_layer_count(&dst_image->vk,
                                                           &info->imageSubresource));
   enum pipe_format src_format =
      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
   enum pipe_format dst_format =
      copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   /* special case for buffer to stencil */
   if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      src_format = PIPE_FORMAT_S8_UINT;
   }

   /* note: could use "R8_UNORM" when no UBWC */
   unsigned blit_param = 0;
   if (src_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format)) {
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * util_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;

   ops->setup(cmd, cs, src_format, dst_format,
              info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
              (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);

   struct fdl6_view dst;
   tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
                            &info->imageSubresource, offset.z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i, src_format);

      uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
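      /* The blitter wants 64-byte-aligned buffer addresses and pitches. For
       * unaligned copies, blit one row at a time from the aligned address
       * just below src_va, folding the leftover bytes into the source x
       * coordinate.
       */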
      if ((src_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1, dst_format);
            ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y}, (VkOffset2D) {x},
                        (VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
         coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
                         const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);

   for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
      tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
                                    pCopyBufferToImageInfo->pRegions + i);

   if (dst_image->lrz_height) {
      tu_disable_lrz(cmd, &cmd->cs, dst_image);
   }
}
TU_GENX(tu_CmdCopyBufferToImage2);

template <chip CHIP>
static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy2 *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth,
                          vk_image_subresource_layer_count(&src_image->vk,
                                                           &info->imageSubresource));
   enum pipe_format dst_format =
      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
   enum pipe_format src_format =
      copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      dst_format = PIPE_FORMAT_S8_UINT;
   }

   /* note: could use "R8_UNORM" when no UBWC */
   unsigned blit_param = 0;
   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format)) {
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
              VK_SAMPLE_COUNT_1_BIT);

   struct fdl6_view src;
   tu_image_view_copy<CHIP>(&src, src_image, src_format,
                            &info->imageSubresource, offset.z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);

      uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
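      /* Same 64-byte alignment workaround as in tu_copy_buffer_to_image(),
       * applied to the destination side.
       */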
      if ((dst_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
            ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
                        (VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
         coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
                         const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);

   for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
      tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
                                    pCopyImageToBufferInfo->pRegions + i);
}
TU_GENX(tu_CmdCopyImageToBuffer2);

/* Tiled formats don't support swapping, which means that we can't support
 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
 * formats like B5G5R5A1 have a separate linear-only format when sampling.
 * Currently we fake support for tiled swapped formats and use the unswapped
 * format instead, but this means that reinterpreting copies to and from
 * swapped formats can't be performed correctly unless we can swizzle the
 * components by reinterpreting the other image as the "correct" swapped
 * format, i.e. only when the other image is linear.
 */

static bool
is_swapped_format(enum pipe_format format)
{
   struct tu_native_format linear = blit_format_texture(format, TILE6_LINEAR);
   struct tu_native_format tiled = blit_format_texture(format, TILE6_3);
   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}

/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
 * versa). This should mirror the logic in fdl6_layout.
 */
static bool
image_is_r8g8(struct tu_image *image)
{
   return image->layout[0].cpp == 2 &&
          vk_format_get_nr_components(image->vk.format) == 2;
}

template <chip CHIP>
static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy2 *info)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;

   if (dst_image->layout[0].nr_samples > 1)
      ops = &r3d_ops<CHIP>;

   enum pipe_format format = PIPE_FORMAT_NONE;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;
   uint32_t layers_to_copy = MAX2(info->extent.depth,
                                  vk_image_subresource_layer_count(&src_image->vk,
                                                                   &info->srcSubresource));

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);

   enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
   enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);

   /* note: could use "R8_UNORM" when no UBWC */
   unsigned blit_param = 0;
   if (dst_format == PIPE_FORMAT_Y8_UNORM ||
       src_format == PIPE_FORMAT_Y8_UNORM ||
       tu_pipe_format_is_float16(src_format) ||
       tu_pipe_format_is_float16(dst_format)) {
      ops = &r3d_ops<CHIP>;
      blit_param = R3D_COPY;
   }

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
      use_staging_blit = true;
   }

   struct fdl6_view dst, src;
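
   /* The staging path copies twice: first into a linear, non-UBWC staging
    * buffer using the source format, then (after a manual flush/invalidate
    * between the dependent blits) from staging into the destination using
    * the destination format. A linear staging image can be safely
    * reinterpreted with both formats, which sidesteps the incompatibilities
    * detected above.
    */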
   if (use_staging_blit) {
      tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
      tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);

      struct fdl_layout staging_layout = { 0 };
      VkOffset3D staging_offset = { 0 };

      staging_layout.tile_mode = TILE6_LINEAR;
      staging_layout.ubwc = false;

      uint32_t layer_count =
         vk_image_subresource_layer_count(&src_image->vk,
                                          &info->srcSubresource);
      fdl6_layout(&staging_layout,
                  src_format,
                  src_image->layout[0].nr_samples,
                  extent.width,
                  extent.height,
                  extent.depth,
                  1,
                  layer_count,
                  extent.depth > 1,
                  NULL);

      struct tu_bo *staging_bo;
      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_layout.size,
                                          &staging_bo);
      if (result != VK_SUCCESS) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }

      struct fdl6_view staging;
      const struct fdl_layout *staging_layout_ptr = &staging_layout;
      const struct fdl_view_args copy_to_args = {
         .chip = CHIP,
         .iova = staging_bo->iova,
         .base_miplevel = 0,
         .level_count = 1,
         .base_array_layer = 0,
         .layer_count = layer_count,
         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
         .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
         .type = FDL_VIEW_TYPE_2D,
      };
      fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);

      ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, staging_offset, src_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
         ops->dst(cs, &staging, i, src_format);
         ops->run(cmd, cs);
      }

      /* When executed by the user there has to be a pipeline barrier here,
       * but since we're doing it manually we'll have to flush ourselves.
       */
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
      tu_cs_emit_wfi(cs);

      const struct fdl_view_args copy_from_args = {
         .chip = CHIP,
         .iova = staging_bo->iova,
         .base_miplevel = 0,
         .level_count = 1,
         .base_array_layer = 0,
         .layer_count = layer_count,
         .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
         .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
         .type = FDL_VIEW_TYPE_2D,
      };
      fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);

      ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
                 blit_param, false, dst_image->layout[0].ubwc,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, dst_offset, staging_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
         ops->dst(cs, &dst, i, dst_format);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
      tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);

      ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
                 blit_param, false, dst_image->layout[0].ubwc,
                 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
      coords(ops, cmd, cs, dst_offset, src_offset, extent);

      for (uint32_t i = 0; i < layers_to_copy; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
         ops->dst(cs, &dst, i, format);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
                 const VkCopyImageInfo2 *pCopyImageInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);

   for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
      if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageCopy2 info = pCopyImageInfo->pRegions[i];
         u_foreach_bit(b, info.dstSubresource.aspectMask) {
            info.srcSubresource.aspectMask = BIT(b);
            info.dstSubresource.aspectMask = BIT(b);
            tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
         }
         continue;
      }

      tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
                                   pCopyImageInfo->pRegions + i);
   }

   if (dst_image->lrz_height) {
      tu_disable_lrz(cmd, &cmd->cs, dst_image);
   }
}
TU_GENX(tu_CmdCopyImage2);

template <chip CHIP>
static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;
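   /* Treat the copy as a 1D blit: R32_UINT for dword-granular copies
    * (vkCmdUpdateBuffer below passes block_size = 4) and R8_UNORM for
    * byte-granular ones (vkCmdCopyBuffer2 passes block_size = 1).
    */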
   enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
              VK_SAMPLE_COUNT_1_BIT);

   while (blocks) {
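      /* Copy in passes: start each pass at the 64-byte-aligned address at
       * or below the pointer, folding the remainder into the x coordinate,
       * and keep x + width within 0x4000, presumably the widest blit the
       * 2D engine accepts here.
       */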
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
      ops->dst_buffer(cs, format, dst_va & ~63, 0, format);
      ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
                  const VkCopyBufferInfo2 *pCopyBufferInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);

   for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
      const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
      copy_buffer<CHIP>(cmd,
                        dst_buffer->iova + region->dstOffset,
                        src_buffer->iova + region->srcOffset,
                        region->size, 1);
   }
}
TU_GENX(tu_CmdCopyBuffer2);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
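
   /* Stage the data in GPU-visible command stream memory, rounded up to
    * whole 64-byte blocks so the source stays blitter-aligned, then copy
    * it with 4-byte granularity (the spec requires dataSize to be a
    * multiple of 4).
    */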
   struct tu_cs_memory tmp;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(tmp.map, pData, dataSize);
   copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4);
}
TU_GENX(tu_CmdUpdateBuffer);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer dstBuffer,
                 VkDeviceSize dstOffset,
                 VkDeviceSize fillSize,
                 uint32_t data)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;

   fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);

   uint64_t dst_va = buffer->iova + dstOffset;
   uint32_t blocks = fillSize / 4;

   ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
              VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
              VK_SAMPLE_COUNT_1_BIT);

   VkClearValue clear_val = {};
   clear_val.color.uint32[0] = data;
   ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);

   while (blocks) {
      uint32_t dst_x = (dst_va & 63) / 4;
      uint32_t width = MIN2(blocks, 0x4000 - dst_x);

      ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
      ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      dst_va += width * 4;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}
TU_GENX(tu_CmdFillBuffer);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
                    const VkResolveImageInfo2 *pResolveImageInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   struct tu_cs *cs = &cmd->cs;

   enum pipe_format src_format =
      tu_vk_format_to_pipe_format(src_image->vk.format);
   enum pipe_format dst_format =
      tu_vk_format_to_pipe_format(dst_image->vk.format);
   ops->setup(cmd, cs, src_format, dst_format,
              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
              VK_SAMPLE_COUNT_1_BIT);

   for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
      const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
      uint32_t layers = MAX2(info->extent.depth,
                             vk_image_subresource_layer_count(&dst_image->vk,
                                                              &info->dstSubresource));

      /* TODO: aspect masks possible? */

      coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);

      struct fdl6_view dst, src;
      tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
      tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);

      for (uint32_t i = 0; i < layers; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
         ops->dst(cs, &dst, i, src_format);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}
TU_GENX(tu_CmdResolveImage2);
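
/* Iterate either over the set bits of layer_mask (the multiview case) or
 * over all `layers` sequentially when layer_mask is 0. For example,
 * layer_mask = 0b101 visits layers 0 and 2.
 */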
#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
        layer++) \
      if (!layer_mask || (layer_mask & BIT(layer)))

template <chip CHIP>
static void
resolve_sysmem(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_src_format,
               VkFormat vk_dst_format,
               const struct tu_image_view *src,
               const struct tu_image_view *dst,
               uint32_t layer_mask,
               uint32_t layers,
               const VkRect2D *rect,
               bool src_separate_ds,
               bool dst_separate_ds)
{
   const struct blit_ops *ops = &r2d_ops<CHIP>;

   trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);

   enum pipe_format src_format = tu_vk_format_to_pipe_format(vk_src_format);
   enum pipe_format dst_format = tu_vk_format_to_pipe_format(vk_dst_format);

   ops->setup(cmd, cs, src_format, dst_format,
              VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
              VK_SAMPLE_COUNT_1_BIT);
   ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);

   for_each_layer(i, layer_mask, layers) {
      if (src_separate_ds) {
         if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
            r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
         } else {
            r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
         }
      } else {
         ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
      }

      if (dst_separate_ds) {
         if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
            ops->dst_depth(cs, dst, i);
         } else {
            ops->dst_stencil(cs, dst, i);
         }
      } else {
         ops->dst(cs, &dst->view, i, src_format);
      }

      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_resolve(&cmd->trace, cs);
}

template <chip CHIP>
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *src,
                  const struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect)
{
   assert(src->image->vk.format == dst->image->vk.format ||
          (vk_format_is_depth_or_stencil(src->image->vk.format) &&
           vk_format_is_depth_or_stencil(dst->image->vk.format)));

   bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
   bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;

   if (dst_separate_ds) {
      resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
                           src, dst, layer_mask, layers, rect,
                           src_separate_ds, dst_separate_ds);
      resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
                           src, dst, layer_mask, layers, rect,
                           src_separate_ds, dst_separate_ds);
   } else {
      resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
                           src, dst, layer_mask, layers, rect,
                           src_separate_ds, dst_separate_ds);
   }
}
TU_GENX(tu_resolve_sysmem);

template <chip CHIP>
static void
clear_image(struct tu_cmd_buffer *cmd,
            struct tu_image *image,
            const VkClearValue *clear_value,
            const VkImageSubresourceRange *range,
            VkImageAspectFlags aspect_mask)
{
   uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
   uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
   struct tu_cs *cs = &cmd->cs;
   enum pipe_format format;
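   /* E5B9G9R9 can't be written directly by the blitter (presumably not
    * renderable here), so clear a bit-identical R32_UINT view of it; the
    * clear value itself is still packed via PIPE_FORMAT_R9G9B9E5_FLOAT
    * below so the written bits match the real format.
    */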
   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
      format = PIPE_FORMAT_R32_UINT;
   } else {
      format = tu6_plane_format(image->vk.format,
                                tu6_plane_index(image->vk.format,
                                                aspect_mask));
   }

   if (image->layout[0].depth0 > 1) {
      assert(layer_count == 1);
      assert(range->baseArrayLayer == 0);
   }

   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;

   ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
              (VkSampleCountFlagBits) image->layout[0].nr_samples);
   if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
   else
      ops->clear_value(cmd, cs, format, clear_value);

   for (unsigned j = 0; j < level_count; j++) {
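      /* For 3D images the "layers" are the depth slices of this mip level. */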
      if (image->layout[0].depth0 > 1)
         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);

      ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
                  });

      struct fdl6_view dst;
      const VkImageSubresourceLayers subresource = {
         .aspectMask = aspect_mask,
         .mipLevel = range->baseMipLevel + j,
         .baseArrayLayer = range->baseArrayLayer,
         .layerCount = 1,
      };
      tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);

      for (uint32_t i = 0; i < layer_count; i++) {
         ops->dst(cs, &dst, i, format);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
                      VkImage image_h,
                      VkImageLayout imageLayout,
                      const VkClearColorValue *pColor,
                      uint32_t rangeCount,
                      const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++)
      clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}
TU_GENX(tu_CmdClearColorImage);

template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
                             VkImage image_h,
                             VkImageLayout imageLayout,
                             const VkClearDepthStencilValue *pDepthStencil,
                             uint32_t rangeCount,
                             const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++) {
      const VkImageSubresourceRange *range = &pRanges[i];

      if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* can't clear both depth and stencil at once, split up the aspect mask */
         u_foreach_bit(b, range->aspectMask)
            clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
         continue;
      }

      clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
   }

   tu_lrz_clear_depth_image(cmd, image, pDepthStencil, rangeCount, pRanges);
}
TU_GENX(tu_CmdClearDepthStencilImage);

template <chip CHIP>
static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
                            uint32_t attachment_count,
                            const VkClearAttachment *attachments,
                            uint32_t rect_count,
                            const VkClearRect *rects)
{
   /* The shader path here is special: it avoids changing MRT/etc. state. */
   const struct tu_subpass *subpass = cmd->state.subpass;
   const uint32_t mrt_count = subpass->color_count;
   struct tu_cs *cs = &cmd->draw_cs;
   uint32_t clear_value[MAX_RTS][4];
   float z_clear_val = 0.0f;
   uint8_t s_clear_val = 0;
   uint32_t clear_rts = 0, clear_components = 0;
   bool z_clear = false;
   bool s_clear = false;

   trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);

   for (uint32_t i = 0; i < attachment_count; i++) {
      uint32_t a;
      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = attachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         clear_rts |= 1 << c;
         clear_components |= 0xf << (c * 4);
         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
      } else {
         a = subpass->depth_stencil_attachment.attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            z_clear = true;
            z_clear_val = attachments[i].clearValue.depthStencil.depth;
         }

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            s_clear = true;
            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
         }
      }
   }

   /* We may not know the multisample count if there are no attachments, so
    * just bail early to avoid corner cases later.
    */
   if (clear_rts == 0 && !z_clear && !s_clear)
      return;

   /* disable all draw states so they don't interfere
    * TODO: use and re-use draw states
    * we have to disable draw states individually to preserve
    * input attachment states, because a secondary command buffer
    * won't be able to restore them
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
         continue;
      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
                     CP_SET_DRAW_STATE__0_DISABLE);
      tu_cs_emit_qw(cs, 0);
   }
   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));

   r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);

   /* Disable sample counting in order to not affect occlusion query. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
   }

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
   for (uint32_t i = 0; i < mrt_count; i++) {
      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
         .component_enable = COND(clear_rts & (1 << i), 0xf)));
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
      .z_test_enable = z_clear,
      .z_write_enable = z_clear,
      .zfunc = FUNC_ALWAYS));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
      .stencil_enable = s_clear,
      .func = FUNC_ALWAYS,
      .zpass = STENCIL_REPLACE));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));

   tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));

   unsigned num_rts = util_bitcount(clear_rts);
   uint32_t packed_clear_value[MAX_RTS][4];

   uint32_t idx = 0;
   u_foreach_bit(b, clear_rts) {
      memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
      idx++;
   }

   if (num_rts > 0)
      tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
                                0, packed_clear_value, num_rts);

   for (uint32_t i = 0; i < rect_count; i++) {
      /* This should be true because of this valid usage for
       * vkCmdClearAttachments:
       *
       *    "If the render pass instance this is recorded in uses multiview,
       *    then baseArrayLayer must be zero and layerCount must be one"
       */
      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);

      /* a630 doesn't support multiview masks, which means that we can't use
       * the normal multiview path without potentially recompiling a shader
       * on-demand or using a more complicated variant that takes the mask as
       * a const. Just use the layered path instead, since it shouldn't be
       * much worse.
       */
      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
      {
         const float coords[] = {
            rects[i].rect.offset.x,
            rects[i].rect.offset.y,
            z_clear_val,
            uif(rects[i].baseArrayLayer + layer),
            rects[i].rect.offset.x + rects[i].rect.extent.width,
            rects[i].rect.offset.y + rects[i].rect.extent.height,
            z_clear_val,
            1.0f,
         };

         r3d_coords_raw(cmd, cs, coords);
         r3d_run_vis(cmd, cs);
      }
   }

   /* Re-enable sample counting. */
   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));

   if (cmd->state.prim_generated_query_running_before_rp) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
   }

   trace_end_sysmem_clear_all(&cmd->trace, cs);
}
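
/* Pack a clear value into the raw bit layout the GMEM blit clear expects.
 * For example, clearing Z24S8 with depth = 1.0 and stencil = 0xff packs to
 * clear_value[0] = tu_pack_float32_for_unorm(1.0, 24) | (0xff << 24)
 *                = 0x00ffffff | 0xff000000 = 0xffffffff.
 */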
static void
pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
{
   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case PIPE_FORMAT_Z16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case PIPE_FORMAT_Z32_FLOAT:
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (util_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (util_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (util_format_is_pure_integer(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (util_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (util_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (util_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   case 0:
      assert(format == PIPE_FORMAT_A8_UNORM);
      PACK_F(a8_unorm);
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}

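/* Emit a single GMEM clear: program the blit destination format, the
 * per-component clear mask and the GMEM base offset, then write the packed
 * clear color and trigger a BLIT event. With .gmem = 1 the blit writes into
 * on-chip GMEM rather than system memory.
 */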
template <chip CHIP>
static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      enum pipe_format format,
                      uint8_t clear_mask,
                      uint32_t gmem_offset,
                      const VkClearValue *value)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
                     blit_base_format(format, false)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
   tu_cs_emit(cs, gmem_offset);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
   tu_cs_emit(cs, 0);

   uint32_t clear_vals[4] = {};
   pack_gmem_clear_value(value, format, clear_vals);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
   tu_cs_emit_array(cs, clear_vals, 4);

   tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
}

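/* Clear one render pass attachment in GMEM, layer by layer.
 * D32_SFLOAT_S8_UINT lives in two separate GMEM allocations (a Z32F plane
 * and an S8 plane), so its depth and stencil aspects are cleared with
 * separate blits at their own GMEM offsets.
 */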
template <chip CHIP>
static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              uint32_t attachment,
                              uint32_t base_layer,
                              uint32_t layers,
                              uint32_t layer_mask,
                              VkImageAspectFlags mask,
                              const VkClearValue *value)
{
   const struct tu_render_pass_attachment *att =
      &cmd->state.pass->attachments[attachment];

   trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);

   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));

   enum pipe_format format = tu_vk_format_to_pipe_format(att->format);
   for_each_layer(i, layer_mask, layers) {
      uint32_t layer = i + base_layer;
      if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
                                        tu_attachment_gmem_offset(cmd, att, layer), value);
         }
         if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
                                        tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
         }
      } else {
         clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
                                     tu_attachment_gmem_offset(cmd, att, layer), value);
      }
   }

   trace_end_gmem_clear(&cmd->trace, cs);
}

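/* GMEM path for vkCmdClearAttachments: for each clear rect, program the blit
 * scissor and then emit a per-attachment GMEM clear. The scissor is given in
 * whole-framebuffer coordinates; since the same draw_cs is replayed for every
 * bin, each replay only clears the intersection of the scissor with the
 * current tile.
 */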
template <chip CHIP>
static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                          uint32_t attachment_count,
                          const VkClearAttachment *attachments,
                          uint32_t rect_count,
                          const VkClearRect *rects)
{
   const struct tu_subpass *subpass = cmd->state.subpass;
   struct tu_cs *cs = &cmd->draw_cs;

   if (rect_count > 1)
      perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");

   for (unsigned i = 0; i < rect_count; i++) {
      unsigned x1 = rects[i].rect.offset.x;
      unsigned y1 = rects[i].rect.offset.y;
      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
      unsigned y2 = y1 + rects[i].rect.extent.height - 1;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));

      for (unsigned j = 0; j < attachment_count; j++) {
         uint32_t a;
         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
         else
            a = subpass->depth_stencil_attachment.attachment;

         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
                                             rects[i].layerCount,
                                             subpass->multiview_mask,
                                             attachments[j].aspectMask,
                                             &attachments[j].clearValue);
      }
   }
}

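/* vkCmdClearAttachments entry point. Roughly:
 *  - if predication is active, or we are a secondary command buffer that
 *    doesn't yet know the GMEM layout, always take the sysmem 3D-draw path;
 *  - if any involved attachment may have its load/store skipped based on
 *    binning visibility, also take the 3D path so the clear shows up in the
 *    visibility stream;
 *  - otherwise emit both variants under cond_exec: 2D GMEM blits for GMEM
 *    mode and a 3D draw for sysmem mode.
 */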
template <chip CHIP>
VKAPI_ATTR void VKAPI_CALL
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
                       uint32_t attachmentCount,
                       const VkClearAttachment *pAttachments,
                       uint32_t rectCount,
                       const VkClearRect *pRects)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* The sysmem path behaves like a draw. Note that we don't have a way of
    * using different flushes for sysmem/gmem, so this needs to be outside of
    * the cond_exec.
    */
   tu_emit_cache_flush_renderpass<CHIP>(cmd);

   for (uint32_t j = 0; j < attachmentCount; j++) {
      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
         continue;

      tu_lrz_disable_during_renderpass(cmd);
   }

   /* vkCmdClearAttachments is supposed to respect the predicate if active.
    * The easiest way to do this is to always use the 3d path, which always
    * works even with GMEM because it's just a simple draw using the existing
    * attachment state.
    *
    * Similarly, we also use the 3D path when in a secondary command buffer
    * that doesn't know the GMEM layout that will be chosen by the primary.
    */
   if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
      tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
      return;
   }

   /* If we could skip tile load/stores based on any draws intersecting them
    * at binning time, then emit the clear as a 3D draw so that it
    * contributes to that visibility.
    */
   const struct tu_subpass *subpass = cmd->state.subpass;
   for (uint32_t i = 0; i < attachmentCount; i++) {
      uint32_t a;
      if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = pAttachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
      } else {
         a = subpass->depth_stencil_attachment.attachment;
      }
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
         if (att->cond_load_allowed || att->cond_store_allowed) {
            tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
            return;
         }
      }
   }

   /* Otherwise, emit 2D blits for gmem rendering. */
   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
   tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
   tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);
}
TU_GENX(tu_CmdClearAttachments);

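/* Sysmem clear of a whole attachment at render pass start: clears the render
 * area with the 2D blit engine, falling back to a 3D draw for MSAA
 * attachments since the 2D engine can't render to multisampled surfaces.
 */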
template <chip CHIP>
static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        VkFormat vk_format,
                        VkImageAspectFlags clear_mask,
                        uint32_t a,
                        bool separate_ds)
{
   enum pipe_format format = tu_vk_format_to_pipe_format(vk_format);
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   const VkClearValue *value = &cmd->state.clear_values[a];
   if (cmd->state.pass->attachments[a].samples > 1)
      ops = &r3d_ops<CHIP>;

   trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
                            cmd->state.pass->attachments[a].samples);

   ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
              cmd->state.pass->attachments[a].samples);
   ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
               cmd->state.render_area.extent);
   ops->clear_value(cmd, cs, format, value);

   for_each_layer(i, clear_views, fb->layers) {
      if (separate_ds) {
         if (vk_format == VK_FORMAT_D32_SFLOAT) {
            ops->dst_depth(cs, iview, i);
         } else {
            ops->dst_stencil(cs, iview, i);
         }
      } else {
         ops->dst(cs, &iview->view, i, format);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);

   trace_end_sysmem_clear(&cmd->trace, cs);
}

template <chip CHIP>
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                       a, true);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                       a, true);
      }
   } else {
      clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
                                    a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, and therefore barriers
    * aren't required inside the subpass/renderpass. Therefore we need to
    * flush CCU color into CCU depth here, just like with
    * vkCmdClearAttachments(). Note that because this only happens at the
    * beginning of a renderpass, and renderpass writes are considered
    * "incoherent", we shouldn't have to worry about syncing depth into color
    * beforehand as depth should already be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_DEPTH);
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
   } else {
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
      tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
   }

   tu_cs_emit_wfi(cs);
}
TU_GENX(tu_clear_sysmem_attachment);

template <chip CHIP>
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
                                       attachment->clear_views,
                                       attachment->clear_mask,
                                       &cmd->state.clear_values[a]);
}
TU_GENX(tu_clear_gmem_attachment);

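/* Emit an event blit between GMEM and sysmem for one attachment: with
 * resolve = false this is a tile load (sysmem -> GMEM, .gmem = 1), with
 * resolve = true a tile store/resolve (GMEM -> sysmem). This is the fast
 * path, usable when the render area is aligned and the format can be
 * resolved by the blit event.
 */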
template <chip CHIP>
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
         .unk0 = !resolve,
         .gmem = !resolve,
         .sample_0 = vk_format_is_int(attachment->format) ||
                     vk_format_is_depth_or_stencil(attachment->format),
         .depth = vk_format_is_depth_or_stencil(attachment->format),));

   for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
      if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         if (!separate_stencil) {
            tu_cs_emit(cs, tu_image_view_depth(iview, RB_BLIT_DST_INFO));
            tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * i);
            tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

            tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
            tu_cs_image_flag_ref(cs, &iview->view, i);
         } else {
            tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
            tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * i);
            tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(iview->stencil_pitch).value);
         }
      } else {
         tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO);
         tu_cs_image_ref_2d<CHIP>(cs, &iview->view, i, false);

         tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
         tu_cs_image_flag_ref(cs, &iview->view, i);
      }

      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
         tu_cs_emit_regs(cs,
                         A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(cmd, attachment, i)));
      } else {
         tu_cs_emit_regs(cs,
                         A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(cmd, attachment, i)));
      }

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
      tu_cs_emit(cs, 0);

      tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
   }
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* blit event can only do resolve for simple cases:
    * averaging samples as unsigned integers or choosing only one sample.
    * Note this is allowed for SRGB formats, but results differ from 2D draw resolve.
    */
   if (vk_format_is_snorm(format))
      return false;

   /* can't do formats with larger channel sizes
    * note: this includes all float formats
    * note2: single channel integer formats seem OK
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* for unknown reasons blit event can't msaa resolve these formats when tiled,
    * likely related to these formats having a different layout from other cpp=2 formats
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   case VK_FORMAT_R8G8_SRGB:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

struct apply_load_coords_state {
   unsigned view;
};

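/* FDM patchpoint callback for tile loads. With fragment density scaling the
 * GMEM bin only holds bin.extent / frag_area pixels, so the load draw writes
 * a scaled_width x scaled_height destination region while sourcing the full
 * bin.extent region from sysmem. E.g. a 64x64 bin with a 2x2 fragment area
 * is loaded into a 32x32 region of the bin.
 */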
static void
fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      void *data,
                      VkRect2D bin,
                      unsigned views,
                      VkExtent2D *frag_areas)
{
   const struct apply_load_coords_state *state =
      (const struct apply_load_coords_state *)data;
   assert(state->view < views);
   VkExtent2D frag_area = frag_areas[state->view];

   assert(bin.extent.width % frag_area.width == 0);
   assert(bin.extent.height % frag_area.height == 0);
   uint32_t scaled_width = bin.extent.width / frag_area.width;
   uint32_t scaled_height = bin.extent.height / frag_area.height;

   const float coords[] = {
      bin.offset.x, bin.offset.y,
      bin.offset.x, bin.offset.y,
      bin.offset.x + scaled_width, bin.offset.y + scaled_height,
      bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}

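/* Load an attachment from sysmem into GMEM with a 3D draw instead of the
 * blit event. Used when the 3D load path is forced via TU_DEBUG(3D_LOAD), or
 * with FDM, where the per-bin coordinates have to be patched at run time via
 * fdm_apply_load_coords().
 */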
template <chip CHIP>
static void
load_3d_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *att,
             bool separate_stencil)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   enum pipe_format format = iview->view.format;
   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (separate_stencil)
         format = PIPE_FORMAT_S8_UINT;
      else
         format = PIPE_FORMAT_Z32_FLOAT;
   }
   r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
                   R3D_DST_GMEM, false, iview->view.ubwc_enabled,
                   iview->image->vk.samples);

   if (!cmd->state.pass->has_fdm) {
      r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
                 (VkExtent2D) { fb->width, fb->height });
   }

   /* Normal loads read directly from system memory, so we have to invalidate
    * UCHE in case it contains stale data.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
      if (cmd->state.pass->has_fdm) {
         struct apply_load_coords_state state = {
            .view = att->clear_views ? i : 0,
         };
         tu_create_fdm_bin_patchpoint(cmd, cs, 1 + 3 + 8, fdm_apply_load_coords, state);
      }

      r3d_dst_gmem(cmd, cs, iview, att, separate_stencil, i);

      if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         if (separate_stencil)
            r3d_src_stencil(cmd, cs, iview, i);
         else
            r3d_src_depth(cmd, cs, iview, i);
      } else {
         r3d_src_gmem_load(cmd, cs, iview, i);
      }

      r3d_run(cmd, cs);
   }

   r3d_teardown<CHIP>(cmd, cs);

   /* It seems we need to WFI here for depth/stencil because color writes here
    * aren't synchronized with depth/stencil writes.
    *
    * Note: the blob also uses a WFI for color attachments but this hasn't
    * been seen to be necessary.
    */
   if (vk_format_is_depth_or_stencil(att->format))
      tu_cs_emit_wfi(cs);
}

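/* Bracket a conditional tile load/store in CP_COND_REG_EXEC predication.
 * When TU_DEBUG(LOG_SKIP_GMEM_OPS) is set, a CP_MEM_TO_MEM inside the
 * predicated region bumps dbg_gmem_taken_{loads,stores} (it only executes
 * when the predicate passes), while the matching _end() bumps
 * dbg_gmem_total_{loads,stores} unconditionally; comparing the two counters
 * then shows how many loads/stores were actually skipped.
 */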
static void
tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs, bool load)
{
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));

   if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
      return;

   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_taken_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_taken_stores);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

static void
tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
                            struct tu_cs *cs, bool load)
{
   tu_cond_exec_end(cs);

   if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
      return;

   uint64_t result_iova;
   if (load)
      result_iova = global_iova(cmd, dbg_gmem_total_loads);
   else
      result_iova = global_iova(cmd, dbg_gmem_total_stores);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
}

template <chip CHIP>
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool cond_exec_allowed,
                        bool force_load)
{
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   bool load_common = attachment->load || force_load;
   bool load_stencil =
      attachment->load_stencil ||
      (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);

   if (!load_common && !load_stencil)
      return;

   trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);

   /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
    * to be only partially cleared, and since that clear is done by a 2D blit
    * it doesn't produce geometry, so we have to load unconditionally.
    *
    * To simplify the conditions, treat a partially cleared separate DS
    * attachment as fully cleared and don't emit the cond_exec.
    */
   bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
   if (cond_exec)
      tu_begin_load_store_cond_exec(cmd, cs, true);

   if (TU_DEBUG(3D_LOAD) ||
       cmd->state.pass->has_fdm) {
      if (load_common || load_stencil)
         tu_disable_draw_states(cmd, cs);

      if (load_common)
         load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);

      if (load_stencil)
         load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
   } else {
      if (load_common)
         tu_emit_blit<CHIP>(cmd, cs, iview, attachment, false, false);

      if (load_stencil)
         tu_emit_blit<CHIP>(cmd, cs, iview, attachment, false, true);
   }

   if (cond_exec)
      tu_end_load_store_cond_exec(cmd, cs, true);

   trace_end_gmem_load(&cmd->trace, cs);
}
TU_GENX(tu_load_gmem_attachment);

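/* Store one layer of an attachment from GMEM to sysmem with the 2D blit
 * engine (CP_BLIT). The source is GMEM itself, addressed as a tiled
 * (TILE6_2) buffer at gmem_base + gmem_offset with a pitch of one tile row
 * (tile0.width * cpp). This is the unaligned-store path, where the blit
 * event can't be used.
 */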
template <chip CHIP>
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              uint32_t layer,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
                          VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
                          iview->view.ubwc_enabled, true);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r2d_dst_depth(cs, iview, layer);
      } else {
         r2d_dst_stencil(cs, iview, layer);
      }
   } else {
      r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
   }

   enum a6xx_format fmt = blit_format_texture(src_format, TILE6_2).fmt;
   fixup_src_format(&src_format, dst_format, &fmt);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = fmt,
                      .tile_mode = TILE6_2,
                      .color_swap = WZYX,
                      .srgb = util_format_is_srgb(src_format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !util_format_is_pure_integer(dst_format) &&
                                         !util_format_is_depth_or_stencil(dst_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
                   SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r2d_run(cmd, cs);

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);
}

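/* MSAA variant of the unaligned store: a sample-accurate store needs a 3D
 * draw, since the 2D engine can't write individual samples. Binning state
 * (RB/GRAS_BIN_CONTROL) is saved to scratch registers around the draw
 * because it is normally programmed only once per render pass.
 */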
template <chip CHIP>
static void
store_3d_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              VkSampleCountFlagBits dst_samples,
              bool separate_stencil,
              enum pipe_format src_format,
              enum pipe_format dst_format,
              const VkRect2D *render_area,
              uint32_t layer,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
    * aren't set until we know whether we're HW binning or not, and we want to
    * avoid a dependence on that here to be able to store attachments before
    * the end of the renderpass in the future. Use the scratch space to
    * save/restore them dynamically.
    */
   tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
   tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_REG_TO_SCRATCH_0_SCRATCH(0) |
                  CP_REG_TO_SCRATCH_0_CNT(1 - 1));
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
      tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
                     CP_REG_TO_SCRATCH_0_SCRATCH(1) |
                     CP_REG_TO_SCRATCH_0_CNT(1 - 1));
   }

   r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
                   0, false, iview->view.ubwc_enabled, dst_samples);

   r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);

   if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (!separate_stencil) {
         r3d_dst_depth(cs, iview, layer);
      } else {
         r3d_dst_stencil(cs, iview, layer);
      }
   } else {
      r3d_dst(cs, &iview->view, layer, src_format);
   }

   r3d_src_gmem(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);

   /* sync GMEM writes with CACHE. */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   r3d_run(cmd, cs);

   r3d_teardown<CHIP>(cmd, cs);

   /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here. The 3d blit path
    * writes to depth images as a color RT, so there's no need to flush depth.
    */
   tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_FLUSH_COLOR);

   /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
   tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
                  CP_SCRATCH_TO_REG_0_SCRATCH(0) |
                  CP_SCRATCH_TO_REG_0_CNT(1 - 1));

   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
      tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
                     CP_SCRATCH_TO_REG_0_SCRATCH(1) |
                     CP_SCRATCH_TO_REG_0_CNT(1 - 1));
   }
}

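/* Whether the render area forces the unaligned (CP_BLIT) store path: the
 * blit event can only store regions aligned to gmem_align_w x gmem_align_h,
 * so e.g. a render area with offset.x == 1 can't use it. Edges that touch
 * the image size are exempt, since the store may spill into padding.
 */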
static bool
tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_image_view *iview = cmd->state.attachments[a];
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Unaligned stores are incredibly rare in CTS, so we have to force them
    * to test this path. */
   if (TU_DEBUG(UNALIGNED_STORE))
      return true;

   /* We always use the unaligned store path when scaling rendering. */
   if (cmd->state.pass->has_fdm)
      return true;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 can be unaligned if equal to the size of the image, since it will
    * write into padding space. The one exception is linear levels which don't
    * have the required y padding in the layout (except for the last level)
    */
   bool need_y2_align =
      y2 != iview->view.height || iview->view.need_y2_align;

   return (x1 % phys_dev->info->gmem_align_w ||
           (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
           y1 % phys_dev->info->gmem_align_h ||
           (y2 % phys_dev->info->gmem_align_h && need_y2_align));
}

/* Choose the GMEM layout (use the CCU space or not) based on whether the
 * current attachments will need it. This has to happen at vkBeginRenderPass()
 * time because tu_attachment_store_unaligned() looks at the image views,
 * which are only available at that point. This should match the logic for
 * the !unaligned case in tu_store_gmem_attachment().
 */
void
tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
{
   cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;

   for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
      if (!cmd->state.attachments[i])
         continue;

      struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[i];
      if ((att->store || att->store_stencil) &&
          tu_attachment_store_unaligned(cmd, i))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
      if (att->will_be_resolved && !blit_can_resolve(att->format))
         cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
   }

   cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
}

struct apply_store_coords_state {
   unsigned view;
};

static void
fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       void *data,
                       VkRect2D bin,
                       unsigned views,
                       VkExtent2D *frag_areas)
{
   const struct apply_store_coords_state *state =
      (const struct apply_store_coords_state *)data;
   assert(state->view < views);
   VkExtent2D frag_area = frag_areas[state->view];

   /* The bin width/height must be a multiple of the frag_area to make sure
    * that the scaling happens correctly. This means some destination pixels
    * may jut out of the framebuffer, but they should be clipped by the
    * render area.
    */
   assert(bin.extent.width % frag_area.width == 0);
   assert(bin.extent.height % frag_area.height == 0);
   uint32_t scaled_width = bin.extent.width / frag_area.width;
   uint32_t scaled_height = bin.extent.height / frag_area.height;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
                                       .y = bin.offset.y),
                   A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
                                       .y = bin.offset.y + bin.extent.height - 1));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
                   A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
                   A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
}

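/* Store an attachment from GMEM to sysmem at the end of a render pass,
 * resolving to attachment 'a' from the (possibly MSAA) attachment 'gmem_a'.
 * Fast path: the BLIT event, when the render area is aligned and the format
 * can be resolved by it. Otherwise fall back to per-layer CP_BLIT stores, or
 * to 3D draws when the destination itself is multisampled.
 */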
template <chip CHIP>
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a,
                         uint32_t layers,
                         uint32_t layer_mask,
                         bool cond_exec_allowed)
{
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   bool unaligned = tu_attachment_store_unaligned(cmd, a);

   /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes,
    * one for depth and the other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   /* The fast path doesn't support picking out the last component of a D24S8
    * texture reinterpreted as RGBA8_UNORM.
    */
   bool resolve_d24s8_s8 =
      src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   bool store_common = dst->store && !resolve_d32s8_s8;
   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;

   bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
                        (a == gmem_a || blit_can_resolve(dst->format));

   trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);

   /* Unconditional store should happen only if attachment was cleared,
    * which could have happened either by load_op or via vkCmdClearAttachments.
    */
   bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
   if (cond_exec) {
      tu_begin_load_store_cond_exec(cmd, cs, false);
   }

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (use_fast_path) {
      if (store_common)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, true, false);
      if (store_separate_stencil)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, true, true);

      if (cond_exec) {
         tu_end_load_store_cond_exec(cmd, cs, false);
      }

      trace_end_gmem_store(&cmd->trace, cs);
      return;
   }

   assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);

   enum pipe_format src_format = tu_vk_format_to_pipe_format(src->format);
   if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      src_format = PIPE_FORMAT_Z32_FLOAT;

   enum pipe_format dst_format = tu_vk_format_to_pipe_format(dst->format);
   if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      dst_format = PIPE_FORMAT_Z32_FLOAT;

   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (store_common || store_separate_stencil)
         tu_disable_draw_states(cmd, cs);

      for_each_layer(i, layer_mask, layers) {
         if (store_common) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
                                dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
         }
         if (store_separate_stencil) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                                PIPE_FORMAT_S8_UINT, render_area, i,
                                tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
         }
      }
   } else {
      if (!cmd->state.pass->has_fdm) {
         r2d_coords(cmd, cs, render_area->offset, render_area->offset,
                    render_area->extent);
      } else {
         /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
          * area and the coordinates span the entire render area, but for
          * FDM we need to scale the coordinates, so we take the opposite
          * approach, specifying the exact bin size in the destination
          * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the
          * render area.
          */
         tu_cs_emit_regs(cs,
                         A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
                                                     .y = render_area->offset.y,),
                         A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
                                                     .y = render_area->offset.y + render_area->extent.height - 1,));
      }

      for_each_layer (i, layer_mask, layers) {
         if (cmd->state.pass->has_fdm) {
            unsigned view = layer_mask ? i : 0;
            struct apply_store_coords_state state = {
               .view = view,
            };
            tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
                                         state);
         }
         if (store_common) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
                                dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
         }
         if (store_separate_stencil) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                                PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
         }
      }
   }

   if (cond_exec) {
      tu_end_load_store_cond_exec(cmd, cs, false);
   }

   trace_end_gmem_store(&cmd->trace, cs);
}
TU_GENX(tu_store_gmem_attachment);
