/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "tu_private.h"

#include "tu_cs.h"
#include "vk_format.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"

static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}
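
/* For illustration: tu_pack_float32_for_unorm(0.5f, 8) == 128, since
 * 0.5 * 255 = 127.5 rounds-to-even up to 128, and inputs are clamped to
 * [0, 1] first, so tu_pack_float32_for_unorm(2.0f, 8) == 255.
 */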

/* r2d_ = BLIT_OP_SCALE operations */

static enum a6xx_2d_ifmt
format_to_ifmt(VkFormat format)
{
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT)
      return R2D_FLOAT32;
   if (format == VK_FORMAT_S8_UINT)
      return R2D_INT8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = vk_format_is_int(format);
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (vk_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
      return 0;
   }
}
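
/* Example mappings implied by the switch above: R5G6B5_UNORM has a 5-bit
 * red channel and resolves to R2D_UNORM8, R16_SFLOAT to R2D_FLOAT16, and
 * R16_UNORM to R2D_FLOAT32 (16-bit normalized values can't be represented
 * exactly in a 16-bit float).
 */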

static void
r2d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst->x, .y = dst->y),
      A6XX_GRAS_2D_DST_BR(.x = dst->x + extent->width - 1, .y = dst->y + extent->height - 1));

   if (!src)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src->x),
                   A6XX_GRAS_2D_SRC_BR_X(src->x + extent->width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src->y),
                   A6XX_GRAS_2D_SRC_BR_Y(src->y + extent->height - 1));
}
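
/* Note the BR coordinates above are inclusive: a 32x32 blit to the origin
 * programs DST_TL = (0, 0) and DST_BR = (31, 31).
 */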

static void
r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!vk_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = vk_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
                      format == VK_FORMAT_B10G11R11_UFLOAT_PACK32));

      for (unsigned i = 0; i < desc->nr_channels; i++) {
         const struct util_format_channel_description *ch = &desc->channel[i];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}
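
/* Worked example for the D24S8 case above: clearing depth = 0.5 and
 * stencil = 0x80 packs tmp = 0x800000, so the solid color registers get
 * the three depth bytes {0x00, 0x00, 0x80} in .xyz and 0x80 in .w (the
 * R8G8B8A8 view presumably consumes the low 8 bits of each register).
 */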

static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format),
                      .unk20 = 1,
                      .unk22 = 1),
                   A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height),
                   A6XX_SP_PS_2D_SRC_LO((uint32_t) va),
                   A6XX_SP_PS_2D_SRC_HI(va >> 32),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = pitch));
}

static void
r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, iview->RB_2D_DST_INFO);
   tu_cs_image_ref_2d(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS_LO, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, iview->stencil_PITCH);
}

static void
r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = format.fmt,
                      .color_swap = format.swap,
                      .srgb = vk_format_is_srgb(vk_format)),
                   A6XX_RB_2D_DST_LO((uint32_t) va),
                   A6XX_RB_2D_DST_HI(va >> 32),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   enum a6xx_format format = tu6_base_format(vk_format);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format);
   uint32_t unknown_8c01 = 0;

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   /* note: the only format with partial clearing is D24S8 */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
      .scissor = scissor,
      .rotate = rotation,
      .solid_color = clear,
      .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
      .color_format = format,
      .mask = 0xf,
      .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt,
   ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (format == FMT6_10_10_10_2_UNORM_DEST)
      format = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT(
      .sint = vk_format_is_sint(vk_format),
      .uint = vk_format_is_uint(vk_format),
      .color_format = format,
      .srgb = vk_format_is_srgb(vk_format),
      .mask = 0xf));
}

static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear,
          bool ubwc)
{
   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   r2d_setup_common(cmd, cs, vk_format, aspect_mask, rotation, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));
}

/* r3d_ = shader path operations */

void
tu_init_clear_blit_shaders(struct tu6_global *global)
{
#define MOV(args...) { .cat1 = { .opc_cat = 1, .src_type = TYPE_S32, .dst_type = TYPE_S32, args } }
#define CAT2(op, args...) { .cat2 = { .opc_cat = 2, .opc = (op) & 63, .full = 1, args } }
#define CAT3(op, args...) { .cat3 = { .opc_cat = 3, .opc = (op) & 63, args } }

   static const instr_t vs_code[] = {
      /* r0.xyz = r0.w ? c1.xyz : c0.xyz
       * r1.xy = r0.w ? c1.zw : c0.zw
       * r0.w = 1.0f
       */
      CAT3(OPC_SEL_B32, .repeat = 2, .dst = 0,
           .c1 = {.src1_c = 1, .src1 = 4}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 0}),
      CAT3(OPC_SEL_B32, .repeat = 1, .dst = 4,
           .c1 = {.src1_c = 1, .src1 = 6}, .src1_r = 1,
           .src2 = 3,
           .c2 = {.src3_c = 1, .dummy = 1, .src3 = 2}),
      MOV(.dst = 3, .src_im = 1, .fim_val = 1.0f ),
      { .cat0 = { .opc = OPC_END } },
   };

   static const instr_t fs_blit[] = {
      /* "bary.f (ei)r63.x, 0, r0.x" note: the blob doesn't have this in its
       * blit path (it's not clear what allows it to omit it)
       */
      CAT2(OPC_BARY_F, .ei = 1, .full = 1, .dst = 63 * 4, .src1_im = 1),
      { .cat0 = { .opc = OPC_END } },
   };

   memcpy(&global->shaders[GLOBAL_SH_VS], vs_code, sizeof(vs_code));
   memcpy(&global->shaders[GLOBAL_SH_FS_BLIT], fs_blit, sizeof(fs_blit));

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      instr_t *code = global->shaders[GLOBAL_SH_FS_CLEAR0 + num_rts];
      for (uint32_t i = 0; i < num_rts; i++) {
         /* (rpt3)mov.s32s32 r0.x, (r)c[i].x */
         *code++ = (instr_t) MOV(.repeat = 3, .dst = i * 4, .src_c = 1, .src_r = 1, .src = i * 4);
      }
      *code++ = (instr_t) { .cat0 = { .opc = OPC_END } };
   }
}
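
/* For example, with num_rts = 2 the generated clear FS is:
 *    (rpt3)mov.s32s32 r0.x, (r)c[0].x
 *    (rpt3)mov.s32s32 r1.x, (r)c[1].x
 *    end
 * i.e. one repeated mov per render target, copying each clear value from
 * the consts loaded by r3d_clear_value() into that RT's output registers.
 */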

static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, uint32_t num_rts,
           bool layered_clear)
{
   struct ir3_const_state dummy_const_state = {};
   struct ir3_shader dummy_shader = {};

   struct ir3_shader_variant vs = {
      .type = MESA_SHADER_VERTEX,
      .instrlen = 1,
      .constlen = 4,
      .info.max_reg = 1,
      .inputs_count = 1,
      .inputs[0] = {
         .slot = SYSTEM_VALUE_VERTEX_ID,
         .regid = regid(0, 3),
         .sysval = true,
      },
      .outputs_count = blit ? 2 : 1,
      .outputs[0] = {
         .slot = VARYING_SLOT_POS,
         .regid = regid(0, 0),
      },
      .outputs[1] = {
         .slot = VARYING_SLOT_VAR0,
         .regid = regid(1, 0),
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };
   if (layered_clear) {
      vs.outputs[1].slot = VARYING_SLOT_LAYER;
      vs.outputs[1].regid = regid(1, 1);
      vs.outputs_count = 2;
   }

   struct ir3_shader_variant fs = {
      .type = MESA_SHADER_FRAGMENT,
      .instrlen = 1, /* max of 9 instructions with num_rts = 8 */
      .constlen = align(num_rts, 4),
      .info.max_reg = MAX2(num_rts, 1) - 1,
      .total_in = blit ? 2 : 0,
      .num_samp = blit ? 1 : 0,
      .inputs_count = blit ? 2 : 0,
      .inputs[0] = {
         .slot = VARYING_SLOT_VAR0,
         .inloc = 0,
         .compmask = 3,
         .bary = true,
      },
      .inputs[1] = {
         .slot = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL,
         .regid = regid(0, 0),
         .sysval = 1,
      },
      .num_sampler_prefetch = blit ? 1 : 0,
      .sampler_prefetch[0] = {
         .src = 0,
         .wrmask = 0xf,
         .cmd = 4,
      },
      .shader = &dummy_shader,
      .const_state = &dummy_const_state,
   };

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
      .vs_state = true,
      .hs_state = true,
      .ds_state = true,
      .gs_state = true,
      .fs_state = true,
      .cs_state = true,
      .gfx_ibo = true,
      .cs_ibo = true,
      .gfx_shared_const = true,
      .gfx_bindless = 0x1f,
      .cs_bindless = 0x1f));

   tu6_emit_xs_config(cs, MESA_SHADER_VERTEX, &vs, global_iova(cmd, shaders[GLOBAL_SH_VS]));
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_CTRL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_TESS_EVAL, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_GEOMETRY, NULL, 0);
   tu6_emit_xs_config(cs, MESA_SHADER_FRAGMENT, &fs,
                      global_iova(cmd, shaders[blit ? GLOBAL_SH_FS_BLIT : (GLOBAL_SH_FS_CLEAR0 + num_rts)]));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   tu_cs_emit_regs(cs, A6XX_VFD_CONTROL_0());

   /* Copy what the blob does here. This will emit an extra 0x3f
    * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
    * this is working around yet.
    */
   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
   tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
   tu_cs_emit(cs, 0);
   tu_cs_emit_regs(cs, A6XX_VFD_MULTIVIEW_CNTL());

   tu6_emit_vpc(cs, &vs, NULL, NULL, NULL, &fs, 0, false);

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_fs_inputs(cs, &fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .persp_division_disable = 1,
                      .vp_xform_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .clip_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, A6XX_PC_RASTER_CNTL());
   tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());
}

static void
r3d_coords_raw(struct tu_cs *cs, const float *coords)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_GEOM, 3 + 8);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(2));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   tu_cs_emit_array(cs, (const uint32_t *) coords, 8);
}

static void
r3d_coords(struct tu_cs *cs,
           const VkOffset2D *dst,
           const VkOffset2D *src,
           const VkExtent2D *extent)
{
   int32_t src_x1 = src ? src->x : 0;
   int32_t src_y1 = src ? src->y : 0;
   r3d_coords_raw(cs, (float[]) {
      dst->x, dst->y,
      src_x1, src_y1,
      dst->x + extent->width, dst->y + extent->height,
      src_x1 + extent->width, src_y1 + extent->height,
   });
}
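
/* The 8 floats above land in VS consts c0-c1 (NUM_UNIT(2) in
 * r3d_coords_raw): c0 holds the first vertex's dst.xy/src.xy and c1 the
 * second's, and the SEL_B32 instructions in vs_code pick between them
 * based on the vertex id, yielding the two corners of the RECTLIST.
 */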

static void
r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val)
{
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 8 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f));
      tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f));
   } break;
   case VK_FORMAT_D16_UNORM:
   case VK_FORMAT_D32_SFLOAT:
      tu_cs_emit(cs, fui(val->depthStencil.depth));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   case VK_FORMAT_S8_UINT:
      tu_cs_emit(cs, val->depthStencil.stencil & 0xff);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!vk_format_is_depth_or_stencil(format));
      tu_cs_emit_array(cs, val->color.uint32, 4);
      break;
   }
}

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      0x1 | /* XXX used by blob, doesn't seem necessary */
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_SAMP_LO, 2);
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct tu_image_view *iview,
        uint32_t layer,
        VkFilter filter)
{
   r3d_src_common(cmd, cs, iview->descriptor,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               VkFormat vk_format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR);

   desc[0] =
      COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(format.fmt) |
      A6XX_TEX_CONST_0_SWAP(format.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      // XXX to swizzle into .w for stencil buffer_to_image
      A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
   tu_cs_image_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
   tu_cs_image_flag_ref(cs, iview, layer);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled));
}

static void
r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu6_emit_msaa(cs, iview->image->layout[0].nr_samples); /* TODO: move to setup */

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_MRT_BUF_INFO));
   tu_cs_image_stencil_ref(cs, iview, layer);
   tu_cs_emit(cs, 0);

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static void
r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch)
{
   struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR);

   tu6_emit_msaa(cs, 1); /* TODO: move to setup */

   tu_cs_emit_regs(cs,
                   A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap),
                   A6XX_RB_MRT_PITCH(0, pitch),
                   A6XX_RB_MRT_ARRAY_PITCH(0, 0),
                   A6XX_RB_MRT_BASE_LO(0, (uint32_t) va),
                   A6XX_RB_MRT_BASE_HI(0, va >> 32),
                   A6XX_RB_MRT_BASE_GMEM(0, 0));

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL());
}

static uint8_t
aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask)
{
   uint8_t mask = 0xf;
   assert(aspect_mask);
   /* note: the only format with partial writing is D24S8,
    * clear/blit uses the _AS_R8G8B8A8 format to access it
    */
   if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         mask = 0x7;
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         mask = 0x8;
   }
   return mask;
}
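
/* e.g. writing only the depth aspect of D24S8 yields 0x7 (the three RGB
 * channels of the _AS_R8G8B8A8 view holding the 24 depth bits), and
 * writing only the stencil aspect yields 0x8 (the alpha channel).
 */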

static void
r3d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          VkFormat vk_format,
          VkImageAspectFlags aspect_mask,
          enum a6xx_rotation rotation,
          bool clear,
          bool ubwc)
{
   enum a6xx_format format = tu6_base_format(vk_format);

   if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
        vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) {
      format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
   }

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
      tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.dword = 0xc00000));
   tu_cs_emit_regs(cs, A6XX_RB_BIN_CONTROL(.dword = 0xc00000));

   r3d_common(cmd, cs, !clear, clear ? 1 : 0, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(1));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 1);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(0));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());

   tu_cs_emit_regs(cs, A6XX_RB_RENDER_COMPONENTS(.rt0 = 0xf));
   tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf));

   tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
                        .color_format = format,
                        .color_sint = vk_format_is_sint(vk_format),
                        .color_uint = vk_format_is_uint(vk_format)));

   tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
                        .component_enable = aspect_write_mask(vk_format, aspect_mask)));
   tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format)));
   tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format)));

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 0);
   }
}

static void
r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
                  CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
                  CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
   tu_cs_emit(cs, 1); /* instance count */
   tu_cs_emit(cs, 2); /* vertex count */
}

static void
r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.predication_active) {
      tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
      tu_cs_emit(cs, 1);
   }
}

/* blit ops - common interface for 2d/shader paths */

struct blit_ops {
   void (*coords)(struct tu_cs *cs,
                  const VkOffset2D *dst,
                  const VkOffset2D *src,
                  const VkExtent2D *extent);
   void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val);
   void (*src)(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const struct tu_image_view *iview,
               uint32_t layer,
               VkFilter filter);
   void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                      VkFormat vk_format,
                      uint64_t va, uint32_t pitch,
                      uint32_t width, uint32_t height);
   void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
   void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch);
   void (*setup)(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 VkFormat vk_format,
                 VkImageAspectFlags aspect_mask,
                 enum a6xx_rotation rotation,
                 bool clear,
                 bool ubwc);
   void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
   void (*teardown)(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs);
};

static const struct blit_ops r2d_ops = {
   .coords = r2d_coords,
   .clear_value = r2d_clear_value,
   .src = r2d_src,
   .src_buffer = r2d_src_buffer,
   .dst = r2d_dst,
   .dst_buffer = r2d_dst_buffer,
   .setup = r2d_setup,
   .run = r2d_run,
   .teardown = r2d_teardown,
};

static const struct blit_ops r3d_ops = {
   .coords = r3d_coords,
   .clear_value = r3d_clear_value,
   .src = r3d_src,
   .src_buffer = r3d_src_buffer,
   .dst = r3d_dst,
   .dst_buffer = r3d_dst_buffer,
   .setup = r3d_setup,
   .run = r3d_run,
   .teardown = r3d_teardown,
};

/* passthrough set coords from 3D extents */
static void
coords(const struct blit_ops *ops,
       struct tu_cs *cs,
       const VkOffset3D *dst,
       const VkOffset3D *src,
       const VkExtent3D *extent)
{
   ops->coords(cs, (const VkOffset2D*) dst, (const VkOffset2D*) src, (const VkExtent2D*) extent);
}

static VkFormat
copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer)
{
   if (vk_format_is_compressed(format)) {
      switch (vk_format_get_blocksize(format)) {
      case 1: return VK_FORMAT_R8_UINT;
      case 2: return VK_FORMAT_R16_UINT;
      case 4: return VK_FORMAT_R32_UINT;
      case 8: return VK_FORMAT_R32G32_UINT;
      case 16: return VK_FORMAT_R32G32B32A32_UINT;
      default:
         unreachable("unhandled format size");
      }
   }

   switch (format) {
   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
      if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
         return VK_FORMAT_R8G8_UNORM;
      /* fallthrough */
   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
      return VK_FORMAT_R8_UNORM;
   case VK_FORMAT_D24_UNORM_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer)
         return VK_FORMAT_R8_UNORM;
      /* fallthrough */
   default:
      return format;
   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_UINT;
   case VK_FORMAT_D32_SFLOAT_S8_UINT:
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         return VK_FORMAT_S8_UINT;
      assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
      return VK_FORMAT_D32_SFLOAT;
   }
}
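
/* Examples: BC1 blocks are 8 bytes, so BC1 copies are performed as
 * VK_FORMAT_R32G32_UINT; a stencil-aspect buffer copy of
 * D24_UNORM_S8_UINT uses VK_FORMAT_R8_UNORM so only the stencil byte of
 * each texel is transferred.
 */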

void
tu6_clear_lrz(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image *image,
              const VkClearValue *value)
{
   const struct blit_ops *ops = &r2d_ops;

   ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, ROTATE_0, true, false);
   ops->clear_value(cs, VK_FORMAT_D16_UNORM, value);
   ops->dst_buffer(cs, VK_FORMAT_D16_UNORM,
                   image->bo->iova + image->bo_offset + image->lrz_offset,
                   image->lrz_pitch * 2);
   ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height});
   ops->run(cmd, cs);
   ops->teardown(cmd, cs);
}

static void
tu_image_view_copy_blit(struct tu_image_view *iview,
                        struct tu_image *image,
                        VkFormat format,
                        const VkImageSubresourceLayers *subres,
                        uint32_t layer,
                        bool stencil_read)
{
   VkImageAspectFlags aspect_mask = subres->aspectMask;

   /* always use the AS_R8G8B8A8 format for these */
   if (format == VK_FORMAT_D24_UNORM_S8_UINT ||
       format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
   }

   tu_image_view_init(iview, &(VkImageViewCreateInfo) {
      .image = tu_image_to_handle(image),
      .viewType = VK_IMAGE_VIEW_TYPE_2D,
      .format = format,
      /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */
      .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R,
      .subresourceRange = {
         .aspectMask = aspect_mask,
         .baseMipLevel = subres->mipLevel,
         .levelCount = 1,
         .baseArrayLayer = subres->baseArrayLayer + layer,
         .layerCount = 1,
      },
   }, false);
}

static void
tu_image_view_copy(struct tu_image_view *iview,
                   struct tu_image *image,
                   VkFormat format,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer,
                   bool stencil_read)
{
   format = copy_format(format, subres->aspectMask, false);
   tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read);
}

static void
tu_image_view_blit(struct tu_image_view *iview,
                   struct tu_image *image,
                   const VkImageSubresourceLayers *subres,
                   uint32_t layer)
{
   tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false);
}

static void
tu6_blit_image(struct tu_cmd_buffer *cmd,
               struct tu_image *src_image,
               struct tu_image *dst_image,
               const VkImageBlit *info,
               VkFilter filter)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers;

   /* 2D blit can't do rotation mirroring from just coordinates */
   static const enum a6xx_rotation rotate[2][2] = {
      {ROTATE_0, ROTATE_HFLIP},
      {ROTATE_VFLIP, ROTATE_180},
   };

   bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
                   (info->dstOffsets[1].x < info->dstOffsets[0].x);
   bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
                   (info->dstOffsets[1].y < info->dstOffsets[0].y);
   bool mirror_z = (info->srcOffsets[1].z < info->srcOffsets[0].z) !=
                   (info->dstOffsets[1].z < info->dstOffsets[0].z);

   if (mirror_z) {
      tu_finishme("blit z mirror\n");
      return;
   }

   if (info->srcOffsets[1].z - info->srcOffsets[0].z !=
       info->dstOffsets[1].z - info->dstOffsets[0].z) {
      tu_finishme("blit z filter\n");
      return;
   }

   layers = info->srcOffsets[1].z - info->srcOffsets[0].z;
   if (info->dstSubresource.layerCount > 1) {
      assert(layers <= 1);
      layers = info->dstSubresource.layerCount;
   }

   /* BC1_RGB_* formats need to have their last components overridden with 1
    * when sampling, which is normally handled with the texture descriptor
    * swizzle. The 2d path can't handle that, so use the 3d path.
    *
    * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
    * the 2d path.
    */

   if (dst_image->layout[0].nr_samples > 1 ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
       src_image->vk_format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
       filter == VK_FILTER_CUBIC_EXT)
      ops = &r3d_ops;

   /* use the right format in setup() for D32_S8
    * TODO: this probably should use a helper
    */
   VkFormat format = dst_image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT)
         format = VK_FORMAT_D32_SFLOAT;
      else if (info->dstSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT)
         format = VK_FORMAT_S8_UINT;
      else
         unreachable("unexpected D32_S8 aspect mask in blit_image");
   }

   ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
              rotate[mirror_y][mirror_x], false, dst_image->layout[0].ubwc);

   if (ops == &r3d_ops) {
      r3d_coords_raw(cs, (float[]) {
         info->dstOffsets[0].x, info->dstOffsets[0].y,
         info->srcOffsets[0].x, info->srcOffsets[0].y,
         info->dstOffsets[1].x, info->dstOffsets[1].y,
         info->srcOffsets[1].x, info->srcOffsets[1].y
      });
   } else {
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
                             .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
         A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
                             .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
      tu_cs_emit_regs(cs,
         A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
         A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
         A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
         A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
   }

   struct tu_image_view dst, src;
   tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffsets[0].z);
   tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);
      ops->src(cmd, cs, &src, i, filter);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdBlitImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage dstImage,
                VkImageLayout dstImageLayout,
                uint32_t regionCount,
                const VkImageBlit *pRegions,
                VkFilter filter)

{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);

   for (uint32_t i = 0; i < regionCount; ++i) {
      /* can't blit both depth and stencil at once with D32_S8
       * TODO: more advanced 3D blit path to support it instead?
       */
      if (src_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
          dst_image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         VkImageBlit region = pRegions[i];
         uint32_t b;
         for_each_bit(b, pRegions[i].dstSubresource.aspectMask) {
            region.srcSubresource.aspectMask = BIT(b);
            region.dstSubresource.aspectMask = BIT(b);
            tu6_blit_image(cmd, src_image, dst_image, &region, filter);
         }
         continue;
      }
      tu6_blit_image(cmd, src_image, dst_image, pRegions + i, filter);
   }
}

static void
copy_compressed(VkFormat format,
                VkOffset3D *offset,
                VkExtent3D *extent,
                uint32_t *width,
                uint32_t *height)
{
   if (!vk_format_is_compressed(format))
      return;

   uint32_t block_width = vk_format_get_blockwidth(format);
   uint32_t block_height = vk_format_get_blockheight(format);

   offset->x /= block_width;
   offset->y /= block_height;

   if (extent) {
      extent->width = DIV_ROUND_UP(extent->width, block_width);
      extent->height = DIV_ROUND_UP(extent->height, block_height);
   }
   if (width)
      *width = DIV_ROUND_UP(*width, block_width);
   if (height)
      *height = DIV_ROUND_UP(*height, block_height);
}
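
/* Worked example: copying a 100x60 region at offset (8, 4) of a BC1 image
 * (4x4 blocks) becomes offset (2, 1) and extent 25x15 in block units,
 * with DIV_ROUND_UP covering partially-filled blocks at the edges.
 */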

static void
tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
                        struct tu_buffer *src_buffer,
                        struct tu_image *dst_image,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat src_format =
      copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true);
   const struct blit_ops *ops = &r2d_ops;

   /* special case for buffer to stencil */
   if (dst_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      ops = &r3d_ops;
   }

   /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format,
    * which matters for UBWC. buffer_to_image/etc can fail because of this
    */

   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t src_width = info->bufferRowLength ?: extent.width;
   uint32_t src_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height);

   uint32_t pitch = src_width * vk_format_get_blocksize(src_format);
   uint32_t layer_size = src_height * pitch;

   ops->setup(cmd, cs,
              copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false),
              info->imageSubresource.aspectMask, ROTATE_0, false, dst_image->layout[0].ubwc);

   struct tu_image_view dst;
   tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false);

   for (uint32_t i = 0; i < layers; i++) {
      ops->dst(cs, &dst, i);

      uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i;
      if ((src_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format);
            ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
                            x + extent.width, 1);
            ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            src_va += pitch;
         }
      } else {
         ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height);
         coords(ops, cs, &offset, &(VkOffset3D){}, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
                        VkBuffer srcBuffer,
                        VkImage dstImage,
                        VkImageLayout dstImageLayout,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_buffer_to_image(cmd, src_buffer, dst_image, pRegions + i);
}

static void
tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
                        struct tu_image *src_image,
                        struct tu_buffer *dst_buffer,
                        const VkBufferImageCopy *info)
{
   struct tu_cs *cs = &cmd->cs;
   uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount);
   VkFormat dst_format =
      copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true);
   bool stencil_read = false;

   if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
       info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
      stencil_read = true;
   }

   const struct blit_ops *ops = stencil_read ? &r3d_ops : &r2d_ops;
   VkOffset3D offset = info->imageOffset;
   VkExtent3D extent = info->imageExtent;
   uint32_t dst_width = info->bufferRowLength ?: extent.width;
   uint32_t dst_height = info->bufferImageHeight ?: extent.height;

   copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height);

   uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format);
   uint32_t layer_size = pitch * dst_height;

   ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);

   struct tu_image_view src;
   tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read);

   for (uint32_t i = 0; i < layers; i++) {
      ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);

      uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i;
      if ((dst_va & 63) || (pitch & 63)) {
         for (uint32_t y = 0; y < extent.height; y++) {
            uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format);
            ops->dst_buffer(cs, dst_format, dst_va & ~63, 0);
            ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y},
                        &(VkExtent2D) {extent.width, 1});
            ops->run(cmd, cs);
            dst_va += pitch;
         }
      } else {
         ops->dst_buffer(cs, dst_format, dst_va, pitch);
         coords(ops, cs, &(VkOffset3D) {0, 0}, &offset, &extent);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
                        VkImage srcImage,
                        VkImageLayout srcImageLayout,
                        VkBuffer dstBuffer,
                        uint32_t regionCount,
                        const VkBufferImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   for (unsigned i = 0; i < regionCount; ++i)
      tu_copy_image_to_buffer(cmd, src_image, dst_buffer, pRegions + i);
}

/* Tiled formats don't support swapping, which means that we can't support
 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
 * formats like B5G5R5A1 have a separate linear-only format when sampling.
 * Currently we fake support for tiled swapped formats and use the unswapped
 * format instead, but this means that reinterpreting copies to and from
 * swapped formats can't be performed correctly unless we can swizzle the
 * components by reinterpreting the other image as the "correct" swapped
 * format, i.e. only when the other image is linear.
 */

static bool
is_swapped_format(VkFormat format)
{
   struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR);
   struct tu_native_format tiled = tu6_format_texture(format, TILE6_3);
   return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
}

/* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
 * versa). This should mirror the logic in fdl6_layout.
 */
static bool
image_is_r8g8(struct tu_image *image)
{
   return image->layout[0].cpp == 2 &&
          vk_format_get_nr_components(image->vk_format) == 2;
}

static void
tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
                       struct tu_image *src_image,
                       struct tu_image *dst_image,
                       const VkImageCopy *info)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (dst_image->layout[0].nr_samples > 1)
      ops = &r3d_ops;

   VkFormat format = VK_FORMAT_UNDEFINED;
   VkOffset3D src_offset = info->srcOffset;
   VkOffset3D dst_offset = info->dstOffset;
   VkExtent3D extent = info->extent;

   /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
    * Images":
    *
    *    When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination. When copying from a compressed image to an
    *    uncompressed image the image texel dimensions written to the
    *    uncompressed image will be source extent divided by the compressed
    *    texel block dimensions. When copying from an uncompressed image to a
    *    compressed image the image texel dimensions written to the compressed
    *    image will be the source extent multiplied by the compressed texel
    *    block dimensions.
    *
    * This means we only have to adjust the extent if the source image is
    * compressed.
    */
   copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL);
   copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL);

   VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false);
   VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false);

   bool use_staging_blit = false;

   if (src_format == dst_format) {
      /* Images that share a format can always be copied directly because it's
       * the same as a blit.
       */
      format = src_format;
   } else if (!src_image->layout[0].tile_mode) {
      /* If an image is linear, we can always safely reinterpret it with the
       * other image's format and then do a regular blit.
       */
      format = dst_format;
   } else if (!dst_image->layout[0].tile_mode) {
      format = src_format;
   } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
      /* We can't currently copy r8g8 images to/from other cpp=2 images,
       * due to the different tile layout.
       */
      use_staging_blit = true;
   } else if (is_swapped_format(src_format) ||
              is_swapped_format(dst_format)) {
      /* If either format has a non-identity swap, then we can't copy
       * to/from it.
       */
      use_staging_blit = true;
   } else if (!src_image->layout[0].ubwc) {
      format = dst_format;
   } else if (!dst_image->layout[0].ubwc) {
      format = src_format;
   } else {
      /* Both formats use UBWC and so neither can be reinterpreted.
       * TODO: We could do an in-place decompression of the dst instead.
       */
      use_staging_blit = true;
   }

   struct tu_image_view dst, src;

   if (use_staging_blit) {
      tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false);

      struct tu_image staging_image = {
         .vk_format = src_format,
         .level_count = 1,
         .layer_count = info->srcSubresource.layerCount,
         .bo_offset = 0,
      };

      VkImageSubresourceLayers staging_subresource = {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = info->srcSubresource.layerCount,
      };

      VkOffset3D staging_offset = { 0 };

      staging_image.layout[0].tile_mode = TILE6_LINEAR;
      staging_image.layout[0].ubwc = false;

      fdl6_layout(&staging_image.layout[0],
                  vk_format_to_pipe_format(staging_image.vk_format),
                  src_image->layout[0].nr_samples,
                  extent.width,
                  extent.height,
                  extent.depth,
                  staging_image.level_count,
                  staging_image.layer_count,
                  extent.depth > 1,
                  NULL);

      VkResult result = tu_get_scratch_bo(cmd->device,
                                          staging_image.layout[0].size,
                                          &staging_image.bo);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      struct tu_image_view staging;
      tu_image_view_copy(&staging, &staging_image, src_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);
      coords(ops, cs, &staging_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &staging, i);
         ops->run(cmd, cs);
      }

      /* When executed by the user there has to be a pipeline barrier here,
       * but since we're doing it manually we'll have to flush ourselves.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      tu_image_view_copy(&staging, &staging_image, dst_format,
                         &staging_subresource, 0, false);

      ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask,
                 ROTATE_0, false, dst_image->layout[0].ubwc);
      coords(ops, cs, &dst_offset, &staging_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   } else {
      tu_image_view_copy(&dst, dst_image, format, &info->dstSubresource, dst_offset.z, false);
      tu_image_view_copy(&src, src_image, format, &info->srcSubresource, src_offset.z, false);

      ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
                 ROTATE_0, false, dst_image->layout[0].ubwc);
      coords(ops, cs, &dst_offset, &src_offset, &extent);

      for (uint32_t i = 0; i < info->extent.depth; i++) {
         ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdCopyImage(VkCommandBuffer commandBuffer,
                VkImage srcImage,
                VkImageLayout srcImageLayout,
                VkImage destImage,
                VkImageLayout destImageLayout,
                uint32_t regionCount,
                const VkImageCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, destImage);

   for (uint32_t i = 0; i < regionCount; ++i)
      tu_copy_image_to_image(cmd, src_image, dst_image, pRegions + i);
}

static void
copy_buffer(struct tu_cmd_buffer *cmd,
            uint64_t dst_va,
            uint64_t src_va,
            uint64_t size,
            uint32_t block_size)
{
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM;
   uint64_t blocks = size / block_size;

   ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false, false);

   while (blocks) {
      uint32_t src_x = (src_va & 63) / block_size;
      uint32_t dst_x = (dst_va & 63) / block_size;
      uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);

      ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1);
      ops->dst_buffer(cs, format, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, &(VkOffset2D) {src_x}, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      src_va += width * block_size;
      dst_va += width * block_size;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}
1553
void
tu_CmdCopyBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer srcBuffer,
                 VkBuffer dstBuffer,
                 uint32_t regionCount,
                 const VkBufferCopy *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, src_buffer, srcBuffer);
   TU_FROM_HANDLE(tu_buffer, dst_buffer, dstBuffer);

   for (unsigned i = 0; i < regionCount; ++i) {
      copy_buffer(cmd,
                  tu_buffer_iova(dst_buffer) + pRegions[i].dstOffset,
                  tu_buffer_iova(src_buffer) + pRegions[i].srcOffset,
                  pRegions[i].size, 1);
   }
}

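/* vkCmdUpdateBuffer requires dstOffset and dataSize to be multiples of 4,
 * so the data can be copied in 4-byte blocks. It is staged in scratch
 * command-stream memory first and then copied to the buffer on the GPU.
 */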
void
tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize dataSize,
                   const void *pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);

   struct tu_cs_memory tmp;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64, &tmp);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   memcpy(tmp.map, pData, dataSize);
   copy_buffer(cmd, tu_buffer_iova(buffer) + dstOffset, tmp.iova, dataSize, 4);
}

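/* vkCmdFillBuffer requires dstOffset and size to be multiples of 4, so the
 * fill runs as a series of R32_UINT clears, chunked like copy_buffer():
 * the destination base is aligned down to 64 bytes and each pass covers at
 * most 0x4000 texels.
 */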
void
tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
                 VkBuffer dstBuffer,
                 VkDeviceSize dstOffset,
                 VkDeviceSize fillSize,
                 uint32_t data)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   if (fillSize == VK_WHOLE_SIZE)
      fillSize = buffer->size - dstOffset;

   uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset;
   uint32_t blocks = fillSize / 4;

   ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, true, false);
   ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}});

   while (blocks) {
      uint32_t dst_x = (dst_va & 63) / 4;
      uint32_t width = MIN2(blocks, 0x4000 - dst_x);

      ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0);
      ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1});
      ops->run(cmd, cs);

      dst_va += width * 4;
      blocks -= width;
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdResolveImage(VkCommandBuffer commandBuffer,
                   VkImage srcImage,
                   VkImageLayout srcImageLayout,
                   VkImage dstImage,
                   VkImageLayout dstImageLayout,
                   uint32_t regionCount,
                   const VkImageResolve *pRegions)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, src_image, srcImage);
   TU_FROM_HANDLE(tu_image, dst_image, dstImage);
   const struct blit_ops *ops = &r2d_ops;
   struct tu_cs *cs = &cmd->cs;

   ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              ROTATE_0, false, dst_image->layout[0].ubwc);

   for (uint32_t i = 0; i < regionCount; ++i) {
      const VkImageResolve *info = &pRegions[i];
      uint32_t layers = MAX2(info->extent.depth, info->dstSubresource.layerCount);

      assert(info->srcSubresource.layerCount == info->dstSubresource.layerCount);
      /* TODO: aspect masks possible? */

      coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent);

      struct tu_image_view dst, src;
      tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
      tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z);

      for (uint32_t layer = 0; layer < layers; layer++) {
         ops->src(cmd, cs, &src, layer, VK_FILTER_NEAREST);
         ops->dst(cs, &dst, layer);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

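/* Iterate over either the layers selected by a multiview mask (when
 * layer_mask is nonzero) or plain layers 0..layers-1. The loop bound runs
 * to the highest set bit and the trailing condition skips unset bits.
 */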
#define for_each_layer(layer, layer_mask, layers) \
   for (uint32_t layer = 0; \
        layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
        layer++) \
      if (!layer_mask || (layer_mask & BIT(layer)))

void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  struct tu_image_view *src,
                  struct tu_image_view *dst,
                  uint32_t layer_mask,
                  uint32_t layers,
                  const VkRect2D *rect)
{
   const struct blit_ops *ops = &r2d_ops;

   assert(src->image->vk_format == dst->image->vk_format);

   ops->setup(cmd, cs, dst->image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT,
              ROTATE_0, false, dst->ubwc_enabled);
   ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);

   for_each_layer(i, layer_mask, layers) {
      ops->src(cmd, cs, src, i, VK_FILTER_NEAREST);
      ops->dst(cs, dst, i);
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);
}

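/* Clear a whole image, one mip level and layer (or 3D slice) at a time.
 * MSAA images take the r3d path, since the 2D engine can't render to
 * multisample destinations; for 3D images the slice count is minified
 * per level.
 */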
static void
clear_image(struct tu_cmd_buffer *cmd,
            struct tu_image *image,
            const VkClearValue *clear_value,
            const VkImageSubresourceRange *range,
            VkImageAspectFlags aspect_mask)
{
   uint32_t level_count = tu_get_levelCount(image, range);
   uint32_t layer_count = tu_get_layerCount(image, range);
   struct tu_cs *cs = &cmd->cs;
   VkFormat format = image->vk_format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      format = copy_format(format, aspect_mask, false);

   if (image->layout[0].depth0 > 1) {
      assert(layer_count == 1);
      assert(range->baseArrayLayer == 0);
   }

   const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops : &r2d_ops;

   ops->setup(cmd, cs, format, aspect_mask, ROTATE_0, true, image->layout[0].ubwc);
   if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
      ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value);
   else
      ops->clear_value(cs, format, clear_value);

   for (unsigned j = 0; j < level_count; j++) {
      if (image->layout[0].depth0 > 1)
         layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);

      ops->coords(cs, &(VkOffset2D){}, NULL, &(VkExtent2D) {
                     u_minify(image->layout[0].width0, range->baseMipLevel + j),
                     u_minify(image->layout[0].height0, range->baseMipLevel + j)
                  });

      struct tu_image_view dst;
      tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) {
                                 .aspectMask = aspect_mask,
                                 .mipLevel = range->baseMipLevel + j,
                                 .baseArrayLayer = range->baseArrayLayer,
                                 .layerCount = 1,
                              }, 0, false);

      for (uint32_t i = 0; i < layer_count; i++) {
         ops->dst(cs, &dst, i);
         ops->run(cmd, cs);
      }
   }

   ops->teardown(cmd, cs);
}

void
tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
                      VkImage image_h,
                      VkImageLayout imageLayout,
                      const VkClearColorValue *pColor,
                      uint32_t rangeCount,
                      const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++)
      clear_image(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}

void
tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
                             VkImage image_h,
                             VkImageLayout imageLayout,
                             const VkClearDepthStencilValue *pDepthStencil,
                             uint32_t rangeCount,
                             const VkImageSubresourceRange *pRanges)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_image, image, image_h);

   for (unsigned i = 0; i < rangeCount; i++) {
      const VkImageSubresourceRange *range = &pRanges[i];

      if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* can't clear both depth and stencil at once, split up the aspect mask */
         uint32_t b;
         for_each_bit(b, range->aspectMask)
            clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
         continue;
      }

      clear_image(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
   }
}

static void
tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
                            uint32_t attachment_count,
                            const VkClearAttachment *attachments,
                            uint32_t rect_count,
                            const VkClearRect *rects)
{
   /* the shader path here is special, it avoids changing MRT/etc state */
   const struct tu_render_pass *pass = cmd->state.pass;
   const struct tu_subpass *subpass = cmd->state.subpass;
   const uint32_t mrt_count = subpass->color_count;
   struct tu_cs *cs = &cmd->draw_cs;
   uint32_t clear_value[MAX_RTS][4];
   float z_clear_val = 0.0f;
   uint8_t s_clear_val = 0;
   uint32_t clear_rts = 0, clear_components = 0, num_rts = 0, b;
   bool z_clear = false;
   bool s_clear = false;
   bool layered_clear = false;
   uint32_t max_samples = 1;

   for (uint32_t i = 0; i < attachment_count; i++) {
      uint32_t a;
      if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
         uint32_t c = attachments[i].colorAttachment;
         a = subpass->color_attachments[c].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         clear_rts |= 1 << c;
         clear_components |= 0xf << (c * 4);
         memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
      } else {
         a = subpass->depth_stencil_attachment.attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            z_clear = true;
            z_clear_val = attachments[i].clearValue.depthStencil.depth;
         }

         if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            s_clear = true;
            s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
         }
      }

      max_samples = MAX2(max_samples, pass->attachments[a].samples);
   }

   /* Disable all draw states so they don't interfere.
    * TODO: use and re-use draw states.
    * The draw states are disabled individually (rather than all at once) to
    * preserve the input attachment states, because a secondary command
    * buffer would not be able to restore them.
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
   for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
      if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
          i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
         continue;
      tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
                     CP_SET_DRAW_STATE__0_DISABLE);
      tu_cs_emit_qw(cs, 0);
   }
   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
                  A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
                  0xfc000000);
   tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), mrt_count);
   for (uint32_t i = 0; i < mrt_count; i++) {
      if (clear_rts & (1 << i))
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(num_rts++ * 4));
      else
         tu_cs_emit(cs, 0);
   }

   for (uint32_t i = 0; i < rect_count; i++) {
      if (rects[i].baseArrayLayer || rects[i].layerCount > 1)
         layered_clear = true;
   }

   /* a630 doesn't support multiview masks, which means that we can't use the
    * normal multiview path without potentially recompiling a shader on-demand
    * or using a more complicated variant that takes the mask as a const. Just
    * use the layered path instead, since it shouldn't be much worse.
    */
   if (subpass->multiview_mask) {
      layered_clear = true;
   }

   r3d_common(cmd, cs, false, num_rts, layered_clear);

   tu_cs_emit_regs(cs,
                   A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
   tu_cs_emit_regs(cs,
                   A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));

   tu_cs_emit_regs(cs,
                   A6XX_RB_FS_OUTPUT_CNTL0(),
                   A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));

   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
   for (uint32_t i = 0; i < mrt_count; i++) {
      tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
            .component_enable = COND(clear_rts & (1 << i), 0xf)));
   }

   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));

   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
         .z_enable = z_clear,
         .z_write_enable = z_clear,
         .zfunc = FUNC_ALWAYS));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
   tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
         .stencil_enable = s_clear,
         .func = FUNC_ALWAYS,
         .zpass = STENCIL_REPLACE));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
   tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));

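   /* Upload the clear colors as fragment shader constants: the clear shader
    * set up by r3d_common() writes one vec4 constant per render target that
    * has its bit set in clear_rts.
    */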
   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4 * num_rts);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_SHADER) |
                  CP_LOAD_STATE6_0_NUM_UNIT(num_rts));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   for_each_bit(b, clear_rts)
      tu_cs_emit_array(cs, clear_value[b], 4);

   for (uint32_t i = 0; i < rect_count; i++) {
      /* This should be true because of this valid usage for
       * vkCmdClearAttachments:
       *
       *    "If the render pass instance this is recorded in uses multiview,
       *    then baseArrayLayer must be zero and layerCount must be one"
       */
      assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);

      for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount) {
         r3d_coords_raw(cs, (float[]) {
            rects[i].rect.offset.x, rects[i].rect.offset.y,
            z_clear_val, uif(rects[i].baseArrayLayer + layer),
            rects[i].rect.offset.x + rects[i].rect.extent.width,
            rects[i].rect.offset.y + rects[i].rect.extent.height,
            z_clear_val, 1.0f,
         });
         r3d_run(cmd, cs);
      }
   }
}

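/* Pack a clear value into the attachment's actual bit layout for a GMEM
 * clear done through the BLIT event. For example, D24_UNORM_S8_UINT packs
 * 24 bits of depth into the low bits and stencil into bits 24-31, while
 * small color formats go through the util_format pack helpers.
 */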
static void
pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4])
{
   switch (format) {
   case VK_FORMAT_X8_D24_UNORM_PACK32:
   case VK_FORMAT_D24_UNORM_S8_UINT:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
                       val->depthStencil.stencil << 24;
      return;
   case VK_FORMAT_D16_UNORM:
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
      return;
   case VK_FORMAT_D32_SFLOAT:
      clear_value[0] = fui(val->depthStencil.depth);
      return;
   case VK_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      return;
   default:
      break;
   }

   float tmp[4];
   memcpy(tmp, val->color.float32, 4 * sizeof(float));
   if (vk_format_is_srgb(format)) {
      for (int i = 0; i < 3; i++)
         tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
   }

#define PACK_F(type) util_format_##type##_pack_rgba_float \
   ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
   switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4:
      PACK_F(r4g4b4a4_unorm);
      break;
   case 5:
      if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
         PACK_F(r5g6b5_unorm);
      else
         PACK_F(r5g5b5a1_unorm);
      break;
   case 8:
      if (vk_format_is_snorm(format))
         PACK_F(r8g8b8a8_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r8g8b8a8_unorm);
      else
         pack_int8(clear_value, val->color.uint32);
      break;
   case 10:
      if (vk_format_is_int(format))
         pack_int10_2(clear_value, val->color.uint32);
      else
         PACK_F(r10g10b10a2_unorm);
      break;
   case 11:
      clear_value[0] = float3_to_r11g11b10f(val->color.float32);
      break;
   case 16:
      if (vk_format_is_snorm(format))
         PACK_F(r16g16b16a16_snorm);
      else if (vk_format_is_unorm(format))
         PACK_F(r16g16b16a16_unorm);
      else if (vk_format_is_float(format))
         PACK_F(r16g16b16a16_float);
      else
         pack_int16(clear_value, val->color.uint32);
      break;
   case 32:
      memcpy(clear_value, val->color.float32, 4 * sizeof(float));
      break;
   default:
      unreachable("unexpected channel size");
   }
#undef PACK_F
}

static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      VkFormat format,
                      uint8_t clear_mask,
                      uint32_t gmem_offset,
                      const VkClearValue *value)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
   tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(tu6_base_format(format)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.gmem = 1, .clear_mask = clear_mask));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
   tu_cs_emit(cs, gmem_offset);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
   tu_cs_emit(cs, 0);

   uint32_t clear_vals[4] = {};
   pack_gmem_clear_value(value, format, clear_vals);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
   tu_cs_emit_array(cs, clear_vals, 4);

   tu6_emit_event_write(cmd, cs, BLIT);
}

static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              uint32_t attachment,
                              VkImageAspectFlags mask,
                              const VkClearValue *value)
{
   const struct tu_render_pass_attachment *att =
      &cmd->state.pass->attachments[attachment];

   if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (mask & VK_IMAGE_ASPECT_DEPTH_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value);
      if (mask & VK_IMAGE_ASPECT_STENCIL_BIT)
         clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value);
      return;
   }

   clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value);
}

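/* GMEM clears are scissored with RB_BLIT_SCISSOR, whose bottom-right
 * coordinates are inclusive (hence the "- 1" below); one BLIT event is
 * then emitted per cleared attachment.
 */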
static void
tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
                          uint32_t attachment_count,
                          const VkClearAttachment *attachments,
                          uint32_t rect_count,
                          const VkClearRect *rects)
{
   const struct tu_subpass *subpass = cmd->state.subpass;
   struct tu_cs *cs = &cmd->draw_cs;

   /* TODO: swap the loops for smaller cmdstream */
   for (unsigned i = 0; i < rect_count; i++) {
      unsigned x1 = rects[i].rect.offset.x;
      unsigned y1 = rects[i].rect.offset.y;
      unsigned x2 = x1 + rects[i].rect.extent.width - 1;
      unsigned y2 = y1 + rects[i].rect.extent.height - 1;

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
      tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));

      for (unsigned j = 0; j < attachment_count; j++) {
         uint32_t a;
         if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
            a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
         else
            a = subpass->depth_stencil_attachment.attachment;

         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask,
                                       &attachments[j].clearValue);
      }
   }
}

void
tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
                       uint32_t attachmentCount,
                       const VkClearAttachment *pAttachments,
                       uint32_t rectCount,
                       const VkClearRect *pRects)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* The sysmem path behaves like a draw. Note that we don't have a way to
    * emit different flushes for the sysmem and gmem paths, so the flush has
    * to go outside the cond_exec blocks below.
    */
   tu_emit_cache_flush_renderpass(cmd, cs);

   for (uint32_t j = 0; j < attachmentCount; j++) {
      if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
         continue;
      cmd->state.lrz.valid = false;
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }

   /* vkCmdClearAttachments is supposed to respect the predicate if active.
    * The easiest way to do this is to always use the 3d path, which always
    * works even with GMEM because it's just a simple draw using the existing
    * attachment state. However it seems that IGNORE_VISIBILITY draws must be
    * skipped in the binning pass, since otherwise they produce binning data
    * which isn't consumed and leads to the wrong binning data being read, so
    * condition on GMEM | SYSMEM.
    */
   if (cmd->state.predication_active) {
      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM |
                             CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
      tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
      tu_cond_exec_end(cs);
      return;
   }

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
   tu_clear_gmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);

   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
   tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects);
   tu_cond_exec_end(cs);
}

static void
clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        VkFormat format,
                        VkImageAspectFlags clear_mask,
                        const VkRenderPassBeginInfo *info,
                        uint32_t a,
                        bool separate_stencil)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = fb->attachments[a].attachment;
   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
   const struct blit_ops *ops = &r2d_ops;
   if (cmd->state.pass->attachments[a].samples > 1)
      ops = &r3d_ops;

   ops->setup(cmd, cs, format, clear_mask, ROTATE_0, true, iview->ubwc_enabled);
   ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent);
   ops->clear_value(cs, format, &info->pClearValues[a]);

   for_each_layer(i, clear_views, fb->layers) {
      if (separate_stencil) {
         if (ops == &r3d_ops)
            r3d_dst_stencil(cs, iview, i);
         else
            r2d_dst_stencil(cs, iview, i);
      } else {
         ops->dst(cs, iview, i);
      }
      ops->run(cmd, cs);
   }

   ops->teardown(cmd, cs);
}

void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           uint32_t a,
                           const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   /* Wait for any flushes at the beginning of the renderpass to complete */
   tu_cs_emit_wfi(cs);

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
      if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, false);
      }
      if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
         clear_sysmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
                                 info, a, true);
      }
   } else {
      clear_sysmem_attachment(cmd, cs, attachment->format, attachment->clear_mask,
                              info, a, false);
   }

   /* The spec doesn't explicitly say, but presumably the initial renderpass
    * clear is considered part of the renderpass, and therefore barriers
    * aren't required inside the subpass/renderpass. Therefore we need to
    * flush CCU color into CCU depth here, just like with
    * vkCmdClearAttachments(). Note that because this only happens at the
    * beginning of a renderpass, and renderpass writes are considered
    * "incoherent", we shouldn't have to worry about syncing depth into color
    * beforehand as depth should already be flushed.
    */
   if (vk_format_is_depth_or_stencil(attachment->format)) {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_DEPTH);
   } else {
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR);
   }
}

void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         const VkRenderPassBeginInfo *info)
{
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (!attachment->clear_mask)
      return;

   tu_cs_emit_regs(cs, A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_emit_clear_gmem_attachment(cmd, cs, a, attachment->clear_mask,
                                 &info->pClearValues[a]);
}

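/* Emit a CP_EVENT_WRITE::BLIT, used both to load an attachment into GMEM
 * (gmem = 1) and to store/resolve it back to sysmem. For separate stencil,
 * the stencil plane's base address and pitch are used, and the flags (UBWC)
 * bit is masked out, presumably because the stencil plane has no flag
 * buffer.
 */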
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs,
             const struct tu_image_view *iview,
             const struct tu_render_pass_attachment *attachment,
             bool resolve,
             bool separate_stencil)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_MSAA_CNTL(tu_msaa_samples(attachment->samples)));

   tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(
      .unk0 = !resolve,
      .gmem = !resolve,
      /* "integer" bit disables msaa resolve averaging */
      .integer = vk_format_is_int(attachment->format)));

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
   if (separate_stencil) {
      tu_cs_emit(cs, tu_image_view_stencil(iview, RB_BLIT_DST_INFO) & ~A6XX_RB_BLIT_DST_INFO_FLAGS);
      tu_cs_emit_qw(cs, iview->stencil_base_addr);
      tu_cs_emit(cs, iview->stencil_PITCH);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil));
   } else {
      tu_cs_emit(cs, iview->RB_BLIT_DST_INFO);
      tu_cs_image_ref_2d(cs, iview, 0, false);

      tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST_LO, 3);
      tu_cs_image_flag_ref(cs, iview, 0);

      tu_cs_emit_regs(cs,
                      A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset));
   }

   tu6_emit_event_write(cmd, cs, BLIT);
}

static bool
blit_can_resolve(VkFormat format)
{
   const struct util_format_description *desc = vk_format_description(format);

   /* blit event can only do resolve for simple cases:
    * averaging samples as unsigned integers or choosing only one sample
    */
   if (vk_format_is_snorm(format) || vk_format_is_srgb(format))
      return false;

   /* can't do formats with larger channel sizes
    * note: this includes all float formats
    * note2: single channel integer formats seem OK
    */
   if (desc->channel[0].size > 10)
      return false;

   switch (format) {
   /* for unknown reasons blit event can't msaa resolve these formats when tiled
    * likely related to these formats having different layout from other cpp=2 formats
    */
   case VK_FORMAT_R8G8_UNORM:
   case VK_FORMAT_R8G8_UINT:
   case VK_FORMAT_R8G8_SINT:
   /* TODO: this one should be able to work? */
   case VK_FORMAT_D24_UNORM_S8_UINT:
      return false;
   default:
      break;
   }

   return true;
}

void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        uint32_t a,
                        bool force_load)
{
   const struct tu_image_view *iview =
      cmd->state.framebuffer->attachments[a].attachment;
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];

   if (attachment->load || force_load)
      tu_emit_blit(cmd, cs, iview, attachment, false, false);

   if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load))
      tu_emit_blit(cmd, cs, iview, attachment, false, true);
}

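/* Slow-path GMEM store: a 2D-engine CP_BLIT that reads GMEM directly as a
 * tiled (TILE6_2) source image at gmem_base + gmem_offset, with the pitch
 * derived from the tile width and the attachment's cpp in GMEM. Used when
 * the render area is unaligned or the BLIT event can't resolve the format.
 */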
static void
store_cp_blit(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              struct tu_image_view *iview,
              uint32_t samples,
              bool separate_stencil,
              VkFormat format,
              uint32_t gmem_offset,
              uint32_t cpp)
{
   r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, ROTATE_0, false,
                    iview->ubwc_enabled, true);
   if (separate_stencil)
      r2d_dst_stencil(cs, iview, 0);
   else
      r2d_dst(cs, iview, 0);

   tu_cs_emit_regs(cs,
                   A6XX_SP_PS_2D_SRC_INFO(
                      .color_format = tu6_format_texture(format, TILE6_2).fmt,
                      .tile_mode = TILE6_2,
                      .srgb = vk_format_is_srgb(format),
                      .samples = tu_msaa_samples(samples),
                      .samples_average = !vk_format_is_int(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   /* note: src size does not matter when not scaling */
                   A6XX_SP_PS_2D_SRC_SIZE( .width = 0x3fff, .height = 0x3fff),
                   A6XX_SP_PS_2D_SRC_LO(cmd->device->physical_device->gmem_base + gmem_offset),
                   A6XX_SP_PS_2D_SRC_HI(),
                   A6XX_SP_PS_2D_SRC_PITCH(.pitch = cmd->state.framebuffer->tile0.width * cpp));

   /* sync GMEM writes with CACHE. */
   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
    * sysmem, and we generally assume that GMEM renderpasses leave their
    * results in sysmem, so we need to flush manually here.
    */
   tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
}

void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         uint32_t a,
                         uint32_t gmem_a)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment;
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];

   if (!dst->store && !dst->store_stencil)
      return;

   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width;
   uint32_t y2 = y1 + render_area->extent.height;
   /* x2/y2 may be unaligned if they equal the image size, since the store
    * then writes into padding space. The one exception is linear levels,
    * which don't have the required y padding in the layout (except for the
    * last level).
    */
   bool need_y2_align =
      y2 != iview->extent.height || iview->need_y2_align;

   bool unaligned =
      x1 % phys_dev->info.gmem_align_w ||
      (x2 % phys_dev->info.gmem_align_w && x2 != iview->extent.width) ||
      y1 % phys_dev->info.gmem_align_h || (y2 % phys_dev->info.gmem_align_h && need_y2_align);

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) {
      if (dst->store)
         tu_emit_blit(cmd, cs, iview, src, true, false);
      if (dst->store_stencil)
         tu_emit_blit(cmd, cs, iview, src, true, true);
      return;
   }

   if (dst->samples > 1) {
      /* The 2D engine can't do an unaligned MSAA store; we would probably
       * need the shader (3D) path here. We still need a test case that
       * actually hits this before implementing it.
       */
      tu_finishme("unaligned store of msaa attachment\n");
      return;
   }

   r2d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent);

   VkFormat format = src->format;
   if (format == VK_FORMAT_D32_SFLOAT_S8_UINT)
      format = VK_FORMAT_D32_SFLOAT;

   if (dst->store) {
      store_cp_blit(cmd, cs, iview, src->samples, false, format,
                    src->gmem_offset, src->cpp);
   }
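   /* S8 is one byte per sample, so the stencil plane's cpp in GMEM is just
    * the sample count, which is why src->samples is passed as the cpp
    * argument here.
    */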
   if (dst->store_stencil) {
      store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT,
                    src->gmem_offset_stencil, src->samples);
   }
}