/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2015-2021 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "sid.h"
#include "util/u_memory.h"

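/* Image copies on the SDMA (async DMA) engine. Only single-sample, single-level
 * textures with matching bytes-per-element are handled here; for everything
 * else we return false and the caller falls back to the gfx queue.
 */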
static bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,
                                     struct si_texture *src)
{
   if (dst->surface.bpe != src->surface.bpe)
      return false;

   /* MSAA: Blits don't exist in the real world. */
   if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
      return false;

   if (dst->buffer.b.b.last_level != 0 || src->buffer.b.b.last_level != 0)
      return false;

   return true;
}

static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
{
   width = u_minify(width, level);
   return DIV_ROUND_UP(width, blk_w);
}

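/* Pack the GFX6-8 legacy tiling parameters (bpe log2, array mode, micro tile
 * mode, tile split, bank geometry, pipe config) into the tile-info dword of
 * the tiled sub-window copy packet. The bit positions follow the shifts used
 * in the emission in cik_sdma_copy_texture below.
 */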
static unsigned encode_legacy_tile_info(struct si_context *sctx, struct si_texture *tex)
{
   struct radeon_info *info = &sctx->screen->info;
   unsigned tile_index = tex->surface.u.legacy.tiling_index[0];
   unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
   unsigned tile_mode = info->si_tile_mode_array[tile_index];
   unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];

   return util_logbase2(tex->surface.bpe) |
          (G_009910_ARRAY_MODE(tile_mode) << 3) |
          (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
          /* Non-depth modes don't have TILE_SPLIT set. */
          ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
          (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
          (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
          (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
          (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
          (G_009910_PIPE_CONFIG(tile_mode) << 26);
}

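/* GFX9+ (SDMA 4.x/5.x) texture copy. Handles linear->linear and
 * linear<->tiled sub-window copies; tiled->tiled is not attempted and
 * returns false.
 */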
static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_texture *sdst,
                                       struct si_texture *ssrc)
{
   bool is_v5 = sctx->gfx_level >= GFX10;
   bool is_v5_2 = sctx->gfx_level >= GFX10_3;
   unsigned bpp = sdst->surface.bpe;
   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
   unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
   unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
   unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
   unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);

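   /* TMZ (encrypted) sources may only be copied to TMZ destinations; the
    * submission in si_sdma_copy_image toggles secure mode when needed.
    */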
   bool tmz = (ssrc->buffer.flags & RADEON_FLAG_ENCRYPTED);
   assert(!tmz || (sdst->buffer.flags & RADEON_FLAG_ENCRYPTED));

   /* Linear -> linear sub-window copy. */
   if (ssrc->surface.is_linear && sdst->surface.is_linear) {
      struct radeon_cmdbuf *cs = sctx->sdma_cs;

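      /* The LINEAR copy packet carries a byte count in a field of limited
       * width: 22 bits before SDMA 5.2 (GFX10.3), 30 bits from 5.2 on
       * (limits as implied by the shift below), so large copies are split
       * into chunks.
       */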
      uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
      uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
      uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);

      src_address += ssrc->surface.u.gfx9.offset[0];
      dst_address += sdst->surface.u.gfx9.offset[0];

      radeon_begin(cs);
      for (uint32_t i = 0; i < chunk_count; i++) {
         uint32_t size = MIN2(chunk_size, bytes);
         radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
                                 SDMA_COPY_SUB_OPCODE_LINEAR,
                                 (tmz ? 4 : 0)));
         radeon_emit(size - 1); /* byte count - 1 */
         radeon_emit(0);        /* parameters */
         radeon_emit(src_address);
         radeon_emit(src_address >> 32);
         radeon_emit(dst_address);
         radeon_emit(dst_address >> 32);

         src_address += size;
         dst_address += size;
         bytes -= size;
      }
      radeon_end();
      return true;
   }

   /* Linear <-> tiled sub-window copy. */
   if (ssrc->surface.is_linear != sdst->surface.is_linear) {
      struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
      unsigned tiled_width = DIV_ROUND_UP(tiled->buffer.b.b.width0, tiled->surface.blk_w);
      unsigned tiled_height = DIV_ROUND_UP(tiled->buffer.b.b.height0, tiled->surface.blk_h);
      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
      uint64_t linear_slice_pitch = linear->surface.u.gfx9.surf_slice_size / bpp;
      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
      struct radeon_cmdbuf *cs = sctx->sdma_cs;
      /* Only SDMA 5 supports DCC with SDMA. */
      bool dcc = vi_dcc_enabled(tiled, 0) && is_v5;
      assert(tiled->buffer.b.b.depth0 == 1);

      linear_address += linear->surface.u.gfx9.offset[0];

      /* Check if everything fits into the bitfields. */
      if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) &&
            linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
            copy_width <= (1 << 14) && copy_height <= (1 << 14)))
         return false;

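      /* L2T/T2L sub-window copy. Header bits as emitted below: DCC enable at
       * bit 19, mip_max at bit 20 (SDMA 4 only; SDMA 5 moves it into the
       * tiled-surface info dword in place of epitch), detile direction
       * (tiled->linear vs linear->tiled) at bit 31. This description just
       * mirrors the shifts used in the code.
       */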
      radeon_begin(cs);
      radeon_emit(
         SDMA_PACKET(SDMA_OPCODE_COPY,
                     SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW,
                     (tmz ? 4 : 0)) |
         dcc << 19 |
         (is_v5 ? 0 : tiled->buffer.b.b.last_level) << 20 |
         (linear == sdst ? 1u : 0) << 31);
      radeon_emit((uint32_t)tiled_address | (tiled->surface.tile_swizzle << 8));
      radeon_emit((uint32_t)(tiled_address >> 32));
      radeon_emit(0);
      radeon_emit(((tiled_width - 1) << 16));
      radeon_emit((tiled_height - 1));
      radeon_emit(util_logbase2(bpp) |
                  tiled->surface.u.gfx9.swizzle_mode << 3 |
                  tiled->surface.u.gfx9.resource_type << 9 |
                  (is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16);
      radeon_emit((uint32_t)linear_address);
      radeon_emit((uint32_t)(linear_address >> 32));
      radeon_emit(0);
      radeon_emit(((linear_pitch - 1) << 16));
      radeon_emit(linear_slice_pitch - 1);
      radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
      radeon_emit(0);

      if (dcc) {
         unsigned hw_fmt = ac_get_cb_format(sctx->gfx_level, tiled->buffer.b.b.format);
         unsigned hw_type = ac_get_cb_number_type(tiled->buffer.b.b.format);
         uint64_t md_address = tiled_address + tiled->surface.meta_offset;

         /* Add metadata */
         radeon_emit((uint32_t)md_address);
         radeon_emit((uint32_t)(md_address >> 32));
         radeon_emit(hw_fmt |
                     vi_alpha_is_on_msb(sctx->screen, tiled->buffer.b.b.format) << 8 |
                     hw_type << 9 |
                     tiled->surface.u.gfx9.color.dcc.max_compressed_block_size << 24 |
                     V_028C78_MAX_BLOCK_SIZE_256B << 26 |
                     tmz << 29 |
                     tiled->surface.u.gfx9.color.dcc.pipe_aligned << 31);
      }
      radeon_end();
      return true;
   }

   return false;
}

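/* GFX7/GFX8 (SDMA 2.x/3.x) texture copy using the legacy (SI-style) tiling
 * description. As above, only linear->linear and tiled<->linear sub-window
 * copies are attempted; anything else returns false.
 */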
static bool cik_sdma_copy_texture(struct si_context *sctx, struct si_texture *sdst,
                                  struct si_texture *ssrc)
{
   struct radeon_info *info = &sctx->screen->info;
   unsigned bpp = sdst->surface.bpe;
   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[0].offset_256B * 256;
   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[0].offset_256B * 256;
   unsigned dst_mode = sdst->surface.u.legacy.level[0].mode;
   unsigned src_mode = ssrc->surface.u.legacy.level[0].mode;
   unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[0];
   unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[0];
   unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
   unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
   unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
   unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
   unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
   unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
   unsigned dst_pitch = sdst->surface.u.legacy.level[0].nblk_x;
   unsigned src_pitch = ssrc->surface.u.legacy.level[0].nblk_x;
   uint64_t dst_slice_pitch =
      ((uint64_t)sdst->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
   uint64_t src_slice_pitch =
      ((uint64_t)ssrc->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
   unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, 0, sdst->surface.blk_w);
   unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, 0, ssrc->surface.blk_w);
   unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
   unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);

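   /* Tiled surfaces are 256B-aligned, so the low 8 address bits are free;
    * the packet carries the tile swizzle (pipe/bank XOR) value in bits 8+
    * of the address dword.
    */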
   dst_address |= dst_tile_swizzle << 8;
   src_address |= src_tile_swizzle << 8;

   /* Linear -> linear sub-window copy. */
   if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
       /* Check if everything fits into the bitfields. */
       src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
       dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
       /* HW limitation - GFX7: */
       (sctx->gfx_level != GFX7 ||
        (copy_width < (1 << 14) && copy_height < (1 << 14))) &&
       /* HW limitation - some GFX7 parts: */
       ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
        (copy_width != (1 << 14) && copy_height != (1 << 14)))) {
      struct radeon_cmdbuf *cs = sctx->sdma_cs;

      radeon_begin(cs);
      radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
                  (util_logbase2(bpp) << 29));
      radeon_emit(src_address);
      radeon_emit(src_address >> 32);
      radeon_emit(0);
      radeon_emit((src_pitch - 1) << 16);
      radeon_emit(src_slice_pitch - 1);
      radeon_emit(dst_address);
      radeon_emit(dst_address >> 32);
      radeon_emit(0);
      radeon_emit((dst_pitch - 1) << 16);
      radeon_emit(dst_slice_pitch - 1);
      if (sctx->gfx_level == GFX7) {
         /* GFX7 takes the raw width/height; GFX8+ takes minus-one values. */
         radeon_emit(copy_width | (copy_height << 16));
         radeon_emit(0);
      } else {
         radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
         radeon_emit(0);
      }
      radeon_end();
      return true;
   }

   /* Tiled <-> linear sub-window copy. */
   if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
      struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
      unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
      unsigned linear_width = linear == ssrc ? src_width : dst_width;
      unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
      unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
      unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;

      assert(tiled_pitch % 8 == 0);
      assert(tiled_slice_pitch % 64 == 0);
      unsigned pitch_tile_max = tiled_pitch / 8 - 1;
      unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
      unsigned xalign = MAX2(1, 4 / bpp);
      unsigned copy_width_aligned = copy_width;

      /* If the region ends at the last pixel and is unaligned, we
       * can copy the remainder of the line that is not visible to
       * make it aligned.
       */
      if (copy_width % xalign != 0 && 0 + copy_width == linear_width &&
          copy_width == tiled_width &&
          align(copy_width, xalign) <= linear_pitch &&
          align(copy_width, xalign) <= tiled_pitch)
         copy_width_aligned = align(copy_width, xalign);

      /* HW limitations. */
      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
          linear_pitch - 1 == 0x3fff && bpp == 16)
         return false;

      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
           sctx->family == CHIP_KABINI) &&
          (copy_width == (1 << 14) || copy_height == (1 << 14)))
         return false;

      /* The hw can read outside of the given linear buffer bounds,
       * or access those pages but not touch the memory in case
       * of writes. (it still causes a VM fault)
       *
       * Out-of-bounds memory access or page directory access must
       * be prevented.
       */
      int64_t start_linear_address, end_linear_address;
      unsigned granularity;

      /* Deduce the size of reads from the linear surface. */
      switch (tiled_micro_mode) {
      case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
         granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
         break;
      case V_009910_ADDR_SURF_THIN_MICRO_TILING:
      case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
         if (0 /* TODO: THICK microtiling */)
            granularity =
               bpp == 1 ? 32 / (8 * bpp)
               : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
         else
            granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
         break;
      default:
         return false;
      }
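
      /* Worked example (a sketch from the formulas above): THIN microtiling
       * with bpp == 4 gives 128 / (8 * 4) = 4, i.e. the engine accesses the
       * linear surface in groups of 4 texels, so the end address below is
       * padded to that granularity before the range check.
       */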

      /* The linear reads start at tiled_x & ~(granularity - 1).
       * If linear_x == 0 && tiled_x % granularity != 0, the hw
       * starts reading from an address preceding linear_address!!!
       */
      start_linear_address =
         (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256;

      end_linear_address =
         (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256 +
         bpp * ((copy_height - 1) * (uint64_t)linear_pitch + copy_width);

      if ((0 + copy_width) % granularity)
         end_linear_address += granularity - (0 + copy_width) % granularity;

      if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
         return false;

      /* Check requirements. */
      if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
          copy_width_aligned % xalign == 0 &&
          tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
          /* Check if everything fits into the bitfields. */
          tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
          slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
          linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
          copy_height <= (1 << 14)) {
         struct radeon_cmdbuf *cs = sctx->sdma_cs;
         uint32_t direction = linear == sdst ? 1u << 31 : 0;

         radeon_begin(cs);
         radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
                                 SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
                     direction);
         radeon_emit(tiled_address);
         radeon_emit(tiled_address >> 32);
         radeon_emit(0);
         radeon_emit(pitch_tile_max << 16);
         radeon_emit(slice_tile_max);
         radeon_emit(encode_legacy_tile_info(sctx, tiled));
         radeon_emit(linear_address);
         radeon_emit(linear_address >> 32);
         radeon_emit(0);
         radeon_emit(((linear_pitch - 1) << 16));
         radeon_emit(linear_slice_pitch - 1);
         if (sctx->gfx_level == GFX7) {
            radeon_emit(copy_width_aligned | (copy_height << 16));
            radeon_emit(1);
         } else {
            radeon_emit((copy_width_aligned - 1) | ((copy_height - 1) << 16));
            radeon_emit(0);
         }
         radeon_end();
         return true;
      }
   }

   return false;
}

bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct si_texture *src)
{
   struct radeon_winsys *ws = sctx->ws;

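   /* Create the SDMA command stream lazily on first use. SDMA is only used
    * on GFX7+ and can be disabled for debugging with DBG(NO_DMA).
    */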
   if (!sctx->sdma_cs) {
      if (sctx->screen->debug_flags & DBG(NO_DMA) || sctx->gfx_level < GFX7)
         return false;

      sctx->sdma_cs = CALLOC_STRUCT(radeon_cmdbuf);
      /* cs_create returns true on success. */
      if (!ws->cs_create(sctx->sdma_cs, sctx->ctx, AMD_IP_SDMA, NULL, NULL)) {
         FREE(sctx->sdma_cs);
         sctx->sdma_cs = NULL;
         return false;
      }
   }

   if (!si_prepare_for_sdma_copy(sctx, dst, src))
      return false;

   /* TODO: DCC compression is possible on GFX10+. See si_set_mutable_tex_desc_fields for
    * additional constraints.
    * For now, the only use-case of SDMA is the DRI_PRIME tiled->linear copy, and a linear
    * dst never has DCC.
    */
   if (vi_dcc_enabled(dst, 0))
      return false;

   /* Decompress DCC on older chips where SDMA can't read it. */
   if (vi_dcc_enabled(src, 0) && sctx->gfx_level < GFX10)
      si_decompress_dcc(sctx, src);

   /* Always flush the gfx queue to get the winsys to handle the dependencies for us. */
   si_flush_gfx_cs(sctx, 0, NULL);

   switch (sctx->gfx_level) {
   case GFX7:
   case GFX8:
      if (!cik_sdma_copy_texture(sctx, dst, src))
         return false;
      break;
   case GFX9:
   case GFX10:
   case GFX10_3:
   case GFX11:
   case GFX11_5:
      if (!si_sdma_v4_v5_copy_texture(sctx, dst, src))
         return false;
      break;
   default:
      return false;
   }

   radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &src->buffer,
                             RADEON_USAGE_READ | RADEON_PRIO_SAMPLER_TEXTURE);
   radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &dst->buffer,
                             RADEON_USAGE_WRITE | RADEON_PRIO_SAMPLER_TEXTURE);

   unsigned flags = RADEON_FLUSH_START_NEXT_GFX_IB_NOW;
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
      if ((bool)(src->buffer.flags & RADEON_FLAG_ENCRYPTED) !=
          sctx->ws->cs_is_secure(sctx->sdma_cs)) {
         flags = RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION;
      }
   }

   return ws->cs_flush(sctx->sdma_cs, flags, NULL) == 0;
}