• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2015-2021 Advanced Micro Devices, Inc.
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "si_build_pm4.h"
9 #include "sid.h"
10 #include "util/u_memory.h"
11 
12 
13 static
si_prepare_for_sdma_copy(struct si_context * sctx,struct si_texture * dst,struct si_texture * src)14 bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,struct si_texture *src)
15 {
16    if (dst->surface.bpe != src->surface.bpe)
17       return false;
18 
19    /* MSAA: Blits don't exist in the real world. */
20    if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
21       return false;
22 
23    if (dst->buffer.b.b.last_level != 0 || src->buffer.b.b.last_level != 0)
24       return false;
25 
26    return true;
27 }
28 
/**
 * Convert a base dimension into a block count for a given mip level.
 *
 * The dimension is first reduced to \p level with u_minify(), then divided by
 * the block width \p blk_w, rounding up.
 */
static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
{
   const unsigned level_width = u_minify(width, level);

   return DIV_ROUND_UP(level_width, blk_w);
}
34 
encode_legacy_tile_info(struct si_context * sctx,struct si_texture * tex)35 static unsigned encode_legacy_tile_info(struct si_context *sctx, struct si_texture *tex)
36 {
37    struct radeon_info *info = &sctx->screen->info;
38    unsigned tile_index = tex->surface.u.legacy.tiling_index[0];
39    unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
40    unsigned tile_mode = info->si_tile_mode_array[tile_index];
41    unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
42 
43    return util_logbase2(tex->surface.bpe) |
44           (G_009910_ARRAY_MODE(tile_mode) << 3) |
45           (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
46           /* Non-depth modes don't have TILE_SPLIT set. */
47           ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
48           (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
49           (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
50           (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
51           (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
52           (G_009910_PIPE_CONFIG(tile_mode) << 26);
53 }
54 
si_sdma_v4_v5_copy_texture(struct si_context * sctx,struct si_texture * sdst,struct si_texture * ssrc)55 static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_texture *sdst,
56                                        struct si_texture *ssrc)
57 {
58    bool is_v5 = sctx->gfx_level >= GFX10;
59    bool is_v5_2 = sctx->gfx_level >= GFX10_3;
60    unsigned bpp = sdst->surface.bpe;
61    uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
62    uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
63    unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
64    unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
65    unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
66    unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);
67 
68    bool tmz = (ssrc->buffer.flags & RADEON_FLAG_ENCRYPTED);
69    assert (!tmz || (sdst->buffer.flags & RADEON_FLAG_ENCRYPTED));
70 
71    /* Linear -> linear sub-window copy. */
72    if (ssrc->surface.is_linear && sdst->surface.is_linear) {
73       struct radeon_cmdbuf *cs = sctx->sdma_cs;
74 
75       uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
76       uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
77       uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);
78 
79       src_address += ssrc->surface.u.gfx9.offset[0];
80       dst_address += sdst->surface.u.gfx9.offset[0];
81 
82       radeon_begin(cs);
83       for (int i = 0; i < chunk_count; i++) {
84          uint32_t size = MIN2(chunk_size, bytes);
85          radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
86                                      SDMA_COPY_SUB_OPCODE_LINEAR,
87                                      (tmz ? 4 : 0)));
88          radeon_emit(size - 1);
89          radeon_emit(0);
90          radeon_emit(src_address);
91          radeon_emit(src_address >> 32);
92          radeon_emit(dst_address);
93          radeon_emit(dst_address >> 32);
94 
95          src_address += size;
96          dst_address += size;
97          bytes -= size;
98       }
99       radeon_end();
100       return true;
101    }
102 
103    /* Linear <-> Tiled sub-window copy */
104    if (ssrc->surface.is_linear != sdst->surface.is_linear) {
105       struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
106       struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
107       unsigned tiled_width = DIV_ROUND_UP(tiled->buffer.b.b.width0, tiled->surface.blk_w);
108       unsigned tiled_height = DIV_ROUND_UP(tiled->buffer.b.b.height0, tiled->surface.blk_h);
109       unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
110       uint64_t linear_slice_pitch = linear->surface.u.gfx9.surf_slice_size / bpp;
111       uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
112       uint64_t linear_address = linear == ssrc ? src_address : dst_address;
113       struct radeon_cmdbuf *cs = sctx->sdma_cs;
114       /* Only SDMA 5 supports DCC with SDMA */
115       bool dcc = vi_dcc_enabled(tiled, 0) && is_v5;
116       assert(tiled->buffer.b.b.depth0 == 1);
117 
118       linear_address += linear->surface.u.gfx9.offset[0];
119 
120       /* Check if everything fits into the bitfields */
121       if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) &&
122             linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
123             copy_width <= (1 << 14) && copy_height <= (1 << 14)))
124          return false;
125 
126       radeon_begin(cs);
127       radeon_emit(
128          SDMA_PACKET(SDMA_OPCODE_COPY,
129                      SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW,
130                          (tmz ? 4 : 0)) |
131          dcc << 19 |
132          (is_v5 ? 0 : tiled->buffer.b.b.last_level) << 20 |
133          (linear == sdst ? 1u : 0) << 31);
134       radeon_emit((uint32_t)tiled_address | (tiled->surface.tile_swizzle << 8));
135       radeon_emit((uint32_t)(tiled_address >> 32));
136       radeon_emit(0);
137       radeon_emit(((tiled_width - 1) << 16));
138       radeon_emit((tiled_height - 1));
139       radeon_emit(util_logbase2(bpp) |
140                   tiled->surface.u.gfx9.swizzle_mode << 3 |
141                   tiled->surface.u.gfx9.resource_type << 9 |
142                   (is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16);
143       radeon_emit((uint32_t)linear_address);
144       radeon_emit((uint32_t)(linear_address >> 32));
145       radeon_emit(0);
146       radeon_emit(((linear_pitch - 1) << 16));
147       radeon_emit(linear_slice_pitch - 1);
148       radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
149       radeon_emit(0);
150 
151       if (dcc) {
152          unsigned hw_fmt = ac_get_cb_format(sctx->gfx_level, tiled->buffer.b.b.format);
153          unsigned hw_type = ac_get_cb_number_type(tiled->buffer.b.b.format);
154          uint64_t md_address = tiled_address + tiled->surface.meta_offset;
155 
156          /* Add metadata */
157          radeon_emit((uint32_t)md_address);
158          radeon_emit((uint32_t)(md_address >> 32));
159          radeon_emit(hw_fmt |
160                      vi_alpha_is_on_msb(sctx->screen, tiled->buffer.b.b.format) << 8 |
161                      hw_type << 9 |
162                      tiled->surface.u.gfx9.color.dcc.max_compressed_block_size << 24 |
163                      V_028C78_MAX_BLOCK_SIZE_256B << 26 |
164                      tmz << 29 |
165                      tiled->surface.u.gfx9.color.dcc.pipe_aligned << 31);
166       }
167       radeon_end();
168       return true;
169    }
170 
171    return false;
172 }
173 
174 static
cik_sdma_copy_texture(struct si_context * sctx,struct si_texture * sdst,struct si_texture * ssrc)175 bool cik_sdma_copy_texture(struct si_context *sctx, struct si_texture *sdst, struct si_texture *ssrc)
176 {
177    struct radeon_info *info = &sctx->screen->info;
178    unsigned bpp = sdst->surface.bpe;
179    uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[0].offset_256B * 256;
180    uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[0].offset_256B * 256;
181    unsigned dst_mode = sdst->surface.u.legacy.level[0].mode;
182    unsigned src_mode = ssrc->surface.u.legacy.level[0].mode;
183    unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[0];
184    unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[0];
185    unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
186    unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
187    unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
188    unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
189    unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
190    unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
191    unsigned dst_pitch = sdst->surface.u.legacy.level[0].nblk_x;
192    unsigned src_pitch = ssrc->surface.u.legacy.level[0].nblk_x;
193    uint64_t dst_slice_pitch =
194       ((uint64_t)sdst->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
195    uint64_t src_slice_pitch =
196       ((uint64_t)ssrc->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
197    unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, 0, sdst->surface.blk_w);
198    unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, 0, ssrc->surface.blk_w);
199    unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
200    unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);
201 
202    dst_address |= dst_tile_swizzle << 8;
203    src_address |= src_tile_swizzle << 8;
204 
205    /* Linear -> linear sub-window copy. */
206    if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
207        /* check if everything fits into the bitfields */
208        src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
209        dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
210        /* HW limitation - GFX7: */
211        (sctx->gfx_level != GFX7 ||
212         (copy_width < (1 << 14) && copy_height < (1 << 14))) &&
213        /* HW limitation - some GFX7 parts: */
214        ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
215         (copy_width != (1 << 14) && copy_height != (1 << 14)))) {
216       struct radeon_cmdbuf *cs = sctx->sdma_cs;
217 
218       radeon_begin(cs);
219       radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
220                   (util_logbase2(bpp) << 29));
221       radeon_emit(src_address);
222       radeon_emit(src_address >> 32);
223       radeon_emit(0);
224       radeon_emit((src_pitch - 1) << 16);
225       radeon_emit(src_slice_pitch - 1);
226       radeon_emit(dst_address);
227       radeon_emit(dst_address >> 32);
228       radeon_emit(0);
229       radeon_emit((dst_pitch - 1) << 16);
230       radeon_emit(dst_slice_pitch - 1);
231       if (sctx->gfx_level == GFX7) {
232          radeon_emit(copy_width | (copy_height << 16));
233          radeon_emit(0);
234       } else {
235          radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
236          radeon_emit(0);
237       }
238       radeon_end();
239       return true;
240    }
241 
242    /* Tiled <-> linear sub-window copy. */
243    if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
244       struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
245       struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
246       unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
247       unsigned linear_width = linear == ssrc ? src_width : dst_width;
248       unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
249       unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
250       unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
251       unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
252       uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
253       uint64_t linear_address = linear == ssrc ? src_address : dst_address;
254       unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
255 
256       assert(tiled_pitch % 8 == 0);
257       assert(tiled_slice_pitch % 64 == 0);
258       unsigned pitch_tile_max = tiled_pitch / 8 - 1;
259       unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
260       unsigned xalign = MAX2(1, 4 / bpp);
261       unsigned copy_width_aligned = copy_width;
262 
263       /* If the region ends at the last pixel and is unaligned, we
264        * can copy the remainder of the line that is not visible to
265        * make it aligned.
266        */
267       if (copy_width % xalign != 0 && 0 + copy_width == linear_width &&
268           copy_width == tiled_width &&
269           align(copy_width, xalign) <= linear_pitch &&
270           align(copy_width, xalign) <= tiled_pitch)
271          copy_width_aligned = align(copy_width, xalign);
272 
273       /* HW limitations. */
274       if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
275           linear_pitch - 1 == 0x3fff && bpp == 16)
276          return false;
277 
278       if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
279            sctx->family == CHIP_KABINI) &&
280           (copy_width == (1 << 14) || copy_height == (1 << 14)))
281          return false;
282 
283       /* The hw can read outside of the given linear buffer bounds,
284        * or access those pages but not touch the memory in case
285        * of writes. (it still causes a VM fault)
286        *
287        * Out-of-bounds memory access or page directory access must
288        * be prevented.
289        */
290       int64_t start_linear_address, end_linear_address;
291       unsigned granularity;
292 
293       /* Deduce the size of reads from the linear surface. */
294       switch (tiled_micro_mode) {
295       case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
296          granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
297          break;
298       case V_009910_ADDR_SURF_THIN_MICRO_TILING:
299       case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
300          if (0 /* TODO: THICK microtiling */)
301             granularity =
302                bpp == 1 ? 32 / (8 * bpp)
303                         : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
304          else
305             granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
306          break;
307       default:
308          return false;
309       }
310 
311       /* The linear reads start at tiled_x & ~(granularity - 1).
312        * If linear_x == 0 && tiled_x % granularity != 0, the hw
313        * starts reading from an address preceding linear_address!!!
314        */
315       start_linear_address =
316          (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256;
317 
318       end_linear_address =
319          (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256 +
320          bpp * ((copy_height - 1) * (uint64_t)linear_pitch + copy_width);
321 
322       if ((0 + copy_width) % granularity)
323          end_linear_address += granularity - (0 + copy_width) % granularity;
324 
325       if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
326          return false;
327 
328       /* Check requirements. */
329       if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
330           copy_width_aligned % xalign == 0 &&
331           tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
332           /* check if everything fits into the bitfields */
333           tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
334           slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
335           linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
336           copy_height <= (1 << 14)) {
337          struct radeon_cmdbuf *cs = sctx->sdma_cs;
338          uint32_t direction = linear == sdst ? 1u << 31 : 0;
339 
340          radeon_begin(cs);
341          radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
342                                  SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
343                      direction);
344          radeon_emit(tiled_address);
345          radeon_emit(tiled_address >> 32);
346          radeon_emit(0);
347          radeon_emit(pitch_tile_max << 16);
348          radeon_emit(slice_tile_max);
349          radeon_emit(encode_legacy_tile_info(sctx, tiled));
350          radeon_emit(linear_address);
351          radeon_emit(linear_address >> 32);
352          radeon_emit(0);
353          radeon_emit(((linear_pitch - 1) << 16));
354          radeon_emit(linear_slice_pitch - 1);
355          if (sctx->gfx_level == GFX7) {
356             radeon_emit(copy_width_aligned | (copy_height << 16));
357             radeon_emit(1);
358          } else {
359             radeon_emit((copy_width_aligned - 1) | ((copy_height - 1) << 16));
360             radeon_emit(0);
361          }
362          radeon_end();
363          return true;
364       }
365    }
366 
367    return false;
368 }
369 
si_sdma_copy_image(struct si_context * sctx,struct si_texture * dst,struct si_texture * src)370 bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct si_texture *src)
371 {
372    struct radeon_winsys *ws = sctx->ws;
373 
374    if (!sctx->sdma_cs) {
375       if (sctx->screen->debug_flags & DBG(NO_DMA) || sctx->gfx_level < GFX7)
376          return false;
377 
378       sctx->sdma_cs = CALLOC_STRUCT(radeon_cmdbuf);
379       if (ws->cs_create(sctx->sdma_cs, sctx->ctx, AMD_IP_SDMA, NULL, NULL))
380          return false;
381    }
382 
383    if (!si_prepare_for_sdma_copy(sctx, dst, src))
384       return false;
385 
386    /* TODO: DCC compression is possible on GFX10+. See si_set_mutable_tex_desc_fields for
387     * additional constraints.
388     * For now, the only use-case of SDMA is DRI_PRIME tiled->linear copy, and linear dst
389     * never has DCC.
390     */
391    if (vi_dcc_enabled(dst, 0))
392       return false;
393 
394    /* Decompress DCC on older chips where SDMA can't read it. */
395    if (vi_dcc_enabled(src, 0) && sctx->gfx_level < GFX10)
396       si_decompress_dcc(sctx, src);
397 
398    /* Always flush the gfx queue to get the winsys to handle the dependencies for us. */
399    si_flush_gfx_cs(sctx, 0, NULL);
400 
401    switch (sctx->gfx_level) {
402       case GFX7:
403       case GFX8:
404          if (!cik_sdma_copy_texture(sctx, dst, src))
405             return false;
406          break;
407       case GFX9:
408       case GFX10:
409       case GFX10_3:
410       case GFX11:
411       case GFX11_5:
412          if (!si_sdma_v4_v5_copy_texture(sctx, dst, src))
413             return false;
414          break;
415       default:
416          return false;
417    }
418 
419    radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &src->buffer, RADEON_USAGE_READ |
420                              RADEON_PRIO_SAMPLER_TEXTURE);
421    radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &dst->buffer, RADEON_USAGE_WRITE |
422                              RADEON_PRIO_SAMPLER_TEXTURE);
423 
424    unsigned flags = RADEON_FLUSH_START_NEXT_GFX_IB_NOW;
425    if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
426       if ((bool) (src->buffer.flags & RADEON_FLAG_ENCRYPTED) !=
427           sctx->ws->cs_is_secure(sctx->sdma_cs)) {
428          flags = RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION;
429       }
430    }
431 
432    return ws->cs_flush(sctx->sdma_cs, flags, NULL) == 0;
433 }
434