/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2015-2021 Advanced Micro Devices, Inc.
 * Copyright 2023 Valve Corporation
 * All Rights Reserved.
 *
 * SPDX-License-Identifier: MIT
 */

#include "radv_sdma.h"
#include "util/macros.h"
#include "util/u_memory.h"
#include "radv_buffer.h"
#include "radv_cs.h"
#include "radv_formats.h"

#include "ac_formats.h"

struct radv_sdma_chunked_copy_info {
   unsigned extent_horizontal_blocks;
   unsigned extent_vertical_blocks;
   unsigned aligned_row_pitch;
   unsigned num_rows_per_copy;
};

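/* Copy alignment requirements (in blocks) of tiled-to-tiled copy packets,
 * indexed by util_logbase2(bpp).
 */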
static const VkExtent3D radv_sdma_t2t_alignment_2d_and_planar[] = {
   {16, 16, 1}, /* 1 bpp */
   {16, 8, 1},  /* 2 bpp */
   {8, 8, 1},   /* 4 bpp */
   {8, 4, 1},   /* 8 bpp */
   {4, 4, 1},   /* 16 bpp */
};

static const VkExtent3D radv_sdma_t2t_alignment_3d[] = {
   {8, 4, 8}, /* 1 bpp */
   {4, 4, 8}, /* 2 bpp */
   {4, 4, 4}, /* 4 bpp */
   {4, 2, 4}, /* 8 bpp */
   {2, 2, 4}, /* 16 bpp */
};

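/* Pitch alignment requirement (in blocks) of linear surfaces.
 * On SDMA v5.0+ this effectively means the row pitch in bytes must be
 * dword aligned, e.g. 4 blocks for 1 bpp but just 1 block for 4 bpp and
 * wider formats. Older versions always require an alignment of 4 blocks.
 */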
ALWAYS_INLINE static unsigned
radv_sdma_pitch_alignment(const struct radv_device *device, const unsigned bpp)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.sdma_ip_version >= SDMA_5_0)
      return MAX2(1, 4 / bpp);

   return 4;
}

ALWAYS_INLINE static void
radv_sdma_check_pitches(const unsigned pitch, const unsigned slice_pitch, const unsigned bpp, const bool uses_depth)
{
   ASSERTED const unsigned pitch_alignment = MAX2(1, 4 / bpp);
   assert(pitch);
   assert(pitch <= (1 << 14));
   assert(util_is_aligned(pitch, pitch_alignment));

   if (uses_depth) {
      ASSERTED const unsigned slice_pitch_alignment = 4;
      assert(slice_pitch);
      assert(slice_pitch <= (1 << 28));
      assert(util_is_aligned(slice_pitch, slice_pitch_alignment));
   }
}

ALWAYS_INLINE static enum gfx9_resource_type
radv_sdma_surface_resource_type(const struct radv_device *const device, const struct radeon_surf *const surf)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.sdma_ip_version >= SDMA_5_0) {
      /* Use the 2D resource type for rotated or Z swizzles. */
      if ((surf->u.gfx9.resource_type == RADEON_RESOURCE_1D || surf->u.gfx9.resource_type == RADEON_RESOURCE_3D) &&
          (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER || surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH))
         return RADEON_RESOURCE_2D;
   }

   return surf->u.gfx9.resource_type;
}

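/* Surface type as expected by the metadata config:
 * 0 = color, 1 = depth, 2 = stencil.
 */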
ALWAYS_INLINE static uint32_t
radv_sdma_surface_type_from_aspect_mask(const VkImageAspectFlags aspectMask)
{
   if (aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
      return 1;
   else if (aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
      return 2;

   return 0;
}

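/* Helpers that convert pixel-based offsets, extents and pitches into
 * block-based units, which the copy packets expect for block-compressed
 * formats (blk_w and blk_h are 1 for uncompressed formats).
 */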
ALWAYS_INLINE static VkExtent3D
radv_sdma_pixel_extent_to_blocks(const VkExtent3D extent, const unsigned blk_w, const unsigned blk_h)
{
   const VkExtent3D r = {
      .width = DIV_ROUND_UP(extent.width, blk_w),
      .height = DIV_ROUND_UP(extent.height, blk_h),
      .depth = extent.depth,
   };

   return r;
}

ALWAYS_INLINE static VkOffset3D
radv_sdma_pixel_offset_to_blocks(const VkOffset3D offset, const unsigned blk_w, const unsigned blk_h)
{
   const VkOffset3D r = {
      .x = DIV_ROUND_UP(offset.x, blk_w),
      .y = DIV_ROUND_UP(offset.y, blk_h),
      .z = offset.z,
   };

   return r;
}

ALWAYS_INLINE static unsigned
radv_sdma_pixels_to_blocks(const unsigned linear_pitch, const unsigned blk_w)
{
   return DIV_ROUND_UP(linear_pitch, blk_w);
}

ALWAYS_INLINE static unsigned
radv_sdma_pixel_area_to_blocks(const unsigned linear_slice_pitch, const unsigned blk_w, const unsigned blk_h)
{
   return DIV_ROUND_UP(DIV_ROUND_UP(linear_slice_pitch, blk_w), blk_h);
}

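/* Calculate how to split a copy into chunks of whole rows that fit into
 * a temporary buffer of RADV_SDMA_TRANSFER_TEMP_BYTES.
 * Note that util_next_power_of_two(n + 1) / 2 rounds n down to the
 * nearest power of two.
 */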
static struct radv_sdma_chunked_copy_info
radv_sdma_get_chunked_copy_info(const struct radv_device *const device, const struct radv_sdma_surf *const img,
                                const VkExtent3D extent)
{
   const unsigned extent_horizontal_blocks = DIV_ROUND_UP(extent.width, img->blk_w);
   const unsigned extent_vertical_blocks = DIV_ROUND_UP(extent.height, img->blk_h);
   const unsigned aligned_row_pitch = ALIGN(extent_horizontal_blocks, 4);
   const unsigned aligned_row_bytes = aligned_row_pitch * img->bpp;

   /* Assume that we can always copy at least one full row at a time. */
   const unsigned max_num_rows_per_copy = MIN2(RADV_SDMA_TRANSFER_TEMP_BYTES / aligned_row_bytes, extent.height);
   assert(max_num_rows_per_copy);

   /* Ensure that the number of rows copied at a time is a power of two. */
   const unsigned num_rows_per_copy = MAX2(1, util_next_power_of_two(max_num_rows_per_copy + 1) / 2);

   const struct radv_sdma_chunked_copy_info r = {
      .extent_horizontal_blocks = extent_horizontal_blocks,
      .extent_vertical_blocks = extent_vertical_blocks,
      .aligned_row_pitch = aligned_row_pitch,
      .num_rows_per_copy = num_rows_per_copy,
   };

   return r;
}

static uint32_t
radv_sdma_get_bpe(const struct radv_image *const image, VkImageAspectFlags aspect_mask)
{
   const unsigned plane_idx = radv_plane_from_aspect(aspect_mask);
   const struct radeon_surf *surf = &image->planes[plane_idx].surface;
   const bool is_stencil_only = aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT;

   return is_stencil_only ? 1 : surf->bpe;
}

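/* Describe the buffer side of a buffer <-> image copy as a linear surface.
 * When bufferRowLength or bufferImageHeight are zero, the buffer is tightly
 * packed according to the image extent, as per the Vulkan spec.
 */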
struct radv_sdma_surf
radv_sdma_get_buf_surf(const struct radv_buffer *const buffer, const struct radv_image *const image,
                       const VkBufferImageCopy2 *const region, const VkImageAspectFlags aspect_mask)
{
   assert(util_bitcount(aspect_mask) == 1);

   const unsigned pitch = (region->bufferRowLength ? region->bufferRowLength : region->imageExtent.width);
   const unsigned slice_pitch =
      (region->bufferImageHeight ? region->bufferImageHeight : region->imageExtent.height) * pitch;

   const unsigned plane_idx = radv_plane_from_aspect(region->imageSubresource.aspectMask);
   const struct radeon_surf *surf = &image->planes[plane_idx].surface;
   const uint32_t bpe = radv_sdma_get_bpe(image, region->imageSubresource.aspectMask);

   const struct radv_sdma_surf info = {
      .va = radv_buffer_get_va(buffer->bo) + buffer->offset + region->bufferOffset,
      .pitch = pitch,
      .slice_pitch = slice_pitch,
      .bpp = bpe,
      .blk_w = surf->blk_w,
      .blk_h = surf->blk_h,
      .is_linear = true,
   };

   return info;
}

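/* Pack the metadata config dword of compressed copy packets:
 * data_format (bit 0), alpha_is_on_msb (bit 8), number_type (bit 9),
 * surface_type (bit 12), max_comp_block_size (bit 24),
 * max_uncomp_block_size (bit 26), pipe_aligned (bit 31).
 * Returns 0 when there is no metadata that SDMA could use.
 */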
static uint32_t
radv_sdma_get_metadata_config(const struct radv_device *const device, const struct radv_image *const image,
                              const struct radeon_surf *const surf, const VkImageSubresourceLayers subresource,
                              const VkImageAspectFlags aspect_mask)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (!pdev->info.sdma_supports_compression ||
       !(radv_dcc_enabled(image, subresource.mipLevel) || radv_image_has_htile(image))) {
      return 0;
   }

   const VkFormat format = vk_format_get_aspect_format(image->vk.format, aspect_mask);
   const struct util_format_description *desc = vk_format_description(format);

   const uint32_t data_format = ac_get_cb_format(pdev->info.gfx_level, radv_format_to_pipe_format(format));
   const uint32_t alpha_is_on_msb = ac_alpha_is_on_msb(&pdev->info, radv_format_to_pipe_format(format));
   const uint32_t number_type = radv_translate_buffer_numformat(desc, vk_format_get_first_non_void_channel(format));
   const uint32_t surface_type = radv_sdma_surface_type_from_aspect_mask(aspect_mask);
   const uint32_t max_comp_block_size = surf->u.gfx9.color.dcc.max_compressed_block_size;
   const uint32_t max_uncomp_block_size = radv_get_dcc_max_uncompressed_block_size(device, image);
   const uint32_t pipe_aligned = surf->u.gfx9.color.dcc.pipe_aligned;

   return data_format | alpha_is_on_msb << 8 | number_type << 9 | surface_type << 12 | max_comp_block_size << 24 |
          max_uncomp_block_size << 26 | pipe_aligned << 31;
}

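/* Pack the tiled surface info dword of tiled copy packets.
 * All SDMA versions store the element size (log2 of bpe) in the low bits
 * and the swizzle mode at bit 3; the position of the dimension and mip
 * fields varies between SDMA v4, v5 and v7.
 */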
static uint32_t
radv_sdma_get_tiled_info_dword(const struct radv_device *const device, const struct radv_image *const image,
                               const struct radeon_surf *const surf, const VkImageSubresourceLayers subresource)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const uint32_t bpe = radv_sdma_get_bpe(image, subresource.aspectMask);
   const uint32_t element_size = util_logbase2(bpe);
   const uint32_t swizzle_mode = surf->has_stencil ? surf->u.gfx9.zs.stencil_swizzle_mode : surf->u.gfx9.swizzle_mode;
   const enum gfx9_resource_type dimension = radv_sdma_surface_resource_type(device, surf);
   uint32_t info = element_size | swizzle_mode << 3;
   const enum sdma_version ver = pdev->info.sdma_ip_version;
   const uint32_t mip_max = MAX2(image->vk.mip_levels, 1);
   const uint32_t mip_id = subresource.mipLevel;

   if (ver >= SDMA_7_0) {
      return info | (mip_max - 1) << 16 | mip_id << 24;
   } else if (ver >= SDMA_5_0) {
      return info | dimension << 9 | (mip_max - 1) << 16 | mip_id << 20;
   } else if (ver >= SDMA_4_0) {
      return info | dimension << 9 | surf->u.gfx9.epitch << 16;
   } else {
      unreachable("unsupported SDMA version");
   }
}

static uint32_t
radv_sdma_get_tiled_header_dword(const struct radv_device *const device, const struct radv_image *const image,
                                 const VkImageSubresourceLayers subresource)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum sdma_version ver = pdev->info.sdma_ip_version;

   if (ver >= SDMA_5_0) {
      return 0;
   } else if (ver >= SDMA_4_0) {
      const uint32_t mip_max = MAX2(image->vk.mip_levels, 1);
      const uint32_t mip_id = subresource.mipLevel;
      return (mip_max - 1) << 20 | mip_id << 24;
   } else {
      unreachable("unsupported SDMA version");
   }
}

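/* Describe an image subresource for SDMA copies.
 * Linear surfaces only need the address and pitches, while tiled surfaces
 * also need the packed info and header dwords, and the metadata address
 * and config when the engine can access the image metadata.
 */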
struct radv_sdma_surf
radv_sdma_get_surf(const struct radv_device *const device, const struct radv_image *const image,
                   const VkImageSubresourceLayers subresource, const VkOffset3D offset,
                   const VkImageAspectFlags aspect_mask)
{
   assert(util_bitcount(aspect_mask) == 1);

   const struct radv_physical_device *pdev = radv_device_physical(device);
   const unsigned plane_idx = radv_plane_from_aspect(aspect_mask);
   const unsigned binding_idx = image->disjoint ? plane_idx : 0;
   const struct radeon_surf *const surf = &image->planes[plane_idx].surface;
   const uint64_t va = radv_image_get_va(image, binding_idx);
   const uint32_t bpe = radv_sdma_get_bpe(image, aspect_mask);
   struct radv_sdma_surf info = {
      .extent =
         {
            .width = vk_format_get_plane_width(image->vk.format, plane_idx, image->vk.extent.width),
            .height = vk_format_get_plane_height(image->vk.format, plane_idx, image->vk.extent.height),
            .depth = image->vk.image_type == VK_IMAGE_TYPE_3D ? image->vk.extent.depth : image->vk.array_layers,
         },
      .offset =
         {
            .x = offset.x,
            .y = offset.y,
            .z = image->vk.image_type == VK_IMAGE_TYPE_3D ? offset.z : subresource.baseArrayLayer,
         },
      .bpp = bpe,
      .blk_w = surf->blk_w,
      .blk_h = surf->blk_h,
      .mip_levels = image->vk.mip_levels,
      .micro_tile_mode = surf->micro_tile_mode,
      .is_linear = surf->is_linear,
      .is_3d = surf->u.gfx9.resource_type == RADEON_RESOURCE_3D,
   };

   const uint64_t surf_offset =
      (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) ? surf->u.gfx9.zs.stencil_offset : surf->u.gfx9.surf_offset;

   if (surf->is_linear) {
      info.va = va + surf_offset + surf->u.gfx9.offset[subresource.mipLevel];
      info.pitch = surf->u.gfx9.pitch[subresource.mipLevel];
      info.slice_pitch = surf->blk_w * surf->blk_h * surf->u.gfx9.surf_slice_size / bpe;
   } else {
      /* 1D resources should be linear. */
      assert(surf->u.gfx9.resource_type != RADEON_RESOURCE_1D);

      info.va = (va + surf_offset) | surf->tile_swizzle << 8;

      info.info_dword = radv_sdma_get_tiled_info_dword(device, image, surf, subresource);
      info.header_dword = radv_sdma_get_tiled_header_dword(device, image, subresource);

      if (pdev->info.sdma_supports_compression &&
          (radv_dcc_enabled(image, subresource.mipLevel) || radv_image_has_htile(image))) {
         info.meta_va = va + surf->meta_offset;
         info.meta_config = radv_sdma_get_metadata_config(device, image, surf, subresource, aspect_mask);
      }
   }

   return info;
}

static void
radv_sdma_emit_nop(const struct radv_device *device, struct radeon_cmdbuf *cs)
{
   /* SDMA NOP acts as a fence command and causes the SDMA engine to wait for pending copy operations. */
   radeon_check_space(device->ws, cs, 1);
   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
}

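/* Copy linear memory using as few packets as possible.
 * For example, assuming dword-aligned src_va and dst_va and size = 0x1003,
 * this emits one packet that copies 0x1000 bytes using the faster dword
 * copy mode, and another packet that copies the remaining 3 bytes.
 */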
void
radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va, uint64_t dst_va,
                      uint64_t size)
{
   if (size == 0)
      return;

   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum sdma_version ver = pdev->info.sdma_ip_version;
   const unsigned max_size_per_packet = ver >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;

   unsigned align = ~0u;
   unsigned ncopy = DIV_ROUND_UP(size, max_size_per_packet);

   assert(ver >= SDMA_2_0);

   /* SDMA FW automatically enables a faster dword copy mode when
    * source, destination and size are all dword-aligned.
    *
    * When source and destination are dword-aligned, round down the size to
    * take advantage of faster dword copy, and copy the remaining few bytes
    * with the last copy packet.
    */
   if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
      align = ~0x3u;
      ncopy++;
   }

   radeon_check_space(device->ws, cs, ncopy * 7);

   for (unsigned i = 0; i < ncopy; i++) {
      unsigned csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
      radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
      radeon_emit(cs, ver >= SDMA_4_0 ? csize - 1 : csize);
      radeon_emit(cs, 0); /* src/dst endian swap */
      radeon_emit(cs, src_va);
      radeon_emit(cs, src_va >> 32);
      radeon_emit(cs, dst_va);
      radeon_emit(cs, dst_va >> 32);
      dst_va += csize;
      src_va += csize;
      size -= csize;
   }
}

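/* Fill a memory region with a dword value.
 * The fill is performed in dword units, but the count field of each packet
 * must be programmed in bytes.
 */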
void
radv_sdma_fill_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, const uint64_t va,
                      const uint64_t size, const uint32_t value)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   const uint32_t fill_size = 2; /* This means that the count is in dwords. */
   const uint32_t constant_fill_header = SDMA_PACKET(SDMA_OPCODE_CONSTANT_FILL, 0, 0) | (fill_size & 0x3) << 30;

   /* This packet is the same since SDMA v2.4, haven't bothered to check older versions. */
   const enum sdma_version ver = pdev->info.sdma_ip_version;
   assert(ver >= SDMA_2_4);

   /* Maximum allowed fill size depends on the GPU.
    * Emit as many packets as necessary to fill all the bytes we need.
    */
   const uint64_t max_fill_bytes = BITFIELD64_MASK(ver >= SDMA_6_0 ? 30 : 22) & ~0x3;
   const unsigned num_packets = DIV_ROUND_UP(size, max_fill_bytes);
   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, num_packets * 5);

   for (unsigned i = 0; i < num_packets; ++i) {
      const uint64_t offset = i * max_fill_bytes;
      const uint64_t fill_bytes = MIN2(size - offset, max_fill_bytes);
      const uint64_t fill_va = va + offset;

      radeon_emit(cs, constant_fill_header);
      radeon_emit(cs, fill_va);
      radeon_emit(cs, fill_va >> 32);
      radeon_emit(cs, value);
      radeon_emit(cs, fill_bytes - 1); /* Must be programmed in bytes, even if the fill is done in dwords. */
   }

   assert(cs->cdw <= cdw_max);
}

static void
radv_sdma_emit_copy_linear_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                      const struct radv_sdma_surf *const src, const struct radv_sdma_surf *const dst,
                                      const VkExtent3D pix_extent)
{
   /* This packet is the same since SDMA v2.4, haven't bothered to check older versions.
    * The main difference is the bitfield sizes:
    *
    * v2.4 - src/dst_pitch: 14 bits, rect_z: 11 bits
    * v4.0 - src/dst_pitch: 19 bits, rect_z: 11 bits
    * v5.0 - src/dst_pitch: 19 bits, rect_z: 13 bits
    *
    * We currently use the smallest limits (from SDMA v2.4).
    */

   const struct radv_physical_device *pdev = radv_device_physical(device);
   const VkOffset3D src_off = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_off = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(pix_extent, src->blk_w, src->blk_h);
   const unsigned src_pitch = radv_sdma_pixels_to_blocks(src->pitch, src->blk_w);
   const unsigned dst_pitch = radv_sdma_pixels_to_blocks(dst->pitch, dst->blk_w);
   const unsigned src_slice_pitch = radv_sdma_pixel_area_to_blocks(src->slice_pitch, src->blk_w, src->blk_h);
   const unsigned dst_slice_pitch = radv_sdma_pixel_area_to_blocks(dst->slice_pitch, dst->blk_w, dst->blk_h);
   const enum sdma_version ver = pdev->info.sdma_ip_version;

   assert(src->bpp == dst->bpp);
   assert(util_is_power_of_two_nonzero(src->bpp));
   radv_sdma_check_pitches(src->pitch, src->slice_pitch, src->bpp, false);
   radv_sdma_check_pitches(dst->pitch, dst->slice_pitch, dst->bpp, false);

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 13);

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | util_logbase2(src->bpp)
                                                                                                 << 29);
   radeon_emit(cs, src->va);
   radeon_emit(cs, src->va >> 32);
   radeon_emit(cs, src_off.x | src_off.y << 16);
   radeon_emit(cs, src_off.z | (src_pitch - 1) << (ver >= SDMA_7_0 ? 16 : 13));
   radeon_emit(cs, src_slice_pitch - 1);
   radeon_emit(cs, dst->va);
   radeon_emit(cs, dst->va >> 32);
   radeon_emit(cs, dst_off.x | dst_off.y << 16);
   radeon_emit(cs, dst_off.z | (dst_pitch - 1) << (ver >= SDMA_7_0 ? 16 : 13));
   radeon_emit(cs, dst_slice_pitch - 1);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   assert(cs->cdw == cdw_end);
}

static void
radv_sdma_emit_copy_tiled_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                     const struct radv_sdma_surf *const tiled,
                                     const struct radv_sdma_surf *const linear, const VkExtent3D pix_extent,
                                     const bool detile)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (!pdev->info.sdma_supports_compression) {
      assert(!tiled->meta_va);
   }

   const VkOffset3D linear_off = radv_sdma_pixel_offset_to_blocks(linear->offset, linear->blk_w, linear->blk_h);
   const VkOffset3D tiled_off = radv_sdma_pixel_offset_to_blocks(tiled->offset, tiled->blk_w, tiled->blk_h);
   const VkExtent3D tiled_ext = radv_sdma_pixel_extent_to_blocks(tiled->extent, tiled->blk_w, tiled->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(pix_extent, tiled->blk_w, tiled->blk_h);
   const unsigned linear_pitch = radv_sdma_pixels_to_blocks(linear->pitch, tiled->blk_w);
   const unsigned linear_slice_pitch = radv_sdma_pixel_area_to_blocks(linear->slice_pitch, tiled->blk_w, tiled->blk_h);
   const bool dcc = !!tiled->meta_va;
   const bool uses_depth = linear_off.z != 0 || tiled_off.z != 0 || ext.depth != 1;

   assert(util_is_power_of_two_nonzero(tiled->bpp));
   radv_sdma_check_pitches(linear_pitch, linear_slice_pitch, tiled->bpp, uses_depth);

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 14 + (dcc ? 3 : 0));

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | dcc << 19 | detile << 31 |
                      tiled->header_dword);
   radeon_emit(cs, tiled->va);
   radeon_emit(cs, tiled->va >> 32);
   radeon_emit(cs, tiled_off.x | tiled_off.y << 16);
   radeon_emit(cs, tiled_off.z | (tiled_ext.width - 1) << 16);
   radeon_emit(cs, (tiled_ext.height - 1) | (tiled_ext.depth - 1) << 16);
   radeon_emit(cs, tiled->info_dword);
   radeon_emit(cs, linear->va);
   radeon_emit(cs, linear->va >> 32);
   radeon_emit(cs, linear_off.x | linear_off.y << 16);
   radeon_emit(cs, linear_off.z | (linear_pitch - 1) << 16);
   radeon_emit(cs, linear_slice_pitch - 1);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   if (tiled->meta_va) {
      const unsigned write_compress_enable = !detile;
      radeon_emit(cs, tiled->meta_va);
      radeon_emit(cs, tiled->meta_va >> 32);
      radeon_emit(cs, tiled->meta_config | write_compress_enable << 28);
   }

   assert(cs->cdw == cdw_end);
}

static void
radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                   const struct radv_sdma_surf *const src, const struct radv_sdma_surf *const dst,
                                   const VkExtent3D px_extent)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   /* We currently only support the SDMA v4+ versions of this packet. */
   assert(pdev->info.sdma_ip_version >= SDMA_4_0);

   /* On GFX10+ this supports DCC, but cannot copy a compressed surface to another compressed surface. */
   assert(!src->meta_va || !dst->meta_va);

   if (pdev->info.sdma_ip_version >= SDMA_4_0 && pdev->info.sdma_ip_version < SDMA_5_0) {
      /* SDMA v4 doesn't support mip_id selection in the T2T copy packet. */
      assert(src->header_dword >> 24 == 0);
      assert(dst->header_dword >> 24 == 0);
      /* SDMA v4 doesn't support any image metadata. */
      assert(!src->meta_va);
      assert(!dst->meta_va);
   }

   /* Despite the name, this can indicate DCC or HTILE metadata. */
   const uint32_t dcc = src->meta_va || dst->meta_va;
   /* 0 = compress (src is uncompressed), 1 = decompress (src is compressed). */
   const uint32_t dcc_dir = src->meta_va && !dst->meta_va;

   const VkOffset3D src_off = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_off = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);
   const VkExtent3D src_ext = radv_sdma_pixel_extent_to_blocks(src->extent, src->blk_w, src->blk_h);
   const VkExtent3D dst_ext = radv_sdma_pixel_extent_to_blocks(dst->extent, dst->blk_w, dst->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(px_extent, src->blk_w, src->blk_h);

   assert(util_is_power_of_two_nonzero(src->bpp));
   assert(util_is_power_of_two_nonzero(dst->bpp));

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 15 + (dcc ? 3 : 0));

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0) | dcc << 19 | dcc_dir << 31 |
                      src->header_dword);
   radeon_emit(cs, src->va);
   radeon_emit(cs, src->va >> 32);
   radeon_emit(cs, src_off.x | src_off.y << 16);
   radeon_emit(cs, src_off.z | (src_ext.width - 1) << 16);
   radeon_emit(cs, (src_ext.height - 1) | (src_ext.depth - 1) << 16);
   radeon_emit(cs, src->info_dword);
   radeon_emit(cs, dst->va);
   radeon_emit(cs, dst->va >> 32);
   radeon_emit(cs, dst_off.x | dst_off.y << 16);
   radeon_emit(cs, dst_off.z | (dst_ext.width - 1) << 16);
   radeon_emit(cs, (dst_ext.height - 1) | (dst_ext.depth - 1) << 16);
   radeon_emit(cs, dst->info_dword);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   if (dst->meta_va) {
      const uint32_t write_compress_enable = 1;
      radeon_emit(cs, dst->meta_va);
      radeon_emit(cs, dst->meta_va >> 32);
      radeon_emit(cs, dst->meta_config | write_compress_enable << 28);
   } else if (src->meta_va) {
      radeon_emit(cs, src->meta_va);
      radeon_emit(cs, src->meta_va >> 32);
      radeon_emit(cs, src->meta_config);
   }

   assert(cs->cdw == cdw_end);
}

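/* Copy between a buffer and an image whose pitches satisfy the alignment
 * rules of the copy packets (see radv_sdma_use_unaligned_buffer_image_copy).
 */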
void
radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdbuf *cs,
                            const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img, const VkExtent3D extent,
                            bool to_image)
{
   if (img->is_linear) {
      if (to_image)
         radv_sdma_emit_copy_linear_sub_window(device, cs, buf, img, extent);
      else
         radv_sdma_emit_copy_linear_sub_window(device, cs, img, buf, extent);
   } else {
      radv_sdma_emit_copy_tiled_sub_window(device, cs, img, buf, extent, !to_image);
   }
}

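/* Determine whether the buffer pitches violate the alignment rules of the
 * copy packets, in which case the copy must take the slower chunked path.
 */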
bool
radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_sdma_surf *buf,
                                          const struct radv_sdma_surf *img, const VkExtent3D ext)
{
   const unsigned pitch_blocks = radv_sdma_pixels_to_blocks(buf->pitch, img->blk_w);
   if (!util_is_aligned(pitch_blocks, radv_sdma_pitch_alignment(device, img->bpp)))
      return true;

   const bool uses_depth = img->offset.z != 0 || ext.depth != 1;
   if (!img->is_linear && uses_depth) {
      const unsigned slice_pitch_blocks = radv_sdma_pixel_area_to_blocks(buf->slice_pitch, img->blk_w, img->blk_h);
      if (!util_is_aligned(slice_pitch_blocks, 4))
         return true;
   }

   return false;
}

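/* Copy between a buffer with unaligned pitches and an image, going through
 * a temporary buffer with aligned pitches. Rows are copied in chunks (see
 * radv_sdma_get_chunked_copy_info), with a NOP between dependent copies
 * because the engine doesn't implicitly wait for previous copies before
 * starting new ones that reuse the temporary buffer.
 */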
void
radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                      const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img_in,
                                      const VkExtent3D base_extent, struct radeon_winsys_bo *temp_bo, bool to_image)
{
   const struct radv_sdma_chunked_copy_info info = radv_sdma_get_chunked_copy_info(device, img_in, base_extent);
   struct radv_sdma_surf img = *img_in;
   struct radv_sdma_surf tmp = {
      .va = temp_bo->va,
      .bpp = img.bpp,
      .blk_w = img.blk_w,
      .blk_h = img.blk_h,
      .pitch = info.aligned_row_pitch * img.blk_w,
      .slice_pitch = info.aligned_row_pitch * img.blk_w * info.extent_vertical_blocks * img.blk_h,
   };

   VkExtent3D extent = base_extent;
   const unsigned buf_pitch_blocks = DIV_ROUND_UP(buf->pitch, img.blk_w);
   const unsigned buf_slice_pitch_blocks = DIV_ROUND_UP(DIV_ROUND_UP(buf->slice_pitch, img.blk_w), img.blk_h);
   assert(buf_pitch_blocks);
   assert(buf_slice_pitch_blocks);
   extent.depth = 1;

   for (unsigned slice = 0; slice < base_extent.depth; ++slice) {
      for (unsigned row = 0; row < info.extent_vertical_blocks; row += info.num_rows_per_copy) {
         const unsigned rows = MIN2(info.extent_vertical_blocks - row, info.num_rows_per_copy);

         img.offset.y = img_in->offset.y + row * img.blk_h;
         img.offset.z = img_in->offset.z + slice;
         extent.height = rows * img.blk_h;
         tmp.slice_pitch = tmp.pitch * rows * img.blk_h;

         if (!to_image) {
            /* Copy the rows from the source image to the temporary buffer. */
            if (img.is_linear)
               radv_sdma_emit_copy_linear_sub_window(device, cs, &img, &tmp, extent);
            else
               radv_sdma_emit_copy_tiled_sub_window(device, cs, &img, &tmp, extent, true);

            /* Wait for the copy to finish. */
            radv_sdma_emit_nop(device, cs);
         }

         /* buffer to image: copy each row from source buffer to temporary buffer.
          * image to buffer: copy each row from temporary buffer to destination buffer.
          */
         for (unsigned r = 0; r < rows; ++r) {
            const uint64_t buf_va =
               buf->va + slice * buf_slice_pitch_blocks * img.bpp + (row + r) * buf_pitch_blocks * img.bpp;
            const uint64_t tmp_va = tmp.va + r * info.aligned_row_pitch * img.bpp;
            radv_sdma_copy_buffer(device, cs, to_image ? buf_va : tmp_va, to_image ? tmp_va : buf_va,
                                  info.extent_horizontal_blocks * img.bpp);
         }

         /* Wait for the copy to finish. */
         radv_sdma_emit_nop(device, cs);

         if (to_image) {
            /* Copy the rows from the temporary buffer to the destination image. */
            if (img.is_linear)
               radv_sdma_emit_copy_linear_sub_window(device, cs, &tmp, &img, extent);
            else
               radv_sdma_emit_copy_tiled_sub_window(device, cs, &img, &tmp, extent, false);

            /* Wait for the copy to finish. */
            radv_sdma_emit_nop(device, cs);
         }
      }
   }
}

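/* Copy a sub-window between two images, using the packet that matches the
 * source and destination layouts: linear to linear, linear to/from tiled,
 * or tiled to tiled.
 */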
void
radv_sdma_copy_image(const struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_sdma_surf *src,
                     const struct radv_sdma_surf *dst, const VkExtent3D extent)
{
   if (src->is_linear) {
      if (dst->is_linear) {
         radv_sdma_emit_copy_linear_sub_window(device, cs, src, dst, extent);
      } else {
         radv_sdma_emit_copy_tiled_sub_window(device, cs, dst, src, extent, false);
      }
   } else {
      if (dst->is_linear) {
         radv_sdma_emit_copy_tiled_sub_window(device, cs, src, dst, extent, true);
      } else {
         radv_sdma_emit_copy_t2t_sub_window(device, cs, src, dst, extent);
      }
   }
}

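/* Return true when the copy can't be done with a single T2T sub-window
 * packet and must be split into scanline copies through a temporary linear
 * buffer (see radv_sdma_copy_image_t2t_scanline).
 */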
bool
radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct radv_sdma_surf *src,
                                const struct radv_sdma_surf *dst, const VkExtent3D extent)
{
   /* These need a linear-to-linear / linear-to-tiled copy. */
   if (src->is_linear || dst->is_linear)
      return false;

   /* SDMA can't do format conversion. */
   assert(src->bpp == dst->bpp);

   const struct radv_physical_device *pdev = radv_device_physical(device);
   const enum sdma_version ver = pdev->info.sdma_ip_version;
   if (ver < SDMA_5_0) {
      /* SDMA v4.x and older don't support proper mip level selection. */
      if (src->mip_levels > 1 || dst->mip_levels > 1)
         return true;
   }

   /* The two images can have a different block size,
    * but must have the same swizzle mode.
    */
   if (src->micro_tile_mode != dst->micro_tile_mode)
      return true;

   /* The T2T sub-window copy packet only has fields for one metadata configuration.
    * It can either compress or decompress, or copy uncompressed images, but it
    * can't copy from one compressed image to another.
    */
   if (src->meta_va && dst->meta_va)
      return true;

   const bool needs_3d_alignment = src->is_3d && (src->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
                                                  src->micro_tile_mode == RADEON_MICRO_MODE_STANDARD);
   const unsigned log2bpp = util_logbase2(src->bpp);
   const VkExtent3D *const alignment =
      needs_3d_alignment ? &radv_sdma_t2t_alignment_3d[log2bpp] : &radv_sdma_t2t_alignment_2d_and_planar[log2bpp];

   const VkExtent3D copy_extent_blk = radv_sdma_pixel_extent_to_blocks(extent, src->blk_w, src->blk_h);
   const VkOffset3D src_offset_blk = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_offset_blk = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);

   if (!util_is_aligned(copy_extent_blk.width, alignment->width) ||
       !util_is_aligned(copy_extent_blk.height, alignment->height) ||
       !util_is_aligned(copy_extent_blk.depth, alignment->depth))
      return true;

   if (!util_is_aligned(src_offset_blk.x, alignment->width) || !util_is_aligned(src_offset_blk.y, alignment->height) ||
       !util_is_aligned(src_offset_blk.z, alignment->depth))
      return true;

   if (!util_is_aligned(dst_offset_blk.x, alignment->width) || !util_is_aligned(dst_offset_blk.y, alignment->height) ||
       !util_is_aligned(dst_offset_blk.z, alignment->depth))
      return true;

   return false;
}

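/* Tiled-to-tiled copy performed as a series of scanline copies:
 * each chunk of rows is detiled from the source into a temporary linear
 * buffer, then tiled from there into the destination, with NOPs to fence
 * the dependent copies.
 */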
void
radv_sdma_copy_image_t2t_scanline(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                  const struct radv_sdma_surf *src, const struct radv_sdma_surf *dst,
                                  const VkExtent3D extent, struct radeon_winsys_bo *temp_bo)
{
   const struct radv_sdma_chunked_copy_info info = radv_sdma_get_chunked_copy_info(device, src, extent);
   struct radv_sdma_surf t2l_src = *src;
   struct radv_sdma_surf t2l_dst = {
      .va = temp_bo->va,
      .bpp = src->bpp,
      .blk_w = src->blk_w,
      .blk_h = src->blk_h,
      .pitch = info.aligned_row_pitch * src->blk_w,
   };
   struct radv_sdma_surf l2t_dst = *dst;
   struct radv_sdma_surf l2t_src = {
      .va = temp_bo->va,
      .bpp = dst->bpp,
      .blk_w = dst->blk_w,
      .blk_h = dst->blk_h,
      .pitch = info.aligned_row_pitch * dst->blk_w,
   };

   for (unsigned slice = 0; slice < extent.depth; ++slice) {
      for (unsigned row = 0; row < info.extent_vertical_blocks; row += info.num_rows_per_copy) {
         const unsigned rows = MIN2(info.extent_vertical_blocks - row, info.num_rows_per_copy);

         const VkExtent3D t2l_extent = {
            .width = info.extent_horizontal_blocks * src->blk_w,
            .height = rows * src->blk_h,
            .depth = 1,
         };

         t2l_src.offset.y = src->offset.y + row * src->blk_h;
         t2l_src.offset.z = src->offset.z + slice;
         t2l_dst.slice_pitch = t2l_dst.pitch * t2l_extent.height;

         radv_sdma_emit_copy_tiled_sub_window(device, cs, &t2l_src, &t2l_dst, t2l_extent, true);
         radv_sdma_emit_nop(device, cs);

         const VkExtent3D l2t_extent = {
            .width = info.extent_horizontal_blocks * dst->blk_w,
            .height = rows * dst->blk_h,
            .depth = 1,
         };

         l2t_dst.offset.y = dst->offset.y + row * dst->blk_h;
         l2t_dst.offset.z = dst->offset.z + slice;
         l2t_src.slice_pitch = l2t_src.pitch * l2t_extent.height;

         radv_sdma_emit_copy_tiled_sub_window(device, cs, &l2t_dst, &l2t_src, l2t_extent, false);
         radv_sdma_emit_nop(device, cs);
      }
   }
}