/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2015-2021 Advanced Micro Devices, Inc.
 * Copyright 2023 Valve Corporation
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "radv_sdma.h"
#include "util/macros.h"
#include "util/u_memory.h"
#include "radv_cs.h"
#include "radv_private.h"

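/* Parameters of a copy that is staged through a temporary buffer:
 * the copy extent in blocks, the temporary buffer's row pitch (aligned
 * as required by the linear sub-window packet), and how many rows fit
 * in the temporary buffer per iteration.
 */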
struct radv_sdma_chunked_copy_info {
   unsigned extent_horizontal_blocks;
   unsigned extent_vertical_blocks;
   unsigned aligned_row_pitch;
   unsigned num_rows_per_copy;
};

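/* Minimum alignments (in blocks) of the copy extent and offsets required
 * by the T2T sub-window copy packet, indexed by util_logbase2(bpp).
 */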
static const VkExtent3D radv_sdma_t2t_alignment_2d_and_planar[] = {
   {16, 16, 1}, /* 1 bpp */
   {16, 8, 1},  /* 2 bpp */
   {8, 8, 1},   /* 4 bpp */
   {8, 4, 1},   /* 8 bpp */
   {4, 4, 1},   /* 16 bpp */
};

static const VkExtent3D radv_sdma_t2t_alignment_3d[] = {
   {8, 4, 8}, /* 1 bpp */
   {4, 4, 8}, /* 2 bpp */
   {4, 4, 4}, /* 4 bpp */
   {4, 2, 4}, /* 8 bpp */
   {2, 2, 4}, /* 16 bpp */
};

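/* Required pitch alignment in blocks. On SDMA 5.0+ this is equivalent to
 * dword alignment in bytes (4 / bpp blocks); older versions always need
 * an alignment of 4 blocks.
 */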
ALWAYS_INLINE static unsigned
radv_sdma_pitch_alignment(const struct radv_device *device, const unsigned bpp)
{
   if (device->physical_device->rad_info.sdma_ip_version >= SDMA_5_0)
      return MAX2(1, 4 / bpp);

   return 4;
}

ALWAYS_INLINE static void
radv_sdma_check_pitches(const unsigned pitch, const unsigned slice_pitch, const unsigned bpp, const bool uses_depth)
{
   ASSERTED const unsigned pitch_alignment = MAX2(1, 4 / bpp);
   assert(pitch);
   assert(pitch <= (1 << 14));
   assert(radv_is_aligned(pitch, pitch_alignment));

   if (uses_depth) {
      ASSERTED const unsigned slice_pitch_alignment = 4;
      assert(slice_pitch);
      assert(slice_pitch <= (1 << 28));
      assert(radv_is_aligned(slice_pitch, slice_pitch_alignment));
   }
}

ALWAYS_INLINE static enum gfx9_resource_type
radv_sdma_surface_resource_type(const struct radv_device *const device, const struct radeon_surf *const surf)
{
   if (device->physical_device->rad_info.sdma_ip_version >= SDMA_5_0) {
      /* Use the 2D resource type for rotated or Z swizzles. */
      if ((surf->u.gfx9.resource_type == RADEON_RESOURCE_1D || surf->u.gfx9.resource_type == RADEON_RESOURCE_3D) &&
          (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER || surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH))
         return RADEON_RESOURCE_2D;
   }

   return surf->u.gfx9.resource_type;
}

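/* Surface type as used in the SDMA metadata config: 0 = color, 1 = depth, 2 = stencil. */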
ALWAYS_INLINE static uint32_t
radv_sdma_surface_type_from_aspect_mask(const VkImageAspectFlags aspectMask)
{
   if (aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
      return 1;
   else if (aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
      return 2;

   return 0;
}

ALWAYS_INLINE static VkExtent3D
radv_sdma_pixel_extent_to_blocks(const VkExtent3D extent, const unsigned blk_w, const unsigned blk_h)
{
   const VkExtent3D r = {
      .width = DIV_ROUND_UP(extent.width, blk_w),
      .height = DIV_ROUND_UP(extent.height, blk_h),
      .depth = extent.depth,
   };

   return r;
}

ALWAYS_INLINE static VkOffset3D
radv_sdma_pixel_offset_to_blocks(const VkOffset3D offset, const unsigned blk_w, const unsigned blk_h)
{
   const VkOffset3D r = {
      .x = DIV_ROUND_UP(offset.x, blk_w),
      .y = DIV_ROUND_UP(offset.y, blk_h),
      .z = offset.z,
   };

   return r;
}

ALWAYS_INLINE static unsigned
radv_sdma_pixels_to_blocks(const unsigned linear_pitch, const unsigned blk_w)
{
   return DIV_ROUND_UP(linear_pitch, blk_w);
}

ALWAYS_INLINE static unsigned
radv_sdma_pixel_area_to_blocks(const unsigned linear_slice_pitch, const unsigned blk_w, const unsigned blk_h)
{
   return DIV_ROUND_UP(DIV_ROUND_UP(linear_slice_pitch, blk_w), blk_h);
}

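/* Compute the parameters of a copy that is split into chunks staged
 * through a temporary buffer of RADV_SDMA_TRANSFER_TEMP_BYTES.
 */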
static struct radv_sdma_chunked_copy_info
radv_sdma_get_chunked_copy_info(const struct radv_device *const device, const struct radv_sdma_surf *const img,
                                const VkExtent3D extent)
{
   const unsigned extent_horizontal_blocks = DIV_ROUND_UP(extent.width, img->blk_w);
   const unsigned extent_vertical_blocks = DIV_ROUND_UP(extent.height, img->blk_h);
   const unsigned aligned_row_pitch = ALIGN(extent_horizontal_blocks, 4);
   const unsigned aligned_row_bytes = aligned_row_pitch * img->bpp;

   /* Assume that we can always copy at least one full row at a time. */
   const unsigned max_num_rows_per_copy = MIN2(RADV_SDMA_TRANSFER_TEMP_BYTES / aligned_row_bytes, extent.height);
   assert(max_num_rows_per_copy);

   /* Ensure that the number of rows copied at a time is a power of two. */
   const unsigned num_rows_per_copy = MAX2(1, util_next_power_of_two(max_num_rows_per_copy + 1) / 2);

   const struct radv_sdma_chunked_copy_info r = {
      .extent_horizontal_blocks = extent_horizontal_blocks,
      .extent_vertical_blocks = extent_vertical_blocks,
      .aligned_row_pitch = aligned_row_pitch,
      .num_rows_per_copy = num_rows_per_copy,
   };

   return r;
}

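/* Describe the buffer side of a buffer<->image copy. Per the Vulkan spec,
 * a bufferRowLength or bufferImageHeight of zero means the buffer is
 * tightly packed according to the imageExtent.
 */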
struct radv_sdma_surf
radv_sdma_get_buf_surf(const struct radv_buffer *const buffer, const struct radv_image *const image,
                       const VkBufferImageCopy2 *const region, const VkImageAspectFlags aspect_mask)
{
   assert(util_bitcount(aspect_mask) == 1);

   const unsigned pitch = (region->bufferRowLength ? region->bufferRowLength : region->imageExtent.width);
   const unsigned slice_pitch =
      (region->bufferImageHeight ? region->bufferImageHeight : region->imageExtent.height) * pitch;

   const unsigned plane_idx = radv_plane_from_aspect(region->imageSubresource.aspectMask);
   const struct radeon_surf *surf = &image->planes[plane_idx].surface;
   const struct radv_sdma_surf info = {
      .va = radv_buffer_get_va(buffer->bo) + buffer->offset + region->bufferOffset,
      .pitch = pitch,
      .slice_pitch = slice_pitch,
      .bpp = surf->bpe,
      .blk_w = surf->blk_w,
      .blk_h = surf->blk_h,
      .is_linear = true,
   };

   return info;
}

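/* Pack the metadata configuration DWORD used by compressed (DCC/HTILE)
 * copies. Returns 0 when the image has no metadata that SDMA can use.
 */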
static uint32_t
radv_sdma_get_metadata_config(const struct radv_device *const device, const struct radv_image *const image,
                              const struct radeon_surf *const surf, const VkImageSubresourceLayers subresource,
                              const VkImageAspectFlags aspect_mask)
{
   if (!device->physical_device->rad_info.sdma_supports_compression ||
       !(radv_dcc_enabled(image, subresource.mipLevel) || radv_image_has_htile(image))) {
      return 0;
   }

   const VkFormat format = vk_format_get_aspect_format(image->vk.format, aspect_mask);
   const struct util_format_description *desc = vk_format_description(format);

   const uint32_t data_format =
      ac_get_cb_format(device->physical_device->rad_info.gfx_level, vk_format_to_pipe_format(format));
   const uint32_t alpha_is_on_msb = vi_alpha_is_on_msb(device, format);
   const uint32_t number_type = radv_translate_buffer_numformat(desc, vk_format_get_first_non_void_channel(format));
   const uint32_t surface_type = radv_sdma_surface_type_from_aspect_mask(aspect_mask);
   const uint32_t max_comp_block_size = surf->u.gfx9.color.dcc.max_compressed_block_size;
   const uint32_t max_uncomp_block_size = radv_get_dcc_max_uncompressed_block_size(device, image);
   const uint32_t pipe_aligned = surf->u.gfx9.color.dcc.pipe_aligned;

   return data_format | alpha_is_on_msb << 8 | number_type << 9 | surface_type << 12 | max_comp_block_size << 24 |
          max_uncomp_block_size << 26 | pipe_aligned << 31;
}

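/* Pack the tiled surface info DWORD: element size, swizzle mode and
 * resource dimension, plus mip info (SDMA 5.0+) or epitch (SDMA 4.x).
 */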
static uint32_t
radv_sdma_get_tiled_info_dword(const struct radv_device *const device, const struct radv_image *const image,
                               const struct radeon_surf *const surf, const VkImageSubresourceLayers subresource)
{
   const uint32_t element_size = util_logbase2(surf->bpe);
   const uint32_t swizzle_mode = surf->has_stencil ? surf->u.gfx9.zs.stencil_swizzle_mode : surf->u.gfx9.swizzle_mode;
   const enum gfx9_resource_type dimension = radv_sdma_surface_resource_type(device, surf);
   const uint32_t info = element_size | swizzle_mode << 3 | dimension << 9;
   const enum sdma_version ver = device->physical_device->rad_info.sdma_ip_version;

   if (ver >= SDMA_5_0) {
      const uint32_t mip_max = MAX2(image->vk.mip_levels, 1);
      const uint32_t mip_id = subresource.mipLevel;

      return info | (mip_max - 1) << 16 | mip_id << 20;
   } else if (ver >= SDMA_4_0) {
      return info | surf->u.gfx9.epitch << 16;
   } else {
      unreachable("unsupported SDMA version");
   }
}

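/* On SDMA 4.x the mip level bits live in the packet header; on SDMA 5.0+
 * they moved into the info DWORD, so the header contribution is zero.
 */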
static uint32_t
radv_sdma_get_tiled_header_dword(const struct radv_device *const device, const struct radv_image *const image,
                                 const VkImageSubresourceLayers subresource)
{
   const enum sdma_version ver = device->physical_device->rad_info.sdma_ip_version;

   if (ver >= SDMA_5_0) {
      return 0;
   } else if (ver >= SDMA_4_0) {
      const uint32_t mip_max = MAX2(image->vk.mip_levels, 1);
      const uint32_t mip_id = subresource.mipLevel;
      return (mip_max - 1) << 20 | mip_id << 24;
   } else {
      unreachable("unsupported SDMA version");
   }
}

struct radv_sdma_surf
radv_sdma_get_surf(const struct radv_device *const device, const struct radv_image *const image,
                   const VkImageSubresourceLayers subresource, const VkOffset3D offset,
                   const VkImageAspectFlags aspect_mask)
{
   assert(util_bitcount(aspect_mask) == 1);

   const unsigned plane_idx = radv_plane_from_aspect(aspect_mask);
   const unsigned binding_idx = image->disjoint ? plane_idx : 0;
   const struct radv_image_binding *binding = &image->bindings[binding_idx];
   const struct radeon_surf *const surf = &image->planes[plane_idx].surface;
   struct radv_sdma_surf info = {
      .extent =
         {
            .width = vk_format_get_plane_width(image->vk.format, plane_idx, image->vk.extent.width),
            .height = vk_format_get_plane_height(image->vk.format, plane_idx, image->vk.extent.height),
            .depth = image->vk.image_type == VK_IMAGE_TYPE_3D ? image->vk.extent.depth : image->vk.array_layers,
         },
      .offset =
         {
            .x = offset.x,
            .y = offset.y,
            .z = image->vk.image_type == VK_IMAGE_TYPE_3D ? offset.z : subresource.baseArrayLayer,
         },
      .bpp = surf->bpe,
      .blk_w = surf->blk_w,
      .blk_h = surf->blk_h,
      .mip_levels = image->vk.mip_levels,
      .micro_tile_mode = surf->micro_tile_mode,
      .is_linear = surf->is_linear,
      .is_3d = surf->u.gfx9.resource_type == RADEON_RESOURCE_3D,
   };

   if (surf->is_linear) {
      info.va =
         binding->bo->va + binding->offset + surf->u.gfx9.surf_offset + surf->u.gfx9.offset[subresource.mipLevel];
      info.pitch = surf->u.gfx9.pitch[subresource.mipLevel];
      info.slice_pitch = surf->blk_w * surf->blk_h * surf->u.gfx9.surf_slice_size / surf->bpe;
   } else {
      /* 1D resources should be linear. */
      assert(surf->u.gfx9.resource_type != RADEON_RESOURCE_1D);

      info.va = (binding->bo->va + binding->offset + surf->u.gfx9.surf_offset) | surf->tile_swizzle << 8;
      info.info_dword = radv_sdma_get_tiled_info_dword(device, image, surf, subresource);
      info.header_dword = radv_sdma_get_tiled_header_dword(device, image, subresource);

      if (device->physical_device->rad_info.sdma_supports_compression &&
          (radv_dcc_enabled(image, subresource.mipLevel) || radv_image_has_htile(image))) {
         info.meta_va = binding->bo->va + binding->offset + surf->meta_offset;
         info.meta_config = radv_sdma_get_metadata_config(device, image, surf, subresource, aspect_mask);
      }
   }

   return info;
}

static void
radv_sdma_emit_nop(const struct radv_device *device, struct radeon_cmdbuf *cs)
{
   /* SDMA NOP acts as a fence command and causes the SDMA engine to wait for pending copy operations. */
   radeon_check_space(device->ws, cs, 1);
   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
}

void
radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va, uint64_t dst_va,
                      uint64_t size)
{
   if (size == 0)
      return;

   const enum sdma_version ver = device->physical_device->rad_info.sdma_ip_version;
   const unsigned max_size_per_packet = ver >= SDMA_5_2 ? SDMA_V5_2_COPY_MAX_BYTES : SDMA_V2_0_COPY_MAX_BYTES;

   unsigned align = ~0u;
   unsigned ncopy = DIV_ROUND_UP(size, max_size_per_packet);

   assert(ver >= SDMA_2_0);

   /* SDMA FW automatically enables a faster dword copy mode when
    * source, destination and size are all dword-aligned.
    *
    * When source and destination are dword-aligned, round down the size to
    * take advantage of faster dword copy, and copy the remaining few bytes
    * with the last copy packet.
    */
   if ((src_va & 0x3) == 0 && (dst_va & 0x3) == 0 && size > 4 && (size & 0x3) != 0) {
      align = ~0x3u;
      ncopy++;
   }
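
   /* For example, with aligned addresses and size = 4099, the first packet
    * copies 4096 bytes in the faster dword mode and the last packet copies
    * the remaining 3 bytes.
    */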

   radeon_check_space(device->ws, cs, ncopy * 7);

   for (unsigned i = 0; i < ncopy; i++) {
      unsigned csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
      radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0));
      radeon_emit(cs, ver >= SDMA_4_0 ? csize - 1 : csize);
      radeon_emit(cs, 0); /* src/dst endian swap */
      radeon_emit(cs, src_va);
      radeon_emit(cs, src_va >> 32);
      radeon_emit(cs, dst_va);
      radeon_emit(cs, dst_va >> 32);
      dst_va += csize;
      src_va += csize;
      size -= csize;
   }
}

void
radv_sdma_fill_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, const uint64_t va,
                      const uint64_t size, const uint32_t value)
{
   const uint32_t fill_size = 2; /* This means that the count is in dwords. */
   const uint32_t constant_fill_header = SDMA_PACKET(SDMA_OPCODE_CONSTANT_FILL, 0, 0) | (fill_size & 0x3) << 30;

   /* This packet is the same since SDMA v2.4, haven't bothered to check older versions. */
   const enum sdma_version ver = device->physical_device->rad_info.sdma_ip_version;
   assert(ver >= SDMA_2_4);

   /* Maximum allowed fill size depends on the GPU.
    * Emit as many packets as necessary to fill all the bytes we need.
    */
   const uint64_t max_fill_bytes = BITFIELD64_MASK(ver >= SDMA_6_0 ? 30 : 22) & ~0x3;
   const unsigned num_packets = DIV_ROUND_UP(size, max_fill_bytes);
   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, num_packets * 5);

   for (unsigned i = 0; i < num_packets; ++i) {
      const uint64_t offset = i * max_fill_bytes;
      const uint64_t fill_bytes = MIN2(size - offset, max_fill_bytes);
      const uint64_t fill_va = va + offset;

      radeon_emit(cs, constant_fill_header);
      radeon_emit(cs, fill_va);
      radeon_emit(cs, fill_va >> 32);
      radeon_emit(cs, value);
      radeon_emit(cs, fill_bytes - 1); /* Must be programmed in bytes, even if the fill is done in dwords. */
   }

   assert(cs->cdw <= cdw_max);
}

static void
radv_sdma_emit_copy_linear_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                      const struct radv_sdma_surf *const src, const struct radv_sdma_surf *const dst,
                                      const VkExtent3D pix_extent)
{
   /* This packet is the same since SDMA v2.4, haven't bothered to check older versions.
    * The main difference is the bitfield sizes:
    *
    * v2.4 - src/dst_pitch: 14 bits, rect_z: 11 bits
    * v4.0 - src/dst_pitch: 19 bits, rect_z: 11 bits
    * v5.0 - src/dst_pitch: 19 bits, rect_z: 13 bits
    *
    * We currently use the smallest limits (from SDMA v2.4).
    */

   const VkOffset3D src_off = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_off = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(pix_extent, src->blk_w, src->blk_h);
   const unsigned src_pitch = radv_sdma_pixels_to_blocks(src->pitch, src->blk_w);
   const unsigned dst_pitch = radv_sdma_pixels_to_blocks(dst->pitch, dst->blk_w);
   const unsigned src_slice_pitch = radv_sdma_pixel_area_to_blocks(src->slice_pitch, src->blk_w, src->blk_h);
   const unsigned dst_slice_pitch = radv_sdma_pixel_area_to_blocks(dst->slice_pitch, dst->blk_w, dst->blk_h);

   assert(src->bpp == dst->bpp);
   assert(util_is_power_of_two_nonzero(src->bpp));
   radv_sdma_check_pitches(src->pitch, src->slice_pitch, src->bpp, false);
   radv_sdma_check_pitches(dst->pitch, dst->slice_pitch, dst->bpp, false);

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 13);

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | util_logbase2(src->bpp)
                                                                                                 << 29);
   radeon_emit(cs, src->va);
   radeon_emit(cs, src->va >> 32);
   radeon_emit(cs, src_off.x | src_off.y << 16);
   radeon_emit(cs, src_off.z | (src_pitch - 1) << 13);
   radeon_emit(cs, src_slice_pitch - 1);
   radeon_emit(cs, dst->va);
   radeon_emit(cs, dst->va >> 32);
   radeon_emit(cs, dst_off.x | dst_off.y << 16);
   radeon_emit(cs, dst_off.z | (dst_pitch - 1) << 13);
   radeon_emit(cs, dst_slice_pitch - 1);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   assert(cs->cdw == cdw_end);
}

static void
radv_sdma_emit_copy_tiled_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                     const struct radv_sdma_surf *const tiled,
                                     const struct radv_sdma_surf *const linear, const VkExtent3D pix_extent,
                                     const bool detile)
{
   if (!device->physical_device->rad_info.sdma_supports_compression) {
      assert(!tiled->meta_va);
   }

   const VkOffset3D linear_off = radv_sdma_pixel_offset_to_blocks(linear->offset, linear->blk_w, linear->blk_h);
   const VkOffset3D tiled_off = radv_sdma_pixel_offset_to_blocks(tiled->offset, tiled->blk_w, tiled->blk_h);
   const VkExtent3D tiled_ext = radv_sdma_pixel_extent_to_blocks(tiled->extent, tiled->blk_w, tiled->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(pix_extent, tiled->blk_w, tiled->blk_h);
   const unsigned linear_pitch = radv_sdma_pixels_to_blocks(linear->pitch, tiled->blk_w);
   const unsigned linear_slice_pitch = radv_sdma_pixel_area_to_blocks(linear->slice_pitch, tiled->blk_w, tiled->blk_h);
   const bool dcc = !!tiled->meta_va;
   const bool uses_depth = linear_off.z != 0 || tiled_off.z != 0 || ext.depth != 1;

   assert(util_is_power_of_two_nonzero(tiled->bpp));
   radv_sdma_check_pitches(linear_pitch, linear_slice_pitch, tiled->bpp, uses_depth);

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 14 + (dcc ? 3 : 0));

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | dcc << 19 | detile << 31 |
                      tiled->header_dword);
   radeon_emit(cs, tiled->va);
   radeon_emit(cs, tiled->va >> 32);
   radeon_emit(cs, tiled_off.x | tiled_off.y << 16);
   radeon_emit(cs, tiled_off.z | (tiled_ext.width - 1) << 16);
   radeon_emit(cs, (tiled_ext.height - 1) | (tiled_ext.depth - 1) << 16);
   radeon_emit(cs, tiled->info_dword);
   radeon_emit(cs, linear->va);
   radeon_emit(cs, linear->va >> 32);
   radeon_emit(cs, linear_off.x | linear_off.y << 16);
   radeon_emit(cs, linear_off.z | (linear_pitch - 1) << 16);
   radeon_emit(cs, linear_slice_pitch - 1);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   if (tiled->meta_va) {
      const unsigned write_compress_enable = !detile;
      radeon_emit(cs, tiled->meta_va);
      radeon_emit(cs, tiled->meta_va >> 32);
      radeon_emit(cs, tiled->meta_config | write_compress_enable << 28);
   }

   assert(cs->cdw == cdw_end);
}

static void
radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                   const struct radv_sdma_surf *const src, const struct radv_sdma_surf *const dst,
                                   const VkExtent3D px_extent)
{
   /* We currently only support the SDMA v4+ versions of this packet. */
   assert(device->physical_device->rad_info.sdma_ip_version >= SDMA_4_0);

   /* On GFX10+ this supports DCC, but cannot copy a compressed surface to another compressed surface. */
   assert(!src->meta_va || !dst->meta_va);

   if (device->physical_device->rad_info.sdma_ip_version >= SDMA_4_0 &&
       device->physical_device->rad_info.sdma_ip_version < SDMA_5_0) {
      /* SDMA v4 doesn't support mip_id selection in the T2T copy packet. */
      assert(src->header_dword >> 24 == 0);
      assert(dst->header_dword >> 24 == 0);
      /* SDMA v4 doesn't support any image metadata. */
      assert(!src->meta_va);
      assert(!dst->meta_va);
   }

   /* Despite the name, this can indicate DCC or HTILE metadata. */
   const uint32_t dcc = src->meta_va || dst->meta_va;
   /* 0 = compress (src is uncompressed), 1 = decompress (src is compressed). */
   const uint32_t dcc_dir = src->meta_va && !dst->meta_va;

   const VkOffset3D src_off = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_off = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);
   const VkExtent3D src_ext = radv_sdma_pixel_extent_to_blocks(src->extent, src->blk_w, src->blk_h);
   const VkExtent3D dst_ext = radv_sdma_pixel_extent_to_blocks(dst->extent, dst->blk_w, dst->blk_h);
   const VkExtent3D ext = radv_sdma_pixel_extent_to_blocks(px_extent, src->blk_w, src->blk_h);

   assert(util_is_power_of_two_nonzero(src->bpp));
   assert(util_is_power_of_two_nonzero(dst->bpp));

   ASSERTED unsigned cdw_end = radeon_check_space(device->ws, cs, 15 + (dcc ? 3 : 0));

   radeon_emit(cs, SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0) | dcc << 19 | dcc_dir << 31 |
                      src->header_dword);
   radeon_emit(cs, src->va);
   radeon_emit(cs, src->va >> 32);
   radeon_emit(cs, src_off.x | src_off.y << 16);
   radeon_emit(cs, src_off.z | (src_ext.width - 1) << 16);
   radeon_emit(cs, (src_ext.height - 1) | (src_ext.depth - 1) << 16);
   radeon_emit(cs, src->info_dword);
   radeon_emit(cs, dst->va);
   radeon_emit(cs, dst->va >> 32);
   radeon_emit(cs, dst_off.x | dst_off.y << 16);
   radeon_emit(cs, dst_off.z | (dst_ext.width - 1) << 16);
   radeon_emit(cs, (dst_ext.height - 1) | (dst_ext.depth - 1) << 16);
   radeon_emit(cs, dst->info_dword);
   radeon_emit(cs, (ext.width - 1) | (ext.height - 1) << 16);
   radeon_emit(cs, (ext.depth - 1));

   if (dst->meta_va) {
      const uint32_t write_compress_enable = 1;
      radeon_emit(cs, dst->meta_va);
      radeon_emit(cs, dst->meta_va >> 32);
      radeon_emit(cs, dst->meta_config | write_compress_enable << 28);
   } else if (src->meta_va) {
      radeon_emit(cs, src->meta_va);
      radeon_emit(cs, src->meta_va >> 32);
      radeon_emit(cs, src->meta_config);
   }

   assert(cs->cdw == cdw_end);
}

void
radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdbuf *cs,
                            const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img, const VkExtent3D extent,
                            bool to_image)
{
   if (img->is_linear) {
      if (to_image)
         radv_sdma_emit_copy_linear_sub_window(device, cs, buf, img, extent);
      else
         radv_sdma_emit_copy_linear_sub_window(device, cs, img, buf, extent);
   } else {
      radv_sdma_emit_copy_tiled_sub_window(device, cs, img, buf, extent, !to_image);
   }
}

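/* Returns true when the buffer pitches don't satisfy the alignment required
 * by the sub-window copy packets, in which case the copy must take the
 * unaligned (chunked) path below.
 */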
bool
radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_sdma_surf *buf,
                                          const struct radv_sdma_surf *img, const VkExtent3D ext)
{
   const unsigned pitch_blocks = radv_sdma_pixels_to_blocks(buf->pitch, img->blk_w);
   if (!radv_is_aligned(pitch_blocks, radv_sdma_pitch_alignment(device, img->bpp)))
      return true;

   const bool uses_depth = img->offset.z != 0 || ext.depth != 1;
   if (!img->is_linear && uses_depth) {
      const unsigned slice_pitch_blocks = radv_sdma_pixel_area_to_blocks(buf->slice_pitch, img->blk_w, img->blk_h);
      if (!radv_is_aligned(slice_pitch_blocks, 4))
         return true;
   }

   return false;
}

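/* Copy between a buffer with unaligned pitches and an image by staging rows
 * through a temporary buffer: the image side uses sub-window packets with an
 * aligned pitch, the buffer side is copied row by row with linear copies,
 * and a NOP fences the engine between dependent copies.
 */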
void
radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                      const struct radv_sdma_surf *buf, const struct radv_sdma_surf *img_in,
                                      const VkExtent3D base_extent, struct radeon_winsys_bo *temp_bo, bool to_image)
{
   const struct radv_sdma_chunked_copy_info info = radv_sdma_get_chunked_copy_info(device, img_in, base_extent);
   struct radv_sdma_surf img = *img_in;
   struct radv_sdma_surf tmp = {
      .va = temp_bo->va,
      .bpp = img.bpp,
      .blk_w = img.blk_w,
      .blk_h = img.blk_h,
      .pitch = info.aligned_row_pitch * img.blk_w,
      .slice_pitch = info.aligned_row_pitch * img.blk_w * info.extent_vertical_blocks * img.blk_h,
   };

   VkExtent3D extent = base_extent;
   const unsigned buf_pitch_blocks = DIV_ROUND_UP(buf->pitch, img.blk_w);
   const unsigned buf_slice_pitch_blocks = DIV_ROUND_UP(DIV_ROUND_UP(buf->slice_pitch, img.blk_w), img.blk_h);
   assert(buf_pitch_blocks);
   assert(buf_slice_pitch_blocks);
   extent.depth = 1;

   for (unsigned slice = 0; slice < base_extent.depth; ++slice) {
      for (unsigned row = 0; row < info.extent_vertical_blocks; row += info.num_rows_per_copy) {
         const unsigned rows = MIN2(info.extent_vertical_blocks - row, info.num_rows_per_copy);

         img.offset.y = img_in->offset.y + row * img.blk_h;
         img.offset.z = img_in->offset.z + slice;
         extent.height = rows * img.blk_h;
         tmp.slice_pitch = tmp.pitch * rows * img.blk_h;

         if (!to_image) {
            /* Copy the rows from the source image to the temporary buffer. */
            if (img.is_linear)
               radv_sdma_emit_copy_linear_sub_window(device, cs, &img, &tmp, extent);
            else
               radv_sdma_emit_copy_tiled_sub_window(device, cs, &img, &tmp, extent, true);

            /* Wait for the copy to finish. */
            radv_sdma_emit_nop(device, cs);
         }

         /* buffer to image: copy each row from source buffer to temporary buffer.
          * image to buffer: copy each row from temporary buffer to destination buffer.
          */
         for (unsigned r = 0; r < rows; ++r) {
            const uint64_t buf_va =
               buf->va + slice * buf_slice_pitch_blocks * img.bpp + (row + r) * buf_pitch_blocks * img.bpp;
            const uint64_t tmp_va = tmp.va + r * info.aligned_row_pitch * img.bpp;
            radv_sdma_copy_buffer(device, cs, to_image ? buf_va : tmp_va, to_image ? tmp_va : buf_va,
                                  info.extent_horizontal_blocks * img.bpp);
         }

         /* Wait for the copy to finish. */
         radv_sdma_emit_nop(device, cs);

         if (to_image) {
            /* Copy the rows from the temporary buffer to the destination image. */
            if (img.is_linear)
               radv_sdma_emit_copy_linear_sub_window(device, cs, &tmp, &img, extent);
            else
               radv_sdma_emit_copy_tiled_sub_window(device, cs, &img, &tmp, extent, false);

            /* Wait for the copy to finish. */
            radv_sdma_emit_nop(device, cs);
         }
      }
   }
}

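/* Image-to-image copy: pick the sub-window packet based on whether each
 * surface is linear or tiled.
 */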
void
radv_sdma_copy_image(const struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_sdma_surf *src,
                     const struct radv_sdma_surf *dst, const VkExtent3D extent)
{
   if (src->is_linear) {
      if (dst->is_linear) {
         radv_sdma_emit_copy_linear_sub_window(device, cs, src, dst, extent);
      } else {
         radv_sdma_emit_copy_tiled_sub_window(device, cs, dst, src, extent, false);
      }
   } else {
      if (dst->is_linear) {
         radv_sdma_emit_copy_tiled_sub_window(device, cs, src, dst, extent, true);
      } else {
         radv_sdma_emit_copy_t2t_sub_window(device, cs, src, dst, extent);
      }
   }
}

bool
radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct radv_sdma_surf *src,
                                const struct radv_sdma_surf *dst, const VkExtent3D extent)
{
   /* These need a linear-to-linear / linear-to-tiled copy. */
   if (src->is_linear || dst->is_linear)
      return false;

   /* SDMA can't do format conversion. */
   assert(src->bpp == dst->bpp);

   const enum sdma_version ver = device->physical_device->rad_info.sdma_ip_version;
   if (ver < SDMA_5_0) {
      /* SDMA v4.x and older don't support proper mip level selection. */
      if (src->mip_levels > 1 || dst->mip_levels > 1)
         return true;
   }

   /* The two images can have a different block size,
    * but must have the same swizzle mode.
    */
   if (src->micro_tile_mode != dst->micro_tile_mode)
      return true;

   /* The T2T subwindow copy packet only has fields for one metadata configuration.
    * It can either compress or decompress, or copy uncompressed images, but it
    * can't copy from a compressed image to another.
    */
   if (src->meta_va && dst->meta_va)
      return true;

   const bool needs_3d_alignment = src->is_3d && (src->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
                                                  src->micro_tile_mode == RADEON_MICRO_MODE_STANDARD);
   const unsigned log2bpp = util_logbase2(src->bpp);
   const VkExtent3D *const alignment =
      needs_3d_alignment ? &radv_sdma_t2t_alignment_3d[log2bpp] : &radv_sdma_t2t_alignment_2d_and_planar[log2bpp];

   const VkExtent3D copy_extent_blk = radv_sdma_pixel_extent_to_blocks(extent, src->blk_w, src->blk_h);
   const VkOffset3D src_offset_blk = radv_sdma_pixel_offset_to_blocks(src->offset, src->blk_w, src->blk_h);
   const VkOffset3D dst_offset_blk = radv_sdma_pixel_offset_to_blocks(dst->offset, dst->blk_w, dst->blk_h);

   if (!radv_is_aligned(copy_extent_blk.width, alignment->width) ||
       !radv_is_aligned(copy_extent_blk.height, alignment->height) ||
       !radv_is_aligned(copy_extent_blk.depth, alignment->depth))
      return true;

   if (!radv_is_aligned(src_offset_blk.x, alignment->width) || !radv_is_aligned(src_offset_blk.y, alignment->height) ||
       !radv_is_aligned(src_offset_blk.z, alignment->depth))
      return true;

   if (!radv_is_aligned(dst_offset_blk.x, alignment->width) || !radv_is_aligned(dst_offset_blk.y, alignment->height) ||
       !radv_is_aligned(dst_offset_blk.z, alignment->depth))
      return true;

   return false;
}

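/* Fallback for tiled-to-tiled copies that the T2T packet can't handle
 * (see radv_sdma_use_t2t_scanline_copy): detile a chunk of rows from the
 * source into a temporary linear buffer, then re-tile it into the
 * destination, with a NOP fence between the two dependent copies.
 */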
void
radv_sdma_copy_image_t2t_scanline(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                  const struct radv_sdma_surf *src, const struct radv_sdma_surf *dst,
                                  const VkExtent3D extent, struct radeon_winsys_bo *temp_bo)
{
   const struct radv_sdma_chunked_copy_info info = radv_sdma_get_chunked_copy_info(device, src, extent);
   struct radv_sdma_surf t2l_src = *src;
   struct radv_sdma_surf t2l_dst = {
      .va = temp_bo->va,
      .bpp = src->bpp,
      .blk_w = src->blk_w,
      .blk_h = src->blk_h,
      .pitch = info.aligned_row_pitch * src->blk_w,
   };
   struct radv_sdma_surf l2t_dst = *dst;
   struct radv_sdma_surf l2t_src = {
      .va = temp_bo->va,
      .bpp = dst->bpp,
      .blk_w = dst->blk_w,
      .blk_h = dst->blk_h,
      .pitch = info.aligned_row_pitch * dst->blk_w,
   };

   for (unsigned slice = 0; slice < extent.depth; ++slice) {
      for (unsigned row = 0; row < info.extent_vertical_blocks; row += info.num_rows_per_copy) {
         const unsigned rows = MIN2(info.extent_vertical_blocks - row, info.num_rows_per_copy);

         const VkExtent3D t2l_extent = {
            .width = info.extent_horizontal_blocks * src->blk_w,
            .height = rows * src->blk_h,
            .depth = 1,
         };

         t2l_src.offset.y = src->offset.y + row * src->blk_h;
         t2l_src.offset.z = src->offset.z + slice;
         t2l_dst.slice_pitch = t2l_dst.pitch * t2l_extent.height;

         radv_sdma_emit_copy_tiled_sub_window(device, cs, &t2l_src, &t2l_dst, t2l_extent, true);
         radv_sdma_emit_nop(device, cs);

         const VkExtent3D l2t_extent = {
            .width = info.extent_horizontal_blocks * dst->blk_w,
            .height = rows * dst->blk_h,
            .depth = 1,
         };

         l2t_dst.offset.y = dst->offset.y + row * dst->blk_h;
         l2t_dst.offset.z = dst->offset.z + slice;
         l2t_src.slice_pitch = l2t_src.pitch * l2t_extent.height;

         radv_sdma_emit_copy_tiled_sub_window(device, cs, &l2t_dst, &l2t_src, l2t_extent, false);
         radv_sdma_emit_nop(device, cs);
      }
   }
}