/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "compiler/nir/nir_builder.h"
#include "util/u_pack_color.h"
#include "vk_common_entrypoints.h"

static uint32_t
meta_blit_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
}

static bool
meta_blit_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
}

static bool
texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
                         VkImageAspectFlags aspect,
                         struct v3dv_image *image,
                         VkFormat dst_format,
                         VkFormat src_format,
                         struct v3dv_buffer *buffer,
                         uint32_t buffer_bpp,
                         VkColorComponentFlags cmask,
                         VkComponentMapping *cswizzle,
                         uint32_t region_count,
                         const VkBufferImageCopy2 *regions);

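/* Creates the descriptor set layout and pipeline layout shared by all the
 * blit pipelines. The single 20-byte push constant range (five 32-bit words)
 * is consumed by the blit vertex shader; presumably it carries the per-blit
 * coordinates (an assumption based only on the range size here, the actual
 * contents are defined where the blit rects are emitted).
 */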
static bool
create_blit_pipeline_layout(struct v3dv_device *device,
                            VkDescriptorSetLayout *descriptor_set_layout,
                            VkPipelineLayout *pipeline_layout)
{
   VkResult result;

   if (*descriptor_set_layout == 0) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*pipeline_layout == 0);
   VkPipelineLayoutCreateInfo pipeline_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = descriptor_set_layout,
      .pushConstantRangeCount = 1,
      .pPushConstantRanges =
         &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &pipeline_layout_info,
                                &device->vk.alloc,
                                pipeline_layout);
   return result == VK_SUCCESS;
}

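/* Initializes the meta blit state: one pipeline cache per image
 * dimensionality (presumably 1D, 2D and 3D, hence the three hash tables),
 * each keyed by a fixed-size binary key, plus the shared descriptor set and
 * pipeline layouts.
 */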
void
v3dv_meta_blit_init(struct v3dv_device *device)
{
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.blit.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_blit_key_hash,
                                 meta_blit_key_compare);
   }

   create_blit_pipeline_layout(device,
                               &device->meta.blit.ds_layout,
                               &device->meta.blit.p_layout);
}

static void
destroy_meta_blit_pipeline(VkDevice vk_device,
                           uint64_t obj,
                           VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_blit_pipeline *p =
      (struct v3dv_meta_blit_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_blit_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.blit.cache[i], entry) {
         destroy_meta_blit_pipeline(_device, (uintptr_t)entry->data,
                                    &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
   }

   if (device->meta.blit.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.blit.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
                                      &device->vk.alloc);
   }
}

static uint32_t
meta_texel_buffer_copy_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
}

static bool
meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
}

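/* Creates the layouts for the texel buffer copy pipelines. For reference,
 * the two push constant ranges declared below map onto a layout like the
 * following (an illustrative sketch derived from the PC_OFFSET defines
 * below; field names and types are assumptions, the driver does not declare
 * such a struct):
 *
 *    struct texel_buffer_copy_push_constants {
 *       uint32_t box[4];   // bytes  0-15: copy box (fragment stage)
 *       uint32_t stride;   // bytes 16-19: buffer row stride (fragment stage)
 *       uint32_t offset;   // bytes 20-23: buffer offset (fragment stage)
 *       uint32_t layer;    // bytes 24-27: layer index (geometry stage)
 *    };
 */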
static bool
create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
                                         VkDescriptorSetLayout *ds_layout,
                                         VkPipelineLayout *p_layout)
{
   VkResult result;

   if (*ds_layout == 0) {
      VkDescriptorSetLayoutBinding ds_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &ds_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &ds_layout_info,
                                        &device->vk.alloc,
                                        ds_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*p_layout == 0);
   /* FIXME: this abuses the API a bit, since not all of our copy pipelines
    * have a geometry shader. We could create 2 different pipeline layouts,
    * but this works for us for now.
    */
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
   };

   VkPipelineLayoutCreateInfo p_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = ds_layout,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges,
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &p_layout_info,
                                &device->vk.alloc,
                                p_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
{
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.texel_buffer_copy.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_texel_buffer_copy_key_hash,
                                 meta_texel_buffer_copy_key_compare);
   }

   create_texel_buffer_copy_pipeline_layout(
      device,
      &device->meta.texel_buffer_copy.ds_layout,
      &device->meta.texel_buffer_copy.p_layout);
}

static void
destroy_meta_texel_buffer_copy_pipeline(VkDevice vk_device,
                                        uint64_t obj,
                                        VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_texel_buffer_copy_pipeline *p =
      (struct v3dv_meta_texel_buffer_copy_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
         destroy_meta_texel_buffer_copy_pipeline(_device, (uintptr_t)entry->data,
                                                 &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
   }

   if (device->meta.texel_buffer_copy.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.texel_buffer_copy.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
                                      &device->vk.alloc);
   }
}

static VkFormat
get_compatible_tlb_format(VkFormat format)
{
   switch (format) {
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UINT;

   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UINT;

   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UINT;

   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UINT_PACK32;

   case VK_FORMAT_R16_UNORM:
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UINT;

   case VK_FORMAT_R16G16_UNORM:
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UINT;

   case VK_FORMAT_R16G16B16A16_UNORM:
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UINT;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_SFLOAT;

   /* We can't render to compressed formats using the TLB so instead we use
    * a compatible format with the same bpp as the compressed format. Because
    * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
    * case of ETC), when we implement copies with the compatible format we
    * will have to divide offsets and dimensions on the compressed image by
    * the compressed block size.
    */
   case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
   case VK_FORMAT_BC2_UNORM_BLOCK:
   case VK_FORMAT_BC2_SRGB_BLOCK:
   case VK_FORMAT_BC3_SRGB_BLOCK:
   case VK_FORMAT_BC3_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
      return VK_FORMAT_R32G32B32A32_UINT;

   case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11_SNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
   case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
      return VK_FORMAT_R16G16B16A16_UINT;

   default:
      return VK_FORMAT_UNDEFINED;
   }
}
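
/* Example of the block-unit handling above: ETC2 RGBA8, BC2/BC3 and all ASTC
 * formats encode each block in 128 bits, so copying one block as a single
 * R32G32B32A32_UINT texel moves the raw data unchanged, while 64-bit-block
 * formats (ETC2 RGB8, EAC R11, BC1) map to R16G16B16A16_UINT in the same
 * way.
 */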

/**
 * Checks if we can implement an image copy or clear operation using the TLB
 * hardware.
 *
 * The extent and miplevel are only used to validate tile stores (to match the
 * region to store against the miplevel dimensions, to avoid cases where the
 * region to store is not aligned to tile boundaries). If extent is NULL no
 * checks are done (which is fine if the image will only be used for a TLB
 * load or when we know in advance that the store will cover the entire size
 * of the image miplevel).
 *
 * For TLB copies we do a per-plane copy, so for multi-plane formats the
 * compatible format will be single-plane.
 */
bool
v3dv_meta_can_use_tlb(struct v3dv_image *image,
                      uint8_t plane,
                      uint8_t miplevel,
                      const VkOffset3D *offset,
                      const VkExtent3D *extent,
                      VkFormat *compat_format)
{
   if (offset->x != 0 || offset->y != 0)
      return false;

   /* FIXME: this is suboptimal, what we really want to check is that the
    * extent of the region to copy is the full slice or a multiple of the
    * tile size.
    */
   if (extent) {
      struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel];
      if (slice->width != extent->width || slice->height != extent->height)
         return false;
   }

   if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
      if (compat_format)
         *compat_format = image->planes[plane].vk_format;
      return true;
   }

   /* If the image format is not TLB-supported, then check if we can use
    * a compatible format instead.
    */
   if (compat_format) {
      *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format);
      if (*compat_format != VK_FORMAT_UNDEFINED) {
         assert(vk_format_get_plane_count(*compat_format) == 1);
         return true;
      }
   }

   return false;
}

/* Implements a copy using the TLB.
 *
 * This only works if we are copying from offset (0,0), since a TLB store for
 * tile (x,y) will be written at the same tile offset into the destination.
 * When this requirement is not met, we need to use a blit instead.
 *
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         struct v3dv_image *image,
                         const VkBufferImageCopy2 *region)
{
   VkFormat fb_format;
   uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
   assert(plane < image->plane_count);

   if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
                              &region->imageOffset, &region->imageExtent,
                              &fb_format)) {
      return false;
   }

   uint32_t internal_type, internal_bpp;
   v3d_X((&cmd_buffer->device->devinfo), get_internal_type_bpp_for_image_aspects)
      (fb_format, region->imageSubresource.aspectMask,
       &internal_type, &internal_bpp);

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   /* Handle copy from compressed format using a compatible format */
   const uint32_t block_w =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3d_X((&job->device->devinfo), meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3d_X((&job->device->devinfo), job_emit_binning_flush)(job);
   v3d_X((&job->device->devinfo), meta_emit_copy_image_to_buffer_rcl)
      (job, buffer, image, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit2 *region,
            VkFilter filter,
            bool dst_is_padded_image);

/**
 * A structure that contains all the information we may need in various
 * processes involving image to buffer copies implemented with blit paths.
 */
struct image_to_buffer_info {
   /* Source image info */
   VkFormat src_format;
   uint8_t plane;
   VkColorComponentFlags cmask;
   VkComponentMapping cswizzle;
   VkImageAspectFlags src_copy_aspect;
   uint32_t block_width;
   uint32_t block_height;

   /* Destination buffer info */
   VkFormat dst_format;
   uint32_t buf_width;
   uint32_t buf_height;
   uint32_t buf_bpp;
   VkImageAspectFlags dst_copy_aspect;
};

static VkImageBlit2
blit_region_for_image_to_buffer(const VkOffset3D *offset,
                                const VkExtent3D *extent,
                                uint32_t mip_level,
                                uint32_t base_layer,
                                uint32_t layer_offset,
                                struct image_to_buffer_info *info)
{
   VkImageBlit2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = mip_level,
         .baseArrayLayer = base_layer + layer_offset,
         .layerCount = 1,
      },
      .srcOffsets = {
         {
            DIV_ROUND_UP(offset->x, info->block_width),
            DIV_ROUND_UP(offset->y, info->block_height),
            offset->z + layer_offset,
         },
         {
            DIV_ROUND_UP(offset->x + extent->width, info->block_width),
            DIV_ROUND_UP(offset->y + extent->height, info->block_height),
            offset->z + layer_offset + 1,
         },
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffsets = {
         { 0, 0, 0 },
         {
            DIV_ROUND_UP(extent->width, info->block_width),
            DIV_ROUND_UP(extent->height, info->block_height),
            1
         },
      },
   };

   return output;
}
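
/* Note that the offsets and extents above are scaled to block units: for
 * uncompressed formats block_width/block_height are 1 and the divisions are
 * no-ops, while for compressed formats the blit source and destination are
 * uncompressed aliases of the data, so coordinates are in blocks, not
 * pixels.
 */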

/**
 * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we
 * can use to implement image to buffer copies with blit paths.
 *
 * Returns false if the copy operation can't be implemented with a blit.
 */
static bool
gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer,
                            struct v3dv_image *image,
                            const VkBufferImageCopy2 *region,
                            struct image_to_buffer_info *out_info)
{
   bool supported = false;

   VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask;
   /* For multi-planar images we copy one plane at a time using an image alias
    * with a color aspect for each plane.
    */
   if (image->plane_count > 1)
      dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask;
   uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect);
   assert(plane < image->plane_count);

   /* Generally, the bpp of the data in the buffer matches that of the
    * source image. The exception is the case where we are copying
    * stencil (8bpp) to a combined d24s8 image (32bpp).
    */
   uint32_t buffer_bpp = image->planes[plane].cpp;

   /* Because we are going to implement the copy as a blit, we need to create
    * a linear image from the destination buffer and we also want our blit
    * source and destination formats to be the same (to avoid any format
    * conversions), so we choose a canonical format that matches the
    * source image bpp.
    *
    * The exception to the above is copying from combined depth/stencil images
    * because we are copying only one aspect of the image, so we need to setup
    * our formats, color write mask and source swizzle mask to match that.
    */
   VkFormat dst_format;
   VkFormat src_format;
   VkColorComponentFlags cmask = 0; /* All components */
   VkComponentMapping cswizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   switch (buffer_bpp) {
   case 16:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R32G32B32A32_UINT;
      src_format = dst_format;
      break;
   case 8:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R16G16B16A16_UINT;
      src_format = dst_format;
      break;
   case 4:
      switch (dst_copy_aspect) {
      case VK_IMAGE_ASPECT_COLOR_BIT:
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      case VK_IMAGE_ASPECT_DEPTH_BIT:
         assert(image->plane_count == 1);
         assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
                image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
                image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
         if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
            src_format = VK_FORMAT_R32_UINT;
            dst_format = VK_FORMAT_R32_UINT;
         } else {
            /* We want to write depth in the buffer in the first 24 bits,
             * however, the hardware has depth in bits 8-31, so swizzle the
             * source components to match what we want. Also, we don't
             * want to write bits 24-31 in the destination.
             */
            src_format = VK_FORMAT_R8G8B8A8_UINT;
            dst_format = VK_FORMAT_R8G8B8A8_UINT;
            cmask = VK_COLOR_COMPONENT_R_BIT |
                    VK_COLOR_COMPONENT_G_BIT |
                    VK_COLOR_COMPONENT_B_BIT;
            cswizzle.r = VK_COMPONENT_SWIZZLE_G;
            cswizzle.g = VK_COMPONENT_SWIZZLE_B;
            cswizzle.b = VK_COMPONENT_SWIZZLE_A;
            cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
         }
         break;
      case VK_IMAGE_ASPECT_STENCIL_BIT:
         assert(image->plane_count == 1);
         assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
         assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
         /* Copying from S8D24. We want to write 8-bit stencil values only,
          * so adjust the buffer bpp for that. Since the hardware stores stencil
          * in the LSB, we can just do a RGBA8UI to R8UI blit.
          */
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8_UINT;
         buffer_bpp = 1;
         break;
      default:
         unreachable("unsupported aspect");
         return supported;
      };
      break;
   case 2:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
             dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
      dst_format = VK_FORMAT_R16_UINT;
      src_format = dst_format;
      break;
   case 1:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R8_UINT;
      src_format = dst_format;
      break;
   default:
      unreachable("unsupported bit-size");
      return supported;
   };

   /* The hardware doesn't support linear depth/stencil stores, so we
    * implement copies of depth/stencil aspect as color copies using a
    * compatible color format.
    */
   assert(vk_format_is_color(src_format));
   assert(vk_format_is_color(dst_format));
   dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   /* We should be able to handle the blit if we got this far */
   supported = true;

   /* Obtain the 2D buffer region spec */
   uint32_t buf_width, buf_height;
   if (region->bufferRowLength == 0)
      buf_width = region->imageExtent.width;
   else
      buf_width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      buf_height = region->imageExtent.height;
   else
      buf_height = region->bufferImageHeight;

   /* If the image is compressed, the bpp refers to blocks, not pixels */
   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   buf_width = DIV_ROUND_UP(buf_width, block_width);
   buf_height = DIV_ROUND_UP(buf_height, block_height);

   out_info->src_format = src_format;
   out_info->dst_format = dst_format;
   out_info->src_copy_aspect = src_copy_aspect;
   out_info->dst_copy_aspect = dst_copy_aspect;
   out_info->buf_width = buf_width;
   out_info->buf_height = buf_height;
   out_info->buf_bpp = buffer_bpp;
   out_info->block_width = block_width;
   out_info->block_height = block_height;
   out_info->cmask = cmask;
   out_info->cswizzle = cswizzle;
   out_info->plane = plane;

   return supported;
}
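
/* Summary of the canonical format selection above, keyed by buffer bpp:
 *
 *    16 -> R32G32B32A32_UINT
 *     8 -> R16G16B16A16_UINT
 *     4 -> R8G8B8A8_UINT (R32_UINT for D32; RGBA8UI -> R8UI with buffer_bpp
 *          forced to 1 when copying the stencil aspect of S8D24)
 *     2 -> R16_UINT
 *     1 -> R8_UINT
 */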

/* Creates a linear image that aliases the buffer memory, and registers it as
 * a private object in the cmd_buffer for later disposal.
 *
 * This is used for cases where we want to implement an image to buffer copy,
 * but we need to rely on a mechanism that uses an image as destination, like
 * blitting.
 */
static VkResult
create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy2 *region,
                         struct image_to_buffer_info *info,
                         uint32_t layer,
                         VkImage *out_image)
{
   VkImageCreateInfo image_info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = info->dst_format,
      .extent = { info->buf_width, info->buf_height, 1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };

   VkResult result;
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);

   VkImage buffer_image;
   result =
      v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
   if (result != VK_SUCCESS)
      return result;

   *out_image = buffer_image;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)buffer_image,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   /* Bind the buffer memory to the image */
   VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
      layer * info->buf_width * info->buf_height * info->buf_bpp;

   result =
      vk_common_BindImageMemory(_device, buffer_image,
                                v3dv_device_memory_to_handle(buffer->mem),
                                buffer_offset);
   return result;
}
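
/* Note that each layer aliases a tightly packed 2D slice of the buffer: the
 * per-layer offset above advances by buf_width * buf_height * buf_bpp bytes,
 * i.e. the buffer is addressed as an array of 2D images whose row pitch
 * comes from bufferRowLength/bufferImageHeight (already folded into
 * buf_width/buf_height).
 */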

/**
 * Creates an image with a single mip level that aliases the memory of a
 * mip level in another image, re-interpreting the memory with an uncompressed
 * format. The image is added to the command buffer as a private object for
 * disposal.
 */
static bool
create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_image *image,
                             VkFormat format,
                             uint32_t plane,
                             uint32_t mip_level,
                             uint32_t layer,
                             VkImage *alias)
{
   VkResult result;
   assert(!vk_format_is_compressed(format));

   struct v3dv_device *device = cmd_buffer->device;
   VkDevice vk_device = v3dv_device_to_handle(device);
   uint32_t mip_width = image->planes[plane].slices[mip_level].width;
   uint32_t mip_height = image->planes[plane].slices[mip_level].height;

   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);

   VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = image->vk.image_type,
      .format = format,
      .extent = { DIV_ROUND_UP(mip_width, block_width),
                  DIV_ROUND_UP(mip_height, block_height),
                  1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = image->vk.samples,
      .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias);
   if (result != VK_SUCCESS)
      return false;

   /* The alias we have just created has just one mip, but we may be aliasing
    * any mip in the original image. Because the slice setup changes based on
    * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally)
    * and this can influence the tiling layout selected for the slice, we want
    * to make sure we copy the slice description from the actual mip level in
    * the original image, and then rewrite any fields that we need for the
    * alias. Particularly, we want to make the offset 0 because we are going to
    * bind the underlying image memory exactly at the start of the selected mip.
    * We also want to relax the image alignment requirements to the minimum
    * (the one imposed by the Texture Base Address field) since we may not be
    * aliasing a level 0 (for which we typically want a page alignment for
    * optimal performance).
    */
   V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias);
   v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level];
   v3dv_alias->planes[plane].slices[0].width = info.extent.width;
   v3dv_alias->planes[plane].slices[0].height = info.extent.height;
   v3dv_alias->planes[plane].slices[0].offset = 0;
   v3dv_alias->planes[plane].alignment = 64;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)*alias,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   result =
      vk_common_BindImageMemory(vk_device, *alias,
                                v3dv_device_memory_to_handle(image->planes[plane].mem),
                                v3dv_layer_offset(image, mip_level, layer, plane));
   return result == VK_SUCCESS;
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          const VkBufferImageCopy2 *region)
{
   bool handled = false;
   struct image_to_buffer_info info;

   /* This path uses a shader blit, which doesn't support linear images other
    * than 1D. Return early to avoid all the heavy lifting in preparation for
    * the blit_shader() call that is bound to fail in that scenario.
    */
   if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) {
      return handled;
   }

   handled = gather_image_to_buffer_info(cmd_buffer, image, region,
                                         &info);

   if (!handled)
      return handled;

   /* We should be able to handle the blit if we got this far */
   handled = true;

   /* Compute layers to copy */
   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   /* Copy requested layers */
   VkResult result;
   VkImageBlit2 blit_region;
   uint32_t mip_level = region->imageSubresource.mipLevel;
   uint32_t base_layer = region->imageSubresource.baseArrayLayer;
   for (uint32_t i = 0; i < num_layers; i++) {
      uint32_t layer_offset = i;

      if (vk_format_is_compressed(image->vk.format)) {
         /* Our blit interface can see the real format of the images to detect
          * copies between compressed and uncompressed images and adapt the
          * blit region accordingly. Here we are just doing a raw copy of
          * compressed data, but we are passing an uncompressed view of the
          * buffer for the blit destination image (since compressed formats are
          * not renderable), so we also want to provide an uncompressed view of
          * the source image.
          *
          * It is important that we create the alias over the selected mip
          * level (instead of aliasing the entire image) because an uncompressed
          * view of the image won't have the same number of mip levels as the
          * original image and the implicit mip size calculations the hw will
          * do to sample from a non-zero mip level may not match exactly between
          * compressed and uncompressed views.
          */
         VkImage alias;
         if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format,
                                           info.plane, mip_level,
                                           base_layer + layer_offset,
                                           &alias)) {
            return handled;
         }

         /* We are aliasing the selected mip level and layer with a
          * single-mip and single-layer image.
          */
         image = v3dv_image_from_handle(alias);
         mip_level = 0;
         base_layer = 0;
         layer_offset = 0;
      }

      /* Create the destination blit image from the destination buffer */
      VkImage buffer_image;
      result =
         create_image_from_buffer(cmd_buffer, buffer, region, &info,
                                  i, &buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      /* Blit-copy the requested image extent.
       *
       * Since we are copying, the blit must use the same format on the
       * destination and source images to avoid format conversions. The
       * only exception is copying stencil, which we upload to a R8UI source
       * image, but that we need to blit to a S8D24 destination (the only
       * stencil format we support).
       */
      blit_region =
         blit_region_for_image_to_buffer(&region->imageOffset,
                                         &region->imageExtent,
                                         mip_level, base_layer, layer_offset,
                                         &info);

      handled = blit_shader(cmd_buffer,
                            v3dv_image_from_handle(buffer_image),
                            info.dst_format,
                            image, info.src_format,
                            info.cmask, &info.cswizzle,
                            &blit_region, VK_FILTER_NEAREST, false);
      if (!handled) {
         /* This is unexpected, we should have a supported blit spec */
         unreachable("Unable to blit buffer to destination image");
         return false;
      }
   }

   assert(handled);
   return true;
}

static bool
copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region);

static VkImageCopy2
image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region,
                                      struct image_to_buffer_info *info,
                                      uint32_t layer)
{
   VkImageCopy2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = region->imageSubresource.mipLevel,
         .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer,
         .layerCount = 1,
      },
      .srcOffset = {
            DIV_ROUND_UP(region->imageOffset.x, info->block_width),
            DIV_ROUND_UP(region->imageOffset.y, info->block_height),
            region->imageOffset.z,
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffset = { 0, 0, 0 },
      .extent = {
         DIV_ROUND_UP(region->imageExtent.width, info->block_width),
         DIV_ROUND_UP(region->imageExtent.height, info->block_height),
         1
      },
   };

   return output;
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_buffer *dst_buffer,
                                  struct v3dv_image *src_image,
                                  const VkBufferImageCopy2 *region)
{
   bool handled = false;
   VkImage dst_buffer_image;
   struct image_to_buffer_info info;

   /* This is a requirement for copy_image_linear_texel_buffer below. We check
    * it in advance in order to do an early return.
    */
   if (src_image->tiled)
      return false;

   handled =
      gather_image_to_buffer_info(cmd_buffer, src_image, region,
                                  &info);
   if (!handled)
      return handled;

   /* At this point the implementation should support the copy; any possible
    * errors below are for different reasons, like an out-of-memory error.
    */
   handled = true;

   uint32_t num_layers;
   if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&src_image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   VkResult result;
   VkImageCopy2 image_region;
   for (uint32_t layer = 0; layer < num_layers; layer++) {
      /* Create the destination image from the destination buffer */
      result =
         create_image_from_buffer(cmd_buffer, dst_buffer, region, &info,
                                  layer, &dst_buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      image_region =
         image_copy_region_for_image_to_buffer(region, &info, layer);

      handled =
         copy_image_linear_texel_buffer(cmd_buffer,
                                        v3dv_image_from_handle(dst_buffer_image),
                                        src_image, &image_region);
   }

   return handled;
}

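/* Copies image data into a buffer by trying, in order: the TLB path (which
 * requires a (0,0) image offset and a tile-friendly extent), a shader blit
 * into a linear image aliasing the buffer, and finally a texel buffer copy
 * for linear source images.
 */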
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
                           const VkCopyImageToBufferInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);

   assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   cmd_buffer->state.is_transfer = true;

   for (uint32_t i = 0; i < info->regionCount; i++) {
      const VkBufferImageCopy2 *region = &info->pRegions[i];

      if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region))
         continue;

      unreachable("Unsupported image to buffer copy.");
   }
   cmd_buffer->state.is_transfer = false;
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   if (V3D_DBG(DISABLE_TFU)) {
      perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n");
      return false;
   }

   /* Destination can't be raster format */
   if (!dst->tiled)
      return false;

   /* We can only do full copies, so if the format is D24S8 both aspects need
    * to be copied. We only need to check the dst format because the spec
    * states that depth/stencil formats must match exactly.
    */
   if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
       if (region->dstSubresource.aspectMask != ds_aspects)
          return false;
   }

   /* Don't handle copies between uncompressed and compressed formats for now.
    *
    * FIXME: we should be able to handle these easily but there is no coverage
    * in CTS at the moment that makes such copies with full images (which we
    * require here), only partial copies. Also, in that case the code below
    * that checks for "dst image complete" requires some changes, since it is
    * checking against the region dimensions, which are in units of the source
    * image format.
    */
   if (vk_format_is_compressed(dst->vk.format) !=
       vk_format_is_compressed(src->vk.format)) {
      return false;
   }

   /* Source region must start at (0,0) */
   if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
      return false;

   /* Destination image must be complete */
   if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
      return false;

   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);

   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
   uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level);
   uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level);
   if (region->extent.width != dst_width || region->extent.height != dst_height)
      return false;

   /* From vkCmdCopyImage:
    *
    *   "When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination."
    */
   const uint32_t block_w =
      vk_format_get_blockwidth(src->planes[src_plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(src->planes[src_plane].vk_format);
   uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   /* Account for sample count */
   assert(dst->vk.samples == src->vk.samples);
   if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) {
      assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT);
      width *= 2;
      height *= 2;
   }

   /* The TFU unit doesn't handle format conversions so we need the formats to
    * match. On the other hand, vkCmdCopyImage allows different color formats
    * on the source and destination images, but only if they are texel
    * compatible. For us, this means that we can effectively ignore different
    * formats and just make the copy using either of them, since we are just
    * moving raw data and not making any conversions.
    *
    * Also, the formats supported by the TFU unit are limited, but again, since
    * we are only doing raw copies here without interpreting or converting
    * the underlying pixel data according to its format, we can always choose
    * to use compatible formats that are supported with the TFU unit.
    */
   assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp);
   const struct v3dv_format *format =
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
                                     dst->planes[dst_plane].cpp, NULL);

   /* Emit a TFU job for each layer to blit */
   const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
      region->extent.depth;
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;

   const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer : region->srcOffset.z;
   const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer : region->dstOffset.z;
   for (uint32_t i = 0; i < layer_count; i++) {
      const uint32_t dst_offset =
         dst->planes[dst_plane].mem->bo->offset +
         v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane);
      const uint32_t src_offset =
         src->planes[src_plane].mem->bo->offset +
         v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane);

      const struct v3d_resource_slice *dst_slice =
         &dst->planes[dst_plane].slices[dst_mip_level];
      const struct v3d_resource_slice *src_slice =
         &src->planes[src_plane].slices[src_mip_level];

      v3d_X((&cmd_buffer->device->devinfo), meta_emit_tfu_job)(
         cmd_buffer,
         dst->planes[dst_plane].mem->bo->handle,
         dst_offset,
         dst_slice->tiling,
         dst_slice->padded_height,
         dst->planes[dst_plane].cpp,
         src->planes[src_plane].mem->bo->handle,
         src_offset,
         src_slice->tiling,
         src_slice->tiling == V3D_TILING_RASTER ?
                              src_slice->stride : src_slice->padded_height,
         src->planes[src_plane].cpp,
         /* All compatible TFU formats are single-plane */
         width, height, &format->planes[0]);
   }

   return true;
}
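
/* The TFU (Texture Formatting Unit) is a fixed-function engine meant for raw
 * data moves and tiling conversions, which is why the path above rejects
 * anything that would need a format conversion, a partial copy, or a linear
 * destination.
 */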

inline bool
v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region)
{
   return copy_image_tfu(cmd_buffer, dst, src, region);
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   assert(src_plane < src->plane_count);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
   assert(dst_plane < dst->plane_count);

   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel,
                              &region->srcOffset, NULL, &fb_format) ||
       !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel,
                              &region->dstOffset, &region->extent, &fb_format)) {
      return false;
   }

   /* We can't do TLB stores of linear depth/stencil */
   if (!dst->tiled && vk_format_is_depth_or_stencil(fb_format))
      return false;

   /* From the Vulkan spec, VkImageCopy valid usage:
    *
    *    "If neither the calling command’s srcImage nor the calling command’s
    *     dstImage has a multi-planar image format then the aspectMask member
    *     of srcSubresource and dstSubresource must match."
    */
   assert(src->plane_count != 1 || dst->plane_count != 1 ||
          region->dstSubresource.aspectMask ==
          region->srcSubresource.aspectMask);
   uint32_t internal_type, internal_bpp;
   v3d_X((&cmd_buffer->device->devinfo), get_internal_type_bpp_for_image_aspects)
      (fb_format, region->dstSubresource.aspectMask,
       &internal_type, &internal_bpp);

   /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
    *
    * "The number of slices of the extent (for 3D) or layers of the
    *  srcSubresource (for non-3D) must match the number of slices of the
    *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
    */
   assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource) :
           region->extent.depth) ==
          (dst->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
           region->extent.depth));
   uint32_t num_layers;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&dst->vk,
                                                    &region->dstSubresource);
   } else {
      num_layers = region->extent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   /* Handle copy to compressed image using compatible format */
   const uint32_t block_w =
      vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        src->vk.samples > VK_SAMPLE_COUNT_1_BIT);

   struct v3dv_meta_framebuffer framebuffer;
   v3d_X((&job->device->devinfo), meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3d_X((&job->device->devinfo), job_emit_binning_flush)(job);
   v3d_X((&job->device->devinfo), meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}
1384 
1385 /**
1386  * Takes the image provided as argument and creates a new image that has
1387  * the same specification and aliases the same memory storage, except that:
1388  *
1389  *   - It has the uncompressed format passed in.
1390  *   - Its original width/height are scaled by the factors passed in.
1391  *
1392  * This is useful to implement copies from compressed images using the blit
1393  * path. The idea is that we create uncompressed "image views" of both the
1394  * source and destination images using the uncompressed format and then we
1395  * define the copy blit in terms of that format.
1396  */
1397 static struct v3dv_image *
create_image_alias(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_image * src,float width_scale,float height_scale,VkFormat format)1398 create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
1399                    struct v3dv_image *src,
1400                    float width_scale,
1401                    float height_scale,
1402                    VkFormat format)
1403 {
1404    assert(!vk_format_is_compressed(format));
1405    /* We don't support ycbcr compressed formats */
1406    assert(src->plane_count == 1);
1407 
1408    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
1409 
1410    VkImageCreateInfo info = {
1411       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1412       .imageType = src->vk.image_type,
1413       .format = format,
1414       .extent = {
1415          .width = src->vk.extent.width * width_scale,
1416          .height = src->vk.extent.height * height_scale,
1417          .depth = src->vk.extent.depth,
1418       },
1419       .mipLevels = src->vk.mip_levels,
1420       .arrayLayers = src->vk.array_layers,
1421       .samples = src->vk.samples,
1422       .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
1423       .usage = src->vk.usage,
1424    };
1425 
1426     VkImage _image;
1427     VkResult result =
1428       v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
1429     if (result != VK_SUCCESS) {
1430        v3dv_flag_oom(cmd_buffer, NULL);
1431        return NULL;
1432     }
1433 
1434     v3dv_cmd_buffer_add_private_obj(
1435        cmd_buffer, (uintptr_t)_image,
1436        (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
1437 
1438     struct v3dv_image *image = v3dv_image_from_handle(_image);
1439     image->planes[0].mem = src->planes[0].mem;
1440     image->planes[0].mem_offset = src->planes[0].mem_offset;
1441     return image;
1442 }
1443 
1444 /**
1445  * Returns true if the implementation supports the requested operation (even if
1446  * it failed to process it, for example, due to an out-of-memory error).
1447  */
1448 static bool
copy_image_blit(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_image * dst,struct v3dv_image * src,const VkImageCopy2 * region)1449 copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
1450                 struct v3dv_image *dst,
1451                 struct v3dv_image *src,
1452                 const VkImageCopy2 *region)
1453 {
1454    if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D)
1455       return false;
1456 
1457    uint8_t src_plane =
1458       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
1459    assert(src_plane < src->plane_count);
1460    uint8_t dst_plane =
1461       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
1462    assert(dst_plane < dst->plane_count);
1463 
1464    const uint32_t src_block_w =
1465       vk_format_get_blockwidth(src->planes[src_plane].vk_format);
1466    const uint32_t src_block_h =
1467       vk_format_get_blockheight(src->planes[src_plane].vk_format);
1468    const uint32_t dst_block_w =
1469       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
1470    const uint32_t dst_block_h =
1471       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
1472    const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1473    const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1474 
   /* We need to choose a single format for the blit to ensure that this is
    * really a copy and there are no format conversions going on. Since we
    * are going to blit, we need to make sure that the selected format can be
    * both rendered to and textured from.
    */
   VkFormat format;
   float src_scale_w = 1.0f;
   float src_scale_h = 1.0f;
   float dst_scale_w = block_scale_w;
   float dst_scale_h = block_scale_h;
   if (vk_format_is_compressed(src->vk.format)) {
      /* If we are copying from a compressed format we should be aware that we
       * are going to texture from the source image, and the texture setup
       * knows the actual size of the image, so we need to choose a format
       * that has a per-texel (not per-block) bpp that is compatible for that
       * image size. For example, for a source image with size Bw*WxBh*H
       * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
       * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
       * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
       * so we could specify a blit with size Bw*WxBh*H and a format with
       * a bpp of 8-bit per texel (R8_UINT).
       *
       * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
       * which is 64 bits per block (4 bits per texel), we would need a
       * 4-bit format, which we don't have, so instead we still choose an
       * 8-bit format, but we apply a divisor to the row dimensions of the
       * blit, since we are copying two texels per item.
       *
       * Generally, we can choose any format so long as we compute appropriate
       * divisors for the width and height depending on the source image's
       * bpp.
       */
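      /* Worked example (illustrative numbers): ETC2_RGBA8_UNORM uses 4x4
       * blocks of 16 bytes, so cpp is 16 and we select RGBA32UI below. The
       * image alias then shrinks each dimension by the 4x4 block size
       * (src_scale = 1/4), so a 64x64 compressed image is aliased as a
       * 16x16 RGBA32UI image covering the same memory.
       */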
      assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);

      switch (src->planes[src_plane].cpp) {
      case 16:
         format = VK_FORMAT_R32G32B32A32_UINT;
         break;
      case 8:
         format = VK_FORMAT_R16G16B16A16_UINT;
         break;
      default:
         unreachable("Unsupported compressed format");
      }

      /* Create image views of the src/dst images that we can interpret in
       * terms of the canonical format.
       */
      src_scale_w /= src_block_w;
      src_scale_h /= src_block_h;
      dst_scale_w /= src_block_w;
      dst_scale_h /= src_block_h;

      src = create_image_alias(cmd_buffer, src,
                               src_scale_w, src_scale_h, format);

      dst = create_image_alias(cmd_buffer, dst,
                               dst_scale_w, dst_scale_h, format);
   } else {
      format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
         src->planes[src_plane].vk_format :
         get_compatible_tlb_format(src->planes[src_plane].vk_format);
      if (format == VK_FORMAT_UNDEFINED)
         return false;

      const struct v3dv_format *f = v3d_X((&cmd_buffer->device->devinfo), get_format)(format);
      assert(f->plane_count < 2);
      if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO)
         return false;
   }

   /* Given an uncompressed image with size WxH, if we copy it to a compressed
    * image, it will result in an image with size W*bWxH*bH, where bW and bH
    * are the compressed format's block width and height. This means that
    * copies between compressed and uncompressed images involve different
    * image sizes, and therefore, we need to take that into account when
    * setting up the source and destination blit regions below, so they are
    * consistent from the point of view of the single compatible format
    * selected for the copy.
    *
    * We should take into account that the dimensions of the region provided
    * to the copy command are specified in terms of the source image. With that
    * in mind, below we adjust the blit destination region to be consistent with
    * the source region for the compatible format, so basically, we apply
    * the block scale factor to the destination offset provided by the copy
    * command (because it is specified in terms of the destination image, not
    * the source), and then we just add the region copy dimensions to that
    * (since the region dimensions are already specified in terms of the source
    * image).
    */
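   /* Illustrative example (assumed values): copying from an uncompressed
    * RGBA32UI source into an ETC2 destination at dstOffset (8, 4) with
    * dst_scale = 1/4 yields a destination start of (2, 1) in the compatible
    * format, to which we add the source-relative region dimensions below.
    */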
   uint32_t region_width = region->extent.width * src_scale_w;
   uint32_t region_height = region->extent.height * src_scale_h;
   if (src_block_w > 1)
      region_width = util_next_power_of_two(region_width);
   if (src_block_h > 1)
      region_height = util_next_power_of_two(region_height);

   const VkOffset3D src_start = {
      region->srcOffset.x * src_scale_w,
      region->srcOffset.y * src_scale_h,
      region->srcOffset.z,
   };
   const VkOffset3D src_end = {
      src_start.x + region_width,
      src_start.y + region_height,
      src_start.z + region->extent.depth,
   };

   const VkOffset3D dst_start = {
      region->dstOffset.x * dst_scale_w,
      region->dstOffset.y * dst_scale_h,
      region->dstOffset.z,
   };
   const VkOffset3D dst_end = {
      dst_start.x + region_width,
      dst_start.y + region_height,
      dst_start.z + region->extent.depth,
   };

   const VkImageBlit2 blit_region = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = region->srcSubresource,
      .srcOffsets = { src_start, src_end },
      .dstSubresource = region->dstSubresource,
      .dstOffsets = { dst_start, dst_end },
   };
   bool handled = blit_shader(cmd_buffer,
                              dst, format,
                              src, format,
                              0, NULL,
                              &blit_region, VK_FILTER_NEAREST, true);

   /* We should have selected formats that we can blit */
   assert(handled);
   return handled;
}

static bool
copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region)
{
   if (src->tiled)
      return false;

   /* Implementations are allowed to restrict linear images like this */
   assert(region->srcOffset.z == 0);
   assert(region->dstOffset.z == 0);
   assert(region->srcSubresource.mipLevel == 0);
   assert(region->srcSubresource.baseArrayLayer == 0);
   assert(region->srcSubresource.layerCount == 1);
   assert(region->dstSubresource.mipLevel == 0);
   assert(region->dstSubresource.baseArrayLayer == 0);
   assert(region->dstSubresource.layerCount == 1);

   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);

   assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
   const uint32_t bpp = src->planes[src_plane].cpp;

   VkFormat format;
   switch (bpp) {
   case 16:
      format = VK_FORMAT_R32G32B32A32_UINT;
      break;
   case 8:
      format = VK_FORMAT_R16G16B16A16_UINT;
      break;
   case 4:
      format = VK_FORMAT_R8G8B8A8_UINT;
      break;
   case 2:
      format = VK_FORMAT_R16_UINT;
      break;
   case 1:
      format = VK_FORMAT_R8_UINT;
      break;
   default:
      unreachable("unsupported bit-size");
      return false;
   }

   VkComponentMapping ident_swizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };

   const uint32_t buf_stride = src->planes[src_plane].slices[0].stride;
   const VkDeviceSize buf_offset =
      region->srcOffset.y * buf_stride + region->srcOffset.x * bpp;
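   /* For illustration (assumed values): with a 256-byte row stride and
    * bpp = 4, a srcOffset of (3, 2) gives buf_offset = 2 * 256 + 3 * 4 =
    * 524 bytes into the linear source plane.
    */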

   struct v3dv_buffer src_buffer;
   vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base,
                       VK_OBJECT_TYPE_BUFFER);

   const struct VkBufferCreateInfo buf_create_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .size = src->planes[src_plane].size,
      .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
   };
   v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer,
                    src->planes[src_plane].alignment);

   const VkBindBufferMemoryInfo buf_bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
      .buffer = v3dv_buffer_to_handle(&src_buffer),
      .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem),
      .memoryOffset = src->planes[src_plane].mem_offset +
         v3dv_layer_offset(src, 0, 0, src_plane),
   };
   v3dv_buffer_bind_memory(&buf_bind_info);

   const VkBufferImageCopy2 copy_region = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
      .pNext = NULL,
      .bufferOffset = buf_offset,
      .bufferRowLength = buf_stride / bpp,
      .bufferImageHeight = src->vk.extent.height,
      .imageSubresource = region->dstSubresource,
      .imageOffset = region->dstOffset,
      .imageExtent = region->extent,
   };

   return texel_buffer_shader_copy(cmd_buffer,
                                   region->dstSubresource.aspectMask,
                                   dst,
                                   format,
                                   format,
                                   &src_buffer,
                                   src->planes[src_plane].cpp,
                                   0 /* color mask: full */, &ident_swizzle,
                                   1, &copy_region);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer,
                   const VkCopyImageInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);

   assert(src->vk.samples == dst->vk.samples);

   cmd_buffer->state.is_transfer = true;

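   /* Try each copy path in order of expected performance: the TFU (requires
    * tiled, full-slice copies), then TLB rendering, then a shader blit, and
    * finally the linear texel buffer fallback.
    */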
   for (uint32_t i = 0; i < info->regionCount; i++) {
      const VkImageCopy2 *region = &info->pRegions[i];
      if (copy_image_tfu(cmd_buffer, dst, src, region))
         continue;
      if (copy_image_tlb(cmd_buffer, dst, src, region))
         continue;
      if (copy_image_blit(cmd_buffer, dst, src, region))
         continue;
      if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region))
         continue;
      unreachable("Image copy not supported");
   }

   cmd_buffer->state.is_transfer = false;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
                    const VkCopyBufferInfo2 *pCopyBufferInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);

   cmd_buffer->state.is_transfer = true;

   for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
      v3d_X((&cmd_buffer->device->devinfo), meta_copy_buffer)
         (cmd_buffer,
          dst_buffer->mem->bo, dst_buffer->mem_offset,
          src_buffer->mem->bo, src_buffer->mem_offset,
          &pCopyBufferInfo->pRegions[i]);
   }

   cmd_buffer->state.is_transfer = false;
}

static void
destroy_update_buffer_cb(VkDevice _device,
                         uint64_t pobj,
                         VkAllocationCallbacks *alloc)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
   v3dv_bo_free(device, bo);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                     VkBuffer dstBuffer,
                     VkDeviceSize dstOffset,
                     VkDeviceSize dataSize,
                     const void *pData)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);

   struct v3dv_bo *src_bo =
      v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
   if (!src_bo) {
      mesa_loge("Failed to allocate BO for vkCmdUpdateBuffer.\n");
      return;
   }

   bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
   if (!ok) {
      mesa_loge("Failed to map BO for vkCmdUpdateBuffer.\n");
      /* Don't leak the BO if we fail to map it */
      v3dv_bo_free(cmd_buffer->device, src_bo);
      return;
   }

   cmd_buffer->state.is_transfer = true;

   memcpy(src_bo->map, pData, dataSize);

   v3dv_bo_unmap(cmd_buffer->device, src_bo);

   VkBufferCopy2 region = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
      .srcOffset = 0,
      .dstOffset = dstOffset,
      .size = dataSize,
   };
   struct v3dv_job *copy_job =
      v3d_X((&cmd_buffer->device->devinfo), meta_copy_buffer)
      (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
       src_bo, 0, &region);

   if (copy_job) {
      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
   }

   cmd_buffer->state.is_transfer = false;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize size,
                   uint32_t data)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);

   cmd_buffer->state.is_transfer = true;

   struct v3dv_bo *bo = dst_buffer->mem->bo;

   /* From the Vulkan spec:
    *
    *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
    *    a multiple of 4, then the nearest smaller multiple is used."
    */
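   /* E.g. (illustrative): with a 10-byte buffer and dstOffset = 4 the
    * remaining size is 6, which is rounded down so we fill 4 bytes.
    */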
   if (size == VK_WHOLE_SIZE) {
      size = dst_buffer->size - dstOffset;
      size -= size % 4;
   }

   v3d_X((&cmd_buffer->device->devinfo), meta_fill_buffer)
      (cmd_buffer, bo, dstOffset, size, data);

   cmd_buffer->state.is_transfer = false;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_image *image,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy2 *region)
{
   if (V3D_DBG(DISABLE_TFU)) {
      perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n");
      return false;
   }

   assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   /* Destination can't be raster format */
   if (!image->tiled)
      return false;

   /* We can't copy D24S8 because buffer to image copies only copy one aspect
    * at a time, and the TFU copies full images. Also, V3D stores the depth
    * bits of both D24S8 and D24X8 in the 24 MSBs of each 32-bit word, while
    * the Vulkan spec specifies the buffer data the other way around, so it
    * is not a straight copy: we would have to swizzle the channels, which
    * the TFU can't do.
    */
   if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
       image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
      return false;
   }

   /* Region must include full slice */
   const uint32_t offset_x = region->imageOffset.x;
   const uint32_t offset_y = region->imageOffset.y;
   if (offset_x != 0 || offset_y != 0)
      return false;

   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   const uint8_t plane =
      v3dv_plane_from_aspect(region->imageSubresource.aspectMask);

   const uint32_t mip_level = region->imageSubresource.mipLevel;
   const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];

   if (width != slice->width || height != slice->height)
      return false;

   /* Handle region semantics for compressed images */
   const uint32_t block_w =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   width = DIV_ROUND_UP(width, block_w);
   height = DIV_ROUND_UP(height, block_h);

   /* Format must be supported for texturing via the TFU. Since we are just
    * copying raw data and not converting between pixel formats, we can ignore
    * the image's format and choose a compatible TFU format for the image
    * texel size instead, which expands the list of formats we can handle here.
    */
   const struct v3dv_format *format =
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
                                     image->planes[plane].cpp, NULL);
   /* We only use single-plane formats with the TFU */
   assert(format->plane_count == 1);
   const struct v3dv_format_plane *format_plane = &format->planes[0];

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   assert(image->planes[plane].mem && image->planes[plane].mem->bo);
   const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo;

   assert(buffer->mem && buffer->mem->bo);
   const struct v3dv_bo *src_bo = buffer->mem->bo;

   /* Emit a TFU job per layer to copy */
   const uint32_t buffer_stride = width * image->planes[plane].cpp;
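   /* Each layer occupies a tightly packed slice of height * buffer_stride
    * bytes in the buffer. For illustration (assumed values): a 64x64 RGBA8
    * layer (cpp = 4) has a 256-byte stride, so each iteration below steps
    * 64 * 256 = 16384 bytes further into the source buffer.
    */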
   for (uint32_t i = 0; i < num_layers; i++) {
      uint32_t layer;
      if (image->vk.image_type != VK_IMAGE_TYPE_3D)
         layer = region->imageSubresource.baseArrayLayer + i;
      else
         layer = region->imageOffset.z + i;

      const uint32_t buffer_offset =
         buffer->mem_offset + region->bufferOffset +
         height * buffer_stride * i;
      const uint32_t src_offset = src_bo->offset + buffer_offset;

      const uint32_t dst_offset =
         dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane);

      v3d_X((&cmd_buffer->device->devinfo), meta_emit_tfu_job)(
             cmd_buffer,
             dst_bo->handle,
             dst_offset,
             slice->tiling,
             slice->padded_height,
             image->planes[plane].cpp,
             src_bo->handle,
             src_offset,
             V3D_TILING_RASTER,
             width,
             1,
             width, height, format_plane);
   }

   return true;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_image *image,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy2 *region)
{
   VkFormat fb_format;
   uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
   assert(plane < image->plane_count);

   if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
                              &region->imageOffset, &region->imageExtent,
                              &fb_format)) {
      return false;
   }

   /* From the Vulkan spec for VkBufferImageCopy2:
    *
    *   "The aspectMask member of imageSubresource must only have a
    *    single bit set."
    *
    * For us this has relevant implications because we can't do TLB stores
    * of linear depth/stencil so we work around this by loading D/S data to the
    * color tile buffer using a compatible color format (see
    * emit_copy_buffer_to_layer_per_tile_list and choose_tlb_format functions),
    * however, when we are copying a single aspect to a combined D/S image
    * we need to preserve the other aspect, and for that we will still use the
    * D/S tile buffer to load and store the aspect of the image we need to
    * preserve, so in this case we are still constrained by the hw restriction
    * for linear D/S stores.
    */
   assert(util_bitcount(region->imageSubresource.aspectMask) == 1);
   if (!image->tiled &&
       vk_format_has_depth(fb_format) &&
       vk_format_has_stencil(fb_format)) {
      return false;
   }

   uint32_t internal_type, internal_bpp;
   v3d_X((&cmd_buffer->device->devinfo), get_internal_type_bpp_for_image_aspects)
      (fb_format, region->imageSubresource.aspectMask,
       &internal_type, &internal_bpp);

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   /* Handle copy to compressed format using a compatible format */
   const uint32_t block_w =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3d_X((&job->device->devinfo), meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3d_X((&job->device->devinfo), job_emit_binning_flush)(job);
   v3d_X((&job->device->devinfo), meta_emit_copy_buffer_to_image_rcl)
      (job, image, buffer, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

static bool
create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *image,
                               struct v3dv_buffer *buffer,
                               const VkBufferImageCopy2 *region)
{
   if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
      return true;
   if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
      return true;
   return false;
}

static VkResult
create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* If this is not the first pool we create for this command buffer,
    * size it based on the size of the currently exhausted pool.
    */
   uint32_t descriptor_count = 64;
   if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
      struct v3dv_descriptor_pool *exhausted_pool =
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
   }
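   /* Illustrative growth sequence: the first pool holds 64 descriptors and
    * each replacement doubles the previous size (128, 256, ...) up to the
    * 1024 cap.
    */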

   /* Create the descriptor pool */
   cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
      .descriptorCount = descriptor_count,
   };
   VkDescriptorPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .maxSets = descriptor_count,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
      .flags = 0,
   };
   VkResult result =
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
                                &info,
                                &cmd_buffer->device->vk.alloc,
                                &cmd_buffer->meta.texel_buffer_copy.dspool);

   if (result == VK_SUCCESS) {
      assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
      const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t) _pool,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);

      struct v3dv_descriptor_pool *pool =
         v3dv_descriptor_pool_from_handle(_pool);
      pool->is_driver_internal = true;
   }

   return result;
}

static VkResult
allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
                                          VkDescriptorSet *set)
{
   /* Make sure we have a descriptor pool */
   VkResult result;
   if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }
   assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);

   /* Allocate descriptor set */
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);
   VkDescriptorSetAllocateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
      .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
      .descriptorSetCount = 1,
      .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
   };
   result = v3dv_AllocateDescriptorSets(_device, &info, set);

   /* If we ran out of pool space, grow the pool and try again */
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
      result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
      if (result == VK_SUCCESS) {
         info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
         result = v3dv_AllocateDescriptorSets(_device, &info, set);
      }
   }

   return result;
}

static void
get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
                                         VkColorComponentFlags cmask,
                                         VkComponentMapping *cswizzle,
                                         bool is_layered,
                                         uint8_t *key)
{
   memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
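   /* Key layout sketch (assuming a 28-byte key size, which matches the
    * assert at the end of this function):
    *   bytes  0..3  : format
    *   bytes  4..7  : cmask
    *   bytes  8..11 : is_layered flag
    *   bytes 12..27 : cswizzle (VkComponentMapping)
    */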

   uint32_t *p = (uint32_t *) key;

   *p = format;
   p++;

   *p = cmask;
   p++;

   /* Note that we are using a single byte for this, so we could pack
    * more data into this 32-bit slot in the future.
    */
   *p = is_layered ? 1 : 0;
   p++;

   memcpy(p, cswizzle, sizeof(VkComponentMapping));
   p += sizeof(VkComponentMapping) / sizeof(uint32_t);

   assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
}

static bool
create_blit_render_pass(struct v3dv_device *device,
                        VkFormat dst_format,
                        VkFormat src_format,
                        VkRenderPass *pass_load,
                        VkRenderPass *pass_no_load);

static bool
create_pipeline(struct v3dv_device *device,
                struct v3dv_render_pass *pass,
                struct nir_shader *vs_nir,
                struct nir_shader *gs_nir,
                struct nir_shader *fs_nir,
                const VkPipelineVertexInputStateCreateInfo *vi_state,
                const VkPipelineDepthStencilStateCreateInfo *ds_state,
                const VkPipelineColorBlendStateCreateInfo *cb_state,
                const VkPipelineMultisampleStateCreateInfo *ms_state,
                const VkPipelineLayout layout,
                VkPipeline *pipeline);

static nir_shader *
get_texel_buffer_copy_vs(const nir_shader_compiler_options *options)
{
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
                                                  "meta texel buffer copy vs");
   nir_variable *vs_out_pos =
      nir_variable_create(b.shader, nir_var_shader_out,
                          glsl_vec4_type(), "gl_Position");
   vs_out_pos->data.location = VARYING_SLOT_POS;

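   /* Emit a full-screen rect; the scissor/viewport state set at draw time
    * restricts rendering to the copy region.
    */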
   nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
   nir_store_var(&b, vs_out_pos, pos, 0xf);

   return b.shader;
}

static nir_shader *
get_texel_buffer_copy_gs(const nir_shader_compiler_options *options)
{
   /* FIXME: this creates a geometry shader that takes the index of a single
    * layer to copy from push constants, so we need to emit a draw call for
    * each layer that we want to copy. We could actually do better and have it
    * take a range of layers, however, if we were to do this, we would need to
    * be careful not to exceed the maximum number of output vertices allowed in
    * a geometry shader.
    */
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
                                                  "meta texel buffer copy gs");
   nir_shader *nir = b.shader;
   nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
   nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
                               (1ull << VARYING_SLOT_LAYER);
   nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES;
   nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
   nir->info.gs.vertices_in = 3;
   nir->info.gs.vertices_out = 3;
   nir->info.gs.invocations = 1;
   nir->info.gs.active_stream_mask = 0x1;

   /* in vec4 gl_Position[3] */
   nir_variable *gs_in_pos =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_array_type(glsl_vec4_type(), 3, 0),
                          "in_gl_Position");
   gs_in_pos->data.location = VARYING_SLOT_POS;

   /* out vec4 gl_Position */
   nir_variable *gs_out_pos =
      nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
                          "out_gl_Position");
   gs_out_pos->data.location = VARYING_SLOT_POS;

   /* out float gl_Layer */
   nir_variable *gs_out_layer =
      nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
                          "out_gl_Layer");
   gs_out_layer->data.location = VARYING_SLOT_LAYER;

   /* Emit output triangle */
   for (uint32_t i = 0; i < 3; i++) {
      /* gl_Position from shader input */
      nir_deref_instr *in_pos_i =
         nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
      nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);

      /* gl_Layer from push constants */
      nir_def *layer =
         nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                                .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
                                .range = 4);
      nir_store_var(&b, gs_out_layer, layer, 0x1);

      nir_emit_vertex(&b, 0);
   }

   nir_end_primitive(&b, 0);

   return nir;
}

static nir_def *
load_frag_coord(nir_builder *b)
{
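   /* Reuse the shader's gl_FragCoord input if one was already declared;
    * otherwise declare it here.
    */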
   nir_foreach_shader_in_variable(var, b->shader) {
      if (var->data.location == VARYING_SLOT_POS)
         return nir_load_var(b, var);
   }
   nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
                                           glsl_vec4_type(), NULL);
   pos->data.location = VARYING_SLOT_POS;
   return nir_load_var(b, pos);
}

static uint32_t
component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
{
   if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
      swz = comp;

   switch (swz) {
   case VK_COMPONENT_SWIZZLE_R:
      return 0;
   case VK_COMPONENT_SWIZZLE_G:
      return 1;
   case VK_COMPONENT_SWIZZLE_B:
      return 2;
   case VK_COMPONENT_SWIZZLE_A:
      return 3;
   default:
      unreachable("Invalid swizzle");
   }
}

static nir_shader *
get_texel_buffer_copy_fs(const nir_shader_compiler_options *options,
                         VkFormat format, VkComponentMapping *cswizzle)
{
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
                                                  "meta texel buffer copy fs");

   /* We only use the copy from texel buffer shader to implement
    * copy_buffer_to_image_shader, which always selects a compatible integer
    * format for the copy.
    */
   assert(vk_format_is_int(format));

   /* Fragment shader output color */
   nir_variable *fs_out_color =
      nir_variable_create(b.shader, nir_var_shader_out,
                          glsl_uvec4_type(), "out_color");
   fs_out_color->data.location = FRAG_RESULT_DATA0;

   /* Texel buffer input */
   const struct glsl_type *sampler_type =
      glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
   nir_variable *sampler =
      nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
   sampler->data.descriptor_set = 0;
   sampler->data.binding = 0;

   /* Load the box describing the pixel region we want to copy from the
    * texel buffer.
    */
   nir_def *box =
      nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
                             .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
                             .range = 16);

   /* Load the buffer stride (this comes in texel units) */
   nir_def *stride =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
                             .range = 4);

   /* Load the buffer offset (this comes in texel units) */
   nir_def *offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
                             .range = 4);

   nir_def *coord = nir_f2i32(&b, load_frag_coord(&b));

   /* Load pixel data from the texel buffer based on the x,y offset of the
    * pixel within the box. Texel buffers are 1D arrays of texels.
    *
    * Notice that we already make sure that we only generate fragments that
    * are inside the box through the scissor/viewport state, so our offset
    * into the texel buffer should always be within its bounds and we don't
    * need to add a check for that here.
    */
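   /* For illustration (assumed values): a fragment at window coordinates
    * (10, 7) with a box origin of (8, 4) and a stride of 256 texels fetches
    * texel offset + 2 + 3 * 256 from the buffer.
    */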
   nir_def *x_offset =
      nir_isub(&b, nir_channel(&b, coord, 0),
                   nir_channel(&b, box, 0));
   nir_def *y_offset =
      nir_isub(&b, nir_channel(&b, coord, 1),
                   nir_channel(&b, box, 1));
   nir_def *texel_offset =
      nir_iadd(&b, nir_iadd(&b, offset, x_offset),
                   nir_imul(&b, y_offset, stride));

   nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def;
   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
   tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
   tex->op = nir_texop_txf;
   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset);
   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
   tex->dest_type = nir_type_uint32;
   tex->is_array = false;
   tex->coord_components = 1;
   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(&b, &tex->instr);

   uint32_t swiz[4];
   swiz[0] =
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
   swiz[1] =
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
   swiz[2] =
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
   swiz[3] =
      component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
   nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4);
   nir_store_var(&b, fs_out_color, s, 0xf);

   return b.shader;
}

static bool
create_texel_buffer_copy_pipeline(struct v3dv_device *device,
                                  VkFormat format,
                                  VkColorComponentFlags cmask,
                                  VkComponentMapping *cswizzle,
                                  bool is_layered,
                                  VkRenderPass _pass,
                                  VkPipelineLayout pipeline_layout,
                                  VkPipeline *pipeline)
{
   struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);

   assert(vk_format_is_color(format));

   const nir_shader_compiler_options *options =
      v3dv_pipeline_get_nir_options(&device->devinfo);

   nir_shader *vs_nir = get_texel_buffer_copy_vs(options);
   nir_shader *fs_nir = get_texel_buffer_copy_fs(options, format, cswizzle);
   nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs(options) : NULL;

   const VkPipelineVertexInputStateCreateInfo vi_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
      .vertexBindingDescriptionCount = 0,
      .vertexAttributeDescriptionCount = 0,
   };

   VkPipelineDepthStencilStateCreateInfo ds_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
   };

   VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
   blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
      .blendEnable = false,
      .colorWriteMask = cmask,
   };

   const VkPipelineColorBlendStateCreateInfo cb_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
      .logicOpEnable = false,
      .attachmentCount = 1,
      .pAttachments = blend_att_state
   };

   const VkPipelineMultisampleStateCreateInfo ms_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
      .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
      .sampleShadingEnable = false,
      .pSampleMask = NULL,
      .alphaToCoverageEnable = false,
      .alphaToOneEnable = false,
   };

   return create_pipeline(device,
                          pass,
                          vs_nir, gs_nir, fs_nir,
                          &vi_state,
                          &ds_state,
                          &cb_state,
                          &ms_state,
                          pipeline_layout,
                          pipeline);
}

static bool
get_copy_texel_buffer_pipeline(
   struct v3dv_cmd_buffer *cmd_buffer,
   VkFormat format,
   VkColorComponentFlags cmask,
   VkComponentMapping *cswizzle,
   VkImageType image_type,
   bool is_layered,
   struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
{
   bool ok = true;
   struct v3dv_device *device = cmd_buffer->device;

   uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
   if (device->instance->meta_cache_enabled) {
      get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
                                               key);

      mtx_lock(&device->meta.mtx);
      struct hash_entry *entry =
         _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
                                 key);
      if (entry) {
         mtx_unlock(&device->meta.mtx);
         *pipeline = entry->data;
         return true;
      }
   }

   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (*pipeline == NULL)
      goto fail;

   /* The blit render pass is compatible */
   ok = create_blit_render_pass(device, format, format,
                                &(*pipeline)->pass,
                                &(*pipeline)->pass_no_load);
   if (!ok)
      goto fail;

   ok =
      create_texel_buffer_copy_pipeline(device,
                                        format, cmask, cswizzle, is_layered,
                                        (*pipeline)->pass,
                                        device->meta.texel_buffer_copy.p_layout,
                                        &(*pipeline)->pipeline);
   if (!ok)
      goto fail;

   if (device->instance->meta_cache_enabled) {
      _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
                              key, *pipeline);
      mtx_unlock(&device->meta.mtx);
   } else {
      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)*pipeline,
         (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_texel_buffer_copy_pipeline);
   }

   return true;

fail:
   if (device->instance->meta_cache_enabled)
      mtx_unlock(&device->meta.mtx);

   VkDevice _device = v3dv_device_to_handle(device);
   if (*pipeline) {
      if ((*pipeline)->pass)
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
      if ((*pipeline)->pipeline)
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
      vk_free(&device->vk.alloc, *pipeline);
      *pipeline = NULL;
   }

   return false;
}
2565 
2566 static bool
texel_buffer_shader_copy(struct v3dv_cmd_buffer * cmd_buffer,VkImageAspectFlags aspect,struct v3dv_image * image,VkFormat dst_format,VkFormat src_format,struct v3dv_buffer * buffer,uint32_t buffer_bpp,VkColorComponentFlags cmask,VkComponentMapping * cswizzle,uint32_t region_count,const VkBufferImageCopy2 * regions)2567 texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
2568                          VkImageAspectFlags aspect,
2569                          struct v3dv_image *image,
2570                          VkFormat dst_format,
2571                          VkFormat src_format,
2572                          struct v3dv_buffer *buffer,
2573                          uint32_t buffer_bpp,
2574                          VkColorComponentFlags cmask,
2575                          VkComponentMapping *cswizzle,
2576                          uint32_t region_count,
2577                          const VkBufferImageCopy2 *regions)
2578 {
2579    VkResult result;
2580    bool handled = false;
2581 
2582    assert(cswizzle);
2583 
2584    /* This is a copy path, so we don't handle format conversions. The only
2585     * exception are stencil to D24S8 copies, which are handled as a color
2586     * masked R8->RGBA8 copy.
2587     */
2588    assert(src_format == dst_format ||
2589           (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
2590            src_format == VK_FORMAT_R8_UINT &&
2591            cmask == VK_COLOR_COMPONENT_R_BIT));
2592 
2593    /* We only handle color copies. Callers can copy D/S aspects by using
2594     * a compatible color format and maybe a cmask/cswizzle for D24 formats.
2595     */
2596    if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format))
2597       return handled;
2598 
2599    /* FIXME: we only handle uncompressed images for now. */
2600    if (vk_format_is_compressed(image->vk.format))
2601       return handled;
2602 
2603    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
2604                                             VK_COLOR_COMPONENT_G_BIT |
2605                                             VK_COLOR_COMPONENT_B_BIT |
2606                                             VK_COLOR_COMPONENT_A_BIT;
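        /* A zero cmask from the caller means "write all components" */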
2607    if (cmask == 0)
2608       cmask = full_cmask;
2609 
2610    /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
2611     * so we can bind it as a texel buffer. Otherwise, the buffer view
2612     * we create below won't set up the texture state that we need for this.
2613     */
2614    if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
2615       if (v3dv_buffer_format_supports_features(
2616              cmd_buffer->device, src_format,
2617              VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) {
2618          buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
2619       } else {
2620          return handled;
2621       }
2622    }
2623 
2624    /* At this point we should be able to handle the copy unless an unexpected
2625     * error occurs, such as an OOM.
2626     */
2627    handled = true;
2628 
2629 
2630    /* Compute the number of layers to copy.
2631     *
2632     * If we are batching (region_count > 1) all our regions have the same
2633     * image subresource so we can take this from the first region. For 3D
2634     * images we require the same depth extent.
2635     */
2636    const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2637    uint32_t num_layers;
2638    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2639       num_layers = vk_image_subresource_layer_count(&image->vk, resource);
2640    } else {
2641       assert(region_count == 1);
2642       num_layers = regions[0].imageExtent.depth;
2643    }
2644    assert(num_layers > 0);
2645 
2646    /* Get the texel buffer copy pipeline */
2647    struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2648    bool ok = get_copy_texel_buffer_pipeline(cmd_buffer,
2649                                             dst_format, cmask, cswizzle,
2650                                             image->vk.image_type, num_layers > 1,
2651                                             &pipeline);
2652    if (!ok)
2653       return handled;
2654    assert(pipeline && pipeline->pipeline && pipeline->pass);
2655 
2656    /* Setup descriptor set for the source texel buffer. We don't have to
2657     * register the descriptor as a private command buffer object since
2658     * all descriptors will be freed automatically with the descriptor
2659     * pool.
2660     */
2661    VkDescriptorSet set;
2662    result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2663    if (result != VK_SUCCESS)
2664       return handled;
2665 
2666    /* We can't pass region->bufferOffset here for the offset field because
2667     * the texture base pointer in the texture shader state must be a 64-byte
2668     * aligned value. Instead, we use 0 here and we pass the offset in texels
2669     * as a push constant to the shader.
2670     */
2671    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2672    VkBufferViewCreateInfo buffer_view_info = {
2673       .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2674       .buffer = v3dv_buffer_to_handle(buffer),
2675       .format = src_format,
2676       .offset = 0,
2677       .range = VK_WHOLE_SIZE,
2678    };
2679 
2680    VkBufferView texel_buffer_view;
2681    result = v3dv_CreateBufferView(_device, &buffer_view_info,
2682                                   &cmd_buffer->device->vk.alloc,
2683                                   &texel_buffer_view);
2684    if (result != VK_SUCCESS)
2685       return handled;
2686 
2687    v3dv_cmd_buffer_add_private_obj(
2688       cmd_buffer, (uintptr_t)texel_buffer_view,
2689       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2690 
2691    VkWriteDescriptorSet write = {
2692       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2693       .dstSet = set,
2694       .dstBinding = 0,
2695       .dstArrayElement = 0,
2696       .descriptorCount = 1,
2697       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2698       .pTexelBufferView = &texel_buffer_view,
2699    };
2700    v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2701 
2702    /* Push command buffer state before starting meta operation */
2703    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2704 
2705    /* Bind common state for all layers and regions */
2706    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2707    v3dv_CmdBindPipeline(_cmd_buffer,
2708                         VK_PIPELINE_BIND_POINT_GRAPHICS,
2709                         pipeline->pipeline);
2710 
2711    v3dv_CmdBindDescriptorSets(_cmd_buffer,
2712                               VK_PIPELINE_BIND_POINT_GRAPHICS,
2713                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2714                               0, 1, &set,
2715                               0, NULL);
2716 
2717    /* Setup framebuffer.
2718     *
2719     * For 3D images, this creates a layered framebuffer with a number of
2720     * layers matching the depth extent of the 3D image.
2721     */
2722    uint8_t plane = v3dv_plane_from_aspect(aspect);
2723    uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel);
2724    uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel);
2725 
2726    VkImageViewCreateInfo image_view_info = {
2727       .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2728       .image = v3dv_image_to_handle(image),
2729       .viewType = v3dv_image_type_to_view_type(image->vk.image_type),
2730       .format = dst_format,
2731       .subresourceRange = {
2732          .aspectMask = aspect,
2733          .baseMipLevel = resource->mipLevel,
2734          .levelCount = 1,
2735          .baseArrayLayer = resource->baseArrayLayer,
2736          .layerCount = num_layers,
2737       },
2738    };
2739    VkImageView image_view;
2740    result = v3dv_create_image_view(cmd_buffer->device,
2741                                    &image_view_info, &image_view);
2742    if (result != VK_SUCCESS)
2743       goto fail;
2744 
2745    v3dv_cmd_buffer_add_private_obj(
2746       cmd_buffer, (uintptr_t)image_view,
2747       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2748 
2749    VkFramebufferCreateInfo fb_info = {
2750       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2751       .renderPass = pipeline->pass,
2752       .attachmentCount = 1,
2753       .pAttachments = &image_view,
2754       .width = fb_width,
2755       .height = fb_height,
2756       .layers = num_layers,
2757    };
2758 
2759    VkFramebuffer fb;
2760    result = v3dv_CreateFramebuffer(_device, &fb_info,
2761                                    &cmd_buffer->device->vk.alloc, &fb);
2762    if (result != VK_SUCCESS)
2763       goto fail;
2764 
2765    v3dv_cmd_buffer_add_private_obj(
2766       cmd_buffer, (uintptr_t)fb,
2767       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2768 
2769    /* For each layer */
2770    for (uint32_t l = 0; l < num_layers; l++) {
2771        /* Start render pass for this layer.
2772         *
2773         * If we only have one region to copy, then we might be able to skip
2774         * the TLB load if the region is aligned to tile boundaries. All layers
2775         * copy the same area, so we only need to check this once.
2776         */
2777       bool can_skip_tlb_load = false;
2778       VkRect2D render_area;
2779       if (region_count == 1) {
2780          render_area.offset.x = regions[0].imageOffset.x;
2781          render_area.offset.y = regions[0].imageOffset.y;
2782          render_area.extent.width = regions[0].imageExtent.width;
2783          render_area.extent.height = regions[0].imageExtent.height;
2784 
2785          if (l == 0) {
2786             struct v3dv_render_pass *pipeline_pass =
2787                v3dv_render_pass_from_handle(pipeline->pass);
2788             can_skip_tlb_load =
2789                cmask == full_cmask &&
2790                v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2791                                                  v3dv_framebuffer_from_handle(fb),
2792                                                  pipeline_pass, 0);
2793          }
2794       } else {
2795          render_area.offset.x = 0;
2796          render_area.offset.y = 0;
2797          render_area.extent.width = fb_width;
2798          render_area.extent.height = fb_height;
2799       }
2800 
2801       VkRenderPassBeginInfo rp_info = {
2802          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2803          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2804                                            pipeline->pass,
2805          .framebuffer = fb,
2806          .renderArea = render_area,
2807          .clearValueCount = 0,
2808       };
2809 
2810       VkSubpassBeginInfo sp_info = {
2811          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
2812          .contents = VK_SUBPASS_CONTENTS_INLINE,
2813       };
2814 
2815       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
2816       struct v3dv_job *job = cmd_buffer->state.job;
2817       if (!job)
2818          goto fail;
2819 
2820       /* If we are using a layered copy we need to specify the layer for the
2821        * Geometry Shader.
2822        */
2823       if (num_layers > 1) {
2824          uint32_t layer = resource->baseArrayLayer + l;
2825          v3dv_CmdPushConstants(_cmd_buffer,
2826                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2827                                VK_SHADER_STAGE_GEOMETRY_BIT,
2828                                24, 4, &layer);
2829       }
2830 
2831       /* For each region */
2832       for (uint32_t r = 0; r < region_count; r++) {
2833          const VkBufferImageCopy2 *region = &regions[r];
2834 
2835          /* Obtain the 2D buffer region spec */
2836          uint32_t buf_width, buf_height;
2837          if (region->bufferRowLength == 0)
2838              buf_width = region->imageExtent.width;
2839          else
2840              buf_width = region->bufferRowLength;
2841 
2842          if (region->bufferImageHeight == 0)
2843              buf_height = region->imageExtent.height;
2844          else
2845              buf_height = region->bufferImageHeight;
2846 
2847          const VkViewport viewport = {
2848             .x = region->imageOffset.x,
2849             .y = region->imageOffset.y,
2850             .width = region->imageExtent.width,
2851             .height = region->imageExtent.height,
2852             .minDepth = 0.0f,
2853             .maxDepth = 1.0f
2854          };
2855          v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2856          const VkRect2D scissor = {
2857             .offset = { region->imageOffset.x, region->imageOffset.y },
2858             .extent = { region->imageExtent.width, region->imageExtent.height }
2859          };
2860          v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2861 
2862          const VkDeviceSize buf_offset =
2863             region->bufferOffset / buffer_bpp + l * buf_height * buf_width;
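              /* FS push constants: the destination rect bounds (inclusive),
               * the buffer row stride in texels, and the buffer base offset
               * in texels (bufferOffset is folded in here because the buffer
               * view above was created with offset 0).
               */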
2864          uint32_t push_data[6] = {
2865             region->imageOffset.x,
2866             region->imageOffset.y,
2867             region->imageOffset.x + region->imageExtent.width - 1,
2868             region->imageOffset.y + region->imageExtent.height - 1,
2869             buf_width,
2870             buf_offset,
2871          };
2872 
2873          v3dv_CmdPushConstants(_cmd_buffer,
2874                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2875                                VK_SHADER_STAGE_FRAGMENT_BIT,
2876                                0, sizeof(push_data), &push_data);
2877 
2878          v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2879       } /* For each region */
2880 
2881       VkSubpassEndInfo sp_end_info = {
2882          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
2883       };
2884 
2885       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
2886    } /* For each layer */
2887 
2888 fail:
2889    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
2890    return handled;
2891 }
2892 
2893 /**
2894  * Returns true if the implementation supports the requested operation (even if
2895  * it failed to process it, for example, due to an out-of-memory error).
2896  */
2897 static bool
2898 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2899                           VkImageAspectFlags aspect,
2900                           struct v3dv_image *image,
2901                           VkFormat dst_format,
2902                           VkFormat src_format,
2903                           struct v3dv_buffer *buffer,
2904                           uint32_t buffer_bpp,
2905                           VkColorComponentFlags cmask,
2906                           VkComponentMapping *cswizzle,
2907                           uint32_t region_count,
2908                           const VkBufferImageCopy2 *regions)
2909 {
2910    /* Since we can't sample linear images we need to upload the linear
2911     * buffer to a tiled image that we can use as a blit source, which
2912     * is slow.
2913     */
2914    perf_debug("Falling back to blit path for buffer to image copy.\n");
2915 
2916    struct v3dv_device *device = cmd_buffer->device;
2917    VkDevice _device = v3dv_device_to_handle(device);
2918    bool handled = true;
2919 
2920    /* Allocate memory for the tiled image. Since we copy layer by layer
2921     * we allocate memory to hold a full layer, which is the worst case.
2922     * For that we create a dummy image with that spec, get memory requirements
2923     * for it and use that information to create the memory allocation.
2924     * We will then reuse this memory store for all the regions we want to
2925     * copy.
2926     */
2927    VkImage dummy_image;
2928    VkImageCreateInfo dummy_info = {
2929       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2930       .imageType = VK_IMAGE_TYPE_2D,
2931       .format = src_format,
2932       .extent = { image->vk.extent.width, image->vk.extent.height, 1 },
2933       .mipLevels = 1,
2934       .arrayLayers = 1,
2935       .samples = VK_SAMPLE_COUNT_1_BIT,
2936       .tiling = VK_IMAGE_TILING_OPTIMAL,
2937       .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2938                VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2939       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2940       .queueFamilyIndexCount = 0,
2941       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2942    };
2943    VkResult result =
2944       v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2945    if (result != VK_SUCCESS)
2946       return handled;
2947 
2948    VkMemoryRequirements reqs;
2949    vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2950    v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
2951 
2952    VkDeviceMemory mem;
2953    VkMemoryAllocateInfo alloc_info = {
2954       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2955       .allocationSize = reqs.size,
2956       .memoryTypeIndex = 0,
2957    };
2958    result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2959    if (result != VK_SUCCESS)
2960       return handled;
2961 
2962    v3dv_cmd_buffer_add_private_obj(
2963       cmd_buffer, (uintptr_t)mem,
2964       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2965 
2966    /* Obtain the layer count.
2967     *
2968     * If we are batching (region_count > 1) all our regions have the same
2969     * image subresource so we can take this from the first region.
2970     */
2971    uint32_t num_layers;
2972    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2973       num_layers = vk_image_subresource_layer_count(&image->vk,
2974                                                     &regions[0].imageSubresource);
2975    } else {
2976       num_layers = regions[0].imageExtent.depth;
2977    }
2978    assert(num_layers > 0);
2979 
2980    /* Sanity check: we can only batch multiple regions together if they have
2981     * the same framebuffer (so the same layer).
2982     */
2983    assert(num_layers == 1 || region_count == 1);
2984 
2985    uint8_t plane = v3dv_plane_from_aspect(aspect);
2986    assert(plane < image->plane_count);
2987 
2988    const uint32_t block_width =
2989       vk_format_get_blockwidth(image->planes[plane].vk_format);
2990    const uint32_t block_height =
2991       vk_format_get_blockheight(image->planes[plane].vk_format);
2992 
2993    /* Copy regions by uploading each region to a temporary tiled image using
2994     * the memory we have just allocated as storage.
2995     */
2996    for (uint32_t r = 0; r < region_count; r++) {
2997       const VkBufferImageCopy2 *region = &regions[r];
2998 
2999       /* Obtain the 2D buffer region spec */
3000       uint32_t buf_width, buf_height;
3001       if (region->bufferRowLength == 0)
3002           buf_width = region->imageExtent.width;
3003       else
3004           buf_width = region->bufferRowLength;
3005 
3006       if (region->bufferImageHeight == 0)
3007           buf_height = region->imageExtent.height;
3008       else
3009           buf_height = region->bufferImageHeight;
3010 
3011       /* If the image is compressed, the bpp refers to blocks, not pixels */
3012       buf_width = buf_width / block_width;
3013       buf_height = buf_height / block_height;
3014 
3015       for (uint32_t i = 0; i < num_layers; i++) {
3016          /* Create the tiled image */
3017          VkImageCreateInfo image_info = {
3018             .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3019             .imageType = VK_IMAGE_TYPE_2D,
3020             .format = src_format,
3021             .extent = { buf_width, buf_height, 1 },
3022             .mipLevels = 1,
3023             .arrayLayers = 1,
3024             .samples = VK_SAMPLE_COUNT_1_BIT,
3025             .tiling = VK_IMAGE_TILING_OPTIMAL,
3026             .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
3027                      VK_IMAGE_USAGE_TRANSFER_DST_BIT,
3028             .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3029             .queueFamilyIndexCount = 0,
3030             .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3031          };
3032 
3033          VkImage buffer_image;
3034          VkResult result =
3035             v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
3036                              &buffer_image);
3037          if (result != VK_SUCCESS)
3038             return handled;
3039 
3040          v3dv_cmd_buffer_add_private_obj(
3041             cmd_buffer, (uintptr_t)buffer_image,
3042             (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
3043 
3044          result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
3045          if (result != VK_SUCCESS)
3046             return handled;
3047 
3048          /* When copying a multi-plane image the aspect indicates the plane to
3049           * copy. For these, we only copy one plane at a time, which is always
3050           * a color plane.
3051           */
3052          VkImageAspectFlags copy_aspect =
3053             image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT;
3054 
3055          /* Upload buffer contents for the selected layer */
3056          const VkDeviceSize buf_offset_bytes =
3057             region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
3058          const VkBufferImageCopy2 buffer_image_copy = {
3059             .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
3060             .bufferOffset = buf_offset_bytes,
3061             .bufferRowLength = region->bufferRowLength / block_width,
3062             .bufferImageHeight = region->bufferImageHeight / block_height,
3063             .imageSubresource = {
3064                .aspectMask = copy_aspect,
3065                .mipLevel = 0,
3066                .baseArrayLayer = 0,
3067                .layerCount = 1,
3068             },
3069             .imageOffset = { 0, 0, 0 },
3070             .imageExtent = { buf_width, buf_height, 1 }
3071          };
3072          handled =
3073             create_tiled_image_from_buffer(cmd_buffer,
3074                                            v3dv_image_from_handle(buffer_image),
3075                                            buffer, &buffer_image_copy);
3076          if (!handled) {
3077             /* This is unexpected, we should have set up the upload to be
3078              * compatible with a TFU or TLB copy.
3079              */
3080             unreachable("Unable to copy buffer to image through TLB");
3081             return false;
3082          }
3083 
3084          /* Blit-copy the requested image extent from the buffer image to the
3085           * destination image.
3086           *
3087           * Since we are copying, the blit must use the same format on the
3088           * destination and source images to avoid format conversions. The
3089           * only exception is copying stencil, which we upload to a R8UI source
3090           * image but then need to blit to a S8D24 destination (the only
3091           * stencil format we support).
3092           */
3093          const VkImageBlit2 blit_region = {
3094             .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
3095             .srcSubresource = {
3096                .aspectMask = copy_aspect,
3097                .mipLevel = 0,
3098                .baseArrayLayer = 0,
3099                .layerCount = 1,
3100             },
3101             .srcOffsets = {
3102                { 0, 0, 0 },
3103                { region->imageExtent.width, region->imageExtent.height, 1 },
3104             },
3105             .dstSubresource = {
3106                .aspectMask = aspect,
3107                .mipLevel = region->imageSubresource.mipLevel,
3108                .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
3109                .layerCount = 1,
3110             },
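                 /* Offsets in block units so the blit matches the per-block
                  * dimensions of the temporary buffer image for compressed
                  * formats.
                  */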
3111             .dstOffsets = {
3112                {
3113                   DIV_ROUND_UP(region->imageOffset.x, block_width),
3114                   DIV_ROUND_UP(region->imageOffset.y, block_height),
3115                   region->imageOffset.z + i,
3116                },
3117                {
3118                   DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
3119                                block_width),
3120                   DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
3121                                block_height),
3122                   region->imageOffset.z + i + 1,
3123                },
3124             },
3125          };
3126 
3127          handled = blit_shader(cmd_buffer,
3128                                image, dst_format,
3129                                v3dv_image_from_handle(buffer_image), src_format,
3130                                cmask, cswizzle,
3131                                &blit_region, VK_FILTER_NEAREST, true);
3132          if (!handled) {
3133             /* This is unexpected, we should have a supported blit spec */
3134             unreachable("Unable to blit buffer to destination image");
3135             return false;
3136          }
3137       }
3138    }
3139 
3140    return handled;
3141 }
3142 
3143 /**
3144  * Returns true if the implementation supports the requested operation (even if
3145  * it failed to process it, for example, due to an out-of-memory error).
3146  */
3147 static bool
3148 copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
3149                             struct v3dv_image *image,
3150                             struct v3dv_buffer *buffer,
3151                             uint32_t region_count,
3152                             const VkBufferImageCopy2 *regions,
3153                             bool use_texel_buffer)
3154 {
3155    /* We can only call this with region_count > 1 if we can batch the regions
3156     * together, in which case they share the same image subresource, and so
3157     * the same aspect.
3158     */
3159    VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
3160    const VkImageAspectFlagBits any_plane_aspect =
3161       VK_IMAGE_ASPECT_PLANE_0_BIT |
3162       VK_IMAGE_ASPECT_PLANE_1_BIT |
3163       VK_IMAGE_ASPECT_PLANE_2_BIT;
3164 
3165    bool is_plane_aspect = aspect & any_plane_aspect;
3166 
3167    /* Generally, the bpp of the data in the buffer matches that of the
3168     * destination image. The exception is the case where we are uploading
3169     * stencil (8bpp) to a combined d24s8 image (32bpp).
3170     */
3171    uint8_t plane = v3dv_plane_from_aspect(aspect);
3172    assert(plane < image->plane_count);
3173    uint32_t buf_bpp = image->planes[plane].cpp;
3174 
3175    /* We are about to upload the buffer data to an image so we can then
3176     * blit that to our destination region. Because we are going to implement
3177     * the copy as a blit, we want our blit source and destination formats to be
3178     * the same (to avoid any format conversions), so we choose a canonical
3179     * format that matches the destination image bpp.
3180     */
3181    VkComponentMapping ident_swizzle = {
3182       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3183       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3184       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
3185       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
3186    };
3187 
3188    VkComponentMapping cswizzle = ident_swizzle;
3189    VkColorComponentFlags cmask = 0; /* Write all components */
3190    VkFormat src_format;
3191    VkFormat dst_format;
3192    switch (buf_bpp) {
3193    case 16:
3194       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3195       src_format = VK_FORMAT_R32G32B32A32_UINT;
3196       dst_format = src_format;
3197       break;
3198    case 8:
3199       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3200       src_format = VK_FORMAT_R16G16B16A16_UINT;
3201       dst_format = src_format;
3202       break;
3203    case 4:
3204       switch (aspect) {
3205       case VK_IMAGE_ASPECT_COLOR_BIT:
3206       case VK_IMAGE_ASPECT_PLANE_0_BIT:
3207       case VK_IMAGE_ASPECT_PLANE_1_BIT:
3208       case VK_IMAGE_ASPECT_PLANE_2_BIT:
3209          src_format = VK_FORMAT_R8G8B8A8_UINT;
3210          dst_format = src_format;
3211          break;
3212       case VK_IMAGE_ASPECT_DEPTH_BIT:
3213          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
3214                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3215                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
3216          src_format = VK_FORMAT_R8G8B8A8_UINT;
3217          dst_format = src_format;
3218 
3219          /* For D24 formats, the Vulkan spec states that the depth component
3220           * in the buffer is stored in the 24-LSB, but V3D wants it in the
3221           * 24-MSB.
3222           */
3223          if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3224              image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
3225             cmask = VK_COLOR_COMPONENT_G_BIT |
3226                     VK_COLOR_COMPONENT_B_BIT |
3227                     VK_COLOR_COMPONENT_A_BIT;
3228             cswizzle.r = VK_COMPONENT_SWIZZLE_R;
3229             cswizzle.g = VK_COMPONENT_SWIZZLE_R;
3230             cswizzle.b = VK_COMPONENT_SWIZZLE_G;
3231             cswizzle.a = VK_COMPONENT_SWIZZLE_B;
3232          }
3233          break;
3234       case VK_IMAGE_ASPECT_STENCIL_BIT:
3235          /* Since we don't support separate stencil this is always a stencil
3236           * copy to a combined depth/stencil image. Because we don't support
3237           * separate stencil images, we interpret the buffer data as a
3238           * color R8UI image, and implement the blit as a compatible color
3239           * blit to an RGBA8UI destination masking out writes to components
3240           * GBA (which map to the D24 component of a S8D24 image).
3241           */
3242          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
3243          buf_bpp = 1;
3244          src_format = VK_FORMAT_R8_UINT;
3245          dst_format = VK_FORMAT_R8G8B8A8_UINT;
3246          cmask = VK_COLOR_COMPONENT_R_BIT;
3247          break;
3248       default:
3249          unreachable("unsupported aspect");
3250          return false;
3251       };
3252       break;
3253    case 2:
3254       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
3255              aspect == VK_IMAGE_ASPECT_DEPTH_BIT ||
3256              is_plane_aspect);
3257       src_format = VK_FORMAT_R16_UINT;
3258       dst_format = src_format;
3259       break;
3260    case 1:
3261       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect);
3262       src_format = VK_FORMAT_R8_UINT;
3263       dst_format = src_format;
3264       break;
3265    default:
3266       unreachable("unsupported bit-size");
3267       return false;
3268    }
3269 
3270    if (use_texel_buffer) {
3271       return texel_buffer_shader_copy(cmd_buffer, aspect, image,
3272                                       dst_format, src_format,
3273                                       buffer, buf_bpp,
3274                                       cmask, &cswizzle,
3275                                       region_count, regions);
3276    } else {
3277       return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
3278                                        dst_format, src_format,
3279                                        buffer, buf_bpp,
3280                                        cmask, &cswizzle,
3281                                        region_count, regions);
3282    }
3283 }
3284 
3285 VKAPI_ATTR void VKAPI_CALL
3286 v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
3287                            const VkCopyBufferToImageInfo2 *info)
3288 {
3289    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3290    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
3291    V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
3292 
3293    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3294 
3295    cmd_buffer->state.is_transfer = true;
3296 
3297    uint32_t r = 0;
3298    while (r < info->regionCount) {
3299       /* The TFU and TLB paths can only copy one region at a time and the region
3300        * needs to start at the origin. We try these first for the common case
3301        * where we are copying full images, since they should be the fastest.
3302        */
3303       uint32_t batch_size = 1;
3304       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
3305          goto handled;
3306 
3307       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
3308          goto handled;
3309 
3310       /* Otherwise, we are copying subrects, so we fall back to copying
3311        * via shader and texel buffers and we try to batch the regions
3312        * if possible. We can only batch copies if they have the same
3313        * framebuffer spec, which is mostly determined by the image
3314        * subresource of the region.
3315        */
3316       const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
3317       for (uint32_t s = r + 1; s < info->regionCount; s++) {
3318          const VkImageSubresourceLayers *rsc_s =
3319             &info->pRegions[s].imageSubresource;
3320 
3321          if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
3322             break;
3323 
3324          /* For 3D images we also need to check the depth extent */
3325          if (image->vk.image_type == VK_IMAGE_TYPE_3D &&
3326              info->pRegions[s].imageExtent.depth !=
3327              info->pRegions[r].imageExtent.depth) {
3328                break;
3329          }
3330 
3331          batch_size++;
3332       }
3333 
3334       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3335                                       batch_size, &info->pRegions[r], true)) {
3336          goto handled;
3337       }
3338 
3339       /* If we still could not copy, fall back to slower paths.
3340        *
3341        * FIXME: we could try to batch these too, but since they are bound to be
3342        * slow it might not be worth it and we should instead put more effort
3343        * in handling more cases with the other paths.
3344        */
3345       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3346                                       batch_size, &info->pRegions[r], false)) {
3347          goto handled;
3348       }
3349 
3350       unreachable("Unsupported buffer to image copy.");
3351 
3352 handled:
3353       r += batch_size;
3354    }
3355 
3356    cmd_buffer->state.is_transfer = false;
3357 }
3358 
3359 static void
3360 compute_blit_3d_layers(const VkOffset3D *offsets,
3361                        uint32_t *min_layer, uint32_t *max_layer,
3362                        bool *mirror_z);
3363 
3364 /**
3365  * Returns true if the implementation supports the requested operation (even if
3366  * it failed to process it, for example, due to an out-of-memory error).
3367  *
3368  * The TFU blit path doesn't handle scaling so the blit filter parameter can
3369  * be ignored.
3370  */
3371 static bool
3372 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
3373          struct v3dv_image *dst,
3374          struct v3dv_image *src,
3375          const VkImageBlit2 *region)
3376 {
3377    if (V3D_DBG(DISABLE_TFU)) {
3378       perf_debug("Blit: TFU disabled, fallbacks could be slower.");
3379       return false;
3380    }
3381 
3382    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3383    assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3384 
3385    /* From vkCmdBlitImage:
3386     *   "srcImage must not use a format that requires a sampler YCBCR
3387     *    conversion"
3388     *   "dstImage must not use a format that requires a sampler YCBCR
3389     *    conversion"
3390     */
3391    assert(dst->plane_count == 1);
3392    assert(src->plane_count == 1);
3393 
3394    /* Format must match */
3395    if (src->vk.format != dst->vk.format)
3396       return false;
3397 
3398    /* Destination can't be raster format */
3399    if (!dst->tiled)
3400       return false;
3401 
3402    /* Source region must start at (0,0) */
3403    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
3404       return false;
3405 
3406    /* The blit must write the complete destination image */
3407    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
3408       return false;
3409 
3410    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
3411    const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
3412    const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
3413    if (region->dstOffsets[1].x < dst_width - 1 ||
3414        region->dstOffsets[1].y < dst_height - 1) {
3415       return false;
3416    }
3417 
3418    /* No XY scaling */
3419    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
3420        region->srcOffsets[1].y != region->dstOffsets[1].y) {
3421       return false;
3422    }
3423 
3424    /* If the format is D24S8 both aspects need to be copied, since the TFU
3425     * can't be programmed to copy only one aspect of the image.
3426     */
3427    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
3428        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
3429                                              VK_IMAGE_ASPECT_STENCIL_BIT;
3430        if (region->dstSubresource.aspectMask != ds_aspects)
3431           return false;
3432    }
3433 
3434    /* Our TFU blits only handle exact copies (it requires same formats
3435     * on input and output, no scaling, etc), so there are no pixel format
3436     * conversions and we can rewrite the format to use one that is TFU
3437     * compatible based on its texel size.
3438     */
3439    const struct v3dv_format *format =
3440       v3dv_get_compatible_tfu_format(cmd_buffer->device,
3441                                      dst->planes[0].cpp, NULL);
3442 
3443    /* Emit a TFU job for each layer to blit */
3444    assert(vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) ==
3445           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource));
3446 
3447    uint32_t min_dst_layer;
3448    uint32_t max_dst_layer;
3449    bool dst_mirror_z = false;
3450    if (dst->vk.image_type == VK_IMAGE_TYPE_3D) {
3451       compute_blit_3d_layers(region->dstOffsets,
3452                              &min_dst_layer, &max_dst_layer,
3453                              &dst_mirror_z);
3454    } else {
3455       min_dst_layer = region->dstSubresource.baseArrayLayer;
3456       max_dst_layer = min_dst_layer +
3457                       vk_image_subresource_layer_count(&dst->vk,
3458                                                        &region->dstSubresource);
3459    }
3460 
3461    uint32_t min_src_layer;
3462    uint32_t max_src_layer;
3463    bool src_mirror_z = false;
3464    if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
3465       compute_blit_3d_layers(region->srcOffsets,
3466                              &min_src_layer, &max_src_layer,
3467                              &src_mirror_z);
3468    } else {
3469       min_src_layer = region->srcSubresource.baseArrayLayer;
3470       max_src_layer = min_src_layer +
3471                       vk_image_subresource_layer_count(&src->vk,
3472                                                        &region->srcSubresource);
3473    }
3474 
3475    /* No Z scaling for 3D images (for non-3D images both src and dst must
3476     * have the same layerCount).
3477     */
3478    if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
3479       return false;
3480 
3481    const uint32_t layer_count = max_dst_layer - min_dst_layer;
3482    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
3483    for (uint32_t i = 0; i < layer_count; i++) {
3484       /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
3485        * only involves reversing the order of the slices.
3486        */
3487       const uint32_t dst_layer =
3488          dst_mirror_z ? max_dst_layer - i - 1 : min_dst_layer + i;
3489       const uint32_t src_layer =
3490          src_mirror_z ? max_src_layer - i - 1 : min_src_layer + i;
3491 
3492       const uint32_t dst_offset =
3493          dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level,
3494                                                             dst_layer, 0);
3495       const uint32_t src_offset =
3496          src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level,
3497                                                             src_layer, 0);
3498 
3499       const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level];
3500       const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level];
3501 
3502       v3d_X((&cmd_buffer->device->devinfo), meta_emit_tfu_job)(
3503          cmd_buffer,
3504          dst->planes[0].mem->bo->handle,
3505          dst_offset,
3506          dst_slice->tiling,
3507          dst_slice->padded_height,
3508          dst->planes[0].cpp,
3509          src->planes[0].mem->bo->handle,
3510          src_offset,
3511          src_slice->tiling,
3512          src_slice->tiling == V3D_TILING_RASTER ?
3513                               src_slice->stride : src_slice->padded_height,
3514          src->planes[0].cpp,
3515          dst_width, dst_height, &format->planes[0]);
3516    }
3517 
3518    return true;
3519 }
3520 
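     /* 10:10:10:2 integer formats use a 16-bit internal render target type,
      * so the hardware clamp doesn't match their component bit-size and we
      * must clamp in the blit shader instead.
      */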
3521 static bool
3522 format_needs_software_int_clamp(VkFormat format)
3523 {
3524    switch (format) {
3525       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3526       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
3527       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3528       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
3529          return true;
3530       default:
3531          return false;
3532    };
3533 }
3534 
3535 static void
3536 get_blit_pipeline_cache_key(VkFormat dst_format,
3537                             VkFormat src_format,
3538                             VkColorComponentFlags cmask,
3539                             VkSampleCountFlagBits dst_samples,
3540                             VkSampleCountFlagBits src_samples,
3541                             uint8_t *key)
3542 {
3543    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
3544 
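        /* The key packs four 32-bit words: destination format, source format
         * (non-zero only when a software integer clamp is required), color
         * write mask, and the destination/source sample counts.
         */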
3545    uint32_t *p = (uint32_t *) key;
3546 
3547    *p = dst_format;
3548    p++;
3549 
3550    /* Generally, when blitting from a larger format to a smaller format
3551     * the hardware takes care of clamping the source to the RT range.
3552     * Specifically, for integer formats, this is done by using
3553     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup, however, this
3554     * clamps to the bit-size of the render type, and some formats, such as
3555     * rgb10a2_uint have a 16-bit type, so it won't do what we need and we
3556     * have to clamp in software. In these cases, we need to amend the blit
3557     * shader with clamp code that depends on both the src and dst formats, so
3558     * we need the src format to be part of the key.
3559     */
3560    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
3561    p++;
3562 
3563    *p = cmask;
3564    p++;
3565 
3566    *p = (dst_samples << 8) | src_samples;
3567    p++;
3568 
3569    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
3570 }
3571 
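     /* Creates two variants of the blit render pass: one that loads the
      * attachment and one that doesn't, so callers can skip the TLB load
      * when the render area is tile-aligned.
      */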
3572 static bool
3573 create_blit_render_pass(struct v3dv_device *device,
3574                         VkFormat dst_format,
3575                         VkFormat src_format,
3576                         VkRenderPass *pass_load,
3577                         VkRenderPass *pass_no_load)
3578 {
3579    const bool is_color_blit = vk_format_is_color(dst_format);
3580 
3581    /* Attachment load operation is specified below */
3582    VkAttachmentDescription2 att = {
3583       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
3584       .format = dst_format,
3585       .samples = VK_SAMPLE_COUNT_1_BIT,
3586       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3587       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3588       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3589    };
3590 
3591    VkAttachmentReference2 att_ref = {
3592       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
3593       .attachment = 0,
3594       .layout = VK_IMAGE_LAYOUT_GENERAL,
3595    };
3596 
3597    VkSubpassDescription2 subpass = {
3598       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
3599       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3600       .inputAttachmentCount = 0,
3601       .colorAttachmentCount = is_color_blit ? 1 : 0,
3602       .pColorAttachments = is_color_blit ? &att_ref : NULL,
3603       .pResolveAttachments = NULL,
3604       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3605       .preserveAttachmentCount = 0,
3606       .pPreserveAttachments = NULL,
3607    };
3608 
3609    VkRenderPassCreateInfo2 info = {
3610       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
3611       .attachmentCount = 1,
3612       .pAttachments = &att,
3613       .subpassCount = 1,
3614       .pSubpasses = &subpass,
3615       .dependencyCount = 0,
3616       .pDependencies = NULL,
3617    };
3618 
3619    VkResult result;
3620    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3621    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3622                                    &info, &device->vk.alloc, pass_load);
3623    if (result != VK_SUCCESS)
3624       return false;
3625 
3626    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3627    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3628                                    &info, &device->vk.alloc, pass_no_load);
3629    return result == VK_SUCCESS;
3630 }
3631 
3632 static nir_def *
3633 gen_tex_coords(nir_builder *b)
3634 {
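        /* The blit source box (x0, y0, x1, y1) lives at push constant
         * offset 0 and the source Z coordinate at offset 16.
         */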
3635    nir_def *tex_box =
3636       nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3637 
3638    nir_def *tex_z =
3639       nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3640 
3641    nir_def *vertex_id = nir_load_vertex_id(b);
3642 
3643    /* vertex 0: src0_x, src0_y
3644     * vertex 1: src0_x, src1_y
3645     * vertex 2: src1_x, src0_y
3646     * vertex 3: src1_x, src1_y
3647     *
3648     * So:
3649     *
3650     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3651     * channel 1 is vertex_id & 1 ? src1_y : src0_y
3652     */
3653 
3654    nir_def *one = nir_imm_int(b, 1);
3655    nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2);
3656    nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3657 
3658    nir_def *comp[4];
3659    comp[0] = nir_bcsel(b, c0cmp,
3660                        nir_channel(b, tex_box, 0),
3661                        nir_channel(b, tex_box, 2));
3662 
3663    comp[1] = nir_bcsel(b, c1cmp,
3664                        nir_channel(b, tex_box, 3),
3665                        nir_channel(b, tex_box, 1));
3666    comp[2] = tex_z;
3667    comp[3] = nir_imm_float(b, 1.0f);
3668    return nir_vec(b, comp, 4);
3669 }
3670 
3671 static nir_def *
3672 build_nir_tex_op_read(struct nir_builder *b,
3673                       nir_def *tex_pos,
3674                       enum glsl_base_type tex_type,
3675                       enum glsl_sampler_dim dim)
3676 {
3677    assert(dim != GLSL_SAMPLER_DIM_MS);
3678 
3679    const struct glsl_type *sampler_type =
3680       glsl_sampler_type(dim, false, false, tex_type);
3681    nir_variable *sampler =
3682       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3683    sampler->data.descriptor_set = 0;
3684    sampler->data.binding = 0;
3685 
3686    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3687    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3688    tex->sampler_dim = dim;
3689    tex->op = nir_texop_tex;
3690    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3691    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3692    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref);
3693    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3694    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3695    tex->coord_components = tex_pos->num_components;
3696 
3697    nir_def_init(&tex->instr, &tex->def, 4, 32);
3698    nir_builder_instr_insert(b, &tex->instr);
3699    return &tex->def;
3700 }
3701 
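     /* Emits a txf_ms instruction that fetches the sample with the given
      * index at the given position from a multisampled source.
      */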
3702 static nir_def *
3703 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3704                                  nir_variable *sampler,
3705                                  nir_def *tex_deref,
3706                                  enum glsl_base_type tex_type,
3707                                  nir_def *tex_pos,
3708                                  nir_def *sample_idx)
3709 {
3710    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3711    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3712    tex->op = nir_texop_txf_ms;
3713    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3714    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3715    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx);
3716    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3717    tex->is_array = false;
3718    tex->coord_components = tex_pos->num_components;
3719 
3720    nir_def_init(&tex->instr, &tex->def, 4, 32);
3721    nir_builder_instr_insert(b, &tex->instr);
3722    return &tex->def;
3723 }
3724 
3725 /* Fetches all samples at the given position and averages them */
3726 static nir_def *
3727 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3728                             nir_def *tex_pos,
3729                             enum glsl_base_type tex_type,
3730                             VkSampleCountFlagBits src_samples)
3731 {
3732    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3733    const struct glsl_type *sampler_type =
3734       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3735    nir_variable *sampler =
3736       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3737    sampler->data.descriptor_set = 0;
3738    sampler->data.binding = 0;
3739 
3740    const bool is_int = glsl_base_type_is_integer(tex_type);
3741 
3742    nir_def *tmp = NULL;
3743    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3744    for (uint32_t i = 0; i < src_samples; i++) {
3745       nir_def *s =
3746          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3747                                           tex_type, tex_pos,
3748                                           nir_imm_int(b, i));
3749 
3750       /* For integer formats, the multisample resolve operation is expected to
3751        * return one of the samples, we just return the first one.
3752        */
3753       if (is_int)
3754          return s;
3755 
3756       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3757    }
3758 
3759    assert(!is_int);
3760    return nir_fmul_imm(b, tmp, 1.0f / src_samples);
3761 }
3762 
3763 /* Fetches the current sample (gl_SampleID) at the given position */
3764 static nir_def *
3765 build_nir_tex_op_ms_read(struct nir_builder *b,
3766                          nir_def *tex_pos,
3767                          enum glsl_base_type tex_type)
3768 {
3769    const struct glsl_type *sampler_type =
3770       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3771    nir_variable *sampler =
3772       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3773    sampler->data.descriptor_set = 0;
3774    sampler->data.binding = 0;
3775 
3776    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3777 
3778    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3779                                            tex_type, tex_pos,
3780                                            nir_load_sample_id(b));
3781 }
3782 
3783 static nir_def *
3784 build_nir_tex_op(struct nir_builder *b,
3785                  struct v3dv_device *device,
3786                  nir_def *tex_pos,
3787                  enum glsl_base_type tex_type,
3788                  VkSampleCountFlagBits dst_samples,
3789                  VkSampleCountFlagBits src_samples,
3790                  enum glsl_sampler_dim dim)
3791 {
3792    switch (dim) {
3793    case GLSL_SAMPLER_DIM_MS:
3794       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3795       /* For multisampled texture sources we need to use fetching instead of
3796        * normalized texture coordinates. We already configured our blit
3797        * coordinates to be in texel units, but here we still need to convert
3798        * them from floating point to integer.
3799        */
3800       tex_pos = nir_f2i32(b, tex_pos);
3801 
3802       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3803          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3804       else
3805          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3806    default:
3807       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3808       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3809    }
3810 }
3811 
3812 static nir_shader *
3813 get_blit_vs(const nir_shader_compiler_options *options)
3814 {
3815    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3816                                                   "meta blit vs");
3817 
3818    const struct glsl_type *vec4 = glsl_vec4_type();
3819 
3820    nir_variable *vs_out_pos =
3821       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3822    vs_out_pos->data.location = VARYING_SLOT_POS;
3823 
3824    nir_variable *vs_out_tex_coord =
3825       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3826    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3827    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3828 
3829    nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
3830    nir_store_var(&b, vs_out_pos, pos, 0xf);
3831 
3832    nir_def *tex_coord = gen_tex_coords(&b);
3833    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3834 
3835    return b.shader;
3836 }
3837 
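     /* Returns the writemask selecting the texture coordinate components
      * that are meaningful for the given sampler dimension (x, xy or xyz).
      */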
3838 static uint32_t
3839 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3840 {
3841    switch (sampler_dim) {
3842    case GLSL_SAMPLER_DIM_1D: return 0x1;
3843    case GLSL_SAMPLER_DIM_2D: return 0x3;
3844    case GLSL_SAMPLER_DIM_MS: return 0x3;
3845    case GLSL_SAMPLER_DIM_3D: return 0x7;
3846    default:
3847       unreachable("invalid sampler dim");
3848    };
3849 }

static nir_shader *
get_color_blit_fs(const nir_shader_compiler_options *options,
                  struct v3dv_device *device,
                  VkFormat dst_format,
                  VkFormat src_format,
                  VkSampleCountFlagBits dst_samples,
                  VkSampleCountFlagBits src_samples,
                  enum glsl_sampler_dim sampler_dim)
{
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
                                                  "meta blit fs");

   const struct glsl_type *vec4 = glsl_vec4_type();

   nir_variable *fs_in_tex_coord =
      nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
   fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;

   const struct glsl_type *fs_out_type =
      vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
      vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
                                      glsl_vec4_type();

   enum glsl_base_type src_base_type =
      vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
      vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
                                      GLSL_TYPE_FLOAT;

   nir_variable *fs_out_color =
      nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
   fs_out_color->data.location = FRAG_RESULT_DATA0;

   nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
   const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
   tex_coord = nir_channels(&b, tex_coord, channel_mask);

   nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
                                     dst_samples, src_samples, sampler_dim);

   /* For integer textures, if the bit-size of the destination is too small to
    * hold the source value, Vulkan (CTS) expects the implementation to clamp
    * to the maximum value the destination can hold. The hardware can clamp to
    * the render target type, which usually matches the component bit-size, but
    * there are some cases that won't match, such as rgb10a2, which has a
    * 16-bit render target type, so in these cases we need to clamp manually.
    */
   if (format_needs_software_int_clamp(dst_format)) {
      assert(vk_format_is_int(dst_format));
      enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
      enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);

      nir_def *c[4];
      for (uint32_t i = 0; i < 4; i++) {
         c[i] = nir_channel(&b, color, i);

         const uint32_t src_bit_size =
            util_format_get_component_bits(src_pformat,
                                           UTIL_FORMAT_COLORSPACE_RGB,
                                           i);
         const uint32_t dst_bit_size =
            util_format_get_component_bits(dst_pformat,
                                           UTIL_FORMAT_COLORSPACE_RGB,
                                           i);

         if (dst_bit_size >= src_bit_size)
            continue;

         assert(dst_bit_size > 0);
         if (util_format_is_pure_uint(dst_pformat)) {
            nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
            c[i] = nir_umin(&b, c[i], max);
         } else {
            nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
            nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
            c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
         }
      }

      color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
   }
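
   /* Worked example of the manual clamp: blitting R32_UINT into
    * R10G10B10A2_UINT gives dst_bit_size = 10 for the RGB channels, so an
    * unsigned source channel is clamped to (1 << 10) - 1 = 1023. For a
    * hypothetical signed 10-bit destination component the range would be
    * [-(1 << 9), (1 << 9) - 1] = [-512, 511]. Any integer format pair with
    * narrower destination components follows the same pattern.
    */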

   nir_store_var(&b, fs_out_color, color, 0xf);

   return b.shader;
}

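/* Roughly equivalent GLSL for the fragment shader built above (a sketch;
 * the sampler type varies with src_base_type and sampler_dim, and the
 * multisampled paths fetch samples instead of calling texture()):
 *
 *    layout(set = 0, binding = 0) uniform sampler2D tex;
 *    in vec4 in_tex_coord;    // VARYING_SLOT_VAR0
 *    out vec4 out_color;      // FRAG_RESULT_DATA0
 *    void main() {
 *       out_color = texture(tex, in_tex_coord.xy);
 *       // plus the manual integer clamp above when
 *       // format_needs_software_int_clamp(dst_format)
 *    }
 */
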
static bool
create_pipeline(struct v3dv_device *device,
                struct v3dv_render_pass *pass,
                struct nir_shader *vs_nir,
                struct nir_shader *gs_nir,
                struct nir_shader *fs_nir,
                const VkPipelineVertexInputStateCreateInfo *vi_state,
                const VkPipelineDepthStencilStateCreateInfo *ds_state,
                const VkPipelineColorBlendStateCreateInfo *cb_state,
                const VkPipelineMultisampleStateCreateInfo *ms_state,
                const VkPipelineLayout layout,
                VkPipeline *pipeline)
{
   struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
   struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir);
   struct vk_shader_module gs_m;

   uint32_t num_stages = gs_nir ? 3 : 2;

   VkPipelineShaderStageCreateInfo stages[3] = {
      {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .stage = VK_SHADER_STAGE_VERTEX_BIT,
         .module = vk_shader_module_to_handle(&vs_m),
         .pName = "main",
      },
      {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
         .module = vk_shader_module_to_handle(&fs_m),
         .pName = "main",
      },
      {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
         .module = VK_NULL_HANDLE,
         .pName = "main",
      },
   };

   if (gs_nir) {
      gs_m = vk_shader_module_from_nir(gs_nir);
      stages[2].module = vk_shader_module_to_handle(&gs_m);
   }

   VkGraphicsPipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,

      .stageCount = num_stages,
      .pStages = stages,

      .pVertexInputState = vi_state,

      .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
         .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
         .primitiveRestartEnable = false,
      },

      .pViewportState = &(VkPipelineViewportStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
         .viewportCount = 1,
         .scissorCount = 1,
      },

      .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
         .rasterizerDiscardEnable = false,
         .polygonMode = VK_POLYGON_MODE_FILL,
         .cullMode = VK_CULL_MODE_NONE,
         .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
         .depthBiasEnable = false,
      },

      .pMultisampleState = ms_state,

      .pDepthStencilState = ds_state,

      .pColorBlendState = cb_state,

      /* The meta blit pipeline declares all the state it uses as dynamic.
       * As a consequence, vkCmdBindPipeline writes no dynamic state to the
       * cmd buffer. Therefore, at the end of the meta operation, we need
       * only restore dynamic state that was set with vkCmdSet*.
       *
       * Note that dynamicStateCount is 6, so only the first six entries of
       * the array below are consumed; the trailing DEPTH_BIAS and
       * LINE_WIDTH entries are effectively unused.
       */
      .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
         .dynamicStateCount = 6,
         .pDynamicStates = (VkDynamicState[]) {
            VK_DYNAMIC_STATE_VIEWPORT,
            VK_DYNAMIC_STATE_SCISSOR,
            VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
            VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
            VK_DYNAMIC_STATE_STENCIL_REFERENCE,
            VK_DYNAMIC_STATE_BLEND_CONSTANTS,
            VK_DYNAMIC_STATE_DEPTH_BIAS,
            VK_DYNAMIC_STATE_LINE_WIDTH,
         },
      },

      .flags = 0,
      .layout = layout,
      .renderPass = v3dv_render_pass_to_handle(pass),
      .subpass = 0,
   };

   VkResult result =
      v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
                                   VK_NULL_HANDLE,
                                   1, &info,
                                   &device->vk.alloc,
                                   pipeline);

   ralloc_free(vs_nir);
   ralloc_free(gs_nir);
   ralloc_free(fs_nir);

   return result == VK_SUCCESS;
}

static enum glsl_sampler_dim
get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
{
   /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
    *
    *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
    *    VK_IMAGE_TYPE_2D, ..."
    */
   assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);

   switch (type) {
   case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
   case VK_IMAGE_TYPE_2D:
      return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
                                                    GLSL_SAMPLER_DIM_MS;
   case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
   default:
      unreachable("Invalid image type");
   }
}

static bool
create_blit_pipeline(struct v3dv_device *device,
                     VkFormat dst_format,
                     VkFormat src_format,
                     VkColorComponentFlags cmask,
                     VkImageType src_type,
                     VkSampleCountFlagBits dst_samples,
                     VkSampleCountFlagBits src_samples,
                     VkRenderPass _pass,
                     VkPipelineLayout pipeline_layout,
                     VkPipeline *pipeline)
{
   struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);

   /* We always rewrite depth/stencil blits to compatible color blits */
   assert(vk_format_is_color(dst_format));
   assert(vk_format_is_color(src_format));

   const nir_shader_compiler_options *options =
      v3dv_pipeline_get_nir_options(&device->devinfo);

   const enum glsl_sampler_dim sampler_dim =
      get_sampler_dim(src_type, src_samples);

   nir_shader *vs_nir = get_blit_vs(options);
   nir_shader *fs_nir =
      get_color_blit_fs(options, device, dst_format, src_format,
                        dst_samples, src_samples, sampler_dim);

   const VkPipelineVertexInputStateCreateInfo vi_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
      .vertexBindingDescriptionCount = 0,
      .vertexAttributeDescriptionCount = 0,
   };

   VkPipelineDepthStencilStateCreateInfo ds_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
   };

   VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
   blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
      .blendEnable = false,
      .colorWriteMask = cmask,
   };

   const VkPipelineColorBlendStateCreateInfo cb_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
      .logicOpEnable = false,
      .attachmentCount = 1,
      .pAttachments = blend_att_state,
   };

   const VkPipelineMultisampleStateCreateInfo ms_state = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
      .rasterizationSamples = dst_samples,
      .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
      .pSampleMask = NULL,
      .alphaToCoverageEnable = false,
      .alphaToOneEnable = false,
   };

   return create_pipeline(device,
                          pass,
                          vs_nir, NULL, fs_nir,
                          &vi_state,
                          &ds_state,
                          &cb_state,
                          &ms_state,
                          pipeline_layout,
                          pipeline);
}

/**
 * Returns a pipeline suitable for blitting the requested aspect given the
 * destination and source formats.
 */
static bool
get_blit_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
                  VkFormat dst_format,
                  VkFormat src_format,
                  VkColorComponentFlags cmask,
                  VkImageType src_type,
                  VkSampleCountFlagBits dst_samples,
                  VkSampleCountFlagBits src_samples,
                  struct v3dv_meta_blit_pipeline **pipeline)
{
   bool ok = true;
   struct v3dv_device *device = cmd_buffer->device;

   uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
   if (device->instance->meta_cache_enabled) {
      get_blit_pipeline_cache_key(dst_format, src_format, cmask,
                                  dst_samples, src_samples, key);
      mtx_lock(&device->meta.mtx);
      struct hash_entry *entry =
         _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
      if (entry) {
         mtx_unlock(&device->meta.mtx);
         *pipeline = entry->data;
         return true;
      }
   }
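
   /* On a cache miss device->meta.mtx stays held until the new pipeline is
    * inserted below (or we fail), so two command buffers recording in
    * parallel cannot race to create the same blit pipeline for one cache
    * key.
    */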

   *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (*pipeline == NULL)
      goto fail;

   ok = create_blit_render_pass(device, dst_format, src_format,
                                &(*pipeline)->pass,
                                &(*pipeline)->pass_no_load);
   if (!ok)
      goto fail;

   /* Create the pipeline using one of the render passes: they are both
    * compatible, so it doesn't matter which one we use here.
    */
   ok = create_blit_pipeline(device,
                             dst_format,
                             src_format,
                             cmask,
                             src_type,
                             dst_samples,
                             src_samples,
                             (*pipeline)->pass,
                             device->meta.blit.p_layout,
                             &(*pipeline)->pipeline);
   if (!ok)
      goto fail;

   if (device->instance->meta_cache_enabled) {
      memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
      _mesa_hash_table_insert(device->meta.blit.cache[src_type],
                              &(*pipeline)->key, *pipeline);
      mtx_unlock(&device->meta.mtx);
   } else {
      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)*pipeline,
         (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_blit_pipeline);
   }

   return true;

fail:
   if (device->instance->meta_cache_enabled)
      mtx_unlock(&device->meta.mtx);

   VkDevice _device = v3dv_device_to_handle(device);
   if (*pipeline) {
      if ((*pipeline)->pass)
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
      if ((*pipeline)->pass_no_load)
         v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
      if ((*pipeline)->pipeline)
         v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
      vk_free(&device->vk.alloc, *pipeline);
      *pipeline = NULL;
   }

   return false;
}

static void
compute_blit_box(const VkOffset3D *offsets,
                 uint32_t image_w, uint32_t image_h,
                 uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
                 bool *mirror_x, bool *mirror_y)
{
   if (offsets[1].x >= offsets[0].x) {
      *mirror_x = false;
      *x = MIN2(offsets[0].x, image_w - 1);
      *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
   } else {
      *mirror_x = true;
      *x = MIN2(offsets[1].x, image_w - 1);
      *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
   }
   if (offsets[1].y >= offsets[0].y) {
      *mirror_y = false;
      *y = MIN2(offsets[0].y, image_h - 1);
      *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
   } else {
      *mirror_y = true;
      *y = MIN2(offsets[1].y, image_h - 1);
      *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
   }
}
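
/* Example: for a horizontally mirrored blit with offsets[0].x = 24 and
 * offsets[1].x = 8 on a 32-pixel-wide level, this computes mirror_x = true,
 * x = 8 and w = 16. The box always covers the same texels regardless of
 * direction; the mirroring itself is applied later by swapping the texture
 * coordinates.
 */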

static void
compute_blit_3d_layers(const VkOffset3D *offsets,
                       uint32_t *min_layer, uint32_t *max_layer,
                       bool *mirror_z)
{
   if (offsets[1].z >= offsets[0].z) {
      *mirror_z = false;
      *min_layer = offsets[0].z;
      *max_layer = offsets[1].z;
   } else {
      *mirror_z = true;
      *min_layer = offsets[1].z;
      *max_layer = offsets[0].z;
   }
}

static VkResult
create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* If this is not the first pool we create for this command buffer,
    * size it based on the size of the currently exhausted pool.
    */
   uint32_t descriptor_count = 64;
   if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
      struct v3dv_descriptor_pool *exhausted_pool =
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
   }

   /* Create the descriptor pool */
   cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      .descriptorCount = descriptor_count,
   };
   VkDescriptorPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .maxSets = descriptor_count,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
      .flags = 0,
   };
   VkResult result =
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
                                &info,
                                &cmd_buffer->device->vk.alloc,
                                &cmd_buffer->meta.blit.dspool);

   if (result == VK_SUCCESS) {
      assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
      const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t) _pool,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);

      struct v3dv_descriptor_pool *pool =
         v3dv_descriptor_pool_from_handle(_pool);
      pool->is_driver_internal = true;
   }

   return result;
}
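
/* Pool sizes therefore grow geometrically as a command buffer records more
 * blits: 64, 128, 256, 512, then capped at 1024 sets per pool. Each
 * exhausted pool stays registered as a private object above, so it is only
 * freed together with the command buffer itself.
 */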

static VkResult
allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
                                    VkDescriptorSet *set)
{
   /* Make sure we have a descriptor pool */
   VkResult result;
   if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
      result = create_blit_descriptor_pool(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }
   assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);

   /* Allocate descriptor set */
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);
   VkDescriptorSetAllocateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
      .descriptorPool = cmd_buffer->meta.blit.dspool,
      .descriptorSetCount = 1,
      .pSetLayouts = &device->meta.blit.ds_layout,
   };
   result = v3dv_AllocateDescriptorSets(_device, &info, set);

   /* If we ran out of pool space, grow the pool and try again */
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
      result = create_blit_descriptor_pool(cmd_buffer);
      if (result == VK_SUCCESS) {
         info.descriptorPool = cmd_buffer->meta.blit.dspool;
         result = v3dv_AllocateDescriptorSets(_device, &info, set);
      }
   }

   return result;
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 *
 * The caller can specify the channels on the destination to be written via
 * the cmask parameter (which can be 0 to default to all channels), as well
 * as a swizzle to apply to the source via the cswizzle parameter (which can
 * be NULL to use the default identity swizzle).
 *
 * Supports multi-plane formats too.
 */
static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit2 *region,
            VkFilter filter,
            bool dst_is_padded_image)
{
   bool handled = true;
   VkResult result;

   /* Can't sample from linear images */
   if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) {
      return false;
   }

   /* Rewrite combined D/S blits to compatible color blits */
   if (vk_format_is_depth_or_stencil(dst_format)) {
      assert(src_format == dst_format);
      assert(cmask == 0);
      switch (dst_format) {
      case VK_FORMAT_D16_UNORM:
         dst_format = VK_FORMAT_R16_UINT;
         break;
      case VK_FORMAT_D32_SFLOAT:
         dst_format = VK_FORMAT_R32_UINT;
         break;
      case VK_FORMAT_X8_D24_UNORM_PACK32:
      case VK_FORMAT_D24_UNORM_S8_UINT:
         if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            cmask |= VK_COLOR_COMPONENT_G_BIT |
                     VK_COLOR_COMPONENT_B_BIT |
                     VK_COLOR_COMPONENT_A_BIT;
         }
         if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
            cmask |= VK_COLOR_COMPONENT_R_BIT;
         }
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      default:
         unreachable("Unsupported depth/stencil format");
      }
      src_format = dst_format;
   }

   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   assert(src_plane < src->plane_count);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
   assert(dst_plane < dst->plane_count);

   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
                                            VK_COLOR_COMPONENT_G_BIT |
                                            VK_COLOR_COMPONENT_B_BIT |
                                            VK_COLOR_COMPONENT_A_BIT;
   if (cmask == 0)
      cmask = full_cmask;

   VkComponentMapping ident_swizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   if (!cswizzle)
      cswizzle = &ident_swizzle;

   /* When we get here from a copy between compressed / uncompressed images
    * we choose to specify the destination blit region based on the size
    * semantics of the source image of the copy (see copy_image_blit), so we
    * need to apply those same semantics here when we compute the size of the
    * destination image level.
    */
   const uint32_t dst_block_w =
      vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
   const uint32_t dst_block_h =
      vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
   const uint32_t src_block_w =
      vk_format_get_blockwidth(src->planes[src_plane].vk_format);
   const uint32_t src_block_h =
      vk_format_get_blockheight(src->planes[src_plane].vk_format);
   const uint32_t dst_level_w =
      u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
               region->dstSubresource.mipLevel);
   const uint32_t dst_level_h =
      u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
               region->dstSubresource.mipLevel);

   const uint32_t src_level_w =
      u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel);
   const uint32_t src_level_h =
      u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel);

   assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D);
   const uint32_t src_level_d =
      u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel);

   uint32_t dst_x, dst_y, dst_w, dst_h;
   bool dst_mirror_x, dst_mirror_y;
   compute_blit_box(region->dstOffsets,
                    dst_level_w, dst_level_h,
                    &dst_x, &dst_y, &dst_w, &dst_h,
                    &dst_mirror_x, &dst_mirror_y);

   uint32_t src_x, src_y, src_w, src_h;
   bool src_mirror_x, src_mirror_y;
   compute_blit_box(region->srcOffsets,
                    src_level_w, src_level_h,
                    &src_x, &src_y, &src_w, &src_h,
                    &src_mirror_x, &src_mirror_y);

   uint32_t min_dst_layer;
   uint32_t max_dst_layer;
   bool dst_mirror_z = false;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
      min_dst_layer = region->dstSubresource.baseArrayLayer;
      max_dst_layer = min_dst_layer +
                      vk_image_subresource_layer_count(&dst->vk,
                                                       &region->dstSubresource);
   } else {
      compute_blit_3d_layers(region->dstOffsets,
                             &min_dst_layer, &max_dst_layer,
                             &dst_mirror_z);
   }

   uint32_t min_src_layer;
   uint32_t max_src_layer;
   bool src_mirror_z = false;
   if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
      min_src_layer = region->srcSubresource.baseArrayLayer;
      max_src_layer = min_src_layer +
                      vk_image_subresource_layer_count(&src->vk,
                                                       &region->srcSubresource);
   } else {
      compute_blit_3d_layers(region->srcOffsets,
                             &min_src_layer, &max_src_layer,
                             &src_mirror_z);
   }

   uint32_t layer_count = max_dst_layer - min_dst_layer;

   /* Translate source blit coordinates to normalized texture coordinates
    * for single-sampled textures. For multisampled textures we require
    * unnormalized coordinates, since we can only do texelFetch on them.
    */
   float coords[4] = {
      (float)src_x,
      (float)src_y,
      (float)(src_x + src_w),
      (float)(src_y + src_h),
   };

   if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) {
      coords[0] /= (float)src_level_w;
      coords[1] /= (float)src_level_h;
      coords[2] /= (float)src_level_w;
      coords[3] /= (float)src_level_h;
   }
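
   /* Example: blitting from the 16x16 region at (8, 8) of a 64x64
    * single-sampled level yields normalized coords (0.125, 0.125) to
    * (0.375, 0.375), while a multisampled source would keep the raw texel
    * coords (8, 8) to (24, 24) for texelFetch.
    */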

   /* Handle mirroring */
   const bool mirror_x = dst_mirror_x != src_mirror_x;
   const bool mirror_y = dst_mirror_y != src_mirror_y;
   const bool mirror_z = dst_mirror_z != src_mirror_z;
   float tex_coords[5] = {
      !mirror_x ? coords[0] : coords[2],
      !mirror_y ? coords[1] : coords[3],
      !mirror_x ? coords[2] : coords[0],
      !mirror_y ? coords[3] : coords[1],
      /* Z coordinate for 3D blit sources, to be filled for each
       * destination layer
       */
      0.0f
   };

   /* For blits from 3D images we also need to compute the slice coordinate
    * to sample from, which will change for each layer in the destination.
    * Compute the step we should advance by on each iteration.
    */
   const float src_z_step =
      (float)(max_src_layer - min_src_layer) / (float)layer_count;
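
   /* Example: a 32-slice 3D source blitted to 8 destination layers gives
    * src_z_step = 4.0; layer i then samples at slice min_src_layer +
    * (i + 0.5) * 4, i.e. the middle of each 4-slice band (see where
    * tex_coords[4] is filled in inside the per-layer loop below).
    */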

   /* Get the blit pipeline */
   struct v3dv_meta_blit_pipeline *pipeline = NULL;
   bool ok = get_blit_pipeline(cmd_buffer,
                               dst_format, src_format, cmask, src->vk.image_type,
                               dst->vk.samples, src->vk.samples,
                               &pipeline);
   if (!ok)
      return handled;
   assert(pipeline && pipeline->pipeline &&
          pipeline->pass && pipeline->pass_no_load);

   struct v3dv_device *device = cmd_buffer->device;
   assert(device->meta.blit.ds_layout);

   VkDevice _device = v3dv_device_to_handle(device);
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   /* Create sampler for blit source image */
   VkSamplerCreateInfo sampler_info = {
      .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
      .magFilter = filter,
      .minFilter = filter,
      .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
      .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
      .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
      .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
   };
   VkSampler sampler;
   result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
                               &sampler);
   if (result != VK_SUCCESS)
      goto fail;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)sampler,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);

   /* Push command buffer state before starting meta operation */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);

   /* Push state that is common for all layers */
   v3dv_CmdBindPipeline(_cmd_buffer,
                        VK_PIPELINE_BIND_POINT_GRAPHICS,
                        pipeline->pipeline);

   const VkViewport viewport = {
      .x = dst_x,
      .y = dst_y,
      .width = dst_w,
      .height = dst_h,
      .minDepth = 0.0f,
      .maxDepth = 1.0f
   };
   v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);

   const VkRect2D scissor = {
      .offset = { dst_x, dst_y },
      .extent = { dst_w, dst_h }
   };
   v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);

   bool can_skip_tlb_load = false;
   const VkRect2D render_area = {
      .offset = { dst_x, dst_y },
      .extent = { dst_w, dst_h },
   };

   /* Record per-layer commands */
   for (uint32_t i = 0; i < layer_count; i++) {
      /* Setup framebuffer */
      VkImageViewCreateInfo dst_image_view_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
         .image = v3dv_image_to_handle(dst),
         .viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
         .format = dst_format,
         .subresourceRange = {
            .aspectMask = region->dstSubresource.aspectMask,
            .baseMipLevel = region->dstSubresource.mipLevel,
            .levelCount = 1,
            .baseArrayLayer = min_dst_layer + i,
            .layerCount = 1
         },
      };
      VkImageView dst_image_view;
      result = v3dv_create_image_view(device, &dst_image_view_info,
                                      &dst_image_view);
      if (result != VK_SUCCESS)
         goto fail;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)dst_image_view,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);

      VkFramebufferCreateInfo fb_info = {
         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
         .renderPass = pipeline->pass,
         .attachmentCount = 1,
         .pAttachments = &dst_image_view,
         .width = dst_x + dst_w,
         .height = dst_y + dst_h,
         .layers = 1,
      };

      VkFramebuffer fb;
      result = v3dv_CreateFramebuffer(_device, &fb_info,
                                      &cmd_buffer->device->vk.alloc, &fb);
      if (result != VK_SUCCESS)
         goto fail;

      struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
      framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
                                      fb_info.height == dst_level_h &&
                                      dst_is_padded_image;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)fb,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);

      /* Setup descriptor set for blit source texture. We don't have to
       * register the descriptor as a private command buffer object since
       * all descriptors will be freed automatically with the descriptor
       * pool.
       */
      VkDescriptorSet set;
      result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
      if (result != VK_SUCCESS)
         goto fail;

      VkImageViewCreateInfo src_image_view_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
         .image = v3dv_image_to_handle(src),
         .viewType = v3dv_image_type_to_view_type(src->vk.image_type),
         .format = src_format,
         .components = *cswizzle,
         .subresourceRange = {
            .aspectMask = region->srcSubresource.aspectMask,
            .baseMipLevel = region->srcSubresource.mipLevel,
            .levelCount = 1,
            .baseArrayLayer =
               src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
            .layerCount = 1
         },
      };
      VkImageView src_image_view;
      result = v3dv_create_image_view(device, &src_image_view_info,
                                      &src_image_view);
      if (result != VK_SUCCESS)
         goto fail;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)src_image_view,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);

      VkDescriptorImageInfo image_info = {
         .sampler = sampler,
         .imageView = src_image_view,
         .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      };
      VkWriteDescriptorSet write = {
         .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
         .dstSet = set,
         .dstBinding = 0,
         .dstArrayElement = 0,
         .descriptorCount = 1,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .pImageInfo = &image_info,
      };
      v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);

      v3dv_CmdBindDescriptorSets(_cmd_buffer,
                                 VK_PIPELINE_BIND_POINT_GRAPHICS,
                                 device->meta.blit.p_layout,
                                 0, 1, &set,
                                 0, NULL);

      /* If the region we are about to blit is tile-aligned, then we can
       * use the render pass version that won't pre-load the tile buffer
       * with the dst image contents before the blit. The exception is when
       * we don't have a full color mask, since in that case we need to
       * preserve the original value of some of the color components.
       *
       * Since all layers have the same area, we only need to compute this
       * for the first.
       */
      if (i == 0) {
         struct v3dv_render_pass *pipeline_pass =
            v3dv_render_pass_from_handle(pipeline->pass);
         can_skip_tlb_load =
            cmask == full_cmask &&
            v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
                                              framebuffer, pipeline_pass, 0);
      }

      /* Record blit */
      VkRenderPassBeginInfo rp_info = {
         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
                                           pipeline->pass,
         .framebuffer = fb,
         .renderArea = render_area,
         .clearValueCount = 0,
      };

      VkSubpassBeginInfo sp_info = {
         .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
         .contents = VK_SUBPASS_CONTENTS_INLINE,
      };

      v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
      struct v3dv_job *job = cmd_buffer->state.job;
      if (!job)
         goto fail;

      /* For 3D blits we need to compute the source slice to blit from (the
       * Z coordinate of the source sample operation). We want to choose
       * this based on the ratio of the depths of the source and destination
       * images, picking the coordinate in the middle of each step.
       */
      if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
         tex_coords[4] =
            !mirror_z ?
            (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
            (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
      }

      v3dv_CmdPushConstants(_cmd_buffer,
                            device->meta.blit.p_layout,
                            VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
                            &tex_coords);

      v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);

      VkSubpassEndInfo sp_end_info = {
         .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
      };

      v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
   }

fail:
   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);

   return handled;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer,
                   const VkBlitImageInfo2 *pBlitImageInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);

   /* From vkCmdBlitImage:
    *   "srcImage must not use a format that requires a sampler YCBCR
    *    conversion"
    *   "dstImage must not use a format that requires a sampler YCBCR
    *    conversion"
    */
   assert(src->plane_count == 1);
   assert(dst->plane_count == 1);

   /* This command can only happen outside a render pass */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
   assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
          src->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
   assert(!vk_format_is_compressed(dst->vk.format));

   cmd_buffer->state.is_transfer = true;

   /* Try the TFU hardware path first and fall back to the shader-based
    * blit for anything the TFU cannot handle.
    */
   for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
      const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i];

      if (blit_tfu(cmd_buffer, dst, src, region))
         continue;
      if (blit_shader(cmd_buffer,
                      dst, dst->vk.format,
                      src, src->vk.format,
                      0, NULL,
                      region,
                      pBlitImageInfo->filter, true)) {
         continue;
      }
      unreachable("Unsupported blit operation");
   }

   cmd_buffer->state.is_transfer = false;
}

static bool
resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                  struct v3dv_image *dst,
                  struct v3dv_image *src,
                  const VkImageResolve2 *region)
{
   /* No resolve for multi-planar images. Using plane 0 */
   assert(dst->plane_count == 1);
   assert(src->plane_count == 1);

   if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel,
                              &region->srcOffset, NULL, NULL) ||
       !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel,
                              &region->dstOffset, &region->extent, NULL)) {
      return false;
   }

   if (!v3d_X((&cmd_buffer->device->devinfo), format_supports_tlb_resolve)(src->format))
      return false;

   const VkFormat fb_format = src->vk.format;

   uint32_t num_layers;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&dst->vk,
                                                    &region->dstSubresource);
   } else {
      num_layers = region->extent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   const uint32_t block_w =
      vk_format_get_blockwidth(dst->planes[0].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(dst->planes[0].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   uint32_t internal_type, internal_bpp;
   v3d_X((&cmd_buffer->device->devinfo), get_internal_type_bpp_for_image_aspects)
      (fb_format, region->srcSubresource.aspectMask,
       &internal_type, &internal_bpp);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        true);

   struct v3dv_meta_framebuffer framebuffer;
   v3d_X((&job->device->devinfo), meta_framebuffer_init)(&framebuffer, fb_format,
                                                         internal_type,
                                                         &job->frame_tiling);

   v3d_X((&job->device->devinfo), job_emit_binning_flush)(job);
   v3d_X((&job->device->devinfo), meta_emit_resolve_image_rcl)(job, dst, src,
                                                               &framebuffer,
                                                               region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);
   return true;
}

static bool
resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
                   struct v3dv_image *dst,
                   struct v3dv_image *src,
                   const VkImageResolve2 *region)
{
   const VkImageBlit2 blit_region = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = region->srcSubresource,
      .srcOffsets = {
         region->srcOffset,
         {
            region->srcOffset.x + region->extent.width,
            region->srcOffset.y + region->extent.height,
         }
      },
      .dstSubresource = region->dstSubresource,
      .dstOffsets = {
         region->dstOffset,
         {
            region->dstOffset.x + region->extent.width,
            region->dstOffset.y + region->extent.height,
         }
      },
   };
   return blit_shader(cmd_buffer,
                      dst, dst->vk.format,
                      src, src->vk.format,
                      0, NULL,
                      &blit_region, VK_FILTER_NEAREST, true);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer,
                      const VkResolveImageInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);

   /* This command can only happen outside a render pass */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
   assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   /* We don't support multi-sampled multi-plane images */
   assert(src->plane_count == 1);
   assert(dst->plane_count == 1);

   cmd_buffer->state.is_transfer = true;

   /* Prefer the direct TLB resolve and fall back to a shader-based blit
    * when the TLB path can't handle the region.
    */
   for (uint32_t i = 0; i < info->regionCount; i++) {
      if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
         continue;
      if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
         continue;
      unreachable("Unsupported multisample resolve operation");
   }

   cmd_buffer->state.is_transfer = false;
}