/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "compiler/nir/nir_builder.h"
#include "util/u_pack_color.h"
#include "vulkan/runtime/vk_common_entrypoints.h"

static uint32_t
meta_blit_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
}

static bool
meta_blit_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
}

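/* Creates, if not created yet, the descriptor set layout (a single combined
 * image sampler visible to the fragment stage) and the pipeline layout shared
 * by all the meta blit pipelines, which also carries a small vertex-stage
 * push constant range.
 */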
static bool
create_blit_pipeline_layout(struct v3dv_device *device,
                            VkDescriptorSetLayout *descriptor_set_layout,
                            VkPipelineLayout *pipeline_layout)
{
   VkResult result;

   if (*descriptor_set_layout == 0) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*pipeline_layout == 0);
   VkPipelineLayoutCreateInfo pipeline_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = descriptor_set_layout,
      .pushConstantRangeCount = 1,
      .pPushConstantRanges =
         &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &pipeline_layout_info,
                                &device->vk.alloc,
                                pipeline_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_blit_init(struct v3dv_device *device)
{
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.blit.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_blit_key_hash,
                                 meta_blit_key_compare);
   }

   create_blit_pipeline_layout(device,
                               &device->meta.blit.ds_layout,
                               &device->meta.blit.p_layout);
}

void
v3dv_meta_blit_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.blit.cache[i], entry) {
         struct v3dv_meta_blit_pipeline *item = entry->data;
         v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
         v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
         vk_free(&device->vk.alloc, item);
      }
      _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
   }

   if (device->meta.blit.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.blit.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
                                      &device->vk.alloc);
   }
}

static uint32_t
meta_texel_buffer_copy_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
}

static bool
meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
}

static bool
create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
                                         VkDescriptorSetLayout *ds_layout,
                                         VkPipelineLayout *p_layout)
{
   VkResult result;

   if (*ds_layout == 0) {
      VkDescriptorSetLayoutBinding ds_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &ds_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &ds_layout_info,
                                        &device->vk.alloc,
                                        ds_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*p_layout == 0);
   /* FIXME: this is abusing the API a bit, since not all of our copy
    * pipelines have a geometry shader. We could create 2 different pipeline
    * layouts, but this works for us for now.
    */
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
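   /* Two push constant ranges matching the offsets above: the fragment range
    * covers the box, stride and offset data (24 bytes starting at offset 0),
    * and the geometry range holds only the layer index (4 bytes at offset 24).
    */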
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
   };

   VkPipelineLayoutCreateInfo p_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = ds_layout,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges,
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &p_layout_info,
                                &device->vk.alloc,
                                p_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
{
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.texel_buffer_copy.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_texel_buffer_copy_key_hash,
                                 meta_texel_buffer_copy_key_compare);
   }

   create_texel_buffer_copy_pipeline_layout(
      device,
      &device->meta.texel_buffer_copy.ds_layout,
      &device->meta.texel_buffer_copy.p_layout);
}

void
v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
         struct v3dv_meta_texel_buffer_copy_pipeline *item = entry->data;
         v3dv_DestroyPipeline(_device, item->pipeline, &device->vk.alloc);
         v3dv_DestroyRenderPass(_device, item->pass, &device->vk.alloc);
         v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->vk.alloc);
         vk_free(&device->vk.alloc, item);
      }
      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
   }

   if (device->meta.texel_buffer_copy.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.texel_buffer_copy.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
                                      &device->vk.alloc);
   }
}

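/* Returns a TLB-renderable format with the same per-texel (or per-block, for
 * compressed formats) size as the given format, or VK_FORMAT_UNDEFINED if
 * there is none. For example, VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK uses
 * 128-bit 4x4 blocks, so each block can be copied as a single
 * VK_FORMAT_R32G32B32A32_UINT texel.
 */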
static VkFormat
get_compatible_tlb_format(VkFormat format)
{
   switch (format) {
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UINT;

   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UINT;

   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UINT;

   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UINT_PACK32;

   case VK_FORMAT_R16_UNORM:
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UINT;

   case VK_FORMAT_R16G16_UNORM:
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UINT;

   case VK_FORMAT_R16G16B16A16_UNORM:
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UINT;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_SFLOAT;

   /* We can't render to compressed formats using the TLB so instead we use
    * a compatible format with the same bpp as the compressed format. Because
    * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
    * case of ETC), when we implement copies with the compatible format we
    * will have to divide offsets and dimensions of the compressed image by
    * the compressed block size.
    */
   case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
   case VK_FORMAT_BC2_UNORM_BLOCK:
   case VK_FORMAT_BC2_SRGB_BLOCK:
   case VK_FORMAT_BC3_SRGB_BLOCK:
   case VK_FORMAT_BC3_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
      return VK_FORMAT_R32G32B32A32_UINT;

   case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11_SNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
   case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
      return VK_FORMAT_R16G16B16A16_UINT;

   default:
      return VK_FORMAT_UNDEFINED;
   }
}

/**
 * Checks if we can implement an image copy or clear operation using the TLB
 * hardware.
 */
bool
v3dv_meta_can_use_tlb(struct v3dv_image *image,
                      const VkOffset3D *offset,
                      VkFormat *compat_format)
{
   if (offset->x != 0 || offset->y != 0)
      return false;

   if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
      if (compat_format)
         *compat_format = image->vk.format;
      return true;
   }

   /* If the image format is not TLB-supported, then check if we can use
    * a compatible format instead.
    */
   if (compat_format) {
      *compat_format = get_compatible_tlb_format(image->vk.format);
      if (*compat_format != VK_FORMAT_UNDEFINED)
         return true;
   }

   return false;
}

/* Implements a copy using the TLB.
 *
 * This only works if we are copying from offset (0,0), since a TLB store for
 * tile (x,y) will be written at the same tile offset into the destination.
 * When this requirement is not met, we need to use a blit instead.
 *
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         struct v3dv_image *image,
                         const VkBufferImageCopy2 *region)
{
   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
      return false;

   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->imageSubresource.aspectMask,
       &internal_type, &internal_bpp);

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D)
      num_layers = region->imageSubresource.layerCount;
   else
      num_layers = region->imageExtent.depth;
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   /* Handle copy from compressed format using a compatible format */
   const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
   const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
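   /* For example, a 64x64 copy region of an ETC2 image (4x4 blocks) becomes
    * a 16x16 frame in the 128-bit compatible format returned by
    * get_compatible_tlb_format().
    */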

   v3dv_job_start_frame(job, width, height, num_layers, false,
                        1, internal_bpp, false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3dv_X(job->device, job_emit_binning_flush)(job);
   v3dv_X(job->device, meta_emit_copy_image_to_buffer_rcl)
      (job, buffer, image, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit2 *region,
            VkFilter filter,
            bool dst_is_padded_image);

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          const VkBufferImageCopy2 *region)
{
   bool handled = false;

   /* This path uses a shader blit which doesn't support linear images. Return
    * early to avoid all the heavy lifting in preparation for the blit_shader()
    * call that is bound to fail in that scenario.
    */
   if (image->vk.tiling == VK_IMAGE_TILING_LINEAR &&
       image->vk.image_type != VK_IMAGE_TYPE_1D) {
      return handled;
   }

   /* Generally, the bpp of the data in the buffer matches that of the
    * source image. The exception is the case where we are copying
    * stencil (8bpp) to a combined d24s8 image (32bpp).
    */
   uint32_t buffer_bpp = image->cpp;

   VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask;

   /* Because we are going to implement the copy as a blit, we need to create
    * a linear image from the destination buffer and we also want our blit
    * source and destination formats to be the same (to avoid any format
    * conversions), so we choose a canonical format that matches the
    * source image bpp.
    *
    * The exception to the above is copying from combined depth/stencil images
    * because we are copying only one aspect of the image, so we need to setup
    * our formats, color write mask and source swizzle mask to match that.
    */
   VkFormat dst_format;
   VkFormat src_format;
   VkColorComponentFlags cmask = 0; /* All components */
   VkComponentMapping cswizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   switch (buffer_bpp) {
   case 16:
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R32G32B32A32_UINT;
      src_format = dst_format;
      break;
   case 8:
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R16G16B16A16_UINT;
      src_format = dst_format;
      break;
   case 4:
      switch (copy_aspect) {
      case VK_IMAGE_ASPECT_COLOR_BIT:
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      case VK_IMAGE_ASPECT_DEPTH_BIT:
         assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
                image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
                image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
         if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
            src_format = VK_FORMAT_R32_UINT;
            dst_format = VK_FORMAT_R32_UINT;
         } else {
            /* We want to write depth in the buffer in the first 24 bits,
             * however, the hardware has depth in bits 8-31, so swizzle the
             * source components to match what we want. Also, we don't
             * want to write bits 24-31 in the destination.
             */
            src_format = VK_FORMAT_R8G8B8A8_UINT;
            dst_format = VK_FORMAT_R8G8B8A8_UINT;
            cmask = VK_COLOR_COMPONENT_R_BIT |
                    VK_COLOR_COMPONENT_G_BIT |
                    VK_COLOR_COMPONENT_B_BIT;
            cswizzle.r = VK_COMPONENT_SWIZZLE_G;
            cswizzle.g = VK_COMPONENT_SWIZZLE_B;
            cswizzle.b = VK_COMPONENT_SWIZZLE_A;
            cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
         }
         break;
      case VK_IMAGE_ASPECT_STENCIL_BIT:
         assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
         assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
         /* Copying from S8D24. We want to write 8-bit stencil values only,
          * so adjust the buffer bpp for that. Since the hardware stores stencil
          * in the LSB, we can just do a RGBA8UI to R8UI blit.
          */
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8_UINT;
         buffer_bpp = 1;
         break;
      default:
         unreachable("unsupported aspect");
         return handled;
      };
      break;
   case 2:
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
             copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
      dst_format = VK_FORMAT_R16_UINT;
      src_format = dst_format;
      break;
   case 1:
      assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R8_UINT;
      src_format = dst_format;
      break;
   default:
      unreachable("unsupported bit-size");
      return handled;
   };

   /* The hardware doesn't support linear depth/stencil stores, so we
    * implement copies of depth/stencil aspect as color copies using a
    * compatible color format.
    */
   assert(vk_format_is_color(src_format));
   assert(vk_format_is_color(dst_format));
   copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   /* We should be able to handle the blit if we got this far */
   handled = true;

   /* Obtain the 2D buffer region spec */
   uint32_t buf_width, buf_height;
   if (region->bufferRowLength == 0)
      buf_width = region->imageExtent.width;
   else
      buf_width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      buf_height = region->imageExtent.height;
   else
      buf_height = region->bufferImageHeight;

   /* If the image is compressed, the bpp refers to blocks, not pixels */
   uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
   uint32_t block_height = vk_format_get_blockheight(image->vk.format);
   buf_width = buf_width / block_width;
   buf_height = buf_height / block_height;
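   /* For instance, a 16x16 region of an ETC2 image (4x4 blocks) with
    * bufferRowLength and bufferImageHeight of 0 yields a 4x4 buffer region
    * in block units.
    */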

   /* Compute layers to copy */
   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D)
      num_layers = region->imageSubresource.layerCount;
   else
      num_layers = region->imageExtent.depth;
   assert(num_layers > 0);

   /* Our blit interface can see the real format of the images to detect
    * copies between compressed and uncompressed images and adapt the
    * blit region accordingly. Here we are just doing a raw copy of
    * compressed data, but we are passing an uncompressed view of the
    * buffer for the blit destination image (since compressed formats are
    * not renderable), so we also want to provide an uncompressed view of
    * the source image.
    */
   VkResult result;
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);
   if (vk_format_is_compressed(image->vk.format)) {
      VkImage uiview;
      VkImageCreateInfo uiview_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
         .imageType = VK_IMAGE_TYPE_3D,
         .format = dst_format,
         .extent = { buf_width, buf_height, image->vk.extent.depth },
         .mipLevels = image->vk.mip_levels,
         .arrayLayers = image->vk.array_layers,
         .samples = image->vk.samples,
         .tiling = image->vk.tiling,
         .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
         .queueFamilyIndexCount = 0,
         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
      };
      result = v3dv_CreateImage(_device, &uiview_info, &device->vk.alloc, &uiview);
      if (result != VK_SUCCESS)
         return handled;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)uiview,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

      result =
         vk_common_BindImageMemory(_device, uiview,
                                   v3dv_device_memory_to_handle(image->mem),
                                   image->mem_offset);
      if (result != VK_SUCCESS)
         return handled;

      image = v3dv_image_from_handle(uiview);
   }

   /* Copy requested layers */
   for (uint32_t i = 0; i < num_layers; i++) {
      /* Create the destination blit image from the destination buffer */
      VkImageCreateInfo image_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
         .imageType = VK_IMAGE_TYPE_2D,
         .format = dst_format,
         .extent = { buf_width, buf_height, 1 },
         .mipLevels = 1,
         .arrayLayers = 1,
         .samples = VK_SAMPLE_COUNT_1_BIT,
         .tiling = VK_IMAGE_TILING_LINEAR,
         .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
         .queueFamilyIndexCount = 0,
         .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
      };

      VkImage buffer_image;
      result =
         v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)buffer_image,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

      /* Bind the buffer memory to the image */
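      /* Each layer in the buffer is assumed to start at a multiple of the
       * tightly-packed layer size (buf_width * buf_height * buffer_bpp bytes).
       */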
      VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
         i * buf_width * buf_height * buffer_bpp;
      result =
         vk_common_BindImageMemory(_device, buffer_image,
                                   v3dv_device_memory_to_handle(buffer->mem),
                                   buffer_offset);
      if (result != VK_SUCCESS)
         return handled;

      /* Blit-copy the requested image extent.
       *
       * Since we are copying, the blit must use the same format on the
       * destination and source images to avoid format conversions. The
       * only exception is copying stencil, which we upload to a R8UI source
       * image, but that we need to blit to a S8D24 destination (the only
       * stencil format we support).
       */
      const VkImageBlit2 blit_region = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
         .srcSubresource = {
            .aspectMask = copy_aspect,
            .mipLevel = region->imageSubresource.mipLevel,
            .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
            .layerCount = 1,
         },
         .srcOffsets = {
            {
               DIV_ROUND_UP(region->imageOffset.x, block_width),
               DIV_ROUND_UP(region->imageOffset.y, block_height),
               region->imageOffset.z + i,
            },
            {
               DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
                            block_width),
               DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
                            block_height),
               region->imageOffset.z + i + 1,
            },
         },
         .dstSubresource = {
            .aspectMask = copy_aspect,
            .mipLevel = 0,
            .baseArrayLayer = 0,
            .layerCount = 1,
         },
         .dstOffsets = {
            { 0, 0, 0 },
            {
               DIV_ROUND_UP(region->imageExtent.width, block_width),
               DIV_ROUND_UP(region->imageExtent.height, block_height),
               1
            },
         },
      };

      handled = blit_shader(cmd_buffer,
                            v3dv_image_from_handle(buffer_image), dst_format,
                            image, src_format,
                            cmask, &cswizzle,
                            &blit_region, VK_FILTER_NEAREST, false);
      if (!handled) {
         /* This is unexpected, we should have a supported blit spec */
         unreachable("Unable to blit buffer to destination image");
         return false;
      }
   }

   assert(handled);
   return true;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer,
                              const VkCopyImageToBufferInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);

   assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   cmd_buffer->state.is_transfer = true;

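   /* Try the TLB path first; it only handles copies from offset (0,0) with
    * TLB-compatible formats, so fall back to the slower shader blit path
    * when it declines the region.
    */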
   for (uint32_t i = 0; i < info->regionCount; i++) {
      if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i]))
         continue;
      if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &info->pRegions[i]))
         continue;
      unreachable("Unsupported image to buffer copy.");
   }

   cmd_buffer->state.is_transfer = false;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   /* Destination can't be raster format */
   if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
      return false;

   /* We can only do full copies, so if the format is D24S8 both aspects need
    * to be copied. We only need to check the dst format because the spec
    * states that depth/stencil formats must match exactly.
    */
   if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
       if (region->dstSubresource.aspectMask != ds_aspects)
         return false;
   }

   /* Don't handle copies between uncompressed and compressed formats for now.
    *
    * FIXME: we should be able to handle these easily but there is no coverage
    * in CTS at the moment that makes such copies with full images (which we
    * require here), only partial copies. Also, in that case the code below that
    * checks for "dst image complete" requires some changes, since it is
    * checking against the region dimensions, which are in units of the source
    * image format.
    */
   if (vk_format_is_compressed(dst->vk.format) !=
       vk_format_is_compressed(src->vk.format)) {
      return false;
   }

   /* Source region must start at (0,0) */
   if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
      return false;

   /* Destination image must be complete */
   if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
      return false;

   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
   uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
   uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
   if (region->extent.width != dst_width || region->extent.height != dst_height)
      return false;

   /* From vkCmdCopyImage:
    *
    *   "When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination."
    */
   const uint32_t block_w = vk_format_get_blockwidth(src->vk.format);
   const uint32_t block_h = vk_format_get_blockheight(src->vk.format);
   uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   /* Account for sample count */
   assert(dst->vk.samples == src->vk.samples);
   if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) {
      assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT);
      width *= 2;
      height *= 2;
   }
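   /* The doubling above suggests that 4x multisampled surfaces are laid out
    * at twice the width and height of the logical image (assumption based on
    * this adjustment, not verified against the hardware documentation).
    */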

   /* The TFU unit doesn't handle format conversions so we need the formats to
    * match. On the other hand, vkCmdCopyImage allows different color formats
    * on the source and destination images, but only if they are texel
    * compatible. For us, this means that we can effectively ignore different
    * formats and just make the copy using either of them, since we are just
    * moving raw data and not making any conversions.
    *
    * Also, the formats supported by the TFU unit are limited, but again, since
    * we are only doing raw copies here without interpreting or converting
    * the underlying pixel data according to its format, we can always choose
    * to use compatible formats that are supported with the TFU unit.
    */
   assert(dst->cpp == src->cpp);
   const struct v3dv_format *format =
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
                                     dst->cpp, NULL);

   /* Emit a TFU job for each layer to blit */
   const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.layerCount :
      region->extent.depth;
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;

   const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer : region->srcOffset.z;
   const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer : region->dstOffset.z;
   for (uint32_t i = 0; i < layer_count; i++) {
      const uint32_t dst_offset =
         dst->mem->bo->offset +
         v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i);
      const uint32_t src_offset =
         src->mem->bo->offset +
         v3dv_layer_offset(src, src_mip_level, base_src_layer + i);

      const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];
      const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];

      v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
         cmd_buffer,
         dst->mem->bo->handle,
         dst_offset,
         dst_slice->tiling,
         dst_slice->padded_height,
         dst->cpp,
         src->mem->bo->handle,
         src_offset,
         src_slice->tiling,
         src_slice->tiling == V3D_TILING_RASTER ?
                              src_slice->stride : src_slice->padded_height,
         src->cpp,
         width, height, format);
   }

   return true;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, &fb_format) ||
       !v3dv_meta_can_use_tlb(dst, &region->dstOffset, &fb_format)) {
      return false;
   }

   /* From the Vulkan spec, VkImageCopy valid usage:
    *
    *    "If neither the calling command’s srcImage nor the calling command’s
    *     dstImage has a multi-planar image format then the aspectMask member
    *     of srcSubresource and dstSubresource must match."
    */
   assert(region->dstSubresource.aspectMask ==
          region->srcSubresource.aspectMask);
   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->dstSubresource.aspectMask,
       &internal_type, &internal_bpp);

   /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
    *
    * "The number of slices of the extent (for 3D) or layers of the
    *  srcSubresource (for non-3D) must match the number of slices of the
    *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
    */
   assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
           region->srcSubresource.layerCount : region->extent.depth) ==
          (dst->vk.image_type != VK_IMAGE_TYPE_3D ?
           region->dstSubresource.layerCount : region->extent.depth));
   uint32_t num_layers;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D)
      num_layers = region->dstSubresource.layerCount;
   else
      num_layers = region->extent.depth;
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   /* Handle copy to compressed image using compatible format */
   const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
   const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp,
                        src->vk.samples > VK_SAMPLE_COUNT_1_BIT);

   struct v3dv_meta_framebuffer framebuffer;
   v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3dv_X(job->device, job_emit_binning_flush)(job);
   v3dv_X(job->device, meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

/**
 * Takes the image provided as argument and creates a new image that has
 * the same specification and aliases the same memory storage, except that:
 *
 *   - It has the uncompressed format passed in.
 *   - Its original width/height are scaled by the factors passed in.
 *
 * This is useful to implement copies from compressed images using the blit
 * path. The idea is that we create uncompressed "image views" of both the
 * source and destination images using the uncompressed format and then we
 * define the copy blit in terms of that format.
 */
static struct v3dv_image *
create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
                   struct v3dv_image *src,
                   float width_scale,
                   float height_scale,
                   VkFormat format)
{
   assert(!vk_format_is_compressed(format));

   VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);

   VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = src->vk.image_type,
      .format = format,
      .extent = {
         .width = src->vk.extent.width * width_scale,
         .height = src->vk.extent.height * height_scale,
         .depth = src->vk.extent.depth,
      },
      .mipLevels = src->vk.mip_levels,
      .arrayLayers = src->vk.array_layers,
      .samples = src->vk.samples,
      .tiling = src->vk.tiling,
      .usage = src->vk.usage,
   };

   VkImage _image;
   VkResult result =
      v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
   if (result != VK_SUCCESS) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   struct v3dv_image *image = v3dv_image_from_handle(_image);
   image->mem = src->mem;
   image->mem_offset = src->mem_offset;
   return image;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_image *dst,
                struct v3dv_image *src,
                const VkImageCopy2 *region)
{
   const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
   const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
   const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
   const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
   const float block_scale_w = (float)src_block_w / (float)dst_block_w;
   const float block_scale_h = (float)src_block_h / (float)dst_block_h;

   /* We need to choose a single format for the blit to ensure that this is
    * really a copy and there are no format conversions going on. Since we
    * are going to blit, we need to make sure that the selected format can be
    * both rendered to and textured from.
    */
   VkFormat format;
   float src_scale_w = 1.0f;
   float src_scale_h = 1.0f;
   float dst_scale_w = block_scale_w;
   float dst_scale_h = block_scale_h;
   if (vk_format_is_compressed(src->vk.format)) {
      /* If we are copying from a compressed format we should be aware that we
       * are going to texture from the source image, and the texture setup
       * knows the actual size of the image, so we need to choose a format
       * that has a per-texel (not per-block) bpp that is compatible for that
       * image size. For example, for a source image with size Bw*WxBh*H
       * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
       * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
       * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
       * so we could specify a blit with size Bw*WxBh*H and a format with
       * a bpp of 8-bit per texel (R8_UINT).
       *
       * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
       * which is 64-bit per block, then we would need a 4-bit format, which
       * we don't have, so instead we still choose an 8-bit format, but we
       * apply a divisor to the row dimensions of the blit, since we are
       * copying two texels per item.
       *
       * Generally, we can choose any format so long as we compute appropriate
       * divisors for the width and height depending on the source image's
       * bpp.
       */
      assert(src->cpp == dst->cpp);

      format = VK_FORMAT_R32G32_UINT;
      switch (src->cpp) {
      case 16:
         format = VK_FORMAT_R32G32B32A32_UINT;
         break;
      case 8:
         format = VK_FORMAT_R16G16B16A16_UINT;
         break;
      default:
         unreachable("Unsupported compressed format");
      }

      /* Create image views of the src/dst images that we can interpret in
       * terms of the canonical format.
       */
      src_scale_w /= src_block_w;
      src_scale_h /= src_block_h;
      dst_scale_w /= src_block_w;
      dst_scale_h /= src_block_h;

      src = create_image_alias(cmd_buffer, src,
                               src_scale_w, src_scale_h, format);

      dst = create_image_alias(cmd_buffer, dst,
                               dst_scale_w, dst_scale_h, format);
   } else {
      format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
         src->vk.format : get_compatible_tlb_format(src->vk.format);
      if (format == VK_FORMAT_UNDEFINED)
         return false;

      const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
      if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO)
         return false;
   }

   /* Given an uncompressed image with size WxH, if we copy it to a compressed
    * image, it will result in an image with size W*bWxH*bH, where bW and bH
    * are the compressed format's block width and height. This means that
    * copies between compressed and uncompressed images involve different
    * image sizes, and therefore, we need to take that into account when
    * setting up the source and destination blit regions below, so they are
    * consistent from the point of view of the single compatible format
    * selected for the copy.
    *
    * We should take into account that the dimensions of the region provided
    * to the copy command are specified in terms of the source image. With that
    * in mind, below we adjust the blit destination region to be consistent with
    * the source region for the compatible format, so basically, we apply
    * the block scale factor to the destination offset provided by the copy
    * command (because it is specified in terms of the destination image, not
    * the source), and then we just add the region copy dimensions to that
    * (since the region dimensions are already specified in terms of the source
    * image).
    */
   const VkOffset3D src_start = {
      region->srcOffset.x * src_scale_w,
      region->srcOffset.y * src_scale_h,
      region->srcOffset.z,
   };
   const VkOffset3D src_end = {
      src_start.x + region->extent.width * src_scale_w,
      src_start.y + region->extent.height * src_scale_h,
      src_start.z + region->extent.depth,
   };

   const VkOffset3D dst_start = {
      region->dstOffset.x * dst_scale_w,
      region->dstOffset.y * dst_scale_h,
      region->dstOffset.z,
   };
   const VkOffset3D dst_end = {
      dst_start.x + region->extent.width * src_scale_w,
      dst_start.y + region->extent.height * src_scale_h,
      dst_start.z + region->extent.depth,
   };

   const VkImageBlit2 blit_region = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = region->srcSubresource,
      .srcOffsets = { src_start, src_end },
      .dstSubresource = region->dstSubresource,
      .dstOffsets = { dst_start, dst_end },
   };
   bool handled = blit_shader(cmd_buffer,
                              dst, format,
                              src, format,
                              0, NULL,
                              &blit_region, VK_FILTER_NEAREST, true);

   /* We should have selected formats that we can blit */
   assert(handled);
   return handled;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer,
                      const VkCopyImageInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);

   assert(src->vk.samples == dst->vk.samples);

   cmd_buffer->state.is_transfer = true;

   for (uint32_t i = 0; i < info->regionCount; i++) {
      if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i]))
         continue;
      if (copy_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
         continue;
      if (copy_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
         continue;
      unreachable("Image copy not supported");
   }

   cmd_buffer->state.is_transfer = false;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer,
                       const VkCopyBufferInfo2 *pCopyBufferInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);

   cmd_buffer->state.is_transfer = true;

   for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
      v3dv_X(cmd_buffer->device, meta_copy_buffer)
         (cmd_buffer,
          dst_buffer->mem->bo, dst_buffer->mem_offset,
          src_buffer->mem->bo, src_buffer->mem_offset,
          &pCopyBufferInfo->pRegions[i]);
   }

   cmd_buffer->state.is_transfer = false;
}

static void
destroy_update_buffer_cb(VkDevice _device,
                         uint64_t pobj,
                         VkAllocationCallbacks *alloc)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
   v3dv_bo_free(device, bo);
}

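/* Implements vkCmdUpdateBuffer by staging the data in a temporary BO and
 * emitting a copy job to the destination buffer; the staging BO is added as
 * a private object so it is freed along with the command buffer.
 */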
VKAPI_ATTR void VKAPI_CALL
v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
                     VkBuffer dstBuffer,
                     VkDeviceSize dstOffset,
                     VkDeviceSize dataSize,
                     const void *pData)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);

   struct v3dv_bo *src_bo =
      v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
   if (!src_bo) {
      fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
      return;
   }

   bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
   if (!ok) {
      fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
      /* Don't leak the staging BO if we can't map it */
      v3dv_bo_free(cmd_buffer->device, src_bo);
      return;
   }

   cmd_buffer->state.is_transfer = true;

   memcpy(src_bo->map, pData, dataSize);

   v3dv_bo_unmap(cmd_buffer->device, src_bo);

   VkBufferCopy2 region = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
      .srcOffset = 0,
      .dstOffset = dstOffset,
      .size = dataSize,
   };
   struct v3dv_job *copy_job =
      v3dv_X(cmd_buffer->device, meta_copy_buffer)
      (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
       src_bo, 0, &region);

   if (copy_job) {
      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
   }

   cmd_buffer->state.is_transfer = false;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
                   VkBuffer dstBuffer,
                   VkDeviceSize dstOffset,
                   VkDeviceSize size,
                   uint32_t data)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);

   cmd_buffer->state.is_transfer = true;

   struct v3dv_bo *bo = dst_buffer->mem->bo;

   /* From the Vulkan spec:
    *
    *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
    *    a multiple of 4, then the nearest smaller multiple is used."
    */
   if (size == VK_WHOLE_SIZE) {
      size = dst_buffer->size - dstOffset;
      size -= size % 4;
   }

   v3dv_X(cmd_buffer->device, meta_fill_buffer)
      (cmd_buffer, bo, dstOffset, size, data);

   cmd_buffer->state.is_transfer = false;
}
1299 
1300 /**
1301  * Returns true if the implementation supports the requested operation (even if
1302  * it failed to process it, for example, due to an out-of-memory error).
1303  */
1304 static bool
copy_buffer_to_image_tfu(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_image * image,struct v3dv_buffer * buffer,const VkBufferImageCopy2 * region)1305 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
1306                          struct v3dv_image *image,
1307                          struct v3dv_buffer *buffer,
1308                          const VkBufferImageCopy2 *region)
1309 {
1310    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
1311 
1312    /* Destination can't be raster format */
1313    if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
1314       return false;
1315 
1316    /* We can't copy D24S8 because buffer to image copies only copy one aspect
1317     * at a time, and the TFU copies full images. Also, V3D stores the depth
1318     * bits of both D24S8 and D24X8 in the 24 MSBs of each 32-bit word, but
1319     * the Vulkan spec specifies the buffer data the other way around, so it
1320     * is not a straight copy: we would have to swizzle the channels, which the
1321     * TFU can't do.
1322     */
1323    if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
1324        image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1325          return false;
1326    }
1327 
1328    /* Region must include full slice */
1329    const uint32_t offset_x = region->imageOffset.x;
1330    const uint32_t offset_y = region->imageOffset.y;
1331    if (offset_x != 0 || offset_y != 0)
1332       return false;
1333 
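   /* Per the Vulkan spec, a bufferRowLength or bufferImageHeight of 0 means
    * the buffer data is tightly packed according to imageExtent.
    */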
1334    uint32_t width, height;
1335    if (region->bufferRowLength == 0)
1336       width = region->imageExtent.width;
1337    else
1338       width = region->bufferRowLength;
1339 
1340    if (region->bufferImageHeight == 0)
1341       height = region->imageExtent.height;
1342    else
1343       height = region->bufferImageHeight;
1344 
1345    if (width != image->vk.extent.width || height != image->vk.extent.height)
1346       return false;
1347 
1348    /* Handle region semantics for compressed images */
1349    const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
1350    const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
1351    width = DIV_ROUND_UP(width, block_w);
1352    height = DIV_ROUND_UP(height, block_h);
1353 
1354    /* Format must be supported for texturing via the TFU. Since we are just
1355     * copying raw data and not converting between pixel formats, we can ignore
1356     * the image's format and choose a compatible TFU format for the image
1357     * texel size instead, which expands the list of formats we can handle here.
1358     */
1359    const struct v3dv_format *format =
1360       v3dv_get_compatible_tfu_format(cmd_buffer->device,
1361                                      image->cpp, NULL);
1362 
1363    const uint32_t mip_level = region->imageSubresource.mipLevel;
1364    const struct v3d_resource_slice *slice = &image->slices[mip_level];
1365 
1366    uint32_t num_layers;
1367    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1368       num_layers = region->imageSubresource.layerCount;
1369    else
1370       num_layers = region->imageExtent.depth;
1371    assert(num_layers > 0);
1372 
1373    assert(image->mem && image->mem->bo);
1374    const struct v3dv_bo *dst_bo = image->mem->bo;
1375 
1376    assert(buffer->mem && buffer->mem->bo);
1377    const struct v3dv_bo *src_bo = buffer->mem->bo;
1378 
1379    /* Emit a TFU job per layer to copy */
1380    const uint32_t buffer_stride = width * image->cpp;
1381    for (int i = 0; i < num_layers; i++) {
1382       uint32_t layer;
1383       if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1384          layer = region->imageSubresource.baseArrayLayer + i;
1385       else
1386          layer = region->imageOffset.z + i;
1387 
1388       const uint32_t buffer_offset =
1389          buffer->mem_offset + region->bufferOffset +
1390          height * buffer_stride * i;
1391       const uint32_t src_offset = src_bo->offset + buffer_offset;
1392 
1393       const uint32_t dst_offset =
1394          dst_bo->offset + v3dv_layer_offset(image, mip_level, layer);
1395 
1396       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
1397              cmd_buffer,
1398              dst_bo->handle,
1399              dst_offset,
1400              slice->tiling,
1401              slice->padded_height,
1402              image->cpp,
1403              src_bo->handle,
1404              src_offset,
1405              V3D_TILING_RASTER,
1406              width,
1407              1,
1408              width, height, format);
1409    }
1410 
1411    return true;
1412 }
1413 
1414 /**
1415  * Returns true if the implementation supports the requested operation (even if
1416  * it failed to process it, for example, due to an out-of-memory error).
1417  */
1418 static bool
1419 copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1420                          struct v3dv_image *image,
1421                          struct v3dv_buffer *buffer,
1422                          const VkBufferImageCopy2 *region)
1423 {
1424    VkFormat fb_format;
1425    if (!v3dv_meta_can_use_tlb(image, &region->imageOffset, &fb_format))
1426       return false;
1427 
1428    uint32_t internal_type, internal_bpp;
1429    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
1430       (fb_format, region->imageSubresource.aspectMask,
1431        &internal_type, &internal_bpp);
1432 
1433    uint32_t num_layers;
1434    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1435       num_layers = region->imageSubresource.layerCount;
1436    else
1437       num_layers = region->imageExtent.depth;
1438    assert(num_layers > 0);
1439 
1440    struct v3dv_job *job =
1441       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1442    if (!job)
1443       return true;
1444 
1445    /* Handle copy to compressed format using a compatible format */
1446    const uint32_t block_w = vk_format_get_blockwidth(image->vk.format);
1447    const uint32_t block_h = vk_format_get_blockheight(image->vk.format);
1448    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
1449    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
1450 
1451    v3dv_job_start_frame(job, width, height, num_layers, false,
1452                         1, internal_bpp, false);
1453 
1454    struct v3dv_meta_framebuffer framebuffer;
1455    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
1456                                               internal_type, &job->frame_tiling);
1457 
1458    v3dv_X(job->device, job_emit_binning_flush)(job);
1459    v3dv_X(job->device, meta_emit_copy_buffer_to_image_rcl)
1460       (job, image, buffer, &framebuffer, region);
1461 
1462    v3dv_cmd_buffer_finish_job(cmd_buffer);
1463 
1464    return true;
1465 }
1466 
1467 static bool
1468 create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1469                                struct v3dv_image *image,
1470                                struct v3dv_buffer *buffer,
1471                                const VkBufferImageCopy2 *region)
1472 {
1473    if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
1474       return true;
1475    if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
1476       return true;
1477    return false;
1478 }
1479 
1480 static VkResult
1481 create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
1482 {
1483    /* If this is not the first pool we create for this command buffer,
1484     * size it based on the size of the currently exhausted pool.
1485     */
1486    uint32_t descriptor_count = 64;
1487    if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
1488       struct v3dv_descriptor_pool *exhausted_pool =
1489          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
1490       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
1491    }
1492 
1493    /* Create the descriptor pool */
1494    cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
1495    VkDescriptorPoolSize pool_size = {
1496       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
1497       .descriptorCount = descriptor_count,
1498    };
1499    VkDescriptorPoolCreateInfo info = {
1500       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
1501       .maxSets = descriptor_count,
1502       .poolSizeCount = 1,
1503       .pPoolSizes = &pool_size,
1504       .flags = 0,
1505    };
1506    VkResult result =
1507       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
1508                                 &info,
1509                                 &cmd_buffer->device->vk.alloc,
1510                                 &cmd_buffer->meta.texel_buffer_copy.dspool);
1511 
1512    if (result == VK_SUCCESS) {
1513       assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1514       const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;
1515 
1516       v3dv_cmd_buffer_add_private_obj(
1517          cmd_buffer, (uintptr_t) _pool,
1518          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
1519 
1520       struct v3dv_descriptor_pool *pool =
1521          v3dv_descriptor_pool_from_handle(_pool);
1522       pool->is_driver_internal = true;
1523    }
1524 
1525    return result;
1526 }
1527 
1528 static VkResult
1529 allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
1530                                           VkDescriptorSet *set)
1531 {
1532    /* Make sure we have a descriptor pool */
1533    VkResult result;
1534    if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
1535       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1536       if (result != VK_SUCCESS)
1537          return result;
1538    }
1539    assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
1540 
1541    /* Allocate descriptor set */
1542    struct v3dv_device *device = cmd_buffer->device;
1543    VkDevice _device = v3dv_device_to_handle(device);
1544    VkDescriptorSetAllocateInfo info = {
1545       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
1546       .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
1547       .descriptorSetCount = 1,
1548       .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
1549    };
1550    result = v3dv_AllocateDescriptorSets(_device, &info, set);
1551 
1552    /* If we ran out of pool space, grow the pool and try again */
1553    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
1554       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
1555       if (result == VK_SUCCESS) {
1556          info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
1557          result = v3dv_AllocateDescriptorSets(_device, &info, set);
1558       }
1559    }
1560 
1561    return result;
1562 }
1563 
1564 static void
1565 get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
1566                                          VkColorComponentFlags cmask,
1567                                          VkComponentMapping *cswizzle,
1568                                          bool is_layered,
1569                                          uint8_t *key)
1570 {
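   /* Key layout: one 32-bit word each for the format, the color write mask
    * and the layered flag, followed by the 4-word component swizzle, adding
    * up to V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE bytes (see the assert
    * at the end of this function).
    */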
1571    memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1572 
1573    uint32_t *p = (uint32_t *) key;
1574 
1575    *p = format;
1576    p++;
1577 
1578    *p = cmask;
1579    p++;
1580 
1581    /* Note that we are using a single byte for this, so we could pack
1582     * more data into this 32-bit slot in the future.
1583     */
1584    *p = is_layered ? 1 : 0;
1585    p++;
1586 
1587    memcpy(p, cswizzle, sizeof(VkComponentMapping));
1588    p += sizeof(VkComponentMapping) / sizeof(uint32_t);
1589 
1590    assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1591 }
1592 
1593 static bool
1594 create_blit_render_pass(struct v3dv_device *device,
1595                         VkFormat dst_format,
1596                         VkFormat src_format,
1597                         VkRenderPass *pass_load,
1598                         VkRenderPass *pass_no_load);
1599 
1600 static bool
1601 create_pipeline(struct v3dv_device *device,
1602                 struct v3dv_render_pass *pass,
1603                 struct nir_shader *vs_nir,
1604                 struct nir_shader *gs_nir,
1605                 struct nir_shader *fs_nir,
1606                 const VkPipelineVertexInputStateCreateInfo *vi_state,
1607                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
1608                 const VkPipelineColorBlendStateCreateInfo *cb_state,
1609                 const VkPipelineMultisampleStateCreateInfo *ms_state,
1610                 const VkPipelineLayout layout,
1611                 VkPipeline *pipeline);
1612 
1613 static nir_shader *
1614 get_texel_buffer_copy_vs()
1615 {
1616    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1617    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
1618                                                   "meta texel buffer copy vs");
1619    nir_variable *vs_out_pos =
1620       nir_variable_create(b.shader, nir_var_shader_out,
1621                           glsl_vec4_type(), "gl_Position");
1622    vs_out_pos->data.location = VARYING_SLOT_POS;
1623 
1624    nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
1625    nir_store_var(&b, vs_out_pos, pos, 0xf);
1626 
1627    return b.shader;
1628 }
1629 
1630 static nir_shader *
1631 get_texel_buffer_copy_gs()
1632 {
1633    /* FIXME: this creates a geometry shader that takes the index of a single
1634     * layer to clear from push constants, so we need to emit a draw call for
1635     * each layer that we want to clear. We could actually do better and have it
1636     * take a range of layers however, if we were to do this, we would need to
1637     * take a range of layers; however, if we were to do this, we would need to
1638     * a geometry shader.
1639     */
1640    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1641    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
1642                                                   "meta texel buffer copy gs");
1643    nir_shader *nir = b.shader;
1644    nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
1645    nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
1646                                (1ull << VARYING_SLOT_LAYER);
1647    nir->info.gs.input_primitive = SHADER_PRIM_TRIANGLES;
1648    nir->info.gs.output_primitive = SHADER_PRIM_TRIANGLE_STRIP;
1649    nir->info.gs.vertices_in = 3;
1650    nir->info.gs.vertices_out = 3;
1651    nir->info.gs.invocations = 1;
1652    nir->info.gs.active_stream_mask = 0x1;
1653 
1654    /* in vec4 gl_Position[3] */
1655    nir_variable *gs_in_pos =
1656       nir_variable_create(b.shader, nir_var_shader_in,
1657                           glsl_array_type(glsl_vec4_type(), 3, 0),
1658                           "in_gl_Position");
1659    gs_in_pos->data.location = VARYING_SLOT_POS;
1660 
1661    /* out vec4 gl_Position */
1662    nir_variable *gs_out_pos =
1663       nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
1664                           "out_gl_Position");
1665    gs_out_pos->data.location = VARYING_SLOT_POS;
1666 
1667    /* out float gl_Layer */
1668    nir_variable *gs_out_layer =
1669       nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
1670                           "out_gl_Layer");
1671    gs_out_layer->data.location = VARYING_SLOT_LAYER;
1672 
1673    /* Emit output triangle */
1674    for (uint32_t i = 0; i < 3; i++) {
1675       /* gl_Position from shader input */
1676       nir_deref_instr *in_pos_i =
1677          nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
1678       nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
1679 
1680       /* gl_Layer from push constants */
1681       nir_ssa_def *layer =
1682          nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1683                                 .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
1684                                 .range = 4);
1685       nir_store_var(&b, gs_out_layer, layer, 0x1);
1686 
1687       nir_emit_vertex(&b, 0);
1688    }
1689 
1690    nir_end_primitive(&b, 0);
1691 
1692    return nir;
1693 }
1694 
1695 static nir_ssa_def *
1696 load_frag_coord(nir_builder *b)
1697 {
1698    nir_foreach_shader_in_variable(var, b->shader) {
1699       if (var->data.location == VARYING_SLOT_POS)
1700          return nir_load_var(b, var);
1701    }
1702    nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
1703                                            glsl_vec4_type(), NULL);
1704    pos->data.location = VARYING_SLOT_POS;
1705    return nir_load_var(b, pos);
1706 }
1707 
1708 static uint32_t
1709 component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
1710 {
1711    if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
1712       swz = comp;
1713 
1714    switch (swz) {
1715    case VK_COMPONENT_SWIZZLE_R:
1716       return 0;
1717    case VK_COMPONENT_SWIZZLE_G:
1718       return 1;
1719    case VK_COMPONENT_SWIZZLE_B:
1720       return 2;
1721    case VK_COMPONENT_SWIZZLE_A:
1722       return 3;
1723    default:
1724       unreachable("Invalid swizzle");
1725    };
1726 }
1727 
1728 static nir_shader *
1729 get_texel_buffer_copy_fs(struct v3dv_device *device, VkFormat format,
1730                          VkComponentMapping *cswizzle)
1731 {
1732    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
1733    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
1734                                                   "meta texel buffer copy fs");
1735 
1736    /* We only use the copy from texel buffer shader to implement
1737     * copy_buffer_to_image_shader, which always selects a compatible integer
1738     * format for the copy.
1739     */
1740    assert(vk_format_is_int(format));
1741 
1742    /* Fragment shader output color */
1743    nir_variable *fs_out_color =
1744       nir_variable_create(b.shader, nir_var_shader_out,
1745                           glsl_uvec4_type(), "out_color");
1746    fs_out_color->data.location = FRAG_RESULT_DATA0;
1747 
1748    /* Texel buffer input */
1749    const struct glsl_type *sampler_type =
1750       glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
1751    nir_variable *sampler =
1752       nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
1753    sampler->data.descriptor_set = 0;
1754    sampler->data.binding = 0;
1755 
1756    /* Load the box describing the pixel region we want to copy from the
1757     * texel buffer.
1758     */
1759    nir_ssa_def *box =
1760       nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
1761                              .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
1762                              .range = 16);
1763 
1764    /* Load the buffer stride (this comes in texel units) */
1765    nir_ssa_def *stride =
1766       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1767                              .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
1768                              .range = 4);
1769 
1770    /* Load the buffer offset (this comes in texel units) */
1771    nir_ssa_def *offset =
1772       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
1773                              .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
1774                              .range = 4);
1775 
1776    nir_ssa_def *coord = nir_f2i32(&b, load_frag_coord(&b));
1777 
1778    /* Load pixel data from texel buffer based on the x,y offset of the pixel
1779     * within the box. Texel buffers are 1D arrays of texels.
1780     *
1781     * Notice that we already make sure that we only generate fragments that are
1782     * inside the box through the scissor/viewport state, so our offset into the
1783     * texel buffer should always be within its bounds and we don't need
1784     * to add a check for that here.
1785     */
1786    nir_ssa_def *x_offset =
1787       nir_isub(&b, nir_channel(&b, coord, 0),
1788                    nir_channel(&b, box, 0));
1789    nir_ssa_def *y_offset =
1790       nir_isub(&b, nir_channel(&b, coord, 1),
1791                    nir_channel(&b, box, 1));
1792    nir_ssa_def *texel_offset =
1793       nir_iadd(&b, nir_iadd(&b, offset, x_offset),
1794                    nir_imul(&b, y_offset, stride));
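   /* That is: texel = offset + (frag.x - box.x) + (frag.y - box.y) * stride */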
1795 
1796    nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
1797    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
1798    tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
1799    tex->op = nir_texop_txf;
1800    tex->src[0].src_type = nir_tex_src_coord;
1801    tex->src[0].src = nir_src_for_ssa(texel_offset);
1802    tex->src[1].src_type = nir_tex_src_texture_deref;
1803    tex->src[1].src = nir_src_for_ssa(tex_deref);
1804    tex->dest_type = nir_type_uint32;
1805    tex->is_array = false;
1806    tex->coord_components = 1;
1807    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "texel buffer result");
1808    nir_builder_instr_insert(&b, &tex->instr);
1809 
1810    uint32_t swiz[4];
1811    swiz[0] =
1812       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
1813    swiz[1] =
1814       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
1815    swiz[2] =
1816       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
1817    swiz[3] =
1818       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
1819    nir_ssa_def *s = nir_swizzle(&b, &tex->dest.ssa, swiz, 4);
1820    nir_store_var(&b, fs_out_color, s, 0xf);
1821 
1822    return b.shader;
1823 }
1824 
1825 static bool
1826 create_texel_buffer_copy_pipeline(struct v3dv_device *device,
1827                                   VkFormat format,
1828                                   VkColorComponentFlags cmask,
1829                                   VkComponentMapping *cswizzle,
1830                                   bool is_layered,
1831                                   VkRenderPass _pass,
1832                                   VkPipelineLayout pipeline_layout,
1833                                   VkPipeline *pipeline)
1834 {
1835    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
1836 
1837    assert(vk_format_is_color(format));
1838 
1839    nir_shader *vs_nir = get_texel_buffer_copy_vs();
1840    nir_shader *fs_nir = get_texel_buffer_copy_fs(device, format, cswizzle);
1841    nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs() : NULL;
1842 
1843    const VkPipelineVertexInputStateCreateInfo vi_state = {
1844       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
1845       .vertexBindingDescriptionCount = 0,
1846       .vertexAttributeDescriptionCount = 0,
1847    };
1848 
1849    VkPipelineDepthStencilStateCreateInfo ds_state = {
1850       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
1851    };
1852 
1853    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
1854    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
1855       .blendEnable = false,
1856       .colorWriteMask = cmask,
1857    };
1858 
1859    const VkPipelineColorBlendStateCreateInfo cb_state = {
1860       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
1861       .logicOpEnable = false,
1862       .attachmentCount = 1,
1863       .pAttachments = blend_att_state
1864    };
1865 
1866    const VkPipelineMultisampleStateCreateInfo ms_state = {
1867       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
1868       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
1869       .sampleShadingEnable = false,
1870       .pSampleMask = NULL,
1871       .alphaToCoverageEnable = false,
1872       .alphaToOneEnable = false,
1873    };
1874 
1875    return create_pipeline(device,
1876                           pass,
1877                           vs_nir, gs_nir, fs_nir,
1878                           &vi_state,
1879                           &ds_state,
1880                           &cb_state,
1881                           &ms_state,
1882                           pipeline_layout,
1883                           pipeline);
1884 }
1885 
1886 static bool
1887 get_copy_texel_buffer_pipeline(
1888    struct v3dv_device *device,
1889    VkFormat format,
1890    VkColorComponentFlags cmask,
1891    VkComponentMapping *cswizzle,
1892    VkImageType image_type,
1893    bool is_layered,
1894    struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
1895 {
1896    bool ok = true;
1897 
1898    uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
1899    get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
1900                                             key);
1901 
1902    mtx_lock(&device->meta.mtx);
1903    struct hash_entry *entry =
1904       _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
1905                               key);
1906    if (entry) {
1907       mtx_unlock(&device->meta.mtx);
1908       *pipeline = entry->data;
1909       return true;
1910    }
1911 
1912    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
1913                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1914 
1915    if (*pipeline == NULL)
1916       goto fail;
1917 
1918    /* The blit render pass is compatible */
1919    ok = create_blit_render_pass(device, format, format,
1920                                 &(*pipeline)->pass,
1921                                 &(*pipeline)->pass_no_load);
1922    if (!ok)
1923       goto fail;
1924 
1925    ok =
1926       create_texel_buffer_copy_pipeline(device,
1927                                         format, cmask, cswizzle, is_layered,
1928                                         (*pipeline)->pass,
1929                                         device->meta.texel_buffer_copy.p_layout,
1930                                         &(*pipeline)->pipeline);
1931    if (!ok)
1932       goto fail;
1933 
1934    uint8_t *dupkey = malloc(V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1935    memcpy(dupkey, key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
1936    _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
1937                            dupkey, *pipeline);
1938 
1939    mtx_unlock(&device->meta.mtx);
1940    return true;
1941 
1942 fail:
1943    mtx_unlock(&device->meta.mtx);
1944 
1945    VkDevice _device = v3dv_device_to_handle(device);
1946    if (*pipeline) {
1947       if ((*pipeline)->pass)
1948          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
1949       if ((*pipeline)->pipeline)
1950          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
1951       vk_free(&device->vk.alloc, *pipeline);
1952       *pipeline = NULL;
1953    }
1954 
1955    return false;
1956 }
1957 
1958 static bool
1959 texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
1960                          VkImageAspectFlags aspect,
1961                          struct v3dv_image *image,
1962                          VkFormat dst_format,
1963                          VkFormat src_format,
1964                          struct v3dv_buffer *buffer,
1965                          uint32_t buffer_bpp,
1966                          VkColorComponentFlags cmask,
1967                          VkComponentMapping *cswizzle,
1968                          uint32_t region_count,
1969                          const VkBufferImageCopy2 *regions)
1970 {
1971    VkResult result;
1972    bool handled = false;
1973 
1974    assert(cswizzle);
1975 
1976    /* This is a copy path, so we don't handle format conversions. The only
1977     * exception are stencil to D24S8 copies, which are handled as a color
1978     * masked R8->RGBA8 copy.
1979     */
1980    assert(src_format == dst_format ||
1981           (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
1982            src_format == VK_FORMAT_R8_UINT &&
1983            cmask == VK_COLOR_COMPONENT_R_BIT));
1984 
1985    /* We only handle color copies. Callers can copy D/S aspects by using
1986     * a compatible color format and maybe a cmask/cswizzle for D24 formats.
1987     */
1988    if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format))
1989       return handled;
1990 
1991    /* FIXME: we only handle uncompressed images for now. */
1992    if (vk_format_is_compressed(image->vk.format))
1993       return handled;
1994 
1995    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
1996                                             VK_COLOR_COMPONENT_G_BIT |
1997                                             VK_COLOR_COMPONENT_B_BIT |
1998                                             VK_COLOR_COMPONENT_A_BIT;
1999    if (cmask == 0)
2000       cmask = full_cmask;
2001 
2002    /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
2003     * so we can bind it as a texel buffer. Otherwise, the buffer view
2004     * we create below won't set up the texture state that we need for this.
2005     */
2006    if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
2007       if (v3dv_buffer_format_supports_features(
2008              cmd_buffer->device, src_format,
2009              VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) {
2010          buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
2011       } else {
2012          return handled;
2013       }
2014    }
2015 
2016    /* At this point we should be able to handle the copy unless an unexpected
2017     * error occurs, such as an OOM.
2018     */
2019    handled = true;
2020 
2021 
2022    /* Compute the number of layers to copy.
2023     *
2024     * If we are batching (region_count > 1) all our regions have the same
2025     * image subresource so we can take this from the first region. For 3D
2026     * images we require the same depth extent.
2027     */
2028    const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2029    uint32_t num_layers;
2030    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2031       num_layers = resource->layerCount;
2032    } else {
2033       assert(region_count == 1);
2034       num_layers = regions[0].imageExtent.depth;
2035    }
2036    assert(num_layers > 0);
2037 
2038    /* Get the texel buffer copy pipeline */
2039    struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2040    bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device,
2041                                             dst_format, cmask, cswizzle,
2042                                             image->vk.image_type, num_layers > 1,
2043                                             &pipeline);
2044    if (!ok)
2045       return handled;
2046    assert(pipeline && pipeline->pipeline && pipeline->pass);
2047 
2048    /* Setup descriptor set for the source texel buffer. We don't have to
2049     * register the descriptor as a private command buffer object since
2050     * all descriptors will be freed automatically with the descriptor
2051     * pool.
2052     */
2053    VkDescriptorSet set;
2054    result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2055    if (result != VK_SUCCESS)
2056       return handled;
2057 
2058    /* We can't pass region->bufferOffset here for the offset field because
2059     * the texture base pointer in the texture shader state must be a 64-byte
2060     * aligned value. Instead, we use 0 here and we pass the offset in texels
2061     * as a push constant to the shader.
2062     */
2063    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2064    VkBufferViewCreateInfo buffer_view_info = {
2065       .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2066       .buffer = v3dv_buffer_to_handle(buffer),
2067       .format = src_format,
2068       .offset = 0,
2069       .range = VK_WHOLE_SIZE,
2070    };
2071 
2072    VkBufferView texel_buffer_view;
2073    result = v3dv_CreateBufferView(_device, &buffer_view_info,
2074                                   &cmd_buffer->device->vk.alloc,
2075                                   &texel_buffer_view);
2076    if (result != VK_SUCCESS)
2077       return handled;
2078 
2079    v3dv_cmd_buffer_add_private_obj(
2080       cmd_buffer, (uintptr_t)texel_buffer_view,
2081       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2082 
2083    VkWriteDescriptorSet write = {
2084       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2085       .dstSet = set,
2086       .dstBinding = 0,
2087       .dstArrayElement = 0,
2088       .descriptorCount = 1,
2089       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2090       .pTexelBufferView = &texel_buffer_view,
2091    };
2092    v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2093 
2094    /* Push command buffer state before starting meta operation */
2095    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2096    uint32_t dirty_dynamic_state = 0;
2097 
2098    /* Bind common state for all layers and regions  */
2099    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2100    v3dv_CmdBindPipeline(_cmd_buffer,
2101                         VK_PIPELINE_BIND_POINT_GRAPHICS,
2102                         pipeline->pipeline);
2103 
2104    v3dv_CmdBindDescriptorSets(_cmd_buffer,
2105                               VK_PIPELINE_BIND_POINT_GRAPHICS,
2106                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2107                               0, 1, &set,
2108                               0, NULL);
2109 
2110    /* Setup framebuffer.
2111     *
2112     * For 3D images, this creates a layered framebuffer with a number of
2113     * layers matching the depth extent of the 3D image.
2114     */
2115    uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel);
2116    uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel);
2117    VkImageViewCreateInfo image_view_info = {
2118       .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2119       .image = v3dv_image_to_handle(image),
2120       .viewType = v3dv_image_type_to_view_type(image->vk.image_type),
2121       .format = dst_format,
2122       .subresourceRange = {
2123          .aspectMask = aspect,
2124          .baseMipLevel = resource->mipLevel,
2125          .levelCount = 1,
2126          .baseArrayLayer = resource->baseArrayLayer,
2127          .layerCount = num_layers,
2128       },
2129    };
2130    VkImageView image_view;
2131    result = v3dv_create_image_view(cmd_buffer->device,
2132                                    &image_view_info, &image_view);
2133    if (result != VK_SUCCESS)
2134       goto fail;
2135 
2136    v3dv_cmd_buffer_add_private_obj(
2137       cmd_buffer, (uintptr_t)image_view,
2138       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2139 
2140    VkFramebufferCreateInfo fb_info = {
2141       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2142       .renderPass = pipeline->pass,
2143       .attachmentCount = 1,
2144       .pAttachments = &image_view,
2145       .width = fb_width,
2146       .height = fb_height,
2147       .layers = num_layers,
2148    };
2149 
2150    VkFramebuffer fb;
2151    result = v3dv_CreateFramebuffer(_device, &fb_info,
2152                                    &cmd_buffer->device->vk.alloc, &fb);
2153    if (result != VK_SUCCESS)
2154       goto fail;
2155 
2156    v3dv_cmd_buffer_add_private_obj(
2157       cmd_buffer, (uintptr_t)fb,
2158       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2159 
2160    /* For each layer */
2161    for (uint32_t l = 0; l < num_layers; l++) {
2162       /* Start render pass for this layer.
2163        *
2164        * If we only have one region to copy, then we might be able to
2165        * skip the TLB load if it is aligned to tile boundaries. All layers
2166        * copy the same area, so we only need to check this once.
2167        */
2168       bool can_skip_tlb_load = false;
2169       VkRect2D render_area;
2170       if (region_count == 1) {
2171          render_area.offset.x = regions[0].imageOffset.x;
2172          render_area.offset.y = regions[0].imageOffset.y;
2173          render_area.extent.width = regions[0].imageExtent.width;
2174          render_area.extent.height = regions[0].imageExtent.height;
2175 
2176          if (l == 0) {
2177             struct v3dv_render_pass *pipeline_pass =
2178                v3dv_render_pass_from_handle(pipeline->pass);
2179             can_skip_tlb_load =
2180                cmask == full_cmask &&
2181                v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2182                                                  v3dv_framebuffer_from_handle(fb),
2183                                                  pipeline_pass, 0);
2184          }
2185       } else {
2186          render_area.offset.x = 0;
2187          render_area.offset.y = 0;
2188          render_area.extent.width = fb_width;
2189          render_area.extent.height = fb_height;
2190       }
2191 
2192       VkRenderPassBeginInfo rp_info = {
2193          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2194          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2195                                            pipeline->pass,
2196          .framebuffer = fb,
2197          .renderArea = render_area,
2198          .clearValueCount = 0,
2199       };
2200 
2201       VkSubpassBeginInfo sp_info = {
2202          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
2203          .contents = VK_SUBPASS_CONTENTS_INLINE,
2204       };
2205 
2206       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
2207       struct v3dv_job *job = cmd_buffer->state.job;
2208       if (!job)
2209          goto fail;
2210 
2211       /* If we are using a layered copy we need to specify the layer for the
2212        * Geometry Shader.
2213        */
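      /* The 4-byte layer index is pushed at offset 24, right after the 24
       * bytes of fragment shader push constants (box, stride and offset).
       */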
2214       if (num_layers > 1) {
2215          uint32_t layer = resource->baseArrayLayer + l;
2216          v3dv_CmdPushConstants(_cmd_buffer,
2217                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2218                                VK_SHADER_STAGE_GEOMETRY_BIT,
2219                                24, 4, &layer);
2220       }
2221 
2222       /* For each region */
2223       dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
2224       for (uint32_t r = 0; r < region_count; r++) {
2225          const VkBufferImageCopy2 *region = &regions[r];
2226 
2227          /* Obtain the 2D buffer region spec */
2228          uint32_t buf_width, buf_height;
2229          if (region->bufferRowLength == 0)
2230              buf_width = region->imageExtent.width;
2231          else
2232              buf_width = region->bufferRowLength;
2233 
2234          if (region->bufferImageHeight == 0)
2235              buf_height = region->imageExtent.height;
2236          else
2237              buf_height = region->bufferImageHeight;
2238 
2239          const VkViewport viewport = {
2240             .x = region->imageOffset.x,
2241             .y = region->imageOffset.y,
2242             .width = region->imageExtent.width,
2243             .height = region->imageExtent.height,
2244             .minDepth = 0.0f,
2245             .maxDepth = 1.0f
2246          };
2247          v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2248          const VkRect2D scissor = {
2249             .offset = { region->imageOffset.x, region->imageOffset.y },
2250             .extent = { region->imageExtent.width, region->imageExtent.height }
2251          };
2252          v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2253 
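         /* Fragment shader push constants: the destination box as
          * (x0, y0, x1, y1), then the buffer stride and offset in texel
          * units, matching the layout read in get_texel_buffer_copy_fs().
          */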
2254          const VkDeviceSize buf_offset =
2255             region->bufferOffset / buffer_bpp + l * buf_height * buf_width;
2256          uint32_t push_data[6] = {
2257             region->imageOffset.x,
2258             region->imageOffset.y,
2259             region->imageOffset.x + region->imageExtent.width - 1,
2260             region->imageOffset.y + region->imageExtent.height - 1,
2261             buf_width,
2262             buf_offset,
2263          };
2264 
2265          v3dv_CmdPushConstants(_cmd_buffer,
2266                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2267                                VK_SHADER_STAGE_FRAGMENT_BIT,
2268                                0, sizeof(push_data), &push_data);
2269 
2270          v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2271       } /* For each region */
2272 
2273       VkSubpassEndInfo sp_end_info = {
2274          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
2275       };
2276 
2277       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
2278    } /* For each layer */
2279 
2280 fail:
2281    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
2282    return handled;
2283 }
2284 
2285 /**
2286  * Returns true if the implementation supports the requested operation (even if
2287  * it failed to process it, for example, due to an out-of-memory error).
2288  */
2289 static bool
2290 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2291                           VkImageAspectFlags aspect,
2292                           struct v3dv_image *image,
2293                           VkFormat dst_format,
2294                           VkFormat src_format,
2295                           struct v3dv_buffer *buffer,
2296                           uint32_t buffer_bpp,
2297                           VkColorComponentFlags cmask,
2298                           VkComponentMapping *cswizzle,
2299                           uint32_t region_count,
2300                           const VkBufferImageCopy2 *regions)
2301 {
2302    /* Since we can't sample linear images we need to upload the linear
2303     * buffer to a tiled image that we can use as a blit source, which
2304     * is slow.
2305     */
2306    perf_debug("Falling back to blit path for buffer to image copy.\n");
2307 
2308    struct v3dv_device *device = cmd_buffer->device;
2309    VkDevice _device = v3dv_device_to_handle(device);
2310    bool handled = true;
2311 
2312    /* Allocate memory for the tiled image. Since we copy layer by layer
2313     * we allocate memory to hold a full layer, which is the worst case.
2314     * For that we create a dummy image with that spec, get memory requirements
2315     * for it and use that information to create the memory allocation.
2316     * We will then reuse this memory store for all the regions we want to
2317     * copy.
2318     */
2319    VkImage dummy_image;
2320    VkImageCreateInfo dummy_info = {
2321       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2322       .imageType = VK_IMAGE_TYPE_2D,
2323       .format = src_format,
2324       .extent = { image->vk.extent.width, image->vk.extent.height, 1 },
2325       .mipLevels = 1,
2326       .arrayLayers = 1,
2327       .samples = VK_SAMPLE_COUNT_1_BIT,
2328       .tiling = VK_IMAGE_TILING_OPTIMAL,
2329       .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2330                VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2331       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2332       .queueFamilyIndexCount = 0,
2333       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2334    };
2335    VkResult result =
2336       v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2337    if (result != VK_SUCCESS)
2338       return handled;
2339 
2340    VkMemoryRequirements reqs;
2341    vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2342    v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
2343 
2344    VkDeviceMemory mem;
2345    VkMemoryAllocateInfo alloc_info = {
2346       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2347       .allocationSize = reqs.size,
2348       .memoryTypeIndex = 0,
2349    };
2350    result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2351    if (result != VK_SUCCESS)
2352       return handled;
2353 
2354    v3dv_cmd_buffer_add_private_obj(
2355       cmd_buffer, (uintptr_t)mem,
2356       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2357 
2358    /* Obtain the layer count.
2359     *
2360     * If we are batching (region_count > 1) all our regions have the same
2361     * image subresource so we can take this from the first region.
2362     */
2363    uint32_t num_layers;
2364    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
2365       num_layers = regions[0].imageSubresource.layerCount;
2366    else
2367       num_layers = regions[0].imageExtent.depth;
2368    assert(num_layers > 0);
2369 
2370    /* Sanity check: we can only batch multiple regions together if they have
2371     * the same framebuffer (so the same layer).
2372     */
2373    assert(num_layers == 1 || region_count == 1);
2374 
2375    const uint32_t block_width = vk_format_get_blockwidth(image->vk.format);
2376    const uint32_t block_height = vk_format_get_blockheight(image->vk.format);
2377 
2378    /* Copy regions by uploading each region to a temporary tiled image using
2379     * the memory we have just allocated as storage.
2380     */
2381    for (uint32_t r = 0; r < region_count; r++) {
2382       const VkBufferImageCopy2 *region = &regions[r];
2383 
2384       /* Obtain the 2D buffer region spec */
2385       uint32_t buf_width, buf_height;
2386       if (region->bufferRowLength == 0)
2387           buf_width = region->imageExtent.width;
2388       else
2389           buf_width = region->bufferRowLength;
2390 
2391       if (region->bufferImageHeight == 0)
2392           buf_height = region->imageExtent.height;
2393       else
2394           buf_height = region->bufferImageHeight;
2395 
2396       /* If the image is compressed, the bpp refers to blocks, not pixels */
2397       buf_width = buf_width / block_width;
2398       buf_height = buf_height / block_height;
2399 
2400       for (uint32_t i = 0; i < num_layers; i++) {
2401          /* Create the tiled image */
2402          VkImageCreateInfo image_info = {
2403             .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2404             .imageType = VK_IMAGE_TYPE_2D,
2405             .format = src_format,
2406             .extent = { buf_width, buf_height, 1 },
2407             .mipLevels = 1,
2408             .arrayLayers = 1,
2409             .samples = VK_SAMPLE_COUNT_1_BIT,
2410             .tiling = VK_IMAGE_TILING_OPTIMAL,
2411             .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2412                      VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2413             .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2414             .queueFamilyIndexCount = 0,
2415             .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2416          };
2417 
2418          VkImage buffer_image;
2419          VkResult result =
2420             v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
2421                              &buffer_image);
2422          if (result != VK_SUCCESS)
2423             return handled;
2424 
2425          v3dv_cmd_buffer_add_private_obj(
2426             cmd_buffer, (uintptr_t)buffer_image,
2427             (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
2428 
2429          result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
2430          if (result != VK_SUCCESS)
2431             return handled;
2432 
2433          /* Upload buffer contents for the selected layer */
2434          const VkDeviceSize buf_offset_bytes =
2435             region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
2436          const VkBufferImageCopy2 buffer_image_copy = {
2437             .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
2438             .bufferOffset = buf_offset_bytes,
2439             .bufferRowLength = region->bufferRowLength / block_width,
2440             .bufferImageHeight = region->bufferImageHeight / block_height,
2441             .imageSubresource = {
2442                .aspectMask = aspect,
2443                .mipLevel = 0,
2444                .baseArrayLayer = 0,
2445                .layerCount = 1,
2446             },
2447             .imageOffset = { 0, 0, 0 },
2448             .imageExtent = { buf_width, buf_height, 1 }
2449          };
2450          handled =
2451             create_tiled_image_from_buffer(cmd_buffer,
2452                                            v3dv_image_from_handle(buffer_image),
2453                                            buffer, &buffer_image_copy);
2454          if (!handled) {
2455             /* This is unexpected, we should have set up the upload to be
2456              * compatible with a TFU or TLB copy.
2457              */
2458             unreachable("Unable to copy buffer to image through TLB");
2459             return false;
2460          }
2461 
2462          /* Blit-copy the requested image extent from the buffer image to the
2463           * destination image.
2464           *
2465           * Since we are copying, the blit must use the same format on the
2466           * destination and source images to avoid format conversions. The
2467           * only exception is copying stencil, which we upload to a R8UI source
2468           * image, but that we need to blit to a S8D24 destination (the only
2469           * stencil format we support).
2470           */
2471          const VkImageBlit2 blit_region = {
2472             .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
2473             .srcSubresource = {
2474                .aspectMask = aspect,
2475                .mipLevel = 0,
2476                .baseArrayLayer = 0,
2477                .layerCount = 1,
2478             },
2479             .srcOffsets = {
2480                { 0, 0, 0 },
2481                { region->imageExtent.width, region->imageExtent.height, 1 },
2482             },
2483             .dstSubresource = {
2484                .aspectMask = aspect,
2485                .mipLevel = region->imageSubresource.mipLevel,
2486                .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
2487                .layerCount = 1,
2488             },
2489             .dstOffsets = {
2490                {
2491                   DIV_ROUND_UP(region->imageOffset.x, block_width),
2492                   DIV_ROUND_UP(region->imageOffset.y, block_height),
2493                   region->imageOffset.z + i,
2494                },
2495                {
2496                   DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
2497                                block_width),
2498                   DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
2499                                block_height),
2500                   region->imageOffset.z + i + 1,
2501                },
2502             },
2503          };
2504 
2505          handled = blit_shader(cmd_buffer,
2506                                image, dst_format,
2507                                v3dv_image_from_handle(buffer_image), src_format,
2508                                cmask, cswizzle,
2509                                &blit_region, VK_FILTER_NEAREST, true);
2510          if (!handled) {
2511             /* This is unexpected, we should have a supported blit spec */
2512             unreachable("Unable to blit buffer to destination image");
2513             return false;
2514          }
2515       }
2516    }
2517 
2518    return handled;
2519 }
2520 
2521 /**
2522  * Returns true if the implementation supports the requested operation (even if
2523  * it failed to process it, for example, due to an out-of-memory error).
2524  */
2525 static bool
2526 copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
2527                             struct v3dv_image *image,
2528                             struct v3dv_buffer *buffer,
2529                             uint32_t region_count,
2530                             const VkBufferImageCopy2 *regions,
2531                             bool use_texel_buffer)
2532 {
2533    /* We can only call this with region_count > 1 if we can batch the regions
2534     * together, in which case they share the same image subresource, and so
2535     * the same aspect.
2536     */
2537    VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
2538 
2539    /* Generally, the bpp of the data in the buffer matches that of the
2540     * destination image. The exception is the case where we are uploading
2541     * stencil (8bpp) to a combined d24s8 image (32bpp).
2542     */
2543    uint32_t buf_bpp = image->cpp;
2544 
2545    /* We are about to upload the buffer data to an image so we can then
2546     * blit that to our destination region. Because we are going to implement
2547     * the copy as a blit, we want our blit source and destination formats to be
2548     * the same (to avoid any format conversions), so we choose a canonical
2549     * format that matches the destination image bpp.
2550     */
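   /* For example, a 4 bpp color image is copied using VK_FORMAT_R8G8B8A8_UINT
    * for both the source and destination of the blit (see the switch below).
    */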
2551    VkComponentMapping ident_swizzle = {
2552       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
2553       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
2554       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
2555       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
2556    };
2557 
2558    VkComponentMapping cswizzle = ident_swizzle;
2559    VkColorComponentFlags cmask = 0; /* Write all components */
2560    VkFormat src_format;
2561    VkFormat dst_format;
2562    switch (buf_bpp) {
2563    case 16:
2564       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2565       src_format = VK_FORMAT_R32G32B32A32_UINT;
2566       dst_format = src_format;
2567       break;
2568    case 8:
2569       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2570       src_format = VK_FORMAT_R16G16B16A16_UINT;
2571       dst_format = src_format;
2572       break;
2573    case 4:
2574       switch (aspect) {
2575       case VK_IMAGE_ASPECT_COLOR_BIT:
2576          src_format = VK_FORMAT_R8G8B8A8_UINT;
2577          dst_format = src_format;
2578          break;
2579       case VK_IMAGE_ASPECT_DEPTH_BIT:
2580          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
2581                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
2582                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
2583          src_format = VK_FORMAT_R8G8B8A8_UINT;
2584          dst_format = src_format;
2585 
2586          /* For D24 formats, the Vulkan spec states that the depth component
2587           * in the buffer is stored in the 24 LSBs, but V3D wants it in the
2588           * 24 MSBs.
2589           */
2590          if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
2591              image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
2592             cmask = VK_COLOR_COMPONENT_G_BIT |
2593                     VK_COLOR_COMPONENT_B_BIT |
2594                     VK_COLOR_COMPONENT_A_BIT;
2595             cswizzle.r = VK_COMPONENT_SWIZZLE_R;
2596             cswizzle.g = VK_COMPONENT_SWIZZLE_R;
2597             cswizzle.b = VK_COMPONENT_SWIZZLE_G;
2598             cswizzle.a = VK_COMPONENT_SWIZZLE_B;
2599          }
2600          break;
2601       case VK_IMAGE_ASPECT_STENCIL_BIT:
2602          /* Since we don't support separate stencil, this is always a stencil
2603           * copy to a combined depth/stencil image. We interpret the buffer
2604           * data as a color R8UI image, and implement the blit as a
2605           * compatible color blit to an RGBA8UI destination, masking out
2606           * writes to components GBA (which map to the D24 component of a
2607           * S8D24 image).
2608           */
2609          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
2610          buf_bpp = 1;
2611          src_format = VK_FORMAT_R8_UINT;
2612          dst_format = VK_FORMAT_R8G8B8A8_UINT;
2613          cmask = VK_COLOR_COMPONENT_R_BIT;
2614          break;
2615       default:
2616          unreachable("unsupported aspect");
2617          return false;
2618       };
2619       break;
2620    case 2:
2621       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
2622              aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
2623       src_format = VK_FORMAT_R16_UINT;
2624       dst_format = src_format;
2625       break;
2626    case 1:
2627       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2628       src_format = VK_FORMAT_R8_UINT;
2629       dst_format = src_format;
2630       break;
2631    default:
2632       unreachable("unsupported bit-size");
2633       return false;
2634    }
2635 
2636    if (use_texel_buffer) {
2637       return texel_buffer_shader_copy(cmd_buffer, aspect, image,
2638                                       dst_format, src_format,
2639                                       buffer, buf_bpp,
2640                                       cmask, &cswizzle,
2641                                       region_count, regions);
2642    } else {
2643       return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
2644                                        dst_format, src_format,
2645                                        buffer, buf_bpp,
2646                                        cmask, &cswizzle,
2647                                        region_count, regions);
2648    }
2649 }
2650 
2651 /**
2652  * Returns true if the implementation supports the requested operation (even if
2653  * it failed to process it, for example, due to an out-of-memory error).
2654  */
2655 static bool
2656 copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
2657                          struct v3dv_image *image,
2658                          struct v3dv_buffer *buffer,
2659                          const VkBufferImageCopy2 *region)
2660 {
2661    /* FIXME */
2662    if (vk_format_is_depth_or_stencil(image->vk.format))
2663       return false;
2664 
2665    if (vk_format_is_compressed(image->vk.format))
2666       return false;
2667 
2668    if (image->vk.tiling == VK_IMAGE_TILING_LINEAR)
2669       return false;
2670 
2671    uint32_t buffer_width, buffer_height;
2672    if (region->bufferRowLength == 0)
2673       buffer_width = region->imageExtent.width;
2674    else
2675       buffer_width = region->bufferRowLength;
2676 
2677    if (region->bufferImageHeight == 0)
2678       buffer_height = region->imageExtent.height;
2679    else
2680       buffer_height = region->bufferImageHeight;
2681 
2682    uint32_t buffer_stride = buffer_width * image->cpp;
2683    uint32_t buffer_layer_stride = buffer_stride * buffer_height;
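   /* Worked example (hypothetical values): a 100x50 region of a 4-byte
    * format with bufferRowLength = 0 and bufferImageHeight = 0 gives
    * buffer_stride = 100 * 4 = 400 bytes and
    * buffer_layer_stride = 400 * 50 = 20000 bytes.
    */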
2684 
2685    uint32_t num_layers;
2686    if (image->vk.image_type != VK_IMAGE_TYPE_3D)
2687       num_layers = region->imageSubresource.layerCount;
2688    else
2689       num_layers = region->imageExtent.depth;
2690    assert(num_layers > 0);
2691 
2692    struct v3dv_job *job =
2693       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
2694                                      V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
2695                                      cmd_buffer, -1);
2696    if (!job)
2697       return true;
2698 
2699    job->cpu.copy_buffer_to_image.image = image;
2700    job->cpu.copy_buffer_to_image.buffer = buffer;
2701    job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
2702    job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
2703    job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
2704    job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
2705    job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
2706    job->cpu.copy_buffer_to_image.mip_level =
2707       region->imageSubresource.mipLevel;
2708    job->cpu.copy_buffer_to_image.base_layer =
2709       region->imageSubresource.baseArrayLayer;
2710    job->cpu.copy_buffer_to_image.layer_count = num_layers;
2711 
2712    list_addtail(&job->list_link, &cmd_buffer->jobs);
2713 
2714    return true;
2715 }
2716 
2717 VKAPI_ATTR void VKAPI_CALL
2718 v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer,
2719                               const VkCopyBufferToImageInfo2 *info)
2720 {
2721    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2722    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
2723    V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
2724 
2725    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2726 
2727    cmd_buffer->state.is_transfer = true;
2728 
2729    uint32_t r = 0;
2730    while (r < info->regionCount) {
2731       /* The TFU and TLB paths can only copy one region at a time and the region
2732        * needs to start at the origin. We try these first for the common case
2733        * where we are copying full images, since they should be the fastest.
2734        */
2735       uint32_t batch_size = 1;
2736       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
2737          goto handled;
2738 
2739       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
2740          goto handled;
2741 
2742       /* Otherwise, we are copying subrects, so we fall back to copying
2743        * via shader and texel buffers and we try to batch the regions
2744        * if possible. We can only batch copies if they have the same
2745        * framebuffer spec, which is mostly determined by the image
2746        * subresource of the region.
2747        */
2748       const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
2749       for (uint32_t s = r + 1; s < info->regionCount; s++) {
2750          const VkImageSubresourceLayers *rsc_s =
2751             &info->pRegions[s].imageSubresource;
2752 
2753          if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
2754             break;
2755 
2756          /* For 3D images we also need to check the depth extent */
2757          if (image->vk.image_type == VK_IMAGE_TYPE_3D &&
2758              info->pRegions[s].imageExtent.depth !=
2759              info->pRegions[r].imageExtent.depth) {
2760                break;
2761          }
2762 
2763          batch_size++;
2764       }
2765 
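      /* For example, if pRegions[r..r+2] share the same imageSubresource
       * but pRegions[r+3] does not, batch_size ends up as 3, the texel
       * buffer path below processes those three regions in one call, and
       * the "handled" label then advances r past the whole batch.
       */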
2766       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2767                                       batch_size, &info->pRegions[r], true)) {
2768          goto handled;
2769       }
2770 
2771       /* If we still could not copy, fall back to slower paths.
2772        *
2773        * FIXME: we could try to batch these too, but since they are bound to be
2774        * slow it might not be worth it and we should instead put more effort
2775        * in handling more cases with the other paths.
2776        */
2777       if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer,
2778                                    &info->pRegions[r])) {
2779          batch_size = 1;
2780          goto handled;
2781       }
2782 
2783       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
2784                                       batch_size, &info->pRegions[r], false)) {
2785          goto handled;
2786       }
2787 
2788       unreachable("Unsupported buffer to image copy.");
2789 
2790 handled:
2791       r += batch_size;
2792    }
2793 
2794    cmd_buffer->state.is_transfer = false;
2795 }
2796 
2797 static void
2798 compute_blit_3d_layers(const VkOffset3D *offsets,
2799                        uint32_t *min_layer, uint32_t *max_layer,
2800                        bool *mirror_z);
2801 
2802 /**
2803  * Returns true if the implementation supports the requested operation (even if
2804  * it failed to process it, for example, due to an out-of-memory error).
2805  *
2806  * The TFU blit path doesn't handle scaling so the blit filter parameter can
2807  * be ignored.
2808  */
2809 static bool
2810 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
2811          struct v3dv_image *dst,
2812          struct v3dv_image *src,
2813          const VkImageBlit2 *region)
2814 {
2815    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2816    assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
2817 
2818    /* Format must match */
2819    if (src->vk.format != dst->vk.format)
2820       return false;
2821 
2822    /* Destination can't be raster format */
2823    if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR)
2824       return false;
2825 
2826    /* Source region must start at (0,0) */
2827    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
2828       return false;
2829 
2830    /* Destination image must be complete */
2831    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
2832       return false;
2833 
2834    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
2835    const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
2836    const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
2837    if (region->dstOffsets[1].x < dst_width - 1 ||
2838        region->dstOffsets[1].y < dst_height - 1) {
2839       return false;
2840    }
2841 
2842    /* No XY scaling */
2843    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
2844        region->srcOffsets[1].y != region->dstOffsets[1].y) {
2845       return false;
2846    }
2847 
2848    /* If the format is D24S8 both aspects need to be copied, since the TFU
2849     * can't be programmed to copy only one aspect of the image.
2850     */
2851    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
2852        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
2853                                              VK_IMAGE_ASPECT_STENCIL_BIT;
2854        if (region->dstSubresource.aspectMask != ds_aspects)
2855           return false;
2856    }
2857 
2858    /* Our TFU blits only handle exact copies (it requires same formats
2859     * on input and output, no scaling, etc), so there are no pixel format
2860     * conversions and we can rewrite the format to use one that is TFU
2861     * compatible based on its texel size.
2862     */
2863    const struct v3dv_format *format =
2864       v3dv_get_compatible_tfu_format(cmd_buffer->device,
2865                                      dst->cpp, NULL);
2866 
2867    /* Emit a TFU job for each layer to blit */
2868    assert(region->dstSubresource.layerCount ==
2869           region->srcSubresource.layerCount);
2870 
2871    uint32_t min_dst_layer;
2872    uint32_t max_dst_layer;
2873    bool dst_mirror_z = false;
2874    if (dst->vk.image_type == VK_IMAGE_TYPE_3D) {
2875       compute_blit_3d_layers(region->dstOffsets,
2876                              &min_dst_layer, &max_dst_layer,
2877                              &dst_mirror_z);
2878    } else {
2879       min_dst_layer = region->dstSubresource.baseArrayLayer;
2880       max_dst_layer = min_dst_layer + region->dstSubresource.layerCount;
2881    }
2882 
2883    uint32_t min_src_layer;
2884    uint32_t max_src_layer;
2885    bool src_mirror_z = false;
2886    if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
2887       compute_blit_3d_layers(region->srcOffsets,
2888                              &min_src_layer, &max_src_layer,
2889                              &src_mirror_z);
2890    } else {
2891       min_src_layer = region->srcSubresource.baseArrayLayer;
2892       max_src_layer = min_src_layer + region->srcSubresource.layerCount;
2893    }
2894 
2895    /* No Z scaling for 3D images (for non-3D images both src and dst must
2896     * have the same layerCount).
2897     */
2898    if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
2899       return false;
2900 
2901    const uint32_t layer_count = max_dst_layer - min_dst_layer;
2902    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
2903    for (uint32_t i = 0; i < layer_count; i++) {
2904       /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
2905        * only involves reversing the order of the slices.
2906        */
2907       const uint32_t dst_layer =
2908          dst_mirror_z ? max_dst_layer - i - 1 : min_dst_layer + i;
2909       const uint32_t src_layer =
2910          src_mirror_z ? max_src_layer - i - 1 : min_src_layer + i;
2911 
2912       const uint32_t dst_offset =
2913          dst->mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer);
2914       const uint32_t src_offset =
2915          src->mem->bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer);
2916 
2917       const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];
2918       const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];
2919 
2920       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
2921          cmd_buffer,
2922          dst->mem->bo->handle,
2923          dst_offset,
2924          dst_slice->tiling,
2925          dst_slice->padded_height,
2926          dst->cpp,
2927          src->mem->bo->handle,
2928          src_offset,
2929          src_slice->tiling,
2930          src_slice->tiling == V3D_TILING_RASTER ?
2931                               src_slice->stride : src_slice->padded_height,
2932          src->cpp,
2933          dst_width, dst_height, format);
2934    }
2935 
2936    return true;
2937 }
2938 
2939 static bool
2940 format_needs_software_int_clamp(VkFormat format)
2941 {
2942    switch (format) {
2943       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2944       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
2945       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2946       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
2947          return true;
2948       default:
2949          return false;
2950    };
2951 }
2952 
2953 static void
2954 get_blit_pipeline_cache_key(VkFormat dst_format,
2955                             VkFormat src_format,
2956                             VkColorComponentFlags cmask,
2957                             VkSampleCountFlagBits dst_samples,
2958                             VkSampleCountFlagBits src_samples,
2959                             uint8_t *key)
2960 {
2961    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
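   /* The key is packed as four consecutive 32-bit words: destination
    * format, source format (or 0 when no software clamping is needed),
    * color write mask, and the destination/source sample counts packed
    * together.
    */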
2962 
2963    uint32_t *p = (uint32_t *) key;
2964 
2965    *p = dst_format;
2966    p++;
2967 
2968    /* Generally, when blitting from a larger format to a smaller format
2969     * the hardware takes care of clamping the source to the RT range.
2970     * Specifically, for integer formats, this is done by using
2971     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup, however, this
2972     * clamps to the bit-size of the render type, and some formats, such as
2973     * rgb10a2_uint have a 16-bit type, so it won't do what we need and we
2974     * rgb10a2_uint, have a 16-bit type, so it won't do what we need and we
2975     * have to clamp in software. In these cases, we need to amend the blit
2976     * we need the src format to be part of the key.
2977     */
2978    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
2979    p++;
2980 
2981    *p = cmask;
2982    p++;
2983 
2984    *p = (dst_samples << 8) | src_samples;
2985    p++;
2986 
2987    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
2988 }
2989 
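/**
 * Creates the two render passes used for blitting, which differ only in the
 * load operation of their single attachment: one loads the existing
 * destination contents and the other doesn't care about them, presumably so
 * callers can skip the load when the blit fully overwrites the attachment.
 */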
2990 static bool
2991 create_blit_render_pass(struct v3dv_device *device,
2992                         VkFormat dst_format,
2993                         VkFormat src_format,
2994                         VkRenderPass *pass_load,
2995                         VkRenderPass *pass_no_load)
2996 {
2997    const bool is_color_blit = vk_format_is_color(dst_format);
2998 
2999    /* Attachment load operation is specified below */
3000    VkAttachmentDescription2 att = {
3001       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
3002       .format = dst_format,
3003       .samples = VK_SAMPLE_COUNT_1_BIT,
3004       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3005       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3006       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3007    };
3008 
3009    VkAttachmentReference2 att_ref = {
3010       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
3011       .attachment = 0,
3012       .layout = VK_IMAGE_LAYOUT_GENERAL,
3013    };
3014 
3015    VkSubpassDescription2 subpass = {
3016       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
3017       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3018       .inputAttachmentCount = 0,
3019       .colorAttachmentCount = is_color_blit ? 1 : 0,
3020       .pColorAttachments = is_color_blit ? &att_ref : NULL,
3021       .pResolveAttachments = NULL,
3022       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3023       .preserveAttachmentCount = 0,
3024       .pPreserveAttachments = NULL,
3025    };
3026 
3027    VkRenderPassCreateInfo2 info = {
3028       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
3029       .attachmentCount = 1,
3030       .pAttachments = &att,
3031       .subpassCount = 1,
3032       .pSubpasses = &subpass,
3033       .dependencyCount = 0,
3034       .pDependencies = NULL,
3035    };
3036 
3037    VkResult result;
3038    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3039    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3040                                    &info, &device->vk.alloc, pass_load);
3041    if (result != VK_SUCCESS)
3042       return false;
3043 
3044    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3045    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3046                                    &info, &device->vk.alloc, pass_no_load);
3047    return result == VK_SUCCESS;
3048 }
3049 
3050 static nir_ssa_def *
3051 gen_tex_coords(nir_builder *b)
3052 {
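   /* The vertex-stage push constants consumed here are assumed to be laid
    * out as 16 bytes holding the source blit box (x0, y0, x1, y1) followed
    * by 4 bytes holding the source Z coordinate, which the blit code fills
    * in per destination layer.
    */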
3053    nir_ssa_def *tex_box =
3054       nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3055 
3056    nir_ssa_def *tex_z =
3057       nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3058 
3059    nir_ssa_def *vertex_id = nir_load_vertex_id(b);
3060 
3061    /* vertex 0: src0_x, src0_y
3062     * vertex 1: src0_x, src1_y
3063     * vertex 2: src1_x, src0_y
3064     * vertex 3: src1_x, src1_y
3065     *
3066     * So:
3067     *
3068     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3069     * channel 1 is vertex id & 1 ? src1_y : src0_y
3070     */
3071 
3072    nir_ssa_def *one = nir_imm_int(b, 1);
3073    nir_ssa_def *c0cmp = nir_ilt(b, vertex_id, nir_imm_int(b, 2));
3074    nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3075 
3076    nir_ssa_def *comp[4];
3077    comp[0] = nir_bcsel(b, c0cmp,
3078                        nir_channel(b, tex_box, 0),
3079                        nir_channel(b, tex_box, 2));
3080 
3081    comp[1] = nir_bcsel(b, c1cmp,
3082                        nir_channel(b, tex_box, 3),
3083                        nir_channel(b, tex_box, 1));
3084    comp[2] = tex_z;
3085    comp[3] = nir_imm_float(b, 1.0f);
3086    return nir_vec(b, comp, 4);
3087 }
3088 
3089 static nir_ssa_def *
3090 build_nir_tex_op_read(struct nir_builder *b,
3091                       nir_ssa_def *tex_pos,
3092                       enum glsl_base_type tex_type,
3093                       enum glsl_sampler_dim dim)
3094 {
3095    assert(dim != GLSL_SAMPLER_DIM_MS);
3096 
3097    const struct glsl_type *sampler_type =
3098       glsl_sampler_type(dim, false, false, tex_type);
3099    nir_variable *sampler =
3100       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3101    sampler->data.descriptor_set = 0;
3102    sampler->data.binding = 0;
3103 
3104    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3105    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3106    tex->sampler_dim = dim;
3107    tex->op = nir_texop_tex;
3108    tex->src[0].src_type = nir_tex_src_coord;
3109    tex->src[0].src = nir_src_for_ssa(tex_pos);
3110    tex->src[1].src_type = nir_tex_src_texture_deref;
3111    tex->src[1].src = nir_src_for_ssa(tex_deref);
3112    tex->src[2].src_type = nir_tex_src_sampler_deref;
3113    tex->src[2].src = nir_src_for_ssa(tex_deref);
3114    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3115    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3116    tex->coord_components = tex_pos->num_components;
3117 
3118    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3119    nir_builder_instr_insert(b, &tex->instr);
3120    return &tex->dest.ssa;
3121 }
3122 
3123 static nir_ssa_def *
3124 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3125                                  nir_variable *sampler,
3126                                  nir_ssa_def *tex_deref,
3127                                  enum glsl_base_type tex_type,
3128                                  nir_ssa_def *tex_pos,
3129                                  nir_ssa_def *sample_idx)
3130 {
3131    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4);
3132    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3133    tex->op = nir_texop_txf_ms;
3134    tex->src[0].src_type = nir_tex_src_coord;
3135    tex->src[0].src = nir_src_for_ssa(tex_pos);
3136    tex->src[1].src_type = nir_tex_src_texture_deref;
3137    tex->src[1].src = nir_src_for_ssa(tex_deref);
3138    tex->src[2].src_type = nir_tex_src_sampler_deref;
3139    tex->src[2].src = nir_src_for_ssa(tex_deref);
3140    tex->src[3].src_type = nir_tex_src_ms_index;
3141    tex->src[3].src = nir_src_for_ssa(sample_idx);
3142    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3143    tex->is_array = false;
3144    tex->coord_components = tex_pos->num_components;
3145 
3146    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3147    nir_builder_instr_insert(b, &tex->instr);
3148    return &tex->dest.ssa;
3149 }
3150 
3151 /* Fetches all samples at the given position and averages them */
3152 static nir_ssa_def *
3153 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3154                             nir_ssa_def *tex_pos,
3155                             enum glsl_base_type tex_type,
3156                             VkSampleCountFlagBits src_samples)
3157 {
3158    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3159    const struct glsl_type *sampler_type =
3160       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3161    nir_variable *sampler =
3162       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3163    sampler->data.descriptor_set = 0;
3164    sampler->data.binding = 0;
3165 
3166    const bool is_int = glsl_base_type_is_integer(tex_type);
3167 
3168    nir_ssa_def *tmp = NULL;
3169    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3170    for (uint32_t i = 0; i < src_samples; i++) {
3171       nir_ssa_def *s =
3172          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3173                                           tex_type, tex_pos,
3174                                           nir_imm_int(b, i));
3175 
3176       /* For integer formats, the multisample resolve operation is expected to
3177        * return one of the samples, so we just return the first one.
3178        */
3179       if (is_int)
3180          return s;
3181 
3182       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3183    }
3184 
3185    assert(!is_int);
3186    return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples));
3187 }
3188 
3189 /* Fetches the current sample (gl_SampleID) at the given position */
3190 static nir_ssa_def *
3191 build_nir_tex_op_ms_read(struct nir_builder *b,
3192                          nir_ssa_def *tex_pos,
3193                          enum glsl_base_type tex_type)
3194 {
3195    const struct glsl_type *sampler_type =
3196       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3197    nir_variable *sampler =
3198       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3199    sampler->data.descriptor_set = 0;
3200    sampler->data.binding = 0;
3201 
3202    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3203 
3204    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3205                                            tex_type, tex_pos,
3206                                            nir_load_sample_id(b));
3207 }
3208 
3209 static nir_ssa_def *
3210 build_nir_tex_op(struct nir_builder *b,
3211                  struct v3dv_device *device,
3212                  nir_ssa_def *tex_pos,
3213                  enum glsl_base_type tex_type,
3214                  VkSampleCountFlagBits dst_samples,
3215                  VkSampleCountFlagBits src_samples,
3216                  enum glsl_sampler_dim dim)
3217 {
3218    switch (dim) {
3219    case GLSL_SAMPLER_DIM_MS:
3220       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3221       /* For multisampled texture sources we need to use fetching instead of
3222        * normalized texture coordinates. We already configured our blit
3223        * coordinates to be in texel units, but here we still need to convert
3224        * them from floating point to integer.
3225        */
3226       tex_pos = nir_f2i32(b, tex_pos);
3227 
3228       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3229          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3230       else
3231          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3232    default:
3233       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3234       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3235    }
3236 }
3237 
3238 static nir_shader *
3239 get_blit_vs()
3240 {
3241    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3242    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3243                                                   "meta blit vs");
3244 
3245    const struct glsl_type *vec4 = glsl_vec4_type();
3246 
3247    nir_variable *vs_out_pos =
3248       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3249    vs_out_pos->data.location = VARYING_SLOT_POS;
3250 
3251    nir_variable *vs_out_tex_coord =
3252       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3253    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3254    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3255 
3256    nir_ssa_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
3257    nir_store_var(&b, vs_out_pos, pos, 0xf);
3258 
3259    nir_ssa_def *tex_coord = gen_tex_coords(&b);
3260    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3261 
3262    return b.shader;
3263 }
3264 
3265 static uint32_t
3266 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3267 {
3268    switch (sampler_dim) {
3269    case GLSL_SAMPLER_DIM_1D: return 0x1;
3270    case GLSL_SAMPLER_DIM_2D: return 0x3;
3271    case GLSL_SAMPLER_DIM_MS: return 0x3;
3272    case GLSL_SAMPLER_DIM_3D: return 0x7;
3273    default:
3274       unreachable("invalid sampler dim");
3275    };
3276 }
3277 
3278 static nir_shader *
3279 get_color_blit_fs(struct v3dv_device *device,
3280                   VkFormat dst_format,
3281                   VkFormat src_format,
3282                   VkSampleCountFlagBits dst_samples,
3283                   VkSampleCountFlagBits src_samples,
3284                   enum glsl_sampler_dim sampler_dim)
3285 {
3286    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3287    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
3288                                                   "meta blit fs");
3289 
3290    const struct glsl_type *vec4 = glsl_vec4_type();
3291 
3292    nir_variable *fs_in_tex_coord =
3293       nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3294    fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3295 
3296    const struct glsl_type *fs_out_type =
3297       vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3298       vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3299                                       glsl_vec4_type();
3300 
3301    enum glsl_base_type src_base_type =
3302       vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3303       vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3304                                       GLSL_TYPE_FLOAT;
3305 
3306    nir_variable *fs_out_color =
3307       nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3308    fs_out_color->data.location = FRAG_RESULT_DATA0;
3309 
3310    nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
3311    const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3312    tex_coord = nir_channels(&b, tex_coord, channel_mask);
3313 
3314    nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3315                                          dst_samples, src_samples, sampler_dim);
3316 
3317    /* For integer textures, if the bit-size of the destination is too small to
3318     * hold the source value, Vulkan (CTS) expects the implementation to clamp to the
3319     * maximum value the destination can hold. The hardware can clamp to the
3320     * render target type, which usually matches the component bit-size, but
3321     * there are some cases that won't match, such as rgb10a2, which has a 16-bit
3322     * render target type, so in these cases we need to clamp manually.
3323     */
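   /* Hypothetical example: for an rgb10a2_uint destination fed from an
    * rgba16_uint source, the 10-bit color components get clamped to 1023
    * and the 2-bit alpha component to 3 in the loop below.
    */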
3324    if (format_needs_software_int_clamp(dst_format)) {
3325       assert(vk_format_is_int(dst_format));
3326       enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3327       enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3328 
3329       nir_ssa_def *c[4];
3330       for (uint32_t i = 0; i < 4; i++) {
3331          c[i] = nir_channel(&b, color, i);
3332 
3333          const uint32_t src_bit_size =
3334             util_format_get_component_bits(src_pformat,
3335                                            UTIL_FORMAT_COLORSPACE_RGB,
3336                                            i);
3337          const uint32_t dst_bit_size =
3338             util_format_get_component_bits(dst_pformat,
3339                                            UTIL_FORMAT_COLORSPACE_RGB,
3340                                            i);
3341 
3342          if (dst_bit_size >= src_bit_size)
3343             continue;
3344 
3345          assert(dst_bit_size > 0);
3346          if (util_format_is_pure_uint(dst_pformat)) {
3347             nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3348             c[i] = nir_umin(&b, c[i], max);
3349          } else {
3350             nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3351             nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3352             c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3353          }
3354       }
3355 
3356       color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3357    }
3358 
3359    nir_store_var(&b, fs_out_color, color, 0xf);
3360 
3361    return b.shader;
3362 }
3363 
3364 static bool
3365 create_pipeline(struct v3dv_device *device,
3366                 struct v3dv_render_pass *pass,
3367                 struct nir_shader *vs_nir,
3368                 struct nir_shader *gs_nir,
3369                 struct nir_shader *fs_nir,
3370                 const VkPipelineVertexInputStateCreateInfo *vi_state,
3371                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
3372                 const VkPipelineColorBlendStateCreateInfo *cb_state,
3373                 const VkPipelineMultisampleStateCreateInfo *ms_state,
3374                 const VkPipelineLayout layout,
3375                 VkPipeline *pipeline)
3376 {
3377    struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
3378    struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir);
3379    struct vk_shader_module gs_m;
3380 
3381    uint32_t num_stages = gs_nir ? 3 : 2;
3382 
3383 
3384    VkPipelineShaderStageCreateInfo stages[3] = {
3385       {
3386          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3387          .stage = VK_SHADER_STAGE_VERTEX_BIT,
3388          .module = vk_shader_module_to_handle(&vs_m),
3389          .pName = "main",
3390       },
3391       {
3392          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3393          .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3394          .module = vk_shader_module_to_handle(&fs_m),
3395          .pName = "main",
3396       },
3397       {
3398          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3399          .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
3400          .module = VK_NULL_HANDLE,
3401          .pName = "main",
3402       },
3403    };
3404 
3405    if (gs_nir) {
3406       gs_m = vk_shader_module_from_nir(gs_nir);
3407       stages[2].module = vk_shader_module_to_handle(&gs_m);
3408    }
3409 
3410    VkGraphicsPipelineCreateInfo info = {
3411       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3412 
3413       .stageCount = num_stages,
3414       .pStages = stages,
3415 
3416       .pVertexInputState = vi_state,
3417 
3418       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3419          .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3420          .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3421          .primitiveRestartEnable = false,
3422       },
3423 
3424       .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3425          .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
3426          .viewportCount = 1,
3427          .scissorCount = 1,
3428       },
3429 
3430       .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
3431          .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
3432          .rasterizerDiscardEnable = false,
3433          .polygonMode = VK_POLYGON_MODE_FILL,
3434          .cullMode = VK_CULL_MODE_NONE,
3435          .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
3436          .depthBiasEnable = false,
3437       },
3438 
3439       .pMultisampleState = ms_state,
3440 
3441       .pDepthStencilState = ds_state,
3442 
3443       .pColorBlendState = cb_state,
3444 
3445       /* This meta pipeline declares its state as dynamic.
3446        * As a consequence, vkCmdBindPipeline writes no dynamic state
3447        * to the cmd buffer. Therefore, at the end of the meta operation,
3448        * we need only restore dynamic state that was vkCmdSet.
3449        */
3450       .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
3451          .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
3452          .dynamicStateCount = 6,
3453          .pDynamicStates = (VkDynamicState[]) {
3454             VK_DYNAMIC_STATE_VIEWPORT,
3455             VK_DYNAMIC_STATE_SCISSOR,
3456             VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
3457             VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
3458             VK_DYNAMIC_STATE_STENCIL_REFERENCE,
3459             VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3460             VK_DYNAMIC_STATE_DEPTH_BIAS,
3461             VK_DYNAMIC_STATE_LINE_WIDTH,
3462          },
3463       },
3464 
3465       .flags = 0,
3466       .layout = layout,
3467       .renderPass = v3dv_render_pass_to_handle(pass),
3468       .subpass = 0,
3469    };
3470 
3471    VkResult result =
3472       v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
3473                                    VK_NULL_HANDLE,
3474                                    1, &info,
3475                                    &device->vk.alloc,
3476                                    pipeline);
3477 
3478    ralloc_free(vs_nir);
3479    ralloc_free(gs_nir);
3480    ralloc_free(fs_nir);
3481 
3482    return result == VK_SUCCESS;
3483 }
3484 
3485 static enum glsl_sampler_dim
3486 get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
3487 {
3488    /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
3489     *
3490     *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
3491     *    VK_IMAGE_TYPE_2D, ..."
3492     */
3493    assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
3494 
3495    switch (type) {
3496    case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
3497    case VK_IMAGE_TYPE_2D:
3498       return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
3499                                                     GLSL_SAMPLER_DIM_MS;
3500    case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
3501    default:
3502       unreachable("Invalid image type");
3503    }
3504 }
3505 
3506 static bool
3507 create_blit_pipeline(struct v3dv_device *device,
3508                      VkFormat dst_format,
3509                      VkFormat src_format,
3510                      VkColorComponentFlags cmask,
3511                      VkImageType src_type,
3512                      VkSampleCountFlagBits dst_samples,
3513                      VkSampleCountFlagBits src_samples,
3514                      VkRenderPass _pass,
3515                      VkPipelineLayout pipeline_layout,
3516                      VkPipeline *pipeline)
3517 {
3518    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
3519 
3520    /* We always rewrite depth/stencil blits to compatible color blits */
3521    assert(vk_format_is_color(dst_format));
3522    assert(vk_format_is_color(src_format));
3523 
3524    const enum glsl_sampler_dim sampler_dim =
3525       get_sampler_dim(src_type, src_samples);
3526 
3527    nir_shader *vs_nir = get_blit_vs();
3528    nir_shader *fs_nir =
3529       get_color_blit_fs(device, dst_format, src_format,
3530                         dst_samples, src_samples, sampler_dim);
3531 
3532    const VkPipelineVertexInputStateCreateInfo vi_state = {
3533       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
3534       .vertexBindingDescriptionCount = 0,
3535       .vertexAttributeDescriptionCount = 0,
3536    };
3537 
3538    VkPipelineDepthStencilStateCreateInfo ds_state = {
3539       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
3540    };
3541 
3542    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
3543    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
3544       .blendEnable = false,
3545       .colorWriteMask = cmask,
3546    };
3547 
3548    const VkPipelineColorBlendStateCreateInfo cb_state = {
3549       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
3550       .logicOpEnable = false,
3551       .attachmentCount = 1,
3552       .pAttachments = blend_att_state
3553    };
3554 
3555    const VkPipelineMultisampleStateCreateInfo ms_state = {
3556       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
3557       .rasterizationSamples = dst_samples,
3558       .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
3559       .pSampleMask = NULL,
3560       .alphaToCoverageEnable = false,
3561       .alphaToOneEnable = false,
3562    };
3563 
3564    return create_pipeline(device,
3565                           pass,
3566                           vs_nir, NULL, fs_nir,
3567                           &vi_state,
3568                           &ds_state,
3569                           &cb_state,
3570                           &ms_state,
3571                           pipeline_layout,
3572                           pipeline);
3573 }
3574 
3575 /**
3576  * Return a pipeline suitable for blitting the requested aspect given the
3577  * destination and source formats.
3578  */
3579 static bool
3580 get_blit_pipeline(struct v3dv_device *device,
3581                   VkFormat dst_format,
3582                   VkFormat src_format,
3583                   VkColorComponentFlags cmask,
3584                   VkImageType src_type,
3585                   VkSampleCountFlagBits dst_samples,
3586                   VkSampleCountFlagBits src_samples,
3587                   struct v3dv_meta_blit_pipeline **pipeline)
3588 {
3589    bool ok = true;
3590 
3591    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
3592    get_blit_pipeline_cache_key(dst_format, src_format, cmask,
3593                                dst_samples, src_samples, key);
3594    mtx_lock(&device->meta.mtx);
3595    struct hash_entry *entry =
3596       _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
3597    if (entry) {
3598       mtx_unlock(&device->meta.mtx);
3599       *pipeline = entry->data;
3600       return true;
3601    }
3602 
3603    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
3604                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
3605 
3606    if (*pipeline == NULL)
3607       goto fail;
3608 
3609    ok = create_blit_render_pass(device, dst_format, src_format,
3610                                 &(*pipeline)->pass,
3611                                 &(*pipeline)->pass_no_load);
3612    if (!ok)
3613       goto fail;
3614 
3615    /* Create the pipeline using one of the render passes, they are both
3616     * compatible, so we don't care which one we use here.
3617     */
3618    ok = create_blit_pipeline(device,
3619                              dst_format,
3620                              src_format,
3621                              cmask,
3622                              src_type,
3623                              dst_samples,
3624                              src_samples,
3625                              (*pipeline)->pass,
3626                              device->meta.blit.p_layout,
3627                              &(*pipeline)->pipeline);
3628    if (!ok)
3629       goto fail;
3630 
3631    memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
3632    _mesa_hash_table_insert(device->meta.blit.cache[src_type],
3633                            &(*pipeline)->key, *pipeline);
3634 
3635    mtx_unlock(&device->meta.mtx);
3636    return true;
3637 
3638 fail:
3639    mtx_unlock(&device->meta.mtx);
3640 
3641    VkDevice _device = v3dv_device_to_handle(device);
3642    if (*pipeline) {
3643       if ((*pipeline)->pass)
3644          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
3645       if ((*pipeline)->pass_no_load)
3646          v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
3647       if ((*pipeline)->pipeline)
3648          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
3649       vk_free(&device->vk.alloc, *pipeline);
3650       *pipeline = NULL;
3651    }
3652 
3653    return false;
3654 }
3655 
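/**
 * Computes the clamped framebuffer box and mirroring flags for one side of
 * a blit region. Hypothetical example: offsets (64, 32) and (0, 96) on a
 * 128x128 level yield x = 0, w = 64, mirror_x = true and y = 32, h = 64,
 * mirror_y = false.
 */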
3656 static void
3657 compute_blit_box(const VkOffset3D *offsets,
3658                  uint32_t image_w, uint32_t image_h,
3659                  uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
3660                  bool *mirror_x, bool *mirror_y)
3661 {
3662    if (offsets[1].x >= offsets[0].x) {
3663       *mirror_x = false;
3664       *x = MIN2(offsets[0].x, image_w - 1);
3665       *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
3666    } else {
3667       *mirror_x = true;
3668       *x = MIN2(offsets[1].x, image_w - 1);
3669       *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
3670    }
3671    if (offsets[1].y >= offsets[0].y) {
3672       *mirror_y = false;
3673       *y = MIN2(offsets[0].y, image_h - 1);
3674       *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
3675    } else {
3676       *mirror_y = true;
3677       *y = MIN2(offsets[1].y, image_h - 1);
3678       *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
3679    }
3680 }
3681 
3682 static void
3683 compute_blit_3d_layers(const VkOffset3D *offsets,
3684                        uint32_t *min_layer, uint32_t *max_layer,
3685                        bool *mirror_z)
3686 {
3687    if (offsets[1].z >= offsets[0].z) {
3688       *mirror_z = false;
3689       *min_layer = offsets[0].z;
3690       *max_layer = offsets[1].z;
3691    } else {
3692       *mirror_z = true;
3693       *min_layer = offsets[1].z;
3694       *max_layer = offsets[0].z;
3695    }
3696 }
3697 
3698 static VkResult
3699 create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
3700 {
3701    /* If this is not the first pool we create for this command buffer,
3702     * size it based on the size of the currently exhausted pool.
3703     */
3704    uint32_t descriptor_count = 64;
3705    if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
3706       struct v3dv_descriptor_pool *exhausted_pool =
3707          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
3708       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
3709    }
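   /* Each exhausted pool thus roughly doubles the size of the next one,
    * starting at 64 descriptors and capped at 1024.
    */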
3710 
3711    /* Create the descriptor pool */
3712    cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
3713    VkDescriptorPoolSize pool_size = {
3714       .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
3715       .descriptorCount = descriptor_count,
3716    };
3717    VkDescriptorPoolCreateInfo info = {
3718       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
3719       .maxSets = descriptor_count,
3720       .poolSizeCount = 1,
3721       .pPoolSizes = &pool_size,
3722       .flags = 0,
3723    };
3724    VkResult result =
3725       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
3726                                 &info,
3727                                 &cmd_buffer->device->vk.alloc,
3728                                 &cmd_buffer->meta.blit.dspool);
3729 
3730    if (result == VK_SUCCESS) {
3731       assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3732       const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;
3733 
3734       v3dv_cmd_buffer_add_private_obj(
3735          cmd_buffer, (uintptr_t) _pool,
3736          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
3737 
3738       struct v3dv_descriptor_pool *pool =
3739          v3dv_descriptor_pool_from_handle(_pool);
3740       pool->is_driver_internal = true;
3741    }
3742 
3743    return result;
3744 }
3745 
3746 static VkResult
3747 allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
3748                                     VkDescriptorSet *set)
3749 {
3750    /* Make sure we have a descriptor pool */
3751    VkResult result;
3752    if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
3753       result = create_blit_descriptor_pool(cmd_buffer);
3754       if (result != VK_SUCCESS)
3755          return result;
3756    }
3757    assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
3758 
3759    /* Allocate descriptor set */
3760    struct v3dv_device *device = cmd_buffer->device;
3761    VkDevice _device = v3dv_device_to_handle(device);
3762    VkDescriptorSetAllocateInfo info = {
3763       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
3764       .descriptorPool = cmd_buffer->meta.blit.dspool,
3765       .descriptorSetCount = 1,
3766       .pSetLayouts = &device->meta.blit.ds_layout,
3767    };
3768    result = v3dv_AllocateDescriptorSets(_device, &info, set);
3769 
3770    /* If we ran out of pool space, grow the pool and try again */
3771    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
3772       result = create_blit_descriptor_pool(cmd_buffer);
3773       if (result == VK_SUCCESS) {
3774          info.descriptorPool = cmd_buffer->meta.blit.dspool;
3775          result = v3dv_AllocateDescriptorSets(_device, &info, set);
3776       }
3777    }
3778 
3779    return result;
3780 }
3781 
3782 /**
3783  * Returns true if the implementation supports the requested operation (even if
3784  * it failed to process it, for example, due to an out-of-memory error).
3785  *
3786  * The caller can specify the channels on the destination to be written via the
3787  * cmask parameter (which can be 0 to default to all channels), as well as a
3788  * swizzle to apply to the source via the cswizzle parameter (which can be NULL
3789  * to use the default identity swizzle).
3790  */
3791 static bool
3792 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
3793             struct v3dv_image *dst,
3794             VkFormat dst_format,
3795             struct v3dv_image *src,
3796             VkFormat src_format,
3797             VkColorComponentFlags cmask,
3798             VkComponentMapping *cswizzle,
3799             const VkImageBlit2 *region,
3800             VkFilter filter,
3801             bool dst_is_padded_image)
3802 {
3803    bool handled = true;
3804    VkResult result;
3805    uint32_t dirty_dynamic_state = 0;
3806 
3807    /* We don't support rendering to linear depth/stencil; this should have
3808     * been rewritten to a compatible color blit by the caller.
3809     */
3810    assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR ||
3811           !vk_format_is_depth_or_stencil(dst_format));
3812 
3813    /* Can't sample from linear images */
3814    if (src->vk.tiling == VK_IMAGE_TILING_LINEAR &&
3815        src->vk.image_type != VK_IMAGE_TYPE_1D) {
3816       return false;
3817    }
3818 
3819    /* Rewrite combined D/S blits to compatible color blits */
3820    if (vk_format_is_depth_or_stencil(dst_format)) {
3821       assert(src_format == dst_format);
3822       assert(cmask == 0);
3823       switch(dst_format) {
3824       case VK_FORMAT_D16_UNORM:
3825          dst_format = VK_FORMAT_R16_UINT;
3826          break;
3827       case VK_FORMAT_D32_SFLOAT:
3828          dst_format = VK_FORMAT_R32_UINT;
3829          break;
3830       case VK_FORMAT_X8_D24_UNORM_PACK32:
3831       case VK_FORMAT_D24_UNORM_S8_UINT:
3832          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3833             cmask |= VK_COLOR_COMPONENT_G_BIT |
3834                      VK_COLOR_COMPONENT_B_BIT |
3835                      VK_COLOR_COMPONENT_A_BIT;
3836          }
3837          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3838             assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
3839             cmask |= VK_COLOR_COMPONENT_R_BIT;
3840          }
3841          dst_format = VK_FORMAT_R8G8B8A8_UINT;
3842          break;
3843       default:
3844          unreachable("Unsupported depth/stencil format");
3845       };
3846       src_format = dst_format;
3847    }
3848 
3849    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
3850                                             VK_COLOR_COMPONENT_G_BIT |
3851                                             VK_COLOR_COMPONENT_B_BIT |
3852                                             VK_COLOR_COMPONENT_A_BIT;
3853    if (cmask == 0)
3854       cmask = full_cmask;
3855 
3856    VkComponentMapping ident_swizzle = {
3857       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3858       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3859       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
3860       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
3861    };
3862    if (!cswizzle)
3863       cswizzle = &ident_swizzle;
3864 
3865    /* When we get here from a copy between compressed / uncompressed images
3866     * we choose to specify the destination blit region based on the size
3867     * semantics of the source image of the copy (see copy_image_blit), so we
3868     * need to apply those same semantics here when we compute the size of the
3869     * destination image level.
3870     */
3871    const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format);
3872    const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format);
3873    const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format);
3874    const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format);
3875    const uint32_t dst_level_w =
3876       u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
3877                region->dstSubresource.mipLevel);
3878    const uint32_t dst_level_h =
3879       u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
3880                region->dstSubresource.mipLevel);
3881 
3882    const uint32_t src_level_w =
3883       u_minify(src->vk.extent.width, region->srcSubresource.mipLevel);
3884    const uint32_t src_level_h =
3885       u_minify(src->vk.extent.height, region->srcSubresource.mipLevel);
3886    const uint32_t src_level_d =
3887       u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel);
3888 
3889    uint32_t dst_x, dst_y, dst_w, dst_h;
3890    bool dst_mirror_x, dst_mirror_y;
3891    compute_blit_box(region->dstOffsets,
3892                     dst_level_w, dst_level_h,
3893                     &dst_x, &dst_y, &dst_w, &dst_h,
3894                     &dst_mirror_x, &dst_mirror_y);
3895 
3896    uint32_t src_x, src_y, src_w, src_h;
3897    bool src_mirror_x, src_mirror_y;
3898    compute_blit_box(region->srcOffsets,
3899                     src_level_w, src_level_h,
3900                     &src_x, &src_y, &src_w, &src_h,
3901                     &src_mirror_x, &src_mirror_y);
3902 
3903    uint32_t min_dst_layer;
3904    uint32_t max_dst_layer;
3905    bool dst_mirror_z = false;
3906    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
3907       min_dst_layer = region->dstSubresource.baseArrayLayer;
3908       max_dst_layer = min_dst_layer + region->dstSubresource.layerCount;
3909    } else {
3910       compute_blit_3d_layers(region->dstOffsets,
3911                              &min_dst_layer, &max_dst_layer,
3912                              &dst_mirror_z);
3913    }
3914 
3915    uint32_t min_src_layer;
3916    uint32_t max_src_layer;
3917    bool src_mirror_z = false;
3918    if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
3919       min_src_layer = region->srcSubresource.baseArrayLayer;
3920       max_src_layer = min_src_layer + region->srcSubresource.layerCount;
3921    } else {
3922       compute_blit_3d_layers(region->srcOffsets,
3923                              &min_src_layer, &max_src_layer,
3924                              &src_mirror_z);
3925    }
3926 
3927    uint32_t layer_count = max_dst_layer - min_dst_layer;
3928 
3929    /* Translate source blit coordinates to normalized texture coordinates for
3930     * single sampled textures. For multisampled textures we require
3931     * unnormalized coordinates, since we can only do texelFetch on them.
3932     */
3933    float coords[4] =  {
3934       (float)src_x,
3935       (float)src_y,
3936       (float)(src_x + src_w),
3937       (float)(src_y + src_h),
3938    };
3939 
3940    if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) {
3941       coords[0] /= (float)src_level_w;
3942       coords[1] /= (float)src_level_h;
3943       coords[2] /= (float)src_level_w;
3944       coords[3] /= (float)src_level_h;
3945    }
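   /* For example (illustrative numbers): blitting the left half of a
    * single-sampled 256x128 source level (src_x = 0, src_y = 0, src_w = 128,
    * src_h = 128) yields normalized coords {0.0, 0.0, 0.5, 1.0}, while a
    * multisampled source keeps the raw texel coordinates for texelFetch.
    */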
3946 
3947    /* Handle mirroring */
3948    const bool mirror_x = dst_mirror_x != src_mirror_x;
3949    const bool mirror_y = dst_mirror_y != src_mirror_y;
3950    const bool mirror_z = dst_mirror_z != src_mirror_z;
3951    float tex_coords[5] = {
3952       !mirror_x ? coords[0] : coords[2],
3953       !mirror_y ? coords[1] : coords[3],
3954       !mirror_x ? coords[2] : coords[0],
3955       !mirror_y ? coords[3] : coords[1],
3956       /* Z coordinate for 3D blit sources, to be filled for each
3957        * destination layer
3958        */
3959       0.0f
3960    };
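   /* The blit mirrors along an axis when exactly one of the source or
    * destination boxes is flagged as mirrored for it, e.g. if only the
    * srcOffsets are reversed in X then mirror_x is set and the two U
    * endpoints above are swapped, flipping the image horizontally.
    */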
3961 
3962    /* For blits from 3D images we also need to compute the slice coordinate to
3963     * sample from, which will change for each layer in the destination.
3964     * Compute the step we should increase for each iteration.
3965     */
3966    const float src_z_step =
3967       (float)(max_src_layer - min_src_layer) / (float)layer_count;
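   /* Illustrative example (assumed sizes): blitting a 32-slice source range
    * onto 8 destination layers gives src_z_step = 4.0, so each destination
    * layer samples the source at the center of its 4-slice span (see the
    * per-layer tex_coords[4] computation below).
    */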
3968 
3969    /* Get the blit pipeline */
3970    struct v3dv_meta_blit_pipeline *pipeline = NULL;
3971    bool ok = get_blit_pipeline(cmd_buffer->device,
3972                                dst_format, src_format, cmask, src->vk.image_type,
3973                                dst->vk.samples, src->vk.samples,
3974                                &pipeline);
3975    if (!ok)
3976       return handled;
3977    assert(pipeline && pipeline->pipeline &&
3978           pipeline->pass && pipeline->pass_no_load);
3979 
3980    struct v3dv_device *device = cmd_buffer->device;
3981    assert(device->meta.blit.ds_layout);
3982 
3983    VkDevice _device = v3dv_device_to_handle(device);
3984    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
3985 
3986    /* Create sampler for blit source image */
3987    VkSamplerCreateInfo sampler_info = {
3988       .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
3989       .magFilter = filter,
3990       .minFilter = filter,
3991       .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3992       .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3993       .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
3994       .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
3995    };
3996    VkSampler sampler;
3997    result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
3998                                &sampler);
3999    if (result != VK_SUCCESS)
4000       goto fail;
4001 
4002    v3dv_cmd_buffer_add_private_obj(
4003       cmd_buffer, (uintptr_t)sampler,
4004       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);
4005 
4006    /* Push command buffer state before starting meta operation */
4007    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
4008 
4009    /* Push state that is common for all layers */
4010    v3dv_CmdBindPipeline(_cmd_buffer,
4011                         VK_PIPELINE_BIND_POINT_GRAPHICS,
4012                         pipeline->pipeline);
4013 
4014    const VkViewport viewport = {
4015       .x = dst_x,
4016       .y = dst_y,
4017       .width = dst_w,
4018       .height = dst_h,
4019       .minDepth = 0.0f,
4020       .maxDepth = 1.0f
4021    };
4022    v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
4023 
4024    const VkRect2D scissor = {
4025       .offset = { dst_x, dst_y },
4026       .extent = { dst_w, dst_h }
4027    };
4028    v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
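   /* Both viewport and scissor are limited to the destination rectangle, so
    * the per-layer draws below can only write pixels inside the blit region.
    */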
4029 
4030    bool can_skip_tlb_load = false;
4031    const VkRect2D render_area = {
4032       .offset = { dst_x, dst_y },
4033       .extent = { dst_w, dst_h },
4034    };
4035 
4036    /* Record per-layer commands */
4037    for (uint32_t i = 0; i < layer_count; i++) {
4038       /* Setup framebuffer */
4039       VkImageViewCreateInfo dst_image_view_info = {
4040          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4041          .image = v3dv_image_to_handle(dst),
4042          .viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
4043          .format = dst_format,
4044          .subresourceRange = {
4045             .aspectMask = region->dstSubresource.aspectMask,
4046             .baseMipLevel = region->dstSubresource.mipLevel,
4047             .levelCount = 1,
4048             .baseArrayLayer = min_dst_layer + i,
4049             .layerCount = 1
4050          },
4051       };
4052       VkImageView dst_image_view;
4053       result = v3dv_create_image_view(device, &dst_image_view_info,
4054                                       &dst_image_view);
4055       if (result != VK_SUCCESS)
4056          goto fail;
4057 
4058       v3dv_cmd_buffer_add_private_obj(
4059          cmd_buffer, (uintptr_t)dst_image_view,
4060          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4061 
4062       VkFramebufferCreateInfo fb_info = {
4063          .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
4064          .renderPass = pipeline->pass,
4065          .attachmentCount = 1,
4066          .pAttachments = &dst_image_view,
4067          .width = dst_x + dst_w,
4068          .height = dst_y + dst_h,
4069          .layers = 1,
4070       };
4071 
4072       VkFramebuffer fb;
4073       result = v3dv_CreateFramebuffer(_device, &fb_info,
4074                                       &cmd_buffer->device->vk.alloc, &fb);
4075       if (result != VK_SUCCESS)
4076          goto fail;
4077 
4078       struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
4079       framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
4080                                       fb_info.height == dst_level_h &&
4081                                       dst_is_padded_image;
4082 
4083       v3dv_cmd_buffer_add_private_obj(
4084          cmd_buffer, (uintptr_t)fb,
4085          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
4086 
4087       /* Setup descriptor set for blit source texture. We don't have to
4088        * register the descriptor as a private command buffer object since
4089        * all descriptors will be freed automatically with the descriptor
4090        * pool.
4091        */
4092       VkDescriptorSet set;
4093       result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
4094       if (result != VK_SUCCESS)
4095          goto fail;
4096 
4097       VkImageViewCreateInfo src_image_view_info = {
4098          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4099          .image = v3dv_image_to_handle(src),
4100          .viewType = v3dv_image_type_to_view_type(src->vk.image_type),
4101          .format = src_format,
4102          .components = *cswizzle,
4103          .subresourceRange = {
4104             .aspectMask = region->srcSubresource.aspectMask,
4105             .baseMipLevel = region->srcSubresource.mipLevel,
4106             .levelCount = 1,
4107             .baseArrayLayer =
4108                src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
4109             .layerCount = 1
4110          },
4111       };
4112       VkImageView src_image_view;
4113       result = v3dv_create_image_view(device, &src_image_view_info,
4114                                       &src_image_view);
4115       if (result != VK_SUCCESS)
4116          goto fail;
4117 
4118       v3dv_cmd_buffer_add_private_obj(
4119          cmd_buffer, (uintptr_t)src_image_view,
4120          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4121 
4122       VkDescriptorImageInfo image_info = {
4123          .sampler = sampler,
4124          .imageView = src_image_view,
4125          .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
4126       };
4127       VkWriteDescriptorSet write = {
4128          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
4129          .dstSet = set,
4130          .dstBinding = 0,
4131          .dstArrayElement = 0,
4132          .descriptorCount = 1,
4133          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4134          .pImageInfo = &image_info,
4135       };
4136       v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
4137 
4138       v3dv_CmdBindDescriptorSets(_cmd_buffer,
4139                                  VK_PIPELINE_BIND_POINT_GRAPHICS,
4140                                  device->meta.blit.p_layout,
4141                                  0, 1, &set,
4142                                  0, NULL);
4143 
4144       /* If the region we are about to blit is tile-aligned, then we can
4145        * use the render pass version that won't pre-load the tile buffer
4146        * with the dst image contents before the blit. The exception is when we
4147        * don't have a full color mask, since in that case we need to preserve
4148        * the original value of some of the color components.
4149        *
4150        * Since all layers have the same area, we only need to compute this for
4151        * the first.
4152        */
4153       if (i == 0) {
4154          struct v3dv_render_pass *pipeline_pass =
4155             v3dv_render_pass_from_handle(pipeline->pass);
4156          can_skip_tlb_load =
4157             cmask == full_cmask &&
4158             v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
4159                                               framebuffer, pipeline_pass, 0);
4160       }
4161 
4162       /* Record blit */
4163       VkRenderPassBeginInfo rp_info = {
4164          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4165          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
4166                                            pipeline->pass,
4167          .framebuffer = fb,
4168          .renderArea = render_area,
4169          .clearValueCount = 0,
4170       };
4171 
4172       VkSubpassBeginInfo sp_info = {
4173          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
4174          .contents = VK_SUBPASS_CONTENTS_INLINE,
4175       };
4176 
4177       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
4178       struct v3dv_job *job = cmd_buffer->state.job;
4179       if (!job)
4180          goto fail;
4181 
4182       /* For 3D blits we need to compute the source slice to blit from (the Z
4183        * coordinate of the source sample operation). We want to choose this
4184        * based on the ratio of the depth of the source and the destination
4185        * images, picking the coordinate in the middle of each step.
4186        */
4187       if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
4188          tex_coords[4] =
4189             !mirror_z ?
4190             (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
4191             (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
4192       }
4193 
4194       v3dv_CmdPushConstants(_cmd_buffer,
4195                             device->meta.blit.p_layout,
4196                             VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
4197                             &tex_coords);
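      /* The 20-byte push constant range covers the five floats in
       * tex_coords: the min/max U/V of the source rectangle plus the Z
       * slice coordinate used for 3D sources.
       */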
4198 
4199       v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
4200 
4201       VkSubpassEndInfo sp_end_info = {
4202          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
4203       };
4204 
4205       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
4206       dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
4207    }
4208 
4209 fail:
4210    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);
4211 
4212    return handled;
4213 }
4214 
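/* Example of an application-side call that reaches the entry point below
 * (illustrative values only): a 2D color blit downscaling a 256x256 region
 * into a 128x128 one, which is then handled per-region either by blit_tfu
 * or by blit_shader:
 *
 *    VkImageBlit2 region = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
 *       .srcSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .srcOffsets = { { 0, 0, 0 }, { 256, 256, 1 } },
 *       .dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .dstOffsets = { { 0, 0, 0 }, { 128, 128, 1 } },
 *    };
 *    VkBlitImageInfo2 blit_info = {
 *       .sType = VK_STRUCTURE_TYPE_BLIT_IMAGE_INFO_2,
 *       .srcImage = src_image,
 *       .srcImageLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
 *       .dstImage = dst_image,
 *       .dstImageLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 *       .regionCount = 1,
 *       .pRegions = &region,
 *       .filter = VK_FILTER_LINEAR,
 *    };
 *    vkCmdBlitImage2(cmd, &blit_info);
 */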
4215 VKAPI_ATTR void VKAPI_CALL
4216 v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer,
4217                       const VkBlitImageInfo2 *pBlitImageInfo)
4218 {
4219    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4220    V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
4221    V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
4222 
4223     /* This command can only happen outside a render pass */
4224    assert(cmd_buffer->state.pass == NULL);
4225    assert(cmd_buffer->state.job == NULL);
4226 
4227    /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
4228    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
4229           src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4230 
4231    /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
4232    assert(!vk_format_is_compressed(dst->vk.format));
4233 
4234    cmd_buffer->state.is_transfer = true;
4235 
4236    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
4237       if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i]))
4238          continue;
4239       if (blit_shader(cmd_buffer,
4240                       dst, dst->vk.format,
4241                       src, src->vk.format,
4242                       0, NULL,
4243                       &pBlitImageInfo->pRegions[i],
4244                       pBlitImageInfo->filter, true)) {
4245          continue;
4246       }
4247       unreachable("Unsupported blit operation");
4248    }
4249 
4250    cmd_buffer->state.is_transfer = false;
4251 }
4252 
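/* Fast path: resolve through the tile buffer (TLB). This is only attempted
 * when both regions are TLB-compatible (v3dv_meta_can_use_tlb) and the
 * source format supports TLB resolves (format_supports_tlb_resolve);
 * otherwise v3dv_CmdResolveImage2KHR falls back to resolve_image_blit.
 */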
4253 static bool
4254 resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
4255                   struct v3dv_image *dst,
4256                   struct v3dv_image *src,
4257                   const VkImageResolve2 *region)
4258 {
4259    if (!v3dv_meta_can_use_tlb(src, &region->srcOffset, NULL) ||
4260        !v3dv_meta_can_use_tlb(dst, &region->dstOffset, NULL)) {
4261       return false;
4262    }
4263 
4264    if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format))
4265       return false;
4266 
4267    const VkFormat fb_format = src->vk.format;
4268 
4269    uint32_t num_layers;
4270    if (dst->vk.image_type != VK_IMAGE_TYPE_3D)
4271       num_layers = region->dstSubresource.layerCount;
4272    else
4273       num_layers = region->extent.depth;
4274    assert(num_layers > 0);
4275 
4276    struct v3dv_job *job =
4277       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
4278    if (!job)
4279       return true;
4280 
4281    const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format);
4282    const uint32_t block_h = vk_format_get_blockheight(dst->vk.format);
4283    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
4284    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
4285 
4286    uint32_t internal_type, internal_bpp;
4287    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
4288       (fb_format, region->srcSubresource.aspectMask,
4289        &internal_type, &internal_bpp);
4290 
4291    v3dv_job_start_frame(job, width, height, num_layers, false,
4292                         1, internal_bpp, true);
4293 
4294    struct v3dv_meta_framebuffer framebuffer;
4295    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
4296                                               internal_type, &job->frame_tiling);
4297 
4298    v3dv_X(job->device, job_emit_binning_flush)(job);
4299    v3dv_X(job->device, meta_emit_resolve_image_rcl)(job, dst, src,
4300                                                     &framebuffer, region);
4301 
4302    v3dv_cmd_buffer_finish_job(cmd_buffer);
4303    return true;
4304 }
4305 
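/* Fallback path: express the resolve as a same-sized VkImageBlit2 over the
 * region and run it through the shader blit with nearest filtering, since
 * source and destination extents are identical.
 */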
4306 static bool
4307 resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
4308                    struct v3dv_image *dst,
4309                    struct v3dv_image *src,
4310                    const VkImageResolve2 *region)
4311 {
4312    const VkImageBlit2 blit_region = {
4313       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
4314       .srcSubresource = region->srcSubresource,
4315       .srcOffsets = {
4316          region->srcOffset,
4317          {
4318             region->srcOffset.x + region->extent.width,
4319             region->srcOffset.y + region->extent.height,
4320          }
4321       },
4322       .dstSubresource = region->dstSubresource,
4323       .dstOffsets = {
4324          region->dstOffset,
4325          {
4326             region->dstOffset.x + region->extent.width,
4327             region->dstOffset.y + region->extent.height,
4328          }
4329       },
4330    };
4331    return blit_shader(cmd_buffer,
4332                       dst, dst->vk.format,
4333                       src, src->vk.format,
4334                       0, NULL,
4335                       &blit_region, VK_FILTER_NEAREST, true);
4336 }
4337 
4338 VKAPI_ATTR void VKAPI_CALL
4339 v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer,
4340                          const VkResolveImageInfo2 *info)
4341 
4342 {
4343    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4344    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
4345    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
4346 
4347     /* This command can only happen outside a render pass */
4348    assert(cmd_buffer->state.pass == NULL);
4349    assert(cmd_buffer->state.job == NULL);
4350 
4351    assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
4352    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4353 
4354    cmd_buffer->state.is_transfer = true;
4355 
4356    for (uint32_t i = 0; i < info->regionCount; i++) {
4357       if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
4358          continue;
4359       if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
4360          continue;
4361       unreachable("Unsupported multisample resolve operation");
4362    }
4363 
4364    cmd_buffer->state.is_transfer = false;
4365 }
4366