1 /*
2  * Copyright © 2019 Raspberry Pi
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 
26 #include "compiler/nir/nir_builder.h"
27 #include "broadcom/cle/v3dx_pack.h"
28 #include "vk_format_info.h"
29 #include "util/u_pack_color.h"
30 
31 static uint32_t
32 meta_blit_key_hash(const void *key)
33 {
34    return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
35 }
36 
37 static bool
38 meta_blit_key_compare(const void *key1, const void *key2)
39 {
40    return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
41 }
42 
43 void
44 v3dv_meta_blit_init(struct v3dv_device *device)
45 {
46    for (uint32_t i = 0; i < 3; i++) {
47       device->meta.blit.cache[i] =
48          _mesa_hash_table_create(NULL,
49                                  meta_blit_key_hash,
50                                  meta_blit_key_compare);
51    }
52 }
53 
54 void
55 v3dv_meta_blit_finish(struct v3dv_device *device)
56 {
57    VkDevice _device = v3dv_device_to_handle(device);
58 
59    for (uint32_t i = 0; i < 3; i++) {
60       hash_table_foreach(device->meta.blit.cache[i], entry) {
61          struct v3dv_meta_blit_pipeline *item = entry->data;
62          v3dv_DestroyPipeline(_device, item->pipeline, &device->alloc);
63          v3dv_DestroyRenderPass(_device, item->pass, &device->alloc);
64          v3dv_DestroyRenderPass(_device, item->pass_no_load, &device->alloc);
65          vk_free(&device->alloc, item);
66       }
67       _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
68    }
69 
70    if (device->meta.blit.playout) {
71       v3dv_DestroyPipelineLayout(_device, device->meta.blit.playout,
72                                  &device->alloc);
73    }
74 
75    if (device->meta.blit.dslayout) {
76       v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.dslayout,
77                                       &device->alloc);
78    }
79 }
80 
81 static inline bool
82 can_use_tlb(struct v3dv_image *image,
83             const VkOffset3D *offset,
84             VkFormat *compat_format);
85 
86 /**
87  * Copy operations implemented in this file don't operate on a framebuffer
88  * object provided by the user. However, since most of them use the TLB,
89  * we still need some representation of the framebuffer. For the most
90  * part, the job's frame tiling information is enough, but we also need
91  * additional information, such as the internal type of our single
92  * render target, so we use this auxiliary struct to pass that information
93  * around.
94  */
95 struct framebuffer_data {
96    /* The internal type of the single render target */
97    uint32_t internal_type;
98 
99    /* Supertile coverage */
100    uint32_t min_x_supertile;
101    uint32_t min_y_supertile;
102    uint32_t max_x_supertile;
103    uint32_t max_y_supertile;
104 
105    /* Format info */
106    VkFormat vk_format;
107    const struct v3dv_format *format;
108    uint8_t internal_depth_type;
109 };
110 
111 static void
112 setup_framebuffer_data(struct framebuffer_data *fb,
113                        VkFormat vk_format,
114                        uint32_t internal_type,
115                        const struct v3dv_frame_tiling *tiling)
116 {
117    fb->internal_type = internal_type;
118 
119    /* Supertile coverage always starts at 0,0 */
120    uint32_t supertile_w_in_pixels =
121       tiling->tile_width * tiling->supertile_width;
122    uint32_t supertile_h_in_pixels =
123       tiling->tile_height * tiling->supertile_height;
124 
125    fb->min_x_supertile = 0;
126    fb->min_y_supertile = 0;
127    fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels;
128    fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels;
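   /* Worked example (illustrative numbers only): with 64x64-pixel tiles
    * grouped into 2x2-tile supertiles, each supertile spans 128x128 pixels,
    * so a 1920x1080 frame covers supertiles [0..14] x [0..8], since
    * (1920 - 1) / 128 = 14 and (1080 - 1) / 128 = 8.
    */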
129 
130    fb->vk_format = vk_format;
131    fb->format = v3dv_get_format(vk_format);
132 
133    fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F;
134    if (vk_format_is_depth_or_stencil(vk_format))
135       fb->internal_depth_type = v3dv_get_internal_depth_type(vk_format);
136 }
137 
138 /* This chooses a tile buffer format that is appropriate for the copy operation.
139  * Typically, this is the image render target type; however, if we are copying
140  * depth/stencil to/from a buffer, the hardware can't do raster loads/stores, so
141  * we need to load and store to/from a tile color buffer using a compatible
142  * color format.
143  */
144 static uint32_t
145 choose_tlb_format(struct framebuffer_data *framebuffer,
146                   VkImageAspectFlags aspect,
147                   bool for_store,
148                   bool is_copy_to_buffer,
149                   bool is_copy_from_buffer)
150 {
151    if (is_copy_to_buffer || is_copy_from_buffer) {
152       switch (framebuffer->vk_format) {
153       case VK_FORMAT_D16_UNORM:
154          return V3D_OUTPUT_IMAGE_FORMAT_R16UI;
155       case VK_FORMAT_D32_SFLOAT:
156          return V3D_OUTPUT_IMAGE_FORMAT_R32F;
157       case VK_FORMAT_X8_D24_UNORM_PACK32:
158          return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
159       case VK_FORMAT_D24_UNORM_S8_UINT:
160          /* When storing the stencil aspect of a combined depth/stencil image
161           * to a buffer, the Vulkan spec states that the output buffer must
162           * have packed stencil values, so we choose an R8UI format for our
163           * store outputs. For the load input we still want RGBA8UI since the
164           * source image contains 4 channels (including the 3 channels
165           * containing the 24-bit depth value).
166           *
167           * When loading the stencil aspect of a combined depth/stencil image
168           * from a buffer, we read packed 8-bit stencil values from the buffer
169           * that we need to put into the LSB of the 32-bit format (the R
170           * channel), so we use R8UI. For the store, if we used R8UI then we
171           * would write 8-bit stencil values consecutively over depth channels,
172           * so we need to use RGBA8UI. This will write each stencil value in
173           * its correct position, but will overwrite depth values (channels G,
174           * B, A) with undefined values. To fix this, we will have to restore
175           * the depth aspect from the Z tile buffer, which we should pre-load
176           * from the image before the store.
177           */
178          if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) {
179             return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
180          } else {
181             assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT);
182             if (is_copy_to_buffer) {
183                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI :
184                                   V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
185             } else {
186                assert(is_copy_from_buffer);
187                return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI :
188                                   V3D_OUTPUT_IMAGE_FORMAT_R8UI;
189             }
190          }
191       default: /* Color formats */
192          return framebuffer->format->rt_type;
193          break;
194       }
195    } else {
196       return framebuffer->format->rt_type;
197    }
198 }
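/* Example derived from the logic above (for illustration): copying the
 * stencil aspect of a VK_FORMAT_D24_UNORM_S8_UINT image to a buffer loads
 * the tile buffer as RGBA8UI but stores it as R8UI, so the buffer receives
 * tightly packed 8-bit stencil values as the Vulkan spec requires.
 */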
199 
200 static inline bool
201 format_needs_rb_swap(VkFormat format)
202 {
203    const uint8_t *swizzle = v3dv_get_format_swizzle(format);
204    return swizzle[0] == PIPE_SWIZZLE_Z;
205 }
206 
207 static void
208 get_internal_type_bpp_for_image_aspects(VkFormat vk_format,
209                                         VkImageAspectFlags aspect_mask,
210                                         uint32_t *internal_type,
211                                         uint32_t *internal_bpp)
212 {
213    const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
214                                          VK_IMAGE_ASPECT_STENCIL_BIT;
215 
216    /* We can't store depth/stencil pixel formats to a raster format, so
217     * instead we load our depth/stencil aspects to a compatible color
218     * format.
219     */
220    /* FIXME: pre-compute this at image creation time? */
221    if (aspect_mask & ds_aspects) {
222       switch (vk_format) {
223       case VK_FORMAT_D16_UNORM:
224          *internal_type = V3D_INTERNAL_TYPE_16UI;
225          *internal_bpp = V3D_INTERNAL_BPP_64;
226          break;
227       case VK_FORMAT_D32_SFLOAT:
228          *internal_type = V3D_INTERNAL_TYPE_32F;
229          *internal_bpp = V3D_INTERNAL_BPP_128;
230          break;
231       case VK_FORMAT_X8_D24_UNORM_PACK32:
232       case VK_FORMAT_D24_UNORM_S8_UINT:
233          /* Use RGBA8 format so we can relocate the X/S bits in the appropriate
234           * place to match Vulkan expectations. See the comment on the tile
235           * load command for more details.
236           */
237          *internal_type = V3D_INTERNAL_TYPE_8UI;
238          *internal_bpp = V3D_INTERNAL_BPP_32;
239          break;
240       default:
241          assert(!"unsupported format");
242          break;
243       }
244    } else {
245       const struct v3dv_format *format = v3dv_get_format(vk_format);
246       v3dv_get_internal_type_bpp_for_output_format(format->rt_type,
247                                                    internal_type,
248                                                    internal_bpp);
249    }
250 }
251 
252 struct rcl_clear_info {
253    const union v3dv_clear_value *clear_value;
254    struct v3dv_image *image;
255    VkImageAspectFlags aspects;
256    uint32_t layer;
257    uint32_t level;
258 };
259 
260 static struct v3dv_cl *
261 emit_rcl_prologue(struct v3dv_job *job,
262                   struct framebuffer_data *fb,
263                   const struct rcl_clear_info *clear_info)
264 {
265    const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
266 
267    struct v3dv_cl *rcl = &job->rcl;
268    v3dv_cl_ensure_space_with_branch(rcl, 200 +
269                                     tiling->layers * 256 *
270                                     cl_packet_length(SUPERTILE_COORDINATES));
271    if (job->cmd_buffer->state.oom)
272       return NULL;
273 
274    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
275       config.early_z_disable = true;
276       config.image_width_pixels = tiling->width;
277       config.image_height_pixels = tiling->height;
278       config.number_of_render_targets = 1;
279       config.multisample_mode_4x = tiling->msaa;
280       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
281       config.internal_depth_type = fb->internal_depth_type;
282    }
283 
284    if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
285       uint32_t clear_pad = 0;
286       if (clear_info->image) {
287          const struct v3dv_image *image = clear_info->image;
288          const struct v3d_resource_slice *slice =
289             &image->slices[clear_info->level];
290          if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
291              slice->tiling == VC5_TILING_UIF_XOR) {
292             int uif_block_height = v3d_utile_height(image->cpp) * 2;
293 
294             uint32_t implicit_padded_height =
295                align(tiling->height, uif_block_height) / uif_block_height;
296 
297             if (slice->padded_height_of_output_image_in_uif_blocks -
298                 implicit_padded_height >= 15) {
299                clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
300             }
301          }
302       }
303 
304       const uint32_t *color = &clear_info->clear_value->color[0];
305       cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
306          clear.clear_color_low_32_bits = color[0];
307          clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
308          clear.render_target_number = 0;
309       };
310 
311       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
312          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
313             clear.clear_color_mid_low_32_bits =
314               ((color[1] >> 24) | (color[2] << 8));
315             clear.clear_color_mid_high_24_bits =
316               ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
317             clear.render_target_number = 0;
318          };
319       }
320 
321       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
322          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
323             clear.uif_padded_height_in_uif_blocks = clear_pad;
324             clear.clear_color_high_16_bits = color[3] >> 16;
325             clear.render_target_number = 0;
326          };
327       }
328    }
329 
330    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
331       rt.render_target_0_internal_bpp = tiling->internal_bpp;
332       rt.render_target_0_internal_type = fb->internal_type;
333       rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
334    }
335 
336    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
337       clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
338       clear.stencil_clear_value = clear_info ? clear_info->clear_value->s : 0;
339    };
340 
341    cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
342       init.use_auto_chained_tile_lists = true;
343       init.size_of_first_block_in_chained_tile_lists =
344          TILE_ALLOCATION_BLOCK_SIZE_64B;
345    }
346 
347    return rcl;
348 }
349 
350 static void
351 emit_frame_setup(struct v3dv_job *job,
352                  uint32_t layer,
353                  const union v3dv_clear_value *clear_value)
354 {
355    v3dv_return_if_oom(NULL, job);
356 
357    const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
358 
359    struct v3dv_cl *rcl = &job->rcl;
360 
361    const uint32_t tile_alloc_offset =
362       64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
363    cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
364       list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
365    }
366 
367    cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
368       config.number_of_bin_tile_lists = 1;
369       config.total_frame_width_in_tiles = tiling->draw_tiles_x;
370       config.total_frame_height_in_tiles = tiling->draw_tiles_y;
371 
372       config.supertile_width_in_tiles = tiling->supertile_width;
373       config.supertile_height_in_tiles = tiling->supertile_height;
374 
375       config.total_frame_width_in_supertiles =
376          tiling->frame_width_in_supertiles;
377       config.total_frame_height_in_supertiles =
378          tiling->frame_height_in_supertiles;
379    }
380 
381    /* Implement GFXH-1742 workaround. Also, if we are clearing we have to do
382     * it here.
383     */
384    for (int i = 0; i < 2; i++) {
385       cl_emit(rcl, TILE_COORDINATES, coords);
386       cl_emit(rcl, END_OF_LOADS, end);
387       cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
388          store.buffer_to_store = NONE;
389       }
390       if (clear_value && i == 0) {
391          cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
392             clear.clear_z_stencil_buffer = true;
393             clear.clear_all_render_targets = true;
394          }
395       }
396       cl_emit(rcl, END_OF_TILE_MARKER, end);
397    }
398 
399    cl_emit(rcl, FLUSH_VCD_CACHE, flush);
400 }
401 
402 static void
403 emit_supertile_coordinates(struct v3dv_job *job,
404                            struct framebuffer_data *framebuffer)
405 {
406    v3dv_return_if_oom(NULL, job);
407 
408    struct v3dv_cl *rcl = &job->rcl;
409 
410    const uint32_t min_y = framebuffer->min_y_supertile;
411    const uint32_t max_y = framebuffer->max_y_supertile;
412    const uint32_t min_x = framebuffer->min_x_supertile;
413    const uint32_t max_x = framebuffer->max_x_supertile;
414 
415    for (int y = min_y; y <= max_y; y++) {
416       for (int x = min_x; x <= max_x; x++) {
417          cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
418             coords.column_number_in_supertiles = x;
419             coords.row_number_in_supertiles = y;
420          }
421       }
422    }
423 }
424 
425 static void
426 emit_linear_load(struct v3dv_cl *cl,
427                  uint32_t buffer,
428                  struct v3dv_bo *bo,
429                  uint32_t offset,
430                  uint32_t stride,
431                  uint32_t format)
432 {
433    cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
434       load.buffer_to_load = buffer;
435       load.address = v3dv_cl_address(bo, offset);
436       load.input_image_format = format;
437       load.memory_format = VC5_TILING_RASTER;
438       load.height_in_ub_or_stride = stride;
439       load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
440    }
441 }
442 
443 static void
444 emit_linear_store(struct v3dv_cl *cl,
445                   uint32_t buffer,
446                   struct v3dv_bo *bo,
447                   uint32_t offset,
448                   uint32_t stride,
449                   bool msaa,
450                   uint32_t format)
451 {
452    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
453       store.buffer_to_store = RENDER_TARGET_0;
454       store.address = v3dv_cl_address(bo, offset);
455       store.clear_buffer_being_stored = false;
456       store.output_image_format = format;
457       store.memory_format = VC5_TILING_RASTER;
458       store.height_in_ub_or_stride = stride;
459       store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES :
460                                    V3D_DECIMATE_MODE_SAMPLE_0;
461    }
462 }
463 
464 static void
465 emit_image_load(struct v3dv_cl *cl,
466                 struct framebuffer_data *framebuffer,
467                 struct v3dv_image *image,
468                 VkImageAspectFlags aspect,
469                 uint32_t layer,
470                 uint32_t mip_level,
471                 bool is_copy_to_buffer,
472                 bool is_copy_from_buffer)
473 {
474    uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);
475 
476    /* For image to/from buffer copies we always load to and store from RT0,
477     * even for depth/stencil aspects, because the hardware can't do raster
478     * stores or loads from/to the depth/stencil tile buffers.
479     */
480    bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
481                             aspect == VK_IMAGE_ASPECT_COLOR_BIT;
482 
483    const struct v3d_resource_slice *slice = &image->slices[mip_level];
484    cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
485       load.buffer_to_load = load_to_color_tlb ?
486          RENDER_TARGET_0 : v3dv_zs_buffer_from_aspect_bits(aspect);
487 
488       load.address = v3dv_cl_address(image->mem->bo, layer_offset);
489 
490       load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
491                                                   is_copy_to_buffer,
492                                                   is_copy_from_buffer);
493       load.memory_format = slice->tiling;
494 
495       /* When copying depth/stencil images to a buffer, for D24 formats Vulkan
496        * expects the depth value in the LSB bits of each 32-bit pixel.
497        * Unfortunately, the hardware seems to put the S8/X8 bits there and the
498        * depth bits on the MSB. To work around that we can reverse the channel
499        * order and then swap the R/B channels to get what we want.
500        *
501        * NOTE: reversing and swapping only gets us the behavior we want if the
502        * operations happen in that exact order, which seems to be the case when
503        * done on the tile buffer load operations. On the store, it seems the
504        * order is not the same. The order on the store is probably reversed so
505        * that reversing and swapping on both the load and the store preserves
506        * the original order of the channels in memory.
507        *
508        * Notice that we only need to do this when copying to a buffer, where
509        * depth and stencil aspects are copied as separate regions and
510        * the spec expects them to be tightly packed.
511        */
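      /* Illustration (assumed channel layout, for clarity only): viewing the
       * 32-bit pixel as RGBA8 channels with R in bits 0-7 and A in bits
       * 24-31, the raw load gives roughly [R = S/X, G/B/A = depth]. The
       * channel reverse moves S/X into the A channel and the following R/B
       * swap re-orders the remaining depth bytes, leaving the 24-bit depth
       * value in bits 0-23 and S/X in bits 24-31, which is the layout Vulkan
       * expects for D24 formats copied to a buffer.
       */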
512       bool needs_rb_swap = false;
513       bool needs_chan_reverse = false;
514       if (is_copy_to_buffer &&
515          (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
516           (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
517            (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
518          needs_rb_swap = true;
519          needs_chan_reverse = true;
520       } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
521                  (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
522          /* This is not a raw data copy (i.e. we are clearing the image),
523           * so we need to make sure we respect the format swizzle.
524           */
525          needs_rb_swap = format_needs_rb_swap(framebuffer->vk_format);
526       }
527 
528       load.r_b_swap = needs_rb_swap;
529       load.channel_reverse = needs_chan_reverse;
530 
531       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
532           slice->tiling == VC5_TILING_UIF_XOR) {
533          load.height_in_ub_or_stride =
534             slice->padded_height_of_output_image_in_uif_blocks;
535       } else if (slice->tiling == VC5_TILING_RASTER) {
536          load.height_in_ub_or_stride = slice->stride;
537       }
538 
539       if (image->samples > VK_SAMPLE_COUNT_1_BIT)
540          load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
541       else
542          load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
543    }
544 }
545 
546 static void
547 emit_image_store(struct v3dv_cl *cl,
548                  struct framebuffer_data *framebuffer,
549                  struct v3dv_image *image,
550                  VkImageAspectFlags aspect,
551                  uint32_t layer,
552                  uint32_t mip_level,
553                  bool is_copy_to_buffer,
554                  bool is_copy_from_buffer,
555                  bool is_multisample_resolve)
556 {
557    uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer);
558 
559    bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
560                                aspect == VK_IMAGE_ASPECT_COLOR_BIT;
561 
562    const struct v3d_resource_slice *slice = &image->slices[mip_level];
563    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
564       store.buffer_to_store = store_from_color_tlb ?
565          RENDER_TARGET_0 : v3dv_zs_buffer_from_aspect_bits(aspect);
566 
567       store.address = v3dv_cl_address(image->mem->bo, layer_offset);
568       store.clear_buffer_being_stored = false;
569 
570       /* See rationale in emit_image_load() */
571       bool needs_rb_swap = false;
572       bool needs_chan_reverse = false;
573       if (is_copy_from_buffer &&
574          (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
575           (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
576            (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
577          needs_rb_swap = true;
578          needs_chan_reverse = true;
579       } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
580                  (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
581          needs_rb_swap = format_needs_rb_swap(framebuffer->vk_format);
582       }
583 
584       store.r_b_swap = needs_rb_swap;
585       store.channel_reverse = needs_chan_reverse;
586 
587       store.output_image_format = choose_tlb_format(framebuffer, aspect, true,
588                                                     is_copy_to_buffer,
589                                                     is_copy_from_buffer);
590       store.memory_format = slice->tiling;
591       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
592           slice->tiling == VC5_TILING_UIF_XOR) {
593          store.height_in_ub_or_stride =
594             slice->padded_height_of_output_image_in_uif_blocks;
595       } else if (slice->tiling == VC5_TILING_RASTER) {
596          store.height_in_ub_or_stride = slice->stride;
597       }
598 
599       if (image->samples > VK_SAMPLE_COUNT_1_BIT)
600          store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
601       else if (is_multisample_resolve)
602          store.decimate_mode = V3D_DECIMATE_MODE_4X;
603       else
604          store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
605    }
606 }
607 
608 static void
609 emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
610                                         struct framebuffer_data *framebuffer,
611                                         struct v3dv_buffer *buffer,
612                                         struct v3dv_image *image,
613                                         uint32_t layer_offset,
614                                         const VkBufferImageCopy *region)
615 {
616    struct v3dv_cl *cl = &job->indirect;
617    v3dv_cl_ensure_space(cl, 200, 1);
618    v3dv_return_if_oom(NULL, job);
619 
620    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
621 
622    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
623 
624    /* Load image to TLB */
625    assert((image->type != VK_IMAGE_TYPE_3D &&
626            layer_offset < region->imageSubresource.layerCount) ||
627           layer_offset < image->extent.depth);
628 
629    const uint32_t image_layer = image->type != VK_IMAGE_TYPE_3D ?
630       region->imageSubresource.baseArrayLayer + layer_offset :
631       region->imageOffset.z + layer_offset;
632 
633    emit_image_load(cl, framebuffer, image,
634                    region->imageSubresource.aspectMask,
635                    image_layer,
636                    region->imageSubresource.mipLevel,
637                    true, false);
638 
639    cl_emit(cl, END_OF_LOADS, end);
640 
641    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
642 
643    /* Store TLB to buffer */
644    uint32_t width, height;
645    if (region->bufferRowLength == 0)
646       width = region->imageExtent.width;
647    else
648       width = region->bufferRowLength;
649 
650    if (region->bufferImageHeight == 0)
651       height = region->imageExtent.height;
652    else
653       height = region->bufferImageHeight;
654 
655    /* Handle copy from compressed format */
656    width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format));
657    height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format));
658 
659    /* If we are storing stencil from a combined depth/stencil format the
660     * Vulkan spec states that the output buffer must have packed stencil
661     * values, where each stencil value is 1 byte.
662     */
663    uint32_t cpp =
664       region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
665          1 : image->cpp;
666    uint32_t buffer_stride = width * cpp;
667    uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
668                             height * buffer_stride * layer_offset;
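   /* Example (hypothetical numbers): copying the stencil aspect of a 64x64
    * D24S8 layer gives cpp = 1, so buffer_stride = 64 bytes and layer N of
    * the copy starts at mem_offset + bufferOffset + 64 * 64 * N.
    */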
669 
670    uint32_t format = choose_tlb_format(framebuffer,
671                                        region->imageSubresource.aspectMask,
672                                        true, true, false);
673    bool msaa = image->samples > VK_SAMPLE_COUNT_1_BIT;
674 
675    emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo,
676                      buffer_offset, buffer_stride, msaa, format);
677 
678    cl_emit(cl, END_OF_TILE_MARKER, end);
679 
680    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
681 
682    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
683       branch.start = tile_list_start;
684       branch.end = v3dv_cl_get_address(cl);
685    }
686 }
687 
688 static void
689 emit_copy_layer_to_buffer(struct v3dv_job *job,
690                           struct v3dv_buffer *buffer,
691                           struct v3dv_image *image,
692                           struct framebuffer_data *framebuffer,
693                           uint32_t layer,
694                           const VkBufferImageCopy *region)
695 {
696    emit_frame_setup(job, layer, NULL);
697    emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
698                                            image, layer, region);
699    emit_supertile_coordinates(job, framebuffer);
700 }
701 
702 static void
703 emit_copy_image_to_buffer_rcl(struct v3dv_job *job,
704                               struct v3dv_buffer *buffer,
705                               struct v3dv_image *image,
706                               struct framebuffer_data *framebuffer,
707                               const VkBufferImageCopy *region)
708 {
709    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
710    v3dv_return_if_oom(NULL, job);
711 
712    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
713       emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region);
714    cl_emit(rcl, END_OF_RENDERING, end);
715 }
716 
717 /* Implements a copy using the TLB.
718  *
719  * This only works if we are copying from offset (0,0), since a TLB store for
720  * tile (x,y) will be written at the same tile offset into the destination.
721  * When this requirement is not met, we need to use a blit instead.
722  *
723  * Returns true if the implementation supports the requested operation (even if
724  * it failed to process it, for example, due to an out-of-memory error).
725  *
726  */
727 static bool
728 copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
729                          struct v3dv_buffer *buffer,
730                          struct v3dv_image *image,
731                          const VkBufferImageCopy *region)
732 {
733    VkFormat fb_format;
734    if (!can_use_tlb(image, &region->imageOffset, &fb_format))
735       return false;
736 
737    uint32_t internal_type, internal_bpp;
738    get_internal_type_bpp_for_image_aspects(fb_format,
739                                            region->imageSubresource.aspectMask,
740                                            &internal_type, &internal_bpp);
741 
742    uint32_t num_layers;
743    if (image->type != VK_IMAGE_TYPE_3D)
744       num_layers = region->imageSubresource.layerCount;
745    else
746       num_layers = region->imageExtent.depth;
747    assert(num_layers > 0);
748 
749    struct v3dv_job *job =
750       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
751    if (!job)
752       return true;
753 
754    /* Handle copy from compressed format using a compatible format */
755    const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
756    const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
757    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
758    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
759 
760    v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false);
761 
762    struct framebuffer_data framebuffer;
763    setup_framebuffer_data(&framebuffer, fb_format, internal_type,
764                           &job->frame_tiling);
765 
766    v3dv_job_emit_binning_flush(job);
767    emit_copy_image_to_buffer_rcl(job, buffer, image, &framebuffer, region);
768 
769    v3dv_cmd_buffer_finish_job(cmd_buffer);
770 
771    return true;
772 }
773 
774 static bool
775 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
776             struct v3dv_image *dst,
777             VkFormat dst_format,
778             struct v3dv_image *src,
779             VkFormat src_format,
780             VkColorComponentFlags cmask,
781             VkComponentMapping *cswizzle,
782             const VkImageBlit *region,
783             VkFilter filter,
784             bool dst_is_padded_image);
785 
786 /**
787  * Returns true if the implementation supports the requested operation (even if
788  * it failed to process it, for example, due to an out-of-memory error).
789  */
790 static bool
791 copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
792                           struct v3dv_buffer *buffer,
793                           struct v3dv_image *image,
794                           const VkBufferImageCopy *region)
795 {
796    bool handled = false;
797 
798    /* Generally, the bpp of the data in the buffer matches that of the
799     * source image. The exception is the case where we are copying
800     * stencil (8bpp) to a combined d24s8 image (32bpp).
801     */
802    uint32_t buffer_bpp = image->cpp;
803 
804    VkImageAspectFlags copy_aspect = region->imageSubresource.aspectMask;
805 
806    /* Because we are going to implement the copy as a blit, we need to create
807     * a linear image from the destination buffer and we also want our blit
808     * source and destination formats to be the same (to avoid any format
809     * conversions), so we choose a canonical format that matches the
810     * source image bpp.
811     *
812     * The exception to the above is copying from combined depth/stencil images
813     * because we are copying only one aspect of the image, so we need to setup
814     * our formats, color write mask and source swizzle mask to match that.
815     */
816    VkFormat dst_format;
817    VkFormat src_format;
818    VkColorComponentFlags cmask = 0; /* All components */
819    VkComponentMapping cswizzle = {
820       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
821       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
822       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
823       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
824    };
825    switch (buffer_bpp) {
826    case 16:
827       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
828       dst_format = VK_FORMAT_R32G32B32A32_UINT;
829       src_format = dst_format;
830       break;
831    case 8:
832       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
833       dst_format = VK_FORMAT_R16G16B16A16_UINT;
834       src_format = dst_format;
835       break;
836    case 4:
837       switch (copy_aspect) {
838       case VK_IMAGE_ASPECT_COLOR_BIT:
839          src_format = VK_FORMAT_R8G8B8A8_UINT;
840          dst_format = VK_FORMAT_R8G8B8A8_UINT;
841          break;
842       case VK_IMAGE_ASPECT_DEPTH_BIT:
843          assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
844                 image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
845                 image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
846          if (image->vk_format == VK_FORMAT_D32_SFLOAT) {
847             src_format = VK_FORMAT_R32_UINT;
848             dst_format = VK_FORMAT_R32_UINT;
849          } else {
850             /* We want to write depth in the buffer in the first 24-bits,
851              * however, the hardware has depth in bits 8-31, so swizzle the
852              * source components to match what we want. Also, we don't
853              * want to write bits 24-31 in the destination.
854              */
855             src_format = VK_FORMAT_R8G8B8A8_UINT;
856             dst_format = VK_FORMAT_R8G8B8A8_UINT;
857             cmask = VK_COLOR_COMPONENT_R_BIT |
858                     VK_COLOR_COMPONENT_G_BIT |
859                     VK_COLOR_COMPONENT_B_BIT;
860             cswizzle.r = VK_COMPONENT_SWIZZLE_G;
861             cswizzle.g = VK_COMPONENT_SWIZZLE_B;
862             cswizzle.b = VK_COMPONENT_SWIZZLE_A;
863             cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
864          }
865          break;
866       case VK_IMAGE_ASPECT_STENCIL_BIT:
867          assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
868          assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
869          /* Copying from S8D24. We want to write 8-bit stencil values only,
870           * so adjust the buffer bpp for that. Since the hardware stores stencil
871           * in the LSB, we can just do an RGBA8UI to R8UI blit.
872           */
873          src_format = VK_FORMAT_R8G8B8A8_UINT;
874          dst_format = VK_FORMAT_R8_UINT;
875          buffer_bpp = 1;
876          break;
877       default:
878          unreachable("unsupported aspect");
879          return handled;
880       };
881       break;
882    case 2:
883       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
884              copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
885       dst_format = VK_FORMAT_R16_UINT;
886       src_format = dst_format;
887       break;
888    case 1:
889       assert(copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
890       dst_format = VK_FORMAT_R8_UINT;
891       src_format = dst_format;
892       break;
893    default:
894       unreachable("unsupported bit-size");
895       return handled;
896    };
897 
898    /* The hardware doesn't support linear depth/stencil stores, so we
899     * implement copies of depth/stencil aspect as color copies using a
900     * compatible color format.
901     */
902    assert(vk_format_is_color(src_format));
903    assert(vk_format_is_color(dst_format));
904    copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;
905 
906    /* We should be able to handle the blit if we got this far */
907    handled = true;
908 
909    /* Obtain the 2D buffer region spec */
910    uint32_t buf_width, buf_height;
911    if (region->bufferRowLength == 0)
912       buf_width = region->imageExtent.width;
913    else
914       buf_width = region->bufferRowLength;
915 
916    if (region->bufferImageHeight == 0)
917       buf_height = region->imageExtent.height;
918    else
919       buf_height = region->bufferImageHeight;
920 
921    /* If the image is compressed, the bpp refers to blocks, not pixels */
922    uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
923    uint32_t block_height = vk_format_get_blockheight(image->vk_format);
924    buf_width = buf_width / block_width;
925    buf_height = buf_height / block_height;
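   /* Example (illustrative): for an ETC2 RGBA8 image (4x4 blocks, 16 bytes
    * per block) with bufferRowLength = 16 and bufferImageHeight = 8, the
    * buffer is treated as a 4x2 image of 128-bit "pixels", one per
    * compressed block.
    */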
926 
927    /* Compute layers to copy */
928    uint32_t num_layers;
929    if (image->type != VK_IMAGE_TYPE_3D)
930       num_layers = region->imageSubresource.layerCount;
931    else
932       num_layers = region->imageExtent.depth;
933    assert(num_layers > 0);
934 
935    /* Our blit interface can see the real format of the images to detect
936     * copies between compressed and uncompressed images and adapt the
937     * blit region accordingly. Here we are just doing a raw copy of
938     * compressed data, but we are passing an uncompressed view of the
939     * buffer for the blit destination image (since compressed formats are
940     * not renderable), so we also want to provide an uncompressed view of
941     * the source image.
942     */
943    VkResult result;
944    struct v3dv_device *device = cmd_buffer->device;
945    VkDevice _device = v3dv_device_to_handle(device);
946    if (vk_format_is_compressed(image->vk_format)) {
947       VkImage uiview;
948       VkImageCreateInfo uiview_info = {
949          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
950          .imageType = VK_IMAGE_TYPE_3D,
951          .format = dst_format,
952          .extent = { buf_width, buf_height, image->extent.depth },
953          .mipLevels = image->levels,
954          .arrayLayers = image->array_size,
955          .samples = image->samples,
956          .tiling = image->tiling,
957          .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
958          .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
959          .queueFamilyIndexCount = 0,
960          .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
961       };
962       result = v3dv_CreateImage(_device, &uiview_info, &device->alloc, &uiview);
963       if (result != VK_SUCCESS)
964          return handled;
965 
966       v3dv_cmd_buffer_add_private_obj(
967          cmd_buffer, (uintptr_t)uiview,
968          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
969 
970       result = v3dv_BindImageMemory(_device, uiview,
971                                     v3dv_device_memory_to_handle(image->mem),
972                                     image->mem_offset);
973       if (result != VK_SUCCESS)
974          return handled;
975 
976       image = v3dv_image_from_handle(uiview);
977    }
978 
979    /* Copy requested layers */
980    for (uint32_t i = 0; i < num_layers; i++) {
981       /* Create the destination blit image from the destination buffer */
982       VkImageCreateInfo image_info = {
983          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
984          .imageType = VK_IMAGE_TYPE_2D,
985          .format = dst_format,
986          .extent = { buf_width, buf_height, 1 },
987          .mipLevels = 1,
988          .arrayLayers = 1,
989          .samples = VK_SAMPLE_COUNT_1_BIT,
990          .tiling = VK_IMAGE_TILING_LINEAR,
991          .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
992          .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
993          .queueFamilyIndexCount = 0,
994          .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
995       };
996 
997       VkImage buffer_image;
998       result =
999          v3dv_CreateImage(_device, &image_info, &device->alloc, &buffer_image);
1000       if (result != VK_SUCCESS)
1001          return handled;
1002 
1003       v3dv_cmd_buffer_add_private_obj(
1004          cmd_buffer, (uintptr_t)buffer_image,
1005          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
1006 
1007       /* Bind the buffer memory to the image */
1008       VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
1009          i * buf_width * buf_height * buffer_bpp;
1010       result = v3dv_BindImageMemory(_device, buffer_image,
1011                                     v3dv_device_memory_to_handle(buffer->mem),
1012                                     buffer_offset);
1013       if (result != VK_SUCCESS)
1014          return handled;
1015 
1016       /* Blit-copy the requested image extent.
1017        *
1018        * Since we are copying, the blit must use the same format on the
1019        * destination and source images to avoid format conversions. The
1020        * only exception is copying stencil, which we upload to a R8UI source
1021        * image, but that we need to blit to a S8D24 destination (the only
1022        * stencil format we support).
1023        */
1024       const VkImageBlit blit_region = {
1025          .srcSubresource = {
1026             .aspectMask = copy_aspect,
1027             .mipLevel = region->imageSubresource.mipLevel,
1028             .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
1029             .layerCount = 1,
1030          },
1031          .srcOffsets = {
1032             {
1033                DIV_ROUND_UP(region->imageOffset.x, block_width),
1034                DIV_ROUND_UP(region->imageOffset.y, block_height),
1035                region->imageOffset.z + i,
1036             },
1037             {
1038                DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
1039                             block_width),
1040                DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
1041                             block_height),
1042                region->imageOffset.z + i + 1,
1043             },
1044          },
1045          .dstSubresource = {
1046             .aspectMask = copy_aspect,
1047             .mipLevel = 0,
1048             .baseArrayLayer = 0,
1049             .layerCount = 1,
1050          },
1051          .dstOffsets = {
1052             { 0, 0, 0 },
1053             {
1054                DIV_ROUND_UP(region->imageExtent.width, block_width),
1055                DIV_ROUND_UP(region->imageExtent.height, block_height),
1056                1
1057             },
1058          },
1059       };
1060 
1061       handled = blit_shader(cmd_buffer,
1062                             v3dv_image_from_handle(buffer_image), dst_format,
1063                             image, src_format,
1064                             cmask, &cswizzle,
1065                             &blit_region, VK_FILTER_NEAREST, false);
1066       if (!handled) {
1067          /* This is unexpected, we should have a supported blit spec */
1068          unreachable("Unable to blit buffer to destination image");
1069          return false;
1070       }
1071    }
1072 
1073    assert(handled);
1074    return true;
1075 }
1076 
1077 static VkFormat
1078 get_compatible_tlb_format(VkFormat format)
1079 {
1080    switch (format) {
1081    case VK_FORMAT_R8G8B8A8_SNORM:
1082       return VK_FORMAT_R8G8B8A8_UINT;
1083 
1084    case VK_FORMAT_R8G8_SNORM:
1085       return VK_FORMAT_R8G8_UINT;
1086 
1087    case VK_FORMAT_R8_SNORM:
1088       return VK_FORMAT_R8_UINT;
1089 
1090    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
1091       return VK_FORMAT_A8B8G8R8_UINT_PACK32;
1092 
1093    case VK_FORMAT_R16_UNORM:
1094    case VK_FORMAT_R16_SNORM:
1095       return VK_FORMAT_R16_UINT;
1096 
1097    case VK_FORMAT_R16G16_UNORM:
1098    case VK_FORMAT_R16G16_SNORM:
1099       return VK_FORMAT_R16G16_UINT;
1100 
1101    case VK_FORMAT_R16G16B16A16_UNORM:
1102    case VK_FORMAT_R16G16B16A16_SNORM:
1103       return VK_FORMAT_R16G16B16A16_UINT;
1104 
1105    case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
1106       return VK_FORMAT_R32_SFLOAT;
1107 
1108    /* We can't render to compressed formats using the TLB so instead we use
1109     * a compatible format with the same bpp as the compressed format. Because
1110     * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
1111     * case of ETC), when we implement copies with the compatible format we
1112     * will have to divide offsets and dimensions on the compressed image by
1113     * the compressed block size.
1114     */
1115    case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
1116    case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
1117    case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
1118    case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
1119       return VK_FORMAT_R32G32B32A32_UINT;
1120 
1121    case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
1122    case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
1123    case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
1124    case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
1125    case VK_FORMAT_EAC_R11_UNORM_BLOCK:
1126    case VK_FORMAT_EAC_R11_SNORM_BLOCK:
1127       return VK_FORMAT_R16G16B16A16_UINT;
1128 
1129    default:
1130       return VK_FORMAT_UNDEFINED;
1131    }
1132 }
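/* Note (illustration): can_use_tlb() below relies on this mapping, so e.g. a
 * copy involving a VK_FORMAT_R8G8B8A8_SNORM image can still take the TLB path
 * by operating on the raw bits through VK_FORMAT_R8G8B8A8_UINT.
 */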
1133 
1134 static inline bool
1135 can_use_tlb(struct v3dv_image *image,
1136             const VkOffset3D *offset,
1137             VkFormat *compat_format)
1138 {
1139    if (offset->x != 0 || offset->y != 0)
1140       return false;
1141 
1142    if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
1143       if (compat_format)
1144          *compat_format = image->vk_format;
1145       return true;
1146    }
1147 
1148    /* If the image format is not TLB-supported, then check if we can use
1149     * a compatible format instead.
1150     */
1151    if (compat_format) {
1152       *compat_format = get_compatible_tlb_format(image->vk_format);
1153       if (*compat_format != VK_FORMAT_UNDEFINED)
1154          return true;
1155    }
1156 
1157    return false;
1158 }
1159 
1160 void
1161 v3dv_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer,
1162                           VkImage srcImage,
1163                           VkImageLayout srcImageLayout,
1164                           VkBuffer destBuffer,
1165                           uint32_t regionCount,
1166                           const VkBufferImageCopy *pRegions)
1167 {
1168    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1169    V3DV_FROM_HANDLE(v3dv_image, image, srcImage);
1170    V3DV_FROM_HANDLE(v3dv_buffer, buffer, destBuffer);
1171 
1172    assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
1173 
1174    for (uint32_t i = 0; i < regionCount; i++) {
1175       if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &pRegions[i]))
1176          continue;
1177       if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, &pRegions[i]))
1178          continue;
1179       unreachable("Unsupported image to buffer copy.");
1180    }
1181 }
1182 
1183 static void
1184 emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
1185                                     struct framebuffer_data *framebuffer,
1186                                     struct v3dv_image *dst,
1187                                     struct v3dv_image *src,
1188                                     uint32_t layer_offset,
1189                                     const VkImageCopy *region)
1190 {
1191    struct v3dv_cl *cl = &job->indirect;
1192    v3dv_cl_ensure_space(cl, 200, 1);
1193    v3dv_return_if_oom(NULL, job);
1194 
1195    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
1196 
1197    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
1198 
1199    assert((src->type != VK_IMAGE_TYPE_3D &&
1200            layer_offset < region->srcSubresource.layerCount) ||
1201           layer_offset < src->extent.depth);
1202 
1203    const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ?
1204       region->srcSubresource.baseArrayLayer + layer_offset :
1205       region->srcOffset.z + layer_offset;
1206 
1207    emit_image_load(cl, framebuffer, src,
1208                    region->srcSubresource.aspectMask,
1209                    src_layer,
1210                    region->srcSubresource.mipLevel,
1211                    false, false);
1212 
1213    cl_emit(cl, END_OF_LOADS, end);
1214 
1215    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
1216 
1217    assert((dst->type != VK_IMAGE_TYPE_3D &&
1218            layer_offset < region->dstSubresource.layerCount) ||
1219           layer_offset < dst->extent.depth);
1220 
1221    const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ?
1222       region->dstSubresource.baseArrayLayer + layer_offset :
1223       region->dstOffset.z + layer_offset;
1224 
1225    emit_image_store(cl, framebuffer, dst,
1226                     region->dstSubresource.aspectMask,
1227                     dst_layer,
1228                     region->dstSubresource.mipLevel,
1229                     false, false, false);
1230 
1231    cl_emit(cl, END_OF_TILE_MARKER, end);
1232 
1233    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
1234 
1235    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
1236       branch.start = tile_list_start;
1237       branch.end = v3dv_cl_get_address(cl);
1238    }
1239 }
1240 
1241 static void
1242 emit_copy_image_layer(struct v3dv_job *job,
1243                       struct v3dv_image *dst,
1244                       struct v3dv_image *src,
1245                       struct framebuffer_data *framebuffer,
1246                       uint32_t layer,
1247                       const VkImageCopy *region)
1248 {
1249    emit_frame_setup(job, layer, NULL);
1250    emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
1251    emit_supertile_coordinates(job, framebuffer);
1252 }
1253 
1254 static void
1255 emit_copy_image_rcl(struct v3dv_job *job,
1256                     struct v3dv_image *dst,
1257                     struct v3dv_image *src,
1258                     struct framebuffer_data *framebuffer,
1259                     const VkImageCopy *region)
1260 {
1261    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1262    v3dv_return_if_oom(NULL, job);
1263 
1264    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
1265       emit_copy_image_layer(job, dst, src, framebuffer, layer, region);
1266    cl_emit(rcl, END_OF_RENDERING, end);
1267 }
1268 
1269 /**
1270  * Returns true if the implementation supports the requested operation (even if
1271  * it failed to process it, for example, due to an out-of-memory error).
1272  */
1273 static bool
1274 copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1275                struct v3dv_image *dst,
1276                struct v3dv_image *src,
1277                const VkImageCopy *region)
1278 {
1279    VkFormat fb_format;
1280    if (!can_use_tlb(src, &region->srcOffset, &fb_format) ||
1281        !can_use_tlb(dst, &region->dstOffset, &fb_format)) {
1282       return false;
1283    }
1284 
1285    /* From the Vulkan spec, VkImageCopy valid usage:
1286     *
1287     *    "If neither the calling command’s srcImage nor the calling command’s
1288     *     dstImage has a multi-planar image format then the aspectMask member
1289     *     of srcSubresource and dstSubresource must match."
1290     */
1291    assert(region->dstSubresource.aspectMask ==
1292           region->srcSubresource.aspectMask);
1293    uint32_t internal_type, internal_bpp;
1294    get_internal_type_bpp_for_image_aspects(fb_format,
1295                                            region->dstSubresource.aspectMask,
1296                                            &internal_type, &internal_bpp);
1297 
1298    /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
1299     *
1300     * "The number of slices of the extent (for 3D) or layers of the
1301     *  srcSubresource (for non-3D) must match the number of slices of the
1302     *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
1303     */
1304    assert((src->type != VK_IMAGE_TYPE_3D ?
1305            region->srcSubresource.layerCount : region->extent.depth) ==
1306           (dst->type != VK_IMAGE_TYPE_3D ?
1307            region->dstSubresource.layerCount : region->extent.depth));
1308    uint32_t num_layers;
1309    if (dst->type != VK_IMAGE_TYPE_3D)
1310       num_layers = region->dstSubresource.layerCount;
1311    else
1312       num_layers = region->extent.depth;
1313    assert(num_layers > 0);
1314 
1315    struct v3dv_job *job =
1316       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1317    if (!job)
1318       return true;
1319 
1320    /* Handle copy to compressed image using compatible format */
1321    const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format);
1322    const uint32_t block_h = vk_format_get_blockheight(dst->vk_format);
1323    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
1324    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
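   /* E.g. for a compressed format with 4x4 blocks, a 16x16 texel copy region
    * becomes a 4x4 "pixel" frame here, so the TLB job operates in units of
    * compressed blocks through the compatible uncompressed format.
    */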
1325 
1326    v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp,
1327                         src->samples > VK_SAMPLE_COUNT_1_BIT);
1328 
1329    struct framebuffer_data framebuffer;
1330    setup_framebuffer_data(&framebuffer, fb_format, internal_type,
1331                           &job->frame_tiling);
1332 
1333    v3dv_job_emit_binning_flush(job);
1334    emit_copy_image_rcl(job, dst, src, &framebuffer, region);
1335 
1336    v3dv_cmd_buffer_finish_job(cmd_buffer);
1337 
1338    return true;
1339 }
1340 
1341 /**
1342  * Takes the image provided as argument and creates a new image that has
1343  * the same specification and aliases the same memory storage, except that:
1344  *
1345  *   - It has the uncompressed format passed in.
1346  *   - Its original width/height are scaled by the factors passed in.
1347  *
1348  * This is useful to implement copies from compressed images using the blit
1349  * path. The idea is that we create uncompressed "image views" of both the
1350  * source and destination images using the uncompressed format and then we
1351  * define the copy blit in terms of that format.
1352  */
1353 static struct v3dv_image *
1354 create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
1355                    struct v3dv_image *src,
1356                    float width_scale,
1357                    float height_scale,
1358                    VkFormat format)
1359 {
1360    assert(!vk_format_is_compressed(format));
1361 
1362    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
1363 
1364    VkImageCreateInfo info = {
1365       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1366       .imageType = src->type,
1367       .format = format,
1368       .extent = {
1369          .width = src->extent.width * width_scale,
1370          .height = src->extent.height * height_scale,
1371          .depth = src->extent.depth,
1372       },
1373       .mipLevels = src->levels,
1374       .arrayLayers = src->array_size,
1375       .samples = src->samples,
1376       .tiling = src->tiling,
1377       .usage = src->usage,
1378    };
1379 
1380    VkImage _image;
1381    VkResult result =
1382       v3dv_CreateImage(_device, &info, &cmd_buffer->device->alloc, &_image);
1383    if (result != VK_SUCCESS) {
1384       v3dv_flag_oom(cmd_buffer, NULL);
1385       return NULL;
1386    }
1387 
1388    struct v3dv_image *image = v3dv_image_from_handle(_image);
1389    image->mem = src->mem;
1390    image->mem_offset = src->mem_offset;
1391    return image;
1392 }
1393 
1394 /**
1395  * Returns true if the implementation supports the requested operation (even if
1396  * it failed to process it, for example, due to an out-of-memory error).
1397  */
1398 static bool
1399 copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
1400                 struct v3dv_image *dst,
1401                 struct v3dv_image *src,
1402                 const VkImageCopy *region)
1403 {
1404    const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format);
1405    const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format);
1406    const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format);
1407    const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format);
1408    const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1409    const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1410 
1411    /* We need to choose a single format for the blit to ensure that this is
1412     * really a copy and there are no format conversions going on. Since we
1413     * are going to blit, we need to make sure that the selected format can be
1414     * both rendered to and textured from.
1415     */
1416    VkFormat format;
1417    float src_scale_w = 1.0f;
1418    float src_scale_h = 1.0f;
1419    float dst_scale_w = block_scale_w;
1420    float dst_scale_h = block_scale_h;
1421    if (vk_format_is_compressed(src->vk_format)) {
1422       /* If we are copying from a compressed format we should be aware that we
1423        * are going to texture from the source image, and the texture setup
1424        * knows the actual size of the image, so we need to choose a format
1425        * that has a per-texel (not per-block) bpp that is compatible with that
1426        * image size. For example, for a source image with size Bw*WxBh*H
1427        * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
1428        * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
1429        * (so each 4x4 block packs into 128 bits, matching one RGBA32UI texel),
1430        * so we could specify a blit with size Bw*WxBh*H and a format with
1431        * a bpp of 8-bit per texel (R8_UINT).
1432        *
1433        * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
1434        * which is 64-bit per 4x4 block (4 bits per texel), we would need a
1435        * 4-bit per-texel format, which we don't have, so instead we still
1436        * choose an 8-bit format and apply a divisor to the row dimensions
1437        * of the blit, since we are copying two texels per item.
1438        *
1439        * Generally, we can choose any format so long as we compute appropriate
1440        * divisors for the width and height depending on the source image's
1441        * bpp.
1442        */
1443       assert(src->cpp == dst->cpp);
1444 
1445       uint32_t divisor_w, divisor_h;
1446       format = VK_FORMAT_R32G32_UINT;
1447       switch (src->cpp) {
1448       case 16:
1449          format = VK_FORMAT_R32G32B32A32_UINT;
1450          divisor_w = 4;
1451          divisor_h = 4;
1452          break;
1453       case 8:
1454          format = VK_FORMAT_R16G16B16A16_UINT;
1455          divisor_w = 4;
1456          divisor_h = 4;
1457          break;
1458       default:
1459          unreachable("Unsupported compressed format");
1460       }
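      /* A worked example of the choices above (assuming ETC2 block layouts):
       * for an ETC2_RGBA8_UNORM source, cpp is 16 bytes per 4x4 block, so we
       * pick VK_FORMAT_R32G32B32A32_UINT (16 bytes per texel) and one texel
       * of the alias holds exactly one compressed block; since each block
       * covers 4x4 texels of the original image, both dimensions are divided
       * by 4. The 8 cpp case follows the same reasoning with 8-byte
       * R16G16B16A16_UINT texels.
       */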
1461 
1462       /* Create image views of the src/dst images that we can interpret in
1463        * terms of the canonical format.
1464        */
1465       src_scale_w /= divisor_w;
1466       src_scale_h /= divisor_h;
1467       dst_scale_w /= divisor_w;
1468       dst_scale_h /= divisor_h;
1469 
1470       src = create_image_alias(cmd_buffer, src,
1471                                src_scale_w, src_scale_h, format);
1472 
1473       dst = create_image_alias(cmd_buffer, dst,
1474                                dst_scale_w, dst_scale_h, format);
1475    } else {
1476       format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
1477          src->vk_format : get_compatible_tlb_format(src->vk_format);
1478       if (format == VK_FORMAT_UNDEFINED)
1479          return false;
1480 
1481       const struct v3dv_format *f = v3dv_get_format(format);
1482       if (!f->supported || f->tex_type == TEXTURE_DATA_FORMAT_NO)
1483          return false;
1484    }
1485 
1486    /* Given an uncompressed image with size WxH, if we copy it to a compressed
1487     * image, it will result in an image with size W*bWxH*bH, where bW and bH
1488     * are the compressed format's block width and height. This means that
1489     * copies between compressed and uncompressed images involve different
1490     * image sizes, and therefore, we need to take that into account when
1491     * setting up the source and destination blit regions below, so they are
1492     * consistent from the point of view of the single compatible format
1493     * selected for the copy.
1494     *
1495     * Note that the dimensions of the region provided to the copy command
1496     * are specified in terms of the source image. With that in mind, below
1497     * we adjust the blit destination region to be consistent with the
1498     * source region for the compatible format: we apply the block scale
1499     * factor to the destination offset provided by the copy command
1500     * (because it is specified in terms of the destination image, not the
1501     * source), and then we add the region copy dimensions to that (since
1502     * the region dimensions are already specified in terms of the source
1503     * image).
1504     */
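   /* Informal example: copying a 16x16 texel region from an ETC2_RGBA8_UNORM
    * source to an uncompressed R32G32B32A32_UINT destination gives
    * block_scale = 4 and divisors of 4, so src_scale ends up as 1/4 and
    * dst_scale as 1: source offsets are scaled to block coordinates,
    * destination offsets are unchanged, and the 16x16 extent becomes a 4x4
    * blit in terms of the common RGBA32UI format.
    */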
1505    const VkOffset3D src_start = {
1506       region->srcOffset.x * src_scale_w,
1507       region->srcOffset.y * src_scale_h,
1508       region->srcOffset.z,
1509    };
1510    const VkOffset3D src_end = {
1511       src_start.x + region->extent.width * src_scale_w,
1512       src_start.y + region->extent.height * src_scale_h,
1513       src_start.z + region->extent.depth,
1514    };
1515 
1516    const VkOffset3D dst_start = {
1517       region->dstOffset.x * dst_scale_w,
1518       region->dstOffset.y * dst_scale_h,
1519       region->dstOffset.z,
1520    };
1521    const VkOffset3D dst_end = {
1522       dst_start.x + region->extent.width * src_scale_w,
1523       dst_start.y + region->extent.height * src_scale_h,
1524       dst_start.z + region->extent.depth,
1525    };
1526 
1527    const VkImageBlit blit_region = {
1528       .srcSubresource = region->srcSubresource,
1529       .srcOffsets = { src_start, src_end },
1530       .dstSubresource = region->dstSubresource,
1531       .dstOffsets = { dst_start, dst_end },
1532    };
1533    bool handled = blit_shader(cmd_buffer,
1534                               dst, format,
1535                               src, format,
1536                               0, NULL,
1537                               &blit_region, VK_FILTER_NEAREST, true);
1538 
1539    /* We should have selected formats that we can blit */
1540    assert(handled);
1541    return handled;
1542 }
1543 
1544 void
1545 v3dv_CmdCopyImage(VkCommandBuffer commandBuffer,
1546                   VkImage srcImage,
1547                   VkImageLayout srcImageLayout,
1548                   VkImage dstImage,
1549                   VkImageLayout dstImageLayout,
1550                   uint32_t regionCount,
1551                   const VkImageCopy *pRegions)
1552 {
1553    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1554    V3DV_FROM_HANDLE(v3dv_image, src, srcImage);
1555    V3DV_FROM_HANDLE(v3dv_image, dst, dstImage);
1556 
1557    assert(src->samples == dst->samples);
1558 
1559    for (uint32_t i = 0; i < regionCount; i++) {
1560       if (copy_image_tlb(cmd_buffer, dst, src, &pRegions[i]))
1561          continue;
1562       if (copy_image_blit(cmd_buffer, dst, src, &pRegions[i]))
1563          continue;
1564       unreachable("Image copy not supported");
1565    }
1566 }
1567 
1568 static void
1569 emit_clear_image_per_tile_list(struct v3dv_job *job,
1570                                struct framebuffer_data *framebuffer,
1571                                struct v3dv_image *image,
1572                                VkImageAspectFlags aspects,
1573                                uint32_t layer,
1574                                uint32_t level)
1575 {
1576    struct v3dv_cl *cl = &job->indirect;
1577    v3dv_cl_ensure_space(cl, 200, 1);
1578    v3dv_return_if_oom(NULL, job);
1579 
1580    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
1581 
1582    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
1583 
1584    cl_emit(cl, END_OF_LOADS, end);
1585 
1586    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
1587 
1588    emit_image_store(cl, framebuffer, image, aspects, layer, level,
1589                     false, false, false);
1590 
1591    cl_emit(cl, END_OF_TILE_MARKER, end);
1592 
1593    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
1594 
1595    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
1596       branch.start = tile_list_start;
1597       branch.end = v3dv_cl_get_address(cl);
1598    }
1599 }
1600 
1601 static void
1602 emit_clear_image(struct v3dv_job *job,
1603                  struct v3dv_image *image,
1604                  struct framebuffer_data *framebuffer,
1605                  VkImageAspectFlags aspects,
1606                  uint32_t layer,
1607                  uint32_t level)
1608 {
1609    emit_clear_image_per_tile_list(job, framebuffer, image, aspects, layer, level);
1610    emit_supertile_coordinates(job, framebuffer);
1611 }
1612 
1613 static void
1614 emit_clear_image_rcl(struct v3dv_job *job,
1615                      struct v3dv_image *image,
1616                      struct framebuffer_data *framebuffer,
1617                      const union v3dv_clear_value *clear_value,
1618                      VkImageAspectFlags aspects,
1619                      uint32_t layer,
1620                      uint32_t level)
1621 {
1622    const struct rcl_clear_info clear_info = {
1623       .clear_value = clear_value,
1624       .image = image,
1625       .aspects = aspects,
1626       .layer = layer,
1627       .level = level,
1628    };
1629 
1630    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
1631    v3dv_return_if_oom(NULL, job);
1632 
1633    emit_frame_setup(job, 0, clear_value);
1634    emit_clear_image(job, image, framebuffer, aspects, layer, level);
1635    cl_emit(rcl, END_OF_RENDERING, end);
1636 }
1637 
1638 static void
1639 get_hw_clear_color(const VkClearColorValue *color,
1640                    VkFormat fb_format,
1641                    VkFormat image_format,
1642                    uint32_t internal_type,
1643                    uint32_t internal_bpp,
1644                    uint32_t *hw_color)
1645 {
1646    const uint32_t internal_size = 4 << internal_bpp;
1647 
1648    /* If the image format doesn't match the framebuffer format, then we are
1649     * trying to clear an unsupported tlb format using a compatible
1650     * format for the framebuffer. In this case, we want to make sure that
1651     * we pack the clear value according to the original format semantics,
1652     * not the compatible format.
1653     */
1654    if (fb_format == image_format) {
1655       v3dv_get_hw_clear_color(color, internal_type, internal_size, hw_color);
1656    } else {
1657       union util_color uc;
1658       enum pipe_format pipe_image_format =
1659          vk_format_to_pipe_format(image_format);
1660       util_pack_color(color->float32, pipe_image_format, &uc);
1661       memcpy(hw_color, uc.ui, internal_size);
1662    }
1663 }
1664 
1665 /* Returns true if the implementation is able to handle the case, false
1666  * otherwise.
1667  */
1668 static bool
1669 clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1670                 struct v3dv_image *image,
1671                 const VkClearValue *clear_value,
1672                 const VkImageSubresourceRange *range)
1673 {
1674    const VkOffset3D origin = { 0, 0, 0 };
1675    VkFormat fb_format;
1676    if (!can_use_tlb(image, &origin, &fb_format))
1677       return false;
1678 
1679    uint32_t internal_type, internal_bpp;
1680    get_internal_type_bpp_for_image_aspects(fb_format, range->aspectMask,
1681                                            &internal_type, &internal_bpp);
1682 
1683    union v3dv_clear_value hw_clear_value = { 0 };
1684    if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
1685       get_hw_clear_color(&clear_value->color, fb_format, image->vk_format,
1686                          internal_type, internal_bpp, &hw_clear_value.color[0]);
1687    } else {
1688       assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) ||
1689              (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT));
1690       hw_clear_value.z = clear_value->depthStencil.depth;
1691       hw_clear_value.s = clear_value->depthStencil.stencil;
1692    }
1693 
1694    uint32_t level_count = range->levelCount == VK_REMAINING_MIP_LEVELS ?
1695                           image->levels - range->baseMipLevel :
1696                           range->levelCount;
1697    uint32_t min_level = range->baseMipLevel;
1698    uint32_t max_level = range->baseMipLevel + level_count;
1699 
1700    /* For 3D images, baseArrayLayer and layerCount must be 0 and 1,
1701     * respectively, so instead we consider the full depth dimension of the
1702     * image, which goes from 0 up to the level's depth extent.
1703     */
1704    uint32_t min_layer;
1705    uint32_t max_layer;
1706    if (image->type != VK_IMAGE_TYPE_3D) {
1707       uint32_t layer_count = range->layerCount == VK_REMAINING_ARRAY_LAYERS ?
1708                              image->array_size - range->baseArrayLayer :
1709                              range->layerCount;
1710       min_layer = range->baseArrayLayer;
1711       max_layer = range->baseArrayLayer + layer_count;
1712    } else {
1713       min_layer = 0;
1714       max_layer = 0;
1715    }
1716 
1717    for (uint32_t level = min_level; level < max_level; level++) {
1718       if (image->type == VK_IMAGE_TYPE_3D)
1719          max_layer = u_minify(image->extent.depth, level);
1720       for (uint32_t layer = min_layer; layer < max_layer; layer++) {
1721          uint32_t width = u_minify(image->extent.width, level);
1722          uint32_t height = u_minify(image->extent.height, level);
1723 
1724          struct v3dv_job *job =
1725             v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1726 
1727          if (!job)
1728             return true;
1729 
1730          /* We start a new job for each layer, so the frame "depth" is 1 */
1731          v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp,
1732                               image->samples > VK_SAMPLE_COUNT_1_BIT);
1733 
1734          struct framebuffer_data framebuffer;
1735          setup_framebuffer_data(&framebuffer, fb_format, internal_type,
1736                                 &job->frame_tiling);
1737 
1738          v3dv_job_emit_binning_flush(job);
1739 
1740          /* If this triggers it is an application bug: the spec requires
1741           * that any aspects to clear are present in the image.
1742           */
1743          assert(range->aspectMask & image->aspects);
1744 
1745          emit_clear_image_rcl(job, image, &framebuffer, &hw_clear_value,
1746                               range->aspectMask, layer, level);
1747 
1748          v3dv_cmd_buffer_finish_job(cmd_buffer);
1749       }
1750    }
1751 
1752    return true;
1753 }
1754 
1755 void
1756 v3dv_CmdClearColorImage(VkCommandBuffer commandBuffer,
1757                         VkImage _image,
1758                         VkImageLayout imageLayout,
1759                         const VkClearColorValue *pColor,
1760                         uint32_t rangeCount,
1761                         const VkImageSubresourceRange *pRanges)
1762 {
1763    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1764    V3DV_FROM_HANDLE(v3dv_image, image, _image);
1765 
1766    const VkClearValue clear_value = {
1767       .color = *pColor,
1768    };
1769 
1770    for (uint32_t i = 0; i < rangeCount; i++) {
1771       if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
1772          continue;
1773       unreachable("Unsupported color clear.");
1774    }
1775 }
1776 
1777 void
1778 v3dv_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
1779                                VkImage _image,
1780                                VkImageLayout imageLayout,
1781                                const VkClearDepthStencilValue *pDepthStencil,
1782                                uint32_t rangeCount,
1783                                const VkImageSubresourceRange *pRanges)
1784 {
1785    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1786    V3DV_FROM_HANDLE(v3dv_image, image, _image);
1787 
1788    const VkClearValue clear_value = {
1789       .depthStencil = *pDepthStencil,
1790    };
1791 
1792    for (uint32_t i = 0; i < rangeCount; i++) {
1793       if (clear_image_tlb(cmd_buffer, image, &clear_value, &pRanges[i]))
1794          continue;
1795       unreachable("Unsupported depth/stencil clear.");
1796    }
1797 }
1798 
1799 static void
1800 emit_copy_buffer_per_tile_list(struct v3dv_job *job,
1801                                struct v3dv_bo *dst,
1802                                struct v3dv_bo *src,
1803                                uint32_t dst_offset,
1804                                uint32_t src_offset,
1805                                uint32_t stride,
1806                                uint32_t format)
1807 {
1808    struct v3dv_cl *cl = &job->indirect;
1809    v3dv_cl_ensure_space(cl, 200, 1);
1810    v3dv_return_if_oom(NULL, job);
1811 
1812    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
1813 
1814    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
1815 
1816    emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format);
1817 
1818    cl_emit(cl, END_OF_LOADS, end);
1819 
1820    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
1821 
1822    emit_linear_store(cl, RENDER_TARGET_0,
1823                      dst, dst_offset, stride, false, format);
1824 
1825    cl_emit(cl, END_OF_TILE_MARKER, end);
1826 
1827    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
1828 
1829    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
1830       branch.start = tile_list_start;
1831       branch.end = v3dv_cl_get_address(cl);
1832    }
1833 }
1834 
1835 static void
1836 emit_copy_buffer(struct v3dv_job *job,
1837                  struct v3dv_bo *dst,
1838                  struct v3dv_bo *src,
1839                  uint32_t dst_offset,
1840                  uint32_t src_offset,
1841                  struct framebuffer_data *framebuffer,
1842                  uint32_t format)
1843 {
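   /* Each RGBA8UI "pixel" of the linear framebuffer is 4 bytes, so a row of
    * the frame spans width * 4 consecutive bytes of the buffer.
    */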
1844    const uint32_t stride = job->frame_tiling.width * 4;
1845    emit_copy_buffer_per_tile_list(job, dst, src,
1846                                   dst_offset, src_offset,
1847                                   stride, format);
1848    emit_supertile_coordinates(job, framebuffer);
1849 }
1850 
1851 static void
1852 emit_copy_buffer_rcl(struct v3dv_job *job,
1853                      struct v3dv_bo *dst,
1854                      struct v3dv_bo *src,
1855                      uint32_t dst_offset,
1856                      uint32_t src_offset,
1857                      struct framebuffer_data *framebuffer,
1858                      uint32_t format)
1859 {
1860    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1861    v3dv_return_if_oom(NULL, job);
1862 
1863    emit_frame_setup(job, 0, NULL);
1864    emit_copy_buffer(job, dst, src, dst_offset, src_offset, framebuffer, format);
1865    cl_emit(rcl, END_OF_RENDERING, end);
1866 }
1867 
1868 /* Figure out a TLB size configuration for a number of pixels to process.
1869  * Beware that we can't "render" more than 4096x4096 pixels in a single job;
1870  * if the pixel count is larger than this, the caller might need to split
1871  * the job and call this function multiple times.
1872  */
1873 static void
1874 framebuffer_size_for_pixel_count(uint32_t num_pixels,
1875                                  uint32_t *width,
1876                                  uint32_t *height)
1877 {
1878    assert(num_pixels > 0);
1879 
1880    const uint32_t max_dim_pixels = 4096;
1881    const uint32_t max_pixels = max_dim_pixels * max_dim_pixels;
1882 
1883    uint32_t w, h;
1884    if (num_pixels > max_pixels) {
1885       w = max_dim_pixels;
1886       h = max_dim_pixels;
1887    } else {
1888       w = num_pixels;
1889       h = 1;
1890       while (w > max_dim_pixels || ((w % 2) == 0 && w > 2 * h)) {
1891          w >>= 1;
1892          h <<= 1;
1893       }
1894    }
1895    assert(w <= max_dim_pixels && h <= max_dim_pixels);
1896    assert(w * h <= num_pixels);
1897    assert(w > 0 && h > 0);
1898 
1899    *width = w;
1900    *height = h;
1901 }
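/* Informal example: for num_pixels = 8193 the loop above settles on w = 128,
 * h = 64, covering 8192 pixels; the remaining pixel is left for a follow-up
 * job, which is why copy_buffer() and fill_buffer() below call this in a
 * loop until no items remain.
 */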
1902 
1903 static struct v3dv_job *
1904 copy_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1905             struct v3dv_bo *dst,
1906             uint32_t dst_offset,
1907             struct v3dv_bo *src,
1908             uint32_t src_offset,
1909             const VkBufferCopy *region)
1910 {
1911    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
1912    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
1913 
1914    /* Select appropriate pixel format for the copy operation based on the
1915     * size to copy and the alignment of the source and destination offsets.
1916     */
1917    src_offset += region->srcOffset;
1918    dst_offset += region->dstOffset;
1919    uint32_t item_size = 4;
1920    while (item_size > 1 &&
1921           (src_offset % item_size != 0 || dst_offset % item_size != 0)) {
1922       item_size /= 2;
1923    }
1924 
1925    while (item_size > 1 && region->size % item_size != 0)
1926       item_size /= 2;
1927 
1928    assert(region->size % item_size == 0);
1929    uint32_t num_items = region->size / item_size;
1930    assert(num_items > 0);
1931 
1932    uint32_t format;
1933    VkFormat vk_format;
1934    switch (item_size) {
1935    case 4:
1936       format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
1937       vk_format = VK_FORMAT_R8G8B8A8_UINT;
1938       break;
1939    case 2:
1940       format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI;
1941       vk_format = VK_FORMAT_R8G8_UINT;
1942       break;
1943    default:
1944       format = V3D_OUTPUT_IMAGE_FORMAT_R8UI;
1945       vk_format = VK_FORMAT_R8_UINT;
1946       break;
1947    }
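   /* Informal example: with src_offset = 4, dst_offset = 2 and a 10-byte
    * region, the alignment loops above settle on item_size = 2 (dst_offset
    * is not 4-byte aligned), so the copy runs as num_items = 5 RG8UI
    * "pixels".
    */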
1948 
1949    struct v3dv_job *job = NULL;
1950    while (num_items > 0) {
1951       job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1952       if (!job)
1953          return NULL;
1954 
1955       uint32_t width, height;
1956       framebuffer_size_for_pixel_count(num_items, &width, &height);
1957 
1958       v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp, false);
1959 
1960       struct framebuffer_data framebuffer;
1961       setup_framebuffer_data(&framebuffer, vk_format, internal_type,
1962                              &job->frame_tiling);
1963 
1964       v3dv_job_emit_binning_flush(job);
1965 
1966       emit_copy_buffer_rcl(job, dst, src, dst_offset, src_offset,
1967                            &framebuffer, format);
1968 
1969       v3dv_cmd_buffer_finish_job(cmd_buffer);
1970 
1971       const uint32_t items_copied = width * height;
1972       const uint32_t bytes_copied = items_copied * item_size;
1973       num_items -= items_copied;
1974       src_offset += bytes_copied;
1975       dst_offset += bytes_copied;
1976    }
1977 
1978    return job;
1979 }
1980 
1981 void
1982 v3dv_CmdCopyBuffer(VkCommandBuffer commandBuffer,
1983                    VkBuffer srcBuffer,
1984                    VkBuffer dstBuffer,
1985                    uint32_t regionCount,
1986                    const VkBufferCopy *pRegions)
1987 {
1988    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1989    V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, srcBuffer);
1990    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1991 
1992    for (uint32_t i = 0; i < regionCount; i++) {
1993       copy_buffer(cmd_buffer,
1994                   dst_buffer->mem->bo, dst_buffer->mem_offset,
1995                   src_buffer->mem->bo, src_buffer->mem_offset,
1996                   &pRegions[i]);
1997    }
1998 }
1999 
2000 static void
2001 destroy_update_buffer_cb(VkDevice _device,
2002                          uint64_t pobj,
2003                          VkAllocationCallbacks *alloc)
2004 {
2005    V3DV_FROM_HANDLE(v3dv_device, device, _device);
2006    struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
2007    v3dv_bo_free(device, bo);
2008 }
2009 
2010 void
2011 v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2012                      VkBuffer dstBuffer,
2013                      VkDeviceSize dstOffset,
2014                      VkDeviceSize dataSize,
2015                      const void *pData)
2016 {
2017    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2018    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
2019 
2020    struct v3dv_bo *src_bo =
2021       v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
2022    if (!src_bo) {
2023       fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
2024       return;
2025    }
2026 
2027    bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
2028    if (!ok) {
2029       fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
           /* Don't leak the staging BO if we failed to map it */
           v3dv_bo_free(cmd_buffer->device, src_bo);
2030       return;
2031    }
2032 
2033    memcpy(src_bo->map, pData, dataSize);
2034 
2035    v3dv_bo_unmap(cmd_buffer->device, src_bo);
2036 
2037    VkBufferCopy region = {
2038       .srcOffset = 0,
2039       .dstOffset = dstOffset,
2040       .size = dataSize,
2041    };
2042    struct v3dv_job *copy_job =
2043       copy_buffer(cmd_buffer,
2044                   dst_buffer->mem->bo, dst_buffer->mem_offset,
2045                   src_bo, 0,
2046                   &region);
2047    if (!copy_job)
2048       return;
2049 
2050    v3dv_cmd_buffer_add_private_obj(
2051       cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
2052 }
2053 
2054 static void
2055 emit_fill_buffer_per_tile_list(struct v3dv_job *job,
2056                                struct v3dv_bo *bo,
2057                                uint32_t offset,
2058                                uint32_t stride)
2059 {
2060    struct v3dv_cl *cl = &job->indirect;
2061    v3dv_cl_ensure_space(cl, 200, 1);
2062    v3dv_return_if_oom(NULL, job);
2063 
2064    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
2065 
2066    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
2067 
2068    cl_emit(cl, END_OF_LOADS, end);
2069 
2070    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
2071 
2072    emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
2073                      V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);
2074 
2075    cl_emit(cl, END_OF_TILE_MARKER, end);
2076 
2077    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
2078 
2079    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
2080       branch.start = tile_list_start;
2081       branch.end = v3dv_cl_get_address(cl);
2082    }
2083 }
2084 
2085 static void
2086 emit_fill_buffer(struct v3dv_job *job,
2087                  struct v3dv_bo *bo,
2088                  uint32_t offset,
2089                  struct framebuffer_data *framebuffer)
2090 {
2091    const uint32_t stride = job->frame_tiling.width * 4;
2092    emit_fill_buffer_per_tile_list(job, bo, offset, stride);
2093    emit_supertile_coordinates(job, framebuffer);
2094 }
2095 
2096 static void
2097 emit_fill_buffer_rcl(struct v3dv_job *job,
2098                      struct v3dv_bo *bo,
2099                      uint32_t offset,
2100                      struct framebuffer_data *framebuffer,
2101                      uint32_t data)
2102 {
2103    const union v3dv_clear_value clear_value = {
2104        .color = { data, 0, 0, 0 },
2105    };
2106 
2107    const struct rcl_clear_info clear_info = {
2108       .clear_value = &clear_value,
2109       .image = NULL,
2110       .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
2111       .layer = 0,
2112       .level = 0,
2113    };
2114 
2115    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
2116    v3dv_return_if_oom(NULL, job);
2117 
2118    emit_frame_setup(job, 0, &clear_value);
2119    emit_fill_buffer(job, bo, offset, framebuffer);
2120    cl_emit(rcl, END_OF_RENDERING, end);
2121 }
2122 
2123 static void
2124 fill_buffer(struct v3dv_cmd_buffer *cmd_buffer,
2125             struct v3dv_bo *bo,
2126             uint32_t offset,
2127             uint32_t size,
2128             uint32_t data)
2129 {
2130    assert(size > 0 && size % 4 == 0);
2131    assert(offset + size <= bo->size);
2132 
2133    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
2134    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
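   /* Each 32-bit word of the fill data is written as one RGBA8UI "pixel",
    * so the number of items to produce is size / 4 (size is asserted to be
    * a multiple of 4 above).
    */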
2135    uint32_t num_items = size / 4;
2136 
2137    while (num_items > 0) {
2138       struct v3dv_job *job =
2139          v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
2140       if (!job)
2141          return;
2142 
2143       uint32_t width, height;
2144       framebuffer_size_for_pixel_count(num_items, &width, &height);
2145 
2146       v3dv_job_start_frame(job, width, height, 1, 1, internal_bpp, false);
2147 
2148       struct framebuffer_data framebuffer;
2149       setup_framebuffer_data(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
2150                              internal_type, &job->frame_tiling);
2151 
2152       v3dv_job_emit_binning_flush(job);
2153 
2154       emit_fill_buffer_rcl(job, bo, offset, &framebuffer, data);
2155 
2156       v3dv_cmd_buffer_finish_job(cmd_buffer);
2157 
2158       const uint32_t items_copied = width * height;
2159       const uint32_t bytes_copied = items_copied * 4;
2160       num_items -= items_copied;
2161       offset += bytes_copied;
2162    }
2163 }
2164 
2165 void
2166 v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
2167                    VkBuffer dstBuffer,
2168                    VkDeviceSize dstOffset,
2169                    VkDeviceSize size,
2170                    uint32_t data)
2171 {
2172    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2173    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
2174 
2175    struct v3dv_bo *bo = dst_buffer->mem->bo;
2176 
2177    /* From the Vulkan spec:
2178     *
2179     *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
2180     *    a multiple of 4, then the nearest smaller multiple is used."
2181     */
2182    if (size == VK_WHOLE_SIZE) {
2183       size = dst_buffer->size - dstOffset;
2184       size -= size % 4;
2185    }
2186 
2187    fill_buffer(cmd_buffer, bo, dstOffset, size, data);
2188 }
2189 
2190 /* Disable level 0 write, just write following mipmaps */
2191 #define V3D_TFU_IOA_DIMTW (1 << 0)
2192 #define V3D_TFU_IOA_FORMAT_SHIFT 3
2193 #define V3D_TFU_IOA_FORMAT_LINEARTILE 3
2194 #define V3D_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN 4
2195 #define V3D_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN 5
2196 #define V3D_TFU_IOA_FORMAT_UIF_NO_XOR 6
2197 #define V3D_TFU_IOA_FORMAT_UIF_XOR 7
2198 
2199 #define V3D_TFU_ICFG_NUMMM_SHIFT 5
2200 #define V3D_TFU_ICFG_TTYPE_SHIFT 9
2201 
2202 #define V3D_TFU_ICFG_OPAD_SHIFT 22
2203 
2204 #define V3D_TFU_ICFG_FORMAT_SHIFT 18
2205 #define V3D_TFU_ICFG_FORMAT_RASTER 0
2206 #define V3D_TFU_ICFG_FORMAT_SAND_128 1
2207 #define V3D_TFU_ICFG_FORMAT_SAND_256 2
2208 #define V3D_TFU_ICFG_FORMAT_LINEARTILE 11
2209 #define V3D_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
2210 #define V3D_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
2211 #define V3D_TFU_ICFG_FORMAT_UIF_NO_XOR 14
2212 #define V3D_TFU_ICFG_FORMAT_UIF_XOR 15
2213 
2214 /**
2215  * Returns true if the implementation supports the requested operation (even if
2216  * it failed to process it, for example, due to an out-of-memory error).
2217  */
2218 static bool
2219 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
2220                          struct v3dv_image *image,
2221                          struct v3dv_buffer *buffer,
2222                          const VkBufferImageCopy *region)
2223 {
2224    VkFormat vk_format = image->vk_format;
2225    const struct v3dv_format *format = image->format;
2226 
2227    /* Format must be supported for texturing */
2228    if (!v3dv_tfu_supports_tex_format(&cmd_buffer->device->devinfo,
2229                                      format->tex_type)) {
2230       return false;
2231    }
2232 
2233    /* Only color formats */
2234    if (vk_format_is_depth_or_stencil(vk_format))
2235       return false;
2236 
2237    /* Destination can't be raster format */
2238    const uint32_t mip_level = region->imageSubresource.mipLevel;
2239    if (image->slices[mip_level].tiling == VC5_TILING_RASTER)
2240       return false;
2241 
2242    /* Region must include full slice */
2243    const uint32_t offset_x = region->imageOffset.x;
2244    const uint32_t offset_y = region->imageOffset.y;
2245    if (offset_x != 0 || offset_y != 0)
2246       return false;
2247 
2248    uint32_t width, height;
2249    if (region->bufferRowLength == 0)
2250       width = region->imageExtent.width;
2251    else
2252       width = region->bufferRowLength;
2253 
2254    if (region->bufferImageHeight == 0)
2255       height = region->imageExtent.height;
2256    else
2257       height = region->bufferImageHeight;
2258 
2259    if (width != image->extent.width || height != image->extent.height)
2260       return false;
2261 
2262    const struct v3d_resource_slice *slice = &image->slices[mip_level];
2263 
2264    uint32_t num_layers;
2265    if (image->type != VK_IMAGE_TYPE_3D)
2266       num_layers = region->imageSubresource.layerCount;
2267    else
2268       num_layers = region->imageExtent.depth;
2269    assert(num_layers > 0);
2270 
2271    assert(image->mem && image->mem->bo);
2272    const struct v3dv_bo *dst_bo = image->mem->bo;
2273 
2274    assert(buffer->mem && buffer->mem->bo);
2275    const struct v3dv_bo *src_bo = buffer->mem->bo;
2276 
2277    /* Emit a TFU job per layer to copy */
2278    const uint32_t buffer_stride = width * image->cpp;
2279    for (int i = 0; i < num_layers; i++) {
2280       uint32_t layer = region->imageSubresource.baseArrayLayer + i;
2281 
2282       struct drm_v3d_submit_tfu tfu = {
2283          .ios = (height << 16) | width,
2284          .bo_handles = {
2285             dst_bo->handle,
2286             src_bo != dst_bo ? src_bo->handle : 0
2287          },
2288       };
2289 
2290       const uint32_t buffer_offset =
2291          buffer->mem_offset + region->bufferOffset +
2292          height * buffer_stride * i;
2293 
2294       const uint32_t src_offset = src_bo->offset + buffer_offset;
2295       tfu.iia |= src_offset;
2296       tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT;
2297       tfu.iis |= width;
2298 
2299       const uint32_t dst_offset =
2300          dst_bo->offset + v3dv_layer_offset(image, mip_level, layer);
2301       tfu.ioa |= dst_offset;
2302 
2303       tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
2304                   (slice->tiling - VC5_TILING_LINEARTILE)) <<
2305                    V3D_TFU_IOA_FORMAT_SHIFT;
2306       tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
2307 
2308       /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
2309        * OPAD field for the destination (how many extra UIF blocks beyond
2310        * those necessary to cover the height).
2311        */
2312       if (slice->tiling == VC5_TILING_UIF_NO_XOR ||
2313           slice->tiling == VC5_TILING_UIF_XOR) {
2314          uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp);
2315          uint32_t implicit_padded_height = align(height, uif_block_h);
2316          uint32_t icfg =
2317             (slice->padded_height - implicit_padded_height) / uif_block_h;
2318          tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
2319       }
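      /* Informal example (assuming v3d_utile_height() is 4 for a 32bpp
       * image): a UIF block covers 8 rows, so a 100-row level implicitly
       * pads to 104 rows; if the slice was allocated with padded_height of
       * 120, OPAD comes out as (120 - 104) / 8 = 2 extra UIF blocks.
       */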
2320 
2321       v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
2322    }
2323 
2324    return true;
2325 }
2326 
2327 static void
2328 emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
2329                                         struct framebuffer_data *framebuffer,
2330                                         struct v3dv_image *image,
2331                                         struct v3dv_buffer *buffer,
2332                                         uint32_t layer,
2333                                         const VkBufferImageCopy *region)
2334 {
2335    struct v3dv_cl *cl = &job->indirect;
2336    v3dv_cl_ensure_space(cl, 200, 1);
2337    v3dv_return_if_oom(NULL, job);
2338 
2339    struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
2340 
2341    cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
2342 
2343    const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
2344    assert((image->type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) ||
2345           layer < image->extent.depth);
2346 
2347    /* Load TLB from buffer */
2348    uint32_t width, height;
2349    if (region->bufferRowLength == 0)
2350       width = region->imageExtent.width;
2351    else
2352       width = region->bufferRowLength;
2353 
2354    if (region->bufferImageHeight == 0)
2355       height = region->imageExtent.height;
2356    else
2357       height = region->bufferImageHeight;
2358 
2359    /* Handle copy to compressed format using a compatible format */
2360    width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format));
2361    height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format));
2362 
2363    uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
2364                   1 : image->cpp;
2365    uint32_t buffer_stride = width * cpp;
2366    uint32_t buffer_offset =
2367       buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;
2368 
2369    uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
2370                                        false, false, true);
2371 
2372    emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
2373                     buffer_offset, buffer_stride, format);
2374 
2375    /* Because we can't do raster loads/stores of Z/S formats we need to
2376     * use a color tile buffer with a compatible RGBA color format instead.
2377     * However, when we are uploading a single aspect to a combined
2378     * depth/stencil image we have the problem that our tile buffer stores don't
2379     * allow us to mask out the other aspect, so we always write all four RGBA
2380     * channels to the image and we end up overwriting that other aspect with
2381     * undefined values. To work around that, we first load the aspect we are
2382     * not copying from the image memory into a proper Z/S tile buffer. Then we
2383     * do our store from the color buffer for the aspect we are copying, and
2384     * after that, we do another store from the Z/S tile buffer to restore the
2385     * other aspect to its original value.
2386     */
2387    if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
2388       if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2389          emit_image_load(cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
2390                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
2391                          false, false);
2392       } else {
2393          assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
2394          emit_image_load(cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
2395                          imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
2396                          false, false);
2397       }
2398    }
2399 
2400    cl_emit(cl, END_OF_LOADS, end);
2401 
2402    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
2403 
2404    /* Store TLB to image */
2405    emit_image_store(cl, framebuffer, image, imgrsc->aspectMask,
2406                     imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
2407                     false, true, false);
2408 
2409    if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
2410       if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
2411          emit_image_store(cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
2412                           imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
2413                           false, false, false);
2414       } else {
2415          assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
2416          emit_image_store(cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
2417                           imgrsc->baseArrayLayer + layer, imgrsc->mipLevel,
2418                           false, false, false);
2419       }
2420    }
2421 
2422    cl_emit(cl, END_OF_TILE_MARKER, end);
2423 
2424    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
2425 
2426    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
2427       branch.start = tile_list_start;
2428       branch.end = v3dv_cl_get_address(cl);
2429    }
2430 }
2431 
2432 static void
2433 emit_copy_buffer_to_layer(struct v3dv_job *job,
2434                           struct v3dv_image *image,
2435                           struct v3dv_buffer *buffer,
2436                           struct framebuffer_data *framebuffer,
2437                           uint32_t layer,
2438                           const VkBufferImageCopy *region)
2439 {
2440    emit_frame_setup(job, layer, NULL);
2441    emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
2442                                            layer, region);
2443    emit_supertile_coordinates(job, framebuffer);
2444 }
2445 
2446 static void
2447 emit_copy_buffer_to_image_rcl(struct v3dv_job *job,
2448                               struct v3dv_image *image,
2449                               struct v3dv_buffer *buffer,
2450                               struct framebuffer_data *framebuffer,
2451                               const VkBufferImageCopy *region)
2452 {
2453    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
2454    v3dv_return_if_oom(NULL, job);
2455 
2456    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
2457       emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region);
2458    cl_emit(rcl, END_OF_RENDERING, end);
2459 }
2460 
2461 /**
2462  * Returns true if the implementation supports the requested operation (even if
2463  * it failed to process it, for example, due to an out-of-memory error).
2464  */
2465 static bool
2466 copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
2467                          struct v3dv_image *image,
2468                          struct v3dv_buffer *buffer,
2469                          const VkBufferImageCopy *region)
2470 {
2471    VkFormat fb_format;
2472    if (!can_use_tlb(image, &region->imageOffset, &fb_format))
2473       return false;
2474 
2475    uint32_t internal_type, internal_bpp;
2476    get_internal_type_bpp_for_image_aspects(fb_format,
2477                                            region->imageSubresource.aspectMask,
2478                                            &internal_type, &internal_bpp);
2479 
2480    uint32_t num_layers;
2481    if (image->type != VK_IMAGE_TYPE_3D)
2482       num_layers = region->imageSubresource.layerCount;
2483    else
2484       num_layers = region->imageExtent.depth;
2485    assert(num_layers > 0);
2486 
2487    struct v3dv_job *job =
2488       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
2489    if (!job)
2490       return true;
2491 
2492    /* Handle copy to compressed format using a compatible format */
2493    const uint32_t block_w = vk_format_get_blockwidth(image->vk_format);
2494    const uint32_t block_h = vk_format_get_blockheight(image->vk_format);
2495    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
2496    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
2497 
2498    v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, false);
2499 
2500    struct framebuffer_data framebuffer;
2501    setup_framebuffer_data(&framebuffer, fb_format, internal_type,
2502                           &job->frame_tiling);
2503 
2504    v3dv_job_emit_binning_flush(job);
2505    emit_copy_buffer_to_image_rcl(job, image, buffer, &framebuffer, region);
2506 
2507    v3dv_cmd_buffer_finish_job(cmd_buffer);
2508 
2509    return true;
2510 }
2511 
2512 static bool
2513 create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
2514                                struct v3dv_image *image,
2515                                struct v3dv_buffer *buffer,
2516                                const VkBufferImageCopy *region)
2517 {
2518    if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
2519       return true;
2520    if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
2521       return true;
2522    return false;
2523 }

2524 /**
2525  * Returns true if the implementation supports the requested operation (even if
2526  * it failed to process it, for example, due to an out-of-memory error).
2527  */
2528 static bool
2529 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2530                           struct v3dv_image *image,
2531                           struct v3dv_buffer *buffer,
2532                           const VkBufferImageCopy *region)
2533 {
2534    bool handled = false;
2535 
2536    /* Generally, the bpp of the data in the buffer matches that of the
2537     * destination image. The exception is the case where we are uploading
2538     * stencil (8bpp) to a combined d24s8 image (32bpp).
2539     */
2540    uint32_t buffer_bpp = image->cpp;
2541 
2542    VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
2543 
2544    /* We are about to upload the buffer data to an image so we can then
2545     * blit that to our destination region. Because we are going to implement
2546     * the copy as a blit, we want our blit source and destination formats to be
2547     * the same (to avoid any format conversions), so we choose a canonical
2548     * format that matches the destination image bpp.
2549     */
2550    VkColorComponentFlags cmask = 0; /* Write all components */
2551    VkFormat src_format;
2552    VkFormat dst_format;
2553    switch (buffer_bpp) {
2554    case 16:
2555       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2556       src_format = VK_FORMAT_R32G32B32A32_UINT;
2557       dst_format = src_format;
2558       break;
2559    case 8:
2560       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2561       src_format = VK_FORMAT_R16G16B16A16_UINT;
2562       dst_format = src_format;
2563       break;
2564    case 4:
2565       switch (aspect) {
2566       case VK_IMAGE_ASPECT_COLOR_BIT:
2567          src_format = VK_FORMAT_R8G8B8A8_UINT;
2568          dst_format = src_format;
2569          break;
2570       case VK_IMAGE_ASPECT_DEPTH_BIT:
2571          assert(image->vk_format == VK_FORMAT_D32_SFLOAT ||
2572                 image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT ||
2573                 image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32);
2574          if (image->tiling != VK_IMAGE_TILING_LINEAR) {
2575             src_format = image->vk_format;
2576          } else {
2577             src_format = VK_FORMAT_R8G8B8A8_UINT;
2578             aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2579             if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
2580                cmask = VK_COLOR_COMPONENT_R_BIT |
2581                        VK_COLOR_COMPONENT_G_BIT |
2582                        VK_COLOR_COMPONENT_B_BIT;
2583             }
2584          }
2585          dst_format = src_format;
2586          break;
2587       case VK_IMAGE_ASPECT_STENCIL_BIT:
2588          /* Since we don't support separate stencil, this is always a stencil
2589           * copy to a combined depth/stencil image. Because we don't support
2590           * separate stencil images, we upload the buffer data to a compatible
2591           * color R8UI image, and implement the blit as a compatible color
2592           * blit to an RGBA8UI destination masking out writes to components
2593           * GBA (which map to the D24 component of a S8D24 image).
2594           */
2595          assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT);
2596          buffer_bpp = 1;
2597          src_format = VK_FORMAT_R8_UINT;
2598          dst_format = VK_FORMAT_R8G8B8A8_UINT;
2599          cmask = VK_COLOR_COMPONENT_R_BIT;
2600          aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2601          break;
2602       default:
2603          unreachable("unsupported aspect");
2604          return handled;
2605       };
2606       break;
2607    case 2:
2608       aspect = VK_IMAGE_ASPECT_COLOR_BIT;
2609       src_format = VK_FORMAT_R16_UINT;
2610       dst_format = src_format;
2611       break;
2612    case 1:
2613       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
2614       src_format = VK_FORMAT_R8_UINT;
2615       dst_format = src_format;
2616       break;
2617    default:
2618       unreachable("unsupported bit-size");
2619       return handled;
2620    }
2621 
2622    /* We should be able to handle the blit if we reached here */
2623    handled = true;
2624 
2625    /* Obtain the 2D buffer region spec */
2626    uint32_t buf_width, buf_height;
2627    if (region->bufferRowLength == 0)
2628       buf_width = region->imageExtent.width;
2629    else
2630       buf_width = region->bufferRowLength;
2631 
2632    if (region->bufferImageHeight == 0)
2633       buf_height = region->imageExtent.height;
2634    else
2635       buf_height = region->bufferImageHeight;
2636 
2637    /* If the image is compressed, the bpp refers to blocks, not pixels */
2638    uint32_t block_width = vk_format_get_blockwidth(image->vk_format);
2639    uint32_t block_height = vk_format_get_blockheight(image->vk_format);
2640    buf_width = buf_width / block_width;
2641    buf_height = buf_height / block_height;
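   /* Note: for compressed formats this converts the region from texels to
    * blocks. E.g., assuming a 4x4 block format such as ETC2, a 64x32 texel
    * region becomes a 16x8 block region in the staging image created below.
    */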
2642 
2643    /* Compute layers to copy */
2644    uint32_t num_layers;
2645    if (image->type != VK_IMAGE_TYPE_3D)
2646       num_layers = region->imageSubresource.layerCount;
2647    else
2648       num_layers = region->imageExtent.depth;
2649    assert(num_layers > 0);
2650 
2651    struct v3dv_device *device = cmd_buffer->device;
2652    VkDevice _device = v3dv_device_to_handle(device);
2653    for (uint32_t i = 0; i < num_layers; i++) {
2654       /* Create the source blit image from the source buffer.
2655        *
2656        * We can't texture from a linear image, so we can't just set up a blit
2657        * straight from the buffer contents. Instead, we need to upload the
2658        * buffer to a tiled image, and then copy that image to the selected
2659        * region of the destination.
2660        *
2661        * FIXME: we could do better than this if we used a blit shader that has
2662        * a UBO (for the buffer) as input instead of a texture. Then we would
2663        * have to do some arithmetic in the shader to identify the offset into
2664        * the UBO that we need to load for each pixel in the destination image
2665        * (we would need to support all the possible copy formats we have above).
2666        */
2667       VkImageCreateInfo image_info = {
2668          .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2669          .imageType = VK_IMAGE_TYPE_2D,
2670          .format = src_format,
2671          .extent = { buf_width, buf_height, 1 },
2672          .mipLevels = 1,
2673          .arrayLayers = 1,
2674          .samples = VK_SAMPLE_COUNT_1_BIT,
2675          .tiling = VK_IMAGE_TILING_OPTIMAL,
2676          .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2677                   VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2678          .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2679          .queueFamilyIndexCount = 0,
2680          .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2681       };
2682 
2683       VkImage buffer_image;
2684       VkResult result =
2685          v3dv_CreateImage(_device, &image_info, &device->alloc, &buffer_image);
2686       if (result != VK_SUCCESS)
2687          return handled;
2688 
2689       v3dv_cmd_buffer_add_private_obj(
2690          cmd_buffer, (uintptr_t)buffer_image,
2691          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
2692 
2693       /* Allocate and bind memory for the image */
2694       VkDeviceMemory mem;
2695       VkMemoryRequirements reqs;
2696       v3dv_GetImageMemoryRequirements(_device, buffer_image, &reqs);
2697       VkMemoryAllocateInfo alloc_info = {
2698          .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2699          .allocationSize = reqs.size,
2700          .memoryTypeIndex = 0,
2701       };
2702       result = v3dv_AllocateMemory(_device, &alloc_info, &device->alloc, &mem);
2703       if (result != VK_SUCCESS)
2704          return handled;
2705 
2706       v3dv_cmd_buffer_add_private_obj(
2707          cmd_buffer, (uintptr_t)mem,
2708          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2709 
2710       result = v3dv_BindImageMemory(_device, buffer_image, mem, 0);
2711       if (result != VK_SUCCESS)
2712          return handled;
2713 
2714       /* Upload buffer contents for the selected layer */
2715       VkDeviceSize buffer_offset =
2716          region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
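      /* Each layer in the buffer starts right after the previous one, i.e.
       * layer i begins at bufferOffset + i * rowLength * imageHeight * bpp,
       * where rowLength and imageHeight are the (block-adjusted) values
       * computed above.
       */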
2717       const VkBufferImageCopy buffer_image_copy = {
2718          .bufferOffset = buffer_offset,
2719          .bufferRowLength = region->bufferRowLength / block_width,
2720          .bufferImageHeight = region->bufferImageHeight / block_height,
2721          .imageSubresource = {
2722             .aspectMask = aspect,
2723             .mipLevel = 0,
2724             .baseArrayLayer = 0,
2725             .layerCount = 1,
2726          },
2727          .imageOffset = { 0, 0, 0 },
2728          .imageExtent = { buf_width, buf_height, 1 }
2729       };
2730       handled =
2731          create_tiled_image_from_buffer(cmd_buffer,
2732                                         v3dv_image_from_handle(buffer_image),
2733                                         buffer, &buffer_image_copy);
2734       if (!handled) {
2735          /* This is unexpected: we should have set up the upload so that it
2736           * can be handled by a TFU or TLB copy.
2737           */
2738          unreachable("Unable to copy buffer to image through TLB");
2739          return false;
2740       }
2741 
2742       /* Blit-copy the requested image extent from the buffer image to the
2743        * destination image.
2744        *
2745        * Since we are copying, the blit must use the same format on the
2746        * destination and source images to avoid format conversions. The
2747        * only exception is copying stencil, which we upload to a R8UI source
2748        * image, but that we need to blit to a S8D24 destination (the only
2749        * stencil format we support).
2750        */
2751       const VkImageBlit blit_region = {
2752          .srcSubresource = {
2753             .aspectMask = aspect,
2754             .mipLevel = 0,
2755             .baseArrayLayer = 0,
2756             .layerCount = 1,
2757          },
2758          .srcOffsets = {
2759             { 0, 0, 0 },
2760             { region->imageExtent.width, region->imageExtent.height, 1 },
2761          },
2762          .dstSubresource = {
2763             .aspectMask = aspect,
2764             .mipLevel = region->imageSubresource.mipLevel,
2765             .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
2766             .layerCount = 1,
2767          },
2768          .dstOffsets = {
2769             {
2770                DIV_ROUND_UP(region->imageOffset.x, block_width),
2771                DIV_ROUND_UP(region->imageOffset.y, block_height),
2772                region->imageOffset.z + i,
2773             },
2774             {
2775                DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
2776                             block_width),
2777                DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
2778                             block_height),
2779                region->imageOffset.z + i + 1,
2780             },
2781          },
2782       };
2783 
2784       handled = blit_shader(cmd_buffer,
2785                             image, dst_format,
2786                             v3dv_image_from_handle(buffer_image), src_format,
2787                             cmask, NULL,
2788                             &blit_region, VK_FILTER_NEAREST, true);
2789       if (!handled) {
2790          /* This is unexpected, we should have a supported blit spec */
2791          unreachable("Unable to blit buffer to destination image");
2792          return false;
2793       }
2794    }
2795 
2796    assert(handled);
2797    return true;
2798 }
2799 
2800 /**
2801  * Returns true if the implementation supports the requested operation (even if
2802  * it failed to process it, for example, due to an out-of-memory error).
2803  */
2804 static bool
2805 copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
2806                          struct v3dv_image *image,
2807                          struct v3dv_buffer *buffer,
2808                          const VkBufferImageCopy *region)
2809 {
2810    /* FIXME */
2811    if (vk_format_is_depth_or_stencil(image->vk_format))
2812       return false;
2813 
2814    if (vk_format_is_compressed(image->vk_format))
2815       return false;
2816 
2817    if (image->tiling == VK_IMAGE_TILING_LINEAR)
2818       return false;
2819 
2820    uint32_t buffer_width, buffer_height;
2821    if (region->bufferRowLength == 0)
2822       buffer_width = region->imageExtent.width;
2823    else
2824       buffer_width = region->bufferRowLength;
2825 
2826    if (region->bufferImageHeight == 0)
2827       buffer_height = region->imageExtent.height;
2828    else
2829       buffer_height = region->bufferImageHeight;
2830 
2831    uint32_t buffer_stride = buffer_width * image->cpp;
2832    uint32_t buffer_layer_stride = buffer_stride * buffer_height;
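   /* E.g., a 100x50 RGBA8 copy with bufferRowLength = 128 and
    * bufferImageHeight = 0 gives buffer_stride = 128 * 4 = 512 bytes and
    * buffer_layer_stride = 512 * 50 = 25600 bytes per layer.
    */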
2833 
2834    uint32_t num_layers;
2835    if (image->type != VK_IMAGE_TYPE_3D)
2836       num_layers = region->imageSubresource.layerCount;
2837    else
2838       num_layers = region->imageExtent.depth;
2839    assert(num_layers > 0);
2840 
2841    struct v3dv_job *job =
2842       v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
2843                                      V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
2844                                      cmd_buffer, -1);
2845    if (!job)
2846       return true;
2847 
2848    job->cpu.copy_buffer_to_image.image = image;
2849    job->cpu.copy_buffer_to_image.buffer = buffer;
2850    job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
2851    job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
2852    job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
2853    job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
2854    job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
2855    job->cpu.copy_buffer_to_image.mip_level =
2856       region->imageSubresource.mipLevel;
2857    job->cpu.copy_buffer_to_image.base_layer =
2858       region->imageSubresource.baseArrayLayer;
2859    job->cpu.copy_buffer_to_image.layer_count = num_layers;
2860 
2861    list_addtail(&job->list_link, &cmd_buffer->jobs);
2862 
2863    return true;
2864 }
2865 
2866 void
2867 v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
2868                           VkBuffer srcBuffer,
2869                           VkImage dstImage,
2870                           VkImageLayout dstImageLayout,
2871                           uint32_t regionCount,
2872                           const VkBufferImageCopy *pRegions)
2873 {
2874    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2875    V3DV_FROM_HANDLE(v3dv_buffer, buffer, srcBuffer);
2876    V3DV_FROM_HANDLE(v3dv_image, image, dstImage);
2877 
2878    assert(image->samples == VK_SAMPLE_COUNT_1_BIT);
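   /* Try the copy paths roughly from most restrictive/cheapest to most
    * general: the TFU engine first, then a TLB store job, then a deferred
    * CPU copy, and finally the blit-shader fallback.
    */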
2879 
2880    for (uint32_t i = 0; i < regionCount; i++) {
2881       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &pRegions[i]))
2882          continue;
2883       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[i]))
2884          continue;
2885       if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[i]))
2886          continue;
2887       if (copy_buffer_to_image_blit(cmd_buffer, image, buffer, &pRegions[i]))
2888          continue;
2889       unreachable("Unsupported buffer to image copy.");
2890    }
2891 }
2892 
2893 static void
2894 emit_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
2895              struct v3dv_image *dst,
2896              uint32_t dst_mip_level,
2897              uint32_t dst_layer,
2898              struct v3dv_image *src,
2899              uint32_t src_mip_level,
2900              uint32_t src_layer,
2901              uint32_t width,
2902              uint32_t height)
2903 {
2904    const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level];
2905    const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level];
2906 
2907    assert(dst->mem && dst->mem->bo);
2908    const struct v3dv_bo *dst_bo = dst->mem->bo;
2909 
2910    assert(src->mem && src->mem->bo);
2911    const struct v3dv_bo *src_bo = src->mem->bo;
2912 
2913    struct drm_v3d_submit_tfu tfu = {
2914       .ios = (height << 16) | width,
2915       .bo_handles = {
2916          dst_bo->handle,
2917          src != dst ? src_bo->handle : 0
2918       },
2919    };
2920 
2921    const uint32_t src_offset =
2922       src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer);
2923    tfu.iia |= src_offset;
2924 
2925    uint32_t icfg;
2926    if (src_slice->tiling == VC5_TILING_RASTER) {
2927       icfg = V3D_TFU_ICFG_FORMAT_RASTER;
2928    } else {
2929       icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE +
2930              (src_slice->tiling - VC5_TILING_LINEARTILE);
2931    }
2932    tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT;
2933 
2934    const uint32_t dst_offset =
2935       dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer);
2936    tfu.ioa |= dst_offset;
2937 
2938    tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE +
2939                (dst_slice->tiling - VC5_TILING_LINEARTILE)) <<
2940                 V3D_TFU_IOA_FORMAT_SHIFT;
2941    tfu.icfg |= dst->format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT;
2942 
2943    switch (src_slice->tiling) {
2944    case VC5_TILING_UIF_NO_XOR:
2945    case VC5_TILING_UIF_XOR:
2946       tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp));
2947       break;
2948    case VC5_TILING_RASTER:
2949       tfu.iis |= src_slice->stride / src->cpp;
2950       break;
2951    default:
2952       break;
2953    }
2954 
2955    /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
2956     * OPAD field for the destination (how many extra UIF blocks beyond
2957     * those necessary to cover the height).
2958     */
2959    if (dst_slice->tiling == VC5_TILING_UIF_NO_XOR ||
2960        dst_slice->tiling == VC5_TILING_UIF_XOR) {
2961       uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp);
2962       uint32_t implicit_padded_height = align(height, uif_block_h);
2963       uint32_t icfg =
2964          (dst_slice->padded_height - implicit_padded_height) / uif_block_h;
2965       tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT;
2966    }
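   /* E.g., assuming a 32bpp destination (4-row utiles, so 8-row UIF blocks)
    * and height = 100: implicit_padded_height = align(100, 8) = 104, and if
    * the slice was laid out with padded_height = 112 we would get
    * OPAD = (112 - 104) / 8 = 1.
    */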
2967 
2968    v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
2969 }
2970 
2971 static void
2972 compute_blit_3d_layers(const VkOffset3D *offsets,
2973                        uint32_t *min_layer, uint32_t *max_layer,
2974                        bool *mirror_z);
2975 
2976 /**
2977  * Returns true if the implementation supports the requested operation (even if
2978  * it failed to process it, for example, due to an out-of-memory error).
2979  */
2980 static bool
2981 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
2982          struct v3dv_image *dst,
2983          struct v3dv_image *src,
2984          const VkImageBlit *region,
2985          VkFilter filter)
2986 {
2987    /* FIXME? The v3d driver seems to ignore filtering completely! */
2988    if (filter != VK_FILTER_NEAREST)
2989       return false;
2990 
2991    /* Format must match */
2992    if (src->vk_format != dst->vk_format)
2993       return false;
2994 
2995    VkFormat vk_format = dst->vk_format;
2996    const struct v3dv_format *format = dst->format;
2997 
2998    /* Format must be supported for texturing */
2999    if (!v3dv_tfu_supports_tex_format(&cmd_buffer->device->devinfo,
3000                                      format->tex_type)) {
3001       return false;
3002    }
3003 
3004    /* Only color formats */
3005    if (vk_format_is_depth_or_stencil(vk_format))
3006       return false;
3007 
3008 #if 0
3009    /* FIXME: Only 2D images? */
3010    if (dst->type == VK_IMAGE_TYPE_2D || src->type == VK_IMAGE_TYPE_2D)
3011       return false;
3012 #endif
3013 
3014    /* Destination can't be raster format */
3015    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
3016    if (dst->slices[dst_mip_level].tiling == VC5_TILING_RASTER)
3017       return false;
3018 
3019    /* Source region must start at (0,0) */
3020    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
3021       return false;
3022 
3023    /* Destination image must be complete */
3024    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
3025       return false;
3026 
3027    const uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level);
3028    const uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level);
3029    if (region->dstOffsets[1].x < dst_width - 1 ||
3030        region->dstOffsets[1].y < dst_height - 1) {
3031       return false;
3032    }
3033 
3034    /* No scaling */
3035    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
3036        region->srcOffsets[1].y != region->dstOffsets[1].y) {
3037       return false;
3038    }
3039 
3040    if (dst->type == VK_IMAGE_TYPE_3D &&
3041        region->srcOffsets[1].z != region->dstOffsets[1].z) {
3042       return false;
3043    }
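   /* At this point we know the copy starts at (0, 0), covers the whole
    * destination level, is not scaled or mirrored in X/Y, uses the same color
    * format on both ends and writes to a tiled (non-raster) destination, which
    * is what the TFU can handle; below we only need to reject Z mirroring for
    * 3D images.
    */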
3044 
3045    /* Emit a TFU job for each layer to blit */
3046    assert(region->dstSubresource.layerCount ==
3047           region->srcSubresource.layerCount);
3048 
3049    uint32_t min_dst_layer;
3050    uint32_t max_dst_layer;
3051    bool dst_mirror_z = false;
3052    if (dst->type == VK_IMAGE_TYPE_3D) {
3053       compute_blit_3d_layers(region->dstOffsets,
3054                              &min_dst_layer, &max_dst_layer,
3055                              &dst_mirror_z);
3056 
3057       /* TFU can only do exact copies, so we can't handle mirroring. This
3058        * checks mirroring in Z for 3D images; XY mirroring is already handled
3059        * by the earlier checks.
3060        */
3061       if (dst_mirror_z)
3062          return false;
3063    }
3064 
3065    uint32_t min_src_layer;
3066    uint32_t max_src_layer;
3067    bool src_mirror_z = false;
3068    if (src->type == VK_IMAGE_TYPE_3D) {
3069       compute_blit_3d_layers(region->srcOffsets,
3070                              &min_src_layer, &max_src_layer,
3071                              &src_mirror_z);
3072 
3073       if (src_mirror_z)
3074          return false;
3075 
3076       if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
3077          return false;
3078    }
3079 
3080    const uint32_t layer_count = dst->type != VK_IMAGE_TYPE_3D ?
3081       region->dstSubresource.layerCount :
3082       max_dst_layer - min_dst_layer;
3083    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
3084 
3085    for (uint32_t i = 0; i < layer_count; i++) {
3086       emit_tfu_job(cmd_buffer,
3087                    dst, dst_mip_level, region->dstSubresource.baseArrayLayer + i,
3088                    src, src_mip_level, region->srcSubresource.baseArrayLayer + i,
3089                    dst_width, dst_height);
3090    }
3091 
3092    return true;
3093 }
3094 
3095 static bool
3096 format_needs_software_int_clamp(VkFormat format)
3097 {
3098    switch (format) {
3099       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3100       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
3101       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3102       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
3103          return true;
3104       default:
3105          return false;
3106    };
3107 }
3108 
3109 static void
3110 get_blit_pipeline_cache_key(VkFormat dst_format,
3111                             VkFormat src_format,
3112                             VkColorComponentFlags cmask,
3113                             VkSampleCountFlagBits dst_samples,
3114                             VkSampleCountFlagBits src_samples,
3115                             uint8_t *key)
3116 {
3117    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
3118 
3119    uint32_t *p = (uint32_t *) key;
3120 
3121    *p = dst_format;
3122    p++;
3123 
3124    /* Generally, when blitting from a larger format to a smaller format
3125     * the hardware takes care of clamping the source to the RT range.
3126     * Specifically, for integer formats, this is done by using
3127     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup. However, this
3128     * clamps to the bit-size of the render type, and some formats, such as
3129     * rgb10a2_uint, have a 16-bit type, so it won't do what we need and we
3130     * have to clamp in software. In these cases, we need to amend the blit
3131     * shader with clamp code that depends on both the src and dst formats, so
3132     * we need the src format to be part of the key.
3133     */
3134    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
3135    p++;
3136 
3137    *p = cmask;
3138    p++;
3139 
3140    *p = (dst_samples << 8) | src_samples;
3141    p++;
3142 
3143    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
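   /* The resulting 4-word (16-byte) key layout is: word 0 = dst format,
    * word 1 = src format (0 unless software clamping is required),
    * word 2 = color write mask, word 3 = (dst_samples << 8) | src_samples.
    */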
3144 }
3145 
3146 static bool
3147 create_blit_pipeline_layout(struct v3dv_device *device,
3148                             VkDescriptorSetLayout *descriptor_set_layout,
3149                             VkPipelineLayout *pipeline_layout)
3150 {
3151    VkResult result;
3152 
3153    if (*descriptor_set_layout == 0) {
3154       VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
3155          .binding = 0,
3156          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
3157          .descriptorCount = 1,
3158          .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
3159       };
3160       VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
3161          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
3162          .bindingCount = 1,
3163          .pBindings = &descriptor_set_layout_binding,
3164       };
3165       result =
3166          v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
3167                                         &descriptor_set_layout_info,
3168                                         &device->alloc,
3169                                         descriptor_set_layout);
3170       if (result != VK_SUCCESS)
3171          return false;
3172    }
3173 
3174    assert(*pipeline_layout == 0);
3175    VkPipelineLayoutCreateInfo pipeline_layout_info = {
3176       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
3177       .setLayoutCount = 1,
3178       .pSetLayouts = descriptor_set_layout,
3179       .pushConstantRangeCount = 1,
3180       .pPushConstantRanges =
3181          &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
3182    };
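   /* The 20-byte vertex-stage push constant range holds the blit source box
    * (four 32-bit values at offset 0) plus the source Z coordinate at offset
    * 16, matching the load_push_constant intrinsics emitted by
    * gen_tex_coords().
    */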
3183 
3184    result =
3185       v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
3186                                 &pipeline_layout_info,
3187                                 &device->alloc,
3188                                 pipeline_layout);
3189    return result == VK_SUCCESS;
3190 }
3191 
3192 static bool
3193 create_blit_render_pass(struct v3dv_device *device,
3194                         VkFormat dst_format,
3195                         VkFormat src_format,
3196                         VkRenderPass *pass_load,
3197                         VkRenderPass *pass_no_load)
3198 {
3199    const bool is_color_blit = vk_format_is_color(dst_format);
3200 
3201    /* Attachment load operation is specified below */
3202    VkAttachmentDescription att = {
3203       .format = dst_format,
3204       .samples = VK_SAMPLE_COUNT_1_BIT,
3205       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3206       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3207       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3208    };
3209 
3210    VkAttachmentReference att_ref = {
3211       .attachment = 0,
3212       .layout = VK_IMAGE_LAYOUT_GENERAL,
3213    };
3214 
3215    VkSubpassDescription subpass = {
3216       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3217       .inputAttachmentCount = 0,
3218       .colorAttachmentCount = is_color_blit ? 1 : 0,
3219       .pColorAttachments = is_color_blit ? &att_ref : NULL,
3220       .pResolveAttachments = NULL,
3221       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3222       .preserveAttachmentCount = 0,
3223       .pPreserveAttachments = NULL,
3224    };
3225 
3226    VkRenderPassCreateInfo info = {
3227       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
3228       .attachmentCount = 1,
3229       .pAttachments = &att,
3230       .subpassCount = 1,
3231       .pSubpasses = &subpass,
3232       .dependencyCount = 0,
3233       .pDependencies = NULL,
3234    };
3235 
3236    VkResult result;
3237    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3238    result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
3239                                   &info, &device->alloc, pass_load);
3240    if (result != VK_SUCCESS)
3241       return false;
3242 
3243    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3244    result = v3dv_CreateRenderPass(v3dv_device_to_handle(device),
3245                                   &info, &device->alloc, pass_no_load);
3246    return result == VK_SUCCESS;
3247 }
3248 
3249 static nir_ssa_def *
3250 gen_rect_vertices(nir_builder *b)
3251 {
3252    nir_intrinsic_instr *vertex_id =
3253       nir_intrinsic_instr_create(b->shader,
3254                                  nir_intrinsic_load_vertex_id);
3255    nir_ssa_dest_init(&vertex_id->instr, &vertex_id->dest, 1, 32, "vertexid");
3256    nir_builder_instr_insert(b, &vertex_id->instr);
3257 
3258 
3259    /* vertex 0: -1.0, -1.0
3260     * vertex 1: -1.0,  1.0
3261     * vertex 2:  1.0, -1.0
3262     * vertex 3:  1.0,  1.0
3263     *
3264     * so:
3265     *
3266     * channel 0 is vertex_id < 2 ? -1.0 :  1.0
3267     * channel 1 is vertex id & 1 ?  1.0 : -1.0
3268     */
3269 
3270    nir_ssa_def *one = nir_imm_int(b, 1);
3271    nir_ssa_def *c0cmp = nir_ilt(b, &vertex_id->dest.ssa, nir_imm_int(b, 2));
3272    nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, &vertex_id->dest.ssa, one), one);
3273 
3274    nir_ssa_def *comp[4];
3275    comp[0] = nir_bcsel(b, c0cmp,
3276                        nir_imm_float(b, -1.0f),
3277                        nir_imm_float(b, 1.0f));
3278 
3279    comp[1] = nir_bcsel(b, c1cmp,
3280                        nir_imm_float(b, 1.0f),
3281                        nir_imm_float(b, -1.0f));
3282    comp[2] = nir_imm_float(b, 0.0f);
3283    comp[3] = nir_imm_float(b, 1.0f);
3284    return nir_vec(b, comp, 4);
3285 }
3286 
3287 static nir_ssa_def *
3288 gen_tex_coords(nir_builder *b)
3289 {
3290    nir_intrinsic_instr *tex_box =
3291       nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
3292    tex_box->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
3293    nir_intrinsic_set_base(tex_box, 0);
3294    nir_intrinsic_set_range(tex_box, 16);
3295    tex_box->num_components = 4;
3296    nir_ssa_dest_init(&tex_box->instr, &tex_box->dest, 4, 32, "tex_box");
3297    nir_builder_instr_insert(b, &tex_box->instr);
3298 
3299    nir_intrinsic_instr *tex_z =
3300       nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
3301    tex_z->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
3302    nir_intrinsic_set_base(tex_z, 16);
3303    nir_intrinsic_set_range(tex_z, 4);
3304    tex_z->num_components = 1;
3305    nir_ssa_dest_init(&tex_z->instr, &tex_z->dest, 1, 32, "tex_z");
3306    nir_builder_instr_insert(b, &tex_z->instr);
3307 
3308    nir_intrinsic_instr *vertex_id =
3309       nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_vertex_id);
3310    nir_ssa_dest_init(&vertex_id->instr, &vertex_id->dest, 1, 32, "vertexid");
3311    nir_builder_instr_insert(b, &vertex_id->instr);
3312 
3313    /* vertex 0: src0_x, src0_y
3314     * vertex 1: src0_x, src1_y
3315     * vertex 2: src1_x, src0_y
3316     * vertex 3: src1_x, src1_y
3317     *
3318     * So:
3319     *
3320     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3321     * channel 1 is vertex id & 1 ? src1_y : src0_y
3322     */
3323 
3324    nir_ssa_def *one = nir_imm_int(b, 1);
3325    nir_ssa_def *c0cmp = nir_ilt(b, &vertex_id->dest.ssa, nir_imm_int(b, 2));
3326    nir_ssa_def *c1cmp = nir_ieq(b, nir_iand(b, &vertex_id->dest.ssa, one), one);
3327 
3328    nir_ssa_def *comp[4];
3329    comp[0] = nir_bcsel(b, c0cmp,
3330                        nir_channel(b, &tex_box->dest.ssa, 0),
3331                        nir_channel(b, &tex_box->dest.ssa, 2));
3332 
3333    comp[1] = nir_bcsel(b, c1cmp,
3334                        nir_channel(b, &tex_box->dest.ssa, 3),
3335                        nir_channel(b, &tex_box->dest.ssa, 1));
3336    comp[2] = &tex_z->dest.ssa;
3337    comp[3] = nir_imm_float(b, 1.0f);
3338    return nir_vec(b, comp, 4);
3339 }
3340 
3341 static nir_ssa_def *
3342 build_nir_tex_op_read(struct nir_builder *b,
3343                       nir_ssa_def *tex_pos,
3344                       enum glsl_base_type tex_type,
3345                       enum glsl_sampler_dim dim)
3346 {
3347    assert(dim != GLSL_SAMPLER_DIM_MS);
3348 
3349    const struct glsl_type *sampler_type =
3350       glsl_sampler_type(dim, false, false, tex_type);
3351    nir_variable *sampler =
3352       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3353    sampler->data.descriptor_set = 0;
3354    sampler->data.binding = 0;
3355 
3356    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3357    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3358    tex->sampler_dim = dim;
3359    tex->op = nir_texop_tex;
3360    tex->src[0].src_type = nir_tex_src_coord;
3361    tex->src[0].src = nir_src_for_ssa(tex_pos);
3362    tex->src[1].src_type = nir_tex_src_texture_deref;
3363    tex->src[1].src = nir_src_for_ssa(tex_deref);
3364    tex->src[2].src_type = nir_tex_src_sampler_deref;
3365    tex->src[2].src = nir_src_for_ssa(tex_deref);
3366    tex->dest_type =
3367       nir_alu_type_get_base_type(nir_get_nir_type_for_glsl_base_type(tex_type));
3368    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3369    tex->coord_components = tex_pos->num_components;
3370 
3371    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3372    nir_builder_instr_insert(b, &tex->instr);
3373    return &tex->dest.ssa;
3374 }
3375 
3376 static nir_ssa_def *
3377 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3378                                  nir_variable *sampler,
3379                                  nir_ssa_def *tex_deref,
3380                                  enum glsl_base_type tex_type,
3381                                  nir_ssa_def *tex_pos,
3382                                  nir_ssa_def *sample_idx)
3383 {
3384    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 4);
3385    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3386    tex->op = nir_texop_txf_ms;
3387    tex->src[0].src_type = nir_tex_src_coord;
3388    tex->src[0].src = nir_src_for_ssa(tex_pos);
3389    tex->src[1].src_type = nir_tex_src_texture_deref;
3390    tex->src[1].src = nir_src_for_ssa(tex_deref);
3391    tex->src[2].src_type = nir_tex_src_sampler_deref;
3392    tex->src[2].src = nir_src_for_ssa(tex_deref);
3393    tex->src[3].src_type = nir_tex_src_ms_index;
3394    tex->src[3].src = nir_src_for_ssa(sample_idx);
3395    tex->dest_type =
3396       nir_alu_type_get_base_type(nir_get_nir_type_for_glsl_base_type(tex_type));
3397    tex->is_array = false;
3398    tex->coord_components = tex_pos->num_components;
3399 
3400    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
3401    nir_builder_instr_insert(b, &tex->instr);
3402    return &tex->dest.ssa;
3403 }
3404 
3405 /* Fetches all samples at the given position and averages them */
3406 static nir_ssa_def *
3407 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3408                             nir_ssa_def *tex_pos,
3409                             enum glsl_base_type tex_type,
3410                             VkSampleCountFlagBits src_samples)
3411 {
3412    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3413    const struct glsl_type *sampler_type =
3414       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3415    nir_variable *sampler =
3416       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3417    sampler->data.descriptor_set = 0;
3418    sampler->data.binding = 0;
3419 
3420    const bool is_int = glsl_base_type_is_integer(tex_type);
3421 
3422    nir_ssa_def *tmp;
3423    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3424    for (uint32_t i = 0; i < src_samples; i++) {
3425       nir_ssa_def *s =
3426          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3427                                           tex_type, tex_pos,
3428                                           nir_imm_int(b, i));
3429 
3430       /* For integer formats, the multisample resolve operation is expected
3431        * to return one of the samples, so we just return the first one.
3432        */
3433       if (is_int)
3434          return s;
3435 
3436       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3437    }
3438 
3439    assert(!is_int);
3440    return nir_fmul(b, tmp, nir_imm_float(b, 1.0f / src_samples));
3441 }
3442 
3443 /* Fetches the current sample (gl_SampleID) at the given position */
3444 static nir_ssa_def *
3445 build_nir_tex_op_ms_read(struct nir_builder *b,
3446                          nir_ssa_def *tex_pos,
3447                          enum glsl_base_type tex_type)
3448 {
3449    const struct glsl_type *sampler_type =
3450       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3451    nir_variable *sampler =
3452       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3453    sampler->data.descriptor_set = 0;
3454    sampler->data.binding = 0;
3455 
3456    nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
3457 
3458    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3459                                            tex_type, tex_pos,
3460                                            nir_load_sample_id(b));
3461 }
3462 
3463 static nir_ssa_def *
3464 build_nir_tex_op(struct nir_builder *b,
3465                  struct v3dv_device *device,
3466                  nir_ssa_def *tex_pos,
3467                  enum glsl_base_type tex_type,
3468                  VkSampleCountFlagBits dst_samples,
3469                  VkSampleCountFlagBits src_samples,
3470                  enum glsl_sampler_dim dim)
3471 {
3472    switch (dim) {
3473    case GLSL_SAMPLER_DIM_MS:
3474       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3475       /* For multisampled texture sources we need to use fetching instead of
3476        * normalized texture coordinates. We already configured our blit
3477        * coordinates to be in texel units, but here we still need to convert
3478        * them from floating point to integer.
3479        */
3480       tex_pos = nir_f2i32(b, tex_pos);
3481 
3482       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3483          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3484       else
3485          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3486    default:
3487       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3488       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3489    }
3490 }
3491 
3492 static nir_shader *
3493 get_blit_vs()
3494 {
3495    nir_builder b;
3496    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3497    nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_VERTEX, options);
3498    b.shader->info.name = ralloc_strdup(b.shader, "meta blit vs");
3499 
3500    const struct glsl_type *vec4 = glsl_vec4_type();
3501 
3502    nir_variable *vs_out_pos =
3503       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3504    vs_out_pos->data.location = VARYING_SLOT_POS;
3505 
3506    nir_variable *vs_out_tex_coord =
3507       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3508    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3509    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3510 
3511    nir_ssa_def *pos = gen_rect_vertices(&b);
3512    nir_store_var(&b, vs_out_pos, pos, 0xf);
3513 
3514    nir_ssa_def *tex_coord = gen_tex_coords(&b);
3515    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3516 
3517    return b.shader;
3518 }
3519 
3520 static uint32_t
3521 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3522 {
3523    switch (sampler_dim) {
3524    case GLSL_SAMPLER_DIM_1D: return 0x1;
3525    case GLSL_SAMPLER_DIM_2D: return 0x3;
3526    case GLSL_SAMPLER_DIM_MS: return 0x3;
3527    case GLSL_SAMPLER_DIM_3D: return 0x7;
3528    default:
3529       unreachable("invalid sampler dim");
3530    };
3531 }
3532 
3533 static nir_shader *
3534 get_color_blit_fs(struct v3dv_device *device,
3535                   VkFormat dst_format,
3536                   VkFormat src_format,
3537                   VkSampleCountFlagBits dst_samples,
3538                   VkSampleCountFlagBits src_samples,
3539                   enum glsl_sampler_dim sampler_dim)
3540 {
3541    nir_builder b;
3542    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
3543    nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, options);
3544    b.shader->info.name = ralloc_strdup(b.shader, "meta blit fs");
3545 
3546    const struct glsl_type *vec4 = glsl_vec4_type();
3547 
3548    nir_variable *fs_in_tex_coord =
3549       nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3550    fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3551 
3552    const struct glsl_type *fs_out_type =
3553       vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3554       vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3555                                       glsl_vec4_type();
3556 
3557    enum glsl_base_type src_base_type =
3558       vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3559       vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3560                                       GLSL_TYPE_FLOAT;
3561 
3562    nir_variable *fs_out_color =
3563       nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3564    fs_out_color->data.location = FRAG_RESULT_DATA0;
3565 
3566    nir_ssa_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
3567    const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3568    tex_coord = nir_channels(&b, tex_coord, channel_mask);
3569 
3570    nir_ssa_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3571                                          dst_samples, src_samples, sampler_dim);
3572 
3573    /* For integer textures, if the bit-size of the destination is too small to
3574     * hold the source value, Vulkan (CTS) expects the implementation to clamp
3575     * to the maximum value the destination can hold. The hardware can clamp to
3576     * the render target type, which usually matches the component bit-size, but
3577     * there are some cases that won't match, such as rgb10a2, which has a
3578     * 16-bit render target type, so in these cases we need to clamp manually.
3579     */
3580    if (format_needs_software_int_clamp(dst_format)) {
3581       assert(vk_format_is_int(dst_format));
3582       enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3583       enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3584 
3585       nir_ssa_def *c[4];
3586       for (uint32_t i = 0; i < 4; i++) {
3587          c[i] = nir_channel(&b, color, i);
3588 
3589          const uint32_t src_bit_size =
3590             util_format_get_component_bits(src_pformat,
3591                                            UTIL_FORMAT_COLORSPACE_RGB,
3592                                            i);
3593          const uint32_t dst_bit_size =
3594             util_format_get_component_bits(dst_pformat,
3595                                            UTIL_FORMAT_COLORSPACE_RGB,
3596                                            i);
3597 
3598          if (dst_bit_size >= src_bit_size)
3599             continue;
3600 
3601          if (util_format_is_pure_uint(dst_pformat)) {
3602             nir_ssa_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3603             c[i] = nir_umin(&b, c[i], max);
3604          } else {
3605             nir_ssa_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3606             nir_ssa_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3607             c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3608          }
3609       }
3610 
3611       color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3612    }
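   /* E.g., when blitting R16G16B16A16_UINT to A2R10G10B10_UINT_PACK32 this
    * clamps the 16-bit source R/G/B values to 1023 (10 bits) and alpha to 3
    * (2 bits) with nir_umin before they are written out.
    */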
3613 
3614    nir_store_var(&b, fs_out_color, color, 0xf);
3615 
3616    return b.shader;
3617 }
3618 
3619 static bool
3620 create_pipeline(struct v3dv_device *device,
3621                 struct v3dv_render_pass *pass,
3622                 struct nir_shader *vs_nir,
3623                 struct nir_shader *fs_nir,
3624                 const VkPipelineVertexInputStateCreateInfo *vi_state,
3625                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
3626                 const VkPipelineColorBlendStateCreateInfo *cb_state,
3627                 const VkPipelineMultisampleStateCreateInfo *ms_state,
3628                 const VkPipelineLayout layout,
3629                 VkPipeline *pipeline)
3630 {
3631    struct v3dv_shader_module vs_m;
3632    struct v3dv_shader_module fs_m;
3633 
3634    v3dv_shader_module_internal_init(&vs_m, vs_nir);
3635    v3dv_shader_module_internal_init(&fs_m, fs_nir);
3636 
3637    VkPipelineShaderStageCreateInfo stages[2] = {
3638       {
3639          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3640          .stage = VK_SHADER_STAGE_VERTEX_BIT,
3641          .module = v3dv_shader_module_to_handle(&vs_m),
3642          .pName = "main",
3643       },
3644       {
3645          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3646          .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3647          .module = v3dv_shader_module_to_handle(&fs_m),
3648          .pName = "main",
3649       },
3650    };
3651 
3652    VkGraphicsPipelineCreateInfo info = {
3653       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3654 
3655       .stageCount = 2,
3656       .pStages = stages,
3657 
3658       .pVertexInputState = vi_state,
3659 
3660       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3661          .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3662          .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3663          .primitiveRestartEnable = false,
3664       },
3665 
3666       .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3667          .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
3668          .viewportCount = 1,
3669          .scissorCount = 1,
3670       },
3671 
3672       .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
3673          .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
3674          .rasterizerDiscardEnable = false,
3675          .polygonMode = VK_POLYGON_MODE_FILL,
3676          .cullMode = VK_CULL_MODE_NONE,
3677          .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
3678          .depthBiasEnable = false,
3679       },
3680 
3681       .pMultisampleState = ms_state,
3682 
3683       .pDepthStencilState = ds_state,
3684 
3685       .pColorBlendState = cb_state,
3686 
3687       /* The meta clear pipeline declares all state as dynamic.
3688        * As a consequence, vkCmdBindPipeline writes no dynamic state
3689        * to the cmd buffer. Therefore, at the end of the meta clear,
3690        * we need only restore dynamic state that was vkCmdSet.
3691        */
3692       .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
3693          .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
3694          .dynamicStateCount = 6,
3695          .pDynamicStates = (VkDynamicState[]) {
3696             VK_DYNAMIC_STATE_VIEWPORT,
3697             VK_DYNAMIC_STATE_SCISSOR,
3698             VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
3699             VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
3700             VK_DYNAMIC_STATE_STENCIL_REFERENCE,
3701             VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3702             VK_DYNAMIC_STATE_DEPTH_BIAS,
3703             VK_DYNAMIC_STATE_LINE_WIDTH,
3704          },
3705       },
3706 
3707       .flags = 0,
3708       .layout = layout,
3709       .renderPass = v3dv_render_pass_to_handle(pass),
3710       .subpass = 0,
3711    };
3712 
3713    VkResult result =
3714       v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
3715                                    VK_NULL_HANDLE,
3716                                    1, &info,
3717                                    &device->alloc,
3718                                    pipeline);
3719 
3720    ralloc_free(vs_nir);
3721    ralloc_free(fs_nir);
3722 
3723    return result == VK_SUCCESS;
3724 }
3725 
3726 static enum glsl_sampler_dim
3727 get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
3728 {
3729    /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
3730     *
3731     *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
3732     *    VK_IMAGE_TYPE_2D, ..."
3733     */
3734    assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
3735 
3736    switch (type) {
3737    case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
3738    case VK_IMAGE_TYPE_2D:
3739       return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
3740                                                     GLSL_SAMPLER_DIM_MS;
3741    case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
3742    default:
3743       unreachable("Invalid image type");
3744    }
3745 }
3746 
3747 static bool
3748 create_blit_pipeline(struct v3dv_device *device,
3749                      VkFormat dst_format,
3750                      VkFormat src_format,
3751                      VkColorComponentFlags cmask,
3752                      VkImageType src_type,
3753                      VkSampleCountFlagBits dst_samples,
3754                      VkSampleCountFlagBits src_samples,
3755                      VkRenderPass _pass,
3756                      VkPipelineLayout pipeline_layout,
3757                      VkPipeline *pipeline)
3758 {
3759    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
3760 
3761    /* We always rewrite depth/stencil blits to compatible color blits */
3762    assert(vk_format_is_color(dst_format));
3763    assert(vk_format_is_color(src_format));
3764 
3765    const enum glsl_sampler_dim sampler_dim =
3766       get_sampler_dim(src_type, src_samples);
3767 
3768    nir_shader *vs_nir = get_blit_vs();
3769    nir_shader *fs_nir =
3770       get_color_blit_fs(device, dst_format, src_format,
3771                         dst_samples, src_samples, sampler_dim);
3772 
3773    const VkPipelineVertexInputStateCreateInfo vi_state = {
3774       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
3775       .vertexBindingDescriptionCount = 0,
3776       .vertexAttributeDescriptionCount = 0,
3777    };
3778 
3779    VkPipelineDepthStencilStateCreateInfo ds_state = {
3780       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
3781    };
3782 
3783    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
3784    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
3785       .blendEnable = false,
3786       .colorWriteMask = cmask,
3787    };
3788 
3789    const VkPipelineColorBlendStateCreateInfo cb_state = {
3790       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
3791       .logicOpEnable = false,
3792       .attachmentCount = 1,
3793       .pAttachments = blend_att_state
3794    };
3795 
3796    const VkPipelineMultisampleStateCreateInfo ms_state = {
3797       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
3798       .rasterizationSamples = dst_samples,
3799       .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
3800       .pSampleMask = NULL,
3801       .alphaToCoverageEnable = false,
3802       .alphaToOneEnable = false,
3803    };
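   /* Note: reading gl_SampleID in the fragment shader (build_nir_tex_op_ms_read())
    * already forces per-sample shading; sampleShadingEnable is also set here
    * whenever the destination is multisampled so that each destination sample
    * gets its own shader invocation.
    */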
3804 
3805    return create_pipeline(device,
3806                           pass,
3807                           vs_nir, fs_nir,
3808                           &vi_state,
3809                           &ds_state,
3810                           &cb_state,
3811                           &ms_state,
3812                           pipeline_layout,
3813                           pipeline);
3814 }
3815 
3816 /**
3817  * Return a pipeline suitable for blitting the requested aspect given the
3818  * destination and source formats.
3819  */
3820 static bool
3821 get_blit_pipeline(struct v3dv_device *device,
3822                   VkFormat dst_format,
3823                   VkFormat src_format,
3824                   VkColorComponentFlags cmask,
3825                   VkImageType src_type,
3826                   VkSampleCountFlagBits dst_samples,
3827                   VkSampleCountFlagBits src_samples,
3828                   struct v3dv_meta_blit_pipeline **pipeline)
3829 {
3830    bool ok = true;
3831 
3832    mtx_lock(&device->meta.mtx);
3833    if (!device->meta.blit.playout) {
3834       ok = create_blit_pipeline_layout(device,
3835                                        &device->meta.blit.dslayout,
3836                                        &device->meta.blit.playout);
3837    }
3838    mtx_unlock(&device->meta.mtx);
3839    if (!ok)
3840       return false;
3841 
3842    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
3843    get_blit_pipeline_cache_key(dst_format, src_format, cmask,
3844                                dst_samples, src_samples, key);
3845    mtx_lock(&device->meta.mtx);
3846    struct hash_entry *entry =
3847       _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
3848    if (entry) {
3849       mtx_unlock(&device->meta.mtx);
3850       *pipeline = entry->data;
3851       return true;
3852    }
3853 
3854    *pipeline = vk_zalloc2(&device->alloc, NULL, sizeof(**pipeline), 8,
3855                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
3856 
3857    if (*pipeline == NULL)
3858       goto fail;
3859 
3860    ok = create_blit_render_pass(device, dst_format, src_format,
3861                                 &(*pipeline)->pass,
3862                                 &(*pipeline)->pass_no_load);
3863    if (!ok)
3864       goto fail;
3865 
3866    /* Create the pipeline using one of the render passes, they are both
3867     * compatible, so we don't care which one we use here.
3868     */
3869    ok = create_blit_pipeline(device,
3870                              dst_format,
3871                              src_format,
3872                              cmask,
3873                              src_type,
3874                              dst_samples,
3875                              src_samples,
3876                              (*pipeline)->pass,
3877                              device->meta.blit.playout,
3878                              &(*pipeline)->pipeline);
3879    if (!ok)
3880       goto fail;
3881 
3882    memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
3883    _mesa_hash_table_insert(device->meta.blit.cache[src_type],
3884                            &(*pipeline)->key, *pipeline);
3885 
3886    mtx_unlock(&device->meta.mtx);
3887    return true;
3888 
3889 fail:
3890    mtx_unlock(&device->meta.mtx);
3891 
3892    VkDevice _device = v3dv_device_to_handle(device);
3893    if (*pipeline) {
3894       if ((*pipeline)->pass)
3895          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->alloc);
3896       if ((*pipeline)->pass_no_load)
3897          v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->alloc);
3898       if ((*pipeline)->pipeline)
3899          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->alloc);
3900       vk_free(&device->alloc, *pipeline);
3901       *pipeline = NULL;
3902    }
3903 
3904    return false;
3905 }
3906 
3907 static void
3908 compute_blit_box(const VkOffset3D *offsets,
3909                  uint32_t image_w, uint32_t image_h,
3910                  uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
3911                  bool *mirror_x, bool *mirror_y)
3912 {
3913    if (offsets[1].x >= offsets[0].x) {
3914       *mirror_x = false;
3915       *x = MIN2(offsets[0].x, image_w - 1);
3916       *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
3917    } else {
3918       *mirror_x = true;
3919       *x = MIN2(offsets[1].x, image_w - 1);
3920       *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
3921    }
3922    if (offsets[1].y >= offsets[0].y) {
3923       *mirror_y = false;
3924       *y = MIN2(offsets[0].y, image_h - 1);
3925       *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
3926    } else {
3927       *mirror_y = true;
3928       *y = MIN2(offsets[1].y, image_h - 1);
3929       *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
3930    }
3931 }

static void
compute_blit_3d_layers(const VkOffset3D *offsets,
                       uint32_t *min_layer, uint32_t *max_layer,
                       bool *mirror_z)
{
   if (offsets[1].z >= offsets[0].z) {
      *mirror_z = false;
      *min_layer = offsets[0].z;
      *max_layer = offsets[1].z;
   } else {
      *mirror_z = true;
      *min_layer = offsets[1].z;
      *max_layer = offsets[0].z;
   }
}

static VkResult
create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* If this is not the first pool we create for this command buffer,
    * size it based on the size of the currently exhausted pool.
    */
   uint32_t descriptor_count = 64;
   if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
      struct v3dv_descriptor_pool *exhausted_pool =
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
   }
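
   /* Pools thus grow geometrically (64, 128, 256, ...) and are capped at
    * 1024 sets, which amortizes pool creation for blit-heavy command
    * buffers while bounding the size of any single pool.
    */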

   /* Create the descriptor pool */
   cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
      .descriptorCount = descriptor_count,
   };
   VkDescriptorPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .maxSets = descriptor_count,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
      .flags = 0,
   };
   VkResult result =
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
                                &info,
                                &cmd_buffer->device->alloc,
                                &cmd_buffer->meta.blit.dspool);

   if (result == VK_SUCCESS) {
      assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)cmd_buffer->meta.blit.dspool,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
   }

   return result;
}

static VkResult
allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
                                    VkDescriptorSet *set)
{
   /* Make sure we have a descriptor pool */
   VkResult result;
   if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
      result = create_blit_descriptor_pool(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }
   assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);

   /* Allocate descriptor set */
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);
   VkDescriptorSetAllocateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
      .descriptorPool = cmd_buffer->meta.blit.dspool,
      .descriptorSetCount = 1,
      .pSetLayouts = &device->meta.blit.dslayout,
   };
   result = v3dv_AllocateDescriptorSets(_device, &info, set);

   /* If we ran out of pool space, grow the pool and try again */
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
      result = create_blit_descriptor_pool(cmd_buffer);
      if (result == VK_SUCCESS) {
         info.descriptorPool = cmd_buffer->meta.blit.dspool;
         result = v3dv_AllocateDescriptorSets(_device, &info, set);
      }
   }

   return result;
}

/**
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 *
 * The caller can specify the channels of the destination to be written via
 * the cmask parameter (which can be 0 to default to all channels), as well
 * as a swizzle to apply to the source via the cswizzle parameter (which can
 * be NULL to use the default identity swizzle).
 */
static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit *_region,
            VkFilter filter,
            bool dst_is_padded_image)
{
   bool handled = true;

   /* We don't support rendering to linear depth/stencil; this should have
    * been rewritten to a compatible color blit by the caller.
    */
   assert(dst->tiling != VK_IMAGE_TILING_LINEAR ||
          !vk_format_is_depth_or_stencil(dst_format));

   /* Can't sample from linear images */
   if (src->tiling == VK_IMAGE_TILING_LINEAR && src->type != VK_IMAGE_TYPE_1D)
      return false;

   VkImageBlit region = *_region;
   /* Rewrite combined D/S blits to compatible color blits */
   if (vk_format_is_depth_or_stencil(dst_format)) {
      assert(src_format == dst_format);
      assert(cmask == 0);
      switch (dst_format) {
      case VK_FORMAT_D16_UNORM:
         dst_format = VK_FORMAT_R16_UINT;
         break;
      case VK_FORMAT_D32_SFLOAT:
         dst_format = VK_FORMAT_R32_UINT;
         break;
      case VK_FORMAT_X8_D24_UNORM_PACK32:
      case VK_FORMAT_D24_UNORM_S8_UINT:
         if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
            cmask |= VK_COLOR_COMPONENT_G_BIT |
                     VK_COLOR_COMPONENT_B_BIT |
                     VK_COLOR_COMPONENT_A_BIT;
         }
         if (region.srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
            assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
            cmask |= VK_COLOR_COMPONENT_R_BIT;
         }
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      default:
         unreachable("Unsupported depth/stencil format");
      }
      src_format = dst_format;
      region.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
      region.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
   }
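
   /* With the packed D24S8 layout the 8 stencil bits alias the R channel of
    * the RGBA8UI view and the 24 depth bits alias G/B/A, which is why the
    * aspects map to those color write-mask bits above.
    */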

   const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
                                            VK_COLOR_COMPONENT_G_BIT |
                                            VK_COLOR_COMPONENT_B_BIT |
                                            VK_COLOR_COMPONENT_A_BIT;
   if (cmask == 0)
      cmask = full_cmask;

   VkComponentMapping ident_swizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   if (!cswizzle)
      cswizzle = &ident_swizzle;

   /* When we get here from a copy between compressed / uncompressed images
    * we choose to specify the destination blit region based on the size
    * semantics of the source image of the copy (see copy_image_blit), so we
    * need to apply those same semantics here when we compute the size of the
    * destination image level.
    */
   const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format);
   const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format);
   const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format);
   const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format);
   const uint32_t dst_level_w =
      u_minify(DIV_ROUND_UP(dst->extent.width * src_block_w, dst_block_w),
               region.dstSubresource.mipLevel);
   const uint32_t dst_level_h =
      u_minify(DIV_ROUND_UP(dst->extent.height * src_block_h, dst_block_h),
               region.dstSubresource.mipLevel);
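
   /* Illustration only: copying from a 4x4-block compressed source into an
    * uncompressed destination gives src_block_w/h = 4 and dst_block_w/h = 1,
    * so the destination level size is scaled by 4 on each axis to match the
    * source's texel-size semantics.
    */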

   const uint32_t src_level_w =
      u_minify(src->extent.width, region.srcSubresource.mipLevel);
   const uint32_t src_level_h =
      u_minify(src->extent.height, region.srcSubresource.mipLevel);
   const uint32_t src_level_d =
      u_minify(src->extent.depth, region.srcSubresource.mipLevel);

   uint32_t dst_x, dst_y, dst_w, dst_h;
   bool dst_mirror_x, dst_mirror_y;
   compute_blit_box(region.dstOffsets,
                    dst_level_w, dst_level_h,
                    &dst_x, &dst_y, &dst_w, &dst_h,
                    &dst_mirror_x, &dst_mirror_y);

   uint32_t src_x, src_y, src_w, src_h;
   bool src_mirror_x, src_mirror_y;
   compute_blit_box(region.srcOffsets,
                    src_level_w, src_level_h,
                    &src_x, &src_y, &src_w, &src_h,
                    &src_mirror_x, &src_mirror_y);

   uint32_t min_dst_layer;
   uint32_t max_dst_layer;
   /* Initialize the Z mirror flags so the mirror_z computation below is
    * well-defined for non-3D images too.
    */
   bool dst_mirror_z = false;
   if (dst->type != VK_IMAGE_TYPE_3D) {
      min_dst_layer = region.dstSubresource.baseArrayLayer;
      max_dst_layer = min_dst_layer + region.dstSubresource.layerCount;
   } else {
      compute_blit_3d_layers(region.dstOffsets,
                             &min_dst_layer, &max_dst_layer,
                             &dst_mirror_z);
   }

   uint32_t min_src_layer;
   uint32_t max_src_layer;
   bool src_mirror_z = false;
   if (src->type != VK_IMAGE_TYPE_3D) {
      min_src_layer = region.srcSubresource.baseArrayLayer;
      max_src_layer = min_src_layer + region.srcSubresource.layerCount;
   } else {
      compute_blit_3d_layers(region.srcOffsets,
                             &min_src_layer, &max_src_layer,
                             &src_mirror_z);
   }

   uint32_t layer_count = max_dst_layer - min_dst_layer;

   /* Translate source blit coordinates to normalized texture coordinates for
    * single sampled textures. For multisampled textures we require
    * unnormalized coordinates, since we can only do texelFetch on them.
    */
   float coords[4] = {
      (float)src_x,
      (float)src_y,
      (float)(src_x + src_w),
      (float)(src_y + src_h),
   };

   if (src->samples == VK_SAMPLE_COUNT_1_BIT) {
      coords[0] /= (float)src_level_w;
      coords[1] /= (float)src_level_h;
      coords[2] /= (float)src_level_w;
      coords[3] /= (float)src_level_h;
   }
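
   /* Illustration only: blitting from the right half of a 256-pixel wide
    * single-sampled level gives src_x = 128 and src_w = 128, which the
    * division above maps to the 0.5..1.0 texture coordinate range.
    */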

   /* Handle mirroring */
   const bool mirror_x = dst_mirror_x != src_mirror_x;
   const bool mirror_y = dst_mirror_y != src_mirror_y;
   const bool mirror_z = dst_mirror_z != src_mirror_z;
   float tex_coords[5] = {
      !mirror_x ? coords[0] : coords[2],
      !mirror_y ? coords[1] : coords[3],
      !mirror_x ? coords[2] : coords[0],
      !mirror_y ? coords[3] : coords[1],
      /* Z coordinate for 3D blit sources, to be filled for each
       * destination layer
       */
      0.0f
   };

   /* For blits from 3D images we also need to compute the slice coordinate
    * to sample from, which will change for each layer in the destination.
    * Compute the step to advance by on each iteration.
    */
   const float src_z_step =
      (float)(max_src_layer - min_src_layer) / (float)layer_count;

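   /* Illustration only: blitting a 16-slice 3D source into 4 destination
    * layers gives src_z_step = 4; each layer then samples the middle of its
    * step, i.e. slices 2, 6, 10 and 14, normalized by the level depth.
    */
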
   /* Get the blit pipeline */
   struct v3dv_meta_blit_pipeline *pipeline = NULL;
   bool ok = get_blit_pipeline(cmd_buffer->device,
                               dst_format, src_format, cmask, src->type,
                               dst->samples, src->samples,
                               &pipeline);
   if (!ok)
      return handled;
   assert(pipeline && pipeline->pipeline &&
          pipeline->pass && pipeline->pass_no_load);

   struct v3dv_device *device = cmd_buffer->device;
   assert(device->meta.blit.dslayout);

   /* Push command buffer state before starting meta operation */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);

   /* Setup framebuffer */
   VkDevice _device = v3dv_device_to_handle(device);
   VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   VkResult result;
   uint32_t dirty_dynamic_state = 0;
   VkImageAspectFlags aspects = region.dstSubresource.aspectMask;
   for (uint32_t i = 0; i < layer_count; i++) {
      VkImageViewCreateInfo dst_image_view_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
         .image = v3dv_image_to_handle(dst),
         .viewType = v3dv_image_type_to_view_type(dst->type),
         .format = dst_format,
         .subresourceRange = {
            .aspectMask = aspects,
            .baseMipLevel = region.dstSubresource.mipLevel,
            .levelCount = 1,
            .baseArrayLayer = min_dst_layer + i,
            .layerCount = 1
         },
      };
      VkImageView dst_image_view;
      result = v3dv_CreateImageView(_device, &dst_image_view_info,
                                    &device->alloc, &dst_image_view);
      if (result != VK_SUCCESS)
         goto fail;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)dst_image_view,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);

      VkFramebufferCreateInfo fb_info = {
         .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
         .renderPass = pipeline->pass,
         .attachmentCount = 1,
         .pAttachments = &dst_image_view,
         .width = dst_x + dst_w,
         .height = dst_y + dst_h,
         .layers = 1,
      };

      VkFramebuffer fb;
      result = v3dv_CreateFramebuffer(_device, &fb_info,
                                      &cmd_buffer->device->alloc, &fb);
      if (result != VK_SUCCESS)
         goto fail;

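      /* Enable TLB edge padding only when the framebuffer covers the full
       * mip level of a padded destination image; for a partial blit, padded
       * stores could otherwise write over valid texels outside the blit box.
       */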
      struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
      framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
                                      fb_info.height == dst_level_h &&
                                      dst_is_padded_image;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)fb,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);

      /* Setup descriptor set for blit source texture. We don't have to
       * register the descriptor as a private command buffer object since
       * all descriptors will be freed automatically with the descriptor
       * pool.
       */
      VkDescriptorSet set;
      result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
      if (result != VK_SUCCESS)
         goto fail;

      VkSamplerCreateInfo sampler_info = {
         .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
         .magFilter = filter,
         .minFilter = filter,
         .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
         .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
         .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
         .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
      };
      VkSampler sampler;
      result = v3dv_CreateSampler(_device, &sampler_info, &device->alloc,
                                  &sampler);
      if (result != VK_SUCCESS)
         goto fail;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)sampler,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);

      VkImageViewCreateInfo src_image_view_info = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
         .image = v3dv_image_to_handle(src),
         .viewType = v3dv_image_type_to_view_type(src->type),
         .format = src_format,
         .components = *cswizzle,
         .subresourceRange = {
            .aspectMask = aspects,
            .baseMipLevel = region.srcSubresource.mipLevel,
            .levelCount = 1,
            .baseArrayLayer =
               src->type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
            .layerCount = 1
         },
      };
      VkImageView src_image_view;
      result = v3dv_CreateImageView(_device, &src_image_view_info,
                                    &device->alloc, &src_image_view);
      if (result != VK_SUCCESS)
         goto fail;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t)src_image_view,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);

      VkDescriptorImageInfo image_info = {
         .sampler = sampler,
         .imageView = src_image_view,
         .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      };
      VkWriteDescriptorSet write = {
         .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
         .dstSet = set,
         .dstBinding = 0,
         .dstArrayElement = 0,
         .descriptorCount = 1,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .pImageInfo = &image_info,
      };
      v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);

      /* If the region we are about to blit is tile-aligned, then we can
       * use the render pass version that won't pre-load the tile buffer
       * with the dst image contents before the blit. The exception is when we
       * don't have a full color mask, since in that case we need to preserve
       * the original value of some of the color components.
       */
      const VkRect2D render_area = {
         .offset = { dst_x, dst_y },
         .extent = { dst_w, dst_h },
      };
      struct v3dv_render_pass *pipeline_pass =
         v3dv_render_pass_from_handle(pipeline->pass);
      bool can_skip_tlb_load =
         cmask == full_cmask &&
         v3dv_subpass_area_is_tile_aligned(&render_area, framebuffer,
                                           pipeline_pass, 0);

      /* Record blit */
      VkRenderPassBeginInfo rp_info = {
         .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
         .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
                                           pipeline->pass,
         .framebuffer = fb,
         .renderArea = render_area,
         .clearValueCount = 0,
      };

      v3dv_CmdBeginRenderPass(_cmd_buffer, &rp_info, VK_SUBPASS_CONTENTS_INLINE);
      struct v3dv_job *job = cmd_buffer->state.job;
      if (!job)
         goto fail;

      /* For 3D blits we need to compute the source slice to blit from (the Z
       * coordinate of the source sample operation). We want to choose this
       * based on the ratio of the depth of the source and the destination
       * images, picking the coordinate in the middle of each step.
       */
      if (src->type == VK_IMAGE_TYPE_3D) {
         tex_coords[4] =
            !mirror_z ?
            (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
            (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
      }

      v3dv_CmdPushConstants(_cmd_buffer,
                            device->meta.blit.playout,
                            VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
                            &tex_coords);

      v3dv_CmdBindPipeline(_cmd_buffer,
                           VK_PIPELINE_BIND_POINT_GRAPHICS,
                           pipeline->pipeline);

      v3dv_CmdBindDescriptorSets(_cmd_buffer,
                                 VK_PIPELINE_BIND_POINT_GRAPHICS,
                                 device->meta.blit.playout,
                                 0, 1, &set,
                                 0, NULL);

      const VkViewport viewport = {
         .x = dst_x,
         .y = dst_y,
         .width = dst_w,
         .height = dst_h,
         .minDepth = 0.0f,
         .maxDepth = 1.0f
      };
      v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
      const VkRect2D scissor = {
         .offset = { dst_x, dst_y },
         .extent = { dst_w, dst_h }
      };
      v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);

      v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);

      v3dv_CmdEndRenderPass(_cmd_buffer);
      dirty_dynamic_state = V3DV_CMD_DIRTY_VIEWPORT | V3DV_CMD_DIRTY_SCISSOR;
   }

fail:
   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, dirty_dynamic_state, true);

   return handled;
}

void
v3dv_CmdBlitImage(VkCommandBuffer commandBuffer,
                  VkImage srcImage,
                  VkImageLayout srcImageLayout,
                  VkImage dstImage,
                  VkImageLayout dstImageLayout,
                  uint32_t regionCount,
                  const VkImageBlit *pRegions,
                  VkFilter filter)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, dstImage);

   /* This command can only happen outside a render pass */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
   assert(dst->samples == VK_SAMPLE_COUNT_1_BIT &&
          src->samples == VK_SAMPLE_COUNT_1_BIT);

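   /* Try the TFU (texture formatting unit) path first: it is faster, but
    * only handles a subset of blits; anything else falls back to the
    * shader-based path.
    */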
   for (uint32_t i = 0; i < regionCount; i++) {
      if (blit_tfu(cmd_buffer, dst, src, &pRegions[i], filter))
         continue;
      if (blit_shader(cmd_buffer,
                      dst, dst->vk_format,
                      src, src->vk_format,
                      0, NULL,
                      &pRegions[i], filter, true)) {
         continue;
      }
      unreachable("Unsupported blit operation");
   }
}

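/* Emits the per-tile list for one layer of a TLB image resolve: the
 * multisampled source is loaded into the tile buffer and stored to the
 * single-sampled destination; the trailing 'true' passed to
 * emit_image_store requests the resolved (sample-averaged) store.
 */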
static void
emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
                                       struct framebuffer_data *framebuffer,
                                       struct v3dv_image *dst,
                                       struct v3dv_image *src,
                                       uint32_t layer_offset,
                                       const VkImageResolve *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->srcSubresource.layerCount) ||
          layer_offset < src->extent.depth);

   const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->type != VK_IMAGE_TYPE_3D &&
           layer_offset < region->dstSubresource.layerCount) ||
          layer_offset < dst->extent.depth);

   const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   emit_image_store(cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, true);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_resolve_image_layer(struct v3dv_job *job,
                         struct v3dv_image *dst,
                         struct v3dv_image *src,
                         struct framebuffer_data *framebuffer,
                         uint32_t layer,
                         const VkImageResolve *region)
{
   emit_frame_setup(job, layer, NULL);
   emit_resolve_image_layer_per_tile_list(job, framebuffer,
                                          dst, src, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}

static void
emit_resolve_image_rcl(struct v3dv_job *job,
                       struct v3dv_image *dst,
                       struct v3dv_image *src,
                       struct framebuffer_data *framebuffer,
                       const VkImageResolve *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   v3dv_return_if_oom(NULL, job);

   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_resolve_image_layer(job, dst, src, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}

static bool
resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                  struct v3dv_image *dst,
                  struct v3dv_image *src,
                  const VkImageResolve *region)
{
   if (!can_use_tlb(src, &region->srcOffset, NULL) ||
       !can_use_tlb(dst, &region->dstOffset, NULL)) {
      return false;
   }

   if (!v3dv_format_supports_tlb_resolve(src->format))
      return false;

   const VkFormat fb_format = src->vk_format;

   uint32_t num_layers;
   if (dst->type != VK_IMAGE_TYPE_3D)
      num_layers = region->dstSubresource.layerCount;
   else
      num_layers = region->extent.depth;
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
   if (!job)
      return true;

   const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format);
   const uint32_t block_h = vk_format_get_blockheight(dst->vk_format);
   const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   uint32_t internal_type, internal_bpp;
   get_internal_type_bpp_for_image_aspects(fb_format,
                                           region->srcSubresource.aspectMask,
                                           &internal_type, &internal_bpp);

   v3dv_job_start_frame(job, width, height, num_layers, 1, internal_bpp, true);

   struct framebuffer_data framebuffer;
   setup_framebuffer_data(&framebuffer, fb_format, internal_type,
                          &job->frame_tiling);

   v3dv_job_emit_binning_flush(job);
   emit_resolve_image_rcl(job, dst, src, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);
   return true;
}

static bool
resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
                   struct v3dv_image *dst,
                   struct v3dv_image *src,
                   const VkImageResolve *region)
{
   const VkImageBlit blit_region = {
      .srcSubresource = region->srcSubresource,
      .srcOffsets = {
         region->srcOffset,
         {
            region->srcOffset.x + region->extent.width,
            region->srcOffset.y + region->extent.height,
         }
      },
      .dstSubresource = region->dstSubresource,
      .dstOffsets = {
         region->dstOffset,
         {
            region->dstOffset.x + region->extent.width,
            region->dstOffset.y + region->extent.height,
         }
      },
   };
   return blit_shader(cmd_buffer,
                      dst, dst->vk_format,
                      src, src->vk_format,
                      0, NULL,
                      &blit_region, VK_FILTER_NEAREST, true);
}

void
v3dv_CmdResolveImage(VkCommandBuffer commandBuffer,
                     VkImage srcImage,
                     VkImageLayout srcImageLayout,
                     VkImage dstImage,
                     VkImageLayout dstImageLayout,
                     uint32_t regionCount,
                     const VkImageResolve *pRegions)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, src, srcImage);
   V3DV_FROM_HANDLE(v3dv_image, dst, dstImage);

   /* This command can only happen outside a render pass */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   assert(src->samples == VK_SAMPLE_COUNT_4_BIT);
   assert(dst->samples == VK_SAMPLE_COUNT_1_BIT);

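   /* Prefer the direct TLB resolve; when the format or region rules it out,
    * fall back to a shader blit with nearest filtering (v3dv only advertises
    * 4x MSAA, hence the sample-count asserts above).
    */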
   for (uint32_t i = 0; i < regionCount; i++) {
      if (resolve_image_tlb(cmd_buffer, dst, src, &pRegions[i]))
         continue;
      if (resolve_image_blit(cmd_buffer, dst, src, &pRegions[i]))
         continue;
      unreachable("Unsupported multisample resolve operation");
   }
}