• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2024 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_nir_meta.h"
8 #include "ac_nir_helpers.h"
9 #include "ac_surface.h"
10 #include "nir_format_convert.h"
11 #include "compiler/aco_interface.h"
12 #include "util/format_srgb.h"
13 #include "util/u_pack_color.h"
14 
15 static nir_def *
deref_ssa(nir_builder * b,nir_variable * var)16 deref_ssa(nir_builder *b, nir_variable *var)
17 {
18    return &nir_build_deref_var(b, var)->def;
19 }
20 
21 /* unpack_2x16_signed(src, x, y): x = (int32_t)((uint16_t)src); y = src >> 16; */
22 static void
unpack_2x16_signed(nir_builder * b,unsigned bit_size,nir_def * src,nir_def ** x,nir_def ** y)23 unpack_2x16_signed(nir_builder *b, unsigned bit_size, nir_def *src, nir_def **x, nir_def **y)
24 {
25    assert(bit_size == 32 || bit_size == 16);
26    *x = nir_unpack_32_2x16_split_x(b, src);
27    *y = nir_unpack_32_2x16_split_y(b, src);
28 
29    if (bit_size == 32) {
30       *x = nir_i2i32(b, *x);
31       *y = nir_i2i32(b, *y);
32    }
33 }
34 
35 static nir_def *
convert_linear_to_srgb(nir_builder * b,nir_def * input)36 convert_linear_to_srgb(nir_builder *b, nir_def *input)
37 {
38    /* There are small precision differences compared to CB, so the gfx blit will return slightly
39     * different results.
40     */
41    for (unsigned i = 0; i < MIN2(3, input->num_components); i++) {
42       input = nir_vector_insert_imm(b, input,
43                                     nir_format_linear_to_srgb(b, nir_channel(b, input, i)), i);
44    }
45 
46    return input;
47 }
48 
49 static nir_def *
apply_blit_output_modifiers(nir_builder * b,nir_def * color,const union ac_cs_blit_key * key)50 apply_blit_output_modifiers(nir_builder *b, nir_def *color,
51                             const union ac_cs_blit_key *key)
52 {
53    unsigned bit_size = color->bit_size;
54    nir_def *zero = nir_imm_intN_t(b, 0, bit_size);
55 
56    if (key->sint_to_uint)
57       color = nir_imax(b, color, zero);
58 
59    if (key->uint_to_sint) {
60       color = nir_umin(b, color,
61                        nir_imm_intN_t(b, bit_size == 16 ? INT16_MAX : INT32_MAX,
62                                       bit_size));
63    }
64 
65    if (key->dst_is_srgb)
66       color = convert_linear_to_srgb(b, color);
67 
68    nir_def *one = key->use_integer_one ? nir_imm_intN_t(b, 1, bit_size) :
69                                              nir_imm_floatN_t(b, 1, bit_size);
70 
71    if (key->is_clear) {
72       if (key->last_dst_channel < 3)
73          color = nir_trim_vector(b, color, key->last_dst_channel + 1);
74    } else {
75       assert(key->last_src_channel <= key->last_dst_channel);
76       assert(color->num_components == key->last_src_channel + 1);
77 
78       /* Set channels not present in src to 0 or 1. */
79       if (key->last_src_channel < key->last_dst_channel) {
80          color = nir_pad_vector(b, color, key->last_dst_channel + 1);
81 
82          for (unsigned chan = key->last_src_channel + 1; chan <= key->last_dst_channel; chan++)
83             color = nir_vector_insert_imm(b, color, chan == 3 ? one : zero, chan);
84       }
85 
86       /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
87       if (key->last_dst_channel < key->last_src_channel)
88          color = nir_trim_vector(b, color, key->last_dst_channel + 1);
89    }
90 
91    /* Discard channels not present in dst. The hardware fills unstored channels with 0. */
92    if (key->last_dst_channel < 3)
93       color = nir_trim_vector(b, color, key->last_dst_channel + 1);
94 
95    return color;
96 }
97 
98 /* The compute blit shader.
99  *
100  * Implementation details:
101  * - Out-of-bounds dst coordinates are not clamped at all. The hw drops
102  *   out-of-bounds stores for us.
103  * - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
104  *   the image_size NIR intrinsic.
105  * - X/Y flipping just does this in the shader: -threadIDs - 1, assuming the starting coordinates
106  *   are 1 pixel after the bottom-right corner, e.g. x + width, matching the gallium behavior.
107  * - This list doesn't do it justice.
108  */
109 nir_shader *
ac_create_blit_cs(const struct ac_cs_blit_options * options,const union ac_cs_blit_key * key)110 ac_create_blit_cs(const struct ac_cs_blit_options *options, const union ac_cs_blit_key *key)
111 {
112    if (options->print_key) {
113       fprintf(stderr, "Internal shader: compute_blit\n");
114       fprintf(stderr, "   key.use_aco = %u\n", key->use_aco);
115       fprintf(stderr, "   key.wg_dim = %u\n", key->wg_dim);
116       fprintf(stderr, "   key.has_start_xyz = %u\n", key->has_start_xyz);
117       fprintf(stderr, "   key.log_lane_width = %u\n", key->log_lane_width);
118       fprintf(stderr, "   key.log_lane_height = %u\n", key->log_lane_height);
119       fprintf(stderr, "   key.log_lane_depth = %u\n", key->log_lane_depth);
120       fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
121       fprintf(stderr, "   key.src_is_1d = %u\n", key->src_is_1d);
122       fprintf(stderr, "   key.dst_is_1d = %u\n", key->dst_is_1d);
123       fprintf(stderr, "   key.src_is_msaa = %u\n", key->src_is_msaa);
124       fprintf(stderr, "   key.dst_is_msaa = %u\n", key->dst_is_msaa);
125       fprintf(stderr, "   key.src_has_z = %u\n", key->src_has_z);
126       fprintf(stderr, "   key.dst_has_z = %u\n", key->dst_has_z);
127       fprintf(stderr, "   key.a16 = %u\n", key->a16);
128       fprintf(stderr, "   key.d16 = %u\n", key->d16);
129       fprintf(stderr, "   key.log_samples = %u\n", key->log_samples);
130       fprintf(stderr, "   key.sample0_only = %u\n", key->sample0_only);
131       fprintf(stderr, "   key.x_clamp_to_edge = %u\n", key->x_clamp_to_edge);
132       fprintf(stderr, "   key.y_clamp_to_edge = %u\n", key->y_clamp_to_edge);
133       fprintf(stderr, "   key.flip_x = %u\n", key->flip_x);
134       fprintf(stderr, "   key.flip_y = %u\n", key->flip_y);
135       fprintf(stderr, "   key.sint_to_uint = %u\n", key->sint_to_uint);
136       fprintf(stderr, "   key.uint_to_sint = %u\n", key->uint_to_sint);
137       fprintf(stderr, "   key.dst_is_srgb = %u\n", key->dst_is_srgb);
138       fprintf(stderr, "   key.use_integer_one = %u\n", key->use_integer_one);
139       fprintf(stderr, "   key.last_src_channel = %u\n", key->last_src_channel);
140       fprintf(stderr, "   key.last_dst_channel = %u\n", key->last_dst_channel);
141       fprintf(stderr, "\n");
142    }
143 
144    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options->nir_options,
145                                                   "blit_non_scaled_cs");
146    b.shader->info.use_aco_amd = options->use_aco ||
147                                 (key->use_aco && aco_is_gpu_supported(options->info));
148    b.shader->info.num_images = key->is_clear ? 1 : 2;
149    unsigned image_dst_index = b.shader->info.num_images - 1;
150    if (!key->is_clear && key->src_is_msaa)
151       BITSET_SET(b.shader->info.msaa_images, 0);
152    if (key->dst_is_msaa)
153       BITSET_SET(b.shader->info.msaa_images, image_dst_index);
154    /* The workgroup size varies depending on the tiling layout and blit dimensions. */
155    b.shader->info.workgroup_size_variable = true;
156    b.shader->info.cs.user_data_components_amd =
157       key->is_clear ? (key->d16 ? 6 : 8) : key->has_start_xyz ? 4 : 3;
158 
159    const struct glsl_type *img_type[2] = {
160       glsl_image_type(key->src_is_1d ? GLSL_SAMPLER_DIM_1D :
161                       key->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
162                       key->src_has_z, GLSL_TYPE_FLOAT),
163       glsl_image_type(key->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
164                       key->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
165                       key->dst_has_z, GLSL_TYPE_FLOAT),
166    };
167 
168    nir_variable *img_src = NULL;
169    if (!key->is_clear) {
170       img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
171       img_src->data.binding = 0;
172    }
173 
174    nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
175    img_dst->data.binding = image_dst_index;
176 
177    unsigned lane_width = 1 << key->log_lane_width;
178    unsigned lane_height = 1 << key->log_lane_height;
179    unsigned lane_depth = 1 << key->log_lane_depth;
180    unsigned lane_size = lane_width * lane_height * lane_depth;
181    assert(lane_size <= SI_MAX_COMPUTE_BLIT_LANE_SIZE);
182 
183    nir_def *zero_lod = nir_imm_intN_t(&b, 0, key->a16 ? 16 : 32);
184 
185    /* Instructions. */
186    /* Let's work with 0-based src and dst coordinates (thread IDs) first. */
187    unsigned coord_bit_size = key->a16 ? 16 : 32;
188    nir_def *dst_xyz = ac_get_global_ids(&b, key->wg_dim, coord_bit_size);
189    dst_xyz = nir_pad_vector_imm_int(&b, dst_xyz, 0, 3);
190 
191    /* If the blit area is unaligned, we launched extra threads to make it aligned.
192     * Skip those threads here.
193     */
194    nir_if *if_positive = NULL;
195    if (key->has_start_xyz) {
196       nir_def *start_xyz = nir_channel(&b, nir_load_user_data_amd(&b), 3);
197       start_xyz = nir_u2uN(&b, nir_unpack_32_4x8(&b, start_xyz), coord_bit_size);
198       start_xyz = nir_trim_vector(&b, start_xyz, 3);
199 
200       dst_xyz = nir_isub(&b, dst_xyz, start_xyz);
201       nir_def *is_positive_xyz = nir_ige_imm(&b, dst_xyz, 0);
202       nir_def *is_positive = nir_iand(&b, nir_channel(&b, is_positive_xyz, 0),
203                                       nir_iand(&b, nir_channel(&b, is_positive_xyz, 1),
204                                                nir_channel(&b, is_positive_xyz, 2)));
205       if_positive = nir_push_if(&b, is_positive);
206    }
207 
208    dst_xyz = nir_imul(&b, dst_xyz, nir_imm_ivec3_intN(&b, lane_width, lane_height, lane_depth,
209                                                       coord_bit_size));
210    nir_def *src_xyz = dst_xyz;
211 
212    /* Flip src coordinates. */
213    for (unsigned i = 0; i < 2; i++) {
214       if (i ? key->flip_y : key->flip_x) {
215          /* A normal blit loads from (box.x + tid.x) where tid.x = 0..(width - 1).
216           *
217           * A flipped blit sets box.x = width, so we should make tid.x negative to load from
218           * (width - 1)..0.
219           *
220           * Therefore do: x = -x - 1, which becomes (width - 1) to 0 after we add box.x = width.
221           */
222          nir_def *comp = nir_channel(&b, src_xyz, i);
223          comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -(int)(i ? lane_height : lane_width));
224          src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
225       }
226    }
227 
228    /* Add box.xyz. */
229    nir_def *base_coord_src = NULL, *base_coord_dst = NULL;
230    unpack_2x16_signed(&b, coord_bit_size, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
231                       &base_coord_src, &base_coord_dst);
232    base_coord_dst = nir_iadd(&b, base_coord_dst, dst_xyz);
233    base_coord_src = nir_iadd(&b, base_coord_src, src_xyz);
234 
235    /* Coordinates must have 4 channels in NIR. */
236    base_coord_src = nir_pad_vector(&b, base_coord_src, 4);
237    base_coord_dst = nir_pad_vector(&b, base_coord_dst, 4);
238 
239 /* Iterate over all pixels in the lane. num_samples is the only input.
240  * (sample, x, y, z) are generated coordinates, while "i" is the coordinates converted to
241  * an absolute index.
242  */
243 #define foreach_pixel_in_lane(num_samples, sample, x, y, z, i) \
244    for (unsigned z = 0; z < lane_depth; z++) \
245       for (unsigned y = 0; y < lane_height; y++) \
246          for (unsigned x = 0; x < lane_width; x++) \
247             for (unsigned i = ((z * lane_height + y) * lane_width + x) * (num_samples), sample = 0; \
248                  sample < (num_samples); sample++, i++) \
249 
250    /* Swizzle coordinates for 1D_ARRAY. */
251    static const unsigned swizzle_xz[] = {0, 2, 0, 0};
252 
253    /* Execute image loads and stores. */
254    unsigned num_src_coords = (key->src_is_1d ? 1 : 2) + key->src_has_z + key->src_is_msaa;
255    unsigned num_dst_coords = (key->dst_is_1d ? 1 : 2) + key->dst_has_z + key->dst_is_msaa;
256    unsigned bit_size = key->d16 ? 16 : 32;
257    unsigned num_samples = 1 << key->log_samples;
258    unsigned src_samples = key->src_is_msaa && !key->sample0_only &&
259                           !key->is_clear ? num_samples : 1;
260    unsigned dst_samples = key->dst_is_msaa ? num_samples : 1;
261    nir_def *color[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
262    nir_def *coord_dst[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
263    nir_def *src_resinfo = NULL;
264 
265    if (key->is_clear) {
266       /* The clear color starts at component 4 of user data. */
267       color[0] = nir_channels(&b, nir_load_user_data_amd(&b),
268                               BITFIELD_RANGE(4, key->d16 ? 2 : 4));
269       if (key->d16)
270          color[0] = nir_unpack_64_4x16(&b, nir_pack_64_2x32(&b, color[0]));
271 
272       foreach_pixel_in_lane(1, sample, x, y, z, i) {
273          color[i] = color[0];
274       }
275    } else {
276       nir_def *coord_src[SI_MAX_COMPUTE_BLIT_LANE_SIZE * SI_MAX_COMPUTE_BLIT_SAMPLES] = {0};
277 
278       /* Initialize src coordinates, one vector per pixel. */
279       foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
280          unsigned tmp_x = x;
281          unsigned tmp_y = y;
282 
283          /* Change the order from 0..N to N..0 for flipped blits. */
284          if (key->flip_x)
285             tmp_x = lane_width - 1 - x;
286          if (key->flip_y)
287             tmp_y = lane_height - 1 - y;
288 
289          coord_src[i] = nir_iadd(&b, base_coord_src,
290                                      nir_imm_ivec4_intN(&b, tmp_x, tmp_y, z, 0, coord_bit_size));
291          if (key->src_is_1d)
292             coord_src[i] = nir_swizzle(&b, coord_src[i], swizzle_xz, 4);
293          if (key->src_is_msaa) {
294             coord_src[i] = nir_vector_insert_imm(&b, coord_src[i],
295                                                  nir_imm_intN_t(&b, sample, coord_bit_size),
296                                                  num_src_coords - 1);
297          }
298 
299          /* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
300          for (unsigned chan = 0; chan < 2; chan++) {
301             if (chan ? key->y_clamp_to_edge : key->x_clamp_to_edge) {
302                assert(!key->src_is_1d || chan == 0);
303 
304                if (!src_resinfo) {
305                   /* Always use the 32-bit return type because the image dimensions can be
306                    * > INT16_MAX even if the blit box fits within sint16.
307                    */
308                   src_resinfo = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src),
309                                                      zero_lod);
310                   if (coord_bit_size == 16) {
311                      src_resinfo = nir_umin_imm(&b, src_resinfo, INT16_MAX);
312                      src_resinfo = nir_i2i16(&b, src_resinfo);
313                   }
314                }
315 
316                nir_def *tmp = nir_channel(&b, coord_src[i], chan);
317                tmp = nir_imax_imm(&b, tmp, 0);
318                tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, src_resinfo, chan), -1));
319                coord_src[i] = nir_vector_insert_imm(&b, coord_src[i], tmp, chan);
320             }
321          }
322       }
323 
324       /* We don't want the computation of src coordinates to be interleaved with loads. */
325       if (lane_size > 1 || src_samples > 1) {
326          ac_optimization_barrier_vgpr_array(options->info, &b, coord_src,
327                                             lane_size * src_samples, num_src_coords);
328       }
329 
330       /* Use "samples_identical" for MSAA resolving if it's supported. */
331       bool is_resolve = src_samples > 1 && dst_samples == 1;
332       bool uses_samples_identical = options->info->gfx_level < GFX11 && !options->no_fmask && is_resolve;
333       nir_def *samples_identical = NULL, *sample0[SI_MAX_COMPUTE_BLIT_LANE_SIZE] = {0};
334       nir_if *if_identical = NULL;
335 
336       if (uses_samples_identical) {
337          samples_identical = nir_imm_true(&b);
338 
339          /* If we are resolving multiple pixels per lane, AND all results of "samples_identical". */
340          foreach_pixel_in_lane(1, sample, x, y, z, i) {
341             nir_def *iden = nir_image_deref_samples_identical(&b, 1, deref_ssa(&b, img_src),
342                                                               coord_src[i * src_samples],
343                                                               .image_dim = GLSL_SAMPLER_DIM_MS);
344             samples_identical = nir_iand(&b, samples_identical, iden);
345          }
346 
347          /* If all samples are identical, load only sample 0. */
348          if_identical = nir_push_if(&b, samples_identical);
349          foreach_pixel_in_lane(1, sample, x, y, z, i) {
350             sample0[i] = nir_image_deref_load(&b, key->last_src_channel + 1, bit_size,
351                                               deref_ssa(&b, img_src), coord_src[i * src_samples],
352                                               nir_channel(&b, coord_src[i * src_samples],
353                                                           num_src_coords - 1), zero_lod,
354                                               .image_dim = img_src->type->sampler_dimensionality,
355                                               .image_array = img_src->type->sampler_array);
356          }
357          nir_push_else(&b, if_identical);
358       }
359 
360       /* Load src pixels, one per sample. */
361       foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
362          color[i] = nir_image_deref_load(&b, key->last_src_channel + 1, bit_size,
363                                          deref_ssa(&b, img_src), coord_src[i],
364                                          nir_channel(&b, coord_src[i], num_src_coords - 1), zero_lod,
365                                          .image_dim = img_src->type->sampler_dimensionality,
366                                          .image_array = img_src->type->sampler_array);
367       }
368 
369       /* Resolve MSAA if necessary. */
370       if (is_resolve) {
371          /* We don't want the averaging of samples to be interleaved with image loads. */
372          ac_optimization_barrier_vgpr_array(options->info, &b, color, lane_size * src_samples,
373                                             key->last_src_channel + 1);
374 
375          /* This reduces the "color" array from "src_samples * lane_size" elements to only
376           * "lane_size" elements.
377           */
378          foreach_pixel_in_lane(1, sample, x, y, z, i) {
379             color[i] = ac_average_samples(&b, &color[i * src_samples], src_samples);
380          }
381          src_samples = 1;
382       }
383 
384       if (uses_samples_identical) {
385          nir_pop_if(&b, if_identical);
386          foreach_pixel_in_lane(1, sample, x, y, z, i) {
387             color[i] = nir_if_phi(&b, sample0[i], color[i]);
388          }
389       }
390    }
391 
392    /* We need to load the descriptor here, otherwise the load would be after optimization
393     * barriers waiting for image loads, i.e. after s_waitcnt vmcnt(0).
394     */
395    nir_def *img_dst_desc =
396       nir_image_deref_descriptor_amd(&b, 8, 32, deref_ssa(&b, img_dst),
397                                      .image_dim = img_dst->type->sampler_dimensionality,
398                                      .image_array = img_dst->type->sampler_array);
399    if (lane_size > 1 && !b.shader->info.use_aco_amd)
400       img_dst_desc = nir_optimization_barrier_sgpr_amd(&b, 32, img_dst_desc);
401 
402    /* Apply the blit output modifiers, once per sample.  */
403    foreach_pixel_in_lane(src_samples, sample, x, y, z, i) {
404       color[i] = apply_blit_output_modifiers(&b, color[i], key);
405    }
406 
407    /* Initialize dst coordinates, one vector per pixel. */
408    foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) {
409       coord_dst[i] = nir_iadd(&b, base_coord_dst,
410                               nir_imm_ivec4_intN(&b, x, y, z, 0, coord_bit_size));
411       if (key->dst_is_1d)
412          coord_dst[i] = nir_swizzle(&b, coord_dst[i], swizzle_xz, 4);
413       if (key->dst_is_msaa) {
414          coord_dst[i] = nir_vector_insert_imm(&b, coord_dst[i],
415                                               nir_imm_intN_t(&b, sample, coord_bit_size),
416                                               num_dst_coords - 1);
417       }
418    }
419 
420    /* We don't want the computation of dst coordinates to be interleaved with stores. */
421    if (lane_size > 1 || dst_samples > 1) {
422       ac_optimization_barrier_vgpr_array(options->info, &b, coord_dst, lane_size * dst_samples,
423                                          num_dst_coords);
424    }
425 
426    /* We don't want the application of blit output modifiers to be interleaved with stores. */
427    if (!key->is_clear && (lane_size > 1 || MIN2(src_samples, dst_samples) > 1)) {
428       ac_optimization_barrier_vgpr_array(options->info, &b, color, lane_size * src_samples,
429                                          key->last_dst_channel + 1);
430    }
431 
432    /* Store the pixels, one per sample. */
433    foreach_pixel_in_lane(dst_samples, sample, x, y, z, i) {
434       nir_bindless_image_store(&b, img_dst_desc, coord_dst[i],
435                                nir_channel(&b, coord_dst[i], num_dst_coords - 1),
436                                src_samples > 1 ? color[i] : color[i / dst_samples], zero_lod,
437                                .image_dim = glsl_get_sampler_dim(img_type[1]),
438                                .image_array = glsl_sampler_type_is_array(img_type[1]));
439    }
440 
441    if (key->has_start_xyz)
442       nir_pop_if(&b, if_positive);
443 
444    return b.shader;
445 }
446 
447 static unsigned
set_work_size(struct ac_cs_blit_dispatch * dispatch,unsigned block_x,unsigned block_y,unsigned block_z,unsigned num_wg_x,unsigned num_wg_y,unsigned num_wg_z)448 set_work_size(struct ac_cs_blit_dispatch *dispatch,
449               unsigned block_x, unsigned block_y, unsigned block_z,
450               unsigned num_wg_x, unsigned num_wg_y, unsigned num_wg_z)
451 {
452    dispatch->wg_size[0] = block_x;
453    dispatch->wg_size[1] = block_y;
454    dispatch->wg_size[2] = block_z;
455 
456    unsigned num_wg[3] = {num_wg_x, num_wg_y, num_wg_z};
457    for (int i = 0; i < 3; ++i) {
458       dispatch->last_wg_size[i] = num_wg[i] % dispatch->wg_size[i];
459       dispatch->num_workgroups[i] = DIV_ROUND_UP(num_wg[i], dispatch->wg_size[i]);
460    }
461 
462    return num_wg_z > 1 ? 3 : (num_wg_y > 1 ? 2 : 1);
463 }
464 
465 static bool
should_blit_clamp_to_edge(const struct ac_cs_blit_description * blit,unsigned coord_mask)466 should_blit_clamp_to_edge(const struct ac_cs_blit_description *blit, unsigned coord_mask)
467 {
468    return util_is_box_out_of_bounds(&blit->src.box, coord_mask, blit->src.width0,
469                                     blit->src.height0, blit->src.level);
470 }
471 
472 /* Return a power-of-two alignment of a number. */
473 static unsigned
compute_alignment(unsigned x)474 compute_alignment(unsigned x)
475 {
476    return x ? BITFIELD_BIT(ffs(x) - 1) : BITFIELD_BIT(31);
477 }
478 
479 /* Set the blit info, but change the dst box and trim the src box according to the new dst box. */
480 static void
set_trimmed_blit(const struct ac_cs_blit_description * old,const struct pipe_box * box,bool is_clear,struct ac_cs_blit_description * out)481 set_trimmed_blit(const struct ac_cs_blit_description *old, const struct pipe_box *box,
482                  bool is_clear, struct ac_cs_blit_description *out)
483 {
484    assert(old->dst.box.x <= box->x);
485    assert(old->dst.box.y <= box->y);
486    assert(old->dst.box.z <= box->z);
487    assert(box->x + box->width <= old->dst.box.x + old->dst.box.width);
488    assert(box->y + box->height <= old->dst.box.y + old->dst.box.height);
489    assert(box->z + box->depth <= old->dst.box.z + old->dst.box.depth);
490    /* No scaling. */
491    assert(is_clear || old->dst.box.width == abs(old->src.box.width));
492    assert(is_clear || old->dst.box.height == abs(old->src.box.height));
493    assert(is_clear || old->dst.box.depth == abs(old->src.box.depth));
494 
495    *out = *old;
496    out->dst.box = *box;
497 
498    if (!is_clear) {
499       if (out->src.box.width > 0) {
500          out->src.box.x += box->x - old->dst.box.x;
501          out->src.box.width = box->width;
502       } else {
503          out->src.box.x -= box->x - old->dst.box.x;
504          out->src.box.width = -box->width;
505       }
506 
507       if (out->src.box.height > 0) {
508          out->src.box.y += box->y - old->dst.box.y;
509          out->src.box.height = box->height;
510       } else {
511          out->src.box.y -= box->y - old->dst.box.y;
512          out->src.box.height = -box->height;
513       }
514 
515       out->src.box.z += box->z - old->dst.box.z;
516       out->src.box.depth = box->depth;
517    }
518 }
519 
520 typedef struct {
521    unsigned x, y, z;
522 } uvec3;
523 
524 /* This function uses the blit description to generate the shader key, prepare user SGPR constants,
525  * and determine the parameters for up to 7 compute dispatches.
526  *
527  * The driver should use the shader key to create the shader, set the SGPR constants, and launch
528  * compute dispatches.
529  */
530 bool
ac_prepare_compute_blit(const struct ac_cs_blit_options * options,const struct ac_cs_blit_description * blit,struct ac_cs_blit_dispatches * out)531 ac_prepare_compute_blit(const struct ac_cs_blit_options *options,
532                         const struct ac_cs_blit_description *blit,
533                         struct ac_cs_blit_dispatches *out)
534 {
535    const struct radeon_info *info = options->info;
536    bool is_2d_tiling = !blit->dst.surf->is_linear && !blit->dst.surf->thick_tiling;
537    bool is_3d_tiling = blit->dst.surf->thick_tiling;
538    bool is_clear = !blit->src.surf;
539    unsigned dst_samples = MAX2(1, blit->dst.num_samples);
540    unsigned src_samples = is_clear ? 1 : MAX2(1, blit->src.num_samples);
541    bool is_resolve = !is_clear && dst_samples == 1 && src_samples >= 2 &&
542                      !util_format_is_pure_integer(blit->dst.format);
543    bool is_upsampling = !is_clear && src_samples == 1 && dst_samples >= 2;
544    bool sample0_only = src_samples >= 2 && dst_samples == 1 &&
545                        (blit->sample0_only || util_format_is_pure_integer(blit->dst.format));
546    /* Get the channel sizes. */
547    unsigned max_dst_chan_size = util_format_get_max_channel_size(blit->dst.format);
548    unsigned max_src_chan_size = is_clear ? 0 : util_format_get_max_channel_size(blit->src.format);
549 
550    if (!options->is_nested)
551       memset(out, 0, sizeof(*out));
552 
553    /* Reject blits with invalid parameters. */
554    if (blit->dst.box.width < 0 || blit->dst.box.height < 0 || blit->dst.box.depth < 0 ||
555        blit->src.box.depth < 0) {
556       assert(!"invalid box parameters"); /* this is reachable and prevents hangs */
557       return true;
558    }
559 
560    /* Skip zero-area blits. */
561    if (!blit->dst.box.width || !blit->dst.box.height || !blit->dst.box.depth ||
562        (!is_clear && (!blit->src.box.width || !blit->src.box.height || !blit->src.box.depth)))
563       return true;
564 
565    if (blit->dst.format == PIPE_FORMAT_A8R8_UNORM || /* This format fails AMD_TEST=imagecopy. */
566        max_dst_chan_size == 5 || /* PIPE_FORMAT_R5G5B5A1_UNORM has precision issues */
567        max_dst_chan_size == 6 || /* PIPE_FORMAT_R5G6B5_UNORM has precision issues */
568        util_format_is_depth_or_stencil(blit->dst.format) ||
569        dst_samples > SI_MAX_COMPUTE_BLIT_SAMPLES ||
570        /* Image stores support DCC since GFX10. Fail only for gfx queues because compute queues
571         * can't fall back to a pixel shader. DCC must be decompressed and disabled for compute
572         * queues by the caller. */
573        (options->info->gfx_level < GFX10 && blit->is_gfx_queue && blit->dst_has_dcc) ||
574        (!is_clear &&
575         /* Scaling is not implemented by the compute shader. */
576         (blit->dst.box.width != abs(blit->src.box.width) ||
577          blit->dst.box.height != abs(blit->src.box.height) ||
578          blit->dst.box.depth != abs(blit->src.box.depth) ||
579          util_format_is_depth_or_stencil(blit->src.format) ||
580          src_samples > SI_MAX_COMPUTE_BLIT_SAMPLES)))
581       return false;
582 
583    /* Return a failure if a compute blit is slower than a gfx blit. */
584    if (options->fail_if_slow) {
585       if (is_clear) {
586          /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21, Navi31 */
587          if (is_3d_tiling) {
588             if (info->gfx_level == GFX6 && blit->dst.surf->bpe == 8)
589                return false;
590          } else if (is_2d_tiling) {
591             if (!(info->gfx_level == GFX6 && blit->dst.surf->bpe <= 4 && dst_samples == 1) &&
592                 !(info->gfx_level == GFX7 && blit->dst.surf->bpe == 1 && dst_samples == 1))
593                return false;
594          }
595       } else {
596          /* For upsampling, image stores don't compress MSAA as good as draws. */
597          if (is_upsampling)
598             return false;
599 
600          switch (info->gfx_level) {
601          case GFX6:
602          case GFX7:
603          case GFX8:
604          case GFX9:
605          case GFX10:
606          case GFX10_3:
607             /* Verified on: Tahiti, Hawaii, Tonga, Vega10, Navi10, Navi21 */
608             if (is_resolve) {
609                if (!(info->gfx_level == GFX7 && blit->dst.surf->bpe == 16))
610                   return false;
611             } else {
612                assert(dst_samples == src_samples || sample0_only);
613 
614                if (is_2d_tiling) {
615                   if (dst_samples == 1) {
616                      if (blit->dst.surf->bpe <= 8 &&
617                          !(info->gfx_level <= GFX7 && blit->dst.surf->bpe == 1) &&
618                          !(info->gfx_level == GFX6 && blit->dst.surf->bpe == 2 &&
619                            blit->src.surf->is_linear) &&
620                          !(info->gfx_level == GFX7 && blit->dst.surf->bpe >= 2 &&
621                            blit->src.surf->is_linear) &&
622                          !((info->gfx_level == GFX8 || info->gfx_level == GFX9) &&
623                            blit->dst.surf->bpe >= 2 && blit->src.surf->is_linear) &&
624                          !(info->gfx_level == GFX10 && blit->dst.surf->bpe <= 2 &&
625                            blit->src.surf->is_linear) &&
626                          !(info->gfx_level == GFX10_3 && blit->dst.surf->bpe == 8 &&
627                            blit->src.surf->is_linear))
628                         return false;
629 
630                      if (info->gfx_level == GFX6 && blit->dst.surf->bpe == 16 &&
631                          blit->src.surf->is_linear && blit->dst.dim != 3)
632                         return false;
633 
634                      if (blit->dst.surf->bpe == 16 && !blit->src.surf->is_linear &&
635                          /* Only GFX6 selects 2D tiling for 128bpp 3D textures. */
636                          !(info->gfx_level == GFX6 && blit->dst.dim == 3) &&
637                          info->gfx_level != GFX7)
638                         return false;
639                   } else {
640                      /* MSAA copies - tested only without FMASK on Navi21. */
641                      if (blit->dst.surf->bpe >= 4)
642                         return false;
643                   }
644                }
645             }
646             break;
647 
648          case GFX11:
649          case GFX11_5:
650          default:
651             /* Verified on Navi31. */
652             if (is_resolve) {
653                if (!((blit->dst.surf->bpe <= 2 && src_samples == 2) ||
654                      (blit->dst.surf->bpe == 2 && src_samples == 4) ||
655                      (blit->dst.surf->bpe == 16 && src_samples == 4)))
656                   return false;
657             } else {
658                assert(dst_samples == src_samples || sample0_only);
659 
660                if (is_2d_tiling) {
661                   if (blit->dst.surf->bpe == 2 && blit->src.surf->is_linear && dst_samples == 1)
662                      return false;
663 
664                   if (blit->dst.surf->bpe >= 4 && dst_samples == 1 && !blit->src.surf->is_linear)
665                      return false;
666 
667                   if (blit->dst.surf->bpe == 16 && dst_samples == 8)
668                      return false;
669                }
670             }
671             break;
672          }
673       }
674    }
675 
676    unsigned width = blit->dst.box.width;
677    unsigned height = blit->dst.box.height;
678    unsigned depth = blit->dst.box.depth;
679    uvec3 lane_size = (uvec3){1, 1, 1};
680 
681    /* Determine the size of the block of pixels that will be processed by a single lane.
682     * Generally we want to load and store about 8-16B per lane, but there are exceptions.
683     * The block sizes were fine-tuned for Navi31, and might be suboptimal on different generations.
684     */
685    if (blit->dst.surf->bpe <= 8 && (is_resolve ? src_samples : dst_samples) <= 4 &&
686        /* Small blits don't benefit. */
687        width * height * depth * blit->dst.surf->bpe * dst_samples > 128 * 1024 &&
688        info->has_image_opcodes) {
689       if (is_3d_tiling) {
690          /* Thick tiling. */
691          if (!is_clear && blit->src.surf->is_linear) {
692             /* Linear -> Thick. */
693             if (blit->dst.surf->bpe == 4)
694                lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
695             else if (blit->dst.surf->bpe == 2)
696                lane_size = (uvec3){2, 1, 2}; /* 8B per lane */
697             else if (blit->dst.surf->bpe == 1)
698                lane_size = (uvec3){4, 1, 2}; /* 8B per lane */
699          } else {
700             if (blit->dst.surf->bpe == 8)
701                lane_size = (uvec3){1, 1, 2}; /* 16B per lane */
702             else if (blit->dst.surf->bpe == 4)
703                lane_size = (uvec3){1, 2, 2}; /* 16B per lane */
704             else if (blit->dst.surf->bpe == 2)
705                lane_size = (uvec3){1, 2, 4}; /* 16B per lane */
706             else
707                lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
708          }
709       } else if (blit->dst.surf->is_linear) {
710          /* Linear layout. */
711          if (!is_clear && !blit->src.surf->is_linear) {
712             /* Tiled -> Linear. */
713             if (blit->dst.surf->bpe == 8 && !blit->src.surf->thick_tiling)
714                lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
715             else if (blit->dst.surf->bpe == 4)
716                lane_size = (uvec3){1, 2, 1}; /* 8B per lane */
717             else if (blit->dst.surf->bpe == 2 && blit->src.surf->thick_tiling)
718                lane_size = (uvec3){2, 2, 1}; /* 8B per lane */
719             else if (blit->dst.surf->bpe == 1 && blit->src.surf->thick_tiling)
720                lane_size = (uvec3){2, 2, 2}; /* 8B per lane */
721             else if (blit->dst.surf->bpe <= 2)
722                lane_size = (uvec3){2, 4, 1}; /* 8-16B per lane */
723          } else {
724             /* Clear or Linear -> Linear. */
725             if (blit->dst.surf->bpe == 8)
726                lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
727             else if (blit->dst.surf->bpe == 4)
728                lane_size = (uvec3){4, 1, 1}; /* 16B per lane */
729             else if (blit->dst.surf->bpe == 2)
730                lane_size = (uvec3){4, 2, 1}; /* 16B per lane */
731             else
732                lane_size = (uvec3){8, 1, 1}; /* 8B per lane */
733          }
734       } else {
735          /* Thin tiling. */
736          if (is_resolve) {
737             if (blit->dst.surf->bpe == 8 && src_samples == 2) {
738                lane_size = (uvec3){1, 2, 1}; /* 32B->16B per lane */
739             } else if (blit->dst.surf->bpe == 4) {
740                lane_size = (uvec3){2, 1, 1}; /* 32B->8B for 4 samples, 16B->8B for 2 samples */
741             } else if (blit->dst.surf->bpe <= 2) {
742                if (src_samples == 4)
743                   lane_size = (uvec3){2, 1, 1}; /* 16B->4B for 16bpp, 8B->2B for 8bpp */
744                else
745                   lane_size = (uvec3){2, 2, 1}; /* 16B->8B for 16bpp, 8B->4B for 8bpp */
746             }
747          } else {
748             if (blit->dst.surf->bpe == 8 && dst_samples == 1)
749                lane_size = (uvec3){1, 2, 1}; /* 16B per lane */
750             else if (blit->dst.surf->bpe == 4) {
751                if (dst_samples == 2)
752                   lane_size = (uvec3){2, 1, 1}; /* 16B per lane */
753                else if (dst_samples == 1)
754                   lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
755             } else if (blit->dst.surf->bpe == 2) {
756                if (dst_samples == 4 || (!is_clear && blit->src.surf->is_linear))
757                   lane_size = (uvec3){2, 1, 1}; /* 16B per lane (4B for linear src) */
758                else if (dst_samples == 2)
759                   lane_size = (uvec3){2, 2, 1}; /* 16B per lane */
760                else
761                   lane_size = (uvec3){2, 4, 1}; /* 16B per lane */
762             } else if (blit->dst.surf->bpe == 1) {
763                if (dst_samples == 4)
764                   lane_size = (uvec3){2, 1, 1}; /* 8B per lane */
765                else if (dst_samples == 2 || (!is_clear && blit->src.surf->is_linear))
766                   lane_size = (uvec3){2, 2, 1}; /* 8B per lane (4B for linear src) */
767                else
768                   lane_size = (uvec3){2, 4, 1}; /* 8B per lane */
769             }
770          }
771       }
772    }
773 
774    /* Check that the lane size fits into the shader key. */
775    static const union ac_cs_blit_key max_lane_size = {
776       .log_lane_width = ~0,
777       .log_lane_height = ~0,
778       .log_lane_depth = ~0,
779    };
780    assert(util_logbase2(lane_size.x) <= max_lane_size.log_lane_width);
781    assert(util_logbase2(lane_size.y) <= max_lane_size.log_lane_height);
782    assert(util_logbase2(lane_size.z) <= max_lane_size.log_lane_depth);
783 
784    /* If the shader blits a block of pixels per lane, it must have the dst box aligned to that
785     * block because it can't blit a subset of pixels per lane.
786     *
787     * If the blit dst box is not aligned to the lane size, split it into multiple blits by cutting
788     * off the unaligned sides of the box and blitting the middle that's aligned to the lane size,
789     * then blit the unaligned sides separately. This splits the blit into up to 7 blits for 3D,
790     * and 5 blits for 2D.
791     */
792    if (blit->dst.box.x % lane_size.x ||
793        blit->dst.box.y % lane_size.y ||
794        blit->dst.box.z % lane_size.z ||
795        blit->dst.box.width % lane_size.x ||
796        blit->dst.box.height % lane_size.y ||
797        blit->dst.box.depth % lane_size.z) {
798       struct pipe_box middle;
799 
800       /* Cut off unaligned regions on the sides of the box. */
801       middle.x = align(blit->dst.box.x, lane_size.x);
802       middle.y = align(blit->dst.box.y, lane_size.y);
803       middle.z = align(blit->dst.box.z, lane_size.z);
804 
805       middle.width = blit->dst.box.width - (middle.x - blit->dst.box.x);
806       if (middle.width > 0)
807          middle.width -= middle.width % lane_size.x;
808       middle.height = blit->dst.box.height - (middle.y - blit->dst.box.y);
809       if (middle.height > 0)
810          middle.height -= middle.height % lane_size.y;
811       middle.depth = blit->dst.box.depth - (middle.z - blit->dst.box.z);
812       if (middle.depth > 0)
813          middle.depth -= middle.depth % lane_size.z;
814 
815       /* Only a few cases are regressed by this. The vast majority benefits a lot.
816        * This was fine-tuned for Navi31, and might be suboptimal on different generations.
817        */
818       bool slow = (blit->dst.surf->is_linear && !is_clear && blit->src.surf->is_linear && depth > 1) ||
819                   (blit->dst.surf->thick_tiling &&
820                    ((blit->dst.surf->bpe == 8 && is_clear) ||
821                     (blit->dst.surf->bpe == 4 &&
822                      (blit->dst.surf->is_linear || (!is_clear && blit->src.surf->is_linear))) ||
823                     (blit->dst.surf->bpe == 2 && blit->dst.surf->is_linear && !is_clear &&
824                      blit->src.surf->is_linear))) ||
825                   (!blit->dst.surf->thick_tiling &&
826                    ((blit->dst.surf->bpe == 4 && blit->dst.surf->is_linear && !is_clear &&
827                      blit->src.surf->is_linear) ||
828                     (blit->dst.surf->bpe == 8 && !is_clear &&
829                      blit->dst.surf->is_linear != blit->src.surf->is_linear) ||
830                     (is_resolve && blit->dst.surf->bpe == 4 && src_samples == 4) ||
831                     (is_resolve && blit->dst.surf->bpe == 8 && src_samples == 2)));
832 
833       /* Only use this if the middle blit is large enough. */
834       if (!slow && middle.width > 0 && middle.height > 0 && middle.depth > 0 &&
835           middle.width * middle.height * middle.depth * blit->dst.surf->bpe * dst_samples >
836           128 * 1024) {
837          /* Compute the size of unaligned regions on all sides of the box. */
838          struct pipe_box top, left, right, bottom, front, back;
839 
840          assert(!options->is_nested);
841 
842          top = blit->dst.box;
843          top.height = middle.y - top.y;
844 
845          bottom = blit->dst.box;
846          bottom.y = middle.y + middle.height;
847          bottom.height = blit->dst.box.height - top.height - middle.height;
848 
849          left = blit->dst.box;
850          left.y = middle.y;
851          left.height = middle.height;
852          left.width = middle.x - left.x;
853 
854          right = blit->dst.box;
855          right.y = middle.y;
856          right.height = middle.height;
857          right.x = middle.x + middle.width;
858          right.width = blit->dst.box.width - left.width - middle.width;
859 
860          front = blit->dst.box;
861          front.x = middle.x;
862          front.y = middle.y;
863          front.width = middle.width;
864          front.height = middle.height;
865          front.depth = middle.z - front.z;
866 
867          back = blit->dst.box;
868          back.x = middle.x;
869          back.y = middle.y;
870          back.width = middle.width;
871          back.height = middle.height;
872          back.z = middle.z + middle.depth;
873          back.depth = blit->dst.box.depth - front.depth - middle.depth;
874 
875          struct pipe_box boxes[] = {middle, top, bottom, left, right, front, back};
876 
877          /* Verify that the boxes don't intersect. */
878          for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
879             for (unsigned j = i + 1; j < ARRAY_SIZE(boxes); j++) {
880                if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0 &&
881                    boxes[j].width > 0 && boxes[j].height > 0 && boxes[j].depth > 0) {
882                   if (u_box_test_intersection_3d(&boxes[i], &boxes[j])) {
883                      printf("\b   (%u, %u, %u) -> (%u, %u, %u) | (%u, %u, %u) -> (%u, %u, %u)\n",
884                             boxes[i].x, boxes[i].y, boxes[i].z,
885                             boxes[i].x + boxes[i].width - 1,
886                             boxes[i].y + boxes[i].height - 1,
887                             boxes[i].z + boxes[i].depth - 1,
888                             boxes[j].x, boxes[j].y, boxes[j].z,
889                             boxes[j].x + boxes[j].width,
890                             boxes[j].y + boxes[j].height,
891                             boxes[j].z + boxes[j].depth);
892                      assert(0);
893                   }
894                }
895             }
896          }
897 
898          struct ac_cs_blit_options nested_options = *options;
899          nested_options.is_nested = true;
900 
901          for (unsigned i = 0; i < ARRAY_SIZE(boxes); i++) {
902             if (boxes[i].width > 0 && boxes[i].height > 0 && boxes[i].depth > 0) {
903                struct ac_cs_blit_description new_blit;
904                ASSERTED bool ok;
905 
906                set_trimmed_blit(blit, &boxes[i], is_clear, &new_blit);
907                ok = ac_prepare_compute_blit(&nested_options, &new_blit, out);
908                assert(ok);
909             }
910          }
911          return true;
912       }
913    }
914 
915    /* If the box can't blit split, at least reduce the lane size to the alignment of the box. */
916    lane_size.x = MIN3(lane_size.x, compute_alignment(blit->dst.box.x), compute_alignment(width));
917    lane_size.y = MIN3(lane_size.y, compute_alignment(blit->dst.box.y), compute_alignment(height));
918    lane_size.z = MIN3(lane_size.z, compute_alignment(blit->dst.box.z), compute_alignment(depth));
919 
920    /* Determine the alignment of coordinates of the first thread of each wave. The alignment should be
921     * to a 256B block or the size of 1 wave, whichever is less, but there are a few exceptions.
922     */
923    uvec3 align;
924    if (is_3d_tiling) {
925       /* Thick tiling. */
926       /* This is based on GFX11_SW_PATTERN_NIBBLE01, which also matches GFX10. */
927       if (blit->dst.surf->bpe == 1)
928          align = (uvec3){8, 4, 8};
929       else if (blit->dst.surf->bpe == 2)
930          align = (uvec3){4, 4, 8};
931       else if (blit->dst.surf->bpe == 4)
932          align = (uvec3){4, 4, 4};
933       else if (blit->dst.surf->bpe == 8)
934          align = (uvec3){4, 2, 4};
935       else {
936          /* 16bpp linear source image reads perform better with this. */
937          if (!is_clear && blit->src.surf->is_linear)
938             align = (uvec3){4, 2, 4}; /* align to 512B for linear->tiled */
939          else
940             align = (uvec3){2, 2, 4};
941       }
942 
943       /* Clamp the alignment to the expected size of 1 wave. */
944       align.x = MIN2(align.x, 4 * lane_size.x);
945       align.y = MIN2(align.y, 4 * lane_size.y);
946       align.z = MIN2(align.z, 4 * lane_size.z);
947    } else if (blit->dst.surf->is_linear) {
948       /* 1D blits from linear to linear are faster unaligned.
949        * 1D image clears don't benefit from any alignment.
950        */
951       if (height == 1 && depth == 1 && (is_clear || blit->src.surf->is_linear)) {
952          align = (uvec3){1, 1, 1};
953       } else {
954          /* Linear blits should use the cache line size instead of 256B alignment.
955           * Clamp it to the expected size of 1 wave.
956           */
957          align.x = MIN2(options->info->tcc_cache_line_size / blit->dst.surf->bpe, 64 * lane_size.x);
958          align.y = 1;
959          align.z = 1;
960       }
961    } else {
962       /* Thin tiling. */
963       if (info->gfx_level >= GFX11) {
964          /* Samples are next to each other on GFX11+. */
965          unsigned pix_size = blit->dst.surf->bpe * dst_samples;
966 
967          /* This is based on GFX11_SW_PATTERN_NIBBLE01. */
968          if (pix_size == 1)
969             align = (uvec3){16, 16, 1};
970          else if (pix_size == 2)
971             align = (uvec3){16, 8, 1};
972          else if (pix_size == 4)
973             align = (uvec3){8, 8, 1};
974          else if (pix_size == 8)
975             align = (uvec3){8, 4, 1};
976          else if (pix_size == 16)
977             align = (uvec3){4, 4, 1};
978          else if (pix_size == 32)
979             align = (uvec3){4, 2, 1};
980          else if (pix_size == 64)
981             align = (uvec3){2, 2, 1};
982          else
983             align = (uvec3){2, 1, 1}; /* 16bpp 8xAA */
984       } else {
985          /* This is for 64KB_R_X. (most likely to occur due to DCC)
986           * It's based on GFX10_SW_64K_R_X_*xaa_RBPLUS_PATINFO (GFX10.3).
987           * The patterns are GFX10_SW_PATTERN_NIBBLE01[0, 1, 39, 6, 7] for 8bpp-128bpp.
988           * GFX6-10.1 and other swizzle modes might be similar.
989           */
990          if (blit->dst.surf->bpe == 1)
991             align = (uvec3){16, 16, 1};
992          else if (blit->dst.surf->bpe == 2)
993             align = (uvec3){16, 8, 1};
994          else if (blit->dst.surf->bpe == 4)
995             align = (uvec3){8, 8, 1};
996          else if (blit->dst.surf->bpe == 8)
997             align = (uvec3){8, 4, 1};
998          else
999             align = (uvec3){4, 4, 1};
1000       }
1001 
1002       /* Clamp the alignment to the expected size of 1 wave. */
1003       align.x = MIN2(align.x, 8 * lane_size.x);
1004       align.y = MIN2(align.y, 8 * lane_size.y);
1005    }
1006 
1007    /* If we don't have much to copy, don't align. The threshold is guessed and isn't covered
1008     * by benchmarking.
1009     */
1010    if (width <= align.x * 4)
1011       align.x = 1;
1012    if (height <= align.y * 4)
1013       align.y = 1;
1014    if (depth <= align.z * 4)
1015       align.z = 1;
1016 
1017    unsigned start_x, start_y, start_z;
1018    unsigned block_x, block_y, block_z;
1019 
1020    /* If the blit destination area is unaligned, launch extra threads before 0,0,0 to make it
1021     * aligned. This makes sure that a wave doesn't straddle a DCC block boundary or a cache line
1022     * unnecessarily, so that each cache line is only stored by exactly 1 CU. The shader will skip
1023     * the extra threads. This makes unaligned compute blits faster.
1024     */
1025    start_x = blit->dst.box.x % align.x;
1026    start_y = blit->dst.box.y % align.y;
1027    start_z = blit->dst.box.z % align.z;
1028    width += start_x;
1029    height += start_y;
1030    depth += start_z;
1031 
1032    /* Divide by the dispatch parameters by the lane size. */
1033    assert(start_x % lane_size.x == 0);
1034    assert(start_y % lane_size.y == 0);
1035    assert(start_z % lane_size.z == 0);
1036    assert(width % lane_size.x == 0);
1037    assert(height % lane_size.y == 0);
1038    assert(depth % lane_size.z == 0);
1039 
1040    start_x /= lane_size.x;
1041    start_y /= lane_size.y;
1042    start_z /= lane_size.z;
1043    width /= lane_size.x;
1044    height /= lane_size.y;
1045    depth /= lane_size.z;
1046 
1047    /* Choose the block (i.e. wave) dimensions based on the copy area size and the image layout
1048     * of dst.
1049     */
1050    if (is_3d_tiling) {
1051       /* Thick tiling. (microtiles are 3D boxes)
1052        * If the box height and depth is > 2, the block size will be 4x4x4.
1053        * If not, the threads will spill over to X.
1054        */
1055       block_y = util_next_power_of_two(MIN2(height, 4));
1056       block_z = util_next_power_of_two(MIN2(depth, 4));
1057       block_x = 64 / (block_y * block_z);
1058    } else if (blit->dst.surf->is_linear) {
1059       /* If the box width is > 128B, the block size will be 64x1 for bpp <= 4, 32x2 for bpp == 8,
1060        * and 16x4 for bpp == 16.
1061        * If not, the threads will spill over to Y, then Z if they aren't small.
1062        *
1063        * This is derived from the fact that the linear image layout has 256B linear blocks, and
1064        * longer blocks don't benefit linear write performance, but they hurt tiled read performance.
1065        * We want to prioritize blocks that are 256Bx2 over 512Bx1 because the source can be tiled.
1066        *
1067        * Using the cache line size (128B) instead of hardcoding 256B makes linear blits slower.
1068        */
1069       block_x = util_next_power_of_two(MIN3(width, 64, 256 / blit->dst.surf->bpe));
1070       block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
1071       block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
1072       block_x = 64 / (block_y * block_z);
1073    } else {
1074       /* Thin tiling. (microtiles are 2D rectangles)
1075        * If the box width and height is > 4, the block size will be 8x8.
1076        * If Y is <= 4, the threads will spill over to X.
1077        * If X is <= 4, the threads will spill over to Y, then Z if they aren't small.
1078        */
1079       block_y = util_next_power_of_two(MIN2(height, 8));
1080       block_x = util_next_power_of_two(MIN2(width, 64 / block_y));
1081       block_y = util_next_power_of_two(MIN2(height, 64 / block_x));
1082       block_z = util_next_power_of_two(MIN2(depth, 64 / (block_x * block_y)));
1083       block_x = 64 / (block_y * block_z);
1084    }
1085 
1086    unsigned index = out->num_dispatches++;
1087    assert(index < ARRAY_SIZE(out->dispatches));
1088    struct ac_cs_blit_dispatch *dispatch = &out->dispatches[index];
1089    unsigned wg_dim = set_work_size(dispatch, block_x, block_y, block_z, width, height, depth);
1090 
1091    /* Get the shader key. */
1092    union ac_cs_blit_key key;
1093    key.key = 0;
1094 
1095    /* Only ACO can form VMEM clauses for image stores, which is a requirement for performance. */
1096    key.use_aco = true;
1097    key.is_clear = is_clear;
1098    key.wg_dim = wg_dim;
1099    key.has_start_xyz = start_x || start_y || start_z;
1100    key.log_lane_width = util_logbase2(lane_size.x);
1101    key.log_lane_height = util_logbase2(lane_size.y);
1102    key.log_lane_depth = util_logbase2(lane_size.z);
1103    key.dst_is_1d = blit->dst.dim == 1;
1104    key.dst_is_msaa = dst_samples > 1;
1105    key.dst_has_z = blit->dst.dim == 3 || blit->dst.is_array;
1106    key.last_dst_channel = util_format_get_last_component(blit->dst.format);
1107 
1108    /* ACO doesn't support D16 on GFX8 */
1109    bool has_d16 = info->gfx_level >= (key.use_aco || options->use_aco ? GFX9 : GFX8);
1110 
1111    if (is_clear) {
1112       assert(dst_samples <= 8);
1113       key.log_samples = util_logbase2(dst_samples);
1114       key.a16 = info->gfx_level >= GFX9 && util_is_box_sint16(&blit->dst.box);
1115       key.d16 = has_d16 &&
1116                 max_dst_chan_size <= (util_format_is_float(blit->dst.format) ||
1117                                       util_format_is_pure_integer(blit->dst.format) ? 16 : 11);
1118    } else {
1119       key.src_is_1d = blit->src.dim == 1;
1120       key.src_is_msaa = src_samples > 1;
1121       key.src_has_z = blit->src.dim == 3 || blit->src.is_array;
1122       /* Resolving integer formats only copies sample 0. log_samples is then unused. */
1123       key.sample0_only = sample0_only;
1124       unsigned num_samples = MAX2(src_samples, dst_samples);
1125       assert(num_samples <= 8);
1126       key.log_samples = sample0_only ? 0 : util_logbase2(num_samples);
1127       key.x_clamp_to_edge = should_blit_clamp_to_edge(blit, BITFIELD_BIT(0));
1128       key.y_clamp_to_edge = should_blit_clamp_to_edge(blit, BITFIELD_BIT(1));
1129       key.flip_x = blit->src.box.width < 0;
1130       key.flip_y = blit->src.box.height < 0;
1131       key.sint_to_uint = util_format_is_pure_sint(blit->src.format) &&
1132                          util_format_is_pure_uint(blit->dst.format);
1133       key.uint_to_sint = util_format_is_pure_uint(blit->src.format) &&
1134                          util_format_is_pure_sint(blit->dst.format);
1135       key.dst_is_srgb = util_format_is_srgb(blit->dst.format);
1136       key.last_src_channel = MIN2(util_format_get_last_component(blit->src.format),
1137                                   key.last_dst_channel);
1138       key.use_integer_one = util_format_is_pure_integer(blit->dst.format) &&
1139                             key.last_src_channel < key.last_dst_channel &&
1140                             key.last_dst_channel == 3;
1141       key.a16 = info->gfx_level >= GFX9 && util_is_box_sint16(&blit->dst.box) &&
1142                 util_is_box_sint16(&blit->src.box);
1143       key.d16 = has_d16 &&
1144                 /* Blitting FP16 using D16 has precision issues. Resolving has precision
1145                  * issues all the way down to R11G11B10_FLOAT. */
1146                 MIN2(max_dst_chan_size, max_src_chan_size) <=
1147                 (util_format_is_pure_integer(blit->dst.format) ?
1148                     (key.sint_to_uint || key.uint_to_sint ? 10 : 16) :
1149                     (is_resolve ? 10 : 11));
1150    }
1151 
1152    dispatch->shader_key = key;
1153 
1154    dispatch->user_data[0] = (blit->src.box.x & 0xffff) | ((blit->dst.box.x & 0xffff) << 16);
1155    dispatch->user_data[1] = (blit->src.box.y & 0xffff) | ((blit->dst.box.y & 0xffff) << 16);
1156    dispatch->user_data[2] = (blit->src.box.z & 0xffff) | ((blit->dst.box.z & 0xffff) << 16);
1157    dispatch->user_data[3] = (start_x & 0xff) | ((start_y & 0xff) << 8) | ((start_z & 0xff) << 16);
1158 
1159    if (is_clear) {
1160       union pipe_color_union final_value;
1161       memcpy(&final_value, &blit->clear_color, sizeof(final_value));
1162 
1163       /* Do the conversion to sRGB here instead of the shader. */
1164       if (util_format_is_srgb(blit->dst.format)) {
1165          for (int i = 0; i < 3; i++)
1166             final_value.f[i] = util_format_linear_to_srgb_float(final_value.f[i]);
1167       }
1168 
1169       if (key.d16) {
1170          enum pipe_format data_format;
1171 
1172          if (util_format_is_pure_uint(blit->dst.format))
1173             data_format = PIPE_FORMAT_R16G16B16A16_UINT;
1174          else if (util_format_is_pure_sint(blit->dst.format))
1175             data_format = PIPE_FORMAT_R16G16B16A16_SINT;
1176          else
1177             data_format = PIPE_FORMAT_R16G16B16A16_FLOAT;
1178 
1179          util_pack_color_union(data_format, (union util_color *)&dispatch->user_data[4],
1180                                &final_value);
1181       } else {
1182          memcpy(&dispatch->user_data[4], &final_value, sizeof(final_value));
1183       }
1184    }
1185 
1186    return true;
1187 }
1188