/*
 * Copyright © 2018 Intel Corporation
 * Copyright © 2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/bitscan.h"
#include "util/u_math.h"
#include "nir_builder.h"

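/* Re-emit the given memory intrinsic with a new offset, alignment, data
 * source (for stores), component count, and bit size.  All other sources
 * and constant indices are copied from the original instruction.
 */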
static nir_intrinsic_instr *
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                  nir_def *offset,
                  unsigned align_mul, unsigned align_offset,
                  nir_def *data,
                  unsigned num_components, unsigned bit_size)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   nir_intrinsic_instr *dup =
      nir_intrinsic_instr_create(b->shader, intrin->intrinsic);

   nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
   for (unsigned i = 0; i < info->num_srcs; i++) {
      if (i == 0 && data != NULL) {
         assert(!info->has_dest);
         assert(&intrin->src[i] != intrin_offset_src);
         dup->src[i] = nir_src_for_ssa(data);
      } else if (&intrin->src[i] == intrin_offset_src) {
         dup->src[i] = nir_src_for_ssa(offset);
      } else {
         dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
      }
   }

   dup->num_components = num_components;
   for (unsigned i = 0; i < info->num_indices; i++)
      dup->const_index[i] = intrin->const_index[i];

   nir_intrinsic_set_align(dup, align_mul, align_offset);

   if (info->has_dest) {
      nir_def_init(&dup->instr, &dup->def, num_components, bit_size);
   } else {
      nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
   }

   nir_builder_instr_insert(b, &dup->instr);

   return dup;
}

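/* Lower a memory load whose size, bit size, or alignment the backend cannot
 * handle directly.  The callback chooses the access to emit for each
 * remaining chunk and the chunk results are stitched back together with
 * nir_extract_bits.
 */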
static bool
lower_mem_load(nir_builder *b, nir_intrinsic_instr *intrin,
               nir_lower_mem_access_bit_sizes_cb mem_access_size_align_cb,
               const void *cb_data)
{
   const unsigned bit_size = intrin->def.bit_size;
   const unsigned num_components = intrin->def.num_components;
   const unsigned bytes_read = num_components * (bit_size / 8);
   const uint32_t align_mul = nir_intrinsic_align_mul(intrin);
   const uint32_t whole_align_offset = nir_intrinsic_align_offset(intrin);
   const uint32_t whole_align = nir_intrinsic_align(intrin);
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   nir_def *offset = offset_src->ssa;

   nir_mem_access_size_align requested =
      mem_access_size_align_cb(intrin->intrinsic, bytes_read,
                               bit_size, align_mul, whole_align_offset,
                               offset_is_const, cb_data);

   assert(util_is_power_of_two_nonzero(align_mul));
   assert(util_is_power_of_two_nonzero(requested.align));
   if (requested.num_components == num_components &&
       requested.bit_size == bit_size &&
       requested.align <= whole_align)
      return false;

   /* Otherwise, we have to break it into chunks.  We could end up with as
    * many as 32 chunks if we're loading a u64vec16 as individual dwords.
    */
   nir_def *chunks[32];
   unsigned num_chunks = 0;
   unsigned chunk_start = 0;
   while (chunk_start < bytes_read) {
      const unsigned bytes_left = bytes_read - chunk_start;
      const uint32_t chunk_align_offset =
         (whole_align_offset + chunk_start) % align_mul;
      const uint32_t chunk_align =
         nir_combined_align(align_mul, chunk_align_offset);
      requested = mem_access_size_align_cb(intrin->intrinsic, bytes_left,
                                           bit_size, align_mul, chunk_align_offset,
                                           offset_is_const, cb_data);

      unsigned chunk_bytes;
      assert(util_is_power_of_two_nonzero(requested.align));
      if (align_mul < requested.align) {
         /* For this case, we need to be able to shift the value so we assume
          * the alignment is less than the size of a single component.  This
          * ensures that we don't need to upcast in order to shift.
          */
         assert(requested.bit_size >= requested.align * 8);

         uint64_t align_mask = requested.align - 1;
         nir_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
         nir_def *pad = nir_iand_imm(b, chunk_offset, align_mask);
         chunk_offset = nir_iand_imm(b, chunk_offset, ~align_mask);

         nir_intrinsic_instr *load =
            dup_mem_intrinsic(b, intrin, chunk_offset,
                              requested.align, 0, NULL,
                              requested.num_components, requested.bit_size);

         unsigned max_pad = requested.align - chunk_align;
         unsigned requested_bytes =
            requested.num_components * requested.bit_size / 8;
         chunk_bytes = MIN2(bytes_left, requested_bytes - max_pad);

         nir_def *shift = nir_imul_imm(b, pad, 8);
         nir_def *shifted = nir_ushr(b, &load->def, shift);

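         /* The ushr above only shifts within each component.  For a
          * multi-component load, pull the low bits of component i+1 into the
          * high bits of component i so the whole vector is shifted down by
          * pad bytes.  When shift is zero the reverse shift amount equals
          * the bit size, which is out of range for ishl, so select the
          * unshifted value instead.
          */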
         if (load->def.num_components > 1) {
            nir_def *rev_shift =
               nir_isub_imm(b, load->def.bit_size, shift);
            nir_def *rev_shifted = nir_ishl(b, &load->def, rev_shift);

            nir_def *comps[NIR_MAX_VEC_COMPONENTS];
            for (unsigned i = 1; i < load->def.num_components; i++)
               comps[i - 1] = nir_channel(b, rev_shifted, i);

            comps[load->def.num_components - 1] =
               nir_imm_zero(b, 1, load->def.bit_size);

            rev_shifted = nir_vec(b, comps, load->def.num_components);
            shifted = nir_bcsel(b, nir_ieq_imm(b, shift, 0), &load->def,
                                nir_ior(b, shifted, rev_shifted));
         }

         unsigned chunk_bit_size = MIN2(8 << (ffs(chunk_bytes) - 1), bit_size);
         unsigned chunk_num_components = chunk_bytes / (chunk_bit_size / 8);

         /* There's no guarantee that chunk_num_components is a valid NIR
          * vector size, so just loop one chunk component at a time
          */
         for (unsigned i = 0; i < chunk_num_components; i++) {
            assert(num_chunks < ARRAY_SIZE(chunks));
            chunks[num_chunks++] =
               nir_extract_bits(b, &shifted, 1, i * chunk_bit_size,
                                1, chunk_bit_size);
         }
      } else if (chunk_align_offset % requested.align) {
         /* In this case, we know how much to adjust the offset */
         uint32_t delta = chunk_align_offset % requested.align;
         nir_def *load_offset =
            nir_iadd_imm(b, offset, chunk_start - (int)delta);

         const uint32_t load_align_offset =
            (chunk_align_offset - delta) % align_mul;

         nir_intrinsic_instr *load =
            dup_mem_intrinsic(b, intrin, load_offset,
                              align_mul, load_align_offset, NULL,
                              requested.num_components, requested.bit_size);

         assert(requested.bit_size >= 8);
         chunk_bytes = requested.num_components * (requested.bit_size / 8);
         assert(chunk_bytes > delta);
         chunk_bytes -= delta;

         unsigned chunk_bit_size = MIN2(8 << (ffs(chunk_bytes) - 1), bit_size);
         unsigned chunk_num_components = chunk_bytes / (chunk_bit_size / 8);

         /* There's no guarantee that chunk_num_components is a valid NIR
          * vector size, so just loop one chunk component at a time
          */
         nir_def *chunk_data = &load->def;
         for (unsigned i = 0; i < chunk_num_components; i++) {
            assert(num_chunks < ARRAY_SIZE(chunks));
            chunks[num_chunks++] =
               nir_extract_bits(b, &chunk_data, 1,
                                delta * 8 + i * chunk_bit_size,
                                1, chunk_bit_size);
         }
      } else {
         nir_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
         nir_intrinsic_instr *load =
            dup_mem_intrinsic(b, intrin, chunk_offset,
                              align_mul, chunk_align_offset, NULL,
                              requested.num_components, requested.bit_size);

         chunk_bytes = requested.num_components * (requested.bit_size / 8);
         assert(num_chunks < ARRAY_SIZE(chunks));
         chunks[num_chunks++] = &load->def;
      }

      chunk_start += chunk_bytes;
   }

   nir_def *result = nir_extract_bits(b, chunks, num_chunks, 0,
                                      num_components, bit_size);
   nir_def_rewrite_uses(&intrin->def, result);
   nir_instr_remove(&intrin->instr);

   return true;
}

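/* Lower a memory store that is too wide, has an unsupported bit size, is
 * insufficiently aligned, or has a partial write mask.  The written bytes
 * are tracked in a per-byte bitmask and emitted as a series of smaller
 * stores; if allowed, chunks that cannot be stored directly are written
 * with a pair of 32-bit iand/ior atomics instead.
 */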
static bool
lower_mem_store(nir_builder *b, nir_intrinsic_instr *intrin,
                nir_lower_mem_access_bit_sizes_cb mem_access_size_align_cb,
                const void *cb_data, bool allow_unaligned_stores_as_atomics)
{
   nir_def *value = intrin->src[0].ssa;

   assert(intrin->num_components == value->num_components);
   const unsigned bit_size = value->bit_size;
   const unsigned byte_size = bit_size / 8;
   const unsigned num_components = intrin->num_components;
   const unsigned bytes_written = num_components * byte_size;
   const uint32_t align_mul = nir_intrinsic_align_mul(intrin);
   const uint32_t whole_align_offset = nir_intrinsic_align_offset(intrin);
   const uint32_t whole_align = nir_intrinsic_align(intrin);
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   nir_def *offset = offset_src->ssa;

   nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
   assert(writemask < (1 << num_components));

   nir_mem_access_size_align requested =
      mem_access_size_align_cb(intrin->intrinsic, bytes_written,
                               bit_size, align_mul, whole_align_offset,
                               offset_is_const, cb_data);

   assert(util_is_power_of_two_nonzero(align_mul));
   assert(util_is_power_of_two_nonzero(requested.align));
   if (requested.num_components == num_components &&
       requested.bit_size == bit_size &&
       requested.align <= whole_align &&
       writemask == BITFIELD_MASK(num_components))
      return false;

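   /* Build a bitset with one bit per byte actually written by the store, as
    * selected by the write mask.  Each iteration of the loop below then
    * peels off the next contiguous run of set bytes and stores it.
    */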
   assert(byte_size <= sizeof(uint64_t));
   BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t));
   BITSET_ZERO(mask);

   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1u << i)) {
         BITSET_SET_RANGE_INSIDE_WORD(mask, i * byte_size,
                                      ((i + 1) * byte_size) - 1);
      }
   }

   while (BITSET_FFS(mask) != 0) {
      const uint32_t chunk_start = BITSET_FFS(mask) - 1;

      uint32_t end;
      for (end = chunk_start + 1; end < bytes_written; end++) {
         if (!(BITSET_TEST(mask, end)))
            break;
      }
      /* The size of the current contiguous chunk in bytes */
      const uint32_t max_chunk_bytes = end - chunk_start;
      const uint32_t chunk_align_offset =
         (whole_align_offset + chunk_start) % align_mul;
      const uint32_t chunk_align =
         nir_combined_align(align_mul, chunk_align_offset);

      requested = mem_access_size_align_cb(intrin->intrinsic, max_chunk_bytes,
                                           bit_size, align_mul, chunk_align_offset,
                                           offset_is_const, cb_data);

      uint32_t chunk_bytes = requested.num_components * (requested.bit_size / 8);

      assert(util_is_power_of_two_nonzero(requested.align));
      if (chunk_align < requested.align ||
          chunk_bytes > max_chunk_bytes) {
         /* Otherwise the caller made a mistake with their return values. */
         assert(chunk_bytes <= 4);
         assert(allow_unaligned_stores_as_atomics);

         /* We'll turn this into a pair of 32-bit atomics to modify only the
          * right bits of memory.
          */
         requested = (nir_mem_access_size_align){
            .align = 4,
            .bit_size = 32,
            .num_components = 1,
         };

         uint64_t align_mask = requested.align - 1;
         nir_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
         nir_def *pad = chunk_align < 4 ?
            nir_iand_imm(b, chunk_offset, align_mask) :
            nir_imm_intN_t(b, 0, chunk_offset->bit_size);
         chunk_offset = nir_iand_imm(b, chunk_offset, ~align_mask);

         unsigned max_pad = chunk_align < requested.align ?
            requested.align - chunk_align : 0;
         unsigned requested_bytes =
            requested.num_components * requested.bit_size / 8;
         chunk_bytes = MIN2(max_chunk_bytes, requested_bytes - max_pad);
         unsigned chunk_bits = chunk_bytes * 8;

         nir_def *data;
         if (chunk_bits == 24) {
            /* This is a bit of a special case because we don't have 24-bit integers */
            data = nir_extract_bits(b, &value, 1, chunk_start * 8, 3, 8);
            data = nir_pack_bits(b, nir_pad_vector_imm_int(b, data, 0, 4), 32);
         } else {
            data = nir_extract_bits(b, &value, 1, chunk_start * 8, 1, chunk_bits);
            data = nir_u2u32(b, data);
         }

         nir_def *iand_mask = nir_imm_int(b, (1 << chunk_bits) - 1);

         if (chunk_align < requested.align) {
            nir_def *shift = nir_u2u32(b, nir_imul_imm(b, pad, 8));
            data = nir_ishl(b, data, shift);
            iand_mask = nir_ishl(b, iand_mask, shift);
         }

         iand_mask = nir_inot(b, iand_mask);

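         /* Read-modify-write the destination dword: the iand atomic clears
          * the bytes this chunk covers and the ior atomic merges in the new
          * data, leaving the surrounding bytes untouched.
          */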
         switch (intrin->intrinsic) {
         case nir_intrinsic_store_ssbo:
            nir_ssbo_atomic(b, 32, intrin->src[1].ssa, chunk_offset, iand_mask,
                            .atomic_op = nir_atomic_op_iand,
                            .access = nir_intrinsic_access(intrin));
            nir_ssbo_atomic(b, 32, intrin->src[1].ssa, chunk_offset, data,
                            .atomic_op = nir_atomic_op_ior,
                            .access = nir_intrinsic_access(intrin));
            break;
         case nir_intrinsic_store_global:
            nir_global_atomic(b, 32, chunk_offset, iand_mask,
                              .atomic_op = nir_atomic_op_iand);
            nir_global_atomic(b, 32, chunk_offset, data,
                              .atomic_op = nir_atomic_op_ior);
            break;
         case nir_intrinsic_store_shared:
            nir_shared_atomic(b, 32, chunk_offset, iand_mask,
                              .atomic_op = nir_atomic_op_iand,
                              .base = nir_intrinsic_base(intrin));
            nir_shared_atomic(b, 32, chunk_offset, data,
                              .atomic_op = nir_atomic_op_ior,
                              .base = nir_intrinsic_base(intrin));
            break;
         default:
            unreachable("Unsupported unaligned store");
         }
      } else {
         nir_def *packed = nir_extract_bits(b, &value, 1, chunk_start * 8,
                                            requested.num_components,
                                            requested.bit_size);

         nir_def *chunk_offset = nir_iadd_imm(b, offset, chunk_start);
         dup_mem_intrinsic(b, intrin, chunk_offset,
                           align_mul, chunk_align_offset, packed,
                           requested.num_components, requested.bit_size);
      }
      BITSET_CLEAR_RANGE(mask, chunk_start, (chunk_start + chunk_bytes - 1));
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

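/* Map a load/store intrinsic to the variable mode(s) it accesses so the pass
 * can be filtered by the modes field of the options struct.  Returns 0 for
 * intrinsics this pass does not handle.
 */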
static nir_variable_mode
intrin_to_variable_mode(nir_intrinsic_op intrin)
{
   switch (intrin) {
   case nir_intrinsic_load_ubo:
      return nir_var_mem_ubo;

   case nir_intrinsic_load_global:
   case nir_intrinsic_store_global:
      return nir_var_mem_global;

   case nir_intrinsic_load_global_constant:
      return nir_var_mem_constant;

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      return nir_var_mem_ssbo;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_store_shared:
      return nir_var_mem_shared;

   case nir_intrinsic_load_scratch:
   case nir_intrinsic_store_scratch:
      return nir_var_shader_temp | nir_var_function_temp;

   case nir_intrinsic_load_task_payload:
   case nir_intrinsic_store_task_payload:
      return nir_var_mem_task_payload;

   default:
      return 0;
   }
}

static bool
lower_mem_access_instr(nir_builder *b, nir_instr *instr, void *_data)
{
   const nir_lower_mem_access_bit_sizes_options *state = _data;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   if (!(state->modes & intrin_to_variable_mode(intrin->intrinsic)))
      return false;

   b->cursor = nir_after_instr(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_scratch:
   case nir_intrinsic_load_task_payload:
      return lower_mem_load(b, intrin, state->callback, state->cb_data);

   case nir_intrinsic_store_global:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_scratch:
   case nir_intrinsic_store_task_payload:
      return lower_mem_store(b, intrin, state->callback, state->cb_data,
                             state->may_lower_unaligned_stores_to_atomics);

   default:
      return false;
   }
}

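/* Lower the bit sizes, component counts, and alignments of memory loads and
 * stores in the modes selected by options->modes to accesses the backend can
 * support, as described by options->callback.
 */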
bool
nir_lower_mem_access_bit_sizes(nir_shader *shader,
                               const nir_lower_mem_access_bit_sizes_options *options)
{
   return nir_shader_instructions_pass(shader, lower_mem_access_instr,
                                       nir_metadata_block_index |
                                          nir_metadata_dominance,
                                       (void *)options);
}