/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU unit can only do 32-bit general vector access, so for anything
 * else we need to split vector load/store instructions into scalar ones.
 *
 * Note that a vectorization pass after this lowering may be able to
 * re-vectorize some of these using 32-bit load/store instructions instead,
 * which we do support.
 */

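/*
 * For example (a sketch, not exact NIR syntax), a two-component 16-bit
 * uniform load such as
 *
 *    vec2 16 %1 = load_uniform (%0) (base=0)
 *
 * is rewritten into scalar loads whose base is advanced by the component
 * size in bytes, plus a vec to reassemble the original destination:
 *
 *    vec1 16 %2 = load_uniform (%0) (base=0)
 *    vec1 16 %3 = load_uniform (%0) (base=2)
 *    vec2 16 %4 = vec2 %2, %3
 */
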
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{

        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);
        assert(!nir_intrinsic_has_align_mul(intr));
        assert(nir_intrinsic_has_base(intr));

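        /* The per-component byte offset is folded into the intrinsic's base
         * index; the offset source itself is reused unchanged.
         */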
        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        nir_intrinsic_set_base(new_intr, nir_intrinsic_base(intr) + offset_adj);
        new_intr->num_components = 1;

        return new_intr;
}

static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        int offset_idx = nir_get_io_offset_src_number(intr);
        assert(offset_idx >= 0);
        nir_def *offset = intr->src[offset_idx].ssa;

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[16] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1,
                             bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_replace(&intr->def, new_dst);
        return true;
}

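/* Only load_uniform needs to be split by hand here: the memory modes are
 * covered by nir_lower_mem_access_bit_sizes() with v3d_size_align_cb below
 * (see v3d_nir_lower_load_store_bitsize).
 */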
static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_uniform:
                return lower_load_bitsize(b, intr);

        default:
                return false;
        }
}

/*
 * The idea here is to lower bit_sizes until we meet the alignment of the
 * data, so that we don't have to use atomics. Load/stores that can operate
 * with a bit_size of 32 are kept vectorized, with up to 4 components.
 */
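/*
 * For example (a sketch of how this callback behaves, not a captured trace):
 * a 16-byte SSBO load that is only known to be 4-byte aligned is answered
 * with a single-component 32-bit access (align / 4 == 1), so it ends up as
 * four scalar loads, while the same load with 16-byte alignment keeps all
 * four components in a single 32-bit vector access.
 */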
static nir_mem_access_size_align
v3d_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
                  uint8_t input_bit_size, uint32_t align,
                  uint32_t align_offset, bool offset_is_const,
                  enum gl_access_qualifier access, const void *cb_data)
{
        /* We only support single-component 32-bit load/stores on scratch */
        if (intrin == nir_intrinsic_load_scratch ||
            intrin == nir_intrinsic_store_scratch) {
                return (nir_mem_access_size_align){
                        .num_components = 1,
                        .bit_size = 32,
                        .align = 4,
                        .shift = nir_mem_access_shift_method_scalar,
                };
        }

        align = nir_combined_align(align, align_offset);
        assert(util_is_power_of_two_nonzero(align));

        /* TODO: we could update the bit_size to 32 if possible, but that might
         * cause suboptimal pack/unpack operations.
         */
        unsigned bit_size = MIN2(32, input_bit_size);

        /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
         * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads
         * due to the size.
         */
        if (align == 1)
                bit_size = 8;
        else if (align == 2)
                bit_size = MIN2(bit_size, 16);

        /* We only support single-component loads for anything below 32-bit,
         * and up to 4 components for 32-bit.
         */
        unsigned num_components;
        if (bit_size == 32) {
                num_components = MIN2(bytes / 4, 4);

                /* Now we have to reduce num_components even further for
                 * unaligned vector load/stores.
                 */
                num_components = MIN2(align / 4, num_components);
        } else {
                num_components = 1;
        }

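        /* Require the access to be aligned to its size in bytes, rounding
         * vec3 up to a vec4-sized boundary.
         */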
        return (nir_mem_access_size_align){
                .num_components = num_components,
                .bit_size = bit_size,
                .align = (bit_size / 8) * (num_components == 3 ? 4 : num_components),
                .shift = nir_mem_access_shift_method_scalar,
        };
}

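/*
 * The *_2x32 global intrinsics take the address as a vec2 of 32-bit values.
 * V3D uses 32-bit addresses, so we can switch to the regular scalar-address
 * variants and keep only the first (low) component of the address.
 */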
static nir_intrinsic_op
convert_global_2x32_to_scalar(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_2x32:
                return nir_intrinsic_global_atomic;
        case nir_intrinsic_global_atomic_swap_2x32:
                return nir_intrinsic_global_atomic_swap;
        case nir_intrinsic_load_global_2x32:
                return nir_intrinsic_load_global;
        case nir_intrinsic_store_global_2x32:
                return nir_intrinsic_store_global;
        default:
                return op;
        }
}

static bool
lower_global_2x32(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
        nir_intrinsic_op op = convert_global_2x32_to_scalar(intr->intrinsic);
        if (op == intr->intrinsic)
                return false;

        b->cursor = nir_before_instr(&intr->instr);
        nir_src *addr_src = nir_get_io_offset_src(intr);
        nir_src_rewrite(addr_src, nir_channel(b, addr_src->ssa, 0));
        intr->intrinsic = op;

        return true;
}

bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        nir_lower_mem_access_bit_sizes_options lower_options = {
                .modes = nir_var_mem_global | nir_var_mem_ssbo |
                         nir_var_mem_ubo | nir_var_mem_constant |
                         nir_var_mem_shared | nir_var_function_temp,
                .callback = v3d_size_align_cb,
        };

        bool res = nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                              nir_metadata_control_flow,
                                              NULL);
        res |= nir_lower_mem_access_bit_sizes(s, &lower_options);
        return res;
}

bool
v3d_nir_lower_global_2x32(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_global_2x32,
                                           nir_metadata_control_flow,
                                           NULL);
}