/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU can only do general vector access with 32-bit components, so
 * for anything else we need to split vector load/store instructions into
 * scalar ones.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of the resulting accesses using 32-bit load/store
 * instructions, which we do support.
 */
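
/*
 * As an illustrative sketch of the lowering (NIR shown schematically, with
 * invented SSA names, and the trivial add of 0 for the first component
 * omitted for clarity), a 16-bit two-component SSBO load:
 *
 *    vec2 16 ssa_3 = intrinsic load_ssbo (ssa_1, ssa_2) ...
 *
 * is split into one scalar load per component, with the second load reading
 * at the original offset plus 2 bytes, and the results recombined into a
 * vector:
 *
 *    vec1 16 ssa_4 = intrinsic load_ssbo (ssa_1, ssa_2) ...
 *    vec1 32 ssa_5 = iadd ssa_2, 2
 *    vec1 16 ssa_6 = intrinsic load_ssbo (ssa_1, ssa_5) ...
 *    vec2 16 ssa_7 = vec2 ssa_4, ssa_6
 */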

/* Index of the value source for the store intrinsics we lower. */
static int
value_src(nir_intrinsic_op intrinsic)
{
   switch (intrinsic) {
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_scratch:
   case nir_intrinsic_store_global_2x32:
      return 0;
   default:
      unreachable("Unsupported intrinsic");
   }
}

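/* Index of the offset source for each load/store intrinsic we lower. */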
static int
offset_src(nir_intrinsic_op intrinsic)
{
   switch (intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_scratch:
   case nir_intrinsic_load_global_2x32:
      return 0;
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_scratch:
   case nir_intrinsic_store_global_2x32:
      return 1;
   case nir_intrinsic_store_ssbo:
      return 2;
   default:
      unreachable("Unsupported intrinsic");
   }
}

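/*
 * Creates a scalar copy of 'intr' for a single component: const indices are
 * copied over, the alignment info is adjusted for the component's byte
 * offset, and that byte offset is folded into the base when the intrinsic
 * has one, or otherwise added to the offset source returned through
 * 'scalar_offset'.
 */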
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);

        /* Adjust the alignment offset by the byte offset of the component
         * this scalar intrinsic accesses.
         */
        if (nir_intrinsic_has_align_mul(intr)) {
                assert(nir_intrinsic_has_align_offset(intr));
                unsigned align_mul = nir_intrinsic_align_mul(intr);
                unsigned align_off = nir_intrinsic_align_offset(intr);

                align_off += offset_units * component;
                align_off = align_off % align_mul;

                nir_intrinsic_set_align(new_intr, align_mul, align_off);
        }

        /* Fold the component's byte offset into the base if the intrinsic
         * has one, otherwise add it to the offset source.
         */
        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        if (nir_intrinsic_has_base(intr)) {
                nir_intrinsic_set_base(
                        new_intr, nir_intrinsic_base(intr) + offset_adj);
        } else {
                *scalar_offset =
                        nir_iadd(b, offset,
                                 nir_imm_intN_t(b, offset_adj,
                                                offset->bit_size));
        }

        new_intr->num_components = 1;

        return new_intr;
}

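/*
 * Splits a non-32-bit vector load into one scalar load per component and
 * recombines the scalar results into a vector that replaces all uses of the
 * original destination.
 */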
static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[4] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                /* Global 2x32 expects a vec2 offset, so pad
                                 * the scalar offset with a zero Y component.
                                 */
                                nir_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1,
                             bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_rewrite_uses(&intr->def, new_dst);

        nir_instr_remove(&intr->instr);
        return true;
}

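/*
 * Splits a non-32-bit vector store into one scalar store per component
 * enabled in the write mask.
 */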
static bool
lower_store_bitsize(nir_builder *b,
                    nir_intrinsic_instr *intr)
{
        /* No need to split if it is already scalar */
        int value_idx = value_src(intr->intrinsic);
        int num_comp = nir_intrinsic_src_components(intr, value_idx);
        if (num_comp <= 1)
                return false;

        /* No need to split if it is 32-bit */
        if (nir_src_bit_size(intr->src[value_idx]) == 32)
                return false;

        nir_def *value = intr->src[value_idx].ssa;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);

        /* Split the vector store into multiple scalar stores */
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        while (wrmask) {
                unsigned component = ffs(wrmask) - 1;

                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              value->bit_size, &scalar_offset);

                nir_intrinsic_set_write_mask(new_intr, 0x1);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == value_idx) {
                                nir_def *scalar_value =
                                        nir_channels(b, value, 1 << component);
                                new_intr->src[i] = nir_src_for_ssa(scalar_value);
                        } else if (i == offset_idx) {
                                /* Global 2x32 expects a vec2 offset, so pad
                                 * the scalar offset with a zero Y component.
                                 */
                                nir_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_builder_instr_insert(b, &new_intr->instr);

                wrmask &= ~(1 << component);
        }

        nir_instr_remove(&intr->instr);
        return true;
}

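/*
 * Callback for nir_shader_intrinsics_pass: dispatches the load/store
 * intrinsics we support to the matching lowering helper.
 */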
static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return lower_load_bitsize(b, intr);

        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return lower_store_bitsize(b, intr);

        default:
                return false;
        }
}

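/*
 * Scalarizes all non-32-bit vector load/store intrinsics handled by this
 * file. Returns true if any instruction was lowered.
 */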
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                          nir_metadata_block_index |
                                          nir_metadata_dominance,
                                          NULL);
}