/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU unit can only do 32-bit general vector access, so for any
 * other bit size we need to split vector load/store instructions into
 * scalars.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of these using 32-bit load/store instructions instead,
 * which we do support.
 */
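
/*
 * For example, a 16-bit two-component load such as (illustrative NIR,
 * exact syntax depends on the NIR version):
 *
 *    vec2 16 ssa_3 = intrinsic load_ssbo (ssa_1, ssa_2) (...)
 *
 * is rewritten as two scalar 16-bit loads, one at offset ssa_2 and one at
 * ssa_2 + 2 bytes, whose results are then recombined with a vec2 so that
 * all users of the original destination keep working unchanged.
 */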

/* Returns the index of the source that holds the value to be stored. */
static int
value_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 0;
        default:
                unreachable("Unsupported intrinsic");
        }
}

/* Returns the index of the source that holds the buffer offset. */
static int
offset_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return 0;
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 1;
        case nir_intrinsic_store_ssbo:
                return 2;
        default:
                unreachable("Unsupported intrinsic");
        }
}
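
/*
 * Helper that creates a scalar (single-component) copy of a vector
 * load/store intrinsic for the given component, adjusting its alignment
 * info and either its base index or its offset source. For instance
 * (hypothetical values): splitting component 1 out of a 16-bit
 * load_uniform with base 8 yields a scalar intrinsic with base 10, while
 * for a 16-bit load_ssbo, which has no base index, the extra 2 bytes are
 * added to the offset returned in *scalar_offset.
 */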
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_ssa_def *offset,
                      uint32_t bit_size,
                      nir_ssa_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);

        /* Re-derive the alignment of the scalar access from the alignment
         * of the vector access it comes from.
         */
        if (nir_intrinsic_has_align_mul(intr)) {
                assert(nir_intrinsic_has_align_offset(intr));
                unsigned align_mul = nir_intrinsic_align_mul(intr);
                unsigned align_off = nir_intrinsic_align_offset(intr);

                align_off += offset_units * component;
                align_off = align_off % align_mul;

                nir_intrinsic_set_align(new_intr, align_mul, align_off);
        }

        /* Fold the per-component byte offset into the base index when the
         * intrinsic has one; otherwise add it to the offset source.
         */
        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        if (nir_intrinsic_has_base(intr)) {
                nir_intrinsic_set_base(
                        new_intr, nir_intrinsic_base(intr) + offset_adj);
        } else {
                *scalar_offset =
                        nir_iadd(b, offset,
                                 nir_imm_intN_t(b, offset_adj,
                                                offset->bit_size));
        }

        new_intr->num_components = 1;

        return new_intr;
}

/*
 * Splits a non-32-bit vector load into one scalar load per component and
 * rebuilds the original destination with a vec.
 */
static bool
lower_load_bitsize(struct v3d_compile *c,
                   nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = nir_dest_bit_size(intr->dest);
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);

        /* Split the vector load into multiple scalar loads */
        nir_ssa_def *dest_components[4] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_ssa_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                /* load_global_2x32 takes a vec2 address, so
                                 * rebuild it with a zero Y component.
                                 */
                                nir_ssa_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
                                  1, bit_size, NULL);
                dest_components[component] = &new_intr->dest.ssa;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        /* Recombine the scalar results into the original vector destination */
        nir_ssa_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, new_dst);

        nir_instr_remove(&intr->instr);
        return true;
}

/*
 * Splits a non-32-bit vector store into one scalar store per enabled
 * write-mask component.
 */
static bool
lower_store_bitsize(struct v3d_compile *c,
                    nir_builder *b,
                    nir_intrinsic_instr *intr)
{
        /* No need to split if it is already scalar */
        int value_idx = value_src(intr->intrinsic);
        int num_comp = nir_intrinsic_src_components(intr, value_idx);
        if (num_comp <= 1)
                return false;

        /* No need to split if it is 32-bit */
        if (nir_src_bit_size(intr->src[value_idx]) == 32)
                return false;

        nir_ssa_def *value = nir_ssa_for_src(b, intr->src[value_idx], num_comp);

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);

        /* Split the vector store into multiple scalar stores, one per set
         * bit in the write mask.
         */
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        while (wrmask) {
                unsigned component = ffs(wrmask) - 1;

                nir_ssa_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              value->bit_size, &scalar_offset);

                nir_intrinsic_set_write_mask(new_intr, 0x1);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == value_idx) {
                                nir_ssa_def *scalar_value =
                                        nir_channels(b, value, 1 << component);
                                new_intr->src[i] = nir_src_for_ssa(scalar_value);
                        } else if (i == offset_idx) {
                                /* store_global_2x32 takes a vec2 address, so
                                 * rebuild it with a zero Y component.
                                 */
                                nir_ssa_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_builder_instr_insert(b, &new_intr->instr);

                wrmask &= ~(1 << component);
        }

        nir_instr_remove(&intr->instr);
        return true;
}
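
/*
 * For example (hypothetical values): a 16-bit vec3 store_ssbo with a
 * write mask of 0b101 becomes two scalar stores, one for component 0 at
 * the original offset and one for component 2 at offset + 4 bytes;
 * component 1 is never written.
 */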

/*
 * Per-instruction callback: dispatches the intrinsics we need to lower to
 * the load or store variant of the lowering.
 */
static bool
lower_load_store_bitsize(nir_builder *b, nir_instr *instr, void *data)
{
        struct v3d_compile *c = (struct v3d_compile *) data;

        if (instr->type != nir_instr_type_intrinsic)
                return false;
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return lower_load_bitsize(c, b, intr);

        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return lower_store_bitsize(c, b, intr);

        default:
                return false;
        }
}

/*
 * Pass entry point. Returns true if any load/store was lowered.
 */
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s, struct v3d_compile *c)
{
        return nir_shader_instructions_pass(s,
                                            lower_load_store_bitsize,
                                            nir_metadata_block_index |
                                            nir_metadata_dominance,
                                            c);
}
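
/*
 * Example call site (illustrative; the real invocation lives in the v3d
 * compiler's NIR lowering sequence):
 *
 *    bool progress = false;
 *    NIR_PASS(progress, s, v3d_nir_lower_load_store_bitsize, c);
 */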