/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU can only do 32-bit general vector access, so for anything
 * else we need to split vector load/store instructions into scalar ones.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of these accesses using the 32-bit load/store
 * instructions that we do support.
 */

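/*
 * As a rough sketch (illustrative NIR, SSA names made up), a 16-bit vec2
 * SSBO load such as:
 *
 *    vec2 16 ssa_2 = intrinsic load_ssbo (ssa_0, ssa_1) (...)
 *
 * becomes two scalar loads whose results are recombined with a vec2:
 *
 *    vec1 16 ssa_3 = intrinsic load_ssbo (ssa_0, ssa_1) (...)
 *    vec1 16 ssa_4 = intrinsic load_ssbo (ssa_0, ssa_1 + 2) (...)
 *    vec2 16 ssa_5 = vec2 ssa_3, ssa_4
 */

/*
 * Returns the index of the source holding the value to write for each
 * store intrinsic handled by this pass.
 */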
static int
value_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 0;
        default:
                unreachable("Unsupported intrinsic");
        }
}

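/*
 * Returns the index of the source holding the offset (or, for the global
 * 2x32 intrinsics, the 2x32-bit address) for each load/store intrinsic
 * handled by this pass.
 */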
static int
offset_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return 0;
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 1;
        case nir_intrinsic_store_ssbo:
                return 2;
        default:
                unreachable("Unsupported intrinsic");
        }
}

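/*
 * Creates a single-component copy of 'intr' for the given vector
 * component. Const indices are copied over, the alignment info is
 * recomputed for the component's byte offset, and the offset is adjusted
 * either through the 'base' index (when the intrinsic has one) or by
 * adding to the offset source, with the result returned in
 * 'scalar_offset'. The caller still has to fill in the sources and, for
 * loads, the destination.
 */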
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_ssa_def *offset,
                      uint32_t bit_size,
                      nir_ssa_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);

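        /* Recompute the alignment info for the scalar access: align_mul is
         * unchanged, but the offset within it advances by this component's
         * distance in bytes from the start of the vector.
         */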
        if (nir_intrinsic_has_align_mul(intr)) {
                assert(nir_intrinsic_has_align_offset(intr));
                unsigned align_mul = nir_intrinsic_align_mul(intr);
                unsigned align_off = nir_intrinsic_align_offset(intr);

                align_off += offset_units * component;
                align_off = align_off % align_mul;

                nir_intrinsic_set_align(new_intr, align_mul, align_off);
        }

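        /* Point the offset at the selected component: fold the byte delta
         * into the constant 'base' index when the intrinsic has one,
         * otherwise emit an explicit iadd on the offset source.
         */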
        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        if (nir_intrinsic_has_base(intr)) {
                nir_intrinsic_set_base(
                        new_intr, nir_intrinsic_base(intr) + offset_adj);
        } else {
                *scalar_offset =
                        nir_iadd(b, offset,
                                 nir_imm_intN_t(b, offset_adj,
                                                offset->bit_size));
        }

        new_intr->num_components = 1;

        return new_intr;
}

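/*
 * Splits a non-32-bit vector load into per-component scalar loads and
 * rewrites all uses of the original destination as a vec of the scalar
 * results. Returns true if the load was lowered.
 */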
static bool
lower_load_bitsize(struct v3d_compile *c,
                   nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = nir_dest_bit_size(intr->dest);
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);

        /* Split the vector load into multiple scalar loads */
        nir_ssa_def *dest_components[4] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_ssa_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                nir_ssa_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_ssa_dest_init(&new_intr->instr, &new_intr->dest,
                                  1, bit_size, NULL);
                dest_components[component] = &new_intr->dest.ssa;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_ssa_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, new_dst);

        nir_instr_remove(&intr->instr);
        return true;
}

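/*
 * Splits a non-32-bit vector store into one scalar store per component
 * enabled in its write mask. Returns true if the store was lowered.
 */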
static bool
lower_store_bitsize(struct v3d_compile *c,
                    nir_builder *b,
                    nir_intrinsic_instr *intr)
{
        /* No need to split if it is already scalar */
        int value_idx = value_src(intr->intrinsic);
        int num_comp = nir_intrinsic_src_components(intr, value_idx);
        if (num_comp <= 1)
                return false;

        /* No need to split if it is 32-bit */
        if (nir_src_bit_size(intr->src[value_idx]) == 32)
                return false;

        /* Set the cursor before pulling SSA defs so anything the builder
         * emits lands before the instruction we are lowering.
         */
        b->cursor = nir_before_instr(&intr->instr);

        nir_ssa_def *value = nir_ssa_for_src(b, intr->src[value_idx], num_comp);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);

        /* Split the vector store into multiple scalar stores */
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        while (wrmask) {
                unsigned component = ffs(wrmask) - 1;

                nir_ssa_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              value->bit_size, &scalar_offset);

                nir_intrinsic_set_write_mask(new_intr, 0x1);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == (unsigned)value_idx) {
                                nir_ssa_def *scalar_value =
                                        nir_channels(b, value, 1 << component);
                                new_intr->src[i] = nir_src_for_ssa(scalar_value);
                        } else if (i == offset_idx) {
                                nir_ssa_def *final_offset;
                                final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_builder_instr_insert(b, &new_intr->instr);

                wrmask &= ~(1 << component);
        }

        nir_instr_remove(&intr->instr);
        return true;
}

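/* Per-instruction callback for nir_shader_instructions_pass: dispatches
 * the load/store intrinsics this pass handles to the lowerings above.
 */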
static bool
lower_load_store_bitsize(nir_builder *b, nir_instr *instr, void *data)
{
        struct v3d_compile *c = (struct v3d_compile *) data;

        if (instr->type != nir_instr_type_intrinsic)
                return false;
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return lower_load_bitsize(c, b, intr);

        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return lower_store_bitsize(c, b, intr);

        default:
                return false;
        }
}

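/*
 * Entry point of the pass: scalarizes the non-32-bit vector loads and
 * stores listed above. Returns true if any instruction was lowered.
 */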
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s, struct v3d_compile *c)
{
        return nir_shader_instructions_pass(s,
                                            lower_load_store_bitsize,
                                            nir_metadata_block_index |
                                            nir_metadata_dominance,
                                            c);
}