/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU unit can only do 32-bit general vector access, so for anything
 * else we need to split vector load/store instructions into scalar ones.
 *
 * Note that a vectorization pass after this lowering may be able to
 * re-vectorize some of these using 32-bit load/store instructions instead,
 * which we do support.
 */

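/* As a rough illustration (a sketch, not actual NIR output), a two-component
 * 16-bit SSBO load such as:
 *
 *    value (2x16) = load_ssbo buffer, offset
 *
 * is rewritten as one scalar load per component, with the byte offset bumped
 * by bit_size / 8 for each component, and the original destination rebuilt
 * with a vec:
 *
 *    c0 (1x16) = load_ssbo buffer, offset + 0
 *    c1 (1x16) = load_ssbo buffer, offset + 2
 *    value     = vec2 c0, c1
 *
 * Stores are handled analogously, emitting one scalar store per component
 * enabled in the write mask.
 */

/* Returns the index of the value source (the data to be written) for the
 * store intrinsics handled by this pass.
 */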
static int
value_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 0;
        default:
                unreachable("Unsupported intrinsic");
        }
}

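/* Returns the index of the offset source (the address within the resource)
 * for the load/store intrinsics handled by this pass.
 */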
static int
offset_src(nir_intrinsic_op intrinsic)
{
        switch (intrinsic) {
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return 0;
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return 1;
        case nir_intrinsic_store_ssbo:
                return 2;
        default:
                unreachable("Unsupported intrinsic");
        }
}

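/* Creates a scalar (single component) copy of 'intr' for the given component:
 * constant indices are copied, and the alignment and either the base (when
 * the intrinsic has one) or the offset returned through 'scalar_offset' are
 * adjusted by the component's byte offset. The caller is responsible for
 * filling in the sources and, for loads, the destination.
 */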
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);

        if (nir_intrinsic_has_align_mul(intr)) {
                assert(nir_intrinsic_has_align_offset(intr));
                unsigned align_mul = nir_intrinsic_align_mul(intr);
                unsigned align_off = nir_intrinsic_align_offset(intr);

                align_off += offset_units * component;
                align_off = align_off % align_mul;

                nir_intrinsic_set_align(new_intr, align_mul, align_off);
        }

        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        if (nir_intrinsic_has_base(intr)) {
                nir_intrinsic_set_base(
                        new_intr, nir_intrinsic_base(intr) + offset_adj);
        } else {
                *scalar_offset =
                        nir_iadd(b, offset,
                                 nir_imm_intN_t(b, offset_adj,
                                                offset->bit_size));
        }

        new_intr->num_components = 1;

        return new_intr;
}

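/* Splits a non-32-bit vector load into per-component scalar loads and
 * rebuilds the original destination with a vec. Returns true if the load
 * was lowered.
 */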
static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[4] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                nir_def *final_offset;
                                final_offset =
                                        intr->intrinsic != nir_intrinsic_load_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1, bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_rewrite_uses(&intr->def, new_dst);

        nir_instr_remove(&intr->instr);
        return true;
}

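/* Splits a non-32-bit vector store into one scalar store per component
 * enabled in the write mask. Returns true if the store was lowered.
 */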
static bool
lower_store_bitsize(nir_builder *b,
                    nir_intrinsic_instr *intr)
{
        /* No need to split if it is already scalar */
        int value_idx = value_src(intr->intrinsic);
        int num_comp = nir_intrinsic_src_components(intr, value_idx);
        if (num_comp <= 1)
                return false;

        /* No need to split if it is 32-bit */
        if (nir_src_bit_size(intr->src[value_idx]) == 32)
                return false;

        nir_def *value = intr->src[value_idx].ssa;

        b->cursor = nir_before_instr(&intr->instr);

        /* For global 2x32 we ignore the Y component because it must be zero */
        unsigned offset_idx = offset_src(intr->intrinsic);
        nir_def *offset = nir_trim_vector(b, intr->src[offset_idx].ssa, 1);

        /* Split the vector store into multiple scalar stores */
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        unsigned wrmask = nir_intrinsic_write_mask(intr);
        while (wrmask) {
                unsigned component = ffs(wrmask) - 1;

                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              value->bit_size, &scalar_offset);

                nir_intrinsic_set_write_mask(new_intr, 0x1);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == value_idx) {
                                nir_def *scalar_value =
                                        nir_channels(b, value, 1 << component);
                                new_intr->src[i] = nir_src_for_ssa(scalar_value);
                        } else if (i == offset_idx) {
                                nir_def *final_offset;
                                final_offset =
                                        intr->intrinsic != nir_intrinsic_store_global_2x32 ?
                                        scalar_offset :
                                        nir_vec2(b, scalar_offset,
                                                 nir_imm_int(b, 0));
                                new_intr->src[i] = nir_src_for_ssa(final_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_builder_instr_insert(b, &new_intr->instr);

                wrmask &= ~(1 << component);
        }

        nir_instr_remove(&intr->instr);
        return true;
}

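/* Per-intrinsic callback for nir_shader_intrinsics_pass: dispatches the
 * load/store intrinsics handled by this pass to the matching lowering.
 */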
static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_load_global_2x32:
                return lower_load_bitsize(b, intr);

        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_scratch:
        case nir_intrinsic_store_global_2x32:
                return lower_store_bitsize(b, intr);

        default:
                return false;
        }
}

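/* Pass entry point: lowers non-32-bit vector loads and stores to scalar
 * accesses so they can be handled by the TMU general access path.
 */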
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                          nir_metadata_block_index |
                                          nir_metadata_dominance,
                                          NULL);
}