/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU can only do 32-bit general vector access, so for anything else
 * we need to split vector load/store instructions into scalar ones.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of these accesses using 32-bit load/store instructions,
 * which we do support.
 */

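/* Creates a single-component copy of a vector load/store intrinsic for the
 * given component. The per-component byte offset is folded into the base
 * index; the original offset source is passed through unchanged.
 */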
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);
        assert(!nir_intrinsic_has_align_mul(intr));
        assert(nir_intrinsic_has_base(intr));

        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        nir_intrinsic_set_base(new_intr, nir_intrinsic_base(intr) + offset_adj);
        new_intr->num_components = 1;

        return new_intr;
}

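/* Splits a non-32-bit vector load into one scalar load per component and
 * recombines the results into a vector that replaces the original
 * destination. Returns false if no lowering was needed.
 */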
static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        int offset_idx = nir_get_io_offset_src_number(intr);
        assert(offset_idx >= 0);
        nir_def *offset = intr->src[offset_idx].ssa;

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[16] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1, bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_replace(&intr->def, new_dst);
        return true;
}

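/* Uniform loads are not covered by nir_lower_mem_access_bit_sizes below (its
 * modes do not include uniforms), so non-32-bit vector
 * nir_intrinsic_load_uniform is split by hand here.
 */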
static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_uniform:
                return lower_load_bitsize(b, intr);

        default:
                return false;
        }
}

/*
 * The idea here is to lower bit_sizes until we match the alignment of the
 * data, so that we don't have to use atomics. Also, load/stores that we can
 * do with a bit_size of 32 are kept vectorized, up to at most 4 components.
 */
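/*
 * For example, a 16-byte, 16-byte-aligned 32-bit load stays a single vec4
 * 32-bit access, with only 4-byte alignment it becomes scalar 32-bit
 * accesses, and with 2-byte alignment it is lowered to scalar 16-bit
 * accesses.
 */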
static nir_mem_access_size_align
v3d_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
                  uint8_t input_bit_size, uint32_t align,
                  uint32_t align_offset, bool offset_is_const,
                  enum gl_access_qualifier access, const void *cb_data)
{
        /* We only support single component 32-bit load/stores on scratch */
        if (intrin == nir_intrinsic_load_scratch ||
            intrin == nir_intrinsic_store_scratch) {
                return (nir_mem_access_size_align){
                        .num_components = 1,
                        .bit_size = 32,
                        .align = 4,
                        .shift = nir_mem_access_shift_method_scalar,
                };
        }

        align = nir_combined_align(align, align_offset);
        assert(util_is_power_of_two_nonzero(align));

        /* TODO: we could update the bit_size to 32 if possible, but that
         * might cause suboptimal pack/unpack operations.
         */
        unsigned bit_size = MIN2(32, input_bit_size);

        /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
         * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads
         * due to the size.
         */
        if (align == 1)
                bit_size = 8;
        else if (align == 2)
                bit_size = MIN2(bit_size, 16);

        /* But we only support single component loads for anything below 32
         * bit, and only up to 4 components for 32 bit.
         */
        unsigned num_components;
        if (bit_size == 32) {
                num_components = MIN2(bytes / 4, 4);

                /* Now we have to reduce the num_components even further for
                 * unaligned vector load/stores.
                 */
                num_components = MIN2(align / 4, num_components);
        } else {
                num_components = 1;
        }

        return (nir_mem_access_size_align){
                .num_components = num_components,
                .bit_size = bit_size,
                .align = (bit_size / 8) * (num_components == 3 ? 4 : num_components),
                .shift = nir_mem_access_shift_method_scalar,
        };
}

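/* Maps the vec2-addressed _2x32 global intrinsics onto their regular,
 * scalar-addressed counterparts. Any other intrinsic is returned unchanged.
 */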
static nir_intrinsic_op
convert_global_2x32_to_scalar(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_2x32:
                return nir_intrinsic_global_atomic;
        case nir_intrinsic_global_atomic_swap_2x32:
                return nir_intrinsic_global_atomic_swap;
        case nir_intrinsic_load_global_2x32:
                return nir_intrinsic_load_global;
        case nir_intrinsic_store_global_2x32:
                return nir_intrinsic_store_global;
        default:
                return op;
        }
}

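/* Rewrites a _2x32 global access to its scalar-addressed equivalent, keeping
 * only the first component of the vec2 address (V3D only uses 32-bit
 * addresses).
 */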
static bool
lower_global_2x32(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
        nir_intrinsic_op op = convert_global_2x32_to_scalar(intr->intrinsic);
        if (op == intr->intrinsic)
                return false;

        b->cursor = nir_before_instr(&intr->instr);
        nir_src *addr_src = nir_get_io_offset_src(intr);
        nir_src_rewrite(addr_src, nir_channel(b, addr_src->ssa, 0));
        intr->intrinsic = op;

        return true;
}

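/* Entry point: splits non-32-bit vector uniform loads into scalars and then
 * runs nir_lower_mem_access_bit_sizes with v3d_size_align_cb on the remaining
 * memory access modes.
 */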
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        nir_lower_mem_access_bit_sizes_options lower_options = {
                .modes = nir_var_mem_global | nir_var_mem_ssbo |
                         nir_var_mem_ubo | nir_var_mem_constant |
                         nir_var_mem_shared | nir_var_function_temp,
                .callback = v3d_size_align_cb,
        };

        bool res = nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                              nir_metadata_control_flow,
                                              NULL);
        res |= nir_lower_mem_access_bit_sizes(s, &lower_options);
        return res;
}

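/* Entry point: turns every _2x32 global memory intrinsic in the shader into
 * its scalar-addressed form.
 */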
bool
v3d_nir_lower_global_2x32(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_global_2x32,
                                          nir_metadata_control_flow,
                                          NULL);
}