/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "ir3_nir.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"

/*
 * Lowering for 64b intrinsics generated with OpenCL or with
 * VK_KHR_buffer_device_address. All our intrinsics from a hw
 * standpoint are 32b, so we just need to combine in zero for
 * the upper 32bits and let the other nir passes clean up the mess.
 */
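/*
 * For example (an illustrative sketch, not exact nir_print output), a
 * single-component 64b SSBO load:
 *
 *    64x1 %res = @load_ssbo (%block, %offset)
 *
 * is rewritten into a two-component 32b load whose halves are packed back
 * into a 64b value for the existing uses:
 *
 *    32x2 %tmp = @load_ssbo (%block, %offset)
 *    64x1 %res = pack_64_2x32 %tmp
 */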

static bool
lower_64b_intrinsics_filter(const nir_instr *instr, const void *unused)
{
   (void)unused;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   if (intr->intrinsic == nir_intrinsic_load_deref ||
       intr->intrinsic == nir_intrinsic_store_deref)
      return false;

   if (is_intrinsic_store(intr->intrinsic))
      return nir_src_bit_size(intr->src[0]) == 64;

   /* skip over ssbo atomics, we'll lower them later */
   if (intr->intrinsic == nir_intrinsic_ssbo_atomic ||
       intr->intrinsic == nir_intrinsic_ssbo_atomic_swap ||
       intr->intrinsic == nir_intrinsic_global_atomic_ir3 ||
       intr->intrinsic == nir_intrinsic_global_atomic_swap_ir3)
      return false;

   if (nir_intrinsic_dest_components(intr) == 0)
      return false;

   return intr->def.bit_size == 64;
}

static nir_def *
lower_64b_intrinsics(nir_builder *b, nir_instr *instr, void *unused)
{
   (void)unused;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   /* We could be *slightly* more clever and, for ex, turn a 64b vec4
    * load into two 32b vec4 loads, rather than 4 32b vec2 loads.
    */
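   /* I.e. (illustrative) a 64b vec4 load currently becomes four 32x2 loads
    * at byte offsets off+0, off+8, off+16, and off+24, each packed back into
    * one 64b channel, rather than two 32x4 loads covering the same 32 bytes.
    */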

   if (is_intrinsic_store(intr->intrinsic)) {
      unsigned offset_src_idx;
      switch (intr->intrinsic) {
      case nir_intrinsic_store_ssbo:
      case nir_intrinsic_store_global_ir3:
      case nir_intrinsic_store_per_view_output:
         offset_src_idx = 2;
         break;
      default:
         offset_src_idx = 1;
      }

      unsigned num_comp = nir_intrinsic_src_components(intr, 0);
      unsigned wrmask = nir_intrinsic_has_write_mask(intr) ?
         nir_intrinsic_write_mask(intr) : BITSET_MASK(num_comp);
      nir_def *val = intr->src[0].ssa;
      nir_def *off = intr->src[offset_src_idx].ssa;

      for (unsigned i = 0; i < num_comp; i++) {
         if (!(wrmask & BITFIELD_BIT(i)))
            continue;

         nir_def *c64 = nir_channel(b, val, i);
         nir_def *c32 = nir_unpack_64_2x32(b, c64);

         nir_intrinsic_instr *store =
            nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
         store->num_components = 2;
         store->src[0] = nir_src_for_ssa(c32);
         store->src[offset_src_idx] = nir_src_for_ssa(off);

         if (nir_intrinsic_has_write_mask(intr))
            nir_intrinsic_set_write_mask(store, 0x3);
         nir_builder_instr_insert(b, &store->instr);

         off = nir_iadd_imm(b, off, 8);
      }

      return NIR_LOWER_INSTR_PROGRESS_REPLACE;
   }

   unsigned num_comp = nir_intrinsic_dest_components(intr);

   nir_def *def = &intr->def;
   def->bit_size = 32;

   /* load_kernel_input is handled specially, lowering to two 32b inputs:
    */
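   /* I.e. (illustrative) the original load becomes the lower 32b half,
    * loaded at the original byte offset, and a second 32b load_kernel_input
    * at offset + 4 supplies the upper half; the two are recombined with
    * pack_64_2x32_split.
    */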
   if (intr->intrinsic == nir_intrinsic_load_kernel_input) {
      assert(num_comp == 1);

      nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, 4);

      nir_def *upper = nir_load_kernel_input(b, 1, 32, offset);

      return nir_pack_64_2x32_split(b, def, upper);
   }

   nir_def *components[num_comp];

   if (is_intrinsic_load(intr->intrinsic)) {
      unsigned offset_src_idx;
      switch (intr->intrinsic) {
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_global_ir3:
      case nir_intrinsic_load_per_view_output:
         offset_src_idx = 1;
         break;
      default:
         offset_src_idx = 0;
      }

      nir_def *off = intr->src[offset_src_idx].ssa;

      for (unsigned i = 0; i < num_comp; i++) {
         nir_intrinsic_instr *load =
            nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
         load->num_components = 2;
         load->src[offset_src_idx] = nir_src_for_ssa(off);

         nir_def_init(&load->instr, &load->def, 2, 32);
         nir_builder_instr_insert(b, &load->instr);

         components[i] = nir_pack_64_2x32(b, &load->def);

         off = nir_iadd_imm(b, off, 8);
      }
   } else {
      /* The remaining (non load/store) intrinsics just get zero-
       * extended from 32b to 64b:
       */
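      /* I.e. (illustrative) each 32b result channel c becomes
       * pack_64_2x32_split(c, 0), producing a 64b value whose upper half
       * is zero.
       */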
      for (unsigned i = 0; i < num_comp; i++) {
         nir_def *c = nir_channel(b, def, i);
         components[i] = nir_pack_64_2x32_split(b, c, nir_imm_zero(b, 1, 32));
      }
   }

   return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components);
}

bool
ir3_nir_lower_64b_intrinsics(nir_shader *shader)
{
   return nir_shader_lower_instructions(
         shader, lower_64b_intrinsics_filter,
         lower_64b_intrinsics, NULL);
}

/*
 * Lowering for 64b undef instructions, splitting into two 32b undefs
 */
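/*
 * I.e. (illustrative) each 64b undef channel is rebuilt as
 * pack_64_2x32_split(undef32, undef32), so later passes only ever see
 * 32b undefs.
 */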

static nir_def *
lower_64b_undef(nir_builder *b, nir_instr *instr, void *unused)
{
   (void)unused;

   nir_undef_instr *undef = nir_instr_as_undef(instr);
   unsigned num_comp = undef->def.num_components;
   nir_def *components[num_comp];

   for (unsigned i = 0; i < num_comp; i++) {
      nir_def *lowered = nir_undef(b, 2, 32);

      components[i] = nir_pack_64_2x32_split(b,
                                             nir_channel(b, lowered, 0),
                                             nir_channel(b, lowered, 1));
   }

   return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components);
}

static bool
lower_64b_undef_filter(const nir_instr *instr, const void *unused)
{
   (void)unused;

   return instr->type == nir_instr_type_undef &&
          nir_instr_as_undef(instr)->def.bit_size == 64;
}

bool
ir3_nir_lower_64b_undef(nir_shader *shader)
{
   return nir_shader_lower_instructions(
         shader, lower_64b_undef_filter,
         lower_64b_undef, NULL);
}

/*
 * Lowering for load_global/store_global with 64b addresses to ir3
 * variants, which instead take a uvec2_32
 */
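/*
 * For example (an illustrative sketch, not exact nir_print output), a
 * global load with a 64b address:
 *
 *    32x4 %val = @load_global (%addr64)
 *
 * becomes the _ir3 variant taking the address as a 32b vec2:
 *
 *    32x2 %addr = unpack_64_2x32 %addr64
 *    32x4 %val  = @load_global_ir3 (%addr, 0)
 */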

static bool
lower_64b_global_filter(const nir_instr *instr, const void *unused)
{
   (void)unused;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   switch (intr->intrinsic) {
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_store_global:
   case nir_intrinsic_global_atomic:
   case nir_intrinsic_global_atomic_swap:
      return true;
   default:
      return false;
   }
}

static nir_def *
lower_64b_global(nir_builder *b, nir_instr *instr, void *unused)
{
   (void)unused;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   bool load = intr->intrinsic != nir_intrinsic_store_global;

   nir_def *addr64 = intr->src[load ? 0 : 1].ssa;
   nir_def *addr = nir_unpack_64_2x32(b, addr64);

   /*
    * Note that we can get vec8/vec16 with OpenCL.. we need to split
    * those up into max 4 components per load/store.
    */
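   /* I.e. (illustrative) a vec8 load is emitted as two vec4 load_global_ir3
    * and a vec16 as four, with the result channels (or the source value, for
    * stores) re-assembled/split per group of 4 components.
    */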

   if (intr->intrinsic == nir_intrinsic_global_atomic) {
      return nir_global_atomic_ir3(
            b, intr->def.bit_size, addr,
            intr->src[1].ssa,
            .atomic_op = nir_intrinsic_atomic_op(intr));
   } else if (intr->intrinsic == nir_intrinsic_global_atomic_swap) {
      return nir_global_atomic_swap_ir3(
            b, intr->def.bit_size, addr,
            intr->src[1].ssa, intr->src[2].ssa,
            .atomic_op = nir_intrinsic_atomic_op(intr));
   }

   if (load) {
      unsigned num_comp = nir_intrinsic_dest_components(intr);
      nir_def *components[num_comp];
      for (unsigned off = 0; off < num_comp;) {
         unsigned c = MIN2(num_comp - off, 4);
         nir_def *val = nir_load_global_ir3(
               b, c, intr->def.bit_size,
               addr, nir_imm_int(b, off));
         for (unsigned i = 0; i < c; i++) {
            components[off++] = nir_channel(b, val, i);
         }
      }
      return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components);
   } else {
      unsigned num_comp = nir_intrinsic_src_components(intr, 0);
      nir_def *value = intr->src[0].ssa;
      for (unsigned off = 0; off < num_comp; off += 4) {
         unsigned c = MIN2(num_comp - off, 4);
         nir_def *v = nir_channels(b, value, BITFIELD_MASK(c) << off);
         nir_store_global_ir3(b, v, addr, nir_imm_int(b, off));
      }
      return NIR_LOWER_INSTR_PROGRESS_REPLACE;
   }
}

bool
ir3_nir_lower_64b_global(nir_shader *shader)
{
   return nir_shader_lower_instructions(
         shader, lower_64b_global_filter,
         lower_64b_global, NULL);
}

/*
 * Lowering for 64b registers:
 * - @decl_reg -> split in two 32b ones
 * - @store_reg -> unpack_64_2x32_split_x/y and two separate stores
 * - @load_reg -> two separate loads and pack_64_2x32_split
 */
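/*
 * For example (an illustrative sketch), a 64b register:
 *
 *    64x1 %reg = @decl_reg
 *
 * is replaced by two 32b registers holding the low and high halves; each
 * @store_reg of a 64b value becomes two 32b stores of its unpacked halves,
 * and each @load_reg becomes two 32b loads recombined with
 * pack_64_2x32_split.
 */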

static void
lower_64b_reg(nir_builder *b, nir_intrinsic_instr *reg)
{
   unsigned num_components = nir_intrinsic_num_components(reg);
   unsigned num_array_elems = nir_intrinsic_num_array_elems(reg);

   nir_def *reg_hi = nir_decl_reg(b, num_components, 32, num_array_elems);
   nir_def *reg_lo = nir_decl_reg(b, num_components, 32, num_array_elems);

   nir_foreach_reg_store_safe (store_reg_src, reg) {
      nir_intrinsic_instr *store =
         nir_instr_as_intrinsic(nir_src_parent_instr(store_reg_src));
      b->cursor = nir_before_instr(&store->instr);

      nir_def *packed = store->src[0].ssa;
      nir_def *unpacked_lo = nir_unpack_64_2x32_split_x(b, packed);
      nir_def *unpacked_hi = nir_unpack_64_2x32_split_y(b, packed);
      int base = nir_intrinsic_base(store);

      if (store->intrinsic == nir_intrinsic_store_reg) {
         nir_build_store_reg(b, unpacked_lo, reg_lo, .base = base);
         nir_build_store_reg(b, unpacked_hi, reg_hi, .base = base);
      } else {
         assert(store->intrinsic == nir_intrinsic_store_reg_indirect);

         nir_def *offset = store->src[2].ssa;
         nir_store_reg_indirect(b, unpacked_lo, reg_lo, offset, .base = base);
         nir_store_reg_indirect(b, unpacked_hi, reg_hi, offset, .base = base);
      }

      nir_instr_remove(&store->instr);
   }

   nir_foreach_reg_load_safe (load_reg_src, reg) {
      nir_intrinsic_instr *load =
         nir_instr_as_intrinsic(nir_src_parent_instr(load_reg_src));
      b->cursor = nir_before_instr(&load->instr);

      int base = nir_intrinsic_base(load);
      nir_def *load_lo, *load_hi;

      if (load->intrinsic == nir_intrinsic_load_reg) {
         load_lo =
            nir_build_load_reg(b, num_components, 32, reg_lo, .base = base);
         load_hi =
            nir_build_load_reg(b, num_components, 32, reg_hi, .base = base);
      } else {
         assert(load->intrinsic == nir_intrinsic_load_reg_indirect);

         nir_def *offset = load->src[1].ssa;
         load_lo = nir_load_reg_indirect(b, num_components, 32, reg_lo, offset,
                                         .base = base);
         load_hi = nir_load_reg_indirect(b, num_components, 32, reg_hi, offset,
                                         .base = base);
      }

      nir_def *packed = nir_pack_64_2x32_split(b, load_lo, load_hi);
      nir_def_rewrite_uses(&load->def, packed);
      nir_instr_remove(&load->instr);
   }

   nir_instr_remove(&reg->instr);
}

bool
ir3_nir_lower_64b_regs(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function_impl (impl, shader) {
      bool impl_progress = false;
      nir_builder b = nir_builder_create(impl);

      nir_foreach_reg_decl_safe (reg, impl) {
         if (nir_intrinsic_bit_size(reg) == 64) {
            lower_64b_reg(&b, reg);
            impl_progress = true;
         }
      }

      if (impl_progress) {
         nir_metadata_preserve(
               impl, nir_metadata_control_flow);
         progress = true;
      }
   }

   return progress;
}