1 /* -*- mesa-c++ -*-
2 *
3 * Copyright (c) 2020 Collabora LTD
4 *
5 * Author: Gert Wollny <gert.wollny@collabora.com>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "sfn_nir.h"
28
29 #include "nir.h"
30 #include "nir_builder.h"
31
32 #include <map>
33 #include <vector>
34 #include <iostream>
35
36 namespace r600 {
37
38 using std::map;
39 using std::pair;
40 using std::make_pair;
41 using std::vector;
42
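/* Split vec3/vec4 64-bit variables, loads, stores, selects and reductions
 * into pieces that touch at most two 64-bit components per instruction. */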
43 class LowerSplit64BitVar : public NirLowerInstruction {
44 public:
45
46 ~LowerSplit64BitVar();
47 using VarSplit = pair<nir_variable*, nir_variable*>;
48 using VarMap = map<unsigned, VarSplit>;
49
50 nir_ssa_def *
51 split_double_load_deref(nir_intrinsic_instr *intr);
52
53 nir_ssa_def *
54 split_double_store_deref(nir_intrinsic_instr *intr);
55
56 private:
57 nir_ssa_def *
58 split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);
59
60 nir_ssa_def *
61 split_load_deref_var(nir_intrinsic_instr *intr);
62
63 nir_ssa_def *
64 split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);
65
66 nir_ssa_def *
67 split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);
68
69 VarSplit get_var_pair(nir_variable *old_var);
70
71 nir_ssa_def *
72 merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);
73
74 nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);
75
76 nir_ssa_def *
77 split_store_output(nir_intrinsic_instr *store1);
78
79 nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);
80
81 nir_ssa_def *
82 split_double_load_ssbo(nir_intrinsic_instr *intr);
83
84 nir_ssa_def *
85 split_double_load_ubo(nir_intrinsic_instr *intr);
86
87 nir_ssa_def *
88 split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);
89
90 nir_ssa_def *
91 split_reduction3(nir_alu_instr *alu,
92 nir_op op1, nir_op op2, nir_op reduction);
93
94 nir_ssa_def *
95 split_reduction4(nir_alu_instr *alu,
96 nir_op op1, nir_op op2, nir_op reduction);
97
98 nir_ssa_def *split_bcsel(nir_alu_instr *alu);
99
100 nir_ssa_def *split_load_const(nir_load_const_instr *lc);
101
102 bool filter(const nir_instr *instr) const override;
103 nir_ssa_def *lower(nir_instr *instr) override;
104
105 VarMap m_varmap;
106 vector<nir_variable*> m_old_vars;
107 vector<nir_instr *> m_old_stores;
108 };
109
110
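/* Turn 64-bit uniform/UBO loads into 32-bit loads with twice the number of
 * components and repack each pair of 32-bit channels into a 64-bit value. */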
111 class LowerLoad64Uniform : public NirLowerInstruction {
112 bool filter(const nir_instr *instr) const override;
113 nir_ssa_def *lower(nir_instr *instr) override;
114 };
115
116 bool LowerLoad64Uniform::filter(const nir_instr *instr) const
117 {
118 if (instr->type != nir_instr_type_intrinsic)
119 return false;
120
121 auto intr = nir_instr_as_intrinsic(instr);
122 if (intr->intrinsic != nir_intrinsic_load_uniform &&
123 intr->intrinsic != nir_intrinsic_load_ubo &&
124 intr->intrinsic != nir_intrinsic_load_ubo_vec4)
125 return false;
126
127 return nir_dest_bit_size(intr->dest) == 64;
128 }
129
130
131 nir_ssa_def *LowerLoad64Uniform::lower(nir_instr *instr)
132 {
133 auto intr = nir_instr_as_intrinsic(instr);
134 int old_components = nir_dest_num_components(intr->dest);
135 assert(old_components <= 2);
136 assert(intr->dest.is_ssa);
137 intr->dest.ssa.num_components *= 2;
138 intr->dest.ssa.bit_size = 32;
139 intr->num_components *= 2;
140
141 if (intr->intrinsic == nir_intrinsic_load_ubo ||
142 intr->intrinsic == nir_intrinsic_load_ubo_vec4)
143 nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));
144
145 nir_ssa_def *result_vec[2] = {nullptr, nullptr};
146
147 for (int i = 0; i < old_components; ++i) {
148 result_vec[i] = nir_pack_64_2x32_split(b,
149 nir_channel(b, &intr->dest.ssa, 2 * i),
150 nir_channel(b, &intr->dest.ssa, 2 * i + 1));
151 }
152 if (old_components == 1)
153 return result_vec[0];
154
155 return nir_vec2(b, result_vec[0], result_vec[1]);
156 }
157
158 bool r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
159 {
160 return LowerLoad64Uniform().run(sh);
161 }
162
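/* Lower 64-bit bcsel, conversions between double and 32/64-bit integers,
 * and 64-bit phis into sequences of 32-bit friendly operations. */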
163 class LowerSplit64op : public NirLowerInstruction {
164 bool filter(const nir_instr *instr) const override {
165 switch (instr->type) {
166 case nir_instr_type_alu: {
167 auto alu = nir_instr_as_alu(instr);
168 switch (alu->op) {
169 case nir_op_bcsel:
170 return nir_dest_bit_size(alu->dest.dest) == 64;
171 case nir_op_f2b1:
172 case nir_op_f2i32:
173 case nir_op_f2u32:
174 case nir_op_f2i64:
175 case nir_op_f2u64:
176 case nir_op_u2f64:
177 case nir_op_i2f64:
178 return nir_src_bit_size(alu->src[0].src) == 64;
179 default:
180 return false;
181 }
182 }
183 case nir_instr_type_phi: {
184 auto phi = nir_instr_as_phi(instr);
185 return nir_dest_num_components(phi->dest) == 64;
186 }
187 default:
188 return false;
189 }
190 }
191
192 nir_ssa_def *lower(nir_instr *instr) override {
193
194 switch (instr->type) {
195 case nir_instr_type_alu: {
196 auto alu = nir_instr_as_alu(instr);
197 switch (alu->op) {
198
199 case nir_op_bcsel: {
200 auto lo = nir_bcsel(b, nir_ssa_for_src(b, alu->src[0].src, 1),
201 nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
202 nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
203 auto hi = nir_bcsel(b, nir_ssa_for_src(b, alu->src[0].src, 1),
204 nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
205 nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
206 return nir_pack_64_2x32_split(b, lo, hi);
207 }
208 case nir_op_f2b1: {
209 auto mask = nir_component_mask(nir_dest_num_components(alu->dest.dest));
210 return nir_fneu(b, nir_channels(b, nir_ssa_for_alu_src(b, alu, 0), mask),
211 nir_imm_zero(b, nir_dest_num_components(alu->dest.dest), 64));
212 }
213 case nir_op_f2i32: {
214 auto src = nir_ssa_for_alu_src(b, alu, 0);
215 auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
216 auto abs_src = nir_fabs(b, src);
217 auto value = nir_f2u32(b, abs_src);
218 return nir_bcsel(b, gt0, value, nir_ineg(b, value));
219 }
220 case nir_op_f2u32: {
221 /* fp32 doesn't hold sufficient bits to represent the full range of
222 * u32, therefore we have to split the value, and because f2f32
223 * rounds, we have to remove the fractional part in the hi bits.
224 * For values > UINT_MAX the result is undefined. */
225 auto src = nir_ssa_for_alu_src(b, alu, 0);
226 auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
227 auto highval = nir_fmul_imm(b, src, 1.0/65536.0);
228 auto fract = nir_ffract(b, highval);
229 auto high = nir_f2u32(b, nir_f2f32(b, nir_fsub(b, highval, fract)));
230 auto lowval = nir_fmul_imm(b, fract, 65536.0);
231 auto low = nir_f2u32(b, nir_f2f32(b, lowval));
232 return nir_bcsel(b, gt0, nir_ior(b, nir_ishl_imm(b, high, 16), low),
233 nir_imm_int(b, 0));
234 }
235 case nir_op_f2i64: {
236 auto src = nir_ssa_for_alu_src(b, alu, 0);
237 auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
238 auto abs_src = nir_fabs(b, src);
239 auto value = nir_f2u64(b, abs_src);
240 return nir_bcsel(b, gt0, value, nir_isub(b, nir_imm_zero(b, 1, 64), value));
241 }
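/* Like the f2u32 lowering above, but split at 2^32 so each half fits a
 * u32; the two halves are then packed into the 64-bit result. */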
242 case nir_op_f2u64: {
243 auto src = nir_ssa_for_alu_src(b, alu, 0);
244 auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
245 auto highval = nir_fmul_imm(b, src, 1.0/(65536.0 * 65536.0));
246 auto fract = nir_ffract(b, highval);
247 auto high = nir_f2u32(b, nir_fsub(b, highval, fract));
248 auto low = nir_f2u32(b, nir_fmul_imm(b, fract, 65536.0 * 65536.0));
249 return nir_bcsel(b, gt0, nir_pack_64_2x32_split(b, low, high),
250 nir_imm_zero(b, 1, 64));
251 }
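/* 64-bit integer to double: convert the two 32-bit halves separately and
 * recombine them as high * 2^32 + low. */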
252 case nir_op_u2f64: {
253 auto src = nir_ssa_for_alu_src(b, alu, 0);
254 auto low = nir_unpack_64_2x32_split_x(b, src);
255 auto high = nir_unpack_64_2x32_split_y(b, src);
256 auto flow = nir_u2f64(b, low);
257 auto fhigh = nir_u2f64(b, high);
258 return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
259 }
260 case nir_op_i2f64: {
261 auto src = nir_ssa_for_alu_src(b, alu, 0);
262 auto low = nir_unpack_64_2x32_split_x(b, src);
263 auto high = nir_unpack_64_2x32_split_y(b, src);
264 auto flow = nir_u2f64(b, low);
265 auto fhigh = nir_i2f64(b, high);
266 return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
267 }
268 default:
269 unreachable("trying to lower instruction that was not in filter");
270 }
271 }
272 case nir_instr_type_phi: {
273 auto phi = nir_instr_as_phi(instr);
274 auto phi_lo = nir_phi_instr_create(b->shader);
275 auto phi_hi = nir_phi_instr_create(b->shader);
276 nir_ssa_dest_init(&phi_lo->instr, &phi_lo->dest, phi->dest.ssa.num_components * 2, 32, "");
277 nir_ssa_dest_init(&phi_hi->instr, &phi_hi->dest, phi->dest.ssa.num_components * 2, 32, "");
278 nir_foreach_phi_src(s, phi) {
279 auto lo = nir_unpack_64_2x32_split_x(b, nir_ssa_for_src(b, s->src, 1));
280 auto hi = nir_unpack_64_2x32_split_y(b, nir_ssa_for_src(b, s->src, 1));
281 nir_phi_instr_add_src(phi_lo, s->pred, nir_src_for_ssa(lo));
282 nir_phi_instr_add_src(phi_hi, s->pred, nir_src_for_ssa(hi));
283 }
284 return nir_pack_64_2x32_split(b, &phi_lo->dest.ssa, &phi_hi->dest.ssa);
285 }
286 default:
287 unreachable("Trying to lower instruction that was not in filter");
288 }
289 }
290 };
291
292 bool r600_split_64bit_alu_and_phi(nir_shader *sh)
293 {
294 return LowerSplit64op().run(sh);
295 }
296
297
298 bool
299 LowerSplit64BitVar::filter(const nir_instr *instr) const
300 {
301 switch (instr->type) {
302 case nir_instr_type_intrinsic: {
303 auto intr = nir_instr_as_intrinsic(instr);
304
305 switch (intr->intrinsic) {
306 case nir_intrinsic_load_deref:
307 case nir_intrinsic_load_uniform:
308 case nir_intrinsic_load_input:
309 case nir_intrinsic_load_ubo:
310 case nir_intrinsic_load_ssbo:
311 if (nir_dest_bit_size(intr->dest) != 64)
312 return false;
313 return nir_dest_num_components(intr->dest) >= 3;
314 case nir_intrinsic_store_output:
315 if (nir_src_bit_size(intr->src[0]) != 64)
316 return false;
317 return nir_src_num_components(intr->src[0]) >= 3;
318 case nir_intrinsic_store_deref:
319 if (nir_src_bit_size(intr->src[1]) != 64)
320 return false;
321 return nir_src_num_components(intr->src[1]) >= 3;
322 default:
323 return false;
324 }
325 }
326 case nir_instr_type_alu: {
327 auto alu = nir_instr_as_alu(instr);
328 switch (alu->op) {
329 case nir_op_bcsel:
330 if (nir_dest_num_components(alu->dest.dest) < 3)
331 return false;
332 return nir_dest_bit_size(alu->dest.dest) == 64;
333 case nir_op_bany_fnequal3:
334 case nir_op_bany_fnequal4:
335 case nir_op_ball_fequal3:
336 case nir_op_ball_fequal4:
337 case nir_op_bany_inequal3:
338 case nir_op_bany_inequal4:
339 case nir_op_ball_iequal3:
340 case nir_op_ball_iequal4:
341 case nir_op_fdot3:
342 case nir_op_fdot4:
343 return nir_src_bit_size(alu->src[1].src) == 64;
344 default:
345 return false;
346 }
347 }
348 case nir_instr_type_load_const: {
349 auto lc = nir_instr_as_load_const(instr);
350 if (lc->def.bit_size != 64)
351 return false;
352 return lc->def.num_components >= 3;
353 }
354 default:
355 return false;
356 }
357 }
358
359 nir_ssa_def *
360 LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
361 nir_ssa_def *load2, bool out_is_vec3)
362 {
363 if (out_is_vec3)
364 return nir_vec3(b, nir_channel(b, load1, 0),
365 nir_channel(b, load1, 1),
366 nir_channel(b, load2, 0));
367 else
368 return nir_vec4(b, nir_channel(b, load1, 0),
369 nir_channel(b, load1, 1),
370 nir_channel(b, load2, 0),
371 nir_channel(b, load2, 1));
372 }
373
374 LowerSplit64BitVar::~LowerSplit64BitVar()
375 {
376 for(auto&& v: m_old_vars)
377 exec_node_remove(&v->node);
378
379 for(auto&& v: m_old_stores)
380 nir_instr_remove(v);
381 }
382
383 nir_ssa_def *
384 LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
385 {
386 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
387 if (deref->deref_type == nir_deref_type_var)
388 return split_store_deref_var(intr, deref);
389 else if (deref->deref_type == nir_deref_type_array)
390 return split_store_deref_array(intr, deref);
391 else {
392 unreachable("only splitting of stores to vars and arrays is supported");
393 }
394 }
395
396 nir_ssa_def *
397 LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
398 {
399 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
400 if (deref->deref_type == nir_deref_type_var)
401 return split_load_deref_var(intr);
402 else if (deref->deref_type == nir_deref_type_array)
403 return split_load_deref_array(intr, deref->arr.index);
404 else {
405 unreachable("only splitting of loads from vars and arrays is supported");
406 }
407 m_old_stores.push_back(&intr->instr);
408 }
409
410 nir_ssa_def *
411 LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
412 {
413 auto old_var = nir_intrinsic_get_var(intr, 0);
414 unsigned old_components = old_var->type->without_array()->components();
415
416 assert(old_components > 2 && old_components <= 4);
417
418 auto vars = get_var_pair(old_var);
419
420 auto deref1 = nir_build_deref_var(b, vars.first);
421 auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
422 auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);
423
424 auto deref2 = nir_build_deref_var(b, vars.second);
425 auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));
426
427 auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);
428
429 return merge_64bit_loads(load1, load2, old_components == 3);
430 }
431
432 nir_ssa_def *
433 LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
434 {
435 auto old_var = nir_intrinsic_get_var(intr, 0);
436 unsigned old_components = old_var->type->without_array()->components();
437
438 assert(old_components > 2 && old_components <= 4);
439
440 auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
441
442 auto vars = get_var_pair(old_var);
443
444 auto deref1 = nir_build_deref_var(b, vars.first);
445 auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));
446
447 nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);
448
449 auto deref2 = nir_build_deref_var(b, vars.second);
450 auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));
451
452 if (old_components == 3)
453 nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
454 else
455 nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
456
457 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
458 }
459
460 nir_ssa_def *
461 LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, UNUSED nir_deref_instr *deref)
462 {
463 auto old_var = nir_intrinsic_get_var(intr, 0);
464 unsigned old_components = old_var->type->without_array()->components();
465
466 assert(old_components > 2 && old_components <= 4);
467
468 auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
469
470 auto vars = get_var_pair(old_var);
471
472 auto deref1 = nir_build_deref_var(b, vars.first);
473 nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);
474
475 auto deref2 = nir_build_deref_var(b, vars.second);
476 if (old_components == 3)
477 nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
478 else
479 nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
480
481 return NIR_LOWER_INSTR_PROGRESS_REPLACE;
482 }
483
484 nir_ssa_def *
485 LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
486 {
487 auto old_var = nir_intrinsic_get_var(intr, 0);
488 auto vars = get_var_pair(old_var);
489 unsigned old_components = old_var->type->components();
490
491 nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
492 auto *load1 = nir_load_deref(b, deref1);
493
494 nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
495 deref2->type = vars.second->type;
496
497 auto *load2 = nir_load_deref(b, deref2);
498
499 return merge_64bit_loads(load1, load2, old_components == 3);
500 }
501
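/* Return (creating it on first use) the variable pair that replaces a
 * vec3/vec4 64-bit variable: a dvec2 plus a one- or two-component
 * remainder placed at the next location. */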
502 LowerSplit64BitVar::VarSplit
503 LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
504 {
505 auto split_vars = m_varmap.find(old_var->data.driver_location);
506
507 assert(old_var->type->without_array()->components() > 2);
508
509 if (split_vars == m_varmap.end()) {
510 auto var1 = nir_variable_clone(old_var, b->shader);
511 auto var2 = nir_variable_clone(old_var, b->shader);
512
513 var1->type = glsl_dvec_type(2);
514 var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);
515
516 if (old_var->type->is_array()) {
517 var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
518 var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
519 }
520
521 if (old_var->data.mode == nir_var_shader_in ||
522 old_var->data.mode == nir_var_shader_out) {
523 ++var2->data.driver_location;
524 ++var2->data.location;
525 nir_shader_add_variable(b->shader, var1);
526 nir_shader_add_variable(b->shader, var2);
527 } else if (old_var->data.mode == nir_var_function_temp) {
528 exec_list_push_tail(&b->impl->locals, &var1->node);
529 exec_list_push_tail(&b->impl->locals, &var2->node);
530 }
531
532 m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
533 }
534 return m_varmap[old_var->data.driver_location];
535 }
536
537
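/* Split a wide 64-bit load_input into two loads of consecutive IO slots
 * and merge the results back into a vec3/vec4. */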
538 nir_ssa_def *
539 LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
540 {
541 unsigned old_components = nir_dest_num_components(load1->dest);
542 auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
543 nir_io_semantics sem = nir_intrinsic_io_semantics(load1);
544
545 load1->dest.ssa.num_components = 2;
546 sem.num_slots = 1;
547 nir_intrinsic_set_io_semantics(load1, sem);
548
549 load2->dest.ssa.num_components = old_components - 2;
550 sem.location += 1;
551 nir_intrinsic_set_io_semantics(load2, sem);
552 nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
553 nir_builder_instr_insert(b, &load2->instr);
554
555 return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
556 }
557
558
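/* Split a wide 64-bit store_output into two stores targeting consecutive
 * IO slots, with write masks for the xy pair and the z/zw remainder. */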
559 nir_ssa_def *
560 LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
561 {
562 auto src = store1->src[0];
563 unsigned old_components = nir_src_num_components(src);
564 nir_io_semantics sem = nir_intrinsic_io_semantics(store1);
565
566 auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
567 auto src1 = nir_channels(b, src.ssa, 3);
568 auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);
569
570 nir_instr_rewrite_src(&store1->instr, &store1->src[0], nir_src_for_ssa(src1));
571 nir_intrinsic_set_write_mask(store1, 3);
572
573 nir_instr_rewrite_src(&store2->instr, &store2->src[0], nir_src_for_ssa(src2));
574 nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);
575
576 sem.num_slots = 1;
577 nir_intrinsic_set_io_semantics(store1, sem);
578
579 sem.location += 1;
580 nir_intrinsic_set_io_semantics(store2, sem);
581 nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));
582
583 nir_builder_instr_insert(b, &store2->instr);
584 return NIR_LOWER_INSTR_PROGRESS;
585 }
586
587
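/* Split a wide 64-bit uniform load: the first two components stay in the
 * original load, the rest comes from a second load one slot further on. */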
588 nir_ssa_def *
589 LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
590 {
591 unsigned second_components = nir_dest_num_components(intr->dest) - 2;
592 nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
593 load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
594 nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
595 nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
596 nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
597 load2->num_components = second_components;
598
599 nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
600 nir_builder_instr_insert(b, &load2->instr);
601
602 intr->dest.ssa.num_components = intr->num_components = 2;
603
604 if (second_components == 1)
605 return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
606 nir_channel(b, &intr->dest.ssa, 1),
607 nir_channel(b, &load2->dest.ssa, 0));
608 else
609 return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
610 nir_channel(b, &intr->dest.ssa, 1),
611 nir_channel(b, &load2->dest.ssa, 0),
612 nir_channel(b, &load2->dest.ssa, 1));
613 }
614
615 nir_ssa_def *
616 LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
617 {
618 unsigned second_components = nir_dest_num_components(intr->dest) - 2;
619 nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
620
621 auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
622 nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
623 load2->num_components = second_components;
624 nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
625
626 nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
627 nir_builder_instr_insert(b, &load2->instr);
628
629 intr->dest.ssa.num_components = intr->num_components = 2;
630
631 return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
632 }
633
634
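/* Split a wide 64-bit UBO load by cloning it with the byte offset (and
 * alignment/range bookkeeping) advanced by 16, i.e. one vec4 slot. */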
635 nir_ssa_def *
636 LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
637 {
638 unsigned second_components = nir_dest_num_components(intr->dest) - 2;
639 nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
640 load2->src[0] = intr->src[0];
641 load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
642 nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
643 nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
644 nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
645 nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
646 nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);
647
648 load2->num_components = second_components;
649
650 nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
651 nir_builder_instr_insert(b, &load2->instr);
652
653 intr->dest.ssa.num_components = intr->num_components = 2;
654
655 return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
656 }
657
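/* Apply op1 to the first source pair and op2 to the second, then combine
 * the two partial results with the reduction opcode. */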
658 nir_ssa_def *
659 LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
660 {
661 auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
662 auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
663 return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
664 }
665
666 nir_ssa_def *
667 LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
668 nir_op op1, nir_op op2, nir_op reduction)
669 {
670 nir_ssa_def *src[2][2];
671
672 src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
673 src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
674
675 src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
676 src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);
677
678 return split_reduction(src, op1, op2, reduction);
679 }
680
681 nir_ssa_def *
682 LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
683 nir_op op1, nir_op op2, nir_op reduction)
684 {
685 nir_ssa_def *src[2][2];
686
687 src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
688 src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
689
690 src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
691 src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);
692
693 return split_reduction(src, op1, op2, reduction);
694 }
695
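/* Lower a wide 64-bit bcsel into per-component selects that are merged
 * back into a vector. */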
696 nir_ssa_def *
697 LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
698 {
699 nir_ssa_def *dest[4];
700 for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
701 dest[i] = nir_bcsel(b,
702 nir_channel(b, alu->src[0].src.ssa, i),
703 nir_channel(b, alu->src[1].src.ssa, i),
704 nir_channel(b, alu->src[2].src.ssa, i));
705 }
706 return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
707 }
708
709 nir_ssa_def *
710 LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
711 {
712 nir_ssa_def *ir[4];
713 for (unsigned i = 0; i < lc->def.num_components; ++i)
714 ir[i] = nir_imm_double(b, lc->value[i].f64);
715
716 return nir_vec(b, ir, lc->def.num_components);
717 }
718
719 nir_ssa_def *
720 LowerSplit64BitVar::lower(nir_instr *instr)
721 {
722 switch (instr->type) {
723 case nir_instr_type_intrinsic: {
724 auto intr = nir_instr_as_intrinsic(instr);
725 switch (intr->intrinsic) {
726 case nir_intrinsic_load_deref:
727 return this->split_double_load_deref(intr);
728 case nir_intrinsic_load_uniform:
729 return split_double_load_uniform(intr);
730 case nir_intrinsic_load_ubo:
731 return split_double_load_ubo(intr);
732 case nir_intrinsic_load_ssbo:
733 return split_double_load_ssbo(intr);
734 case nir_intrinsic_load_input:
735 return split_double_load(intr);
736 case nir_intrinsic_store_output:
737 return split_store_output(intr);
738 case nir_intrinsic_store_deref:
739 return split_double_store_deref(intr);
740 default:
741 assert(0);
742 }
743 }
744 case nir_instr_type_alu: {
745 auto alu = nir_instr_as_alu(instr);
746 switch (alu->op) {
747 case nir_op_bany_fnequal3:
748 return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
749 case nir_op_ball_fequal3:
750 return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
751 case nir_op_bany_inequal3:
752 return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
753 case nir_op_ball_iequal3:
754 return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
755 case nir_op_fdot3:
756 return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
757 case nir_op_bany_fnequal4:
758 return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
759 case nir_op_ball_fequal4:
760 return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
761 case nir_op_bany_inequal4:
762 return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
763 case nir_op_ball_iequal4:
764 return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
765 case nir_op_fdot4:
766 return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
767 case nir_op_bcsel:
768 return split_bcsel(alu);
769 default:
770 assert(0);
771 }
772 }
773 case nir_instr_type_load_const: {
774 auto lc = nir_instr_as_load_const(instr);
775 return split_load_const(lc);
776 }
777 default:
778 assert(0);
779 }
780 return nullptr;
781 }
782
783 /* Split 64 bit instructions so that at most two 64 bit components are
784 * used in one instruction */
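/* Example: a 64-bit dvec4 UBO load becomes one dvec2 load at the original
 * offset plus a dvec2 load 16 bytes further on, and the four doubles are
 * recombined with a vec4. */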
785
786 bool
787 r600_nir_split_64bit_io(nir_shader *sh)
788 {
789 return LowerSplit64BitVar().run(sh);
790 }
791
792 /* Rewrite 64 bit values as pairs of 32 bit components: loads, stores, ALU results, phis, constants and undefs are retyped so that e.g. a dvec2 becomes a 32 bit vec4. */
793 class Lower64BitToVec2 : public NirLowerInstruction {
794
795 private:
796 bool filter(const nir_instr *instr) const override;
797 nir_ssa_def *lower(nir_instr *instr) override;
798
799 nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
800 nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
801 nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
802 nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
803 nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
804 };
805
806 bool
807 Lower64BitToVec2::filter(const nir_instr *instr) const
808 {
809 switch (instr->type) {
810 case nir_instr_type_intrinsic: {
811 auto intr = nir_instr_as_intrinsic(instr);
812
813 switch (intr->intrinsic) {
814 case nir_intrinsic_load_deref:
815 case nir_intrinsic_load_input:
816 case nir_intrinsic_load_uniform:
817 case nir_intrinsic_load_ubo:
818 case nir_intrinsic_load_ubo_vec4:
819 case nir_intrinsic_load_ssbo:
820 return nir_dest_bit_size(intr->dest) == 64;
821 case nir_intrinsic_store_deref: {
822 if (nir_src_bit_size(intr->src[1]) == 64)
823 return true;
824 auto var = nir_intrinsic_get_var(intr, 0);
825 if (var->type->without_array()->bit_size() == 64)
826 return true;
827 return (var->type->without_array()->components() != intr->num_components);
828 }
829 default:
830 return false;
831 }
832 }
833 case nir_instr_type_alu: {
834 auto alu = nir_instr_as_alu(instr);
835 return nir_dest_bit_size(alu->dest.dest) == 64;
836 }
837 case nir_instr_type_phi: {
838 auto phi = nir_instr_as_phi(instr);
839 return nir_dest_bit_size(phi->dest) == 64;
840 }
841 case nir_instr_type_load_const: {
842 auto lc = nir_instr_as_load_const(instr);
843 return lc->def.bit_size == 64;
844 }
845 case nir_instr_type_ssa_undef: {
846 auto undef = nir_instr_as_ssa_undef(instr);
847 return undef->def.bit_size == 64;
848 }
849 default:
850 return false;
851 }
852 }
853
854 nir_ssa_def *
855 Lower64BitToVec2::lower(nir_instr *instr)
856 {
857 switch (instr->type) {
858 case nir_instr_type_intrinsic: {
859 auto intr = nir_instr_as_intrinsic(instr);
860 switch (intr->intrinsic) {
861 case nir_intrinsic_load_deref:
862 return load_deref_64_to_vec2(intr);
863 case nir_intrinsic_load_uniform:
864 return load_uniform_64_to_vec2(intr);
865 case nir_intrinsic_load_ssbo:
866 return load_ssbo_64_to_vec2(intr);
867 case nir_intrinsic_load_input:
868 case nir_intrinsic_load_ubo:
869 case nir_intrinsic_load_ubo_vec4:
870 return load_64_to_vec2(intr);
871 case nir_intrinsic_store_deref:
872 return store_64_to_vec2(intr);
873 default:
874
875 return nullptr;
876 }
877 }
878 case nir_instr_type_alu: {
879 auto alu = nir_instr_as_alu(instr);
880 alu->dest.dest.ssa.bit_size = 32;
881 alu->dest.dest.ssa.num_components *= 2;
882 alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
883 switch (alu->op) {
884 case nir_op_pack_64_2x32_split:
885 alu->op = nir_op_vec2;
886 break;
887 case nir_op_pack_64_2x32:
888 alu->op = nir_op_mov;
889 break;
890 case nir_op_vec2:
891 return nir_vec4(b,
892 nir_channel(b, alu->src[0].src.ssa, 0),
893 nir_channel(b, alu->src[0].src.ssa, 1),
894 nir_channel(b, alu->src[1].src.ssa, 0),
895 nir_channel(b, alu->src[1].src.ssa, 1));
896 default:
897 return NULL;
898 }
899 return NIR_LOWER_INSTR_PROGRESS;
900 }
901 case nir_instr_type_phi: {
902 auto phi = nir_instr_as_phi(instr);
903 phi->dest.ssa.bit_size = 32;
904 phi->dest.ssa.num_components = 2;
905 return NIR_LOWER_INSTR_PROGRESS;
906 }
907 case nir_instr_type_load_const: {
908 auto lc = nir_instr_as_load_const(instr);
909 assert(lc->def.num_components < 3);
910 nir_const_value val[4] = {0};
911 for (uint i = 0; i < lc->def.num_components; ++i) {
912 uint64_t v = lc->value[i].u64;
913 val[2 * i].u32 = v & 0xffffffff;
914 val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
915 }
916
917 return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
918 }
919 case nir_instr_type_ssa_undef: {
920 auto undef = nir_instr_as_ssa_undef(instr);
921 undef->def.num_components *= 2;
922 undef->def.bit_size = 32;
923 return NIR_LOWER_INSTR_PROGRESS;
924 }
925 default:
926 return nullptr;
927 }
928
929 }
930
931
932 nir_ssa_def *
933 Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
934 {
935 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
936 auto var = nir_intrinsic_get_var(intr, 0);
937 unsigned components = var->type->without_array()->components();
938 if (var->type->without_array()->bit_size() == 64) {
939 components *= 2;
940 if (deref->deref_type == nir_deref_type_var) {
941 var->type = glsl_vec_type(components);
942 } else if (deref->deref_type == nir_deref_type_array) {
943
944 var->type = glsl_array_type(glsl_vec_type(components),
945 var->type->array_size(), 0);
946
947 } else {
948 nir_print_shader(b->shader, stderr);
949 assert(0 && "Only lowering of var and array derefs supported\n");
950 }
951 }
952 deref->type = var->type;
953 if (deref->deref_type == nir_deref_type_array) {
954 auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
955 deref_array->type = var->type;
956 deref->type = deref_array->type->without_array();
957 }
958
959 intr->num_components = components;
960 intr->dest.ssa.bit_size = 32;
961 intr->dest.ssa.num_components = components;
962 return NIR_LOWER_INSTR_PROGRESS;
963 }
964
965 nir_ssa_def *
966 Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
967 {
968 auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
969 auto var = nir_intrinsic_get_var(intr, 0);
970
971 unsigned components = var->type->without_array()->components();
972 unsigned wrmask = nir_intrinsic_write_mask(intr);
973 if (var->type->without_array()->bit_size() == 64) {
974 components *= 2;
975 if (deref->deref_type == nir_deref_type_var) {
976 var->type = glsl_vec_type(components);
977 } else if (deref->deref_type == nir_deref_type_array) {
978 var->type = glsl_array_type(glsl_vec_type(components),
979 var->type->array_size(), 0);
980 } else {
981 nir_print_shader(b->shader, stderr);
982 assert(0 && "Only lowering of var and array derefs supported\n");
983 }
984 }
985 deref->type = var->type;
986 if (deref->deref_type == nir_deref_type_array) {
987 auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
988 deref_array->type = var->type;
989 deref->type = deref_array->type->without_array();
990 }
991 intr->num_components = components;
992 nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
993 return NIR_LOWER_INSTR_PROGRESS;
994 }
995
996
997 nir_ssa_def *
998 Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
999 {
1000 intr->num_components *= 2;
1001 intr->dest.ssa.bit_size = 32;
1002 intr->dest.ssa.num_components *= 2;
1003 nir_intrinsic_set_dest_type(intr, nir_type_float32);
1004 return NIR_LOWER_INSTR_PROGRESS;
1005 }
1006
1007 nir_ssa_def *
1008 Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
1009 {
1010 intr->num_components *= 2;
1011 intr->dest.ssa.bit_size = 32;
1012 intr->dest.ssa.num_components *= 2;
1013 nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
1014 return NIR_LOWER_INSTR_PROGRESS;
1015 }
1016
1017 nir_ssa_def *
1018 Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
1019 {
1020 intr->num_components *= 2;
1021 intr->dest.ssa.bit_size = 32;
1022 intr->dest.ssa.num_components *= 2;
1023 return NIR_LOWER_INSTR_PROGRESS;
1024 }
1025
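/* nir_foreach_src callback: set *state when a 64-bit source is found and
 * stop iterating as soon as that happens. */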
1026 static bool store_64bit_intr(nir_src *src, void *state)
1027 {
1028 bool *s = (bool *)state;
1029 *s = nir_src_bit_size(*src) == 64;
1030 return !*s;
1031 }
1032
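/* nir_foreach_src callback: retype a 64-bit SSA source in place as 32-bit
 * with twice the components. */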
1033 static bool double2vec2(nir_src *src, UNUSED void *state)
1034 {
1035 if (nir_src_bit_size(*src) != 64)
1036 return true;
1037
1038 assert(src->is_ssa);
1039 src->ssa->bit_size = 32;
1040 src->ssa->num_components *= 2;
1041 return true;
1042 }
1043
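/* Driver entry point: record ALU instructions and output/SSBO stores with
 * 64-bit sources, run Lower64BitToVec2, then patch the recorded
 * instructions' swizzles and sources for the doubled 32-bit components. */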
1044 bool
1045 r600_nir_64_to_vec2(nir_shader *sh)
1046 {
1047 vector<nir_instr*> intr64bit;
1048 nir_foreach_function(function, sh) {
1049 if (function->impl) {
1050 nir_builder b;
1051 nir_builder_init(&b, function->impl);
1052
1053 nir_foreach_block(block, function->impl) {
1054 nir_foreach_instr_safe(instr, block) {
1055 switch (instr->type) {
1056 case nir_instr_type_alu: {
1057 bool success = false;
1058 nir_foreach_src(instr, store_64bit_intr, &success);
1059 if (success)
1060 intr64bit.push_back(instr);
1061 break;
1062 }
1063 case nir_instr_type_intrinsic: {
1064 auto ir = nir_instr_as_intrinsic(instr);
1065 switch (ir->intrinsic) {
1066 case nir_intrinsic_store_output:
1067 case nir_intrinsic_store_ssbo: {
1068 bool success = false;
1069 nir_foreach_src(instr, store_64bit_intr, &success);
1070 if (success) {
1071 auto wm = nir_intrinsic_write_mask(ir);
1072 nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
1073 ir->num_components *= 2;
1074 }
1075 break;
1076 }
1077 default:
1078 ;
1079 }
1080 }
1081 default:
1082 ;
1083 }
1084 }
1085 }
1086 }
1087 }
1088
1089 bool result = Lower64BitToVec2().run(sh);
1090
1091 if (result || !intr64bit.empty()) {
1092
1093 for(auto&& instr: intr64bit) {
1094 if (instr->type == nir_instr_type_alu) {
1095 auto alu = nir_instr_as_alu(instr);
1096 auto alu_info = nir_op_infos[alu->op];
1097 for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
1098 int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
1099 for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
1100 if (!nir_alu_instr_channel_used(alu, i, k)) {
1101 continue;
1102 }
1103
1104 switch (alu->op) {
1105 case nir_op_unpack_64_2x32_split_x:
1106 swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1107 alu->op = nir_op_mov;
1108 break;
1109 case nir_op_unpack_64_2x32_split_y:
1110 swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
1111 alu->op = nir_op_mov;
1112 break;
1113 case nir_op_unpack_64_2x32:
1114 alu->op = nir_op_mov;
1115 break;
1116 case nir_op_bcsel:
1117 if (i == 0) {
1118 swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
1119 break;
1120 }
1121 FALLTHROUGH;
1122 default:
1123 swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1124 swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
1125 }
1126 }
1127 for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
1128 alu->src[i].swizzle[k] = swizzle[k];
1129 }
1130 }
1131 } else
1132 nir_foreach_src(instr, double2vec2, nullptr);
1133 }
1134 result = true;
1135 }
1136
1137 return result;
1138 }
1139
1140 using std::map;
1141 using std::vector;
1142 using std::pair;
1143
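/* After the vec2 split an output slot may be written by several narrow
 * stores; StoreMerger groups store_output intrinsics per slot (and per
 * emitted vertex/stream for geometry shaders) and fuses each group into a
 * single masked store. */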
1144 class StoreMerger {
1145 public:
1146 StoreMerger(nir_shader *shader);
1147 void collect_stores();
1148 bool combine();
1149 void combine_one_slot(vector<nir_intrinsic_instr*>& stores);
1150
1151 using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;
1152
1153 StoreCombos m_stores;
1154 nir_shader *sh;
1155 };
1156
1157 StoreMerger::StoreMerger(nir_shader *shader):
1158 sh(shader)
1159 {
1160 }
1161
1162
1163 void StoreMerger::collect_stores()
1164 {
1165 unsigned vertex = 0;
1166 nir_foreach_function(function, sh) {
1167 if (function->impl) {
1168 nir_foreach_block(block, function->impl) {
1169 nir_foreach_instr_safe(instr, block) {
1170 if (instr->type != nir_instr_type_intrinsic)
1171 continue;
1172
1173 auto ir = nir_instr_as_intrinsic(instr);
1174 if (ir->intrinsic == nir_intrinsic_emit_vertex ||
1175 ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
1176 ++vertex;
1177 continue;
1178 }
1179 if (ir->intrinsic != nir_intrinsic_store_output)
1180 continue;
1181
1182 unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
1183 8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
1184 m_stores[index].push_back(ir);
1185 }
1186 }
1187 }
1188 }
1189 }
1190
1191 bool StoreMerger::combine()
1192 {
1193 bool progress = false;
1194 for(auto&& i : m_stores) {
1195 if (i.second.size() < 2)
1196 continue;
1197
1198 combine_one_slot(i.second);
1199 progress = true;
1200 }
1201 return progress;
1202 }
1203
1204 void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
1205 {
1206 nir_ssa_def *srcs[4] = {nullptr};
1207
1208 nir_builder b;
1209 nir_builder_init(&b, nir_shader_get_entrypoint(sh));
1210 auto last_store = *stores.rbegin();
1211
1212 b.cursor = nir_before_instr(&last_store->instr);
1213
1214 unsigned comps = 0;
1215 unsigned writemask = 0;
1216 unsigned first_comp = 4;
1217 for (auto&& store : stores) {
1218 int cmp = nir_intrinsic_component(store);
1219 for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
1220 unsigned out_comp = i + cmp;
1221 srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
1222 writemask |= 1 << out_comp;
1223 if (first_comp > out_comp)
1224 first_comp = out_comp;
1225 }
1226 }
1227
1228 auto new_src = nir_vec(&b, srcs, comps);
1229
1230 nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
1231 last_store->num_components = comps;
1232 nir_intrinsic_set_component(last_store, first_comp);
1233 nir_intrinsic_set_write_mask(last_store, writemask);
1234
1235 for (auto i = stores.begin(); i != stores.end() - 1; ++i)
1236 nir_instr_remove(&(*i)->instr);
1237 }
1238
1239 bool r600_merge_vec2_stores(nir_shader *shader)
1240 {
1241 r600::StoreMerger merger(shader);
1242 merger.collect_stores();
1243 return merger.combine();
1244 }
1245
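/* Split a vec3/vec4 64-bit load or store intrinsic into a two-component
 * part and a one/two-component part, adjusting IO semantics, base, offset
 * and write mask, then recombine or re-split the SSA values around it. */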
1246 static bool
1247 r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
1248 {
1249 b->cursor = nir_after_instr(&instr->instr);
1250
1251 switch (instr->intrinsic) {
1252 case nir_intrinsic_load_ubo:
1253 case nir_intrinsic_load_ubo_vec4:
1254 case nir_intrinsic_load_uniform:
1255 case nir_intrinsic_load_ssbo:
1256 case nir_intrinsic_load_input:
1257 case nir_intrinsic_load_interpolated_input:
1258 case nir_intrinsic_load_per_vertex_input:
1259 case nir_intrinsic_store_output:
1260 case nir_intrinsic_store_per_vertex_output:
1261 case nir_intrinsic_store_ssbo:
1262 break;
1263 default:
1264 return false;
1265 }
1266
1267 if (instr->num_components <= 2)
1268 return false;
1269
1270 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
1271 if (has_dest) {
1272 if (nir_dest_bit_size(instr->dest) != 64)
1273 return false;
1274 } else {
1275 if (nir_src_bit_size(instr->src[0]) != 64)
1276 return false;
1277 }
1278
1279 nir_intrinsic_instr *first =
1280 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1281 nir_intrinsic_instr *second =
1282 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1283
1284 switch (instr->intrinsic) {
1285 case nir_intrinsic_load_ubo:
1286 case nir_intrinsic_load_ubo_vec4:
1287 case nir_intrinsic_load_uniform:
1288 case nir_intrinsic_load_ssbo:
1289 case nir_intrinsic_store_ssbo:
1290 break;
1291
1292 default: {
1293 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
1294 semantics.location++;
1295 semantics.num_slots--;
1296 nir_intrinsic_set_io_semantics(second, semantics);
1297
1298 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
1299 break;
1300 }
1301 }
1302
1303 first->num_components = 2;
1304 second->num_components -= 2;
1305 if (has_dest) {
1306 first->dest.ssa.num_components = 2;
1307 second->dest.ssa.num_components -= 2;
1308 }
1309
1310 nir_builder_instr_insert(b, &first->instr);
1311 nir_builder_instr_insert(b, &second->instr);
1312
1313 if (has_dest) {
1314 /* Merge the two loads' results back into a vector. */
1315 nir_ssa_scalar channels[4] = {
1316 nir_get_ssa_scalar(&first->dest.ssa, 0),
1317 nir_get_ssa_scalar(&first->dest.ssa, 1),
1318 nir_get_ssa_scalar(&second->dest.ssa, 0),
1319 nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
1320 };
1321 nir_ssa_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
1322 nir_ssa_def_rewrite_uses(&instr->dest.ssa, new_ir);
1323 } else {
1324 /* Split the src value across the two stores. */
1325 b->cursor = nir_before_instr(&instr->instr);
1326
1327 nir_ssa_def *src0 = instr->src[0].ssa;
1328 nir_ssa_scalar channels[4] = { 0 };
1329 for (int i = 0; i < instr->num_components; i++)
1330 channels[i] = nir_get_ssa_scalar(src0, i);
1331
1332 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
1333 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
1334
1335 nir_instr_rewrite_src(&first->instr, &first->src[0],
1336 nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
1337 nir_instr_rewrite_src(&second->instr, &second->src[0],
1338 nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
1339 second->num_components)));
1340 }
1341
1342 int offset_src = -1;
1343 uint32_t offset_amount = 16;
1344
1345 switch (instr->intrinsic) {
1346 case nir_intrinsic_load_ssbo:
1347 case nir_intrinsic_load_ubo:
1348 offset_src = 1;
1349 break;
1350 case nir_intrinsic_load_ubo_vec4:
1351 case nir_intrinsic_load_uniform:
1352 offset_src = 0;
1353 offset_amount = 1;
1354 break;
1355 case nir_intrinsic_store_ssbo:
1356 offset_src = 2;
1357 break;
1358 default:
1359 break;
1360 }
1361 if (offset_src != -1) {
1362 b->cursor = nir_before_instr(&second->instr);
1363 nir_ssa_def *second_offset =
1364 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
1365 nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
1366 nir_src_for_ssa(second_offset));
1367 }
1368
1369 /* DCE stores we generated with no writemask (nothing else does this
1370 * currently).
1371 */
1372 if (!has_dest) {
1373 if (nir_intrinsic_write_mask(first) == 0)
1374 nir_instr_remove(&first->instr);
1375 if (nir_intrinsic_write_mask(second) == 0)
1376 nir_instr_remove(&second->instr);
1377 }
1378
1379 nir_instr_remove(&instr->instr);
1380
1381 return true;
1382 }
1383
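/* Split a vec3/vec4 64-bit constant into a two-component constant and a
 * one/two-component constant and rebuild the original vector from their
 * channels. */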
1384 static bool
1385 r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
1386 {
1387 int num_components = instr->def.num_components;
1388
1389 if (instr->def.bit_size != 64 || num_components <= 2)
1390 return false;
1391
1392 b->cursor = nir_before_instr(&instr->instr);
1393
1394 nir_load_const_instr *first =
1395 nir_load_const_instr_create(b->shader, 2, 64);
1396 nir_load_const_instr *second =
1397 nir_load_const_instr_create(b->shader, num_components - 2, 64);
1398
1399 first->value[0] = instr->value[0];
1400 first->value[1] = instr->value[1];
1401 second->value[0] = instr->value[2];
1402 if (num_components == 4)
1403 second->value[1] = instr->value[3];
1404
1405 nir_builder_instr_insert(b, &first->instr);
1406 nir_builder_instr_insert(b, &second->instr);
1407
1408 nir_ssa_def *channels[4] = {
1409 nir_channel(b, &first->def, 0),
1410 nir_channel(b, &first->def, 1),
1411 nir_channel(b, &second->def, 0),
1412 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
1413 };
1414 nir_ssa_def *new_ir = nir_vec(b, channels, num_components);
1415 nir_ssa_def_rewrite_uses(&instr->def, new_ir);
1416 nir_instr_remove(&instr->instr);
1417
1418 return true;
1419 }
1420
1421 static bool
1422 r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, void *data)
1423 {
1424 switch (instr->type) {
1425 case nir_instr_type_load_const:
1426 return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
1427
1428 case nir_instr_type_intrinsic:
1429 return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
1430 default:
1431 return false;
1432 }
1433 }
1434
1435 bool
1436 r600_lower_64bit_to_vec2(nir_shader *s)
1437 {
1438 return nir_shader_instructions_pass(s,
1439 r600_lower_64bit_to_vec2_instr,
1440 nir_metadata_block_index |
1441 nir_metadata_dominance,
1442 NULL);
1443 }
1444
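/* A minimal usage sketch, not the authoritative pipeline: the passes
 * exported above (declared in sfn_nir.h) are meant to be run from the
 * r600 NIR pipeline; the real call sites and exact ordering live in the
 * driver code, the order below is illustrative only.
 *
 *   NIR_PASS_V(sh, r600_nir_split_64bit_io);
 *   NIR_PASS_V(sh, r600_split_64bit_uniforms_and_ubo);
 *   NIR_PASS_V(sh, r600_split_64bit_alu_and_phi);
 *   NIR_PASS_V(sh, r600_nir_64_to_vec2);
 *   NIR_PASS_V(sh, r600_merge_vec2_stores);
 */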
1445
1446 } // end namespace r600
1447
1448
1449