/* -*- mesa-c++ -*-
 *
 * Copyright (c) 2020 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "sfn_nir.h"

#include <iostream>
#include <map>
#include <vector>

namespace r600 {

using std::make_pair;
using std::map;
using std::pair;
using std::vector;

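/* Splits 64-bit vec3/vec4 variables, loads, stores, constants, bcsel, and
 * vector comparison/dot-product reductions into pieces that use at most
 * two 64-bit components per instruction.
 */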
class LowerSplit64BitVar : public NirLowerInstruction {
public:
   ~LowerSplit64BitVar();
   using VarSplit = pair<nir_variable *, nir_variable *>;
   using VarMap = map<unsigned, VarSplit>;

   nir_def *split_double_load_deref(nir_intrinsic_instr *intr);

   nir_def *split_double_store_deref(nir_intrinsic_instr *intr);

private:
   nir_def *split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);

   nir_def *split_load_deref_var(nir_intrinsic_instr *intr);

   nir_def *split_store_deref_array(nir_intrinsic_instr *intr,
                                    nir_deref_instr *deref);

   nir_def *split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);

   VarSplit get_var_pair(nir_variable *old_var);

   nir_def *
   merge_64bit_loads(nir_def *load1, nir_def *load2, bool out_is_vec3);

   nir_def *split_double_load(nir_intrinsic_instr *load1);

   nir_def *split_store_output(nir_intrinsic_instr *store1);

   nir_def *split_double_load_uniform(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ssbo(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ubo(nir_intrinsic_instr *intr);

   nir_def *
   split_reduction(nir_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction3(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction4(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *split_bcsel(nir_alu_instr *alu);

   nir_def *split_load_const(nir_load_const_instr *lc);

   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   VarMap m_varmap;
   vector<nir_variable *> m_old_vars;
   vector<nir_instr *> m_old_stores;
};

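/* Lowers 64-bit load_uniform/load_ubo/load_ubo_vec4 results (at most two
 * components) to a 32-bit load with twice the component count and re-packs
 * the 32-bit pairs into 64-bit values.
 */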
class LowerLoad64Uniform : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;
};

bool
LowerLoad64Uniform::filter(const nir_instr *instr) const
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   auto intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_uniform &&
       intr->intrinsic != nir_intrinsic_load_ubo &&
       intr->intrinsic != nir_intrinsic_load_ubo_vec4)
      return false;

   return intr->def.bit_size == 64;
}

nir_def *
LowerLoad64Uniform::lower(nir_instr *instr)
{
   auto intr = nir_instr_as_intrinsic(instr);
   int old_components = intr->def.num_components;
   assert(old_components <= 2);
   intr->def.num_components *= 2;
   intr->def.bit_size = 32;
   intr->num_components *= 2;

   if (intr->intrinsic == nir_intrinsic_load_ubo ||
       intr->intrinsic == nir_intrinsic_load_ubo_vec4)
      nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));

   nir_def *result_vec[2] = {nullptr, nullptr};

   for (int i = 0; i < old_components; ++i) {
      result_vec[i] = nir_pack_64_2x32_split(b,
                                             nir_channel(b, &intr->def, 2 * i),
                                             nir_channel(b, &intr->def, 2 * i + 1));
   }
   if (old_components == 1)
      return result_vec[0];

   return nir_vec2(b, result_vec[0], result_vec[1]);
}

bool
r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
{
   return LowerLoad64Uniform().run(sh);
}

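/* Lowers 64-bit bcsel, selected conversions with 64-bit operands, and
 * 64-bit phis into sequences of 32-bit operations plus 64 <-> 2x32
 * pack/unpack.
 */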
class LowerSplit64op : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override
   {
      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {
         case nir_op_bcsel:
            return alu->def.bit_size == 64;
         case nir_op_f2i32:
         case nir_op_f2u32:
         case nir_op_f2i64:
         case nir_op_f2u64:
         case nir_op_u2f64:
         case nir_op_i2f64:
            return nir_src_bit_size(alu->src[0].src) == 64;
         default:
            return false;
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         return phi->def.num_components == 64;
      }
      default:
         return false;
      }
   }

   nir_def *lower(nir_instr *instr) override
   {

      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {

         case nir_op_bcsel: {
            auto lo =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
            auto hi =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
            return nir_pack_64_2x32_split(b, lo, hi);
         }
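         /* f2i32 from a 64-bit float: convert the absolute value with f2u32
          * and negate the result when the source is not positive. */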
         case nir_op_f2i32: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto abs_src = nir_fabs(b, src);
            auto value = nir_f2u32(b, abs_src);
            return nir_bcsel(b, gt0, value, nir_ineg(b, value));
         }
         case nir_op_f2u32: {
            /* fp32 doesn't hold sufficient bits to represent the full range
             * of u32, therefore we have to split the value, and because
             * f2f32 rounds, we have to remove the fractional part of the
             * high bits first. For values > UINT_MAX the result is
             * undefined. */
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            src = nir_fadd(b, src, nir_fneg(b, nir_ffract(b, src)));
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto highval = nir_fmul_imm(b, src, 1.0 / 65536.0);
            auto fract = nir_ffract(b, highval);
            auto high = nir_f2u32(b, nir_f2f32(b, nir_fadd(b, highval, nir_fneg(b, fract))));
            auto lowval = nir_fmul_imm(b, fract, 65536.0);
            auto low = nir_f2u32(b, nir_f2f32(b, lowval));
            return nir_bcsel(b,
                             gt0,
                             nir_ior(b, nir_ishl_imm(b, high, 16), low),
                             nir_imm_int(b, 0));
         }
         case nir_op_u2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_u2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         case nir_op_i2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_i2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         default:
            unreachable("trying to lower instruction that was not in filter");
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         auto phi_lo = nir_phi_instr_create(b->shader);
         auto phi_hi = nir_phi_instr_create(b->shader);
         nir_def_init(
            &phi_lo->instr, &phi_lo->def, phi->def.num_components, 32);
         nir_def_init(
            &phi_hi->instr, &phi_hi->def, phi->def.num_components, 32);
         nir_foreach_phi_src(s, phi)
         {
            auto lo = nir_unpack_64_2x32_split_x(b, s->src.ssa);
            auto hi = nir_unpack_64_2x32_split_y(b, s->src.ssa);
            nir_phi_instr_add_src(phi_lo, s->pred, lo);
            nir_phi_instr_add_src(phi_hi, s->pred, hi);
         }
         return nir_pack_64_2x32_split(b, &phi_lo->def, &phi_hi->def);
      }
      default:
         unreachable("Trying to lower instruction that was not in filter");
      }
   }
};

bool
r600_split_64bit_alu_and_phi(nir_shader *sh)
{
   return LowerSplit64op().run(sh);
}

bool
LowerSplit64BitVar::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
         if (intr->def.bit_size != 64)
            return false;
         return intr->def.num_components >= 3;
      case nir_intrinsic_store_output:
         if (nir_src_bit_size(intr->src[0]) != 64)
            return false;
         return nir_src_num_components(intr->src[0]) >= 3;
      case nir_intrinsic_store_deref:
         if (nir_src_bit_size(intr->src[1]) != 64)
            return false;
         return nir_src_num_components(intr->src[1]) >= 3;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bcsel:
         if (alu->def.num_components < 3)
            return false;
         return alu->def.bit_size == 64;
      case nir_op_bany_fnequal3:
      case nir_op_bany_fnequal4:
      case nir_op_ball_fequal3:
      case nir_op_ball_fequal4:
      case nir_op_bany_inequal3:
      case nir_op_bany_inequal4:
      case nir_op_ball_iequal3:
      case nir_op_ball_iequal4:
      case nir_op_fdot3:
      case nir_op_fdot4:
         return nir_src_bit_size(alu->src[1].src) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      if (lc->def.bit_size != 64)
         return false;
      return lc->def.num_components >= 3;
   }
   default:
      return false;
   }
}

nir_def *
LowerSplit64BitVar::merge_64bit_loads(nir_def *load1,
                                      nir_def *load2,
                                      bool out_is_vec3)
{
   if (out_is_vec3)
      return nir_vec3(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0),
                      nir_channel(b, load2, 1));
}

LowerSplit64BitVar::~LowerSplit64BitVar()
{
   for (auto&& v : m_old_vars)
      exec_node_remove(&v->node);

   for (auto&& v : m_old_stores)
      nir_instr_remove(v);
}

nir_def *
LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_store_deref_var(intr, deref);
   else if (deref->deref_type == nir_deref_type_array)
      return split_store_deref_array(intr, deref);
   else {
      unreachable("only splitting of stores to vars and arrays is supported");
   }
}

nir_def *
LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_load_deref_var(intr);
   else if (deref->deref_type == nir_deref_type_array)
      return split_load_deref_array(intr, deref->arr.index);
   else {
      unreachable("only splitting of loads from vars and arrays is supported");
   }
   m_old_stores.push_back(&intr->instr);
}

nir_def *
LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, index.ssa);
   auto load1 =
      nir_build_load_deref(b, 2, 64, &deref_array1->def, (enum gl_access_qualifier)0);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, index.ssa);

   auto load2 = nir_build_load_deref(
      b, old_components - 2, 64, &deref_array2->def, (enum gl_access_qualifier)0);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

nir_def *
LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr,
                                            nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 =
      nir_build_deref_array(b, deref1, deref->arr.index.ssa);

   nir_build_store_deref(b, &deref_array1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 =
      nir_build_deref_array(b, deref2, deref->arr.index.ssa);

   if (old_components == 3)
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channel(b, intr->src[1].ssa, 2),
                            1);
   else
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr,
                                          UNUSED nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   nir_build_store_deref(b, &deref1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   if (old_components == 3)
      nir_build_store_deref(b, &deref2->def, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b,
                            &deref2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   auto vars = get_var_pair(old_var);
   unsigned old_components = glsl_get_components(old_var->type);

   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
   auto *load1 = nir_load_deref(b, deref1);

   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
   deref2->type = vars.second->type;

   auto *load2 = nir_load_deref(b, deref2);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

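/* Returns (and caches by driver_location) a pair of new variables holding
 * the first two and the remaining one or two components of a 64-bit
 * vec3/vec4 variable; location and driver_location of the second half are
 * bumped for shader inputs and outputs.
 */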
LowerSplit64BitVar::VarSplit
LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
{
   auto split_vars = m_varmap.find(old_var->data.driver_location);

   assert(glsl_get_components(glsl_without_array(old_var->type)) > 2);

   if (split_vars == m_varmap.end()) {
      auto var1 = nir_variable_clone(old_var, b->shader);
      auto var2 = nir_variable_clone(old_var, b->shader);

      var1->type = glsl_dvec_type(2);
      var2->type = glsl_dvec_type(glsl_get_components(glsl_without_array(old_var->type)) - 2);

      if (glsl_type_is_array(old_var->type)) {
         var1->type = glsl_array_type(var1->type, glsl_array_size(old_var->type), 0);
         var2->type = glsl_array_type(var2->type, glsl_array_size(old_var->type), 0);
      }

      if (old_var->data.mode == nir_var_shader_in ||
          old_var->data.mode == nir_var_shader_out) {
         ++var2->data.driver_location;
         ++var2->data.location;
         nir_shader_add_variable(b->shader, var1);
         nir_shader_add_variable(b->shader, var2);
      } else if (old_var->data.mode == nir_var_function_temp) {
         exec_list_push_tail(&b->impl->locals, &var1->node);
         exec_list_push_tail(&b->impl->locals, &var2->node);
      }

      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
   }
   return m_varmap[old_var->data.driver_location];
}

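/* Splits a 3- or 4-component 64-bit input load into a two-component load
 * of the original slot and a second load of the following slot, then
 * merges the results back into one vector.
 */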
nir_def *
LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
{
   unsigned old_components = load1->def.num_components;
   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);

   load1->def.num_components = 2;
   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(load1, sem);

   load2->def.num_components = old_components - 2;
   sem.location += 1;
   nir_intrinsic_set_io_semantics(load2, sem);
   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
   nir_builder_instr_insert(b, &load2->instr);

   return merge_64bit_loads(&load1->def, &load2->def, old_components == 3);
}

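/* Splits a 3- or 4-component 64-bit output store into two stores: the
 * original instruction keeps the xy channels, a cloned instruction writes
 * the remaining channel(s) to the next location.
 */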
nir_def *
LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
{
   auto src = store1->src[0];
   unsigned old_components = nir_src_num_components(src);
   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);

   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
   auto src1 = nir_trim_vector(b, src.ssa, 2);
   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);

   nir_src_rewrite(&store1->src[0], src1);
   nir_intrinsic_set_write_mask(store1, 3);

   nir_src_rewrite(&store2->src[0], src2);
   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);

   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(store1, sem);

   sem.location += 1;
   nir_intrinsic_set_io_semantics(store2, sem);
   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));

   nir_builder_instr_insert(b, &store2->instr);
   return NIR_LOWER_INSTR_PROGRESS;
}

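/* The following three helpers split 64-bit uniform, SSBO, and UBO loads
 * with more than two components into the original load (trimmed to two
 * components) plus a second load at an adjusted offset, and merge the two
 * results.
 */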
nir_def *
LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   if (second_components == 1)
      return nir_vec3(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0),
                      nir_channel(b, &load2->def, 1));
}

nir_def *
LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));

   nir_src_rewrite(&load2->src[0], nir_iadd_imm(b, intr->src[0].ssa, 1));
   load2->num_components = second_components;
   nir_def_init(&load2->instr, &load2->def, second_components, 64);

   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

nir_def *
LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
   load2->src[0] = intr->src[0];
   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr));

   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

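/* The reduction helpers split a 64-bit vec3/vec4 comparison or dot product
 * into an operation on the first two components, one on the remaining
 * component(s), and a final reduction; e.g. fdot4(a, b) becomes
 * fadd(fdot2(a.xy, b.xy), fdot2(a.zw, b.zw)).
 */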
nir_def *
LowerSplit64BitVar::split_reduction(nir_def *src[2][2],
                                    nir_op op1,
                                    nir_op op2,
                                    nir_op reduction)
{
   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
}

nir_def *
LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channel(b, alu->src[0].src.ssa, 2);
   src[1][1] = nir_channel(b, alu->src[1].src.ssa, 2);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channels(b, alu->src[0].src.ssa, 0xc);
   src[1][1] = nir_channels(b, alu->src[1].src.ssa, 0xc);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
{
   static nir_def *dest[4];
   for (unsigned i = 0; i < alu->def.num_components; ++i) {
      dest[i] = nir_bcsel(b,
                          nir_channel(b, alu->src[0].src.ssa, i),
                          nir_channel(b, alu->src[1].src.ssa, i),
                          nir_channel(b, alu->src[2].src.ssa, i));
   }
   return nir_vec(b, dest, alu->def.num_components);
}

nir_def *
LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
{
   nir_def *ir[4];
   for (unsigned i = 0; i < lc->def.num_components; ++i)
      ir[i] = nir_imm_double(b, lc->value[i].f64);

   return nir_vec(b, ir, lc->def.num_components);
}

nir_def *
LowerSplit64BitVar::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return this->split_double_load_deref(intr);
      case nir_intrinsic_load_uniform:
         return split_double_load_uniform(intr);
      case nir_intrinsic_load_ubo:
         return split_double_load_ubo(intr);
      case nir_intrinsic_load_ssbo:
         return split_double_load_ssbo(intr);
      case nir_intrinsic_load_input:
         return split_double_load(intr);
      case nir_intrinsic_store_output:
         return split_store_output(intr);
      case nir_intrinsic_store_deref:
         return split_double_store_deref(intr);
      default:
         assert(0);
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bany_fnequal3:
         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
      case nir_op_ball_fequal3:
         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
      case nir_op_bany_inequal3:
         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
      case nir_op_ball_iequal3:
         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
      case nir_op_fdot3:
         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
      case nir_op_bany_fnequal4:
         return split_reduction4(alu,
                                 nir_op_bany_fnequal2,
                                 nir_op_bany_fnequal2,
                                 nir_op_ior);
      case nir_op_ball_fequal4:
         return split_reduction4(alu,
                                 nir_op_ball_fequal2,
                                 nir_op_ball_fequal2,
                                 nir_op_iand);
      case nir_op_bany_inequal4:
         return split_reduction4(alu,
                                 nir_op_bany_inequal2,
                                 nir_op_bany_inequal2,
                                 nir_op_ior);
      case nir_op_ball_iequal4:
         return split_reduction4(alu,
                                 nir_op_ball_iequal2,
                                 nir_op_ball_iequal2,
                                 nir_op_iand);
      case nir_op_fdot4:
         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
      case nir_op_bcsel:
         return split_bcsel(alu);
      default:
         assert(0);
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return split_load_const(lc);
   }
   default:
      assert(0);
   }
   return nullptr;
}

/* Split 64-bit instructions so that at most two 64-bit components are
 * used in one instruction */

bool
r600_nir_split_64bit_io(nir_shader *sh)
{
   return LowerSplit64BitVar().run(sh);
}

/* Lowers remaining 64-bit values (at most two components) to a 32-bit vec2
 * representation: defs get twice the component count at bit size 32, and
 * variables, derefs, writemasks, and constants are rewritten accordingly. */
class Lower64BitToVec2 : public NirLowerInstruction {

private:
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   nir_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *store_64_to_vec2(nir_intrinsic_instr *intr);
};

bool
Lower64BitToVec2::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo_vec4:
      case nir_intrinsic_load_ssbo:
         return intr->def.bit_size == 64;
      case nir_intrinsic_store_deref: {
         if (nir_src_bit_size(intr->src[1]) == 64)
            return true;
         auto var = nir_intrinsic_get_var(intr, 0);
         if (glsl_get_bit_size(glsl_without_array(var->type)) == 64)
            return true;
         return (glsl_get_components(glsl_without_array(var->type)) != intr->num_components);
      }
      case nir_intrinsic_store_global:
         return nir_src_bit_size(intr->src[0]) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      return alu->def.bit_size == 64;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      return phi->def.bit_size == 64;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return lc->def.bit_size == 64;
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      return undef->def.bit_size == 64;
   }
   default:
      return false;
   }
}

nir_def *
Lower64BitToVec2::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return load_deref_64_to_vec2(intr);
      case nir_intrinsic_load_uniform:
         return load_uniform_64_to_vec2(intr);
      case nir_intrinsic_load_ssbo:
         return load_ssbo_64_to_vec2(intr);
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         return load_64_to_vec2(intr);
      case nir_intrinsic_store_deref:
         return store_64_to_vec2(intr);
      default:
         return nullptr;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      alu->def.bit_size = 32;
      alu->def.num_components *= 2;
      switch (alu->op) {
      case nir_op_pack_64_2x32_split:
         alu->op = nir_op_vec2;
         break;
      case nir_op_pack_64_2x32:
         alu->op = nir_op_mov;
         break;
      case nir_op_vec2:
         return nir_vec4(b,
                         nir_channel(b, alu->src[0].src.ssa, 0),
                         nir_channel(b, alu->src[0].src.ssa, 1),
                         nir_channel(b, alu->src[1].src.ssa, 0),
                         nir_channel(b, alu->src[1].src.ssa, 1));
      default:
         return NULL;
      }
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      phi->def.bit_size = 32;
      phi->def.num_components = 2;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      assert(lc->def.num_components <= 2);
      nir_const_value val[4];
      for (uint i = 0; i < lc->def.num_components; ++i) {
         uint64_t v = lc->value[i].u64;
         val[i * 2 + 0] = nir_const_value_for_uint(v & 0xffffffff, 32);
         val[i * 2 + 1] = nir_const_value_for_uint(v >> 32, 32);
      }

      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      undef->def.num_components *= 2;
      undef->def.bit_size = 32;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   default:
      return nullptr;
   }
}

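/* Rewrites a 64-bit load_deref: the variable (and its deref chain) is
 * retyped to a 32-bit vector with twice the components, and the load's
 * destination is adjusted to match.
 */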
nir_def *
Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);
   unsigned components = glsl_get_components(glsl_without_array(var->type));
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {

         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);

      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }

   intr->num_components = components;
   intr->def.bit_size = 32;
   intr->def.num_components = components;
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);

   unsigned components = glsl_get_components(glsl_without_array(var->type));
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }
   intr->num_components = components;
   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   nir_intrinsic_set_dest_type(intr, nir_type_float32);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   if (nir_intrinsic_has_component(intr))
      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   return NIR_LOWER_INSTR_PROGRESS;
}

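/* Helpers for r600_nir_64_to_vec2: store_64bit_intr flags instructions
 * with a 64-bit source (and stops iterating once one is found), while
 * double2vec2 retypes the 64-bit defs feeding a source to twice as many
 * 32-bit components.
 */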
static bool
store_64bit_intr(nir_src *src, void *state)
{
   bool *s = (bool *)state;
   *s = nir_src_bit_size(*src) == 64;
   return !*s;
}

static bool
double2vec2(nir_src *src, UNUSED void *state)
{
   if (nir_src_bit_size(*src) != 64)
      return true;

   src->ssa->bit_size = 32;
   src->ssa->num_components *= 2;
   return true;
}

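/* Converts remaining 64-bit values to 32-bit vec2 pairs: first collect ALU
 * and store intrinsics that consume 64-bit sources (widening store
 * writemasks on the way), then run Lower64BitToVec2, and finally patch the
 * source swizzles of the collected ALU instructions so that a 64-bit
 * component selection like .y becomes the 32-bit pair .zw.
 */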
bool
r600_nir_64_to_vec2(nir_shader *sh)
{
   vector<nir_instr *> intr64bit;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            switch (instr->type) {
            case nir_instr_type_alu: {
               bool success = false;
               nir_foreach_src(instr, store_64bit_intr, &success);
               if (success)
                  intr64bit.push_back(instr);
               break;
            }
            case nir_instr_type_intrinsic: {
               auto ir = nir_instr_as_intrinsic(instr);
               switch (ir->intrinsic) {
               case nir_intrinsic_store_output:
               case nir_intrinsic_store_global:
               case nir_intrinsic_store_ssbo: {
                  bool success = false;
                  nir_foreach_src(instr, store_64bit_intr, &success);
                  if (success) {
                     auto wm = nir_intrinsic_write_mask(ir);
                     nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
                     ir->num_components *= 2;
                  }
                  break;
               }
               default:;
               }
            }
            default:;
            }
         }
      }
   }

   bool result = Lower64BitToVec2().run(sh);

   if (result || !intr64bit.empty()) {

      for (auto&& instr : intr64bit) {
         if (instr->type == nir_instr_type_alu) {
            auto alu = nir_instr_as_alu(instr);
            auto alu_info = nir_op_infos[alu->op];
            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
                  if (!nir_alu_instr_channel_used(alu, i, k)) {
                     continue;
                  }

                  switch (alu->op) {
                  case nir_op_unpack_64_2x32_split_x:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32_split_y:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32:
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_bcsel:
                     if (i == 0) {
                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
                        break;
                     }
                     FALLTHROUGH;
                  default:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
                  }
               }
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
                  alu->src[i].swizzle[k] = swizzle[k];
               }
            }
         } else
            nir_foreach_src(instr, double2vec2, nullptr);
      }
      result = true;
   }

   return result;
}

using std::map;
using std::pair;
using std::vector;

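/* Collects store_output instructions per output slot (taking emitted
 * vertices and GS streams into account) and merges stores that only write
 * parts of a slot into a single store.
 */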
class StoreMerger {
public:
   StoreMerger(nir_shader *shader);
   void collect_stores();
   bool combine();
   void combine_one_slot(vector<nir_intrinsic_instr *>& stores);

   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr *>>;

   StoreCombos m_stores;
   nir_shader *sh;
};

StoreMerger::StoreMerger(nir_shader *shader):
    sh(shader)
{
}

void
StoreMerger::collect_stores()
{
   unsigned vertex = 0;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            auto ir = nir_instr_as_intrinsic(instr);
            if (ir->intrinsic == nir_intrinsic_emit_vertex ||
                ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
               ++vertex;
               continue;
            }
            if (ir->intrinsic != nir_intrinsic_store_output)
               continue;

            unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
                             8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
            m_stores[index].push_back(ir);
         }
      }
   }
}

bool
StoreMerger::combine()
{
   bool progress = false;
   for (auto&& i : m_stores) {
      if (i.second.size() < 2)
         continue;

      combine_one_slot(i.second);
      progress = true;
   }
   return progress;
}

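/* Builds one combined source vector from the per-component stores of a
 * slot, rewrites the last store to write all collected channels, and
 * removes the now redundant earlier stores.
 */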
void
StoreMerger::combine_one_slot(vector<nir_intrinsic_instr *>& stores)
{
   nir_def *srcs[4] = {nullptr};

   auto last_store = *stores.rbegin();

   nir_builder b = nir_builder_at(nir_before_instr(&last_store->instr));

   unsigned comps = 0;
   unsigned writemask = 0;
   unsigned first_comp = 4;
   for (auto&& store : stores) {
      int cmp = nir_intrinsic_component(store);
      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
         unsigned out_comp = i + cmp;
         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
         writemask |= 1 << out_comp;
         if (first_comp > out_comp)
            first_comp = out_comp;
      }
   }

   auto new_src = nir_vec(&b, srcs, comps);

   nir_src_rewrite(&last_store->src[0], new_src);
   last_store->num_components = comps;
   nir_intrinsic_set_component(last_store, first_comp);
   nir_intrinsic_set_write_mask(last_store, writemask);

   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
      nir_instr_remove(&(*i)->instr);
}

bool
r600_merge_vec2_stores(nir_shader *shader)
{
   r600::StoreMerger merger(shader);
   merger.collect_stores();
   return merger.combine();
}

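/* Splits a 64-bit load/store intrinsic with more than two components into
 * two instructions: the first covers the xy components, the second the
 * remaining component(s), with either the IO semantics/base or the byte
 * offset of the second half adjusted accordingly.
 */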
static bool
r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (instr->def.bit_size != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->def.num_components = 2;
      second->def.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_scalar channels[4] = {
         nir_get_scalar(&first->def, 0),
         nir_get_scalar(&first->def, 1),
         nir_get_scalar(&second->def, 0),
         nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
      };
      nir_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
      nir_def_rewrite_uses(&instr->def, new_ir);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_def *src0 = instr->src[0].ssa;
      nir_scalar channels[4] = {{0}};
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_get_scalar(src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
      nir_src_rewrite(&second->src[0],
                      nir_vec_scalars(b, &channels[2], second->num_components));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
      offset_src = 0;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_src_rewrite(&second->src[offset_src], second_offset);
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}

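/* Splits a 64-bit load_const with three or four components into two
 * constants of at most two components each and recombines the results
 * into the original vector.
 */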
static bool
r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first = nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_def *new_ir = nir_vec(b, channels, num_components);
   nir_def_rewrite_uses(&instr->def, new_ir);
   nir_instr_remove(&instr->instr);

   return true;
}

static bool
r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

bool
r600_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       r600_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index | nir_metadata_dominance,
                                       NULL);
}

} // end namespace r600