/* -*- mesa-c++  -*-
 *
 * Copyright (c) 2020 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "sfn_nir.h"

#include <iostream>
#include <map>
#include <vector>

namespace r600 {

using std::make_pair;
using std::map;
using std::pair;
using std::vector;

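/* Split loads, stores, and a few vector ALU operations on 64-bit vec3/vec4
 * values into a dvec2 part and a one- or two-component remainder, so that
 * later passes only ever see 64-bit values with at most two components. */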
class LowerSplit64BitVar : public NirLowerInstruction {
public:
   ~LowerSplit64BitVar();
   using VarSplit = pair<nir_variable *, nir_variable *>;
   using VarMap = map<unsigned, VarSplit>;

   nir_def *split_double_load_deref(nir_intrinsic_instr *intr);

   nir_def *split_double_store_deref(nir_intrinsic_instr *intr);

private:
   nir_def *split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);

   nir_def *split_load_deref_var(nir_intrinsic_instr *intr);

   nir_def *split_store_deref_array(nir_intrinsic_instr *intr,
                                    nir_deref_instr *deref);

   nir_def *split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);

   VarSplit get_var_pair(nir_variable *old_var);

   nir_def *
   merge_64bit_loads(nir_def *load1, nir_def *load2, bool out_is_vec3);

   nir_def *split_double_load(nir_intrinsic_instr *load1);

   nir_def *split_store_output(nir_intrinsic_instr *store1);

   nir_def *split_double_load_uniform(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ssbo(nir_intrinsic_instr *intr);

   nir_def *split_double_load_ubo(nir_intrinsic_instr *intr);

   nir_def *
   split_reduction(nir_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction3(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *
   split_reduction4(nir_alu_instr *alu, nir_op op1, nir_op op2, nir_op reduction);

   nir_def *split_bcsel(nir_alu_instr *alu);

   nir_def *split_load_const(nir_load_const_instr *lc);

   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   VarMap m_varmap;
   vector<nir_variable *> m_old_vars;
   vector<nir_instr *> m_old_stores;
};

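/* Rewrite 64-bit uniform and UBO loads as 32-bit loads of twice the number
 * of components and repack the result with nir_pack_64_2x32_split. */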
class LowerLoad64Uniform : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;
};

bool
LowerLoad64Uniform::filter(const nir_instr *instr) const
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   auto intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_load_uniform &&
       intr->intrinsic != nir_intrinsic_load_ubo &&
       intr->intrinsic != nir_intrinsic_load_ubo_vec4)
      return false;

   return intr->def.bit_size == 64;
}

nir_def *
LowerLoad64Uniform::lower(nir_instr *instr)
{
   auto intr = nir_instr_as_intrinsic(instr);
   int old_components = intr->def.num_components;
   assert(old_components <= 2);
   intr->def.num_components *= 2;
   intr->def.bit_size = 32;
   intr->num_components *= 2;

   if (intr->intrinsic == nir_intrinsic_load_ubo ||
       intr->intrinsic == nir_intrinsic_load_ubo_vec4)
      nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));

   nir_def *result_vec[2] = {nullptr, nullptr};

   for (int i = 0; i < old_components; ++i) {
      result_vec[i] = nir_pack_64_2x32_split(b,
                                             nir_channel(b, &intr->def, 2 * i),
                                             nir_channel(b, &intr->def, 2 * i + 1));
   }
   if (old_components == 1)
      return result_vec[0];

   return nir_vec2(b, result_vec[0], result_vec[1]);
}

bool
r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
{
   return LowerLoad64Uniform().run(sh);
}

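/* Lower 64-bit bcsel, a few 64-bit conversions, and 64-bit phis by operating
 * on the two 32-bit halves and repacking the result. */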
class LowerSplit64op : public NirLowerInstruction {
   bool filter(const nir_instr *instr) const override
   {
      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {
         case nir_op_bcsel:
            return alu->def.bit_size == 64;
         case nir_op_f2i32:
         case nir_op_f2u32:
         case nir_op_f2i64:
         case nir_op_f2u64:
         case nir_op_u2f64:
         case nir_op_i2f64:
            return nir_src_bit_size(alu->src[0].src) == 64;
         default:
            return false;
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         return phi->def.bit_size == 64;
      }
      default:
         return false;
      }
   }

   nir_def *lower(nir_instr *instr) override
   {

      switch (instr->type) {
      case nir_instr_type_alu: {
         auto alu = nir_instr_as_alu(instr);
         switch (alu->op) {

         case nir_op_bcsel: {
            auto lo =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
            auto hi =
               nir_bcsel(b,
                         alu->src[0].src.ssa,
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
                         nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
            return nir_pack_64_2x32_split(b, lo, hi);
         }
         case nir_op_f2i32: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto abs_src = nir_fabs(b, src);
            auto value = nir_f2u32(b, abs_src);
            return nir_bcsel(b, gt0, value, nir_ineg(b, value));
         }
         case nir_op_f2u32: {
            /* fp32 doesn't hold sufficient bits to represent the full range
             * of u32, so we split the value into two 16-bit halves; because
             * f2f32 rounds, the fractional part has to be removed from the
             * high half first. For values > UINT_MAX the result is
             * undefined. */
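            /* Illustrative example (assuming exact arithmetic): for
             * src = 3e9, highval = 3e9 / 65536 = 45776.3671875, so
             * high = 45776 and low = 0.3671875 * 65536 = 24064, giving
             * (45776 << 16) | 24064 = 3000000000. */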
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            src = nir_fadd(b, src, nir_fneg(b, nir_ffract(b, src)));
            auto gt0 = nir_fgt_imm(b, src, 0.0);
            auto highval = nir_fmul_imm(b, src, 1.0 / 65536.0);
            auto fract = nir_ffract(b, highval);
            auto high = nir_f2u32(b, nir_f2f32(b, nir_fadd(b, highval, nir_fneg(b, fract))));
            auto lowval = nir_fmul_imm(b, fract, 65536.0);
            auto low = nir_f2u32(b, nir_f2f32(b, lowval));
            return nir_bcsel(b,
                             gt0,
                             nir_ior(b, nir_ishl_imm(b, high, 16), low),
                             nir_imm_int(b, 0));
         }
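         /* For u2f64/i2f64 below the 64-bit integer is reassembled as
          * value = high * 2^32 + low, converting each 32-bit half
          * separately (65536.0 * 65536.0 == 2^32). */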
         case nir_op_u2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_u2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         case nir_op_i2f64: {
            auto src = nir_ssa_for_alu_src(b, alu, 0);
            auto low = nir_unpack_64_2x32_split_x(b, src);
            auto high = nir_unpack_64_2x32_split_y(b, src);
            auto flow = nir_u2f64(b, low);
            auto fhigh = nir_i2f64(b, high);
            return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
         }
         default:
            unreachable("trying to lower instruction that was not in filter");
         }
      }
      case nir_instr_type_phi: {
         auto phi = nir_instr_as_phi(instr);
         auto phi_lo = nir_phi_instr_create(b->shader);
         auto phi_hi = nir_phi_instr_create(b->shader);
         nir_def_init(
            &phi_lo->instr, &phi_lo->def, phi->def.num_components * 2, 32);
         nir_def_init(
            &phi_hi->instr, &phi_hi->def, phi->def.num_components * 2, 32);
         nir_foreach_phi_src(s, phi)
         {
            auto lo = nir_unpack_64_2x32_split_x(b, s->src.ssa);
            auto hi = nir_unpack_64_2x32_split_y(b, s->src.ssa);
            nir_phi_instr_add_src(phi_lo, s->pred, lo);
            nir_phi_instr_add_src(phi_hi, s->pred, hi);
         }
         return nir_pack_64_2x32_split(b, &phi_lo->def, &phi_hi->def);
      }
      default:
         unreachable("Trying to lower instruction that was not in filter");
      }
   }
};

bool
r600_split_64bit_alu_and_phi(nir_shader *sh)
{
   return LowerSplit64op().run(sh);
}

bool
LowerSplit64BitVar::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
         if (intr->def.bit_size != 64)
            return false;
         return intr->def.num_components >= 3;
      case nir_intrinsic_store_output:
         if (nir_src_bit_size(intr->src[0]) != 64)
            return false;
         return nir_src_num_components(intr->src[0]) >= 3;
      case nir_intrinsic_store_deref:
         if (nir_src_bit_size(intr->src[1]) != 64)
            return false;
         return nir_src_num_components(intr->src[1]) >= 3;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bcsel:
         if (alu->def.num_components < 3)
            return false;
         return alu->def.bit_size == 64;
      case nir_op_bany_fnequal3:
      case nir_op_bany_fnequal4:
      case nir_op_ball_fequal3:
      case nir_op_ball_fequal4:
      case nir_op_bany_inequal3:
      case nir_op_bany_inequal4:
      case nir_op_ball_iequal3:
      case nir_op_ball_iequal4:
      case nir_op_fdot3:
      case nir_op_fdot4:
         return nir_src_bit_size(alu->src[1].src) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      if (lc->def.bit_size != 64)
         return false;
      return lc->def.num_components >= 3;
   }
   default:
      return false;
   }
}

nir_def *
LowerSplit64BitVar::merge_64bit_loads(nir_def *load1,
                                      nir_def *load2,
                                      bool out_is_vec3)
{
   if (out_is_vec3)
      return nir_vec3(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0),
                      nir_channel(b, load2, 1));
}

LowerSplit64BitVar::~LowerSplit64BitVar()
{
   for (auto&& v : m_old_vars)
      exec_node_remove(&v->node);

   for (auto&& v : m_old_stores)
      nir_instr_remove(v);
}

nir_def *
LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_store_deref_var(intr, deref);
   else if (deref->deref_type == nir_deref_type_array)
      return split_store_deref_array(intr, deref);
   else {
      unreachable("only splitting of stores to vars and arrays is supported");
   }
}

nir_def *
LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_load_deref_var(intr);
   else if (deref->deref_type == nir_deref_type_array)
      return split_load_deref_array(intr, deref->arr.index);
   else {
      unreachable("only splitting of loads from vars and arrays is supported");
   }
}

nir_def *
LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, index.ssa);
   auto load1 =
      nir_build_load_deref(b, 2, 64, &deref_array1->def, (enum gl_access_qualifier)0);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, index.ssa);

   auto load2 = nir_build_load_deref(
      b, old_components - 2, 64, &deref_array2->def, (enum gl_access_qualifier)0);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

nir_def *
LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr,
                                            nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 =
      nir_build_deref_array(b, deref1, deref->arr.index.ssa);

   nir_build_store_deref(b, &deref_array1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 =
      nir_build_deref_array(b, deref2, deref->arr.index.ssa);

   if (old_components == 3)
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channel(b, intr->src[1].ssa, 2),
                            1);
   else
      nir_build_store_deref(b,
                            &deref_array2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr,
                                          UNUSED nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = glsl_get_components(glsl_without_array(old_var->type));

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_trim_vector(b, intr->src[1].ssa, 2);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   nir_build_store_deref(b, &deref1->def, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   if (old_components == 3)
      nir_build_store_deref(b, &deref2->def, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b,
                            &deref2->def,
                            nir_channels(b, intr->src[1].ssa, 0xc),
                            3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_def *
LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   auto vars = get_var_pair(old_var);
   unsigned old_components = glsl_get_components(old_var->type);

   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
   auto *load1 = nir_load_deref(b, deref1);

   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
   deref2->type = vars.second->type;

   auto *load2 = nir_load_deref(b, deref2);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

LowerSplit64BitVar::VarSplit
LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
{
   auto split_vars = m_varmap.find(old_var->data.driver_location);

   assert(glsl_get_components(glsl_without_array(old_var->type)) > 2);

   if (split_vars == m_varmap.end()) {
      auto var1 = nir_variable_clone(old_var, b->shader);
      auto var2 = nir_variable_clone(old_var, b->shader);

      var1->type = glsl_dvec_type(2);
      var2->type = glsl_dvec_type(glsl_get_components(glsl_without_array(old_var->type)) - 2);

      if (glsl_type_is_array(old_var->type)) {
         var1->type = glsl_array_type(var1->type, glsl_array_size(old_var->type), 0);
         var2->type = glsl_array_type(var2->type, glsl_array_size(old_var->type), 0);
      }

      if (old_var->data.mode == nir_var_shader_in ||
          old_var->data.mode == nir_var_shader_out) {
         ++var2->data.driver_location;
         ++var2->data.location;
         nir_shader_add_variable(b->shader, var1);
         nir_shader_add_variable(b->shader, var2);
      } else if (old_var->data.mode == nir_var_function_temp) {
         exec_list_push_tail(&b->impl->locals, &var1->node);
         exec_list_push_tail(&b->impl->locals, &var2->node);
      }

      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
   }
   return m_varmap[old_var->data.driver_location];
}

nir_def *
LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
{
   unsigned old_components = load1->def.num_components;
   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);

   load1->def.num_components = 2;
   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(load1, sem);

   load2->def.num_components = old_components - 2;
   sem.location += 1;
   nir_intrinsic_set_io_semantics(load2, sem);
   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
   nir_builder_instr_insert(b, &load2->instr);

   return merge_64bit_loads(&load1->def, &load2->def, old_components == 3);
}

nir_def *
LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
{
   auto src = store1->src[0];
   unsigned old_components = nir_src_num_components(src);
   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);

   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
   auto src1 = nir_trim_vector(b, src.ssa, 2);
   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);

   nir_src_rewrite(&store1->src[0], src1);
   nir_intrinsic_set_write_mask(store1, 3);

   nir_src_rewrite(&store2->src[0], src2);
   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);

   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(store1, sem);

   sem.location += 1;
   nir_intrinsic_set_io_semantics(store2, sem);
   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));

   nir_builder_instr_insert(b, &store2->instr);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   if (second_components == 1)
      return nir_vec3(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0));
   else
      return nir_vec4(b,
                      nir_channel(b, &intr->def, 0),
                      nir_channel(b, &intr->def, 1),
                      nir_channel(b, &load2->def, 0),
                      nir_channel(b, &load2->def, 1));
}

nir_def *
LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));

   nir_src_rewrite(&load2->src[0], nir_iadd_imm(b, intr->src[0].ssa, 1));
   load2->num_components = second_components;
   nir_def_init(&load2->instr, &load2->def, second_components, 64);

   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

nir_def *
LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
{
   unsigned second_components = intr->def.num_components - 2;
   nir_intrinsic_instr *load2 =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
   load2->src[0] = intr->src[0];
   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr));

   load2->num_components = second_components;

   nir_def_init(&load2->instr, &load2->def, second_components, 64);
   nir_builder_instr_insert(b, &load2->instr);

   intr->def.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->def, &load2->def, second_components == 1);
}

nir_def *
LowerSplit64BitVar::split_reduction(nir_def *src[2][2],
                                    nir_op op1,
                                    nir_op op2,
                                    nir_op reduction)
{
   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
}

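/* split_reduction3/4 below rewrite a 3- or 4-component horizontal reduction
 * as two smaller operations plus a combining step; e.g. (a sketch of the
 * intent) fdot3(a, b) becomes fadd(fdot2(a.xy, b.xy), fmul(a.z, b.z)). */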
nir_def *
LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channel(b, alu->src[0].src.ssa, 2);
   src[1][1] = nir_channel(b, alu->src[1].src.ssa, 2);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
                                     nir_op op1,
                                     nir_op op2,
                                     nir_op reduction)
{
   nir_def *src[2][2];

   src[0][0] = nir_trim_vector(b, alu->src[0].src.ssa, 2);
   src[0][1] = nir_trim_vector(b, alu->src[1].src.ssa, 2);

   src[1][0] = nir_channels(b, alu->src[0].src.ssa, 0xc);
   src[1][1] = nir_channels(b, alu->src[1].src.ssa, 0xc);

   return split_reduction(src, op1, op2, reduction);
}

nir_def *
LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
{
   nir_def *dest[4];
   for (unsigned i = 0; i < alu->def.num_components; ++i) {
      dest[i] = nir_bcsel(b,
                          nir_channel(b, alu->src[0].src.ssa, i),
                          nir_channel(b, alu->src[1].src.ssa, i),
                          nir_channel(b, alu->src[2].src.ssa, i));
   }
   return nir_vec(b, dest, alu->def.num_components);
}

nir_def *
LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
{
   nir_def *ir[4];
   for (unsigned i = 0; i < lc->def.num_components; ++i)
      ir[i] = nir_imm_double(b, lc->value[i].f64);

   return nir_vec(b, ir, lc->def.num_components);
}

nir_def *
LowerSplit64BitVar::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return this->split_double_load_deref(intr);
      case nir_intrinsic_load_uniform:
         return split_double_load_uniform(intr);
      case nir_intrinsic_load_ubo:
         return split_double_load_ubo(intr);
      case nir_intrinsic_load_ssbo:
         return split_double_load_ssbo(intr);
      case nir_intrinsic_load_input:
         return split_double_load(intr);
      case nir_intrinsic_store_output:
         return split_store_output(intr);
      case nir_intrinsic_store_deref:
         return split_double_store_deref(intr);
      default:
         assert(0);
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bany_fnequal3:
         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
      case nir_op_ball_fequal3:
         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
      case nir_op_bany_inequal3:
         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
      case nir_op_ball_iequal3:
         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
      case nir_op_fdot3:
         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
      case nir_op_bany_fnequal4:
         return split_reduction4(alu,
                                 nir_op_bany_fnequal2,
                                 nir_op_bany_fnequal2,
                                 nir_op_ior);
      case nir_op_ball_fequal4:
         return split_reduction4(alu,
                                 nir_op_ball_fequal2,
                                 nir_op_ball_fequal2,
                                 nir_op_iand);
      case nir_op_bany_inequal4:
         return split_reduction4(alu,
                                 nir_op_bany_inequal2,
                                 nir_op_bany_inequal2,
                                 nir_op_ior);
      case nir_op_ball_iequal4:
         return split_reduction4(alu,
                                 nir_op_ball_iequal2,
                                 nir_op_ball_iequal2,
                                 nir_op_iand);
      case nir_op_fdot4:
         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
      case nir_op_bcsel:
         return split_bcsel(alu);
      default:
         assert(0);
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return split_load_const(lc);
   }
   default:
      assert(0);
   }
   return nullptr;
}

/* Split 64-bit instructions so that at most two 64-bit components are
 * used in one instruction. */

bool
r600_nir_split_64bit_io(nir_shader *sh)
{
   return LowerSplit64BitVar().run(sh);
}

/* Lower all remaining 64-bit values to pairs of 32-bit components: a 64-bit
 * scalar becomes a vec2 and a dvec2 becomes a vec4. */
class Lower64BitToVec2 : public NirLowerInstruction {

private:
   bool filter(const nir_instr *instr) const override;
   nir_def *lower(nir_instr *instr) override;

   nir_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *load_64_to_vec2(nir_intrinsic_instr *intr);
   nir_def *store_64_to_vec2(nir_intrinsic_instr *intr);
};

bool
Lower64BitToVec2::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo_vec4:
      case nir_intrinsic_load_ssbo:
         return intr->def.bit_size == 64;
      case nir_intrinsic_store_deref: {
         if (nir_src_bit_size(intr->src[1]) == 64)
            return true;
         auto var = nir_intrinsic_get_var(intr, 0);
         if (glsl_get_bit_size(glsl_without_array(var->type)) == 64)
            return true;
         return (glsl_get_components(glsl_without_array(var->type)) != intr->num_components);
      }
      case nir_intrinsic_store_global:
         return nir_src_bit_size(intr->src[0]) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      return alu->def.bit_size == 64;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      return phi->def.bit_size == 64;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return lc->def.bit_size == 64;
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      return undef->def.bit_size == 64;
   }
   default:
      return false;
   }
}

nir_def *
Lower64BitToVec2::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return load_deref_64_to_vec2(intr);
      case nir_intrinsic_load_uniform:
         return load_uniform_64_to_vec2(intr);
      case nir_intrinsic_load_ssbo:
         return load_ssbo_64_to_vec2(intr);
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         return load_64_to_vec2(intr);
      case nir_intrinsic_store_deref:
         return store_64_to_vec2(intr);
      default:
         return nullptr;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      alu->def.bit_size = 32;
      alu->def.num_components *= 2;
      switch (alu->op) {
      case nir_op_pack_64_2x32_split:
         alu->op = nir_op_vec2;
         break;
      case nir_op_pack_64_2x32:
         alu->op = nir_op_mov;
         break;
      case nir_op_vec2:
         return nir_vec4(b,
                         nir_channel(b, alu->src[0].src.ssa, 0),
                         nir_channel(b, alu->src[0].src.ssa, 1),
                         nir_channel(b, alu->src[1].src.ssa, 0),
                         nir_channel(b, alu->src[1].src.ssa, 1));
      default:
         return NULL;
      }
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      phi->def.bit_size = 32;
      phi->def.num_components = 2;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      assert(lc->def.num_components <= 2);
      nir_const_value val[4];
      for (uint i = 0; i < lc->def.num_components; ++i) {
         uint64_t v = lc->value[i].u64;
         val[i * 2 + 0] = nir_const_value_for_uint(v & 0xffffffff, 32);
         val[i * 2 + 1] = nir_const_value_for_uint(v >> 32, 32);
      }

      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
   }
   case nir_instr_type_undef: {
      auto undef = nir_instr_as_undef(instr);
      undef->def.num_components *= 2;
      undef->def.bit_size = 32;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   default:
      return nullptr;
   }
}

nir_def *
Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);
   unsigned components = glsl_get_components(glsl_without_array(var->type));
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }

   intr->num_components = components;
   intr->def.bit_size = 32;
   intr->def.num_components = components;
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);

   unsigned components = glsl_get_components(glsl_without_array(var->type));
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   if (glsl_get_bit_size(glsl_without_array(var->type)) == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type =
            glsl_array_type(glsl_vec_type(components), glsl_array_size(var->type), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = glsl_without_array(deref_array->type);
   }
   intr->num_components = components;
   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   nir_intrinsic_set_dest_type(intr, nir_type_float32);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   if (nir_intrinsic_has_component(intr))
      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_def *
Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->def.bit_size = 32;
   intr->def.num_components *= 2;
   return NIR_LOWER_INSTR_PROGRESS;
}

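/* nir_foreach_src callback: set *state to true and stop iterating as soon
 * as a 64-bit source is found. */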
static bool
store_64bit_intr(nir_src *src, void *state)
{
   bool *s = (bool *)state;
   *s = nir_src_bit_size(*src) == 64;
   return !*s;
}

static bool
double2vec2(nir_src *src, UNUSED void *state)
{
   if (nir_src_bit_size(*src) != 64)
      return true;

   src->ssa->bit_size = 32;
   src->ssa->num_components *= 2;
   return true;
}

bool
r600_nir_64_to_vec2(nir_shader *sh)
{
   vector<nir_instr *> intr64bit;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            switch (instr->type) {
            case nir_instr_type_alu: {
               bool success = false;
               nir_foreach_src(instr, store_64bit_intr, &success);
               if (success)
                  intr64bit.push_back(instr);
               break;
            }
            case nir_instr_type_intrinsic: {
               auto ir = nir_instr_as_intrinsic(instr);
               switch (ir->intrinsic) {
               case nir_intrinsic_store_output:
               case nir_intrinsic_store_global:
               case nir_intrinsic_store_ssbo: {
                  bool success = false;
                  nir_foreach_src(instr, store_64bit_intr, &success);
                  if (success) {
                     auto wm = nir_intrinsic_write_mask(ir);
                     nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
                     ir->num_components *= 2;
                  }
                  break;
               }
               default:;
               }
            }
            default:;
            }
         }
      }
   }

   bool result = Lower64BitToVec2().run(sh);

   if (result || !intr64bit.empty()) {

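      /* After the pass above, 64-bit sources have been retyped as twice as
       * many 32-bit components, so the source swizzles of the collected ALU
       * users have to be widened: component k of a 64-bit value maps to
       * 32-bit components 2k (low) and 2k+1 (high), which also lets
       * unpack_64_2x32_split_x/y become plain movs with an even/odd
       * swizzle. */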
      for (auto&& instr : intr64bit) {
         if (instr->type == nir_instr_type_alu) {
            auto alu = nir_instr_as_alu(instr);
            auto alu_info = nir_op_infos[alu->op];
            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
                  if (!nir_alu_instr_channel_used(alu, i, k)) {
                     continue;
                  }

                  switch (alu->op) {
                  case nir_op_unpack_64_2x32_split_x:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32_split_y:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32:
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_bcsel:
                     if (i == 0) {
                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
                        break;
                     }
                     FALLTHROUGH;
                  default:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
                  }
               }
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
                  alu->src[i].swizzle[k] = swizzle[k];
               }
            }
         } else
            nir_foreach_src(instr, double2vec2, nullptr);
      }
      result = true;
   }

   return result;
}

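/* Merge store_output intrinsics that write different components of the same
 * output slot into a single masked store. */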
class StoreMerger {
public:
   StoreMerger(nir_shader *shader);
   void collect_stores();
   bool combine();
   void combine_one_slot(vector<nir_intrinsic_instr *>& stores);

   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr *>>;

   StoreCombos m_stores;
   nir_shader *sh;
};

StoreMerger::StoreMerger(nir_shader *shader):
    sh(shader)
{
}

void
StoreMerger::collect_stores()
{
   unsigned vertex = 0;
   nir_foreach_function_impl(impl, sh)
   {
      nir_foreach_block(block, impl)
      {
         nir_foreach_instr_safe(instr, block)
         {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            auto ir = nir_instr_as_intrinsic(instr);
            if (ir->intrinsic == nir_intrinsic_emit_vertex ||
                ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
               ++vertex;
               continue;
            }
            if (ir->intrinsic != nir_intrinsic_store_output)
               continue;
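            /* Key stores by output slot, emitted vertex, and GS stream
             * info, so that only stores belonging to the same slot
             * instance are combined below. */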
            unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
                             8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
            m_stores[index].push_back(ir);
         }
      }
   }
}

bool
StoreMerger::combine()
{
   bool progress = false;
   for (auto&& i : m_stores) {
      if (i.second.size() < 2)
         continue;

      combine_one_slot(i.second);
      progress = true;
   }
   return progress;
}

void
StoreMerger::combine_one_slot(vector<nir_intrinsic_instr *>& stores)
{
   nir_def *srcs[4] = {nullptr};

   auto last_store = *stores.rbegin();

   nir_builder b = nir_builder_at(nir_before_instr(&last_store->instr));

   unsigned comps = 0;
   unsigned writemask = 0;
   unsigned first_comp = 4;
   for (auto&& store : stores) {
      int cmp = nir_intrinsic_component(store);
      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
         unsigned out_comp = i + cmp;
         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
         writemask |= 1 << out_comp;
         if (first_comp > out_comp)
            first_comp = out_comp;
      }
   }

   auto new_src = nir_vec(&b, srcs, comps);

   nir_src_rewrite(&last_store->src[0], new_src);
   last_store->num_components = comps;
   nir_intrinsic_set_component(last_store, first_comp);
   nir_intrinsic_set_write_mask(last_store, writemask);

   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
      nir_instr_remove(&(*i)->instr);
}

bool
r600_merge_vec2_stores(nir_shader *shader)
{
   r600::StoreMerger merger(shader);
   merger.collect_stores();
   return merger.combine();
}

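/* Standalone helper for nir_shader_instructions_pass: split one 64-bit I/O
 * intrinsic with more than two components into two intrinsics that each
 * handle at most two 64-bit components. */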
static bool
r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (instr->def.bit_size != 64)
         return false;
   } else {
      if (nir_src_bit_size(instr->src[0]) != 64)
         return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->def.num_components = 2;
      second->def.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_scalar channels[4] = {
         nir_get_scalar(&first->def, 0),
         nir_get_scalar(&first->def, 1),
         nir_get_scalar(&second->def, 0),
         nir_get_scalar(&second->def, second->num_components > 1 ? 1 : 0),
      };
      nir_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
      nir_def_rewrite_uses(&instr->def, new_ir);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_def *src0 = instr->src[0].ssa;
      nir_scalar channels[4] = {{0}};
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_get_scalar(src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_src_rewrite(&first->src[0], nir_vec_scalars(b, channels, 2));
      nir_src_rewrite(&second->src[0],
                      nir_vec_scalars(b, &channels[2], second->num_components));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_uniform:
      offset_src = 0;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_src_rewrite(&second->src[offset_src], second_offset);
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}

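/* Split a 64-bit vec3/vec4 load_const into a vec2 constant and a one- or
 * two-component remainder, then recombine the channels. */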
static bool
r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first = nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_def *new_ir = nir_vec(b, channels, num_components);
   nir_def_rewrite_uses(&instr->def, new_ir);
   nir_instr_remove(&instr->instr);

   return true;
}

static bool
r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

bool
r600_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       r600_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index | nir_metadata_dominance,
                                       NULL);
}
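
/* A minimal usage sketch (hypothetical call site, not part of this file):
 * the entry points above are plain nir_shader passes, so a driver would
 * typically run them via NIR_PASS, e.g.
 *
 *    bool progress = false;
 *    NIR_PASS(progress, sh, r600_nir_split_64bit_io);
 *    NIR_PASS(progress, sh, r600_nir_64_to_vec2);
 */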

} // end namespace r600