1 /* -*- mesa-c++  -*-
2  *
3  * Copyright (c) 2020 Collabora LTD
4  *
5  * Author: Gert Wollny <gert.wollny@collabora.com>
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * on the rights to use, copy, modify, merge, publish, distribute, sub
11  * license, and/or sell copies of the Software, and to permit persons to whom
12  * the Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24  * USE OR OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 #include "sfn_nir.h"
28 
29 #include "nir.h"
30 #include "nir_builder.h"
31 
32 #include <map>
33 #include <vector>
34 #include <iostream>
35 
36 namespace r600 {
37 
38 using std::map;
39 using std::pair;
40 using std::make_pair;
41 using std::vector;
42 
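/* Split 64 bit load, store, reduction, bcsel and constant instructions that
 * use three or four components into a two component part plus a one or two
 * component remainder, so that later passes only see 64 bit values with at
 * most two components. */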
43 class LowerSplit64BitVar : public NirLowerInstruction {
44 public:
45 
46    ~LowerSplit64BitVar();
47    using VarSplit = pair<nir_variable*, nir_variable*>;
48    using VarMap = map<unsigned, VarSplit>;
49 
50    nir_ssa_def *
51    split_double_load_deref(nir_intrinsic_instr *intr);
52 
53    nir_ssa_def *
54    split_double_store_deref(nir_intrinsic_instr *intr);
55 
56 private:
57    nir_ssa_def *
58    split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);
59 
60    nir_ssa_def *
61    split_load_deref_var(nir_intrinsic_instr *intr);
62 
63    nir_ssa_def *
64    split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);
65 
66    nir_ssa_def *
67    split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);
68 
69    VarSplit get_var_pair(nir_variable *old_var);
70 
71    nir_ssa_def *
72    merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);
73 
74    nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);
75 
76    nir_ssa_def *
77    split_store_output(nir_intrinsic_instr *store1);
78 
79    nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);
80 
81    nir_ssa_def *
82    split_double_load_ssbo(nir_intrinsic_instr *intr);
83 
84    nir_ssa_def *
85    split_double_load_ubo(nir_intrinsic_instr *intr);
86 
87    nir_ssa_def *
88    split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);
89 
90    nir_ssa_def *
91    split_reduction3(nir_alu_instr *alu,
92                     nir_op op1, nir_op op2, nir_op reduction);
93 
94    nir_ssa_def *
95    split_reduction4(nir_alu_instr *alu,
96                     nir_op op1, nir_op op2, nir_op reduction);
97 
98    nir_ssa_def *split_bcsel(nir_alu_instr *alu);
99 
100    nir_ssa_def *split_load_const(nir_load_const_instr *lc);
101 
102    bool filter(const nir_instr *instr) const override;
103    nir_ssa_def *lower(nir_instr *instr) override;
104 
105    VarMap m_varmap;
106    vector<nir_variable*> m_old_vars;
107    vector<nir_instr *> m_old_stores;
108 };
109 
110 
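/* Rewrite 64 bit uniform, UBO and ubo_vec4 loads as 32 bit loads with twice
 * the number of components and repack the results into 64 bit values. */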
111 class LowerLoad64Uniform : public NirLowerInstruction {
112    bool filter(const nir_instr *instr) const override;
113    nir_ssa_def *lower(nir_instr *instr) override;
114 };
115 
116 bool LowerLoad64Uniform::filter(const nir_instr *instr) const
117 {
118    if (instr->type != nir_instr_type_intrinsic)
119       return false;
120 
121    auto intr = nir_instr_as_intrinsic(instr);
122    if (intr->intrinsic != nir_intrinsic_load_uniform &&
123        intr->intrinsic != nir_intrinsic_load_ubo &&
124        intr->intrinsic != nir_intrinsic_load_ubo_vec4)
125       return false;
126 
127    return nir_dest_bit_size(intr->dest) == 64;
128 }
129 
130 
131 nir_ssa_def *LowerLoad64Uniform::lower(nir_instr *instr)
132 {
133    auto intr = nir_instr_as_intrinsic(instr);
134    int old_components = nir_dest_num_components(intr->dest);
135    assert(old_components <= 2);
136    assert(intr->dest.is_ssa);
137    intr->dest.ssa.num_components *= 2;
138    intr->dest.ssa.bit_size = 32;
139    intr->num_components *= 2;
140 
141    if (intr->intrinsic == nir_intrinsic_load_ubo ||
142        intr->intrinsic == nir_intrinsic_load_ubo_vec4)
143       nir_intrinsic_set_component(intr, 2 * nir_intrinsic_component(intr));
144 
145    nir_ssa_def *result_vec[2] = {nullptr, nullptr};
146 
147    for (int i = 0; i < old_components; ++i) {
148       result_vec[i] = nir_pack_64_2x32_split(b,
149                                              nir_channel(b, &intr->dest.ssa, 2 * i),
150                                              nir_channel(b, &intr->dest.ssa, 2 * i + 1));
151    }
152    if (old_components == 1)
153       return result_vec[0];
154 
155    return nir_vec2(b, result_vec[0], result_vec[1]);
156 }
157 
158 bool r600_split_64bit_uniforms_and_ubo(nir_shader *sh)
159 {
160    return LowerLoad64Uniform().run(sh);
161 }
162 
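/* Lower 64 bit bcsel and phi instructions and a number of conversions with
 * double sources by working on the 32 bit low and high halves of the
 * values. */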
163 class LowerSplit64op : public NirLowerInstruction {
164    bool filter(const nir_instr *instr) const override {
165       switch (instr->type) {
166       case nir_instr_type_alu: {
167          auto alu = nir_instr_as_alu(instr);
168          switch (alu->op) {
169          case nir_op_bcsel:
170             return nir_dest_bit_size(alu->dest.dest) == 64;
171          case nir_op_f2b1:
172          case nir_op_f2i32:
173          case nir_op_f2u32:
174          case nir_op_f2i64:
175          case nir_op_f2u64:
176          case nir_op_u2f64:
177          case nir_op_i2f64:
178             return nir_src_bit_size(alu->src[0].src) == 64;
179          default:
180             return false;
181          }
182       }
183       case nir_instr_type_phi: {
184          auto phi = nir_instr_as_phi(instr);
185          return nir_dest_num_components(phi->dest) == 64;
186       }
187       default:
188          return false;
189       }
190    }
191 
192    nir_ssa_def *lower(nir_instr *instr) override {
193 
194       switch (instr->type) {
195       case nir_instr_type_alu: {
196          auto alu = nir_instr_as_alu(instr);
197          switch (alu->op) {
198 
199          case nir_op_bcsel: {
200             auto lo = nir_bcsel(b, nir_ssa_for_src(b, alu->src[0].src, 1),
201                   nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 1)),
202                   nir_unpack_64_2x32_split_x(b, nir_ssa_for_alu_src(b, alu, 2)));
203             auto hi = nir_bcsel(b, nir_ssa_for_src(b, alu->src[0].src, 1),
204                   nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 1)),
205                   nir_unpack_64_2x32_split_y(b, nir_ssa_for_alu_src(b, alu, 2)));
206             return nir_pack_64_2x32_split(b, lo, hi);
207          }
208          case nir_op_f2b1: {
209             auto mask = nir_component_mask(nir_dest_num_components(alu->dest.dest));
210             return nir_fneu(b, nir_channels(b, nir_ssa_for_alu_src(b, alu, 0), mask),
211                   nir_imm_zero(b, nir_dest_num_components(alu->dest.dest), 64));
212          }
213          case nir_op_f2i32: {
214             auto src = nir_ssa_for_alu_src(b, alu, 0);
215             auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
216             auto abs_src = nir_fabs(b, src);
217             auto value = nir_f2u32(b, abs_src);
218             return nir_bcsel(b, gt0, value, nir_ineg(b, value));
219          }
220          case nir_op_f2u32: {
221             /* fp32 doesn't hold sufficient bits to represent the full range
222              * of u32, therefore we have to split the values, and because
223              * f2f32 rounds, we have to remove the fractional part in the
224              * high bits. For values > UINT_MAX the result is undefined. */
225             auto src = nir_ssa_for_alu_src(b, alu, 0);
226             auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
227             auto highval = nir_fmul_imm(b, src, 1.0/65536.0);
228             auto fract = nir_ffract(b, highval);
229             auto high = nir_f2u32(b, nir_f2f32(b, nir_fsub(b, highval, fract)));
230             auto lowval = nir_fmul_imm(b, fract, 65536.0);
231             auto low = nir_f2u32(b, nir_f2f32(b, lowval));
232             return nir_bcsel(b, gt0, nir_ior(b, nir_ishl_imm(b, high, 16), low),
233                              nir_imm_int(b, 0));
234          }
235          case nir_op_f2i64: {
236             auto src = nir_ssa_for_alu_src(b, alu, 0);
237             auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
238             auto abs_src = nir_fabs(b, src);
239             auto value = nir_f2u64(b, abs_src);
240             return nir_bcsel(b, gt0, value, nir_isub(b, nir_imm_zero(b, 1, 64), value));
241          }
242          case nir_op_f2u64: {
243             auto src = nir_ssa_for_alu_src(b, alu, 0);
244             auto gt0 = nir_flt(b, nir_imm_double(b, 0.0), src);
245             auto highval = nir_fmul_imm(b, src, 1.0/(65536.0 * 65536.0));
246             auto fract = nir_ffract(b, highval);
247             auto high = nir_f2u32(b, nir_fsub(b, highval, fract));
248             auto low = nir_f2u32(b, nir_fmul_imm(b, fract, 65536.0 * 65536.0));
249             return nir_bcsel(b, gt0, nir_pack_64_2x32_split(b, low, high),
250                              nir_imm_zero(b, 1, 64));
251          }
252          case nir_op_u2f64: {
253             auto src = nir_ssa_for_alu_src(b, alu, 0);
254             auto low = nir_unpack_64_2x32_split_x(b, src);
255             auto high = nir_unpack_64_2x32_split_y(b, src);
256             auto flow = nir_u2f64(b, low);
257             auto fhigh = nir_u2f64(b, high);
258             return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
259          }
260          case nir_op_i2f64: {
261             auto src = nir_ssa_for_alu_src(b, alu, 0);
262             auto low = nir_unpack_64_2x32_split_x(b, src);
263             auto high = nir_unpack_64_2x32_split_y(b, src);
264             auto flow = nir_u2f64(b, low);
265             auto fhigh = nir_i2f64(b, high);
266             return nir_fadd(b, nir_fmul_imm(b, fhigh, 65536.0 * 65536.0), flow);
267          }
268          default:
269             unreachable("trying to lower instruction that was not in filter");
270          }
271       }
272       case nir_instr_type_phi: {
273          auto phi = nir_instr_as_phi(instr);
274          auto phi_lo = nir_phi_instr_create(b->shader);
275          auto phi_hi = nir_phi_instr_create(b->shader);
276          nir_ssa_dest_init(&phi_lo->instr, &phi_lo->dest, phi->dest.ssa.num_components * 2, 32, "");
277          nir_ssa_dest_init(&phi_hi->instr, &phi_hi->dest, phi->dest.ssa.num_components * 2, 32, "");
278          nir_foreach_phi_src(s, phi) {
279             auto lo = nir_unpack_64_2x32_split_x(b, nir_ssa_for_src(b, s->src, 1));
280             auto hi = nir_unpack_64_2x32_split_y(b, nir_ssa_for_src(b, s->src, 1));
281             nir_phi_instr_add_src(phi_lo, s->pred, nir_src_for_ssa(lo));
282             nir_phi_instr_add_src(phi_hi, s->pred, nir_src_for_ssa(hi));
283          }
284          return nir_pack_64_2x32_split(b, &phi_lo->dest.ssa, &phi_hi->dest.ssa);
285       }
286       default:
287          unreachable("Trying to lower instruction that was not in filter");
288       }
289    }
290 };
291 
292 bool r600_split_64bit_alu_and_phi(nir_shader *sh)
293 {
294    return LowerSplit64op().run(sh);
295 }
296 
297 
298 bool
299 LowerSplit64BitVar::filter(const nir_instr *instr) const
300 {
301    switch (instr->type) {
302    case  nir_instr_type_intrinsic: {
303       auto intr = nir_instr_as_intrinsic(instr);
304 
305       switch (intr->intrinsic) {
306       case nir_intrinsic_load_deref:
307       case nir_intrinsic_load_uniform:
308       case nir_intrinsic_load_input:
309       case nir_intrinsic_load_ubo:
310       case nir_intrinsic_load_ssbo:
311          if (nir_dest_bit_size(intr->dest) != 64)
312             return false;
313          return nir_dest_num_components(intr->dest) >= 3;
314       case nir_intrinsic_store_output:
315          if (nir_src_bit_size(intr->src[0]) != 64)
316             return false;
317          return nir_src_num_components(intr->src[0]) >= 3;
318       case nir_intrinsic_store_deref:
319          if (nir_src_bit_size(intr->src[1]) != 64)
320             return false;
321          return nir_src_num_components(intr->src[1]) >= 3;
322       default:
323          return false;
324       }
325    }
326    case  nir_instr_type_alu: {
327       auto alu = nir_instr_as_alu(instr);
328       switch (alu->op) {
329       case nir_op_bcsel:
330          if (nir_dest_num_components(alu->dest.dest) < 3)
331             return false;
332          return nir_dest_bit_size(alu->dest.dest) == 64;
333       case nir_op_bany_fnequal3:
334       case nir_op_bany_fnequal4:
335       case nir_op_ball_fequal3:
336       case nir_op_ball_fequal4:
337       case nir_op_bany_inequal3:
338       case nir_op_bany_inequal4:
339       case nir_op_ball_iequal3:
340       case nir_op_ball_iequal4:
341       case nir_op_fdot3:
342       case nir_op_fdot4:
343          return nir_src_bit_size(alu->src[1].src) == 64;
344       default:
345          return false;
346       }
347    }
348    case nir_instr_type_load_const: {
349       auto lc = nir_instr_as_load_const(instr);
350       if (lc->def.bit_size != 64)
351          return false;
352       return lc->def.num_components >= 3;
353    }
354    default:
355       return false;
356    }
357 }
358 
359 nir_ssa_def *
360 LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
361                                       nir_ssa_def *load2, bool out_is_vec3)
362 {
363    if (out_is_vec3)
364       return nir_vec3(b, nir_channel(b, load1, 0),
365                       nir_channel(b, load1, 1),
366                       nir_channel(b, load2, 0));
367    else
368       return nir_vec4(b, nir_channel(b, load1, 0),
369                       nir_channel(b, load1, 1),
370                       nir_channel(b, load2, 0),
371                       nir_channel(b, load2, 1));
372 }
373 
374 LowerSplit64BitVar::~LowerSplit64BitVar()
375 {
376    for(auto&& v: m_old_vars)
377       exec_node_remove(&v->node);
378 
379    for(auto&& v: m_old_stores)
380       nir_instr_remove(v);
381 }
382 
383 nir_ssa_def *
384 LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
385 {
386    auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
387    if (deref->deref_type == nir_deref_type_var)
388       return split_store_deref_var(intr, deref);
389    else if (deref->deref_type == nir_deref_type_array)
390       return split_store_deref_array(intr, deref);
391    else {
392       unreachable("only splitting of stores to vars and arrays is supported");
393    }
394 }
395 
396 nir_ssa_def *
397 LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
398 {
399    auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
400    if (deref->deref_type == nir_deref_type_var)
401       return split_load_deref_var(intr);
402    else if (deref->deref_type == nir_deref_type_array)
403       return split_load_deref_array(intr, deref->arr.index);
404    else {
405       unreachable("only splitting of loads from vars and arrays is supported");
406    }
407    m_old_stores.push_back(&intr->instr);
408 }
409 
410 nir_ssa_def *
411 LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
412 {
413    auto old_var = nir_intrinsic_get_var(intr, 0);
414    unsigned old_components = old_var->type->without_array()->components();
415 
416    assert(old_components > 2 && old_components <= 4);
417 
418    auto vars = get_var_pair(old_var);
419 
420    auto deref1 = nir_build_deref_var(b, vars.first);
421    auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
422    auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);
423 
424    auto deref2 = nir_build_deref_var(b, vars.second);
425    auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));
426 
427    auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);
428 
429    return merge_64bit_loads(load1, load2, old_components == 3);
430 }
431 
432 nir_ssa_def *
433 LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
434 {
435    auto old_var = nir_intrinsic_get_var(intr, 0);
436    unsigned old_components = old_var->type->without_array()->components();
437 
438    assert(old_components > 2 && old_components <= 4);
439 
440    auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
441 
442    auto vars = get_var_pair(old_var);
443 
444    auto deref1 = nir_build_deref_var(b, vars.first);
445    auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));
446 
447    nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);
448 
449    auto deref2 = nir_build_deref_var(b, vars.second);
450    auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));
451 
452    if (old_components == 3)
453       nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
454    else
455       nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
456 
457    return NIR_LOWER_INSTR_PROGRESS_REPLACE;
458 }
459 
460 nir_ssa_def *
461 LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, UNUSED nir_deref_instr *deref)
462 {
463    auto old_var = nir_intrinsic_get_var(intr, 0);
464    unsigned old_components = old_var->type->without_array()->components();
465 
466    assert(old_components > 2 && old_components <= 4);
467 
468    auto src_xy = nir_channels(b, intr->src[1].ssa, 3);
469 
470    auto vars = get_var_pair(old_var);
471 
472    auto deref1 = nir_build_deref_var(b, vars.first);
473    nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);
474 
475    auto deref2 = nir_build_deref_var(b, vars.second);
476    if (old_components == 3)
477       nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
478    else
479       nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);
480 
481    return NIR_LOWER_INSTR_PROGRESS_REPLACE;
482 }
483 
484 nir_ssa_def *
485 LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
486 {
487    auto old_var = nir_intrinsic_get_var(intr, 0);
488    auto vars = get_var_pair(old_var);
489    unsigned old_components = old_var->type->components();
490 
491    nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
492    auto *load1 = nir_load_deref(b, deref1);
493 
494    nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
495    deref2->type = vars.second->type;
496 
497    auto *load2 = nir_load_deref(b, deref2);
498 
499    return merge_64bit_loads(load1, load2, old_components == 3);
500 }
501 
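/* Look up or create the pair of variables that replaces a 64 bit vec3/vec4
 * variable: the first holds the xy components as a dvec2, the second the
 * remaining one or two components. Pairs are cached by the driver location
 * of the original variable. */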
502 LowerSplit64BitVar::VarSplit
503 LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
504 {
505    auto split_vars = m_varmap.find(old_var->data.driver_location);
506 
507    assert(old_var->type->without_array()->components() > 2);
508 
509    if (split_vars == m_varmap.end()) {
510       auto var1 = nir_variable_clone(old_var, b->shader);
511       auto var2 = nir_variable_clone(old_var, b->shader);
512 
513       var1->type = glsl_dvec_type(2);
514       var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);
515 
516       if (old_var->type->is_array()) {
517          var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
518          var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
519       }
520 
521       if (old_var->data.mode == nir_var_shader_in ||
522           old_var->data.mode == nir_var_shader_out) {
523          ++var2->data.driver_location;
524          ++var2->data.location;
525          nir_shader_add_variable(b->shader, var1);
526          nir_shader_add_variable(b->shader, var2);
527       } else if (old_var->data.mode == nir_var_function_temp) {
528          exec_list_push_tail(&b->impl->locals, &var1->node);
529          exec_list_push_tail(&b->impl->locals, &var2->node);
530       }
531 
532       m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
533    }
534    return m_varmap[old_var->data.driver_location];
535 }
536 
537 
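/* Split a 64 bit vec3/vec4 input load into two loads; the cloned load reads
 * the following I/O slot and provides the remaining components. */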
538 nir_ssa_def *
539 LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
540 {
541    unsigned old_components = nir_dest_num_components(load1->dest);
542    auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
543    nir_io_semantics sem = nir_intrinsic_io_semantics(load1);
544 
545    load1->dest.ssa.num_components = 2;
546    sem.num_slots = 1;
547    nir_intrinsic_set_io_semantics(load1, sem);
548 
549    load2->dest.ssa.num_components = old_components - 2;
550    sem.location += 1;
551    nir_intrinsic_set_io_semantics(load2, sem);
552    nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
553    nir_builder_instr_insert(b, &load2->instr);
554 
555    return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
556 }
557 
558 
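/* Split a 64 bit vec3/vec4 output store into two stores, the second one
 * writing the remaining components to the next I/O location. */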
559 nir_ssa_def *
560 LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
561 {
562    auto src = store1->src[0];
563    unsigned old_components = nir_src_num_components(src);
564    nir_io_semantics sem = nir_intrinsic_io_semantics(store1);
565 
566    auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
567    auto src1 = nir_channels(b, src.ssa, 3);
568    auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);
569 
570    nir_instr_rewrite_src(&store1->instr, &src, nir_src_for_ssa(src1));
571    nir_intrinsic_set_write_mask(store1, 3);
572 
573    nir_instr_rewrite_src(&store2->instr, &src, nir_src_for_ssa(src2));
574    nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);
575 
576    sem.num_slots = 1;
577    nir_intrinsic_set_io_semantics(store1, sem);
578 
579    sem.location += 1;
580    nir_intrinsic_set_io_semantics(store2, sem);
581    nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));
582 
583    nir_builder_instr_insert(b, &store2->instr);
584    return NIR_LOWER_INSTR_PROGRESS;
585 }
586 
587 
588 nir_ssa_def *
589 LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
590 {
591    unsigned second_components = nir_dest_num_components(intr->dest) - 2;
592    nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
593    load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
594    nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
595    nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
596    nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
597    load2->num_components = second_components;
598 
599    nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
600    nir_builder_instr_insert(b, &load2->instr);
601 
602    intr->dest.ssa.num_components = intr->num_components = 2;
603 
604    if (second_components == 1)
605       return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
606                       nir_channel(b, &intr->dest.ssa, 1),
607                       nir_channel(b, &load2->dest.ssa, 0));
608    else
609       return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
610                       nir_channel(b, &intr->dest.ssa, 1),
611                       nir_channel(b, &load2->dest.ssa, 0),
612                       nir_channel(b, &load2->dest.ssa, 1));
613 }
614 
615 nir_ssa_def *
616 LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
617 {
618    unsigned second_components = nir_dest_num_components(intr->dest) - 2;
619    nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
620 
621    auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
622    nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
623    load2->num_components = second_components;
624    nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
625 
626    nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
627    nir_builder_instr_insert(b, &load2->instr);
628 
629    intr->dest.ssa.num_components = intr->num_components = 2;
630 
631    return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
632 }
633 
634 
635 nir_ssa_def *
636 LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
637 {
638    unsigned second_components = nir_dest_num_components(intr->dest) - 2;
639    nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
640    load2->src[0] = intr->src[0];
641    load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
642    nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
643    nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
644    nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
645    nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
646    nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);
647 
648    load2->num_components = second_components;
649 
650    nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
651    nir_builder_instr_insert(b, &load2->instr);
652 
653    intr->dest.ssa.num_components = intr->num_components = 2;
654 
655    return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
656 }
657 
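/* Helpers to split horizontal reductions (dot products and ball/bany
 * comparisons) of 64 bit vec3/vec4 sources: op1 handles the xy part, op2 the
 * remaining component(s), and 'reduction' combines the two partial results. */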
658 nir_ssa_def *
659 LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
660 {
661    auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
662    auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
663    return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
664 }
665 
666 nir_ssa_def *
667 LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
668                                      nir_op op1, nir_op op2, nir_op reduction)
669 {
670    nir_ssa_def *src[2][2];
671 
672    src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
673    src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
674 
675    src[1][0]  = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
676    src[1][1]  = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);
677 
678    return split_reduction(src, op1, op2, reduction);
679 }
680 
681 nir_ssa_def *
682 LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
683                                      nir_op op1, nir_op op2, nir_op reduction)
684 {
685    nir_ssa_def *src[2][2];
686 
687    src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
688    src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);
689 
690    src[1][0]  = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
691    src[1][1]  = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);
692 
693    return split_reduction(src, op1, op2, reduction);
694 }
695 
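/* Lower a 64 bit bcsel with three or four components into per-channel bcsel
 * instructions and recombine the results into one vector. */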
696 nir_ssa_def *
697 LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
698 {
699    nir_ssa_def *dest[4];
700    for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
701       dest[i] = nir_bcsel(b,
702                           nir_channel(b, alu->src[0].src.ssa, i),
703                           nir_channel(b, alu->src[1].src.ssa, i),
704                           nir_channel(b, alu->src[2].src.ssa, i));
705    }
706    return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
707 }
708 
709 nir_ssa_def *
710 LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
711 {
712    nir_ssa_def *ir[4];
713    for (unsigned i = 0; i < lc->def.num_components; ++i)
714       ir[i] = nir_imm_double(b, lc->value[i].f64);
715 
716    return nir_vec(b, ir, lc->def.num_components);
717 }
718 
719 nir_ssa_def *
720 LowerSplit64BitVar::lower(nir_instr *instr)
721 {
722    switch (instr->type) {
723    case nir_instr_type_intrinsic: {
724       auto intr = nir_instr_as_intrinsic(instr);
725       switch (intr->intrinsic) {
726       case nir_intrinsic_load_deref:
727          return this->split_double_load_deref(intr);
728       case nir_intrinsic_load_uniform:
729          return split_double_load_uniform(intr);
730       case nir_intrinsic_load_ubo:
731          return split_double_load_ubo(intr);
732       case nir_intrinsic_load_ssbo:
733          return split_double_load_ssbo(intr);
734       case nir_intrinsic_load_input:
735          return split_double_load(intr);
736       case nir_intrinsic_store_output:
737          return split_store_output(intr);
738       case nir_intrinsic_store_deref:
739          return split_double_store_deref(intr);
740       default:
741          assert(0);
742       }
743    }
744    case  nir_instr_type_alu: {
745       auto alu = nir_instr_as_alu(instr);
746       switch (alu->op) {
747       case nir_op_bany_fnequal3:
748          return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
749       case nir_op_ball_fequal3:
750          return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
751       case nir_op_bany_inequal3:
752          return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
753       case nir_op_ball_iequal3:
754          return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
755       case nir_op_fdot3:
756          return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
757       case nir_op_bany_fnequal4:
758          return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
759       case nir_op_ball_fequal4:
760          return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
761       case nir_op_bany_inequal4:
762          return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
763       case nir_op_ball_iequal4:
764          return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
765       case nir_op_fdot4:
766          return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
767       case nir_op_bcsel:
768          return split_bcsel(alu);
769       default:
770          assert(0);
771       }
772    }
773    case nir_instr_type_load_const: {
774       auto lc = nir_instr_as_load_const(instr);
775       return split_load_const(lc);
776    }
777    default:
778       assert(0);
779    }
780    return nullptr;
781 }
782 
783 /* Split 64 bit instructions so that at most two 64 bit components are
784  * used in one instruction */
785 
786 bool
787 r600_nir_split_64bit_io(nir_shader *sh)
788 {
789    return LowerSplit64BitVar().run(sh);
790 }
791 
792 /* Lower the remaining 64 bit values to 32 bit values with twice the number of components */
793 class Lower64BitToVec2 : public NirLowerInstruction {
794 
795 private:
796    bool filter(const nir_instr *instr) const override;
797    nir_ssa_def *lower(nir_instr *instr) override;
798 
799    nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
800    nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
801    nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
802    nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
803    nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
804 };
805 
806 bool
807 Lower64BitToVec2::filter(const nir_instr *instr) const
808 {
809    switch (instr->type) {
810    case nir_instr_type_intrinsic:  {
811       auto intr = nir_instr_as_intrinsic(instr);
812 
813       switch (intr->intrinsic) {
814       case nir_intrinsic_load_deref:
815       case nir_intrinsic_load_input:
816       case nir_intrinsic_load_uniform:
817       case nir_intrinsic_load_ubo:
818       case nir_intrinsic_load_ubo_vec4:
819       case nir_intrinsic_load_ssbo:
820          return nir_dest_bit_size(intr->dest) == 64;
821       case nir_intrinsic_store_deref: {
822          if (nir_src_bit_size(intr->src[1]) == 64)
823             return true;
824          auto var = nir_intrinsic_get_var(intr, 0);
825          if (var->type->without_array()->bit_size() == 64)
826             return true;
827          return (var->type->without_array()->components() != intr->num_components);
828       }
829       default:
830          return false;
831       }
832    }
833    case nir_instr_type_alu: {
834       auto alu = nir_instr_as_alu(instr);
835       return nir_dest_bit_size(alu->dest.dest) == 64;
836    }
837    case nir_instr_type_phi: {
838       auto phi = nir_instr_as_phi(instr);
839       return nir_dest_bit_size(phi->dest) == 64;
840    }
841    case nir_instr_type_load_const:  {
842       auto lc = nir_instr_as_load_const(instr);
843       return lc->def.bit_size == 64;
844    }
845    case nir_instr_type_ssa_undef:  {
846       auto undef = nir_instr_as_ssa_undef(instr);
847       return undef->def.bit_size == 64;
848    }
849    default:
850       return false;
851    }
852 }
853 
854 nir_ssa_def *
855 Lower64BitToVec2::lower(nir_instr *instr)
856 {
857    switch (instr->type) {
858    case nir_instr_type_intrinsic:  {
859       auto intr = nir_instr_as_intrinsic(instr);
860       switch (intr->intrinsic) {
861       case nir_intrinsic_load_deref:
862          return load_deref_64_to_vec2(intr);
863       case nir_intrinsic_load_uniform:
864          return load_uniform_64_to_vec2(intr);
865       case nir_intrinsic_load_ssbo:
866          return load_ssbo_64_to_vec2(intr);
867       case nir_intrinsic_load_input:
868       case nir_intrinsic_load_ubo:
869       case nir_intrinsic_load_ubo_vec4:
870          return load_64_to_vec2(intr);
871       case nir_intrinsic_store_deref:
872          return store_64_to_vec2(intr);
873       default:
874 
875          return nullptr;
876       }
877    }
878    case nir_instr_type_alu: {
879       auto alu = nir_instr_as_alu(instr);
880       alu->dest.dest.ssa.bit_size = 32;
881       alu->dest.dest.ssa.num_components *= 2;
882       alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
883       switch (alu->op) {
884       case nir_op_pack_64_2x32_split:
885          alu->op = nir_op_vec2;
886          break;
887       case nir_op_pack_64_2x32:
888          alu->op = nir_op_mov;
889          break;
890       case nir_op_vec2:
891          return nir_vec4(b,
892                          nir_channel(b, alu->src[0].src.ssa, 0),
893                          nir_channel(b, alu->src[0].src.ssa, 1),
894                          nir_channel(b, alu->src[1].src.ssa, 0),
895                          nir_channel(b, alu->src[1].src.ssa, 1));
896       default:
897          return NULL;
898       }
899       return NIR_LOWER_INSTR_PROGRESS;
900    }
901    case nir_instr_type_phi: {
902       auto phi = nir_instr_as_phi(instr);
903       phi->dest.ssa.bit_size = 32;
904       phi->dest.ssa.num_components = 2;
905       return NIR_LOWER_INSTR_PROGRESS;
906    }
907    case nir_instr_type_load_const:  {
908       auto lc = nir_instr_as_load_const(instr);
909       assert(lc->def.num_components < 3);
910       nir_const_value val[4] = {0};
911       for (uint i = 0; i < lc->def.num_components; ++i) {
912          uint64_t v = lc->value[i].u64;
913          val[2 * i].u32 = v & 0xffffffff;
914          val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
915       }
916 
917       return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
918    }
919    case nir_instr_type_ssa_undef:  {
920       auto undef = nir_instr_as_ssa_undef(instr);
921       undef->def.num_components *= 2;
922       undef->def.bit_size = 32;
923       return NIR_LOWER_INSTR_PROGRESS;
924    }
925    default:
926       return nullptr;
927    }
928 
929 }
930 
931 
932 nir_ssa_def *
933 Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
934 {
935    auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
936    auto var = nir_intrinsic_get_var(intr, 0);
937    unsigned components = var->type->without_array()->components();
938    if (var->type->without_array()->bit_size() == 64) {
939       components *= 2;
940       if (deref->deref_type == nir_deref_type_var) {
941          var->type = glsl_vec_type(components);
942       } else if (deref->deref_type == nir_deref_type_array) {
943 
944          var->type = glsl_array_type(glsl_vec_type(components),
945                                      var->type->array_size(), 0);
946 
947       } else {
948          nir_print_shader(b->shader, stderr);
949          assert(0 && "Only lowring of var and array derefs supported\n");
950       }
951    }
952    deref->type = var->type;
953    if (deref->deref_type == nir_deref_type_array) {
954       auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
955       deref_array->type = var->type;
956       deref->type = deref_array->type->without_array();
957    }
958 
959    intr->num_components = components;
960    intr->dest.ssa.bit_size = 32;
961    intr->dest.ssa.num_components = components;
962    return NIR_LOWER_INSTR_PROGRESS;
963 }
964 
965 nir_ssa_def *
966 Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
967 {
968    auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
969    auto var = nir_intrinsic_get_var(intr, 0);
970 
971    unsigned components = var->type->without_array()->components();
972    unsigned wrmask = nir_intrinsic_write_mask(intr);
973    if (var->type->without_array()->bit_size() == 64) {
974       components *= 2;
975       if (deref->deref_type == nir_deref_type_var) {
976          var->type = glsl_vec_type(components);
977       } else if (deref->deref_type == nir_deref_type_array) {
978          var->type = glsl_array_type(glsl_vec_type(components),
979                                      var->type->array_size(), 0);
980       } else {
981          nir_print_shader(b->shader, stderr);
982          assert(0 && "Only lowering of var and array derefs supported\n");
983       }
984    }
985    deref->type = var->type;
986    if (deref->deref_type == nir_deref_type_array) {
987       auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
988       deref_array->type = var->type;
989       deref->type = deref_array->type->without_array();
990    }
991    intr->num_components = components;
992    nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
993    return NIR_LOWER_INSTR_PROGRESS;
994 }
995 
996 
997 nir_ssa_def *
998 Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
999 {
1000    intr->num_components *= 2;
1001    intr->dest.ssa.bit_size = 32;
1002    intr->dest.ssa.num_components *= 2;
1003    nir_intrinsic_set_dest_type(intr, nir_type_float32);
1004    return NIR_LOWER_INSTR_PROGRESS;
1005 }
1006 
1007 nir_ssa_def *
1008 Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
1009 {
1010    intr->num_components *= 2;
1011    intr->dest.ssa.bit_size = 32;
1012    intr->dest.ssa.num_components *= 2;
1013    nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
1014    return NIR_LOWER_INSTR_PROGRESS;
1015 }
1016 
1017 nir_ssa_def *
1018 Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
1019 {
1020    intr->num_components *= 2;
1021    intr->dest.ssa.bit_size = 32;
1022    intr->dest.ssa.num_components *= 2;
1023    return NIR_LOWER_INSTR_PROGRESS;
1024 }
1025 
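/* Callbacks for nir_foreach_src: the first records whether an instruction has
 * a 64 bit source, the second rewrites 64 bit SSA sources in place as 32 bit
 * values with twice the number of components. */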
1026 static bool store_64bit_intr(nir_src *src, void *state)
1027 {
1028    bool *s = (bool *)state;
1029    *s = nir_src_bit_size(*src) == 64;
1030    return !*s;
1031 }
1032 
1033 static bool double2vec2(nir_src *src, UNUSED void *state)
1034 {
1035    if (nir_src_bit_size(*src) != 64)
1036       return true;
1037 
1038    assert(src->is_ssa);
1039    src->ssa->bit_size = 32;
1040    src->ssa->num_components *= 2;
1041    return true;
1042 }
1043 
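/* Convert the remaining 64 bit values to 32 bit vec2 pairs: 64 bit output and
 * SSBO stores get their write mask and component count adjusted up front, and
 * ALU instructions with 64 bit sources are recorded so that their source
 * swizzles can be rewritten after the Lower64BitToVec2 pass has run. */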
1044 bool
1045 r600_nir_64_to_vec2(nir_shader *sh)
1046 {
1047    vector<nir_instr*> intr64bit;
1048    nir_foreach_function(function, sh) {
1049       if (function->impl) {
1050          nir_builder b;
1051          nir_builder_init(&b, function->impl);
1052 
1053          nir_foreach_block(block, function->impl) {
1054             nir_foreach_instr_safe(instr, block) {
1055                switch (instr->type) {
1056                case nir_instr_type_alu: {
1057                   bool success = false;
1058                   nir_foreach_src(instr, store_64bit_intr, &success);
1059                   if (success)
1060                      intr64bit.push_back(instr);
1061                   break;
1062                }
1063                case nir_instr_type_intrinsic: {
1064                   auto ir = nir_instr_as_intrinsic(instr);
1065                   switch (ir->intrinsic) {
1066                   case nir_intrinsic_store_output:
1067                   case nir_intrinsic_store_ssbo: {
1068                      bool success = false;
1069                      nir_foreach_src(instr, store_64bit_intr, &success);
1070                      if (success) {
1071                         auto wm = nir_intrinsic_write_mask(ir);
1072                         nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
1073                         ir->num_components *= 2;
1074                      }
1075                      break;
1076                   }
1077                   default:
1078                      ;
1079                   }
1080                }
1081                default:
1082                   ;
1083                }
1084             }
1085          }
1086       }
1087    }
1088 
1089    bool result = Lower64BitToVec2().run(sh);
1090 
1091    if (result || !intr64bit.empty()) {
1092 
1093       for(auto&& instr: intr64bit) {
1094          if (instr->type == nir_instr_type_alu) {
1095             auto alu = nir_instr_as_alu(instr);
1096             auto alu_info = nir_op_infos[alu->op];
1097             for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
1098                int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
1099                for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
1100                   if (!nir_alu_instr_channel_used(alu, i, k)) {
1101                      continue;
1102                   }
1103 
1104                   switch (alu->op) {
1105                   case nir_op_unpack_64_2x32_split_x:
1106                      swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1107                      alu->op = nir_op_mov;
1108                      break;
1109                   case nir_op_unpack_64_2x32_split_y:
1110                      swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
1111                      alu->op = nir_op_mov;
1112                      break;
1113                   case nir_op_unpack_64_2x32:
1114                      alu->op = nir_op_mov;
1115                      break;
1116                   case nir_op_bcsel:
1117                      if (i == 0) {
1118                         swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
1119                         break;
1120                      }
1121                      FALLTHROUGH;
1122                   default:
1123                      swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
1124                      swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
1125                   }
1126                }
1127                for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
1128                   alu->src[i].swizzle[k] = swizzle[k];
1129                }
1130             }
1131          } else
1132             nir_foreach_src(instr, double2vec2, nullptr);
1133       }
1134       result = true;
1135    }
1136 
1137    return result;
1138 }
1139 
1140 using std::map;
1141 using std::vector;
1142 using std::pair;
1143 
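/* Merge store_output instructions that write to the same output slot (taking
 * the emitted vertex and GS stream into account) into a single store with a
 * combined write mask. */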
1144 class StoreMerger {
1145 public:
1146    StoreMerger(nir_shader *shader);
1147    void collect_stores();
1148    bool combine();
1149    void combine_one_slot(vector<nir_intrinsic_instr*>& stores);
1150 
1151    using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;
1152 
1153    StoreCombos m_stores;
1154    nir_shader *sh;
1155 };
1156 
1157 StoreMerger::StoreMerger(nir_shader *shader):
1158    sh(shader)
1159 {
1160 }
1161 
1162 
1163 void StoreMerger::collect_stores()
1164 {
1165    unsigned vertex = 0;
1166    nir_foreach_function(function, sh) {
1167       if (function->impl) {
1168          nir_foreach_block(block, function->impl) {
1169             nir_foreach_instr_safe(instr, block) {
1170                if (instr->type != nir_instr_type_intrinsic)
1171                   continue;
1172 
1173                auto ir = nir_instr_as_intrinsic(instr);
1174                if (ir->intrinsic == nir_intrinsic_emit_vertex ||
1175                    ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
1176                   ++vertex;
1177                   continue;
1178                }
1179                if (ir->intrinsic != nir_intrinsic_store_output)
1180                   continue;
1181 
1182                unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
1183                                 8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
1184                m_stores[index].push_back(ir);
1185             }
1186          }
1187       }
1188    }
1189 }
1190 
1191 bool StoreMerger::combine()
1192 {
1193    bool progress = false;
1194    for(auto&& i : m_stores) {
1195       if (i.second.size() < 2)
1196          continue;
1197 
1198       combine_one_slot(i.second);
1199       progress = true;
1200    }
1201    return progress;
1202 }
1203 
1204 void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
1205 {
1206    nir_ssa_def *srcs[4] = {nullptr};
1207 
1208    nir_builder b;
1209    nir_builder_init(&b, nir_shader_get_entrypoint(sh));
1210    auto last_store = *stores.rbegin();
1211 
1212    b.cursor = nir_before_instr(&last_store->instr);
1213 
1214    unsigned comps = 0;
1215    unsigned writemask = 0;
1216    unsigned first_comp = 4;
1217    for (auto&& store : stores) {
1218       int cmp = nir_intrinsic_component(store);
1219       for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
1220          unsigned out_comp = i + cmp;
1221          srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
1222          writemask |= 1 << out_comp;
1223          if (first_comp > out_comp)
1224             first_comp = out_comp;
1225       }
1226    }
1227 
1228    auto new_src = nir_vec(&b, srcs, comps);
1229 
1230    nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
1231    last_store->num_components = comps;
1232    nir_intrinsic_set_component(last_store, first_comp);
1233    nir_intrinsic_set_write_mask(last_store, writemask);
1234 
1235    for (auto i = stores.begin(); i != stores.end() - 1; ++i)
1236       nir_instr_remove(&(*i)->instr);
1237 }
1238 
1239 bool r600_merge_vec2_stores(nir_shader *shader)
1240 {
1241    r600::StoreMerger merger(shader);
1242    merger.collect_stores();
1243    return merger.combine();
1244 }
1245 
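/* Split a 64 bit load or store intrinsic with more than two components into
 * two intrinsics of at most two components each, adjusting the I/O semantics,
 * base and offset of the second half accordingly. */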
1246 static bool
1247 r600_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
1248 {
1249    b->cursor = nir_after_instr(&instr->instr);
1250 
1251    switch (instr->intrinsic) {
1252    case nir_intrinsic_load_ubo:
1253    case nir_intrinsic_load_ubo_vec4:
1254    case nir_intrinsic_load_uniform:
1255    case nir_intrinsic_load_ssbo:
1256    case nir_intrinsic_load_input:
1257    case nir_intrinsic_load_interpolated_input:
1258    case nir_intrinsic_load_per_vertex_input:
1259    case nir_intrinsic_store_output:
1260    case nir_intrinsic_store_per_vertex_output:
1261    case nir_intrinsic_store_ssbo:
1262       break;
1263    default:
1264       return false;
1265    }
1266 
1267    if (instr->num_components <= 2)
1268       return false;
1269 
1270    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
1271    if (has_dest) {
1272       if (nir_dest_bit_size(instr->dest) != 64)
1273          return false;
1274    } else  {
1275       if (nir_src_bit_size(instr->src[0]) != 64)
1276           return false;
1277    }
1278 
1279    nir_intrinsic_instr *first =
1280       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1281    nir_intrinsic_instr *second =
1282       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
1283 
1284    switch (instr->intrinsic) {
1285    case nir_intrinsic_load_ubo:
1286    case nir_intrinsic_load_ubo_vec4:
1287    case nir_intrinsic_load_uniform:
1288    case nir_intrinsic_load_ssbo:
1289    case nir_intrinsic_store_ssbo:
1290       break;
1291 
1292    default: {
1293       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
1294       semantics.location++;
1295       semantics.num_slots--;
1296       nir_intrinsic_set_io_semantics(second, semantics);
1297 
1298       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
1299       break;
1300    }
1301    }
1302 
1303    first->num_components = 2;
1304    second->num_components -= 2;
1305    if (has_dest) {
1306       first->dest.ssa.num_components = 2;
1307       second->dest.ssa.num_components -= 2;
1308    }
1309 
1310    nir_builder_instr_insert(b, &first->instr);
1311    nir_builder_instr_insert(b, &second->instr);
1312 
1313    if (has_dest) {
1314       /* Merge the two loads' results back into a vector. */
1315       nir_ssa_scalar channels[4] = {
1316          nir_get_ssa_scalar(&first->dest.ssa, 0),
1317          nir_get_ssa_scalar(&first->dest.ssa, 1),
1318          nir_get_ssa_scalar(&second->dest.ssa, 0),
1319          nir_get_ssa_scalar(&second->dest.ssa, second->num_components > 1 ? 1 : 0),
1320       };
1321       nir_ssa_def *new_ir = nir_vec_scalars(b, channels, instr->num_components);
1322       nir_ssa_def_rewrite_uses(&instr->dest.ssa, new_ir);
1323    } else {
1324       /* Split the src value across the two stores. */
1325       b->cursor = nir_before_instr(&instr->instr);
1326 
1327       nir_ssa_def *src0 = instr->src[0].ssa;
1328       nir_ssa_scalar channels[4] = { 0 };
1329       for (int i = 0; i < instr->num_components; i++)
1330          channels[i] = nir_get_ssa_scalar(src0, i);
1331 
1332       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
1333       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
1334 
1335       nir_instr_rewrite_src(&first->instr, &first->src[0],
1336                             nir_src_for_ssa(nir_vec_scalars(b, channels, 2)));
1337       nir_instr_rewrite_src(&second->instr, &second->src[0],
1338                             nir_src_for_ssa(nir_vec_scalars(b, &channels[2],
1339                                                            second->num_components)));
1340    }
1341 
1342    int offset_src = -1;
1343    uint32_t offset_amount = 16;
1344 
1345    switch (instr->intrinsic) {
1346    case nir_intrinsic_load_ssbo:
1347    case nir_intrinsic_load_ubo:
1348       offset_src = 1;
1349       break;
1350    case nir_intrinsic_load_ubo_vec4:
1351    case nir_intrinsic_load_uniform:
1352       offset_src = 0;
1353       offset_amount = 1;
1354       break;
1355    case nir_intrinsic_store_ssbo:
1356       offset_src = 2;
1357       break;
1358    default:
1359       break;
1360    }
1361    if (offset_src != -1) {
1362       b->cursor = nir_before_instr(&second->instr);
1363       nir_ssa_def *second_offset =
1364          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
1365       nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
1366                             nir_src_for_ssa(second_offset));
1367    }
1368 
1369    /* DCE stores we generated with no writemask (nothing else does this
1370     * currently).
1371     */
1372    if (!has_dest) {
1373       if (nir_intrinsic_write_mask(first) == 0)
1374          nir_instr_remove(&first->instr);
1375       if (nir_intrinsic_write_mask(second) == 0)
1376          nir_instr_remove(&second->instr);
1377    }
1378 
1379    nir_instr_remove(&instr->instr);
1380 
1381    return true;
1382 }
1383 
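/* Split a 64 bit load_const with more than two components into two constants
 * and recombine the results into a vector of the original size. */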
1384 static bool
1385 r600_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
1386 {
1387    int num_components = instr->def.num_components;
1388 
1389    if (instr->def.bit_size != 64 || num_components <= 2)
1390       return false;
1391 
1392    b->cursor = nir_before_instr(&instr->instr);
1393 
1394    nir_load_const_instr *first =
1395       nir_load_const_instr_create(b->shader, 2, 64);
1396    nir_load_const_instr *second =
1397       nir_load_const_instr_create(b->shader, num_components - 2, 64);
1398 
1399    first->value[0] = instr->value[0];
1400    first->value[1] = instr->value[1];
1401    second->value[0] = instr->value[2];
1402    if (num_components == 4)
1403       second->value[1] = instr->value[3];
1404 
1405    nir_builder_instr_insert(b, &first->instr);
1406    nir_builder_instr_insert(b, &second->instr);
1407 
1408    nir_ssa_def *channels[4] = {
1409       nir_channel(b, &first->def, 0),
1410       nir_channel(b, &first->def, 1),
1411       nir_channel(b, &second->def, 0),
1412       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
1413    };
1414    nir_ssa_def *new_ir = nir_vec(b, channels, num_components);
1415    nir_ssa_def_rewrite_uses(&instr->def, new_ir);
1416    nir_instr_remove(&instr->instr);
1417 
1418    return true;
1419 }
1420 
1421 static bool
1422 r600_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr, void *data)
1423 {
1424    switch (instr->type) {
1425    case nir_instr_type_load_const:
1426       return r600_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
1427 
1428    case nir_instr_type_intrinsic:
1429       return r600_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
1430    default:
1431       return false;
1432    }
1433 }
1434 
1435 bool
1436 r600_lower_64bit_to_vec2(nir_shader *s)
1437 {
1438    return nir_shader_instructions_pass(s,
1439                                        r600_lower_64bit_to_vec2_instr,
1440                                        nir_metadata_block_index |
1441                                        nir_metadata_dominance,
1442                                        NULL);
1443 }
1444 
1445 
1446 } // end namespace r600
1447 
1448 
1449