/* -*- mesa-c++ -*-
 *
 * Copyright (c) 2020 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sfn_nir.h"

#include "nir.h"
#include "nir_builder.h"

#include <map>
#include <vector>
#include <iostream>

namespace r600 {

using std::map;
using std::pair;
using std::make_pair;
using std::vector;

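/* Lowering pass: split 64-bit loads, stores, bcsel, load_const, and the
 * vector reduction ops (dot products, equality tests) that work on three or
 * four 64-bit components into operations that use at most two 64-bit
 * components each; the partial results are recombined with vec3/vec4. */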
class LowerSplit64BitVar : public NirLowerInstruction {
public:

   ~LowerSplit64BitVar();
   using VarSplit = pair<nir_variable*, nir_variable*>;
   using VarMap = map<unsigned, VarSplit>;

   nir_ssa_def *
   split_double_load_deref(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_store_deref(nir_intrinsic_instr *intr);

private:
   nir_ssa_def *
   split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);

   nir_ssa_def *
   split_load_deref_var(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);

   nir_ssa_def *
   split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);

   VarSplit get_var_pair(nir_variable *old_var);

   nir_ssa_def *
   merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);

   nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);

   nir_ssa_def *
   split_store_output(nir_intrinsic_instr *store1);

   nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_load_ssbo(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_load_ubo(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *
   split_reduction3(nir_alu_instr *alu,
                    nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *
   split_reduction4(nir_alu_instr *alu,
                    nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *split_bcsel(nir_alu_instr *alu);

   nir_ssa_def *split_load_const(nir_load_const_instr *lc);

   bool filter(const nir_instr *instr) const override;
   nir_ssa_def *lower(nir_instr *instr) override;

   VarMap m_varmap;
   vector<nir_variable*> m_old_vars;
   vector<nir_instr *> m_old_stores;
};

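/* Only instructions that produce or consume 64-bit values with three or
 * more components need to be split. */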
bool
LowerSplit64BitVar::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
         if (nir_dest_bit_size(intr->dest) != 64)
            return false;
         return nir_dest_num_components(intr->dest) >= 3;
      case nir_intrinsic_store_output:
         if (nir_src_bit_size(intr->src[0]) != 64)
            return false;
         return nir_src_num_components(intr->src[0]) >= 3;
      case nir_intrinsic_store_deref:
         if (nir_src_bit_size(intr->src[1]) != 64)
            return false;
         return nir_src_num_components(intr->src[1]) >= 3;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bcsel:
         if (nir_dest_num_components(alu->dest.dest) < 3)
            return false;
         return nir_dest_bit_size(alu->dest.dest) == 64;
      case nir_op_bany_fnequal3:
      case nir_op_bany_fnequal4:
      case nir_op_ball_fequal3:
      case nir_op_ball_fequal4:
      case nir_op_bany_inequal3:
      case nir_op_bany_inequal4:
      case nir_op_ball_iequal3:
      case nir_op_ball_iequal4:
      case nir_op_fdot3:
      case nir_op_fdot4:
         return nir_src_bit_size(alu->src[1].src) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      if (lc->def.bit_size != 64)
         return false;
      return lc->def.num_components >= 3;
   }
   default:
      return false;
   }
}

nir_ssa_def *
LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
                                      nir_ssa_def *load2, bool out_is_vec3)
{
   if (out_is_vec3)
      return nir_vec3(b, nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0));
   else
      return nir_vec4(b, nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0),
                      nir_channel(b, load2, 1));
}

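/* The destructor removes any variables and store instructions that were
 * recorded for deletion while lowering. */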
LowerSplit64BitVar::~LowerSplit64BitVar()
{
   for(auto&& v: m_old_vars)
      exec_node_remove(&v->node);

   for(auto&& v: m_old_stores)
      nir_instr_remove(v);
}

nir_ssa_def *
LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_store_deref_var(intr, deref);
   else if (deref->deref_type == nir_deref_type_array)
      return split_store_deref_array(intr, deref);
   else {
      unreachable("only splitting of stores to vars and arrays is supported");
   }
}

nir_ssa_def *
LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_load_deref_var(intr);
   else if (deref->deref_type == nir_deref_type_array)
      return split_load_deref_array(intr, deref->arr.index);
   else {
      unreachable("only splitting of loads from vars and arrays is supported");
   }
   m_old_stores.push_back(&intr->instr);
}

nir_ssa_def *
LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
   auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));

   auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

nir_ssa_def *
LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));

   nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));

   if (old_components == 3)
      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_ssa_def *
LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   if (old_components == 3)
      nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_ssa_def *
LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   auto vars = get_var_pair(old_var);
   unsigned old_components = old_var->type->components();

   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
   auto *load1 = nir_load_deref(b, deref1);

   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
   deref2->type = vars.second->type;

   auto *load2 = nir_load_deref(b, deref2);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

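/* Look up (or lazily create) the pair of variables that replaces a 64-bit
 * variable with more than two components: the first holds the xy channels,
 * the second the remaining one or two channels.  For shader inputs and
 * outputs the second variable occupies the following location. */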
LowerSplit64BitVar::VarSplit
LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
{
   auto split_vars = m_varmap.find(old_var->data.driver_location);

   assert(old_var->type->without_array()->components() > 2);

   if (split_vars == m_varmap.end()) {
      auto var1 = nir_variable_clone(old_var, b->shader);
      auto var2 = nir_variable_clone(old_var, b->shader);

      var1->type = glsl_dvec_type(2);
      var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);

      if (old_var->type->is_array()) {
         var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
         var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
      }

      if (old_var->data.mode == nir_var_shader_in ||
          old_var->data.mode == nir_var_shader_out) {
         ++var2->data.driver_location;
         ++var2->data.location;
         nir_shader_add_variable(b->shader, var1);
         nir_shader_add_variable(b->shader, var2);
      } else if (old_var->data.mode == nir_var_function_temp) {
         exec_list_push_tail(&b->impl->locals, &var1->node);
         exec_list_push_tail(&b->impl->locals, &var2->node);
      }

      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
   }
   return m_varmap[old_var->data.driver_location];
}

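/* Split a 64-bit I/O load (e.g. load_input) in place: the original load is
 * narrowed to two components and a clone reads the following slot. */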
nir_ssa_def *
LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
{
   unsigned old_components = nir_dest_num_components(load1->dest);
   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);

   load1->dest.ssa.num_components = 2;
   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(load1, sem);

   load2->dest.ssa.num_components = old_components - 2;
   sem.location += 1;
   nir_intrinsic_set_io_semantics(load2, sem);
   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
   nir_builder_instr_insert(b, &load2->instr);

   return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
}


nir_ssa_def *
LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
{
   auto src = store1->src[0];
   unsigned old_components = nir_src_num_components(src);
   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);

   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
   auto src1 = nir_channels(b, src.ssa, 3);
   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);

   nir_instr_rewrite_src(&store1->instr, &src, nir_src_for_ssa(src1));
   nir_intrinsic_set_write_mask(store1, 3);

   nir_instr_rewrite_src(&store2->instr, &src, nir_src_for_ssa(src2));
   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);

   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(store1, sem);

   sem.location += 1;
   nir_intrinsic_set_io_semantics(store2, sem);
   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));

   nir_builder_instr_insert(b, &store2->instr);
   return NIR_LOWER_INSTR_PROGRESS;
}


nir_ssa_def *
LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   load2->num_components = second_components;

   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   if (second_components == 1)
      return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
                      nir_channel(b, &intr->dest.ssa, 1),
                      nir_channel(b, &load2->dest.ssa, 0));
   else
      return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
                      nir_channel(b, &intr->dest.ssa, 1),
                      nir_channel(b, &load2->dest.ssa, 0),
                      nir_channel(b, &load2->dest.ssa, 1));
}

nir_ssa_def *
LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));

   auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
   load2->num_components = second_components;
   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);

   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
}


nir_ssa_def *
LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
   load2->src[0] = intr->src[0];
   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);

   load2->num_components = second_components;

   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
}

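/* Reductions (dot products, vector equality tests) are rebuilt from a
 * two-component operation on the xy channels, a second operation on the
 * remaining channel(s), and a final reduction of the two results. */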
nir_ssa_def *
LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
{
   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
}

nir_ssa_def *
LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
                                     nir_op op1, nir_op op2, nir_op reduction)
{
   nir_ssa_def *src[2][2];

   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);

   src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
   src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);

   return split_reduction(src, op1, op2, reduction);
}

nir_ssa_def *
LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
                                     nir_op op1, nir_op op2, nir_op reduction)
{
   nir_ssa_def *src[2][2];

   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);

   src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
   src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);

   return split_reduction(src, op1, op2, reduction);
}

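/* bcsel is split per component; the scalar results are recombined into a
 * vector of the original width. */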
nir_ssa_def *
LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
{
   static nir_ssa_def *dest[4];
   for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
      dest[i] = nir_bcsel(b,
                          nir_channel(b, alu->src[0].src.ssa, i),
                          nir_channel(b, alu->src[1].src.ssa, i),
                          nir_channel(b, alu->src[2].src.ssa, i));
   }
   return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
}

nir_ssa_def *
LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
{
   nir_ssa_def *ir[4];
   for (unsigned i = 0; i < lc->def.num_components; ++i)
      ir[i] = nir_imm_double(b, lc->value[i].f64);

   return nir_vec(b, ir, lc->def.num_components);
}

nir_ssa_def *
LowerSplit64BitVar::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return this->split_double_load_deref(intr);
      case nir_intrinsic_load_uniform:
         return split_double_load_uniform(intr);
      case nir_intrinsic_load_ubo:
         return split_double_load_ubo(intr);
      case nir_intrinsic_load_ssbo:
         return split_double_load_ssbo(intr);
      case nir_intrinsic_load_input:
         return split_double_load(intr);
      case nir_intrinsic_store_output:
         return split_store_output(intr);
      case nir_intrinsic_store_deref:
         return split_double_store_deref(intr);
      default:
         assert(0);
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      switch (alu->op) {
      case nir_op_bany_fnequal3:
         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
      case nir_op_ball_fequal3:
         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
      case nir_op_bany_inequal3:
         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
      case nir_op_ball_iequal3:
         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
      case nir_op_fdot3:
         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
      case nir_op_bany_fnequal4:
         return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
      case nir_op_ball_fequal4:
         return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
      case nir_op_bany_inequal4:
         return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
      case nir_op_ball_iequal4:
         return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
      case nir_op_fdot4:
         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
      case nir_op_bcsel:
         return split_bcsel(alu);
      default:
         assert(0);
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return split_load_const(lc);
   }
   default:
      assert(0);
   }
   return nullptr;
}

/* Split 64-bit instructions so that each resulting instruction uses at most
 * two 64-bit components. */

bool
r600_nir_split_64bit_io(nir_shader *sh)
{
   return LowerSplit64BitVar().run(sh);
}

/* Rewrite every remaining 64-bit value as a vector of 32-bit components
 * with twice the width: destinations are widened in place, variable types
 * are adjusted, and pack/unpack operations become plain moves. */
class Lower64BitToVec2 : public NirLowerInstruction {

private:
   bool filter(const nir_instr *instr) const override;
   nir_ssa_def *lower(nir_instr *instr) override;

   nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
};

bool
Lower64BitToVec2::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
      case nir_intrinsic_load_ssbo:
         return nir_dest_bit_size(intr->dest) == 64;
      case nir_intrinsic_store_deref: {
         if (nir_src_bit_size(intr->src[1]) == 64)
            return true;
         auto var = nir_intrinsic_get_var(intr, 0);
         if (var->type->without_array()->bit_size() == 64)
            return true;
         return (var->type->without_array()->components() != intr->num_components);
      }
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      return nir_dest_bit_size(alu->dest.dest) == 64;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      return nir_dest_bit_size(phi->dest) == 64;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return lc->def.bit_size == 64;
   }
   case nir_instr_type_ssa_undef: {
      auto undef = nir_instr_as_ssa_undef(instr);
      return undef->def.bit_size == 64;
   }
   default:
      return false;
   }
}

nir_ssa_def *
Lower64BitToVec2::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return load_deref_64_to_vec2(intr);
      case nir_intrinsic_load_uniform:
         return load_uniform_64_to_vec2(intr);
      case nir_intrinsic_load_ssbo:
         return load_ssbo_64_to_vec2(intr);
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         return load_64_to_vec2(intr);
      case nir_intrinsic_store_deref:
         return store_64_to_vec2(intr);
      default:
         return nullptr;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      alu->dest.dest.ssa.bit_size = 32;
      alu->dest.dest.ssa.num_components *= 2;
      alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
      switch (alu->op) {
      case nir_op_pack_64_2x32_split:
         alu->op = nir_op_vec2;
         break;
      case nir_op_pack_64_2x32:
         alu->op = nir_op_mov;
         break;
      case nir_op_vec2:
         return nir_vec4(b,
                         nir_channel(b, alu->src[0].src.ssa, 0),
                         nir_channel(b, alu->src[0].src.ssa, 1),
                         nir_channel(b, alu->src[1].src.ssa, 0),
                         nir_channel(b, alu->src[1].src.ssa, 1));
      default:
         return NULL;
      }
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      phi->dest.ssa.bit_size = 32;
      phi->dest.ssa.num_components = 2;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      assert(lc->def.num_components < 3);
      nir_const_value val[4] = {0};
      for (uint i = 0; i < lc->def.num_components; ++i) {
         uint64_t v = lc->value[i].u64;
         val[2 * i].u32 = v & 0xffffffff;
         val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
      }

      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
   }
   case nir_instr_type_ssa_undef: {
      auto undef = nir_instr_as_ssa_undef(instr);
      undef->def.num_components *= 2;
      undef->def.bit_size = 32;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   default:
      return nullptr;
   }
}

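/* For 64-bit variable access the variable itself is retyped to a 32-bit
 * vector with twice the number of components, and the deref chain and the
 * intrinsic are adjusted to match. */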
nir_ssa_def *
Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);
   unsigned components = var->type->without_array()->components();
   if (var->type->without_array()->bit_size() == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type = glsl_array_type(glsl_vec_type(components),
                                     var->type->array_size(), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = deref_array->type->without_array();
   }

   intr->num_components = components;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components = components;
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);

   unsigned components = var->type->without_array()->components();
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   if (var->type->without_array()->bit_size() == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type = glsl_array_type(glsl_vec_type(components),
                                     var->type->array_size(), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = deref_array->type->without_array();
   }
   intr->num_components = components;
   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
   return NIR_LOWER_INSTR_PROGRESS;
}


nir_ssa_def *
Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   nir_intrinsic_set_dest_type(intr, nir_type_float32);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   return NIR_LOWER_INSTR_PROGRESS;
}

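/* Helper callbacks for nir_foreach_src: detect whether an instruction reads
 * a 64-bit source, and widen 64-bit SSA sources to 32-bit vec2 in place. */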
static bool store_64bit_intr(nir_src *src, void *state)
{
   bool *s = (bool *)state;
   *s = nir_src_bit_size(*src) == 64;
   return !*s;
}

static bool double2vec2(nir_src *src, void *state)
{
   if (nir_src_bit_size(*src) != 64)
      return true;

   assert(src->is_ssa);
   src->ssa->bit_size = 32;
   src->ssa->num_components *= 2;
   return true;
}

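/* Convert 64-bit operations to pairs of 32-bit components: first widen the
 * write masks of 64-bit output/SSBO stores and remember the ALU
 * instructions that consume 64-bit values, then run Lower64BitToVec2, and
 * finally rewrite the swizzles (and unpack ops) of the recorded ALU
 * instructions. */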
bool
r600_nir_64_to_vec2(nir_shader *sh)
{
   vector<nir_instr*> intr64bit;
   nir_foreach_function(function, sh) {
      if (function->impl) {
         nir_builder b;
         nir_builder_init(&b, function->impl);

         nir_foreach_block(block, function->impl) {
            nir_foreach_instr_safe(instr, block) {
               switch (instr->type) {
               case nir_instr_type_alu: {
                  bool success = false;
                  nir_foreach_src(instr, store_64bit_intr, &success);
                  if (success)
                     intr64bit.push_back(instr);
                  break;
               }
               case nir_instr_type_intrinsic: {
                  auto ir = nir_instr_as_intrinsic(instr);
                  switch (ir->intrinsic) {
                  case nir_intrinsic_store_output:
                  case nir_intrinsic_store_ssbo: {
                     bool success = false;
                     nir_foreach_src(instr, store_64bit_intr, &success);
                     if (success) {
                        auto wm = nir_intrinsic_write_mask(ir);
                        nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
                        ir->num_components *= 2;
                     }
                     break;
                  }
                  default:
                     ;
                  }
               }
               default:
                  ;
               }
            }
         }
      }
   }

   bool result = Lower64BitToVec2().run(sh);

   if (result || !intr64bit.empty()) {

      for(auto&& instr: intr64bit) {
         if (instr->type == nir_instr_type_alu) {
            auto alu = nir_instr_as_alu(instr);
            auto alu_info = nir_op_infos[alu->op];
            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
                  if (!nir_alu_instr_channel_used(alu, i, k)) {
                     continue;
                  }

                  switch (alu->op) {
                  case nir_op_unpack_64_2x32_split_x:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32_split_y:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32:
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_bcsel:
                     if (i == 0) {
                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
                        break;
                     }
                     FALLTHROUGH;
                  default:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
                  }
               }
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
                  alu->src[i].swizzle[k] = swizzle[k];
               }
            }
         } else
            nir_foreach_src(instr, double2vec2, nullptr);
      }
      result = true;
   }

   return result;
}

using std::map;
using std::vector;
using std::pair;

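/* Merge multiple store_output instructions that write different components
 * of the same output slot (tracked per emitted vertex and stream for
 * geometry shaders) into a single store. */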
class StoreMerger {
public:
   StoreMerger(nir_shader *shader);
   void collect_stores();
   bool combine();
   void combine_one_slot(vector<nir_intrinsic_instr*>& stores);

   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;

   StoreCombos m_stores;
   nir_shader *sh;
};

StoreMerger::StoreMerger(nir_shader *shader):
   sh(shader)
{
}


void StoreMerger::collect_stores()
{
   unsigned vertex = 0;
   nir_foreach_function(function, sh) {
      if (function->impl) {
         nir_foreach_block(block, function->impl) {
            nir_foreach_instr_safe(instr, block) {
               if (instr->type != nir_instr_type_intrinsic)
                  continue;

               auto ir = nir_instr_as_intrinsic(instr);
               if (ir->intrinsic == nir_intrinsic_emit_vertex ||
                   ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
                  ++vertex;
                  continue;
               }
               if (ir->intrinsic != nir_intrinsic_store_output)
                  continue;

               unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
                                8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
               m_stores[index].push_back(ir);
            }
         }
      }
   }
}

bool StoreMerger::combine()
{
   bool progress = false;
   for(auto&& i : m_stores) {
      if (i.second.size() < 2)
         continue;

      combine_one_slot(i.second);
      progress = true;
   }
   return progress;
}

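/* Rewrite the last store of a slot to write all collected components at
 * once and remove the other stores. */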
void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
{
   nir_ssa_def *srcs[4] = {nullptr};

   nir_builder b;
   nir_builder_init(&b, nir_shader_get_entrypoint(sh));
   auto last_store = *stores.rbegin();

   b.cursor = nir_before_instr(&last_store->instr);

   unsigned comps = 0;
   unsigned writemask = 0;
   unsigned first_comp = 4;
   for (auto&& store : stores) {
      int cmp = nir_intrinsic_component(store);
      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
         unsigned out_comp = i + cmp;
         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
         writemask |= 1 << out_comp;
         if (first_comp > out_comp)
            first_comp = out_comp;
      }
   }

   auto new_src = nir_vec(&b, srcs, comps);

   nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
   last_store->num_components = comps;
   nir_intrinsic_set_component(last_store, first_comp);
   nir_intrinsic_set_write_mask(last_store, writemask);

   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
      nir_instr_remove(&(*i)->instr);
}

bool r600_merge_vec2_stores(nir_shader *shader)
{
   r600::StoreMerger merger(shader);
   merger.collect_stores();
   return merger.combine();
}

} // end namespace r600