/* -*- mesa-c++ -*-
 *
 * Copyright (c) 2019 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sfn_nir.h"

#include "../r600_asm.h"
#include "../r600_pipe.h"
#include "../r600_shader.h"

#include "nir.h"
#include "nir_builder.h"
#include "nir_intrinsics.h"
#include "sfn_assembler.h"
#include "sfn_debug.h"
#include "sfn_instr_tex.h"
#include "sfn_liverangeevaluator.h"
#include "sfn_nir_lower_alu.h"
#include "sfn_nir_lower_fs_out_to_vector.h"
#include "sfn_nir_lower_tex.h"
#include "sfn_optimizer.h"
#include "sfn_ra.h"
#include "sfn_scheduler.h"
#include "sfn_shader.h"
#include "sfn_split_address_loads.h"
#include "util/u_debug.h"
#include "util/u_prim.h"

#include <vector>

namespace r600 {

using std::vector;

NirLowerInstruction::NirLowerInstruction():
   b(nullptr)
{
}

bool
NirLowerInstruction::filter_instr(const nir_instr *instr, const void *data)
{
   auto me = reinterpret_cast<const NirLowerInstruction *>(data);
   return me->filter(instr);
}

nir_def *
NirLowerInstruction::lower_instr(nir_builder *b, nir_instr *instr, void *data)
{
   auto me = reinterpret_cast<NirLowerInstruction *>(data);
   me->set_builder(b);
   return me->lower(instr);
}

bool
NirLowerInstruction::run(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, filter_instr, lower_instr, (void *)this);
}

AssemblyFromShader::~AssemblyFromShader() {}

bool
AssemblyFromShader::lower(const Shader& ir)
{
   return do_lower(ir);
}

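/* Rewrite the address source of a scratch load/store: the original
 * address is shifted right by 4 * num_components of the access before
 * it is handed to the backend. */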
static void
r600_nir_lower_scratch_address_impl(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_before_instr(&instr->instr);

   int address_index = 0;
   int align;

   if (instr->intrinsic == nir_intrinsic_store_scratch) {
      align = instr->src[0].ssa->num_components;
      address_index = 1;
   } else {
      align = instr->def.num_components;
   }

   nir_def *address = instr->src[address_index].ssa;
   nir_def *new_address = nir_ishr_imm(b, address, 4 * align);

   nir_src_rewrite(&instr->src[address_index], new_address);
}

bool
r600_lower_scratch_addresses(nir_shader *shader)
{
   bool progress = false;
   nir_foreach_function_impl(impl, shader)
   {
      nir_builder build = nir_builder_create(impl);

      nir_foreach_block(block, impl)
      {
         nir_foreach_instr(instr, block)
         {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
            if (op->intrinsic != nir_intrinsic_load_scratch &&
                op->intrinsic != nir_intrinsic_store_scratch)
               continue;
            r600_nir_lower_scratch_address_impl(&build, op);
            progress = true;
         }
      }
   }
   return progress;
}

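/* Insert new_var into var_list so that the list stays sorted by
 * (binding, offset). */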
static void
insert_uniform_sorted(struct exec_list *var_list, nir_variable *new_var)
{
   nir_foreach_variable_in_list(var, var_list)
   {
      if (var->data.binding > new_var->data.binding ||
          (var->data.binding == new_var->data.binding &&
           var->data.offset > new_var->data.offset)) {
         exec_node_insert_node_before(&var->node, &new_var->node);
         return;
      }
   }
   exec_list_push_tail(var_list, &new_var->node);
}

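/* Pull all uniform variables out of the shader and re-append them in
 * (binding, offset) order. */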
void
sort_uniforms(nir_shader *shader)
{
   struct exec_list new_list;
   exec_list_make_empty(&new_list);

   nir_foreach_uniform_variable_safe(var, shader)
   {
      exec_node_remove(&var->node);
      insert_uniform_sorted(&new_list, var);
   }
   exec_list_append(&shader->variables, &new_list);
}

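/* Sorted insert for fragment shader outputs; the order established here
 * determines the driver_location values assigned in sort_fsoutput()
 * below. */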
static void
insert_fsoutput_sorted(struct exec_list *var_list, nir_variable *new_var)
{

   nir_foreach_variable_in_list(var, var_list)
   {
      if ((var->data.location >= FRAG_RESULT_DATA0 ||
           var->data.location == FRAG_RESULT_COLOR) &&
          (new_var->data.location < FRAG_RESULT_COLOR ||
           new_var->data.location == FRAG_RESULT_SAMPLE_MASK)) {
         exec_node_insert_after(&var->node, &new_var->node);
         return;
      } else if ((new_var->data.location >= FRAG_RESULT_DATA0 ||
                  new_var->data.location == FRAG_RESULT_COLOR) &&
                 (var->data.location < FRAG_RESULT_COLOR ||
                  var->data.location == FRAG_RESULT_SAMPLE_MASK)) {
         exec_node_insert_node_before(&var->node, &new_var->node);
         return;
      } else if (var->data.location > new_var->data.location ||
                 (var->data.location == new_var->data.location &&
                  var->data.index > new_var->data.index)) {
         exec_node_insert_node_before(&var->node, &new_var->node);
         return;
      }
   }

   exec_list_push_tail(var_list, &new_var->node);
}

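/* Sort the fragment shader outputs and assign consecutive driver
 * locations in the new order. */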
void
sort_fsoutput(nir_shader *shader)
{
   struct exec_list new_list;
   exec_list_make_empty(&new_list);

   nir_foreach_shader_out_variable_safe(var, shader)
   {
      exec_node_remove(&var->node);
      insert_fsoutput_sorted(&new_list, var);
   }

   unsigned driver_location = 0;
   nir_foreach_variable_in_list(var, &new_list) var->data.driver_location =
      driver_location++;

   exec_list_append(&shader->variables, &new_list);
}

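/* Lower a store to VARYING_SLOT_CLIP_VERTEX: the clip vertex is multiplied
 * with the eight clip plane rows read from R600_BUFFER_INFO_CONST_BUFFER
 * and the results are written as two vec4 stores to CLIP_DIST0/1. The
 * original clip-vertex store is redirected to a new output slot
 * (m_clipvtx) and only kept when stream output refers to it. */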
class LowerClipvertexWrite : public NirLowerInstruction {

public:
   LowerClipvertexWrite(int noutputs, pipe_stream_output_info& so_info):
      m_clipplane1(noutputs),
      m_clipvtx(noutputs + 1),
      m_so_info(so_info)
   {
   }

private:
   bool filter(const nir_instr *instr) const override
   {
      if (instr->type != nir_instr_type_intrinsic)
         return false;

      auto intr = nir_instr_as_intrinsic(instr);
      if (intr->intrinsic != nir_intrinsic_store_output)
         return false;

      return nir_intrinsic_io_semantics(intr).location == VARYING_SLOT_CLIP_VERTEX;
   }

   nir_def *lower(nir_instr *instr) override
   {

      auto intr = nir_instr_as_intrinsic(instr);
      nir_def *output[8] = {nullptr};

      auto buf_id = nir_imm_int(b, R600_BUFFER_INFO_CONST_BUFFER);

      auto clip_vtx = intr->src[0].ssa;

      for (int i = 0; i < 8; ++i) {
         auto sel = nir_imm_int(b, i);
         auto mrow = nir_load_ubo_vec4(b, 4, 32, buf_id, sel);
         output[i] = nir_fdot4(b, clip_vtx, mrow);
      }

      unsigned clip_vertex_index = nir_intrinsic_base(intr);

      for (int i = 0; i < 2; ++i) {
         auto clip_i = nir_vec(b, &output[4 * i], 4);
         auto store = nir_store_output(b, clip_i, intr->src[1].ssa);
         nir_intrinsic_set_write_mask(store, 0xf);
         nir_intrinsic_set_base(store, clip_vertex_index);
         nir_io_semantics semantic = nir_intrinsic_io_semantics(intr);
         semantic.location = VARYING_SLOT_CLIP_DIST0 + i;
         semantic.no_varying = 1;

         if (i > 0)
            nir_intrinsic_set_base(store, m_clipplane1);
         nir_intrinsic_set_write_mask(store, 0xf);
         nir_intrinsic_set_io_semantics(store, semantic);
      }
      nir_intrinsic_set_base(intr, m_clipvtx);

      nir_def *result = NIR_LOWER_INSTR_PROGRESS_REPLACE;
      for (unsigned i = 0; i < m_so_info.num_outputs; ++i) {
         if (m_so_info.output[i].register_index == clip_vertex_index) {
            m_so_info.output[i].register_index = m_clipvtx;
            result = NIR_LOWER_INSTR_PROGRESS;
         }
      }
      return result;
   }
   int m_clipplane1;
   int m_clipvtx;
   pipe_stream_output_info& m_so_info;
};

/* lower_uniforms_to_ubo adds 1 to the UBO buffer ID.
 * If the buffer ID is a non-constant value we end up
 * with "iadd bufid, 1", but on r600 we can pass that constant
 * "1" as the constant cache ID in the CF instruction and don't need
 * to execute that extra ADD op. So eliminate the addition here
 * again and move the buffer base ID into the base value of
 * the intrinsic, which is not used otherwise. */
class OptIndirectUBOLoads : public NirLowerInstruction {
private:
   bool filter(const nir_instr *instr) const override
   {
      if (instr->type != nir_instr_type_intrinsic)
         return false;

      auto intr = nir_instr_as_intrinsic(instr);
      if (intr->intrinsic != nir_intrinsic_load_ubo_vec4)
         return false;

      if (nir_src_as_const_value(intr->src[0]) != nullptr)
         return false;

      return nir_intrinsic_base(intr) == 0;
   }

   nir_def *lower(nir_instr *instr) override
   {
      auto intr = nir_instr_as_intrinsic(instr);
      assert(intr->intrinsic == nir_intrinsic_load_ubo_vec4);

      auto parent = intr->src[0].ssa->parent_instr;

      if (parent->type != nir_instr_type_alu)
         return nullptr;

      auto alu = nir_instr_as_alu(parent);

      if (alu->op != nir_op_iadd)
         return nullptr;

      int new_base = 0;
      nir_src *new_bufid = nullptr;
      auto src0 = nir_src_as_const_value(alu->src[0].src);
      if (src0) {
         new_bufid = &alu->src[1].src;
         new_base = src0->i32;
      } else if (auto src1 = nir_src_as_const_value(alu->src[1].src)) {
         new_bufid = &alu->src[0].src;
         new_base = src1->i32;
      } else {
         return nullptr;
      }

      nir_intrinsic_set_base(intr, new_base);
      nir_src_rewrite(&intr->src[0], new_bufid->ssa);
      return &intr->def;
   }
};

} // namespace r600

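/* Map a deref-based atomic counter intrinsic to its offset-based
 * counterpart; returns nir_num_intrinsics if the opcode is not an atomic
 * counter deref operation. */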
static nir_intrinsic_op
r600_map_atomic(nir_intrinsic_op op)
{
   switch (op) {
   case nir_intrinsic_atomic_counter_read_deref:
      return nir_intrinsic_atomic_counter_read;
   case nir_intrinsic_atomic_counter_inc_deref:
      return nir_intrinsic_atomic_counter_inc;
   case nir_intrinsic_atomic_counter_pre_dec_deref:
      return nir_intrinsic_atomic_counter_pre_dec;
   case nir_intrinsic_atomic_counter_post_dec_deref:
      return nir_intrinsic_atomic_counter_post_dec;
   case nir_intrinsic_atomic_counter_add_deref:
      return nir_intrinsic_atomic_counter_add;
   case nir_intrinsic_atomic_counter_min_deref:
      return nir_intrinsic_atomic_counter_min;
   case nir_intrinsic_atomic_counter_max_deref:
      return nir_intrinsic_atomic_counter_max;
   case nir_intrinsic_atomic_counter_and_deref:
      return nir_intrinsic_atomic_counter_and;
   case nir_intrinsic_atomic_counter_or_deref:
      return nir_intrinsic_atomic_counter_or;
   case nir_intrinsic_atomic_counter_xor_deref:
      return nir_intrinsic_atomic_counter_xor;
   case nir_intrinsic_atomic_counter_exchange_deref:
      return nir_intrinsic_atomic_counter_exchange;
   case nir_intrinsic_atomic_counter_comp_swap_deref:
      return nir_intrinsic_atomic_counter_comp_swap;
   default:
      return nir_num_intrinsics;
   }
}

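/* Lower one deref-based atomic counter intrinsic: accumulate the array
 * offsets of the deref chain, switch the intrinsic to the offset-based
 * opcode, and record the binding in the base index and the counter offset
 * within the binding in range_base. */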
static bool
r600_lower_deref_instr(nir_builder *b, nir_intrinsic_instr *instr,
                       UNUSED void *cb_data)
{
   nir_intrinsic_op op = r600_map_atomic(instr->intrinsic);
   if (nir_num_intrinsics == op)
      return false;

   nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
   nir_variable *var = nir_deref_instr_get_variable(deref);

   if (var->data.mode != nir_var_uniform && var->data.mode != nir_var_mem_ssbo &&
       var->data.mode != nir_var_mem_shared)
      return false; /* atomics passed as function arguments can't be lowered */

   const unsigned idx = var->data.binding;

   b->cursor = nir_before_instr(&instr->instr);

   nir_def *offset = nir_imm_int(b, 0);
   for (nir_deref_instr *d = deref; d->deref_type != nir_deref_type_var;
        d = nir_deref_instr_parent(d)) {
      assert(d->deref_type == nir_deref_type_array);

      unsigned array_stride = 1;
      if (glsl_type_is_array(d->type))
         array_stride *= glsl_get_aoa_size(d->type);

      offset =
         nir_iadd(b, offset, nir_imul_imm(b, d->arr.index.ssa, array_stride));
   }

   /* Since the first source is a deref and the first source in the lowered
    * instruction is the offset, we can just swap it out and change the
    * opcode.
    */
   instr->intrinsic = op;
   nir_src_rewrite(&instr->src[0], offset);
   nir_intrinsic_set_base(instr, idx);
   nir_intrinsic_set_range_base(instr, var->data.index);

   nir_deref_instr_remove_if_unused(deref);

   return true;
}

static bool
r600_lower_clipvertex_to_clipdist(nir_shader *sh, pipe_stream_output_info& so_info)
{
   if (!(sh->info.outputs_written & VARYING_BIT_CLIP_VERTEX))
      return false;

   int noutputs = util_bitcount64(sh->info.outputs_written);
   bool result = r600::LowerClipvertexWrite(noutputs, so_info).run(sh);
   return result;
}

static bool
r600_nir_lower_atomics(nir_shader *shader)
{
   /* In hardware we start at a zero index for each new
    * binding, and we use an offset of one per counter. We also
    * need to sort the atomics according to binding and offset. */
   std::map<unsigned, unsigned> binding_offset;
   std::map<unsigned, nir_variable *> sorted_var;

   nir_foreach_variable_with_modes_safe(var, shader, nir_var_uniform) {
      if (glsl_contains_atomic(var->type)) {
         sorted_var[(var->data.binding << 16) | var->data.offset] = var;
         exec_node_remove(&var->node);
      }
   }

   for (auto& [dummy, var] : sorted_var) {
      auto iindex = binding_offset.find(var->data.binding);
      unsigned offset_update = glsl_atomic_size(var->type) / 4; /* ATOMIC_COUNTER_SIZE */
      if (iindex == binding_offset.end()) {
         var->data.index = 0;
         binding_offset[var->data.binding] = offset_update;
      } else {
         var->data.index = iindex->second;
         iindex->second += offset_update;
      }
      shader->variables.push_tail(&var->node);
   }

   return nir_shader_intrinsics_pass(shader, r600_lower_deref_instr,
                                     nir_metadata_block_index | nir_metadata_dominance,
                                     NULL);
}
using r600::r600_lower_fs_out_to_vector;
using r600::r600_lower_scratch_addresses;
using r600::r600_lower_ubo_to_align16;

int
r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
{
   return glsl_count_vec4_slots(type, false, is_bindless);
}

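/* Size/alignment callback used for nir_lower_vars_to_scratch: every
 * non-array type counts as a single slot, arrays count one slot per
 * element. */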
void
r600_get_natural_size_align_bytes(const struct glsl_type *type,
                                  unsigned *size,
                                  unsigned *align)
{
   if (type->base_type != GLSL_TYPE_ARRAY) {
      *align = 1;
      *size = 1;
   } else {
      unsigned elem_size, elem_align;
      glsl_get_natural_size_align_bytes(type->fields.array, &elem_size, &elem_align);
      *align = 1;
      *size = type->length;
   }
}

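/* Lower load_shared/store_shared to the r600 specific LDS intrinsics:
 * loads get one 32-bit address per component, and stores are split into
 * at most two instructions, one covering components 0-1 and one covering
 * components 2-3 of the write mask. */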
static bool
r600_lower_shared_io_impl(nir_function_impl *impl)
{
   nir_builder b = nir_builder_create(impl);

   bool progress = false;
   nir_foreach_block(block, impl)
   {
      nir_foreach_instr_safe(instr, block)
      {

         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
         if (op->intrinsic != nir_intrinsic_load_shared &&
             op->intrinsic != nir_intrinsic_store_shared)
            continue;

         b.cursor = nir_before_instr(instr);

         if (op->intrinsic == nir_intrinsic_load_shared) {
            nir_def *addr = op->src[0].ssa;

            switch (op->def.num_components) {
            case 2: {
               auto addr2 = nir_iadd_imm(&b, addr, 4);
               addr = nir_vec2(&b, addr, addr2);
               break;
            }
            case 3: {
               auto addr2 = nir_iadd(&b, addr, nir_imm_ivec2(&b, 4, 8));
               addr =
                  nir_vec3(&b, addr, nir_channel(&b, addr2, 0), nir_channel(&b, addr2, 1));
               break;
            }
            case 4: {
               addr = nir_iadd(&b, addr, nir_imm_ivec4(&b, 0, 4, 8, 12));
               break;
            }
            }

            auto load =
               nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_local_shared_r600);
            load->num_components = op->def.num_components;
            load->src[0] = nir_src_for_ssa(addr);
            nir_def_init(&load->instr, &load->def, load->num_components,
                         32);
            nir_def_rewrite_uses(&op->def, &load->def);
            nir_builder_instr_insert(&b, &load->instr);
         } else {
            nir_def *addr = op->src[1].ssa;
            for (int i = 0; i < 2; ++i) {
               unsigned test_mask = (0x3 << 2 * i);
               if (!(nir_intrinsic_write_mask(op) & test_mask))
                  continue;

               auto store =
                  nir_intrinsic_instr_create(b.shader,
                                             nir_intrinsic_store_local_shared_r600);
               unsigned writemask = nir_intrinsic_write_mask(op) & test_mask;
               nir_intrinsic_set_write_mask(store, writemask);
               store->src[0] = nir_src_for_ssa(op->src[0].ssa);
               store->num_components = store->src[0].ssa->num_components;
               bool start_even = (writemask & (1u << (2 * i)));

               auto addr2 =
                  nir_iadd_imm(&b, addr, 8 * i + (start_even ? 0 : 4));
               store->src[1] = nir_src_for_ssa(addr2);

               nir_builder_instr_insert(&b, &store->instr);
            }
         }
         nir_instr_remove(instr);
         progress = true;
      }
   }
   return progress;
}

static bool
r600_lower_shared_io(nir_shader *nir)
{
   bool progress = false;
   nir_foreach_function_impl(impl, nir)
   {
      if (r600_lower_shared_io_impl(impl))
         progress = true;
   }
   return progress;
}

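/* Replace a load_interpolated_input of VARYING_SLOT_POS with a plain
 * load_input; the interpolator specification is not needed for the
 * fragment position (see the comment on r600_lower_fs_pos_input below). */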
static nir_def *
r600_lower_fs_pos_input_impl(nir_builder *b, nir_instr *instr, void *_options)
{
   (void)_options;
   auto old_ir = nir_instr_as_intrinsic(instr);
   auto load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_input);
   nir_def_init(&load->instr, &load->def,
                old_ir->def.num_components, old_ir->def.bit_size);
   nir_intrinsic_set_io_semantics(load, nir_intrinsic_io_semantics(old_ir));

   nir_intrinsic_set_base(load, nir_intrinsic_base(old_ir));
   nir_intrinsic_set_component(load, nir_intrinsic_component(old_ir));
   nir_intrinsic_set_dest_type(load, nir_type_float32);
   load->num_components = old_ir->num_components;
   load->src[0] = old_ir->src[1];
   nir_builder_instr_insert(b, &load->instr);
   return &load->def;
}

bool
r600_lower_fs_pos_input_filter(const nir_instr *instr, const void *_options)
{
   (void)_options;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   auto ir = nir_instr_as_intrinsic(instr);
   if (ir->intrinsic != nir_intrinsic_load_interpolated_input)
      return false;

   return nir_intrinsic_io_semantics(ir).location == VARYING_SLOT_POS;
}

/* Strip the interpolator specification, it is not needed and only gets in the way */
bool
r600_lower_fs_pos_input(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        r600_lower_fs_pos_input_filter,
                                        r600_lower_fs_pos_input_impl,
                                        nullptr);
};

bool
r600_opt_indirect_fbo_loads(nir_shader *shader)
{
   return r600::OptIndirectUBOLoads().run(shader);
}

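/* Run one round of the generic NIR optimization passes; returns true if
 * any pass made progress so that callers can loop until a fixed point is
 * reached. */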
static bool
optimize_once(nir_shader *shader)
{
   bool progress = false;
   NIR_PASS(progress, shader, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS(progress, shader, nir_lower_vars_to_ssa);
   NIR_PASS(progress, shader, nir_copy_prop);
   NIR_PASS(progress, shader, nir_opt_dce);
   NIR_PASS(progress, shader, nir_opt_algebraic);
   NIR_PASS(progress, shader, nir_opt_constant_folding);
   NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
   NIR_PASS(progress, shader, nir_opt_remove_phis);

   if (nir_opt_loop(shader)) {
      progress = true;
      NIR_PASS(progress, shader, nir_copy_prop);
      NIR_PASS(progress, shader, nir_opt_dce);
   }

   NIR_PASS(progress, shader, nir_opt_if, nir_opt_if_optimize_phi_true_false);
   NIR_PASS(progress, shader, nir_opt_dead_cf);
   NIR_PASS(progress, shader, nir_opt_cse);
   NIR_PASS(progress, shader, nir_opt_peephole_select, 200, true, true);

   NIR_PASS(progress, shader, nir_opt_conditional_discard);
   NIR_PASS(progress, shader, nir_opt_dce);
   NIR_PASS(progress, shader, nir_opt_undef);
   NIR_PASS(progress, shader, nir_opt_loop_unroll);
   return progress;
}

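/* True if this stage is the last one before rasterization, i.e. a GS, a
 * TES that doesn't run as ES, or a VS that runs neither as ES nor as LS. */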
static bool
r600_is_last_vertex_stage(nir_shader *nir, const r600_shader_key& key)
{
   if (nir->info.stage == MESA_SHADER_GEOMETRY)
      return true;

   if (nir->info.stage == MESA_SHADER_TESS_EVAL && !key.tes.as_es)
      return true;

   if (nir->info.stage == MESA_SHADER_VERTEX && !key.vs.as_es && !key.vs.as_ls)
      return true;

   return false;
}

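/* Filter for nir_lower_alu_to_scalar: keep reductions, dot products, and
 * derivatives vectorized and only scalarize them when they operate on
 * 64-bit sources; everything else is scalarized. */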
683 extern "C" bool
r600_lower_to_scalar_instr_filter(const nir_instr * instr,const void *)684 r600_lower_to_scalar_instr_filter(const nir_instr *instr, const void *)
685 {
686 if (instr->type != nir_instr_type_alu)
687 return true;
688
689 auto alu = nir_instr_as_alu(instr);
690 switch (alu->op) {
691 case nir_op_bany_fnequal3:
692 case nir_op_bany_fnequal4:
693 case nir_op_ball_fequal3:
694 case nir_op_ball_fequal4:
695 case nir_op_bany_inequal3:
696 case nir_op_bany_inequal4:
697 case nir_op_ball_iequal3:
698 case nir_op_ball_iequal4:
699 case nir_op_fdot2:
700 case nir_op_fdot3:
701 case nir_op_fdot4:
702 case nir_op_fddx:
703 case nir_op_fddx_coarse:
704 case nir_op_fddx_fine:
705 case nir_op_fddy:
706 case nir_op_fddy_coarse:
707 case nir_op_fddy_fine:
708 return nir_src_bit_size(alu->src[0].src) == 64;
709 default:
710 return true;
711 }
712 }
713
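/* Lowering that is common to all shader stages and independent of the
 * shader key: flrp, integer division, trig, texture lowering, and the
 * shared-memory and atomic counter lowering implemented above. */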
void
r600_finalize_nir_common(nir_shader *nir, enum amd_gfx_level gfx_level)
{
   const int nir_lower_flrp_mask = 16 | 32 | 64;

   NIR_PASS_V(nir, nir_lower_flrp, nir_lower_flrp_mask, false);

   nir_lower_idiv_options idiv_options = {0};
   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);

   NIR_PASS_V(nir, r600_nir_lower_trigen, gfx_level);
   NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
   NIR_PASS_V(nir, nir_lower_undef_to_zero);

   struct nir_lower_tex_options lower_tex_options = {0};
   lower_tex_options.lower_txp = ~0u;
   lower_tex_options.lower_txf_offset = true;
   lower_tex_options.lower_invalid_implicit_lod = true;
   lower_tex_options.lower_tg4_offsets = true;

   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
   NIR_PASS_V(nir, r600_nir_lower_txl_txf_array_or_cube);
   NIR_PASS_V(nir, r600_nir_lower_cube_to_2darray);

   NIR_PASS_V(nir, r600_nir_lower_pack_unpack_2x16);

   NIR_PASS_V(nir, r600_lower_shared_io);
   NIR_PASS_V(nir, r600_nir_lower_atomics);

   if (gfx_level == CAYMAN)
      NIR_PASS_V(nir, r600_legalize_image_load_store);

   while (optimize_once(nir))
      ;
}

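/* R600_SFN_SKIP_OPT_START/END select a range of shader ids for which the
 * sfn backend optimizer is skipped; useful for bisecting shader
 * miscompilations (see r600_finalize_and_optimize_shader). */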
DEBUG_GET_ONCE_NUM_OPTION(skip_opt_start, "R600_SFN_SKIP_OPT_START", -1);
DEBUG_GET_ONCE_NUM_OPTION(skip_opt_end, "R600_SFN_SKIP_OPT_END", -1);

void
r600_lower_and_optimize_nir(nir_shader *sh,
                            const union r600_shader_key *key,
                            enum amd_gfx_level gfx_level,
                            struct pipe_stream_output_info *so_info)
{
   bool lower_64bit =
      gfx_level < CAYMAN &&
      (sh->options->lower_int64_options || sh->options->lower_doubles_options) &&
      ((sh->info.bit_sizes_float | sh->info.bit_sizes_int) & 64);

   r600::sort_uniforms(sh);
   NIR_PASS_V(sh, r600_nir_fix_kcache_indirect_access);

   while (optimize_once(sh))
      ;

   if (sh->info.stage == MESA_SHADER_VERTEX)
      NIR_PASS_V(sh, r600_vectorize_vs_inputs);

   if (sh->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(sh, nir_lower_fragcoord_wtrans);
      NIR_PASS_V(sh, r600_lower_fs_out_to_vector);
      NIR_PASS_V(sh, nir_opt_dce);
      NIR_PASS_V(sh, nir_remove_dead_variables, nir_var_shader_out, 0);
      r600::sort_fsoutput(sh);
   }
   nir_variable_mode io_modes = nir_var_uniform | nir_var_shader_in | nir_var_shader_out;

   NIR_PASS_V(sh, nir_opt_combine_stores, nir_var_shader_out);
   NIR_PASS_V(sh,
              nir_lower_io,
              io_modes,
              r600_glsl_type_size,
              nir_lower_io_lower_64bit_to_32);

   if (sh->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(sh, r600_lower_fs_pos_input);

   /**/
   if (lower_64bit)
      NIR_PASS_V(sh, nir_lower_indirect_derefs, nir_var_function_temp, 10);

   NIR_PASS_V(sh, nir_opt_constant_folding);
   NIR_PASS_V(sh, nir_io_add_const_offset_to_base, io_modes);

   NIR_PASS_V(sh, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS_V(sh, nir_lower_phis_to_scalar, false);
   if (lower_64bit)
      NIR_PASS_V(sh, r600::r600_nir_split_64bit_io);
   NIR_PASS_V(sh, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS_V(sh, nir_lower_phis_to_scalar, false);
   NIR_PASS_V(sh, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS_V(sh, nir_copy_prop);
   NIR_PASS_V(sh, nir_opt_dce);

   if (r600_is_last_vertex_stage(sh, *key))
      r600_lower_clipvertex_to_clipdist(sh, *so_info);

   if (sh->info.stage == MESA_SHADER_TESS_CTRL ||
       sh->info.stage == MESA_SHADER_TESS_EVAL ||
       (sh->info.stage == MESA_SHADER_VERTEX && key->vs.as_ls)) {
      auto prim_type = sh->info.stage == MESA_SHADER_TESS_EVAL
                          ? u_tess_prim_from_shader(sh->info.tess._primitive_mode)
                          : (mesa_prim)key->tcs.prim_mode;
      NIR_PASS_V(sh, r600_lower_tess_io, static_cast<mesa_prim>(prim_type));
   }

   if (sh->info.stage == MESA_SHADER_TESS_CTRL)
      NIR_PASS_V(sh, r600_append_tcs_TF_emission, (mesa_prim)key->tcs.prim_mode);

   if (sh->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(sh, nir_lower_tess_coord_z,
                 sh->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES);
   }

   NIR_PASS_V(sh, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS_V(sh, nir_lower_phis_to_scalar, false);
   NIR_PASS_V(sh, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
   NIR_PASS_V(sh, r600_nir_lower_int_tg4);
   NIR_PASS_V(sh, r600::r600_nir_lower_tex_to_backend, gfx_level);

   if ((sh->info.bit_sizes_float | sh->info.bit_sizes_int) & 64) {
      NIR_PASS_V(sh, r600::r600_nir_split_64bit_io);
      NIR_PASS_V(sh, r600::r600_split_64bit_alu_and_phi);
      NIR_PASS_V(sh, nir_split_64bit_vec3_and_vec4);
      NIR_PASS_V(sh, nir_lower_int64);
   }

   NIR_PASS_V(sh, nir_lower_ubo_vec4);
   NIR_PASS_V(sh, r600_opt_indirect_fbo_loads);

   if (lower_64bit)
      NIR_PASS_V(sh, r600::r600_nir_64_to_vec2);

   if ((sh->info.bit_sizes_float | sh->info.bit_sizes_int) & 64)
      NIR_PASS_V(sh, r600::r600_split_64bit_uniforms_and_ubo);

   /* Lower to scalar to let some optimizations work out better */
   while (optimize_once(sh))
      ;

   if (lower_64bit)
      NIR_PASS_V(sh, r600::r600_merge_vec2_stores);

   NIR_PASS_V(sh, nir_remove_dead_variables, nir_var_shader_in, NULL);
   NIR_PASS_V(sh, nir_remove_dead_variables, nir_var_shader_out, NULL);

   NIR_PASS_V(sh,
              nir_lower_vars_to_scratch,
              nir_var_function_temp,
              40,
              r600_get_natural_size_align_bytes);

   while (optimize_once(sh))
      ;

   if ((sh->info.bit_sizes_float | sh->info.bit_sizes_int) & 64)
      NIR_PASS_V(sh, r600::r600_split_64bit_alu_and_phi);

   bool late_algebraic_progress;
   do {
      late_algebraic_progress = false;
      NIR_PASS(late_algebraic_progress, sh, nir_opt_algebraic_late);
      NIR_PASS(late_algebraic_progress, sh, nir_opt_constant_folding);
      NIR_PASS(late_algebraic_progress, sh, nir_copy_prop);
      NIR_PASS(late_algebraic_progress, sh, nir_opt_dce);
      NIR_PASS(late_algebraic_progress, sh, nir_opt_cse);
   } while (late_algebraic_progress);

   NIR_PASS_V(sh, nir_lower_bool_to_int32);

   NIR_PASS_V(sh, nir_lower_locals_to_regs, 32);
   NIR_PASS_V(sh, nir_convert_from_ssa, true);
   NIR_PASS_V(sh, nir_opt_dce);
}

void
r600_finalize_and_optimize_shader(r600::Shader *shader)
{
   if (r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
      std::cerr << "Shader after conversion from nir\n";
      shader->print(std::cerr);
   }

   auto sfn_skip_opt_start = debug_get_option_skip_opt_start();
   auto sfn_skip_opt_end = debug_get_option_skip_opt_end();
   bool skip_shader_opt_per_id = sfn_skip_opt_start >= 0 &&
                                 sfn_skip_opt_start <= shader->shader_id() &&
                                 sfn_skip_opt_end >= shader->shader_id();

   bool skip_shader_opt = r600::sfn_log.has_debug_flag(r600::SfnLog::noopt) ||
                          skip_shader_opt_per_id;

   if (!skip_shader_opt) {
      optimize(*shader);
      if (r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
         std::cerr << "Shader after optimization\n";
         shader->print(std::cerr);
      }
   }

   split_address_loads(*shader);

   if (r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
      std::cerr << "Shader after splitting address loads\n";
      shader->print(std::cerr);
   }

   if (!skip_shader_opt) {
      optimize(*shader);
      if (r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
         std::cerr << "Shader after optimization\n";
         shader->print(std::cerr);
      }
   }
}

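/* Schedule the shader and, unless register merging is disabled via the
 * nomerge debug flag, run the live-range evaluation and register
 * allocation on the scheduled code. */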
r600::Shader *
r600_schedule_shader(r600::Shader *shader)
{
   auto scheduled_shader = r600::schedule(shader);
   if (r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
      std::cerr << "Shader after scheduling\n";
      scheduled_shader->print(std::cerr);
   }

   if (!r600::sfn_log.has_debug_flag(r600::SfnLog::nomerge)) {

      if (r600::sfn_log.has_debug_flag(r600::SfnLog::merge)) {
         r600::sfn_log << r600::SfnLog::merge << "Shader before RA\n";
         scheduled_shader->print(std::cerr);
      }

      r600::sfn_log << r600::SfnLog::trans << "Merge registers\n";
      auto lrm = r600::LiveRangeEvaluator().run(*scheduled_shader);

      if (!r600::register_allocation(lrm)) {
         R600_ERR("%s: Register allocation failed\n", __func__);
         /* For now crash if the shader could not be generated */
         assert(0);
         return nullptr;
      } else if (r600::sfn_log.has_debug_flag(r600::SfnLog::merge) ||
                 r600::sfn_log.has_debug_flag(r600::SfnLog::steps)) {
         r600::sfn_log << "Shader after RA\n";
         scheduled_shader->print(std::cerr);
      }
   }

   return scheduled_shader;
}