1 /* -*- mesa-c++ -*-
2 *
3 * Copyright (c) 2022 Collabora LTD
4 *
5 * Author: Gert Wollny <gert.wollny@collabora.com>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "sfn_shader_fs.h"
28
29 #include "sfn_debug.h"
30 #include "sfn_instr_alugroup.h"
31 #include "sfn_instr_export.h"
32 #include "sfn_instr_fetch.h"
33 #include "sfn_instr_tex.h"
34
35 #include <sstream>
36
37 namespace r600 {
38
39 using std::string;
40
FragmentShader(const r600_shader_key & key)41 FragmentShader::FragmentShader(const r600_shader_key& key):
42 Shader("FS", key.ps.first_atomic_counter),
43 m_dual_source_blend(key.ps.dual_source_blend),
44 m_max_color_exports(MAX2(key.ps.nr_cbufs, 1)),
45 m_pos_input(127, false),
46 m_fs_write_all(false),
47 m_apply_sample_mask(key.ps.apply_sample_id_mask),
48 m_rat_base(key.ps.nr_cbufs),
49 m_image_size_const_offset(key.ps.image_size_const_offset)
50 {
51 }
52
53 void
do_get_shader_info(r600_shader * sh_info)54 FragmentShader::do_get_shader_info(r600_shader *sh_info)
55 {
56 sh_info->processor_type = PIPE_SHADER_FRAGMENT;
57
58 sh_info->ps_color_export_mask = m_color_export_mask;
59 sh_info->ps_export_highest = m_export_highest;
60 sh_info->nr_ps_color_exports = m_num_color_exports;
61
62 sh_info->fs_write_all = m_fs_write_all;
63
64 sh_info->rat_base = m_rat_base;
65 sh_info->uses_kill = m_uses_discard;
66 sh_info->gs_prim_id_input = m_gs_prim_id_input;
67 sh_info->nsys_inputs = m_nsys_inputs;
68 sh_info->uses_helper_invocation = m_helper_invocation != nullptr;
69 }
70
71 bool
load_input(nir_intrinsic_instr * intr)72 FragmentShader::load_input(nir_intrinsic_instr *intr)
73 {
74 auto& vf = value_factory();
75
76 auto location = nir_intrinsic_io_semantics(intr).location;
77 if (location == VARYING_SLOT_POS) {
78 AluInstr *ir = nullptr;
79 for (unsigned i = 0; i < intr->def.num_components; ++i) {
80 ir = new AluInstr(op1_mov,
81 vf.dest(intr->def, i, pin_none),
82 m_pos_input[i],
83 AluInstr::write);
84 emit_instruction(ir);
85 }
86 ir->set_alu_flag(alu_last_instr);
87 return true;
88 }
89
90 if (location == VARYING_SLOT_FACE) {
91 auto ir = new AluInstr(op2_setgt_dx10,
92 vf.dest(intr->def, 0, pin_none),
93 m_face_input,
94 vf.inline_const(ALU_SRC_0, 0),
95 AluInstr::last_write);
96 emit_instruction(ir);
97 return true;
98 }
99
100 return load_input_hw(intr);
101 }
102
103 bool
store_output(nir_intrinsic_instr * intr)104 FragmentShader::store_output(nir_intrinsic_instr *intr)
105 {
106 auto location = nir_intrinsic_io_semantics(intr).location;
107
108 if (location == FRAG_RESULT_COLOR && !m_dual_source_blend) {
109 m_fs_write_all = true;
110 }
111
112 return emit_export_pixel(*intr);
113 }
114
115 unsigned
barycentric_ij_index(nir_intrinsic_instr * intr)116 barycentric_ij_index(nir_intrinsic_instr *intr)
117 {
118 unsigned index = 0;
119 switch (intr->intrinsic) {
120 case nir_intrinsic_load_barycentric_sample:
121 index = 0;
122 break;
123 case nir_intrinsic_load_barycentric_at_sample:
124 case nir_intrinsic_load_barycentric_at_offset:
125 case nir_intrinsic_load_barycentric_pixel:
126 index = 1;
127 break;
128 case nir_intrinsic_load_barycentric_centroid:
129 index = 2;
130 break;
131 default:
132 unreachable("Unknown interpolator intrinsic");
133 }
134
135 switch (nir_intrinsic_interp_mode(intr)) {
136 case INTERP_MODE_NONE:
137 case INTERP_MODE_SMOOTH:
138 return index;
139 case INTERP_MODE_NOPERSPECTIVE:
140 return index + 3;
141 case INTERP_MODE_FLAT:
142 case INTERP_MODE_EXPLICIT:
143 default:
144 unreachable("unknown/unsupported mode for load_interpolated");
145 }
146 return 0;
147 }
148
149 bool
process_stage_intrinsic(nir_intrinsic_instr * intr)150 FragmentShader::process_stage_intrinsic(nir_intrinsic_instr *intr)
151 {
152 if (process_stage_intrinsic_hw(intr))
153 return true;
154
155 switch (intr->intrinsic) {
156 case nir_intrinsic_load_input:
157 return load_input(intr);
158 case nir_intrinsic_load_interpolated_input:
159 return load_interpolated_input(intr);
160 case nir_intrinsic_discard_if:
161 m_uses_discard = true;
162 emit_instruction(new AluInstr(op2_killne_int,
163 nullptr,
164 value_factory().src(intr->src[0], 0),
165 value_factory().zero(),
166 {AluInstr::last}));
167
168 return true;
169 case nir_intrinsic_discard:
170 m_uses_discard = true;
171 emit_instruction(new AluInstr(op2_kille_int,
172 nullptr,
173 value_factory().zero(),
174 value_factory().zero(),
175 {AluInstr::last}));
176 return true;
177 case nir_intrinsic_load_sample_mask_in:
178 if (m_apply_sample_mask) {
179 return emit_load_sample_mask_in(intr);
180 } else
181 return emit_simple_mov(intr->def, 0, m_sample_mask_reg);
182 case nir_intrinsic_load_sample_id:
183 return emit_simple_mov(intr->def, 0, m_sample_id_reg);
184 case nir_intrinsic_load_helper_invocation:
185 return emit_load_helper_invocation(intr);
186 case nir_intrinsic_load_sample_pos:
187 return emit_load_sample_pos(intr);
188 default:
189 return false;
190 }
191 }
192
193 bool
load_interpolated_input(nir_intrinsic_instr * intr)194 FragmentShader::load_interpolated_input(nir_intrinsic_instr *intr)
195 {
196 auto& vf = value_factory();
197 unsigned loc = nir_intrinsic_io_semantics(intr).location;
198 switch (loc) {
199 case VARYING_SLOT_POS:
200 for (unsigned i = 0; i < intr->def.num_components; ++i)
201 vf.inject_value(intr->def, i, m_pos_input[i]);
202 return true;
203 case VARYING_SLOT_FACE:
204 return false;
205 default:;
206 }
207
208 return load_interpolated_input_hw(intr);
209 }
210
211 int
do_allocate_reserved_registers()212 FragmentShader::do_allocate_reserved_registers()
213 {
214 int next_register = allocate_interpolators_or_inputs();
215
216 if (m_sv_values.test(es_pos)) {
217 set_input_gpr(m_pos_driver_loc, next_register);
218 m_pos_input = value_factory().allocate_pinned_vec4(next_register++, false);
219 }
220
221 int face_reg_index = -1;
222 if (m_sv_values.test(es_face)) {
223 set_input_gpr(m_face_driver_loc, next_register);
224 face_reg_index = next_register++;
225 m_face_input = value_factory().allocate_pinned_register(face_reg_index, 0);
226 }
227
228 if (m_sv_values.test(es_sample_mask_in)) {
229 if (face_reg_index < 0)
230 face_reg_index = next_register++;
231 m_sample_mask_reg = value_factory().allocate_pinned_register(face_reg_index, 2);
232 sfn_log << SfnLog::io << "Set sample mask in register to " << *m_sample_mask_reg
233 << "\n";
234 m_nsys_inputs = 1;
235 ShaderInput input(ninputs());
236 input.set_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN);
237 input.set_gpr(face_reg_index);
238 add_input(input);
239 }
240
241 if (m_sv_values.test(es_sample_id) || m_sv_values.test(es_sample_mask_in)) {
242 int sample_id_reg = next_register++;
243 m_sample_id_reg = value_factory().allocate_pinned_register(sample_id_reg, 3);
244 sfn_log << SfnLog::io << "Set sample id register to " << *m_sample_id_reg << "\n";
245 m_nsys_inputs++;
246 ShaderInput input(ninputs());
247 input.set_system_value(SYSTEM_VALUE_SAMPLE_ID);
248 input.set_gpr(sample_id_reg);
249 add_input(input);
250 }
251
252 if (m_sv_values.test(es_helper_invocation)) {
253 m_helper_invocation = value_factory().temp_register(0, false);
254 }
255
256 return next_register;
257 }
258
259 bool
do_scan_instruction(nir_instr * instr)260 FragmentShader::do_scan_instruction(nir_instr *instr)
261 {
262 if (instr->type != nir_instr_type_intrinsic)
263 return false;
264
265 auto intr = nir_instr_as_intrinsic(instr);
266 switch (intr->intrinsic) {
267 case nir_intrinsic_load_barycentric_pixel:
268 case nir_intrinsic_load_barycentric_sample:
269 case nir_intrinsic_load_barycentric_at_sample:
270 case nir_intrinsic_load_barycentric_at_offset:
271 case nir_intrinsic_load_barycentric_centroid:
272 m_interpolators_used.set(barycentric_ij_index(intr));
273 break;
274 case nir_intrinsic_load_front_face:
275 m_sv_values.set(es_face);
276 break;
277 case nir_intrinsic_load_sample_mask_in:
278 m_sv_values.set(es_sample_mask_in);
279 break;
280 case nir_intrinsic_load_sample_pos:
281 m_sv_values.set(es_sample_pos);
282 FALLTHROUGH;
283 case nir_intrinsic_load_sample_id:
284 m_sv_values.set(es_sample_id);
285 break;
286 case nir_intrinsic_load_helper_invocation:
287 m_sv_values.set(es_helper_invocation);
288 break;
289 case nir_intrinsic_load_input:
290 return scan_input(intr, 0);
291 case nir_intrinsic_load_interpolated_input:
292 return scan_input(intr, 1);
293 default:
294 return false;
295 }
296 return true;
297 }
298
299 bool
emit_load_sample_mask_in(nir_intrinsic_instr * instr)300 FragmentShader::emit_load_sample_mask_in(nir_intrinsic_instr *instr)
301 {
302 auto& vf = value_factory();
303 auto dest = vf.dest(instr->def, 0, pin_free);
304 auto tmp = vf.temp_register();
305 assert(m_sample_id_reg);
306 assert(m_sample_mask_reg);
307
308 emit_instruction(
309 new AluInstr(op2_lshl_int, tmp, vf.one_i(), m_sample_id_reg, AluInstr::last_write));
310 emit_instruction(
311 new AluInstr(op2_and_int, dest, tmp, m_sample_mask_reg, AluInstr::last_write));
312 return true;
313 }
314
315 bool
emit_load_helper_invocation(nir_intrinsic_instr * instr)316 FragmentShader::emit_load_helper_invocation(nir_intrinsic_instr *instr)
317 {
318 assert(m_helper_invocation);
319 auto& vf = value_factory();
320 emit_instruction(
321 new AluInstr(op1_mov, m_helper_invocation, vf.literal(-1), AluInstr::last_write));
322 RegisterVec4 destvec{m_helper_invocation, nullptr, nullptr, nullptr, pin_group};
323
324 auto vtx = new LoadFromBuffer(destvec,
325 {4, 7, 7, 7},
326 m_helper_invocation,
327 0,
328 R600_BUFFER_INFO_CONST_BUFFER,
329 nullptr,
330 fmt_32_32_32_32_float);
331 vtx->set_fetch_flag(FetchInstr::vpm);
332 vtx->set_fetch_flag(FetchInstr::use_tc);
333 vtx->set_always_keep();
334 auto dst = value_factory().dest(instr->def, 0, pin_free);
335 auto ir = new AluInstr(op1_mov, dst, m_helper_invocation, AluInstr::last_write);
336 ir->add_required_instr(vtx);
337 emit_instruction(vtx);
338 emit_instruction(ir);
339
340 return true;
341 }
342
343 bool
scan_input(nir_intrinsic_instr * intr,int index_src_id)344 FragmentShader::scan_input(nir_intrinsic_instr *intr, int index_src_id)
345 {
346 auto index = nir_src_as_const_value(intr->src[index_src_id]);
347 assert(index);
348
349 const unsigned location_offset = chip_class() < ISA_CC_EVERGREEN ? 32 : 0;
350 bool uses_interpol_at_centroid = false;
351
352 auto location =
353 static_cast<gl_varying_slot>(nir_intrinsic_io_semantics(intr).location + index->u32);
354 unsigned driver_location = nir_intrinsic_base(intr) + index->u32;
355
356 if (location == VARYING_SLOT_POS) {
357 m_sv_values.set(es_pos);
358 m_pos_driver_loc = driver_location + location_offset;
359 ShaderInput pos_input(m_pos_driver_loc, location);
360 pos_input.set_interpolator(TGSI_INTERPOLATE_LINEAR,
361 TGSI_INTERPOLATE_LOC_CENTER,
362 false);
363 add_input(pos_input);
364 return true;
365 }
366
367 if (location == VARYING_SLOT_FACE) {
368 m_sv_values.set(es_face);
369 m_face_driver_loc = driver_location + location_offset;
370 ShaderInput face_input(m_face_driver_loc, location);
371 add_input(face_input);
372 return true;
373 }
374
375 tgsi_interpolate_mode tgsi_interpolate = TGSI_INTERPOLATE_CONSTANT;
376 tgsi_interpolate_loc tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
377
378 const bool is_color =
379 (location >= VARYING_SLOT_COL0 && location <= VARYING_SLOT_COL1) ||
380 (location >= VARYING_SLOT_BFC0 && location <= VARYING_SLOT_BFC1);
381
382 if (index_src_id > 0) {
383 glsl_interp_mode mode = INTERP_MODE_NONE;
384 auto parent = nir_instr_as_intrinsic(intr->src[0].ssa->parent_instr);
385 mode = (glsl_interp_mode)nir_intrinsic_interp_mode(parent);
386 switch (parent->intrinsic) {
387 case nir_intrinsic_load_barycentric_sample:
388 tgsi_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
389 break;
390 case nir_intrinsic_load_barycentric_at_sample:
391 case nir_intrinsic_load_barycentric_at_offset:
392 case nir_intrinsic_load_barycentric_pixel:
393 tgsi_loc = TGSI_INTERPOLATE_LOC_CENTER;
394 break;
395 case nir_intrinsic_load_barycentric_centroid:
396 tgsi_loc = TGSI_INTERPOLATE_LOC_CENTROID;
397 uses_interpol_at_centroid = true;
398 break;
399 default:
400 std::cerr << "Instruction " << nir_intrinsic_infos[parent->intrinsic].name
401 << " as parent of " << nir_intrinsic_infos[intr->intrinsic].name
402 << " interpolator?\n";
403 assert(0);
404 }
405
406 switch (mode) {
407 case INTERP_MODE_NONE:
408 if (is_color) {
409 tgsi_interpolate = TGSI_INTERPOLATE_COLOR;
410 break;
411 }
412 FALLTHROUGH;
413 case INTERP_MODE_SMOOTH:
414 tgsi_interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
415 break;
416 case INTERP_MODE_NOPERSPECTIVE:
417 tgsi_interpolate = TGSI_INTERPOLATE_LINEAR;
418 break;
419 case INTERP_MODE_FLAT:
420 break;
421 case INTERP_MODE_EXPLICIT:
422 default:
423 assert(0);
424 }
425 }
426
427 if (location == VARYING_SLOT_PRIMITIVE_ID) {
428 m_gs_prim_id_input = true;
429 } else if (!(is_color || (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_MAX) ||
430 (location >= VARYING_SLOT_TEX0 && location <= VARYING_SLOT_TEX7) ||
431 (location >= VARYING_SLOT_CLIP_DIST0 && location <= VARYING_SLOT_CLIP_DIST1) ||
432 location == VARYING_SLOT_FOGC || location == VARYING_SLOT_LAYER ||
433 location == VARYING_SLOT_PNTC || location == VARYING_SLOT_VIEWPORT)) {
434 return false;
435 }
436
437 sfn_log << SfnLog::io << " have IO at " << driver_location << "\n";
438 auto iinput = find_input(driver_location);
439 if (iinput == input_not_found()) {
440 ShaderInput input(driver_location, location);
441 input.set_need_lds_pos();
442 input.set_interpolator(tgsi_interpolate, tgsi_loc, uses_interpol_at_centroid);
443 sfn_log << SfnLog::io << "add IO with LDS ID at " << input.location() << "\n";
444 add_input(input);
445 assert(find_input(input.location()) != input_not_found());
446 } else {
447 if (uses_interpol_at_centroid) {
448 iinput->second.set_uses_interpolate_at_centroid();
449 }
450 }
451 return true;
452 }
453
454 bool
emit_export_pixel(nir_intrinsic_instr & intr)455 FragmentShader::emit_export_pixel(nir_intrinsic_instr& intr)
456 {
457 RegisterVec4::Swizzle swizzle;
458 auto semantics = nir_intrinsic_io_semantics(&intr);
459 unsigned driver_location = nir_intrinsic_base(&intr);
460 unsigned write_mask = nir_intrinsic_write_mask(&intr);
461
462 switch (semantics.location) {
463 case FRAG_RESULT_DEPTH:
464 swizzle = {0, 7, 7, 7};
465 break;
466 case FRAG_RESULT_STENCIL:
467 swizzle = {7, 0, 7, 7};
468 break;
469 case FRAG_RESULT_SAMPLE_MASK:
470 swizzle = {7, 7, 0, 7};
471 break;
472 default:
473 for (int i = 0; i < 4; ++i) {
474 swizzle[i] = (1 << i) & write_mask ? i : 7;
475 }
476 }
477
478 auto value = value_factory().src_vec4(intr.src[0], pin_group, swizzle);
479
480 if (semantics.location == FRAG_RESULT_COLOR ||
481 (semantics.location >= FRAG_RESULT_DATA0 &&
482 semantics.location <= FRAG_RESULT_DATA7)) {
483
484 ShaderOutput output(driver_location, write_mask);
485 output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
486 add_output(output);
487
488 unsigned color_outputs =
489 m_fs_write_all && chip_class() >= ISA_CC_R700 ? m_max_color_exports : 1;
490
491 for (unsigned k = 0; k < color_outputs; ++k) {
492
493 unsigned location = semantics.location - FRAG_RESULT_DATA0;
494
495 if (semantics.location == FRAG_RESULT_COLOR)
496 location = driver_location + k;
497
498 if (semantics.dual_source_blend_index)
499 location = semantics.dual_source_blend_index;
500
501 sfn_log << SfnLog::io << "Pixel output at loc:" << location
502 << "("<< semantics.location << ") of "<< m_max_color_exports<<"\n";
503
504 if (location >= m_max_color_exports) {
505 sfn_log << SfnLog::io << "Pixel output loc:" << location
506 << " dl:" << driver_location << " skipped because we have only "
507 << m_max_color_exports << " CBs\n";
508 return true;
509 }
510
511 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, location, value);
512
513 if (m_export_highest < location)
514 m_export_highest = location;
515
516 m_num_color_exports++;
517
518 /* Hack: force dual source output handling if one color output has a
519 * dual_source_blend_index > 0 */
520 if (semantics.dual_source_blend_index > 0)
521 m_dual_source_blend = true;
522
523 if (m_num_color_exports > 1)
524 m_fs_write_all = false;
525 unsigned mask = (0xfu << (location * 4));
526
527 m_color_export_written_mask |= (1 << location);
528
529 /* If the i-th target format is set, all previous target formats must
530 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
531 /*/
532 for (unsigned i = 0; i < location; ++i)
533 mask |= (0x1u << (i * 4));
534
535 m_color_export_mask |= mask;
536
537 emit_instruction(m_last_pixel_export);
538 }
539 } else if (semantics.location == FRAG_RESULT_DEPTH ||
540 semantics.location == FRAG_RESULT_STENCIL ||
541 semantics.location == FRAG_RESULT_SAMPLE_MASK) {
542 emit_instruction(new ExportInstr(ExportInstr::pixel, 61, value));
543
544 ShaderOutput output(driver_location, write_mask);
545 output.set_frag_result(static_cast<gl_frag_result>(semantics.location));
546 add_output(output);
547
548 } else {
549 return false;
550 }
551 return true;
552 }
553
554 bool
emit_load_sample_pos(nir_intrinsic_instr * instr)555 FragmentShader::emit_load_sample_pos(nir_intrinsic_instr *instr)
556 {
557 auto dest = value_factory().dest_vec4(instr->def, pin_group);
558
559 auto fetch = new LoadFromBuffer(dest,
560 {0, 1, 2, 3},
561 m_sample_id_reg,
562 0,
563 R600_BUFFER_INFO_CONST_BUFFER,
564 nullptr,
565 fmt_32_32_32_32_float);
566 fetch->set_fetch_flag(FetchInstr::srf_mode);
567 emit_instruction(fetch);
568 return true;
569 }
570
571 void
do_finalize()572 FragmentShader::do_finalize()
573 {
574 /* On pre-evergreen not emtting something to all color exports that
575 * are enabled might lead to a hang.
576 * see: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9223
577 */
578 if (chip_class() < ISA_CC_EVERGREEN) {
579 unsigned i = 0;
580 unsigned mask = m_color_export_mask;
581
582 while (i < m_max_color_exports && (mask & (1u << (4 * i)))) {
583 if (!(m_color_export_written_mask & (1u << i))) {
584 RegisterVec4 value(0, false, {7, 7, 7, 7});
585 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, i, value);
586 emit_instruction(m_last_pixel_export);
587 m_num_color_exports++;
588 if (m_export_highest < i)
589 m_export_highest = i;
590 }
591 ++i;
592 }
593 }
594
595 if (!m_last_pixel_export) {
596 RegisterVec4 value(0, false, {7, 7, 7, 7});
597 m_last_pixel_export = new ExportInstr(ExportInstr::pixel, 0, value);
598 emit_instruction(m_last_pixel_export);
599 m_num_color_exports++;
600 m_color_export_mask |= 0xf;
601 }
602 m_last_pixel_export->set_is_last_export(true);
603 }
604
605 bool
read_prop(std::istream & is)606 FragmentShader::read_prop(std::istream& is)
607 {
608 string value;
609 is >> value;
610
611 ASSERTED auto splitpos = value.find(':');
612 assert(splitpos != string::npos);
613
614 std::istringstream ival(value);
615 string name;
616 string val;
617
618 std::getline(ival, name, ':');
619
620 if (name == "MAX_COLOR_EXPORTS")
621 ival >> m_max_color_exports;
622 else if (name == "COLOR_EXPORTS")
623 ival >> m_num_color_exports;
624 else if (name == "COLOR_EXPORT_MASK")
625 ival >> m_color_export_mask;
626 else if (name == "WRITE_ALL_COLORS")
627 ival >> m_fs_write_all;
628 else
629 return false;
630 return true;
631 }
632
633 void
do_print_properties(std::ostream & os) const634 FragmentShader::do_print_properties(std::ostream& os) const
635 {
636 os << "PROP MAX_COLOR_EXPORTS:" << m_max_color_exports << "\n";
637 os << "PROP COLOR_EXPORTS:" << m_num_color_exports << "\n";
638 os << "PROP COLOR_EXPORT_MASK:" << m_color_export_mask << "\n";
639 os << "PROP WRITE_ALL_COLORS:" << m_fs_write_all << "\n";
640 }
641
642 int
allocate_interpolators_or_inputs()643 FragmentShaderR600::allocate_interpolators_or_inputs()
644 {
645 int pos = 0;
646 auto& vf = value_factory();
647 for (auto& [index, inp] : inputs()) {
648 if (inp.need_lds_pos()) {
649
650 RegisterVec4 input(vf.allocate_pinned_register(pos, 0),
651 vf.allocate_pinned_register(pos, 1),
652 vf.allocate_pinned_register(pos, 2),
653 vf.allocate_pinned_register(pos, 3),
654 pin_fully);
655 inp.set_gpr(pos++);
656
657 sfn_log << SfnLog::io << "Reseve input register at pos " << index << " as "
658 << input << " with register " << inp.gpr() << "\n";
659
660 m_interpolated_inputs[index] = input;
661 }
662 }
663 return pos;
664 }
665
666 bool
load_input_hw(nir_intrinsic_instr * intr)667 FragmentShaderR600::load_input_hw(nir_intrinsic_instr *intr)
668 {
669 auto& vf = value_factory();
670 AluInstr *ir = nullptr;
671 for (unsigned i = 0; i < intr->def.num_components; ++i) {
672 sfn_log << SfnLog::io << "Inject register "
673 << *m_interpolated_inputs[nir_intrinsic_base(intr)][i] << "\n";
674 unsigned index = nir_intrinsic_component(intr) + i;
675 assert(index < 4);
676 vf.inject_value(intr->def,
677 i,
678 m_interpolated_inputs[nir_intrinsic_base(intr)][index]);
679 }
680 if (ir)
681 ir->set_alu_flag(alu_last_instr);
682 return true;
683 }
684
685 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)686 FragmentShaderR600::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
687 {
688 switch (intr->intrinsic) {
689 case nir_intrinsic_load_barycentric_centroid:
690 case nir_intrinsic_load_barycentric_pixel:
691 case nir_intrinsic_load_barycentric_sample:
692 return true;
693 default:
694 return false;
695 }
696 }
697
698 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)699 FragmentShaderR600::load_interpolated_input_hw(nir_intrinsic_instr *intr)
700 {
701 return load_input_hw(intr);
702 }
703
704 bool
load_input_hw(nir_intrinsic_instr * intr)705 FragmentShaderEG::load_input_hw(nir_intrinsic_instr *intr)
706 {
707 auto& vf = value_factory();
708 auto io = input(nir_intrinsic_base(intr));
709 auto comp = nir_intrinsic_component(intr);
710
711 bool need_temp = comp > 0;
712 AluInstr *ir = nullptr;
713 for (unsigned i = 0; i < intr->def.num_components; ++i) {
714 if (need_temp) {
715 auto tmp = vf.temp_register(comp + i);
716 ir =
717 new AluInstr(op1_interp_load_p0,
718 tmp,
719 new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i + comp),
720 AluInstr::last_write);
721 emit_instruction(ir);
722 emit_instruction(new AluInstr(
723 op1_mov, vf.dest(intr->def, i, pin_chan), tmp, AluInstr::last_write));
724 } else {
725
726 ir = new AluInstr(op1_interp_load_p0,
727 vf.dest(intr->def, i, pin_chan),
728 new InlineConstant(ALU_SRC_PARAM_BASE + io.lds_pos(), i),
729 AluInstr::write);
730 emit_instruction(ir);
731 }
732 }
733 ir->set_alu_flag(alu_last_instr);
734 return true;
735 }
736
737 int
allocate_interpolators_or_inputs()738 FragmentShaderEG::allocate_interpolators_or_inputs()
739 {
740 for (unsigned i = 0; i < s_max_interpolators; ++i) {
741 if (interpolators_used(i)) {
742 sfn_log << SfnLog::io << "Interpolator " << i << " test enabled\n";
743 m_interpolator[i].enabled = true;
744 }
745 }
746
747 int num_baryc = 0;
748 for (int i = 0; i < 6; ++i) {
749 if (m_interpolator[i].enabled) {
750 sfn_log << SfnLog::io << "Interpolator " << i
751 << " is enabled with ij=" << num_baryc << " \n";
752 unsigned sel = num_baryc / 2;
753 unsigned chan = 2 * (num_baryc % 2);
754
755 m_interpolator[i].i = value_factory().allocate_pinned_register(sel, chan + 1);
756 m_interpolator[i].j = value_factory().allocate_pinned_register(sel, chan);
757
758 m_interpolator[i].ij_index = num_baryc++;
759 }
760 }
761 return (num_baryc + 1) >> 1;
762 }
763
764 bool
process_stage_intrinsic_hw(nir_intrinsic_instr * intr)765 FragmentShaderEG::process_stage_intrinsic_hw(nir_intrinsic_instr *intr)
766 {
767 auto& vf = value_factory();
768 switch (intr->intrinsic) {
769 case nir_intrinsic_load_barycentric_centroid:
770 case nir_intrinsic_load_barycentric_pixel:
771 case nir_intrinsic_load_barycentric_sample: {
772 unsigned ij = barycentric_ij_index(intr);
773 vf.inject_value(intr->def, 0, m_interpolator[ij].i);
774 vf.inject_value(intr->def, 1, m_interpolator[ij].j);
775 return true;
776 }
777 case nir_intrinsic_load_barycentric_at_offset:
778 return load_barycentric_at_offset(intr);
779 case nir_intrinsic_load_barycentric_at_sample:
780 return load_barycentric_at_sample(intr);
781 default:
782 return false;
783 }
784 }
785
786 bool
load_interpolated_input_hw(nir_intrinsic_instr * intr)787 FragmentShaderEG::load_interpolated_input_hw(nir_intrinsic_instr *intr)
788 {
789 auto& vf = value_factory();
790 ASSERTED auto param = nir_src_as_const_value(intr->src[1]);
791 assert(param && "Indirect PS inputs not (yet) supported");
792
793 int dest_num_comp = intr->def.num_components;
794 int start_comp = nir_intrinsic_component(intr);
795 bool need_temp = start_comp > 0;
796
797 auto dst = need_temp ? vf.temp_vec4(pin_chan) : vf.dest_vec4(intr->def, pin_chan);
798
799 InterpolateParams params;
800
801 params.i = vf.src(intr->src[0], 0);
802 params.j = vf.src(intr->src[0], 1);
803 params.base = input(nir_intrinsic_base(intr)).lds_pos();
804
805 if (!load_interpolated(dst, params, dest_num_comp, start_comp))
806 return false;
807
808 if (need_temp) {
809 AluInstr *ir = nullptr;
810 for (unsigned i = 0; i < intr->def.num_components; ++i) {
811 auto real_dst = vf.dest(intr->def, i, pin_chan);
812 ir = new AluInstr(op1_mov, real_dst, dst[i + start_comp], AluInstr::write);
813 emit_instruction(ir);
814 }
815 assert(ir);
816 ir->set_alu_flag(alu_last_instr);
817 }
818
819 return true;
820 }
821
822 bool
load_interpolated(RegisterVec4 & dest,const InterpolateParams & params,int num_dest_comp,int start_comp)823 FragmentShaderEG::load_interpolated(RegisterVec4& dest,
824 const InterpolateParams& params,
825 int num_dest_comp,
826 int start_comp)
827 {
828 sfn_log << SfnLog::io << "Using Interpolator (" << *params.j << ", " << *params.i
829 << ")"
830 << "\n";
831
832 if (num_dest_comp == 1) {
833 switch (start_comp) {
834 case 0:
835 return load_interpolated_one_comp(dest, params, op2_interp_x);
836 case 1:
837 return load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
838 case 2:
839 return load_interpolated_one_comp(dest, params, op2_interp_z);
840 case 3:
841 return load_interpolated_two_comp_for_one(dest, params, op2_interp_zw, 3);
842 default:
843 assert(0);
844 }
845 }
846
847 if (num_dest_comp == 2) {
848 switch (start_comp) {
849 case 0:
850 return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3);
851 case 2:
852 return load_interpolated_two_comp(dest, params, op2_interp_zw, 0xc);
853 case 1:
854 return load_interpolated_one_comp(dest, params, op2_interp_z) &&
855 load_interpolated_two_comp_for_one(dest, params, op2_interp_xy, 1);
856 default:
857 assert(0);
858 }
859 }
860
861 if (num_dest_comp == 3 && start_comp == 0)
862 return load_interpolated_two_comp(dest, params, op2_interp_xy, 0x3) &&
863 load_interpolated_one_comp(dest, params, op2_interp_z);
864
865 int full_write_mask = ((1 << num_dest_comp) - 1) << start_comp;
866
867 bool success =
868 load_interpolated_two_comp(dest, params, op2_interp_zw, full_write_mask & 0xc);
869 success &=
870 load_interpolated_two_comp(dest, params, op2_interp_xy, full_write_mask & 0x3);
871 return success;
872 }
873
874 bool
load_barycentric_at_sample(nir_intrinsic_instr * instr)875 FragmentShaderEG::load_barycentric_at_sample(nir_intrinsic_instr *instr)
876 {
877 auto& vf = value_factory();
878 RegisterVec4 slope = vf.temp_vec4(pin_group);
879 auto src = emit_load_to_register(vf.src(instr->src[0], 0));
880 auto fetch = new LoadFromBuffer(slope,
881 {0, 1, 2, 3},
882 src,
883 0,
884 R600_BUFFER_INFO_CONST_BUFFER,
885 nullptr,
886 fmt_32_32_32_32_float);
887
888 fetch->set_fetch_flag(FetchInstr::srf_mode);
889 emit_instruction(fetch);
890
891 auto grad = vf.temp_vec4(pin_group);
892
893 auto interpolator = m_interpolator[barycentric_ij_index(instr)];
894 assert(interpolator.enabled);
895
896 RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
897
898 auto tex = new TexInstr(TexInstr::get_gradient_h, grad, {0, 1, 7, 7}, interp, 0, 0);
899 tex->set_tex_flag(TexInstr::grad_fine);
900 tex->set_tex_flag(TexInstr::x_unnormalized);
901 tex->set_tex_flag(TexInstr::y_unnormalized);
902 tex->set_tex_flag(TexInstr::z_unnormalized);
903 tex->set_tex_flag(TexInstr::w_unnormalized);
904 emit_instruction(tex);
905
906 tex = new TexInstr(TexInstr::get_gradient_v, grad, {7, 7, 0, 1}, interp, 0, 0);
907 tex->set_tex_flag(TexInstr::x_unnormalized);
908 tex->set_tex_flag(TexInstr::y_unnormalized);
909 tex->set_tex_flag(TexInstr::z_unnormalized);
910 tex->set_tex_flag(TexInstr::w_unnormalized);
911 tex->set_tex_flag(TexInstr::grad_fine);
912 emit_instruction(tex);
913
914 auto tmp0 = vf.temp_register();
915 auto tmp1 = vf.temp_register();
916
917 emit_instruction(
918 new AluInstr(op3_muladd, tmp0, grad[0], slope[2], interpolator.j, {alu_write}));
919 emit_instruction(new AluInstr(
920 op3_muladd, tmp1, grad[1], slope[2], interpolator.i, {alu_write, alu_last_instr}));
921
922 emit_instruction(new AluInstr(op3_muladd,
923 vf.dest(instr->def, 0, pin_none),
924 grad[3],
925 slope[3],
926 tmp1,
927 {alu_write}));
928 emit_instruction(new AluInstr(op3_muladd,
929 vf.dest(instr->def, 1, pin_none),
930 grad[2],
931 slope[3],
932 tmp0,
933 {alu_write, alu_last_instr}));
934
935 return true;
936 }
937
938 bool
load_barycentric_at_offset(nir_intrinsic_instr * instr)939 FragmentShaderEG::load_barycentric_at_offset(nir_intrinsic_instr *instr)
940 {
941 auto& vf = value_factory();
942 auto interpolator = m_interpolator[barycentric_ij_index(instr)];
943
944 auto help = vf.temp_vec4(pin_group);
945 RegisterVec4 interp(interpolator.j, interpolator.i, nullptr, nullptr, pin_group);
946
947 auto getgradh =
948 new TexInstr(TexInstr::get_gradient_h, help, {0, 1, 7, 7}, interp, 0, 0);
949 getgradh->set_tex_flag(TexInstr::x_unnormalized);
950 getgradh->set_tex_flag(TexInstr::y_unnormalized);
951 getgradh->set_tex_flag(TexInstr::z_unnormalized);
952 getgradh->set_tex_flag(TexInstr::w_unnormalized);
953 getgradh->set_tex_flag(TexInstr::grad_fine);
954 emit_instruction(getgradh);
955
956 auto getgradv =
957 new TexInstr(TexInstr::get_gradient_v, help, {7, 7, 0, 1}, interp, 0, 0);
958 getgradv->set_tex_flag(TexInstr::x_unnormalized);
959 getgradv->set_tex_flag(TexInstr::y_unnormalized);
960 getgradv->set_tex_flag(TexInstr::z_unnormalized);
961 getgradv->set_tex_flag(TexInstr::w_unnormalized);
962 getgradv->set_tex_flag(TexInstr::grad_fine);
963 emit_instruction(getgradv);
964
965 auto ofs_x = vf.src(instr->src[0], 0);
966 auto ofs_y = vf.src(instr->src[0], 1);
967 auto tmp0 = vf.temp_register();
968 auto tmp1 = vf.temp_register();
969 emit_instruction(
970 new AluInstr(op3_muladd, tmp0, help[0], ofs_x, interpolator.j, {alu_write}));
971 emit_instruction(new AluInstr(
972 op3_muladd, tmp1, help[1], ofs_x, interpolator.i, {alu_write, alu_last_instr}));
973 emit_instruction(new AluInstr(
974 op3_muladd, vf.dest(instr->def, 0, pin_none), help[3], ofs_y, tmp1, {alu_write}));
975 emit_instruction(new AluInstr(op3_muladd,
976 vf.dest(instr->def, 1, pin_none),
977 help[2],
978 ofs_y,
979 tmp0,
980 {alu_write, alu_last_instr}));
981
982 return true;
983 }
984
985 bool
load_interpolated_one_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op)986 FragmentShaderEG::load_interpolated_one_comp(RegisterVec4& dest,
987 const InterpolateParams& params,
988 EAluOp op)
989 {
990 auto group = new AluGroup();
991 bool success = true;
992
993 AluInstr *ir = nullptr;
994 for (unsigned i = 0; i < 2 && success; ++i) {
995 int chan = i;
996 if (op == op2_interp_z)
997 chan += 2;
998
999 ir = new AluInstr(op,
1000 dest[chan],
1001 i & 1 ? params.j : params.i,
1002 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, chan),
1003 i == 0 ? AluInstr::write : AluInstr::last);
1004
1005 ir->set_bank_swizzle(alu_vec_210);
1006 success = group->add_instruction(ir);
1007 }
1008 ir->set_alu_flag(alu_last_instr);
1009 if (success)
1010 emit_instruction(group);
1011 return success;
1012 }
1013
1014 bool
load_interpolated_two_comp(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int writemask)1015 FragmentShaderEG::load_interpolated_two_comp(RegisterVec4& dest,
1016 const InterpolateParams& params,
1017 EAluOp op,
1018 int writemask)
1019 {
1020 auto group = new AluGroup();
1021 bool success = true;
1022
1023 AluInstr *ir = nullptr;
1024 assert(params.j);
1025 assert(params.i);
1026 for (unsigned i = 0; i < 4; ++i) {
1027 ir = new AluInstr(op,
1028 dest[i],
1029 i & 1 ? params.j : params.i,
1030 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1031 (writemask & (1 << i)) ? AluInstr::write : AluInstr::empty);
1032 ir->set_bank_swizzle(alu_vec_210);
1033 success = group->add_instruction(ir);
1034 }
1035 ir->set_alu_flag(alu_last_instr);
1036 if (success)
1037 emit_instruction(group);
1038 return success;
1039 }
1040
1041 bool
load_interpolated_two_comp_for_one(RegisterVec4 & dest,const InterpolateParams & params,EAluOp op,int comp)1042 FragmentShaderEG::load_interpolated_two_comp_for_one(RegisterVec4& dest,
1043 const InterpolateParams& params,
1044 EAluOp op,
1045 int comp)
1046 {
1047 auto group = new AluGroup();
1048 bool success = true;
1049 AluInstr *ir = nullptr;
1050
1051 for (int i = 0; i < 4; ++i) {
1052 ir = new AluInstr(op,
1053 dest[i],
1054 i & 1 ? params.j : params.i,
1055 new InlineConstant(ALU_SRC_PARAM_BASE + params.base, i),
1056 i == comp ? AluInstr::write : AluInstr::empty);
1057 ir->set_bank_swizzle(alu_vec_210);
1058 success = group->add_instruction(ir);
1059 }
1060 ir->set_alu_flag(alu_last_instr);
1061 if (success)
1062 emit_instruction(group);
1063
1064 return success;
1065 }
1066
Interpolator()1067 FragmentShaderEG::Interpolator::Interpolator():
1068 enabled(false)
1069 {
1070 }
1071
1072 } // namespace r600
1073