1 #include "sfn_vertexstageexport.h"
2
3 #include "sfn_shaderio.h"
4
5 namespace r600 {
6
7 using std::priority_queue;
8
VertexStageExportBase(VertexStage & proc)9 VertexStageExportBase::VertexStageExportBase(VertexStage& proc):
10 m_proc(proc),
11 m_cur_clip_pos(1)
12 {
13
14 }
15
~VertexStageExportBase()16 VertexStageExportBase::~VertexStageExportBase()
17 {
18
19 }
20
do_process_outputs(nir_variable * output)21 bool VertexStageExportBase::do_process_outputs(nir_variable *output)
22 {
23 return true;
24 }
25
emit_shader_start()26 void VertexStageExportBase::emit_shader_start()
27 {
28
29 }
30
scan_store_output(nir_intrinsic_instr * instr)31 void VertexStageExportBase::scan_store_output(nir_intrinsic_instr* instr)
32 {
33
34 }
35
store_output(nir_intrinsic_instr * instr)36 bool VertexStageExportBase::store_output(nir_intrinsic_instr* instr)
37 {
38 auto index = nir_src_as_const_value(instr->src[1]);
39 assert(index && "Indirect outputs not supported");
40
41 const store_loc store_info = {
42 nir_intrinsic_component(instr),
43 nir_intrinsic_io_semantics(instr).location,
44 (unsigned)nir_intrinsic_base(instr) + index->u32,
45 0
46 };
47
48 return do_store_output(store_info, instr);
49 }
50
VertexStageExportForFS(VertexStage & proc,const pipe_stream_output_info * so_info,r600_pipe_shader * pipe_shader,const r600_shader_key & key)51 VertexStageExportForFS::VertexStageExportForFS(VertexStage& proc,
52 const pipe_stream_output_info *so_info,
53 r600_pipe_shader *pipe_shader, const r600_shader_key &key):
54 VertexStageWithOutputInfo(proc),
55 m_last_param_export(nullptr),
56 m_last_pos_export(nullptr),
57 m_num_clip_dist(0),
58 m_enabled_stream_buffers_mask(0),
59 m_so_info(so_info),
60 m_pipe_shader(pipe_shader),
61 m_key(key)
62 {
63 }
64
do_process_outputs(nir_variable * output)65 bool VertexStageWithOutputInfo::do_process_outputs(nir_variable *output)
66 {
67 if (output->data.location == VARYING_SLOT_COL0 ||
68 output->data.location == VARYING_SLOT_COL1 ||
69 (output->data.location >= VARYING_SLOT_VAR0 &&
70 output->data.location <= VARYING_SLOT_VAR31) ||
71 (output->data.location >= VARYING_SLOT_TEX0 &&
72 output->data.location <= VARYING_SLOT_TEX7) ||
73 output->data.location == VARYING_SLOT_BFC0 ||
74 output->data.location == VARYING_SLOT_BFC1 ||
75 output->data.location == VARYING_SLOT_CLIP_VERTEX ||
76 output->data.location == VARYING_SLOT_CLIP_DIST0 ||
77 output->data.location == VARYING_SLOT_CLIP_DIST1 ||
78 output->data.location == VARYING_SLOT_POS ||
79 output->data.location == VARYING_SLOT_PSIZ ||
80 output->data.location == VARYING_SLOT_FOGC ||
81 output->data.location == VARYING_SLOT_LAYER ||
82 output->data.location == VARYING_SLOT_EDGE ||
83 output->data.location == VARYING_SLOT_VIEWPORT
84 ) {
85
86 r600_shader_io& io = m_proc.sh_info().output[output->data.driver_location];
87 auto semantic = r600_get_varying_semantic(output->data.location);
88 io.name = semantic.first;
89 io.sid = semantic.second;
90
91 m_proc.evaluate_spi_sid(io);
92 io.write_mask = ((1 << glsl_get_components(output->type)) - 1)
93 << output->data.location_frac;
94 ++m_proc.sh_info().noutput;
95
96 if (output->data.location == VARYING_SLOT_PSIZ ||
97 output->data.location == VARYING_SLOT_EDGE ||
98 output->data.location == VARYING_SLOT_LAYER) // VIEWPORT?
99 m_cur_clip_pos = 2;
100
101 if (output->data.location != VARYING_SLOT_POS &&
102 output->data.location != VARYING_SLOT_EDGE &&
103 output->data.location != VARYING_SLOT_PSIZ &&
104 output->data.location != VARYING_SLOT_CLIP_VERTEX)
105 m_param_driver_locations.push(output->data.driver_location);
106
107 return true;
108 }
109 return false;
110 }
111
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)112 bool VertexStageExportForFS::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
113 {
114 switch (store_info.location) {
115 case VARYING_SLOT_PSIZ:
116 m_proc.sh_info().vs_out_point_size = 1;
117 m_proc.sh_info().vs_out_misc_write = 1;
118 FALLTHROUGH;
119 case VARYING_SLOT_POS:
120 return emit_varying_pos(store_info, instr);
121 case VARYING_SLOT_EDGE: {
122 std::array<uint32_t, 4> swizzle_override = {7 ,0, 7, 7};
123 return emit_varying_pos(store_info, instr, &swizzle_override);
124 }
125 case VARYING_SLOT_VIEWPORT: {
126 std::array<uint32_t, 4> swizzle_override = {7, 7, 7, 0};
127 return emit_varying_pos(store_info, instr, &swizzle_override) &&
128 emit_varying_param(store_info, instr);
129 }
130 case VARYING_SLOT_CLIP_VERTEX:
131 return emit_clip_vertices(store_info, instr);
132 case VARYING_SLOT_CLIP_DIST0:
133 case VARYING_SLOT_CLIP_DIST1:
134 m_num_clip_dist += 4;
135 return emit_varying_param(store_info, instr) && emit_varying_pos(store_info, instr);
136 case VARYING_SLOT_LAYER: {
137 m_proc.sh_info().vs_out_misc_write = 1;
138 m_proc.sh_info().vs_out_layer = 1;
139 std::array<uint32_t, 4> swz = {7,7,0,7};
140 return emit_varying_pos(store_info, instr, &swz) &&
141 emit_varying_param(store_info, instr);
142 }
143 case VARYING_SLOT_VIEW_INDEX:
144 return emit_varying_pos(store_info, instr) &&
145 emit_varying_param(store_info, instr);
146
147 default:
148 return emit_varying_param(store_info, instr);
149 }
150
151 fprintf(stderr, "r600-NIR: Unimplemented store_deref for %d\n",
152 store_info.location);
153 return false;
154 }
155
emit_varying_pos(const store_loc & store_info,nir_intrinsic_instr * instr,std::array<uint32_t,4> * swizzle_override)156 bool VertexStageExportForFS::emit_varying_pos(const store_loc &store_info, nir_intrinsic_instr* instr,
157 std::array<uint32_t, 4> *swizzle_override)
158 {
159 std::array<uint32_t,4> swizzle;
160 uint32_t write_mask = 0;
161
162 if (swizzle_override) {
163 swizzle = *swizzle_override;
164 for (int i = 0; i < 4; ++i) {
165 if (swizzle[i] < 6)
166 write_mask |= 1 << i;
167 }
168 } else {
169 write_mask = nir_intrinsic_write_mask(instr) << store_info.frac;
170 for (int i = 0; i < 4; ++i)
171 swizzle[i] = ((1 << i) & write_mask) ? i - store_info.frac : 7;
172 }
173
174 m_proc.sh_info().output[store_info.driver_location].write_mask = write_mask;
175
176 GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask, swizzle);
177 m_proc.set_output(store_info.driver_location, value.sel());
178
179 int export_slot = 0;
180
181 switch (store_info.location) {
182 case VARYING_SLOT_EDGE: {
183 m_proc.sh_info().vs_out_misc_write = 1;
184 m_proc.sh_info().vs_out_edgeflag = 1;
185 m_proc.emit_instruction(op1_mov, value.reg_i(1), {value.reg_i(1)}, {alu_write, alu_dst_clamp, alu_last_instr});
186 m_proc.emit_instruction(op1_flt_to_int, value.reg_i(1), {value.reg_i(1)}, {alu_write, alu_last_instr});
187 m_proc.sh_info().output[store_info.driver_location].write_mask = 0xf;
188 }
189 FALLTHROUGH;
190 case VARYING_SLOT_PSIZ:
191 case VARYING_SLOT_LAYER:
192 export_slot = 1;
193 break;
194 case VARYING_SLOT_VIEWPORT:
195 m_proc.sh_info().vs_out_misc_write = 1;
196 m_proc.sh_info().vs_out_viewport = 1;
197 export_slot = 1;
198 break;
199 case VARYING_SLOT_POS:
200 break;
201 case VARYING_SLOT_CLIP_DIST0:
202 case VARYING_SLOT_CLIP_DIST1:
203 export_slot = m_cur_clip_pos++;
204 break;
205 default:
206 sfn_log << SfnLog::err << __func__ << "Unsupported location "
207 << store_info.location << "\n";
208 return false;
209 }
210
211 m_last_pos_export = new ExportInstruction(export_slot, value, ExportInstruction::et_pos);
212 m_proc.emit_export_instruction(m_last_pos_export);
213 m_proc.add_param_output_reg(store_info.driver_location, m_last_pos_export->gpr_ptr());
214 return true;
215 }
216
emit_varying_param(const store_loc & store_info,nir_intrinsic_instr * instr)217 bool VertexStageExportForFS::emit_varying_param(const store_loc &store_info, nir_intrinsic_instr* instr)
218 {
219 assert(store_info.driver_location < m_proc.sh_info().noutput);
220 sfn_log << SfnLog::io << __func__ << ": emit DDL: " << store_info.driver_location << "\n";
221
222 int write_mask = nir_intrinsic_write_mask(instr) << store_info.frac;
223 std::array<uint32_t,4> swizzle;
224 for (int i = 0; i < 4; ++i)
225 swizzle[i] = ((1 << i) & write_mask) ? i - store_info.frac : 7;
226
227 //m_proc.sh_info().output[store_info.driver_location].write_mask = write_mask;
228
229 GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask, swizzle, true);
230 m_proc.sh_info().output[store_info.driver_location].gpr = value.sel();
231
232 /* This should use the registers!! */
233 m_proc.set_output(store_info.driver_location, value.sel());
234
235 m_last_param_export = new ExportInstruction(param_id(store_info.driver_location),
236 value, ExportInstruction::et_param);
237 m_proc.emit_export_instruction(m_last_param_export);
238 m_proc.add_param_output_reg(store_info.driver_location, m_last_param_export->gpr_ptr());
239 return true;
240 }
241
emit_clip_vertices(const store_loc & store_info,nir_intrinsic_instr * instr)242 bool VertexStageExportForFS::emit_clip_vertices(const store_loc &store_info, nir_intrinsic_instr* instr)
243 {
244 m_proc.sh_info().cc_dist_mask = 0xff;
245 m_proc.sh_info().clip_dist_write = 0xff;
246
247 m_clip_vertex = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], 0xf, {0,1,2,3});
248 m_proc.add_param_output_reg(store_info.driver_location, &m_clip_vertex);
249
250 for (int i = 0; i < 4; ++i)
251 m_proc.sh_info().output[store_info.driver_location].write_mask |= 1 << i;
252
253 GPRVector clip_dist[2] = { m_proc.get_temp_vec4(), m_proc.get_temp_vec4()};
254
255 for (int i = 0; i < 8; i++) {
256 int oreg = i >> 2;
257 int ochan = i & 3;
258 AluInstruction *ir = nullptr;
259 for (int j = 0; j < 4; j++) {
260 ir = new AluInstruction(op2_dot4_ieee, clip_dist[oreg].reg_i(j), m_clip_vertex.reg_i(j),
261 PValue(new UniformValue(512 + i, j, R600_BUFFER_INFO_CONST_BUFFER)),
262 (j == ochan) ? EmitInstruction::write : EmitInstruction::empty);
263 m_proc.emit_instruction(ir);
264 }
265 ir->set_flag(alu_last_instr);
266 }
267
268 m_last_pos_export = new ExportInstruction(m_cur_clip_pos++, clip_dist[0], ExportInstruction::et_pos);
269 m_proc.emit_export_instruction(m_last_pos_export);
270
271 m_last_pos_export = new ExportInstruction(m_cur_clip_pos, clip_dist[1], ExportInstruction::et_pos);
272 m_proc.emit_export_instruction(m_last_pos_export);
273
274 return true;
275 }
276
VertexStageWithOutputInfo(VertexStage & proc)277 VertexStageWithOutputInfo::VertexStageWithOutputInfo(VertexStage& proc):
278 VertexStageExportBase(proc),
279 m_current_param(0)
280 {
281
282 }
283
scan_store_output(nir_intrinsic_instr * instr)284 void VertexStageWithOutputInfo::scan_store_output(nir_intrinsic_instr* instr)
285 {
286 auto location = nir_intrinsic_io_semantics(instr).location;
287 auto driver_location = nir_intrinsic_base(instr);
288 auto index = nir_src_as_const_value(instr->src[1]);
289 assert(index);
290
291 unsigned noutputs = driver_location + index->u32 + 1;
292 if (m_proc.sh_info().noutput < noutputs)
293 m_proc.sh_info().noutput = noutputs;
294
295 r600_shader_io& io = m_proc.sh_info().output[driver_location + index->u32];
296 auto semantic = r600_get_varying_semantic(location + index->u32);
297 io.name = semantic.first;
298 io.sid = semantic.second;
299 m_proc.evaluate_spi_sid(io);
300 io.write_mask = nir_intrinsic_write_mask(instr);
301
302 if (location == VARYING_SLOT_PSIZ ||
303 location == VARYING_SLOT_EDGE ||
304 location == VARYING_SLOT_LAYER) // VIEWPORT?
305 m_cur_clip_pos = 2;
306
307 if (location != VARYING_SLOT_POS &&
308 location != VARYING_SLOT_EDGE &&
309 location != VARYING_SLOT_PSIZ &&
310 location != VARYING_SLOT_CLIP_VERTEX) {
311 m_param_driver_locations.push(driver_location + index->u32);
312 }
313 }
314
param_id(unsigned driver_location)315 unsigned VertexStageWithOutputInfo::param_id(unsigned driver_location)
316 {
317 auto param_loc = m_param_map.find(driver_location);
318 assert(param_loc != m_param_map.end());
319 return param_loc->second;
320 }
321
emit_shader_start()322 void VertexStageWithOutputInfo::emit_shader_start()
323 {
324 while (!m_param_driver_locations.empty()) {
325 auto loc = m_param_driver_locations.top();
326 m_param_driver_locations.pop();
327 m_param_map[loc] = m_current_param++;
328 }
329 }
330
current_param() const331 unsigned VertexStageWithOutputInfo::current_param() const
332 {
333 return m_current_param;
334 }
335
finalize_exports()336 void VertexStageExportForFS::finalize_exports()
337 {
338 if (m_key.vs.as_gs_a) {
339 PValue o(new GPRValue(0,PIPE_SWIZZLE_0));
340 GPRVector primid({m_proc.primitive_id(), o,o,o});
341 m_last_param_export = new ExportInstruction(current_param(), primid, ExportInstruction::et_param);
342 m_proc.emit_export_instruction(m_last_param_export);
343 int i;
344 i = m_proc.sh_info().noutput++;
345 auto& io = m_proc.sh_info().output[i];
346 io.name = TGSI_SEMANTIC_PRIMID;
347 io.sid = 0;
348 io.gpr = 0;
349 io.interpolate = TGSI_INTERPOLATE_CONSTANT;
350 io.write_mask = 0x1;
351 io.spi_sid = m_key.vs.prim_id_out;
352 m_proc.sh_info().vs_as_gs_a = 1;
353 }
354
355 if (m_so_info && m_so_info->num_outputs)
356 emit_stream(-1);
357
358 m_pipe_shader->enabled_stream_buffers_mask = m_enabled_stream_buffers_mask;
359
360 if (!m_last_param_export) {
361 GPRVector value(0,{7,7,7,7});
362 m_last_param_export = new ExportInstruction(0, value, ExportInstruction::et_param);
363 m_proc.emit_export_instruction(m_last_param_export);
364 }
365 m_last_param_export->set_last();
366
367 if (!m_last_pos_export) {
368 GPRVector value(0,{7,7,7,7});
369 m_last_pos_export = new ExportInstruction(0, value, ExportInstruction::et_pos);
370 m_proc.emit_export_instruction(m_last_pos_export);
371 }
372 m_last_pos_export->set_last();
373 }
374
emit_stream(int stream)375 bool VertexStageExportForFS::emit_stream(int stream)
376 {
377 assert(m_so_info);
378 if (m_so_info->num_outputs > PIPE_MAX_SO_OUTPUTS) {
379 R600_ERR("Too many stream outputs: %d\n", m_so_info->num_outputs);
380 return false;
381 }
382 for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
383 if (m_so_info->output[i].output_buffer >= 4) {
384 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
385 m_so_info->output[i].output_buffer);
386 return false;
387 }
388 }
389 const GPRVector *so_gpr[PIPE_MAX_SHADER_OUTPUTS];
390 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
391 std::vector<GPRVector> tmp(m_so_info->num_outputs);
392
393 /* Initialize locations where the outputs are stored. */
394 for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
395 if (stream != -1 && stream != m_so_info->output[i].stream)
396 continue;
397
398 sfn_log << SfnLog::instr << "Emit stream " << i
399 << " with register index " << m_so_info->output[i].register_index << " so_gpr:";
400
401
402 so_gpr[i] = m_proc.output_register(m_so_info->output[i].register_index);
403
404 if (!so_gpr[i]) {
405 sfn_log << SfnLog::err << "\nERR: register index "
406 << m_so_info->output[i].register_index
407 << " doesn't correspond to an output register\n";
408 return false;
409 }
410 start_comp[i] = m_so_info->output[i].start_component;
411 /* Lower outputs with dst_offset < start_component.
412 *
413 * We can only output 4D vectors with a write mask, e.g. we can
414 * only output the W component at offset 3, etc. If we want
415 * to store Y, Z, or W at buffer offset 0, we need to use MOV
416 * to move it to X and output X. */
417 if (m_so_info->output[i].dst_offset < m_so_info->output[i].start_component) {
418
419 GPRVector::Swizzle swizzle = {0,1,2,3};
420 for (auto j = m_so_info->output[i].num_components; j < 4; ++j)
421 swizzle[j] = 7;
422 tmp[i] = m_proc.get_temp_vec4(swizzle);
423
424 int sc = m_so_info->output[i].start_component;
425 AluInstruction *alu = nullptr;
426 for (int j = 0; j < m_so_info->output[i].num_components; j++) {
427 alu = new AluInstruction(op1_mov, tmp[i][j], so_gpr[i]->reg_i(j + sc), {alu_write});
428 m_proc.emit_instruction(alu);
429 }
430 if (alu)
431 alu->set_flag(alu_last_instr);
432
433 start_comp[i] = 0;
434 so_gpr[i] = &tmp[i];
435 }
436 sfn_log << SfnLog::instr << *so_gpr[i] << "\n";
437 }
438
439 /* Write outputs to buffers. */
440 for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
441 sfn_log << SfnLog::instr << "Write output buffer " << i
442 << " with register index " << m_so_info->output[i].register_index << "\n";
443
444 StreamOutIntruction *out_stream =
445 new StreamOutIntruction(*so_gpr[i],
446 m_so_info->output[i].num_components,
447 m_so_info->output[i].dst_offset - start_comp[i],
448 ((1 << m_so_info->output[i].num_components) - 1) << start_comp[i],
449 m_so_info->output[i].output_buffer,
450 m_so_info->output[i].stream);
451 m_proc.emit_export_instruction(out_stream);
452 m_enabled_stream_buffers_mask |= (1 << m_so_info->output[i].output_buffer) << m_so_info->output[i].stream * 4;
453 }
454 return true;
455 }
456
457
VertexStageExportForGS(VertexStage & proc,const r600_shader * gs_shader)458 VertexStageExportForGS::VertexStageExportForGS(VertexStage &proc,
459 const r600_shader *gs_shader):
460 VertexStageWithOutputInfo(proc),
461 m_num_clip_dist(0),
462 m_gs_shader(gs_shader)
463 {
464
465 }
466
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)467 bool VertexStageExportForGS::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
468 {
469 int ring_offset = -1;
470 const r600_shader_io& out_io = m_proc.sh_info().output[store_info.driver_location];
471
472 sfn_log << SfnLog::io << "check output " << store_info.driver_location
473 << " name=" << out_io.name<< " sid=" << out_io.sid << "\n";
474 for (unsigned k = 0; k < m_gs_shader->ninput; ++k) {
475 auto& in_io = m_gs_shader->input[k];
476 sfn_log << SfnLog::io << " against " << k << " name=" << in_io.name<< " sid=" << in_io.sid << "\n";
477
478 if (in_io.name == out_io.name &&
479 in_io.sid == out_io.sid) {
480 ring_offset = in_io.ring_offset;
481 break;
482 }
483 }
484
485 if (store_info.location == VARYING_SLOT_VIEWPORT) {
486 m_proc.sh_info().vs_out_viewport = 1;
487 m_proc.sh_info().vs_out_misc_write = 1;
488 return true;
489 }
490
491 if (ring_offset == -1) {
492 sfn_log << SfnLog::err << "VS defines output at "
493 << store_info.driver_location << "name=" << out_io.name
494 << " sid=" << out_io.sid << " that is not consumed as GS input\n";
495 return true;
496 }
497
498 uint32_t write_mask = (1 << instr->num_components) - 1;
499
500 GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask,
501 swizzle_from_comps(instr->num_components), true);
502
503 auto ir = new MemRingOutIntruction(cf_mem_ring, mem_write, value,
504 ring_offset >> 2, 4, PValue());
505 m_proc.emit_export_instruction(ir);
506
507 m_proc.sh_info().output[store_info.driver_location].write_mask |= write_mask;
508 if (store_info.location == VARYING_SLOT_CLIP_DIST0 ||
509 store_info.location == VARYING_SLOT_CLIP_DIST1)
510 m_num_clip_dist += 4;
511
512 return true;
513 }
514
finalize_exports()515 void VertexStageExportForGS::finalize_exports()
516 {
517
518 }
519
VertexStageExportForES(VertexStage & proc)520 VertexStageExportForES::VertexStageExportForES(VertexStage& proc):
521 VertexStageExportBase(proc)
522 {
523 }
524
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)525 bool VertexStageExportForES::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
526 {
527 return true;
528 }
529
finalize_exports()530 void VertexStageExportForES::finalize_exports()
531 {
532
533 }
534
535 }
536