/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * This code is based on original work by Ilia Mirkin.
 */

/**
 * \file gen6_gs_visitor.cpp
 *
 * Gen6 geometry shader implementation
 */

#include "gen6_gs_visitor.h"
#include "brw_eu.h"

namespace brw {

void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders require allocating an initial VUE handle via an
    * FF_SYNC message. However, the documentation notes that only one thread
    * can write to the URB at a time and that the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold the required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message),
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 nir->info.gs.vertices_out);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));

   /* The FF_SYNC message requires the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));

   if (prog->info.has_transform_feedback_varyings) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is place the PrimitiveID information in r1, which is
    * always delivered as part of the payload, but it is only populated with
    * data relevant for transform feedback when we set
    * GEN6_GS_SVBI_PAYLOAD_ENABLE in the 3DSTATE_GS state packet. That
    * information can be obtained by other means though, so we can safely use
    * r1 for this purpose.
    */
   if (gs_prog_data->include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}

void
gen6_gs_visitor::gs_emit_vertex(int stream_id)
{
   this->current_annotation = "gen6 emit vertex";

   /* Buffer all output slots for this vertex in vertex_output */
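   /* Each slot is stored through a reladdr destination, so the write is
    * indirectly addressed by vertex_output_offset, which is bumped after
    * every slot.
    */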
   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
      int varying = prog_data->vue_map.slot_to_varying[slot];
      if (varying != VARYING_SLOT_PSIZ) {
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         emit_urb_slot(dst, varying);
      } else {
         /* The PSIZ slot can pack multiple varyings in different channels
          * and emit_urb_slot() will produce a MOV instruction for each of
          * them. Since we are writing to an array, that will translate to
          * possibly multiple MOV instructions with an array destination and
          * each will generate a scratch write with the same offset into
          * scratch space (thus, each one overwriting the previous). This is
          * not what we want. What we do instead is emit PSIZ to a regular
          * temporary register, then move that register into the array. This
          * way we only have one instruction with an array destination and we
          * only produce a single scratch write.
          */
         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
         emit_urb_slot(tmp, varying);
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
         inst->force_writemask_all = true;
      }

      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, brw_imm_ud(1u)));
   }

   /* Now buffer flags for this vertex */
   dst_reg dst(this->vertex_output);
   dst.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
   if (nir->info.gs.output_primitive == GL_POINTS) {
      /* If we are outputting points, then every vertex has PrimStart and
       * PrimEnd set.
       */
      emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                              URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
   } else {
      /* Otherwise, we can only set the PrimStart flag, which we have stored
       * in the first_vertex register. We will have to wait until we execute
       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
       * vertex.
       */
      emit(OR(dst, this->first_vertex,
              brw_imm_ud(gs_prog_data->output_topology <<
                         URB_WRITE_PRIM_TYPE_SHIFT)));
      emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
   }
   emit(ADD(dst_reg(this->vertex_output_offset),
            this->vertex_output_offset, brw_imm_ud(1u)));
}

void
gen6_gs_visitor::gs_end_primitive()
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
   if (nir->info.gs.output_primitive == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive and we need to set its PrimEnd flag, provided we
    * have emitted that vertex at all (vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1).
    */
   unsigned num_output_vertices = nir->info.gs.vertices_out;
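   /* The two CMPs below AND their conditions: the second CMP is predicated on
    * the flag written by the first, so the flag ends up set only where
    * vertex_count < num_output_vertices + 1 and vertex_count != 0.
    */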
   emit(CMP(dst_null_ud(), this->vertex_count,
            brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
   vec4_instruction *inst = emit(CMP(dst_null_ud(),
                                     this->vertex_count, brw_imm_ud(0u),
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));

      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));

      emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));

      /* Set the first vertex flag to indicate that the next vertex will start
       * a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
   }
   emit(BRW_OPCODE_ENDIF);
}

void
gen6_gs_visitor::emit_urb_write_header(int mrf)
{
   this->current_annotation = "gen6 urb header";
   /* Compute the offset of the flags for the current vertex in vertex_output
    * and write them in dw2 of the message header.
    *
    * Notice that by the time emit_thread_end() calls into here,
    * vertex_output_offset should point to the first data item of the current
    * vertex in vertex_output, thus we only need to add the number of output
    * slots per vertex to that offset to obtain the flags data offset.
    */
   src_reg flags_offset(this, glsl_type::uint_type);
   emit(ADD(dst_reg(flags_offset),
            this->vertex_output_offset,
            brw_imm_d(prog_data->vue_map.num_slots)));

   src_reg flags_data(this->vertex_output);
   flags_data.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));

   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
}

static int
align_interleaved_urb_mlen(int mlen)
{
   /* URB data written (does not include the message header reg) must
    * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
    * section 5.4.3.2.2: URB_INTERLEAVED.
    */
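   /* mlen includes the header register, so the data portion (mlen - 1) must
    * be even, which means mlen itself must be odd.
    */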
   if ((mlen % 2) != 1)
      mlen++;
   return mlen;
}

void
gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
                                       int last_mrf, int urb_offset)
{
   vec4_instruction *inst = NULL;

   if (!complete) {
      /* If the vertex is not complete we don't have to do anything special */
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   } else {
      /* Otherwise we always request to allocate a new VUE handle. If this is
       * the last write before the EOT message and the new handle never gets
       * used it will be dereferenced when we send the EOT message. This is
       * necessary to avoid different setups for the EOT message (one for the
       * case when there is no output and another for the case when there is),
       * which would require ending the program with an IF/ELSE/ENDIF block,
       * something we do not want.
       */
      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
      inst->dst = dst_reg(MRF, base_mrf);
      inst->src[0] = this->temp;
   }

   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
   inst->offset = urb_offset;
}

void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it has not been ended
    * when first_vertex is zero, since a vertex has been emitted after the
    * last EndPrimitive(). This is only relevant for outputs other than points
    * because in the point case we set PrimEnd on all vertices.
    */
   if (nir->info.gs.output_primitive != GL_POINTS) {
      emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      gs_end_primitive();
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 21..23.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (prog->info.has_transform_feedback_varyings) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
      emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is
             * half of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying][0].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, brw_imm_ud(1u)));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE. Same if we reached the max. message length.
                */
               if (mrf > max_usable_mrf ||
                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, brw_imm_ud(1u)));

         emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
      }
      emit(BRW_OPCODE_WHILE);

      if (prog->info.has_transform_feedback_varyings)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit the EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not. In case we did, the EOT message
    * must always include the COMPLETE flag or else the GPU hangs. If we have
    * not produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is always request a new VUE handle
    * every time we do a URB WRITE, even for the last vertex we emit. With this
    * we make sure that whether we have emitted at least one vertex or none at
    * all, we can finish the thread without writing to the URB, which works for
    * both cases by setting the COMPLETE and UNUSED flags in the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (prog->info.has_transform_feedback_varyings) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}

void
gen6_gs_visitor::setup_payload()
{
   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];

   /* Attributes are going to be interleaved, so one register contains two
    * attribute slots.
    */
   int attributes_per_reg = 2;

   /* If a geometry shader tries to read from an input that wasn't written by
    * the vertex shader, that produces undefined results, but it shouldn't
    * crash anything. So initialize attribute_map to zeros--that ensures that
    * these undefined results are read from r0.
    */
   memset(attribute_map, 0, sizeof(attribute_map));

   int reg = 0;

   /* The payload always contains important data in r0. */
   reg++;

   /* r1 is always part of the payload and it holds information relevant
    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
    * information (and move the original value to a virtual register if
    * necessary).
    */
   if (gs_prog_data->include_primitive_id)
      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_varying_inputs(reg, attributes_per_reg);

   this->first_non_payload_grf = reg;
}

void
gen6_gs_visitor::xfb_setup()
{
   static const unsigned swizzle_for_offset[4] = {
      BRW_SWIZZLE4(0, 1, 2, 3),
      BRW_SWIZZLE4(1, 2, 3, 3),
      BRW_SWIZZLE4(2, 3, 3, 3),
      BRW_SWIZZLE4(3, 3, 3, 3)
   };

   const struct gl_transform_feedback_info *linked_xfb_info =
      this->prog->sh.LinkedTransformFeedback;
   int i;

   /* Make sure that the VUE slots won't overflow the unsigned chars in
    * prog_data->transform_feedback_bindings[].
    */
   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);

   /* Make sure that we don't need more binding table entries than we've
    * set aside for use in transform feedback. (We shouldn't, since we
    * set aside enough binding table entries to have one per component).
    */
   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);

   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
      gs_prog_data->transform_feedback_bindings[i] =
         linked_xfb_info->Outputs[i].OutputRegister;
      gs_prog_data->transform_feedback_swizzles[i] =
         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
   }
}

void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;

   if (!gs_prog_data->num_transform_feedback_bindings)
      return;

   switch (gs_prog_data->output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
   emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));

   /* Check that at least one primitive can be written.
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex. So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));

   /* Compare the SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
                                        brw_imm_vf4(brw_float_to_vf(0.0),
                                                    brw_float_to_vf(1.0),
                                                    brw_float_to_vf(2.0),
                                                    brw_float_to_vf(0.0))));
      inst->force_writemask_all = true;

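      /* destination_indices now holds the per-vertex indices {0, 1, 2}; the
       * ADD below offsets them by the current SVBI to obtain the SVB
       * destination index for each vertex of the primitive.
       */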
      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
      emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}

void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   unsigned binding;
   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it.
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
   emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            gs_prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *    ensure that all writes are complete by sending the final
          *    write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying][0].type;
         data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, so increment the
             * SO primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     brw_imm_ud(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, brw_imm_ud(1u)));
         }

      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}

int
gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
{
   /* Find the output slot assigned to this varying.
    *
    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
    * as VARYING_SLOT_PSIZ.
    */
   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
      varying = VARYING_SLOT_PSIZ;
   int slot = prog_data->vue_map.varying_to_slot[varying];

   if (slot < 0) {
      /* This varying does not exist in the VUE so we are not writing to it
       * and its value is undefined. We still want to return a valid offset
       * into vertex_output though, to prevent any out-of-bounds accesses into
       * the vertex_output array. Since the value for this varying is
       * undefined we don't really care which offset we return, so any offset
       * within the limits of vertex_output will do.
       */
      slot = 0;
   }

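   /* Each vertex occupies vue_map.num_slots data items plus one flags item
    * in vertex_output (see emit_prolog), hence the num_slots + 1 stride.
    */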
   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
}

} /* namespace brw */