• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * This code is based on original work by Ilia Mirkin.
24  */
25 
26 /**
27  * \file gfx6_gs_visitor.cpp
28  *
29  * Gfx6 geometry shader implementation
30  */
31 
32 #include "gfx6_gs_visitor.h"
33 #include "brw_eu.h"
34 #include "brw_prim.h"
35 
36 namespace brw {
37 
38 void
emit_prolog()39 gfx6_gs_visitor::emit_prolog()
40 {
41    vec4_gs_visitor::emit_prolog();
42 
43    /* Gfx6 geometry shaders require to allocate an initial VUE handle via
44     * FF_SYNC message, however the documentation remarks that only one thread
45     * can write to the URB simultaneously and the FF_SYNC message provides the
46     * synchronization mechanism for this, so using this message effectively
47     * stalls the thread until it is its turn to write to the URB. Because of
48     * this, the best way to implement geometry shader algorithms in gfx6 is to
49     * execute the algorithm before the FF_SYNC message to maximize parallelism.
50     *
51     * To achieve this we buffer the geometry shader outputs for each emitted
52     * vertex in vertex_output during operation. Then, when we have processed
53     * the last vertex (that is, at thread end time), we send the FF_SYNC
54     * message to allocate the initial VUE handle and write all buffered vertex
55     * data to the URB in one go.
56     *
57     * For each emitted vertex, vertex_output will hold vue_map.num_slots
58     * data items plus one additional item to hold required flags
59     * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
60     * which come right after the data items for that vertex. Vertex data and
61     * flags for the next vertex come right after the data items and flags for
62     * the previous vertex.
63     */
64    this->current_annotation = "gfx6 prolog";
65    this->vertex_output = src_reg(this,
66                                  glsl_type::uint_type,
67                                  (prog_data->vue_map.num_slots + 1) *
68                                  nir->info.gs.vertices_out);
69    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
70    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
71 
72    /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
73     * so initialize it once to R0.
74     */
75    vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
76                                      retype(brw_vec8_grf(0, 0),
77                                             BRW_REGISTER_TYPE_UD)));
78    inst->force_writemask_all = true;
79 
80    /* This will be used as a temporary to store writeback data of FF_SYNC
81     * and URB_WRITE messages.
82     */
83    this->temp = src_reg(this, glsl_type::uint_type);
84 
85    /* This will be used to know when we are processing the first vertex of
86     * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
87     * that we are processing the first vertex in the primitive and to zero
88     * otherwise. This way we can use its value directly in the URB write
89     * headers.
90     */
91    this->first_vertex = src_reg(this, glsl_type::uint_type);
92    emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
93 
94    /* The FF_SYNC message requires to know the number of primitives generated,
95     * so keep a counter for this.
96     */
97    this->prim_count = src_reg(this, glsl_type::uint_type);
98    emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
99 
100    if (gs_prog_data->num_transform_feedback_bindings) {
101       /* Create a virtual register to hold destination indices in SOL */
102       this->destination_indices = src_reg(this, glsl_type::uvec4_type);
103       /* Create a virtual register to hold number of written primitives */
104       this->sol_prim_written = src_reg(this, glsl_type::uint_type);
105       /* Create a virtual register to hold Streamed Vertex Buffer Indices */
106       this->svbi = src_reg(this, glsl_type::uvec4_type);
107       /* Create a virtual register to hold max values of SVBI */
108       this->max_svbi = src_reg(this, glsl_type::uvec4_type);
109       emit(MOV(dst_reg(this->max_svbi),
110                src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
111    }
112 
113    /* PrimitveID is delivered in r0.1 of the thread payload. If the program
114     * needs it we have to move it to a separate register where we can map
115     * the attribute.
116     *
117     * Notice that we cannot use a virtual register for this, because we need to
118     * map all input attributes to hardware registers in setup_payload(),
119     * which happens before virtual registers are mapped to hardware registers.
120     * We could work around that issue if we were able to compute the first
121     * non-payload register here and move the PrimitiveID information to that
122     * register, but we can't because at this point we don't know the final
123     * number uniforms that will be included in the payload.
124     *
125     * So, what we do is to place PrimitiveID information in r1, which is always
126     * delivered as part of the payload, but its only populated with data
127     * relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE
128     * in the 3DSTATE_GS state packet. That information can be obtained by other
129     * means though, so we can safely use r1 for this purpose.
130     */
131    if (gs_prog_data->include_primitive_id) {
132       this->primitive_id =
133          src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
134       emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
135    }
136 }
137 
138 void
gs_emit_vertex(int stream_id)139 gfx6_gs_visitor::gs_emit_vertex(int stream_id)
140 {
141    this->current_annotation = "gfx6 emit vertex";
142 
143    /* Buffer all output slots for this vertex in vertex_output */
144    for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
145       int varying = prog_data->vue_map.slot_to_varying[slot];
146       if (varying != VARYING_SLOT_PSIZ) {
147          dst_reg dst(this->vertex_output);
148          dst.reladdr = ralloc(mem_ctx, src_reg);
149          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
150          emit_urb_slot(dst, varying);
151       } else {
152          /* The PSIZ slot can pack multiple varyings in different channels
153           * and emit_urb_slot() will produce a MOV instruction for each of
154           * them. Since we are writing to an array, that will translate to
155           * possibly multiple MOV instructions with an array destination and
156           * each will generate a scratch write with the same offset into
157           * scratch space (thus, each one overwriting the previous). This is
158           * not what we want. What we will do instead is emit PSIZ to a
159           * a regular temporary register, then move that register into the
160           * array. This way we only have one instruction with an array
161           * destination and we only produce a single scratch write.
162           */
163          dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
164          emit_urb_slot(tmp, varying);
165          dst_reg dst(this->vertex_output);
166          dst.reladdr = ralloc(mem_ctx, src_reg);
167          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
168          vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
169          inst->force_writemask_all = true;
170       }
171 
172       emit(ADD(dst_reg(this->vertex_output_offset),
173                this->vertex_output_offset, brw_imm_ud(1u)));
174    }
175 
176    /* Now buffer flags for this vertex */
177    dst_reg dst(this->vertex_output);
178    dst.reladdr = ralloc(mem_ctx, src_reg);
179    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
180    if (nir->info.gs.output_primitive == GL_POINTS) {
181       /* If we are outputting points, then every vertex has PrimStart and
182        * PrimEnd set.
183        */
184       emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
185                               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
186       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
187    } else {
188       /* Otherwise, we can only set the PrimStart flag, which we have stored
189        * in the first_vertex register. We will have to wait until we execute
190        * EndPrimitive() or we end the thread to set the PrimEnd flag on a
191        * vertex.
192        */
193       emit(OR(dst, this->first_vertex,
194               brw_imm_ud(gs_prog_data->output_topology <<
195                          URB_WRITE_PRIM_TYPE_SHIFT)));
196       emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
197    }
198    emit(ADD(dst_reg(this->vertex_output_offset),
199             this->vertex_output_offset, brw_imm_ud(1u)));
200 }
201 
202 void
gs_end_primitive()203 gfx6_gs_visitor::gs_end_primitive()
204 {
205    this->current_annotation = "gfx6 end primitive";
206    /* Calling EndPrimitive() is optional for point output. In this case we set
207     * the PrimEnd flag when we process EmitVertex().
208     */
209    if (nir->info.gs.output_primitive == GL_POINTS)
210       return;
211 
212    /* Otherwise we know that the last vertex we have processed was the last
213     * vertex in the primitive and we need to set its PrimEnd flag, so do this
214     * unless we haven't emitted that vertex at all (vertex_count != 0).
215     *
216     * Notice that we have already incremented vertex_count when we processed
217     * the last emit_vertex, so we need to take that into account in the
218     * comparison below (hence the num_output_vertices + 1 in the comparison
219     * below).
220     */
221    unsigned num_output_vertices = nir->info.gs.vertices_out;
222    emit(CMP(dst_null_ud(), this->vertex_count,
223             brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
224    vec4_instruction *inst = emit(CMP(dst_null_ud(),
225                                      this->vertex_count, brw_imm_ud(0u),
226                                      BRW_CONDITIONAL_NEQ));
227    inst->predicate = BRW_PREDICATE_NORMAL;
228    emit(IF(BRW_PREDICATE_NORMAL));
229    {
230       /* vertex_output_offset is already pointing at the first entry of the
231        * next vertex. So subtract 1 to modify the flags for the previous
232        * vertex.
233        */
234       src_reg offset(this, glsl_type::uint_type);
235       emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
236 
237       src_reg dst(this->vertex_output);
238       dst.reladdr = ralloc(mem_ctx, src_reg);
239       memcpy(dst.reladdr, &offset, sizeof(src_reg));
240 
241       emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
242       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
243 
244       /* Set the first vertex flag to indicate that the next vertex will start
245        * a primitive.
246        */
247       emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
248    }
249    emit(BRW_OPCODE_ENDIF);
250 }
251 
252 void
emit_urb_write_header(int mrf)253 gfx6_gs_visitor::emit_urb_write_header(int mrf)
254 {
255    this->current_annotation = "gfx6 urb header";
256    /* Compute offset of the flags for the current vertex in vertex_output and
257     * write them in dw2 of the message header.
258     *
259     * Notice that by the time that emit_thread_end() calls here
260     * vertex_output_offset should point to the first data item of the current
261     * vertex in vertex_output, thus we only need to add the number of output
262     * slots per vertex to that offset to obtain the flags data offset.
263     */
264    src_reg flags_offset(this, glsl_type::uint_type);
265    emit(ADD(dst_reg(flags_offset),
266             this->vertex_output_offset,
267             brw_imm_d(prog_data->vue_map.num_slots)));
268 
269    src_reg flags_data(this->vertex_output);
270    flags_data.reladdr = ralloc(mem_ctx, src_reg);
271    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
272 
273    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
274 }
275 
/**
 * Round a URB write message length up to the next odd value.
 *
 * The URB data written (which excludes the message header register) must be
 * a multiple of 256 bits, or 2 VS registers; see vol5c.5, section 5.4.3.2.2:
 * URB_INTERLEAVED. An even \p mlen is therefore bumped by one, an odd one is
 * returned unchanged.
 */
static unsigned
align_interleaved_urb_mlen(unsigned mlen)
{
   /* Setting the lowest bit adds one to even values and leaves odd values
    * as they are.
    */
   return mlen | 1u;
}
287 
288 void
emit_snb_gs_urb_write_opcode(bool complete,int base_mrf,int last_mrf,int urb_offset)289 gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf,
290                                               int last_mrf, int urb_offset)
291 {
292    vec4_instruction *inst = NULL;
293 
294    if (!complete) {
295       /* If the vertex is not complete we don't have to do anything special */
296       inst = emit(VEC4_GS_OPCODE_URB_WRITE);
297       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
298    } else {
299       /* Otherwise we always request to allocate a new VUE handle. If this is
300        * the last write before the EOT message and the new handle never gets
301        * used it will be dereferenced when we send the EOT message. This is
302        * necessary to avoid different setups for the EOT message (one for the
303        * case when there is no output and another for the case when there is)
304        * which would require to end the program with an IF/ELSE/ENDIF block,
305        * something we do not want.
306        */
307       inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE);
308       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
309       inst->dst = dst_reg(MRF, base_mrf);
310       inst->src[0] = this->temp;
311    }
312 
313    inst->base_mrf = base_mrf;
314    inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
315    inst->offset = urb_offset;
316 }
317 
318 void
emit_thread_end()319 gfx6_gs_visitor::emit_thread_end()
320 {
321    /* Make sure the current primitive is ended: we know it is not ended when
322     * first_vertex is not zero. This is only relevant for outputs other than
323     * points because in the point case we set PrimEnd on all vertices.
324     */
325    if (nir->info.gs.output_primitive != GL_POINTS) {
326       emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
327       emit(IF(BRW_PREDICATE_NORMAL));
328       gs_end_primitive();
329       emit(BRW_OPCODE_ENDIF);
330    }
331 
332    /* Here we have to:
333     * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
334     * 2) Loop over all buffered vertex data and write it to corresponding
335     *    URB entries.
336     * 3) Allocate new VUE handles for all vertices other than the first.
337     * 4) Send a final EOT message.
338     */
339 
340    /* MRF 0 is reserved for the debugger, so start with message header
341     * in MRF 1.
342     */
343    int base_mrf = 1;
344 
345    /* In the process of generating our URB write message contents, we
346     * may need to unspill a register or load from an array.  Those
347     * reads would use MRFs 21..23
348     */
349    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
350 
351    /* Issue the FF_SYNC message and obtain the initial VUE handle. */
352    this->current_annotation = "gfx6 thread end: ff_sync";
353 
354    vec4_instruction *inst = NULL;
355    if (gs_prog_data->num_transform_feedback_bindings) {
356       src_reg sol_temp(this, glsl_type::uvec4_type);
357       emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
358            dst_reg(this->svbi),
359            this->vertex_count,
360            this->prim_count,
361            sol_temp);
362       inst = emit(GS_OPCODE_FF_SYNC,
363                   dst_reg(this->temp), this->prim_count, this->svbi);
364    } else {
365       inst = emit(GS_OPCODE_FF_SYNC,
366                   dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
367    }
368    inst->base_mrf = base_mrf;
369 
370    emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
371    emit(IF(BRW_PREDICATE_NORMAL));
372    {
373       /* Loop over all buffered vertices and emit URB write messages */
374       this->current_annotation = "gfx6 thread end: urb writes init";
375       src_reg vertex(this, glsl_type::uint_type);
376       emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
377       emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
378 
379       this->current_annotation = "gfx6 thread end: urb writes";
380       emit(BRW_OPCODE_DO);
381       {
382          emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
383          inst = emit(BRW_OPCODE_BREAK);
384          inst->predicate = BRW_PREDICATE_NORMAL;
385 
386          /* First we prepare the message header */
387          emit_urb_write_header(base_mrf);
388 
389          /* Then add vertex data to the message in interleaved fashion */
390          int slot = 0;
391          bool complete = false;
392          do {
393             int mrf = base_mrf + 1;
394 
395             /* URB offset is in URB row increments, and each of our MRFs is half
396              * of one of those, since we're doing interleaved writes.
397              */
398             int urb_offset = slot / 2;
399 
400             for (; slot < prog_data->vue_map.num_slots; ++slot) {
401                int varying = prog_data->vue_map.slot_to_varying[slot];
402                current_annotation = output_reg_annotation[varying];
403 
404                /* Compute offset of this slot for the current vertex
405                 * in vertex_output
406                 */
407                src_reg data(this->vertex_output);
408                data.reladdr = ralloc(mem_ctx, src_reg);
409                memcpy(data.reladdr, &this->vertex_output_offset,
410                       sizeof(src_reg));
411 
412                /* Copy this slot to the appropriate message register */
413                dst_reg reg = dst_reg(MRF, mrf);
414                reg.type = output_reg[varying][0].type;
415                data.type = reg.type;
416                inst = emit(MOV(reg, data));
417                inst->force_writemask_all = true;
418 
419                mrf++;
420                emit(ADD(dst_reg(this->vertex_output_offset),
421                         this->vertex_output_offset, brw_imm_ud(1u)));
422 
423                /* If this was max_usable_mrf, we can't fit anything more into
424                 * this URB WRITE. Same if we reached the max. message length.
425                 */
426                if (mrf > max_usable_mrf ||
427                    align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
428                   slot++;
429                   break;
430                }
431             }
432 
433             complete = slot >= prog_data->vue_map.num_slots;
434             emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
435          } while (!complete);
436 
437          /* Skip over the flags data item so that vertex_output_offset points
438           * to the first data item of the next vertex, so that we can start
439           * writing the next vertex.
440           */
441          emit(ADD(dst_reg(this->vertex_output_offset),
442                   this->vertex_output_offset, brw_imm_ud(1u)));
443 
444          emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
445       }
446       emit(BRW_OPCODE_WHILE);
447 
448       if (gs_prog_data->num_transform_feedback_bindings)
449          xfb_write();
450    }
451    emit(BRW_OPCODE_ENDIF);
452 
453    /* Finally, emit EOT message.
454     *
455     * In gfx6 we need to end the thread differently depending on whether we have
456     * emitted at least one vertex or not. In case we did, the EOT message must
457     * always include the COMPLETE flag or else the GPU hangs. If we have not
458     * produced any output we can't use the COMPLETE flag.
459     *
460     * However, this would lead us to end the program with an ENDIF opcode,
461     * which we want to avoid, so what we do is that we always request a new
462     * VUE handle every time, even if GS produces no output.
463     * With this we make sure that whether we have emitted at least one vertex
464     * or none at all, we have to finish the thread without writing to the URB,
465     * which works for both cases by setting the COMPLETE and UNUSED flags in
466     * the EOT message.
467     */
468    this->current_annotation = "gfx6 thread end: EOT";
469 
470    if (gs_prog_data->num_transform_feedback_bindings) {
471       /* When emitting EOT, set SONumPrimsWritten Increment Value. */
472       src_reg data(this, glsl_type::uint_type);
473       emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
474       emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
475       emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
476    }
477 
478    inst = emit(GS_OPCODE_THREAD_END);
479    inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
480    inst->base_mrf = base_mrf;
481    inst->mlen = 1;
482 }
483 
484 void
setup_payload()485 gfx6_gs_visitor::setup_payload()
486 {
487    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
488 
489    /* Attributes are going to be interleaved, so one register contains two
490     * attribute slots.
491     */
492    int attributes_per_reg = 2;
493 
494    /* If a geometry shader tries to read from an input that wasn't written by
495     * the vertex shader, that produces undefined results, but it shouldn't
496     * crash anything.  So initialize attribute_map to zeros--that ensures that
497     * these undefined results are read from r0.
498     */
499    memset(attribute_map, 0, sizeof(attribute_map));
500 
501    int reg = 0;
502 
503    /* The payload always contains important data in r0. */
504    reg++;
505 
506    /* r1 is always part of the payload and it holds information relevant
507     * for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in
508     * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
509     * information (and move the original value to a virtual register if
510     * necessary).
511     */
512    if (gs_prog_data->include_primitive_id)
513       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
514    reg++;
515 
516    reg = setup_uniforms(reg);
517 
518    reg = setup_varying_inputs(reg, attributes_per_reg);
519 
520    this->first_non_payload_grf = reg;
521 }
522 
523 void
xfb_write()524 gfx6_gs_visitor::xfb_write()
525 {
526    unsigned num_verts;
527 
528    switch (gs_prog_data->output_topology) {
529    case _3DPRIM_POINTLIST:
530       num_verts = 1;
531       break;
532    case _3DPRIM_LINELIST:
533    case _3DPRIM_LINESTRIP:
534    case _3DPRIM_LINELOOP:
535       num_verts = 2;
536       break;
537    case _3DPRIM_TRILIST:
538    case _3DPRIM_TRIFAN:
539    case _3DPRIM_TRISTRIP:
540    case _3DPRIM_RECTLIST:
541       num_verts = 3;
542       break;
543    case _3DPRIM_QUADLIST:
544    case _3DPRIM_QUADSTRIP:
545    case _3DPRIM_POLYGON:
546       num_verts = 3;
547       break;
548    default:
549       unreachable("Unexpected primitive type in Gfx6 SOL program.");
550    }
551 
552    this->current_annotation = "gfx6 thread end: svb writes init";
553 
554    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
555    emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
556 
557    /* Check that at least one primitive can be written
558     *
559     * Note: since we use the binding table to keep track of buffer offsets
560     * and stride, the GS doesn't need to keep track of a separate pointer
561     * into each buffer; it uses a single pointer which increments by 1 for
562     * each vertex.  So we use SVBI0 for this pointer, regardless of whether
563     * transform feedback is in interleaved or separate attribs mode.
564     */
565    src_reg sol_temp(this, glsl_type::uvec4_type);
566    emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
567 
568    /* Compare SVBI calculated number with the maximum value, which is
569     * in R1.4 (previously saved in this->max_svbi) for gfx6.
570     */
571    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
572    emit(IF(BRW_PREDICATE_NORMAL));
573    {
574       vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
575                                         brw_imm_vf4(brw_float_to_vf(0.0),
576                                                     brw_float_to_vf(1.0),
577                                                     brw_float_to_vf(2.0),
578                                                     brw_float_to_vf(0.0))));
579       inst->force_writemask_all = true;
580 
581       emit(ADD(dst_reg(this->destination_indices),
582                this->destination_indices,
583                this->svbi));
584    }
585    emit(BRW_OPCODE_ENDIF);
586 
587    /* Write transform feedback data for all processed vertices. */
588    for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
589       emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
590       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
591                BRW_CONDITIONAL_L));
592       emit(IF(BRW_PREDICATE_NORMAL));
593       {
594          xfb_program(i, num_verts);
595       }
596       emit(BRW_OPCODE_ENDIF);
597    }
598 }
599 
600 void
xfb_program(unsigned vertex,unsigned num_verts)601 gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
602 {
603    unsigned binding;
604    unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
605    src_reg sol_temp(this, glsl_type::uvec4_type);
606 
607    /* Check for buffer overflow: we need room to write the complete primitive
608     * (all vertices). Otherwise, avoid writing any vertices for it
609     */
610    emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
611    emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
612    emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
613    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
614    emit(IF(BRW_PREDICATE_NORMAL));
615    {
616       /* Avoid overwriting MRF 1 as it is used as URB write message header */
617       dst_reg mrf_reg(MRF, 2);
618 
619       this->current_annotation = "gfx6: emit SOL vertex data";
620       /* For each vertex, generate code to output each varying using the
621        * appropriate binding table entry.
622        */
623       for (binding = 0; binding < num_bindings; ++binding) {
624          unsigned char varying =
625             gs_prog_data->transform_feedback_bindings[binding];
626 
627          /* Set up the correct destination index for this vertex */
628          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
629                                        mrf_reg,
630                                        this->destination_indices);
631          inst->sol_vertex = vertex % num_verts;
632 
633          /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
634           *
635           *   "Prior to End of Thread with a URB_WRITE, the kernel must
636           *   ensure that all writes are complete by sending the final
637           *   write as a committed write."
638           */
639          bool final_write = binding == (unsigned) num_bindings - 1 &&
640                             inst->sol_vertex == num_verts - 1;
641 
642          /* Compute offset of this varying for the current vertex
643           * in vertex_output
644           */
645          this->current_annotation = output_reg_annotation[varying];
646          src_reg data(this->vertex_output);
647          data.reladdr = ralloc(mem_ctx, src_reg);
648          int offset = get_vertex_output_offset_for_varying(vertex, varying);
649          emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
650          memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
651          data.type = output_reg[varying][0].type;
652          data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
653 
654          /* Write data */
655          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
656          inst->sol_binding = binding;
657          inst->sol_final_write = final_write;
658 
659          if (final_write) {
660             /* This is the last vertex of the primitive, then increment
661              * SO num primitive counter and destination indices.
662              */
663             emit(ADD(dst_reg(this->destination_indices),
664                      this->destination_indices,
665                      brw_imm_ud(num_verts)));
666             emit(ADD(dst_reg(this->sol_prim_written),
667                      this->sol_prim_written, brw_imm_ud(1u)));
668          }
669 
670       }
671       this->current_annotation = NULL;
672    }
673    emit(BRW_OPCODE_ENDIF);
674 }
675 
676 int
get_vertex_output_offset_for_varying(int vertex,int varying)677 gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
678 {
679    /* Find the output slot assigned to this varying.
680     *
681     * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
682     * as VARYING_SLOT_PSIZ.
683     */
684    if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
685       varying = VARYING_SLOT_PSIZ;
686    int slot = prog_data->vue_map.varying_to_slot[varying];
687 
688    if (slot < 0) {
689       /* This varying does not exist in the VUE so we are not writing to it
690        * and its value is undefined. We still want to return a valid offset
691        * into vertex_output though, to prevent any out-of-bound accesses into
692        * the vertex_output array. Since the value for this varying is undefined
693        * we don't really care for the value we assign to it, so any offset
694        * within the limits of vertex_output will do.
695        */
696       slot = 0;
697    }
698 
699    return vertex * (prog_data->vue_map.num_slots + 1) + slot;
700 }
701 
702 } /* namespace brw */
703