• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 #include "brw_compiler.h"
33 #include "brw_eu.h"
34 #include "brw_prim.h"
35 
36 #include "dev/intel_debug.h"
37 
38 #define MAX_GS_VERTS (4)
39 
40 struct brw_ff_gs_compile {
41    struct brw_codegen func;
42    struct brw_ff_gs_prog_key key;
43    struct brw_ff_gs_prog_data *prog_data;
44 
45    struct {
46       struct brw_reg R0;
47 
48       /**
49        * Register holding streamed vertex buffer pointers -- see the Sandy
50        * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
51        * [DevSNB]).  These pointers are delivered in GRF 1.
52        */
53       struct brw_reg SVBI;
54 
55       struct brw_reg vertex[MAX_GS_VERTS];
56       struct brw_reg header;
57       struct brw_reg temp;
58 
59       /**
60        * Register holding destination indices for streamed buffer writes.
61        * Only used for SOL programs.
62        */
63       struct brw_reg destination_indices;
64    } reg;
65 
66    /* Number of registers used to store vertex data */
67    GLuint nr_regs;
68 
69    struct brw_vue_map vue_map;
70 };
71 
72 /**
73  * Allocate registers for GS.
74  *
75  * If sol_program is true, then:
76  *
77  * - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
78  *   1 needs to be set aside to hold the streamed vertex buffer indices.
79  *
80  * - The thread will need to use the destination_indices register.
81  */
brw_ff_gs_alloc_regs(struct brw_ff_gs_compile * c,GLuint nr_verts,bool sol_program)82 static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c,
83                                  GLuint nr_verts,
84                                  bool sol_program)
85 {
86    GLuint i = 0,j;
87 
88    /* Register usage is static, precompute here:
89     */
90    c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
91 
92    /* Streamed vertex buffer indices */
93    if (sol_program)
94       c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
95 
96    /* Payload vertices plus space for more generated vertices:
97     */
98    for (j = 0; j < nr_verts; j++) {
99       c->reg.vertex[j] = brw_vec4_grf(i, 0);
100       i += c->nr_regs;
101    }
102 
103    c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
104    c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
105 
106    if (sol_program) {
107       c->reg.destination_indices =
108          retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD);
109    }
110 
111    c->prog_data->urb_read_length = c->nr_regs;
112    c->prog_data->total_grf = i;
113 }
114 
115 
116 /**
117  * Set up the initial value of c->reg.header register based on c->reg.R0.
118  *
119  * The following information is passed to the GS thread in R0, and needs to be
120  * included in the first URB_WRITE or FF_SYNC message sent by the GS:
121  *
122  * - DWORD 0 [31:0] handle info (Gen4 only)
123  * - DWORD 5 [7:0] FFTID
124  * - DWORD 6 [31:0] Debug info
125  * - DWORD 7 [31:0] Debug info
126  *
127  * This function sets up the above data by copying by copying the contents of
128  * R0 to the header register.
129  */
brw_ff_gs_initialize_header(struct brw_ff_gs_compile * c)130 static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c)
131 {
132    struct brw_codegen *p = &c->func;
133    brw_MOV(p, c->reg.header, c->reg.R0);
134 }
135 
136 /**
137  * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
138  *
139  * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
140  * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
141  * need to be able to update on a per-vertex basis.
142  */
brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile * c,unsigned dw2)143 static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c,
144                                            unsigned dw2)
145 {
146    struct brw_codegen *p = &c->func;
147    brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
148 }
149 
150 /**
151  * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
152  *
153  * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
154  * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
155  * DWORD 2.  So this function extracts the primitive type field, bitshifts it
156  * appropriately, and stores it in c->reg.header.
157  */
brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile * c)158 static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c)
159 {
160    struct brw_codegen *p = &c->func;
161    brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
162            brw_imm_ud(0x1f));
163    brw_SHL(p, get_element_ud(c->reg.header, 2),
164            get_element_ud(c->reg.header, 2), brw_imm_ud(2));
165 }
166 
167 /**
168  * Apply an additive offset to DWORD 2 of c->reg.header.
169  *
170  * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
171  * for each vertex.
172  */
brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile * c,int offset)173 static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c,
174                                         int offset)
175 {
176    struct brw_codegen *p = &c->func;
177    brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
178            brw_imm_d(offset));
179 }
180 
181 
182 /**
183  * Emit a vertex using the URB_WRITE message.  Use the contents of
184  * c->reg.header for the message header, and the registers starting at \c vert
185  * for the vertex data.
186  *
187  * If \c last is true, then this is the last vertex, so no further URB space
188  * should be allocated, and this message should end the thread.
189  *
190  * If \c last is false, then a new URB entry will be allocated, and its handle
191  * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
192  * message.
193  */
brw_ff_gs_emit_vue(struct brw_ff_gs_compile * c,struct brw_reg vert,bool last)194 static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
195                                struct brw_reg vert,
196                                bool last)
197 {
198    struct brw_codegen *p = &c->func;
199    int write_offset = 0;
200    bool complete = false;
201 
202    do {
203       /* We can't write more than 14 registers at a time to the URB */
204       int write_len = MIN2(c->nr_regs - write_offset, 14);
205       if (write_len == c->nr_regs - write_offset)
206          complete = true;
207 
208       /* Copy the vertex from vertn into m1..mN+1:
209        */
210       brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);
211 
212       /* Send the vertex data to the URB.  If this is the last write for this
213        * vertex, then we mark it as complete, and either end the thread or
214        * allocate another vertex URB entry (depending whether this is the last
215        * vertex).
216        */
217       enum brw_urb_write_flags flags;
218       if (!complete)
219          flags = BRW_URB_WRITE_NO_FLAGS;
220       else if (last)
221          flags = BRW_URB_WRITE_EOT_COMPLETE;
222       else
223          flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
224       brw_urb_WRITE(p,
225                     (flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
226                     : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
227                     0,
228                     c->reg.header,
229                     flags,
230                     write_len + 1, /* msg length */
231                     (flags & BRW_URB_WRITE_ALLOCATE) ? 1
232                     : 0, /* response length */
233                     write_offset,  /* urb offset */
234                     BRW_URB_SWIZZLE_NONE);
235       write_offset += write_len;
236    } while (!complete);
237 
238    if (!last) {
239       brw_MOV(p, get_element_ud(c->reg.header, 0),
240               get_element_ud(c->reg.temp, 0));
241    }
242 }
243 
244 /**
245  * Send an FF_SYNC message to ensure that all previously spawned GS threads
246  * have finished sending primitives down the pipeline, and to allocate a URB
247  * entry for the first output vertex.  Only needed on Ironlake+.
248  *
249  * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
250  * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
251  * the allocated URB entry (which will be needed by the URB_WRITE meesage that
252  * follows).
253  */
brw_ff_gs_ff_sync(struct brw_ff_gs_compile * c,int num_prim)254 static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim)
255 {
256    struct brw_codegen *p = &c->func;
257 
258    brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
259    brw_ff_sync(p,
260                c->reg.temp,
261                0,
262                c->reg.header,
263                1, /* allocate */
264                1, /* response length */
265                0 /* eot */);
266    brw_MOV(p, get_element_ud(c->reg.header, 0),
267            get_element_ud(c->reg.temp, 0));
268 }
269 
270 
271 static void
brw_ff_gs_quads(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key)272 brw_ff_gs_quads(struct brw_ff_gs_compile *c,
273 		const struct brw_ff_gs_prog_key *key)
274 {
275    brw_ff_gs_alloc_regs(c, 4, false);
276    brw_ff_gs_initialize_header(c);
277    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
278     * is the PV for quads, but vertex 0 for polygons:
279     */
280    if (c->func.devinfo->ver == 5)
281       brw_ff_gs_ff_sync(c, 1);
282    brw_ff_gs_overwrite_header_dw2(
283       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
284           | URB_WRITE_PRIM_START));
285    if (key->pv_first) {
286       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
287       brw_ff_gs_overwrite_header_dw2(
288          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
289       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
290       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
291       brw_ff_gs_overwrite_header_dw2(
292          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
293              | URB_WRITE_PRIM_END));
294       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
295    }
296    else {
297       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
298       brw_ff_gs_overwrite_header_dw2(
299          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
300       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
301       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
302       brw_ff_gs_overwrite_header_dw2(
303          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
304              | URB_WRITE_PRIM_END));
305       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
306    }
307 }
308 
309 static void
brw_ff_gs_quad_strip(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key)310 brw_ff_gs_quad_strip(struct brw_ff_gs_compile *c,
311                      const struct brw_ff_gs_prog_key *key)
312 {
313    brw_ff_gs_alloc_regs(c, 4, false);
314    brw_ff_gs_initialize_header(c);
315 
316    if (c->func.devinfo->ver == 5)
317       brw_ff_gs_ff_sync(c, 1);
318    brw_ff_gs_overwrite_header_dw2(
319       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
320           | URB_WRITE_PRIM_START));
321    if (key->pv_first) {
322       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
323       brw_ff_gs_overwrite_header_dw2(
324          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
325       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
326       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
327       brw_ff_gs_overwrite_header_dw2(
328          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
329              | URB_WRITE_PRIM_END));
330       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
331    }
332    else {
333       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
334       brw_ff_gs_overwrite_header_dw2(
335          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
336       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
337       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
338       brw_ff_gs_overwrite_header_dw2(
339          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
340              | URB_WRITE_PRIM_END));
341       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
342    }
343 }
344 
brw_ff_gs_lines(struct brw_ff_gs_compile * c)345 static void brw_ff_gs_lines(struct brw_ff_gs_compile *c)
346 {
347    brw_ff_gs_alloc_regs(c, 2, false);
348    brw_ff_gs_initialize_header(c);
349 
350    if (c->func.devinfo->ver == 5)
351       brw_ff_gs_ff_sync(c, 1);
352    brw_ff_gs_overwrite_header_dw2(
353       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
354           | URB_WRITE_PRIM_START));
355    brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
356    brw_ff_gs_overwrite_header_dw2(
357       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
358           | URB_WRITE_PRIM_END));
359    brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
360 }
361 
362 /**
363  * Generate the geometry shader program used on Gen6 to perform stream output
364  * (transform feedback).
365  */
366 static void
gfx6_sol_program(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key,unsigned num_verts,bool check_edge_flags)367 gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key,
368                  unsigned num_verts, bool check_edge_flags)
369 {
370    struct brw_codegen *p = &c->func;
371    brw_inst *inst;
372    c->prog_data->svbi_postincrement_value = num_verts;
373 
374    brw_ff_gs_alloc_regs(c, num_verts, true);
375    brw_ff_gs_initialize_header(c);
376 
377    if (key->num_transform_feedback_bindings > 0) {
378       unsigned vertex, binding;
379       struct brw_reg destination_indices_uw =
380          vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
381 
382       /* Note: since we use the binding table to keep track of buffer offsets
383        * and stride, the GS doesn't need to keep track of a separate pointer
384        * into each buffer; it uses a single pointer which increments by 1 for
385        * each vertex.  So we use SVBI0 for this pointer, regardless of whether
386        * transform feedback is in interleaved or separate attribs mode.
387        *
388        * Make sure that the buffers have enough room for all the vertices.
389        */
390       brw_ADD(p, get_element_ud(c->reg.temp, 0),
391                  get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
392       brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
393                  get_element_ud(c->reg.temp, 0),
394                  get_element_ud(c->reg.SVBI, 4));
395       brw_IF(p, BRW_EXECUTE_1);
396 
397       /* Compute the destination indices to write to.  Usually we use SVBI[0]
398        * + (0, 1, 2).  However, for odd-numbered triangles in tristrips, the
399        * vertices come down the pipeline in reversed winding order, so we need
400        * to flip the order when writing to the transform feedback buffer.  To
401        * ensure that flatshading accuracy is preserved, we need to write them
402        * in order SVBI[0] + (0, 2, 1) if we're using the first provoking
403        * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
404        * the last provoking vertex convention.
405        *
406        * Note: since brw_imm_v can only be used in instructions in
407        * packed-word execution mode, and SVBI is a double-word, we need to
408        * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
409        * or (1, 0, 2)) to the destination_indices register, and then add SVBI
410        * using a separate instruction.  Also, since the immediate constant is
411        * expressed as packed words, and we need to load double-words into
412        * destination_indices, we need to intersperse zeros to fill the upper
413        * halves of each double-word.
414        */
415       brw_MOV(p, destination_indices_uw,
416               brw_imm_v(0x00020100)); /* (0, 1, 2) */
417       if (num_verts == 3) {
418          /* Get primitive type into temp register. */
419          brw_AND(p, get_element_ud(c->reg.temp, 0),
420                  get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));
421 
422          /* Test if primitive type is TRISTRIP_REVERSE.  We need to do this as
423           * an 8-wide comparison so that the conditional MOV that follows
424           * moves all 8 words correctly.
425           */
426          brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
427                  get_element_ud(c->reg.temp, 0),
428                  brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
429 
430          /* If so, then overwrite destination_indices_uw with the appropriate
431           * reordering.
432           */
433          inst = brw_MOV(p, destination_indices_uw,
434                         brw_imm_v(key->pv_first ? 0x00010200    /* (0, 2, 1) */
435                                                 : 0x00020001)); /* (1, 0, 2) */
436          brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL);
437       }
438 
439       assert(c->reg.destination_indices.width == BRW_EXECUTE_4);
440       brw_push_insn_state(p);
441       brw_set_default_exec_size(p, BRW_EXECUTE_4);
442       brw_ADD(p, c->reg.destination_indices,
443               c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
444       brw_pop_insn_state(p);
445       /* For each vertex, generate code to output each varying using the
446        * appropriate binding table entry.
447        */
448       for (vertex = 0; vertex < num_verts; ++vertex) {
449          /* Set up the correct destination index for this vertex */
450          brw_MOV(p, get_element_ud(c->reg.header, 5),
451                  get_element_ud(c->reg.destination_indices, vertex));
452 
453          for (binding = 0; binding < key->num_transform_feedback_bindings;
454               ++binding) {
455             unsigned char varying =
456                key->transform_feedback_bindings[binding];
457             unsigned char slot = c->vue_map.varying_to_slot[varying];
458             /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
459              *
460              *   "Prior to End of Thread with a URB_WRITE, the kernel must
461              *   ensure that all writes are complete by sending the final
462              *   write as a committed write."
463              */
464             bool final_write =
465                binding == key->num_transform_feedback_bindings - 1 &&
466                vertex == num_verts - 1;
467             struct brw_reg vertex_slot = c->reg.vertex[vertex];
468             vertex_slot.nr += slot / 2;
469             vertex_slot.subnr = (slot % 2) * 16;
470             /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
471             vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
472                ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
473             brw_set_default_access_mode(p, BRW_ALIGN_16);
474             brw_push_insn_state(p);
475             brw_set_default_exec_size(p, BRW_EXECUTE_4);
476 
477             brw_MOV(p, stride(c->reg.header, 4, 4, 1),
478                     retype(vertex_slot, BRW_REGISTER_TYPE_UD));
479             brw_pop_insn_state(p);
480 
481             brw_set_default_access_mode(p, BRW_ALIGN_1);
482             brw_svb_write(p,
483                           final_write ? c->reg.temp : brw_null_reg(), /* dest */
484                           1, /* msg_reg_nr */
485                           c->reg.header, /* src0 */
486                           BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
487                           final_write); /* send_commit_msg */
488          }
489       }
490       brw_ENDIF(p);
491 
492       /* Now, reinitialize the header register from R0 to restore the parts of
493        * the register that we overwrote while streaming out transform feedback
494        * data.
495        */
496       brw_ff_gs_initialize_header(c);
497 
498       /* Finally, wait for the write commit to occur so that we can proceed to
499        * other things safely.
500        *
501        * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
502        *
503        *   The write commit does not modify the destination register, but
504        *   merely clears the dependency associated with the destination
505        *   register. Thus, a simple “mov” instruction using the register as a
506        *   source is sufficient to wait for the write commit to occur.
507        */
508       brw_MOV(p, c->reg.temp, c->reg.temp);
509    }
510 
511    brw_ff_gs_ff_sync(c, 1);
512 
513    brw_ff_gs_overwrite_header_dw2_from_r0(c);
514    switch (num_verts) {
515    case 1:
516       brw_ff_gs_offset_header_dw2(c,
517                                   URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
518       brw_ff_gs_emit_vue(c, c->reg.vertex[0], true);
519       break;
520    case 2:
521       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
522       brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
523       brw_ff_gs_offset_header_dw2(c,
524                                   URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
525       brw_ff_gs_emit_vue(c, c->reg.vertex[1], true);
526       break;
527    case 3:
528       if (check_edge_flags) {
529          /* Only emit vertices 0 and 1 if this is the first triangle of the
530           * polygon.  Otherwise they are redundant.
531           */
532          brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
533                  get_element_ud(c->reg.R0, 2),
534                  brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
535          brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
536          brw_IF(p, BRW_EXECUTE_1);
537       }
538       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
539       brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
540       brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
541       brw_ff_gs_emit_vue(c, c->reg.vertex[1], false);
542       if (check_edge_flags) {
543          brw_ENDIF(p);
544          /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
545           * of the polygon.  Otherwise leave the primitive incomplete because
546           * there are more polygon vertices coming.
547           */
548          brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
549                  get_element_ud(c->reg.R0, 2),
550                  brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
551          brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
552          brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
553       }
554       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
555       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
556       brw_ff_gs_emit_vue(c, c->reg.vertex[2], true);
557       break;
558    }
559 }
560 
561 const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler * compiler,void * mem_ctx,const struct brw_ff_gs_prog_key * key,struct brw_ff_gs_prog_data * prog_data,struct brw_vue_map * vue_map,unsigned * final_assembly_size)562 brw_compile_ff_gs_prog(struct brw_compiler *compiler,
563 		       void *mem_ctx,
564 		       const struct brw_ff_gs_prog_key *key,
565 		       struct brw_ff_gs_prog_data *prog_data,
566 		       struct brw_vue_map *vue_map,
567 		       unsigned *final_assembly_size)
568 {
569    struct brw_ff_gs_compile c;
570    const GLuint *program;
571 
572    memset(&c, 0, sizeof(c));
573 
574    c.key = *key;
575    c.vue_map = *vue_map;
576    c.nr_regs = (c.vue_map.num_slots + 1)/2;
577    c.prog_data = prog_data;
578 
579    mem_ctx = ralloc_context(NULL);
580 
581    /* Begin the compilation:
582     */
583    brw_init_codegen(&compiler->isa, &c.func, mem_ctx);
584 
585    c.func.single_program_flow = 1;
586 
587    /* For some reason the thread is spawned with only 4 channels
588     * unmasked.
589     */
590    brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
591 
592    if (compiler->devinfo->ver >= 6) {
593       unsigned num_verts;
594       bool check_edge_flag;
595       /* On Sandybridge, we use the GS for implementing transform feedback
596        * (called "Stream Out" in the PRM).
597        */
598       switch (key->primitive) {
599       case _3DPRIM_POINTLIST:
600          num_verts = 1;
601          check_edge_flag = false;
602          break;
603       case _3DPRIM_LINELIST:
604       case _3DPRIM_LINESTRIP:
605       case _3DPRIM_LINELOOP:
606          num_verts = 2;
607          check_edge_flag = false;
608          break;
609       case _3DPRIM_TRILIST:
610       case _3DPRIM_TRIFAN:
611       case _3DPRIM_TRISTRIP:
612       case _3DPRIM_RECTLIST:
613          num_verts = 3;
614          check_edge_flag = false;
615          break;
616       case _3DPRIM_QUADLIST:
617       case _3DPRIM_QUADSTRIP:
618       case _3DPRIM_POLYGON:
619          num_verts = 3;
620          check_edge_flag = true;
621          break;
622       default:
623          unreachable("Unexpected primitive type in Gen6 SOL program.");
624       }
625       gfx6_sol_program(&c, key, num_verts, check_edge_flag);
626    } else {
627       /* On Gen4-5, we use the GS to decompose certain types of primitives.
628        * Note that primitives which don't require a GS program have already
629        * been weeded out by now.
630        */
631       switch (key->primitive) {
632       case _3DPRIM_QUADLIST:
633          brw_ff_gs_quads( &c, key );
634          break;
635       case _3DPRIM_QUADSTRIP:
636          brw_ff_gs_quad_strip( &c, key );
637          break;
638       case _3DPRIM_LINELOOP:
639          brw_ff_gs_lines( &c );
640          break;
641       default:
642          return NULL;
643       }
644    }
645 
646    brw_compact_instructions(&c.func, 0, NULL);
647 
648    /* get the program
649     */
650    program = brw_get_program(&c.func, final_assembly_size);
651 
652    if (INTEL_DEBUG(DEBUG_GS)) {
653       fprintf(stderr, "gs:\n");
654       brw_disassemble_with_labels(&compiler->isa, c.func.store,
655                                   0, *final_assembly_size, stderr);
656       fprintf(stderr, "\n");
657     }
658 
659    return program;
660 }
661 
662