• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 #include "brw_compiler.h"
33 #include "brw_eu.h"
34 
35 #include "dev/intel_debug.h"
36 
37 #define MAX_GS_VERTS (4)
38 
39 struct brw_ff_gs_compile {
40    struct brw_codegen func;
41    struct brw_ff_gs_prog_key key;
42    struct brw_ff_gs_prog_data *prog_data;
43 
44    struct {
45       struct brw_reg R0;
46 
47       /**
48        * Register holding streamed vertex buffer pointers -- see the Sandy
49        * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
50        * [DevSNB]).  These pointers are delivered in GRF 1.
51        */
52       struct brw_reg SVBI;
53 
54       struct brw_reg vertex[MAX_GS_VERTS];
55       struct brw_reg header;
56       struct brw_reg temp;
57 
58       /**
59        * Register holding destination indices for streamed buffer writes.
60        * Only used for SOL programs.
61        */
62       struct brw_reg destination_indices;
63    } reg;
64 
65    /* Number of registers used to store vertex data */
66    GLuint nr_regs;
67 
68    struct brw_vue_map vue_map;
69 };
70 
71 /**
72  * Allocate registers for GS.
73  *
74  * If sol_program is true, then:
75  *
76  * - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
77  *   1 needs to be set aside to hold the streamed vertex buffer indices.
78  *
79  * - The thread will need to use the destination_indices register.
80  */
brw_ff_gs_alloc_regs(struct brw_ff_gs_compile * c,GLuint nr_verts,bool sol_program)81 static void brw_ff_gs_alloc_regs(struct brw_ff_gs_compile *c,
82                                  GLuint nr_verts,
83                                  bool sol_program)
84 {
85    GLuint i = 0,j;
86 
87    /* Register usage is static, precompute here:
88     */
89    c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
90 
91    /* Streamed vertex buffer indices */
92    if (sol_program)
93       c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
94 
95    /* Payload vertices plus space for more generated vertices:
96     */
97    for (j = 0; j < nr_verts; j++) {
98       c->reg.vertex[j] = brw_vec4_grf(i, 0);
99       i += c->nr_regs;
100    }
101 
102    c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
103    c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
104 
105    if (sol_program) {
106       c->reg.destination_indices =
107          retype(brw_vec4_grf(i++, 0), BRW_REGISTER_TYPE_UD);
108    }
109 
110    c->prog_data->urb_read_length = c->nr_regs;
111    c->prog_data->total_grf = i;
112 }
113 
114 
115 /**
116  * Set up the initial value of c->reg.header register based on c->reg.R0.
117  *
118  * The following information is passed to the GS thread in R0, and needs to be
119  * included in the first URB_WRITE or FF_SYNC message sent by the GS:
120  *
121  * - DWORD 0 [31:0] handle info (Gen4 only)
122  * - DWORD 5 [7:0] FFTID
123  * - DWORD 6 [31:0] Debug info
124  * - DWORD 7 [31:0] Debug info
125  *
126  * This function sets up the above data by copying by copying the contents of
127  * R0 to the header register.
128  */
brw_ff_gs_initialize_header(struct brw_ff_gs_compile * c)129 static void brw_ff_gs_initialize_header(struct brw_ff_gs_compile *c)
130 {
131    struct brw_codegen *p = &c->func;
132    brw_MOV(p, c->reg.header, c->reg.R0);
133 }
134 
135 /**
136  * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
137  *
138  * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
139  * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
140  * need to be able to update on a per-vertex basis.
141  */
brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile * c,unsigned dw2)142 static void brw_ff_gs_overwrite_header_dw2(struct brw_ff_gs_compile *c,
143                                            unsigned dw2)
144 {
145    struct brw_codegen *p = &c->func;
146    brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
147 }
148 
149 /**
150  * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
151  *
152  * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
153  * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
154  * DWORD 2.  So this function extracts the primitive type field, bitshifts it
155  * appropriately, and stores it in c->reg.header.
156  */
brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile * c)157 static void brw_ff_gs_overwrite_header_dw2_from_r0(struct brw_ff_gs_compile *c)
158 {
159    struct brw_codegen *p = &c->func;
160    brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
161            brw_imm_ud(0x1f));
162    brw_SHL(p, get_element_ud(c->reg.header, 2),
163            get_element_ud(c->reg.header, 2), brw_imm_ud(2));
164 }
165 
166 /**
167  * Apply an additive offset to DWORD 2 of c->reg.header.
168  *
169  * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
170  * for each vertex.
171  */
brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile * c,int offset)172 static void brw_ff_gs_offset_header_dw2(struct brw_ff_gs_compile *c,
173                                         int offset)
174 {
175    struct brw_codegen *p = &c->func;
176    brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
177            brw_imm_d(offset));
178 }
179 
180 
181 /**
182  * Emit a vertex using the URB_WRITE message.  Use the contents of
183  * c->reg.header for the message header, and the registers starting at \c vert
184  * for the vertex data.
185  *
186  * If \c last is true, then this is the last vertex, so no further URB space
187  * should be allocated, and this message should end the thread.
188  *
189  * If \c last is false, then a new URB entry will be allocated, and its handle
190  * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
191  * message.
192  */
brw_ff_gs_emit_vue(struct brw_ff_gs_compile * c,struct brw_reg vert,bool last)193 static void brw_ff_gs_emit_vue(struct brw_ff_gs_compile *c,
194                                struct brw_reg vert,
195                                bool last)
196 {
197    struct brw_codegen *p = &c->func;
198    int write_offset = 0;
199    bool complete = false;
200 
201    do {
202       /* We can't write more than 14 registers at a time to the URB */
203       int write_len = MIN2(c->nr_regs - write_offset, 14);
204       if (write_len == c->nr_regs - write_offset)
205          complete = true;
206 
207       /* Copy the vertex from vertn into m1..mN+1:
208        */
209       brw_copy8(p, brw_message_reg(1), offset(vert, write_offset), write_len);
210 
211       /* Send the vertex data to the URB.  If this is the last write for this
212        * vertex, then we mark it as complete, and either end the thread or
213        * allocate another vertex URB entry (depending whether this is the last
214        * vertex).
215        */
216       enum brw_urb_write_flags flags;
217       if (!complete)
218          flags = BRW_URB_WRITE_NO_FLAGS;
219       else if (last)
220          flags = BRW_URB_WRITE_EOT_COMPLETE;
221       else
222          flags = BRW_URB_WRITE_ALLOCATE_COMPLETE;
223       brw_urb_WRITE(p,
224                     (flags & BRW_URB_WRITE_ALLOCATE) ? c->reg.temp
225                     : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
226                     0,
227                     c->reg.header,
228                     flags,
229                     write_len + 1, /* msg length */
230                     (flags & BRW_URB_WRITE_ALLOCATE) ? 1
231                     : 0, /* response length */
232                     write_offset,  /* urb offset */
233                     BRW_URB_SWIZZLE_NONE);
234       write_offset += write_len;
235    } while (!complete);
236 
237    if (!last) {
238       brw_MOV(p, get_element_ud(c->reg.header, 0),
239               get_element_ud(c->reg.temp, 0));
240    }
241 }
242 
243 /**
244  * Send an FF_SYNC message to ensure that all previously spawned GS threads
245  * have finished sending primitives down the pipeline, and to allocate a URB
246  * entry for the first output vertex.  Only needed on Ironlake+.
247  *
248  * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
249  * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
250  * the allocated URB entry (which will be needed by the URB_WRITE meesage that
251  * follows).
252  */
brw_ff_gs_ff_sync(struct brw_ff_gs_compile * c,int num_prim)253 static void brw_ff_gs_ff_sync(struct brw_ff_gs_compile *c, int num_prim)
254 {
255    struct brw_codegen *p = &c->func;
256 
257    brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
258    brw_ff_sync(p,
259                c->reg.temp,
260                0,
261                c->reg.header,
262                1, /* allocate */
263                1, /* response length */
264                0 /* eot */);
265    brw_MOV(p, get_element_ud(c->reg.header, 0),
266            get_element_ud(c->reg.temp, 0));
267 }
268 
269 
270 static void
brw_ff_gs_quads(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key)271 brw_ff_gs_quads(struct brw_ff_gs_compile *c,
272 		const struct brw_ff_gs_prog_key *key)
273 {
274    brw_ff_gs_alloc_regs(c, 4, false);
275    brw_ff_gs_initialize_header(c);
276    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
277     * is the PV for quads, but vertex 0 for polygons:
278     */
279    if (c->func.devinfo->ver == 5)
280       brw_ff_gs_ff_sync(c, 1);
281    brw_ff_gs_overwrite_header_dw2(
282       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
283           | URB_WRITE_PRIM_START));
284    if (key->pv_first) {
285       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
286       brw_ff_gs_overwrite_header_dw2(
287          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
288       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
289       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
290       brw_ff_gs_overwrite_header_dw2(
291          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
292              | URB_WRITE_PRIM_END));
293       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
294    }
295    else {
296       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
297       brw_ff_gs_overwrite_header_dw2(
298          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
299       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
300       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
301       brw_ff_gs_overwrite_header_dw2(
302          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
303              | URB_WRITE_PRIM_END));
304       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
305    }
306 }
307 
308 static void
brw_ff_gs_quad_strip(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key)309 brw_ff_gs_quad_strip(struct brw_ff_gs_compile *c,
310                      const struct brw_ff_gs_prog_key *key)
311 {
312    brw_ff_gs_alloc_regs(c, 4, false);
313    brw_ff_gs_initialize_header(c);
314 
315    if (c->func.devinfo->ver == 5)
316       brw_ff_gs_ff_sync(c, 1);
317    brw_ff_gs_overwrite_header_dw2(
318       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
319           | URB_WRITE_PRIM_START));
320    if (key->pv_first) {
321       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
322       brw_ff_gs_overwrite_header_dw2(
323          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
324       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
325       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
326       brw_ff_gs_overwrite_header_dw2(
327          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
328              | URB_WRITE_PRIM_END));
329       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
330    }
331    else {
332       brw_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
333       brw_ff_gs_overwrite_header_dw2(
334          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
335       brw_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
336       brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
337       brw_ff_gs_overwrite_header_dw2(
338          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
339              | URB_WRITE_PRIM_END));
340       brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
341    }
342 }
343 
brw_ff_gs_lines(struct brw_ff_gs_compile * c)344 static void brw_ff_gs_lines(struct brw_ff_gs_compile *c)
345 {
346    brw_ff_gs_alloc_regs(c, 2, false);
347    brw_ff_gs_initialize_header(c);
348 
349    if (c->func.devinfo->ver == 5)
350       brw_ff_gs_ff_sync(c, 1);
351    brw_ff_gs_overwrite_header_dw2(
352       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
353           | URB_WRITE_PRIM_START));
354    brw_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
355    brw_ff_gs_overwrite_header_dw2(
356       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
357           | URB_WRITE_PRIM_END));
358    brw_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
359 }
360 
361 /**
362  * Generate the geometry shader program used on Gen6 to perform stream output
363  * (transform feedback).
364  */
365 static void
gfx6_sol_program(struct brw_ff_gs_compile * c,const struct brw_ff_gs_prog_key * key,unsigned num_verts,bool check_edge_flags)366 gfx6_sol_program(struct brw_ff_gs_compile *c, const struct brw_ff_gs_prog_key *key,
367                  unsigned num_verts, bool check_edge_flags)
368 {
369    struct brw_codegen *p = &c->func;
370    brw_inst *inst;
371    c->prog_data->svbi_postincrement_value = num_verts;
372 
373    brw_ff_gs_alloc_regs(c, num_verts, true);
374    brw_ff_gs_initialize_header(c);
375 
376    if (key->num_transform_feedback_bindings > 0) {
377       unsigned vertex, binding;
378       struct brw_reg destination_indices_uw =
379          vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
380 
381       /* Note: since we use the binding table to keep track of buffer offsets
382        * and stride, the GS doesn't need to keep track of a separate pointer
383        * into each buffer; it uses a single pointer which increments by 1 for
384        * each vertex.  So we use SVBI0 for this pointer, regardless of whether
385        * transform feedback is in interleaved or separate attribs mode.
386        *
387        * Make sure that the buffers have enough room for all the vertices.
388        */
389       brw_ADD(p, get_element_ud(c->reg.temp, 0),
390                  get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
391       brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
392                  get_element_ud(c->reg.temp, 0),
393                  get_element_ud(c->reg.SVBI, 4));
394       brw_IF(p, BRW_EXECUTE_1);
395 
396       /* Compute the destination indices to write to.  Usually we use SVBI[0]
397        * + (0, 1, 2).  However, for odd-numbered triangles in tristrips, the
398        * vertices come down the pipeline in reversed winding order, so we need
399        * to flip the order when writing to the transform feedback buffer.  To
400        * ensure that flatshading accuracy is preserved, we need to write them
401        * in order SVBI[0] + (0, 2, 1) if we're using the first provoking
402        * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
403        * the last provoking vertex convention.
404        *
405        * Note: since brw_imm_v can only be used in instructions in
406        * packed-word execution mode, and SVBI is a double-word, we need to
407        * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
408        * or (1, 0, 2)) to the destination_indices register, and then add SVBI
409        * using a separate instruction.  Also, since the immediate constant is
410        * expressed as packed words, and we need to load double-words into
411        * destination_indices, we need to intersperse zeros to fill the upper
412        * halves of each double-word.
413        */
414       brw_MOV(p, destination_indices_uw,
415               brw_imm_v(0x00020100)); /* (0, 1, 2) */
416       if (num_verts == 3) {
417          /* Get primitive type into temp register. */
418          brw_AND(p, get_element_ud(c->reg.temp, 0),
419                  get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));
420 
421          /* Test if primitive type is TRISTRIP_REVERSE.  We need to do this as
422           * an 8-wide comparison so that the conditional MOV that follows
423           * moves all 8 words correctly.
424           */
425          brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
426                  get_element_ud(c->reg.temp, 0),
427                  brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
428 
429          /* If so, then overwrite destination_indices_uw with the appropriate
430           * reordering.
431           */
432          inst = brw_MOV(p, destination_indices_uw,
433                         brw_imm_v(key->pv_first ? 0x00010200    /* (0, 2, 1) */
434                                                 : 0x00020001)); /* (1, 0, 2) */
435          brw_inst_set_pred_control(p->devinfo, inst, BRW_PREDICATE_NORMAL);
436       }
437 
438       assert(c->reg.destination_indices.width == BRW_EXECUTE_4);
439       brw_push_insn_state(p);
440       brw_set_default_exec_size(p, BRW_EXECUTE_4);
441       brw_ADD(p, c->reg.destination_indices,
442               c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
443       brw_pop_insn_state(p);
444       /* For each vertex, generate code to output each varying using the
445        * appropriate binding table entry.
446        */
447       for (vertex = 0; vertex < num_verts; ++vertex) {
448          /* Set up the correct destination index for this vertex */
449          brw_MOV(p, get_element_ud(c->reg.header, 5),
450                  get_element_ud(c->reg.destination_indices, vertex));
451 
452          for (binding = 0; binding < key->num_transform_feedback_bindings;
453               ++binding) {
454             unsigned char varying =
455                key->transform_feedback_bindings[binding];
456             unsigned char slot = c->vue_map.varying_to_slot[varying];
457             /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
458              *
459              *   "Prior to End of Thread with a URB_WRITE, the kernel must
460              *   ensure that all writes are complete by sending the final
461              *   write as a committed write."
462              */
463             bool final_write =
464                binding == key->num_transform_feedback_bindings - 1 &&
465                vertex == num_verts - 1;
466             struct brw_reg vertex_slot = c->reg.vertex[vertex];
467             vertex_slot.nr += slot / 2;
468             vertex_slot.subnr = (slot % 2) * 16;
469             /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
470             vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
471                ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
472             brw_set_default_access_mode(p, BRW_ALIGN_16);
473             brw_push_insn_state(p);
474             brw_set_default_exec_size(p, BRW_EXECUTE_4);
475 
476             brw_MOV(p, stride(c->reg.header, 4, 4, 1),
477                     retype(vertex_slot, BRW_REGISTER_TYPE_UD));
478             brw_pop_insn_state(p);
479 
480             brw_set_default_access_mode(p, BRW_ALIGN_1);
481             brw_svb_write(p,
482                           final_write ? c->reg.temp : brw_null_reg(), /* dest */
483                           1, /* msg_reg_nr */
484                           c->reg.header, /* src0 */
485                           BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
486                           final_write); /* send_commit_msg */
487          }
488       }
489       brw_ENDIF(p);
490 
491       /* Now, reinitialize the header register from R0 to restore the parts of
492        * the register that we overwrote while streaming out transform feedback
493        * data.
494        */
495       brw_ff_gs_initialize_header(c);
496 
497       /* Finally, wait for the write commit to occur so that we can proceed to
498        * other things safely.
499        *
500        * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
501        *
502        *   The write commit does not modify the destination register, but
503        *   merely clears the dependency associated with the destination
504        *   register. Thus, a simple “mov” instruction using the register as a
505        *   source is sufficient to wait for the write commit to occur.
506        */
507       brw_MOV(p, c->reg.temp, c->reg.temp);
508    }
509 
510    brw_ff_gs_ff_sync(c, 1);
511 
512    brw_ff_gs_overwrite_header_dw2_from_r0(c);
513    switch (num_verts) {
514    case 1:
515       brw_ff_gs_offset_header_dw2(c,
516                                   URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
517       brw_ff_gs_emit_vue(c, c->reg.vertex[0], true);
518       break;
519    case 2:
520       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
521       brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
522       brw_ff_gs_offset_header_dw2(c,
523                                   URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
524       brw_ff_gs_emit_vue(c, c->reg.vertex[1], true);
525       break;
526    case 3:
527       if (check_edge_flags) {
528          /* Only emit vertices 0 and 1 if this is the first triangle of the
529           * polygon.  Otherwise they are redundant.
530           */
531          brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
532                  get_element_ud(c->reg.R0, 2),
533                  brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
534          brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
535          brw_IF(p, BRW_EXECUTE_1);
536       }
537       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
538       brw_ff_gs_emit_vue(c, c->reg.vertex[0], false);
539       brw_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
540       brw_ff_gs_emit_vue(c, c->reg.vertex[1], false);
541       if (check_edge_flags) {
542          brw_ENDIF(p);
543          /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
544           * of the polygon.  Otherwise leave the primitive incomplete because
545           * there are more polygon vertices coming.
546           */
547          brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
548                  get_element_ud(c->reg.R0, 2),
549                  brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
550          brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
551          brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
552       }
553       brw_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
554       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
555       brw_ff_gs_emit_vue(c, c->reg.vertex[2], true);
556       break;
557    }
558 }
559 
560 const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler * compiler,void * mem_ctx,const struct brw_ff_gs_prog_key * key,struct brw_ff_gs_prog_data * prog_data,struct brw_vue_map * vue_map,unsigned * final_assembly_size)561 brw_compile_ff_gs_prog(struct brw_compiler *compiler,
562 		       void *mem_ctx,
563 		       const struct brw_ff_gs_prog_key *key,
564 		       struct brw_ff_gs_prog_data *prog_data,
565 		       struct brw_vue_map *vue_map,
566 		       unsigned *final_assembly_size)
567 {
568    struct brw_ff_gs_compile c;
569    const GLuint *program;
570 
571    memset(&c, 0, sizeof(c));
572 
573    c.key = *key;
574    c.vue_map = *vue_map;
575    c.nr_regs = (c.vue_map.num_slots + 1)/2;
576    c.prog_data = prog_data;
577 
578    mem_ctx = ralloc_context(NULL);
579 
580    /* Begin the compilation:
581     */
582    brw_init_codegen(compiler->devinfo, &c.func, mem_ctx);
583 
584    c.func.single_program_flow = 1;
585 
586    /* For some reason the thread is spawned with only 4 channels
587     * unmasked.
588     */
589    brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);
590 
591    if (compiler->devinfo->ver >= 6) {
592       unsigned num_verts;
593       bool check_edge_flag;
594       /* On Sandybridge, we use the GS for implementing transform feedback
595        * (called "Stream Out" in the PRM).
596        */
597       switch (key->primitive) {
598       case _3DPRIM_POINTLIST:
599          num_verts = 1;
600          check_edge_flag = false;
601          break;
602       case _3DPRIM_LINELIST:
603       case _3DPRIM_LINESTRIP:
604       case _3DPRIM_LINELOOP:
605          num_verts = 2;
606          check_edge_flag = false;
607          break;
608       case _3DPRIM_TRILIST:
609       case _3DPRIM_TRIFAN:
610       case _3DPRIM_TRISTRIP:
611       case _3DPRIM_RECTLIST:
612          num_verts = 3;
613          check_edge_flag = false;
614          break;
615       case _3DPRIM_QUADLIST:
616       case _3DPRIM_QUADSTRIP:
617       case _3DPRIM_POLYGON:
618          num_verts = 3;
619          check_edge_flag = true;
620          break;
621       default:
622          unreachable("Unexpected primitive type in Gen6 SOL program.");
623       }
624       gfx6_sol_program(&c, key, num_verts, check_edge_flag);
625    } else {
626       /* On Gen4-5, we use the GS to decompose certain types of primitives.
627        * Note that primitives which don't require a GS program have already
628        * been weeded out by now.
629        */
630       switch (key->primitive) {
631       case _3DPRIM_QUADLIST:
632          brw_ff_gs_quads( &c, key );
633          break;
634       case _3DPRIM_QUADSTRIP:
635          brw_ff_gs_quad_strip( &c, key );
636          break;
637       case _3DPRIM_LINELOOP:
638          brw_ff_gs_lines( &c );
639          break;
640       default:
641          return NULL;
642       }
643    }
644 
645    brw_compact_instructions(&c.func, 0, NULL);
646 
647    /* get the program
648     */
649    program = brw_get_program(&c.func, final_assembly_size);
650 
651    if (INTEL_DEBUG(DEBUG_GS)) {
652       fprintf(stderr, "gs:\n");
653       brw_disassemble_with_labels(compiler->devinfo, c.func.store,
654                                   0, *final_assembly_size, stderr);
655       fprintf(stderr, "\n");
656     }
657 
658    return program;
659 }
660 
661