/*
 * Copyright © 2013 Intel Corporation
 * SPDX-License-Identifier: MIT
 */
5 
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_prim.h"
9 #include "brw_nir.h"
10 #include "brw_private.h"
11 #include "dev/intel_debug.h"
12 
13 static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
14    [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
15    [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
16    [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
17    [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
18    [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
19    [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
20    [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
21    [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
22    [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
23    [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
24    [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
25    [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
26    [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
27    [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
28 };
29 
30 extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler * compiler,struct brw_compile_gs_params * params)31 brw_compile_gs(const struct brw_compiler *compiler,
32                struct brw_compile_gs_params *params)
33 {
34    nir_shader *nir = params->base.nir;
35    const struct brw_gs_prog_key *key = params->key;
36    struct brw_gs_prog_data *prog_data = params->prog_data;
37 
38    struct brw_gs_compile c;
39    memset(&c, 0, sizeof(c));
40    c.key = *key;
41 
42    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
43 
44    prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
45    prog_data->base.base.ray_queries = nir->info.ray_queries;
46    prog_data->base.base.total_scratch = 0;
47 
48    /* The GLSL linker will have already matched up GS inputs and the outputs
49     * of prior stages.  The driver does extend VS outputs in some cases, but
50     * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
51     * geometry shader support.  So we can safely ignore that.
52     *
53     * For SSO pipelines, we use a fixed VUE map layout based on variable
54     * locations, so we can rely on rendezvous-by-location making this work.
55     */
56    GLbitfield64 inputs_read = nir->info.inputs_read;
57    brw_compute_vue_map(compiler->devinfo,
58                        &c.input_vue_map, inputs_read,
59                        nir->info.separate_shader, 1);
60 
61    brw_nir_apply_key(nir, compiler, &key->base, 8);
62    brw_nir_lower_vue_inputs(nir, &c.input_vue_map);
63    brw_nir_lower_vue_outputs(nir);
64    brw_postprocess_nir(nir, compiler, debug_enabled,
65                        key->base.robust_flags);
66 
67    prog_data->base.clip_distance_mask =
68       ((1 << nir->info.clip_distance_array_size) - 1);
69    prog_data->base.cull_distance_mask =
70       ((1 << nir->info.cull_distance_array_size) - 1) <<
71       nir->info.clip_distance_array_size;
72 
73    prog_data->include_primitive_id =
74       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
75 
76    prog_data->invocations = nir->info.gs.invocations;
77 
78    nir_gs_count_vertices_and_primitives(
79       nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);
80 
81    if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
82       /* When the output type is points, the geometry shader may output data
83        * to multiple streams, and EndPrimitive() has no effect.  So we
84        * configure the hardware to interpret the control data as stream ID.
85        */
86       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
87 
88       /* We only have to emit control bits if we are using non-zero streams */
89       if (nir->info.gs.active_stream_mask != (1 << 0))
90          c.control_data_bits_per_vertex = 2;
91       else
92          c.control_data_bits_per_vertex = 0;
93    } else {
94       /* When the output type is triangle_strip or line_strip, EndPrimitive()
95        * may be used to terminate the current strip and start a new one
96        * (similar to primitive restart), and outputting data to multiple
97        * streams is not supported.  So we configure the hardware to interpret
98        * the control data as EndPrimitive information (a.k.a. "cut bits").
99        */
100       prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
101 
102       /* We only need to output control data if the shader actually calls
103        * EndPrimitive().
104        */
105       c.control_data_bits_per_vertex =
106          nir->info.gs.uses_end_primitive ? 1 : 0;
107    }
108 
109    c.control_data_header_size_bits =
110       nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
111 
112    /* 1 HWORD = 32 bytes = 256 bits */
113    prog_data->control_data_header_size_hwords =
114       ALIGN(c.control_data_header_size_bits, 256) / 256;
115 
116    /* Compute the output vertex size.
117     *
118     * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
119     * Size (p168):
120     *
121     *     [0,62] indicating [1,63] 16B units
122     *
123     *     Specifies the size of each vertex stored in the GS output entry
124     *     (following any Control Header data) as a number of 128-bit units
125     *     (minus one).
126     *
127     *     Programming Restrictions: The vertex size must be programmed as a
128     *     multiple of 32B units with the following exception: Rendering is
129     *     disabled (as per SOL stage state) and the vertex size output by the
130     *     GS thread is 16B.
131     *
132     *     If rendering is enabled (as per SOL state) the vertex size must be
133     *     programmed as a multiple of 32B units. In other words, the only time
134     *     software can program a vertex size with an odd number of 16B units
135     *     is when rendering is disabled.
136     *
137     * Note: B=bytes in the above text.
138     *
139     * It doesn't seem worth the extra trouble to optimize the case where the
140     * vertex size is 16B (especially since this would require special-casing
141     * the GEN assembly that writes to the URB).  So we just set the vertex
142     * size to a multiple of 32B (2 vec4's) in all cases.
143     *
144     * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
145     * budget that as follows:
146     *
147     *   512 bytes for varyings (a varying component is 4 bytes and
148     *             gl_MaxGeometryOutputComponents = 128)
149     *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
150     *             bytes)
151     *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
152     *             even if it's not used)
153     *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
154     *             whenever clip planes are enabled, even if the shader doesn't
155     *             write to gl_ClipDistance)
156     *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
157     *             (see below)--this causes up to 1 VUE slot to be wasted
158     *   400 bytes available for varying packing overhead
159     *
160     * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
161     * per interpolation type, so this is plenty.
162     *
163     */
164    unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
165    assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
166    prog_data->output_vertex_size_hwords =
167       ALIGN(output_vertex_size_bytes, 32) / 32;
168 
169    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
170     * That divides up as follows:
171     *
172     *     64 bytes for the control data header (cut indices or StreamID bits)
173     *   4096 bytes for varyings (a varying component is 4 bytes and
174     *              gl_MaxGeometryTotalOutputComponents = 1024)
175     *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
176     *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
177     *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
178     *              even if it's not used)
179     *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
180     *              whenever clip planes are enabled, even if the shader doesn't
181     *              write to gl_ClipDistance)
182     *   4096 bytes overhead since the VUE size must be a multiple of 32
183     *              bytes (see above)--this causes up to 1 VUE slot to be wasted
184     *   8128 bytes available for varying packing overhead
185     *
186     * Worst-case varying packing overhead is 3/4 of a varying slot per
187     * interpolation type, which works out to 3072 bytes, so this would allow
188     * us to accommodate 2 interpolation types without any danger of running
189     * out of URB space.
190     *
191     * In practice, the risk of running out of URB space is very small, since
192     * the above figures are all worst-case, and most of them scale with the
193     * number of output vertices.  So we'll just calculate the amount of space
194     * we need, and if it's too large, fail to compile.
195     *
196     * The above is for gfx7+ where we have a single URB entry that will hold
197     * all the output.
198     */
199    unsigned output_size_bytes =
200       prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
201    output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
202 
203    /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
204     * which comes before the control header.
205     */
206    output_size_bytes += 32;
207 
208    /* Shaders can technically set max_vertices = 0, at which point we
209     * may have a URB size of 0 bytes.  Nothing good can come from that,
210     * so enforce a minimum size.
211     */
212    if (output_size_bytes == 0)
213       output_size_bytes = 1;
214 
215    unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
216    if (output_size_bytes > max_output_size_bytes)
217       return NULL;
218 
219 
220    /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
221    prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
222 
223    assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
224    prog_data->output_topology =
225       gl_prim_to_hw_prim[nir->info.gs.output_primitive];
226 
227    prog_data->vertices_in = nir->info.gs.vertices_in;
228 
229    /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
230     * need to program a URB read length of ceiling(num_slots / 2).
231     */
232    prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
233 
234    /* Now that prog_data setup is done, we are ready to actually compile the
235     * program.
236     */
237    if (unlikely(debug_enabled)) {
238       fprintf(stderr, "GS Input ");
239       brw_print_vue_map(stderr, &c.input_vue_map, MESA_SHADER_GEOMETRY);
240       fprintf(stderr, "GS Output ");
241       brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
242    }
243 
244    fs_visitor v(compiler, &params->base, &c, prog_data, nir,
245                 params->base.stats != NULL, debug_enabled);
246    if (v.run_gs()) {
247       prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
248 
249       assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
250       prog_data->base.base.dispatch_grf_start_reg =
251          v.payload().num_regs / reg_unit(compiler->devinfo);
252 
253       fs_generator g(compiler, &params->base,
254                      &prog_data->base.base, MESA_SHADER_GEOMETRY);
255       if (unlikely(debug_enabled)) {
256          const char *label =
257             nir->info.label ? nir->info.label : "unnamed";
258          char *name = ralloc_asprintf(params->base.mem_ctx,
259                                       "%s geometry shader %s",
260                                       label, nir->info.name);
261          g.enable_debug(name);
262       }
263       g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
264                       v.performance_analysis.require(), params->base.stats);
265       g.add_const_data(nir->constant_data, nir->constant_data_size);
266       return g.get_assembly();
267    }
268 
269    params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
270 
271    return NULL;
272 }
273 
274