1 /*
2 * Copyright © 2013 Intel Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_builder.h"
9 #include "brw_generator.h"
10 #include "brw_prim.h"
11 #include "brw_nir.h"
12 #include "brw_private.h"
13 #include "dev/intel_debug.h"
14
15 using namespace brw;
16
17 static const GLuint gl_prim_to_hw_prim[MESA_PRIM_TRIANGLE_STRIP_ADJACENCY+1] = {
18 [MESA_PRIM_POINTS] =_3DPRIM_POINTLIST,
19 [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
20 [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
21 [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
22 [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
23 [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
24 [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
25 [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
26 [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
27 [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
28 [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
29 [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
30 [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
31 [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
32 };
33
34 static void
brw_emit_gs_thread_end(fs_visitor & s)35 brw_emit_gs_thread_end(fs_visitor &s)
36 {
37 assert(s.stage == MESA_SHADER_GEOMETRY);
38
39 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
40
41 if (s.gs_compile->control_data_header_size_bits > 0) {
42 s.emit_gs_control_data_bits(s.final_gs_vertex_count);
43 }
44
45 const brw_builder abld = brw_builder(&s).at_end().annotate("thread end");
46 fs_inst *inst;
47
48 if (gs_prog_data->static_vertex_count != -1) {
49 /* Try and tag the last URB write with EOT instead of emitting a whole
50 * separate write just to finish the thread.
51 */
52 if (s.mark_last_urb_write_with_eot())
53 return;
54
55 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
56 srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
57 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
58 inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
59 srcs, ARRAY_SIZE(srcs));
60 } else {
61 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
62 srcs[URB_LOGICAL_SRC_HANDLE] = s.gs_payload().urb_handles;
63 srcs[URB_LOGICAL_SRC_DATA] = s.final_gs_vertex_count;
64 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
65 inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
66 srcs, ARRAY_SIZE(srcs));
67 }
68 inst->eot = true;
69 inst->offset = 0;
70 }
71
72 static void
brw_assign_gs_urb_setup(fs_visitor & s)73 brw_assign_gs_urb_setup(fs_visitor &s)
74 {
75 assert(s.stage == MESA_SHADER_GEOMETRY);
76
77 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);
78
79 s.first_non_payload_grf +=
80 8 * vue_prog_data->urb_read_length * s.nir->info.gs.vertices_in;
81
82 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
83 /* Rewrite all ATTR file references to GRFs. */
84 s.convert_attr_sources_to_hw_regs(inst);
85 }
86 }
87
88 static bool
run_gs(fs_visitor & s)89 run_gs(fs_visitor &s)
90 {
91 assert(s.stage == MESA_SHADER_GEOMETRY);
92
93 s.payload_ = new gs_thread_payload(s);
94
95 const brw_builder bld = brw_builder(&s).at_end();
96
97 s.final_gs_vertex_count = bld.vgrf(BRW_TYPE_UD);
98
99 if (s.gs_compile->control_data_header_size_bits > 0) {
100 /* Create a VGRF to store accumulated control data bits. */
101 s.control_data_bits = bld.vgrf(BRW_TYPE_UD);
102
103 /* If we're outputting more than 32 control data bits, then EmitVertex()
104 * will set control_data_bits to 0 after emitting the first vertex.
105 * Otherwise, we need to initialize it to 0 here.
106 */
107 if (s.gs_compile->control_data_header_size_bits <= 32) {
108 const brw_builder abld = bld.annotate("initialize control data bits");
109 abld.MOV(s.control_data_bits, brw_imm_ud(0u));
110 }
111 }
112
113 nir_to_brw(&s);
114
115 brw_emit_gs_thread_end(s);
116
117 if (s.failed)
118 return false;
119
120 brw_calculate_cfg(s);
121
122 brw_optimize(s);
123
124 s.assign_curb_setup();
125 brw_assign_gs_urb_setup(s);
126
127 brw_lower_3src_null_dest(s);
128 brw_workaround_memory_fence_before_eot(s);
129 brw_workaround_emit_dummy_mov_instruction(s);
130
131 brw_allocate_registers(s, true /* allow_spilling */);
132
133 brw_workaround_source_arf_before_eot(s);
134
135 return !s.failed;
136 }
137
138 extern "C" const unsigned *
brw_compile_gs(const struct brw_compiler * compiler,struct brw_compile_gs_params * params)139 brw_compile_gs(const struct brw_compiler *compiler,
140 struct brw_compile_gs_params *params)
141 {
142 nir_shader *nir = params->base.nir;
143 const struct brw_gs_prog_key *key = params->key;
144 struct brw_gs_prog_data *prog_data = params->prog_data;
145 const unsigned dispatch_width = brw_geometry_stage_dispatch_width(compiler->devinfo);
146
147 struct brw_gs_compile c;
148 memset(&c, 0, sizeof(c));
149 c.key = *key;
150
151 const bool debug_enabled = brw_should_print_shader(nir, DEBUG_GS);
152
153 prog_data->base.base.stage = MESA_SHADER_GEOMETRY;
154 prog_data->base.base.ray_queries = nir->info.ray_queries;
155 prog_data->base.base.total_scratch = 0;
156
157 /* The GLSL linker will have already matched up GS inputs and the outputs
158 * of prior stages. The driver does extend VS outputs in some cases, but
159 * only for legacy OpenGL or Gfx4-5 hardware, neither of which offer
160 * geometry shader support. So we can safely ignore that.
161 *
162 * For SSO pipelines, we use a fixed VUE map layout based on variable
163 * locations, so we can rely on rendezvous-by-location making this work.
164 */
165 GLbitfield64 inputs_read = nir->info.inputs_read;
166 brw_compute_vue_map(compiler->devinfo,
167 &c.input_vue_map, inputs_read,
168 nir->info.separate_shader, 1);
169
170 brw_nir_apply_key(nir, compiler, &key->base, dispatch_width);
171 brw_nir_lower_vue_inputs(nir, &c.input_vue_map);
172 brw_nir_lower_vue_outputs(nir);
173 brw_postprocess_nir(nir, compiler, debug_enabled,
174 key->base.robust_flags);
175
176 prog_data->base.clip_distance_mask =
177 ((1 << nir->info.clip_distance_array_size) - 1);
178 prog_data->base.cull_distance_mask =
179 ((1 << nir->info.cull_distance_array_size) - 1) <<
180 nir->info.clip_distance_array_size;
181
182 prog_data->include_primitive_id =
183 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
184
185 prog_data->invocations = nir->info.gs.invocations;
186
187 nir_gs_count_vertices_and_primitives(
188 nir, &prog_data->static_vertex_count, nullptr, nullptr, 1u);
189
190 if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
191 /* When the output type is points, the geometry shader may output data
192 * to multiple streams, and EndPrimitive() has no effect. So we
193 * configure the hardware to interpret the control data as stream ID.
194 */
195 prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
196
197 /* We only have to emit control bits if we are using non-zero streams */
198 if (nir->info.gs.active_stream_mask != (1 << 0))
199 c.control_data_bits_per_vertex = 2;
200 else
201 c.control_data_bits_per_vertex = 0;
202 } else {
203 /* When the output type is triangle_strip or line_strip, EndPrimitive()
204 * may be used to terminate the current strip and start a new one
205 * (similar to primitive restart), and outputting data to multiple
206 * streams is not supported. So we configure the hardware to interpret
207 * the control data as EndPrimitive information (a.k.a. "cut bits").
208 */
209 prog_data->control_data_format = GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
210
211 /* We only need to output control data if the shader actually calls
212 * EndPrimitive().
213 */
214 c.control_data_bits_per_vertex =
215 nir->info.gs.uses_end_primitive ? 1 : 0;
216 }
217
218 c.control_data_header_size_bits =
219 nir->info.gs.vertices_out * c.control_data_bits_per_vertex;
220
221 /* 1 HWORD = 32 bytes = 256 bits */
222 prog_data->control_data_header_size_hwords =
223 ALIGN(c.control_data_header_size_bits, 256) / 256;
224
225 /* Compute the output vertex size.
226 *
227 * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
228 * Size (p168):
229 *
230 * [0,62] indicating [1,63] 16B units
231 *
232 * Specifies the size of each vertex stored in the GS output entry
233 * (following any Control Header data) as a number of 128-bit units
234 * (minus one).
235 *
236 * Programming Restrictions: The vertex size must be programmed as a
237 * multiple of 32B units with the following exception: Rendering is
238 * disabled (as per SOL stage state) and the vertex size output by the
239 * GS thread is 16B.
240 *
241 * If rendering is enabled (as per SOL state) the vertex size must be
242 * programmed as a multiple of 32B units. In other words, the only time
243 * software can program a vertex size with an odd number of 16B units
244 * is when rendering is disabled.
245 *
246 * Note: B=bytes in the above text.
247 *
248 * It doesn't seem worth the extra trouble to optimize the case where the
249 * vertex size is 16B (especially since this would require special-casing
250 * the GEN assembly that writes to the URB). So we just set the vertex
251 * size to a multiple of 32B (2 vec4's) in all cases.
252 *
253 * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
254 * budget that as follows:
255 *
256 * 512 bytes for varyings (a varying component is 4 bytes and
257 * gl_MaxGeometryOutputComponents = 128)
258 * 16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
259 * bytes)
260 * 16 bytes overhead for gl_Position (we allocate it a slot in the VUE
261 * even if it's not used)
262 * 32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
263 * whenever clip planes are enabled, even if the shader doesn't
264 * write to gl_ClipDistance)
265 * 16 bytes overhead since the VUE size must be a multiple of 32 bytes
266 * (see below)--this causes up to 1 VUE slot to be wasted
267 * 400 bytes available for varying packing overhead
268 *
269 * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
270 * per interpolation type, so this is plenty.
271 *
272 */
273 unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
274 assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
275 prog_data->output_vertex_size_hwords =
276 ALIGN(output_vertex_size_bytes, 32) / 32;
277
278 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
279 * That divides up as follows:
280 *
281 * 64 bytes for the control data header (cut indices or StreamID bits)
282 * 4096 bytes for varyings (a varying component is 4 bytes and
283 * gl_MaxGeometryTotalOutputComponents = 1024)
284 * 4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
285 * bytes/vertex and gl_MaxGeometryOutputVertices is 256)
286 * 4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
287 * even if it's not used)
288 * 8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
289 * whenever clip planes are enabled, even if the shader doesn't
290 * write to gl_ClipDistance)
291 * 4096 bytes overhead since the VUE size must be a multiple of 32
292 * bytes (see above)--this causes up to 1 VUE slot to be wasted
293 * 8128 bytes available for varying packing overhead
294 *
295 * Worst-case varying packing overhead is 3/4 of a varying slot per
296 * interpolation type, which works out to 3072 bytes, so this would allow
297 * us to accommodate 2 interpolation types without any danger of running
298 * out of URB space.
299 *
300 * In practice, the risk of running out of URB space is very small, since
301 * the above figures are all worst-case, and most of them scale with the
302 * number of output vertices. So we'll just calculate the amount of space
303 * we need, and if it's too large, fail to compile.
304 *
305 * The above is for gfx7+ where we have a single URB entry that will hold
306 * all the output.
307 */
308 unsigned output_size_bytes =
309 prog_data->output_vertex_size_hwords * 32 * nir->info.gs.vertices_out;
310 output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
311
312 /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
313 * which comes before the control header.
314 */
315 output_size_bytes += 32;
316
317 /* Shaders can technically set max_vertices = 0, at which point we
318 * may have a URB size of 0 bytes. Nothing good can come from that,
319 * so enforce a minimum size.
320 */
321 if (output_size_bytes == 0)
322 output_size_bytes = 1;
323
324 unsigned max_output_size_bytes = GFX7_MAX_GS_URB_ENTRY_SIZE_BYTES;
325 if (output_size_bytes > max_output_size_bytes)
326 return NULL;
327
328
329 /* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
330 prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
331
332 assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
333 prog_data->output_topology =
334 gl_prim_to_hw_prim[nir->info.gs.output_primitive];
335
336 prog_data->vertices_in = nir->info.gs.vertices_in;
337
338 /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
339 * need to program a URB read length of ceiling(num_slots / 2).
340 */
341 prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
342
343 /* Now that prog_data setup is done, we are ready to actually compile the
344 * program.
345 */
346 if (unlikely(debug_enabled)) {
347 fprintf(stderr, "GS Input ");
348 brw_print_vue_map(stderr, &c.input_vue_map, MESA_SHADER_GEOMETRY);
349 fprintf(stderr, "GS Output ");
350 brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_GEOMETRY);
351 }
352
353 fs_visitor v(compiler, ¶ms->base, &c, prog_data, nir,
354 params->base.stats != NULL, debug_enabled);
355 if (run_gs(v)) {
356 prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
357
358 assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
359 prog_data->base.base.dispatch_grf_start_reg =
360 v.payload().num_regs / reg_unit(compiler->devinfo);
361 prog_data->base.base.grf_used = v.grf_used;
362
363 brw_generator g(compiler, ¶ms->base,
364 &prog_data->base.base, MESA_SHADER_GEOMETRY);
365 if (unlikely(debug_enabled)) {
366 const char *label =
367 nir->info.label ? nir->info.label : "unnamed";
368 char *name = ralloc_asprintf(params->base.mem_ctx,
369 "%s geometry shader %s",
370 label, nir->info.name);
371 g.enable_debug(name);
372 }
373 g.generate_code(v.cfg, v.dispatch_width, v.shader_stats,
374 v.performance_analysis.require(), params->base.stats);
375 g.add_const_data(nir->constant_data, nir->constant_data_size);
376 return g.get_assembly();
377 }
378
379 params->base.error_str = ralloc_strdup(params->base.mem_ctx, v.fail_msg);
380
381 return NULL;
382 }
383
384