1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
/**
 * \file elk_vec4_tcs.cpp
 *
 * Tessellation control shader specific code derived from the vec4_visitor class.
 */
29
30 #include "../intel_nir.h"
31 #include "elk_nir.h"
32 #include "elk_vec4_tcs.h"
33 #include "elk_fs.h"
34 #include "elk_private.h"
35 #include "dev/intel_debug.h"
36
37 namespace elk {
38
vec4_tcs_visitor(const struct elk_compiler * compiler,const struct elk_compile_params * params,const struct elk_tcs_prog_key * key,struct elk_tcs_prog_data * prog_data,const nir_shader * nir,bool debug_enabled)39 vec4_tcs_visitor::vec4_tcs_visitor(const struct elk_compiler *compiler,
40 const struct elk_compile_params *params,
41 const struct elk_tcs_prog_key *key,
42 struct elk_tcs_prog_data *prog_data,
43 const nir_shader *nir,
44 bool debug_enabled)
45 : vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
46 nir, false, debug_enabled),
47 key(key)
48 {
49 }
50
51
52 void
setup_payload()53 vec4_tcs_visitor::setup_payload()
54 {
55 int reg = 0;
56
57 /* The payload always contains important data in r0, which contains
58 * the URB handles that are passed on to the URB write at the end
59 * of the thread.
60 */
61 reg++;
62
63 /* r1.0 - r4.7 may contain the input control point URB handles,
64 * which we use to pull vertex data.
65 */
66 reg += 4;
67
68 /* Push constants may start at r5.0 */
69 reg = setup_uniforms(reg);
70
71 this->first_non_payload_grf = reg;
72 }
73
74
75 void
emit_prolog()76 vec4_tcs_visitor::emit_prolog()
77 {
78 invocation_id = src_reg(this, glsl_uint_type());
79 emit(ELK_TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
80
81 /* HS threads are dispatched with the dispatch mask set to 0xFF.
82 * If there are an odd number of output vertices, then the final
83 * HS instance dispatched will only have its bottom half doing real
84 * work, and so we need to disable the upper half:
85 */
86 if (nir->info.tess.tcs_vertices_out % 2) {
87 emit(CMP(dst_null_d(), invocation_id,
88 elk_imm_ud(nir->info.tess.tcs_vertices_out),
89 ELK_CONDITIONAL_L));
90
91 /* Matching ENDIF is in emit_thread_end() */
92 emit(IF(ELK_PREDICATE_NORMAL));
93 }
94 }
95
96
97 void
emit_thread_end()98 vec4_tcs_visitor::emit_thread_end()
99 {
100 vec4_instruction *inst;
101 current_annotation = "thread end";
102
103 if (nir->info.tess.tcs_vertices_out % 2) {
104 emit(ELK_OPCODE_ENDIF);
105 }
106
107 if (devinfo->ver == 7) {
108 struct elk_tcs_prog_data *tcs_prog_data =
109 (struct elk_tcs_prog_data *) prog_data;
110
111 current_annotation = "release input vertices";
112
113 /* Synchronize all threads, so we know that no one is still
114 * using the input URB handles.
115 */
116 if (tcs_prog_data->instances > 1) {
117 dst_reg header = dst_reg(this, glsl_uvec4_type());
118 emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
119 emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
120 }
121
122 /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
123 * We want to compare the bottom half of invocation_id with 0, but
124 * use that truth value for the top half as well. Unfortunately,
125 * we don't have stride in the vec4 world, nor UV immediates in
126 * align16, so we need an opcode to get invocation_id<0,4,0>.
127 */
128 set_condmod(ELK_CONDITIONAL_Z,
129 emit(ELK_TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
130 invocation_id));
131 emit(IF(ELK_PREDICATE_NORMAL));
132 for (unsigned i = 0; i < key->input_vertices; i += 2) {
133 /* If we have an odd number of input vertices, the last will be
134 * unpaired. We don't want to use an interleaved URB write in
135 * that case.
136 */
137 const bool is_unpaired = i == key->input_vertices - 1;
138
139 dst_reg header(this, glsl_uvec4_type());
140 emit(ELK_TCS_OPCODE_RELEASE_INPUT, header, elk_imm_ud(i),
141 elk_imm_ud(is_unpaired));
142 }
143 emit(ELK_OPCODE_ENDIF);
144 }
145
146 inst = emit(ELK_TCS_OPCODE_THREAD_END);
147 inst->base_mrf = 14;
148 inst->mlen = 2;
149 }
150
151
152 void
emit_input_urb_read(const dst_reg & dst,const src_reg & vertex_index,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)153 vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
154 const src_reg &vertex_index,
155 unsigned base_offset,
156 unsigned first_component,
157 const src_reg &indirect_offset)
158 {
159 vec4_instruction *inst;
160 dst_reg temp(this, glsl_ivec4_type());
161 temp.type = dst.type;
162
163 /* Set up the message header to reference the proper parts of the URB */
164 dst_reg header = dst_reg(this, glsl_uvec4_type());
165 inst = emit(ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
166 indirect_offset);
167 inst->force_writemask_all = true;
168
169 /* Read into a temporary, ignoring writemasking. */
170 inst = emit(ELK_VEC4_OPCODE_URB_READ, temp, src_reg(header));
171 inst->offset = base_offset;
172 inst->mlen = 1;
173 inst->base_mrf = -1;
174
175 /* Copy the temporary to the destination to deal with writemasking.
176 *
177 * Also attempt to deal with gl_PointSize being in the .w component.
178 */
179 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
180 emit(MOV(dst, swizzle(src_reg(temp), ELK_SWIZZLE_WWWW)));
181 } else {
182 src_reg src = src_reg(temp);
183 src.swizzle = ELK_SWZ_COMP_INPUT(first_component);
184 emit(MOV(dst, src));
185 }
186 }
187
188 void
emit_output_urb_read(const dst_reg & dst,unsigned base_offset,unsigned first_component,const src_reg & indirect_offset)189 vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
190 unsigned base_offset,
191 unsigned first_component,
192 const src_reg &indirect_offset)
193 {
194 vec4_instruction *inst;
195
196 /* Set up the message header to reference the proper parts of the URB */
197 dst_reg header = dst_reg(this, glsl_uvec4_type());
198 inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
199 elk_imm_ud(dst.writemask << first_component), indirect_offset);
200 inst->force_writemask_all = true;
201
202 vec4_instruction *read = emit(ELK_VEC4_OPCODE_URB_READ, dst, src_reg(header));
203 read->offset = base_offset;
204 read->mlen = 1;
205 read->base_mrf = -1;
206
207 if (first_component) {
208 /* Read into a temporary and copy with a swizzle and writemask. */
209 read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type);
210 emit(MOV(dst, swizzle(src_reg(read->dst),
211 ELK_SWZ_COMP_INPUT(first_component))));
212 }
213 }
214
215 void
emit_urb_write(const src_reg & value,unsigned writemask,unsigned base_offset,const src_reg & indirect_offset)216 vec4_tcs_visitor::emit_urb_write(const src_reg &value,
217 unsigned writemask,
218 unsigned base_offset,
219 const src_reg &indirect_offset)
220 {
221 if (writemask == 0)
222 return;
223
224 src_reg message(this, glsl_uvec4_type(), 2);
225 vec4_instruction *inst;
226
227 inst = emit(ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
228 elk_imm_ud(writemask), indirect_offset);
229 inst->force_writemask_all = true;
230 inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
231 value));
232 inst->force_writemask_all = true;
233
234 inst = emit(ELK_VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message);
235 inst->offset = base_offset;
236 inst->mlen = 2;
237 inst->base_mrf = -1;
238 }
239
240 void
nir_emit_intrinsic(nir_intrinsic_instr * instr)241 vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
242 {
243 switch (instr->intrinsic) {
244 case nir_intrinsic_load_invocation_id:
245 emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_UD),
246 invocation_id));
247 break;
248 case nir_intrinsic_load_primitive_id:
249 emit(ELK_TCS_OPCODE_GET_PRIMITIVE_ID,
250 get_nir_def(instr->def, ELK_REGISTER_TYPE_UD));
251 break;
252 case nir_intrinsic_load_patch_vertices_in:
253 emit(MOV(get_nir_def(instr->def, ELK_REGISTER_TYPE_D),
254 elk_imm_d(key->input_vertices)));
255 break;
256 case nir_intrinsic_load_per_vertex_input: {
257 assert(instr->def.bit_size == 32);
258 src_reg indirect_offset = get_indirect_offset(instr);
259 unsigned imm_offset = nir_intrinsic_base(instr);
260
261 src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
262 ELK_REGISTER_TYPE_UD);
263
264 unsigned first_component = nir_intrinsic_component(instr);
265 dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
266 dst.writemask = elk_writemask_for_size(instr->num_components);
267 emit_input_urb_read(dst, vertex_index, imm_offset,
268 first_component, indirect_offset);
269 break;
270 }
271 case nir_intrinsic_load_input:
272 unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
273 break;
274 case nir_intrinsic_load_output:
275 case nir_intrinsic_load_per_vertex_output: {
276 src_reg indirect_offset = get_indirect_offset(instr);
277 unsigned imm_offset = nir_intrinsic_base(instr);
278
279 dst_reg dst = get_nir_def(instr->def, ELK_REGISTER_TYPE_D);
280 dst.writemask = elk_writemask_for_size(instr->num_components);
281
282 emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
283 indirect_offset);
284 break;
285 }
286 case nir_intrinsic_store_output:
287 case nir_intrinsic_store_per_vertex_output: {
288 assert(nir_src_bit_size(instr->src[0]) == 32);
289 src_reg value = get_nir_src(instr->src[0]);
290 unsigned mask = nir_intrinsic_write_mask(instr);
291 unsigned swiz = ELK_SWIZZLE_XYZW;
292
293 src_reg indirect_offset = get_indirect_offset(instr);
294 unsigned imm_offset = nir_intrinsic_base(instr);
295
296 unsigned first_component = nir_intrinsic_component(instr);
297 if (first_component) {
298 assert(swiz == ELK_SWIZZLE_XYZW);
299 swiz = ELK_SWZ_COMP_OUTPUT(first_component);
300 mask = mask << first_component;
301 }
302
303 emit_urb_write(swizzle(value, swiz), mask,
304 imm_offset, indirect_offset);
305 break;
306 }
307
308 case nir_intrinsic_barrier:
309 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
310 vec4_visitor::nir_emit_intrinsic(instr);
311 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
312 dst_reg header = dst_reg(this, glsl_uvec4_type());
313 emit(ELK_TCS_OPCODE_CREATE_BARRIER_HEADER, header);
314 emit(ELK_SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
315 }
316 break;
317
318 default:
319 vec4_visitor::nir_emit_intrinsic(instr);
320 }
321 }
322
/**
 * Return how many patches to accumulate before a MULTI_PATCH mode thread is
 * launched.
 *
 * With many input control points and a large amount of VS outputs, the VS
 * URB space needed for a full 8 patches of data can be prohibitive, so
 * launching threads early can be beneficial.
 *
 * See the 3DSTATE_HS::Patch Count Threshold documentation for the
 * recommended values.  A return value of 0 "disables" early dispatch, i.e.
 * the hardware waits for a full 8 patches as normal.
 *
 * \param input_control_points  number of input control points per patch
 */
static int
get_patch_count_threshold(int input_control_points)
{
   /* Ascending tiers: the largest control point count each tier covers and
    * the threshold to use for it.
    */
   static const struct {
      int max_control_points;
      int threshold;
   } tiers[] = {
      {  4, 0 },
      {  6, 5 },
      {  8, 4 },
      { 10, 3 },
      { 14, 2 },
   };

   for (unsigned t = 0; t < sizeof(tiers) / sizeof(tiers[0]); t++) {
      if (input_control_points <= tiers[t].max_control_points)
         return tiers[t].threshold;
   }

   /* PATCHLIST_15 - PATCHLIST_32 use a patch count of 1. */
   return 1;
}
351
352 } /* namespace elk */
353
354 extern "C" const unsigned *
elk_compile_tcs(const struct elk_compiler * compiler,struct elk_compile_tcs_params * params)355 elk_compile_tcs(const struct elk_compiler *compiler,
356 struct elk_compile_tcs_params *params)
357 {
358 const struct intel_device_info *devinfo = compiler->devinfo;
359 nir_shader *nir = params->base.nir;
360 const struct elk_tcs_prog_key *key = params->key;
361 struct elk_tcs_prog_data *prog_data = params->prog_data;
362 struct elk_vue_prog_data *vue_prog_data = &prog_data->base;
363
364 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
365 const bool debug_enabled = elk_should_print_shader(nir, DEBUG_TCS);
366 const unsigned *assembly;
367
368 vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
369 prog_data->base.base.ray_queries = nir->info.ray_queries;
370 prog_data->base.base.total_scratch = 0;
371
372 nir->info.outputs_written = key->outputs_written;
373 nir->info.patch_outputs_written = key->patch_outputs_written;
374
375 struct intel_vue_map input_vue_map;
376 elk_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
377 nir->info.separate_shader, 1);
378 elk_compute_tess_vue_map(&vue_prog_data->vue_map,
379 nir->info.outputs_written,
380 nir->info.patch_outputs_written);
381
382 elk_nir_apply_key(nir, compiler, &key->base, 8);
383 elk_nir_lower_vue_inputs(nir, &input_vue_map);
384 elk_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
385 key->_tes_primitive_mode);
386 if (key->quads_workaround)
387 intel_nir_apply_tcs_quads_workaround(nir);
388 if (key->input_vertices > 0)
389 intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
390
391 elk_postprocess_nir(nir, compiler, debug_enabled,
392 key->base.robust_flags);
393
394 bool has_primitive_id =
395 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
396
397 prog_data->patch_count_threshold = elk::get_patch_count_threshold(key->input_vertices);
398
399 if (compiler->use_tcs_multi_patch) {
400 vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
401 prog_data->instances = nir->info.tess.tcs_vertices_out;
402 prog_data->include_primitive_id = has_primitive_id;
403 } else {
404 unsigned verts_per_thread = is_scalar ? 8 : 2;
405 vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
406 prog_data->instances =
407 DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
408 }
409
410 /* Compute URB entry size. The maximum allowed URB entry size is 32k.
411 * That divides up as follows:
412 *
413 * 32 bytes for the patch header (tessellation factors)
414 * 480 bytes for per-patch varyings (a varying component is 4 bytes and
415 * gl_MaxTessPatchComponents = 120)
416 * 16384 bytes for per-vertex varyings (a varying component is 4 bytes,
417 * gl_MaxPatchVertices = 32 and
418 * gl_MaxTessControlOutputComponents = 128)
419 *
420 * 15808 bytes left for varying packing overhead
421 */
422 const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
423 const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
424 unsigned output_size_bytes = 0;
425 /* Note that the patch header is counted in num_per_patch_slots. */
426 output_size_bytes += num_per_patch_slots * 16;
427 output_size_bytes += nir->info.tess.tcs_vertices_out *
428 num_per_vertex_slots * 16;
429
430 assert(output_size_bytes >= 1);
431 if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
432 return NULL;
433
434 /* URB entry sizes are stored as a multiple of 64 bytes. */
435 vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
436
437 /* HS does not use the usual payload pushing from URB to GRFs,
438 * because we don't have enough registers for a full-size payload, and
439 * the hardware is broken on Haswell anyway.
440 */
441 vue_prog_data->urb_read_length = 0;
442
443 if (unlikely(debug_enabled)) {
444 fprintf(stderr, "TCS Input ");
445 elk_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
446 fprintf(stderr, "TCS Output ");
447 elk_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
448 }
449
450 if (is_scalar) {
451 const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
452 elk_fs_visitor v(compiler, ¶ms->base, &key->base,
453 &prog_data->base.base, nir, dispatch_width,
454 params->base.stats != NULL, debug_enabled);
455 if (!v.run_tcs()) {
456 params->base.error_str =
457 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
458 return NULL;
459 }
460
461 assert(v.payload().num_regs % reg_unit(devinfo) == 0);
462 prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
463
464 elk_fs_generator g(compiler, ¶ms->base,
465 &prog_data->base.base, false, MESA_SHADER_TESS_CTRL);
466 if (unlikely(debug_enabled)) {
467 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
468 "%s tessellation control shader %s",
469 nir->info.label ? nir->info.label
470 : "unnamed",
471 nir->info.name));
472 }
473
474 g.generate_code(v.cfg, dispatch_width, v.shader_stats,
475 v.performance_analysis.require(), params->base.stats);
476
477 g.add_const_data(nir->constant_data, nir->constant_data_size);
478
479 assembly = g.get_assembly();
480 } else {
481 elk::vec4_tcs_visitor v(compiler, ¶ms->base, key, prog_data,
482 nir, debug_enabled);
483 if (!v.run()) {
484 params->base.error_str =
485 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
486 return NULL;
487 }
488
489 if (INTEL_DEBUG(DEBUG_TCS))
490 v.dump_instructions();
491
492
493 assembly = elk_vec4_generate_assembly(compiler, ¶ms->base, nir,
494 &prog_data->base, v.cfg,
495 v.performance_analysis.require(),
496 debug_enabled);
497 }
498
499 return assembly;
500 }
501