/*
 * Copyright © 2013 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "intel_nir.h"
#include "brw_nir.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_generator.h"
#include "brw_private.h"
#include "dev/intel_debug.h"

using namespace brw;
17 /**
18  * Return the number of patches to accumulate before a MULTI_PATCH mode thread is
19  * launched.  In cases with a large number of input control points and a large
20  * amount of VS outputs, the VS URB space needed to store an entire 8 patches
21  * worth of data can be prohibitive, so it can be beneficial to launch threads
22  * early.
23  *
24  * See the 3DSTATE_HS::Patch Count Threshold documentation for the recommended
25  * values.  Note that 0 means to "disable" early dispatch, meaning to wait for
26  * a full 8 patches as normal.
27  */
28 static int
get_patch_count_threshold(int input_control_points)29 get_patch_count_threshold(int input_control_points)
30 {
31    if (input_control_points <= 4)
32       return 0;
33    else if (input_control_points <= 6)
34       return 5;
35    else if (input_control_points <= 8)
36       return 4;
37    else if (input_control_points <= 10)
38       return 3;
39    else if (input_control_points <= 14)
40       return 2;
41 
42    /* Return patch count 1 for PATCHLIST_15 - PATCHLIST_32 */
43    return 1;
44 }
45 
46 static void
brw_set_tcs_invocation_id(fs_visitor & s)47 brw_set_tcs_invocation_id(fs_visitor &s)
48 {
49    const struct intel_device_info *devinfo = s.devinfo;
50    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
51    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
52    const fs_builder bld = fs_builder(&s).at_end();
53 
54    const unsigned instance_id_mask =
55       (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
56       (devinfo->ver >= 11)     ? INTEL_MASK(22, 16) :
57                                  INTEL_MASK(23, 17);
58    const unsigned instance_id_shift =
59       (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
60 
61    /* Get instance number from g0.2 bits:
62     *  * 7:0 on DG2+
63     *  * 22:16 on gfx11+
64     *  * 23:17 otherwise
65     */
66    brw_reg t =
67       bld.AND(brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD)),
68               brw_imm_ud(instance_id_mask));
69 
70    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
71       /* gl_InvocationID is just the thread number */
72       s.invocation_id = bld.SHR(t, brw_imm_ud(instance_id_shift));
73       return;
74    }
75 
76    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
77 
78    brw_reg channels_uw = bld.vgrf(BRW_TYPE_UW);
79    brw_reg channels_ud = bld.vgrf(BRW_TYPE_UD);
80    bld.MOV(channels_uw, brw_reg(brw_imm_uv(0x76543210)));
81    bld.MOV(channels_ud, channels_uw);
82 
83    if (tcs_prog_data->instances == 1) {
84       s.invocation_id = channels_ud;
85    } else {
86       /* instance_id = 8 * t + <76543210> */
87       s.invocation_id =
88          bld.ADD(bld.SHR(t, brw_imm_ud(instance_id_shift - 3)), channels_ud);
89    }
90 }
91 
92 static void
brw_emit_tcs_thread_end(fs_visitor & s)93 brw_emit_tcs_thread_end(fs_visitor &s)
94 {
95    /* Try and tag the last URB write with EOT instead of emitting a whole
96     * separate write just to finish the thread.  There isn't guaranteed to
97     * be one, so this may not succeed.
98     */
99    if (s.mark_last_urb_write_with_eot())
100       return;
101 
102    const fs_builder bld = fs_builder(&s).at_end();
103 
104    /* Emit a URB write to end the thread.  On Broadwell, we use this to write
105     * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
106     * algorithm to set it optimally).  On other platforms, we simply write
107     * zero to a reserved/MBZ patch header DWord which has no consequence.
108     */
109    brw_reg srcs[URB_LOGICAL_NUM_SRCS];
110    srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
111    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
112    srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
113    srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
114    fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
115                             reg_undef, srcs, ARRAY_SIZE(srcs));
116    inst->eot = true;
117 }
118 
119 static void
brw_assign_tcs_urb_setup(fs_visitor & s)120 brw_assign_tcs_urb_setup(fs_visitor &s)
121 {
122    assert(s.stage == MESA_SHADER_TESS_CTRL);
123 
124    /* Rewrite all ATTR file references to HW_REGs. */
125    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
126       s.convert_attr_sources_to_hw_regs(inst);
127    }
128 }
129 
130 static bool
run_tcs(fs_visitor & s)131 run_tcs(fs_visitor &s)
132 {
133    assert(s.stage == MESA_SHADER_TESS_CTRL);
134 
135    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(s.prog_data);
136    const fs_builder bld = fs_builder(&s).at_end();
137 
138    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
139           vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
140 
141    s.payload_ = new tcs_thread_payload(s);
142 
143    /* Initialize gl_InvocationID */
144    brw_set_tcs_invocation_id(s);
145 
146    const bool fix_dispatch_mask =
147       vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
148       (s.nir->info.tess.tcs_vertices_out % 8) != 0;
149 
150    /* Fix the disptach mask */
151    if (fix_dispatch_mask) {
152       bld.CMP(bld.null_reg_ud(), s.invocation_id,
153               brw_imm_ud(s.nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
154       bld.IF(BRW_PREDICATE_NORMAL);
155    }
156 
157    nir_to_brw(&s);
158 
159    if (fix_dispatch_mask) {
160       bld.emit(BRW_OPCODE_ENDIF);
161    }
162 
163    brw_emit_tcs_thread_end(s);
164 
165    if (s.failed)
166       return false;
167 
168    brw_calculate_cfg(s);
169 
170    brw_optimize(s);
171 
172    s.assign_curb_setup();
173    brw_assign_tcs_urb_setup(s);
174 
175    brw_lower_3src_null_dest(s);
176    brw_workaround_memory_fence_before_eot(s);
177    brw_workaround_emit_dummy_mov_instruction(s);
178 
179    brw_allocate_registers(s, true /* allow_spilling */);
180 
181    brw_workaround_source_arf_before_eot(s);
182 
183    return !s.failed;
184 }
185 
186 extern "C" const unsigned *
brw_compile_tcs(const struct brw_compiler * compiler,struct brw_compile_tcs_params * params)187 brw_compile_tcs(const struct brw_compiler *compiler,
188                 struct brw_compile_tcs_params *params)
189 {
190    const struct intel_device_info *devinfo = compiler->devinfo;
191    nir_shader *nir = params->base.nir;
192    const struct brw_tcs_prog_key *key = params->key;
193    struct brw_tcs_prog_data *prog_data = params->prog_data;
194    struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
195 
196    const bool debug_enabled = brw_should_print_shader(nir, DEBUG_TCS);
197 
198    vue_prog_data->base.stage = MESA_SHADER_TESS_CTRL;
199    prog_data->base.base.ray_queries = nir->info.ray_queries;
200    prog_data->base.base.total_scratch = 0;
201 
202    nir->info.outputs_written = key->outputs_written;
203    nir->info.patch_outputs_written = key->patch_outputs_written;
204 
205    struct intel_vue_map input_vue_map;
206    brw_compute_vue_map(devinfo, &input_vue_map, nir->info.inputs_read,
207                        nir->info.separate_shader, 1);
208    brw_compute_tess_vue_map(&vue_prog_data->vue_map,
209                             nir->info.outputs_written,
210                             nir->info.patch_outputs_written);
211 
212    brw_nir_apply_key(nir, compiler, &key->base,
213                      brw_geometry_stage_dispatch_width(compiler->devinfo));
214    brw_nir_lower_vue_inputs(nir, &input_vue_map);
215    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
216                              key->_tes_primitive_mode);
217    if (key->input_vertices > 0)
218       intel_nir_lower_patch_vertices_in(nir, key->input_vertices);
219 
220    brw_postprocess_nir(nir, compiler, debug_enabled,
221                        key->base.robust_flags);
222 
223    bool has_primitive_id =
224       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
225 
226    prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices);
227 
228    if (compiler->use_tcs_multi_patch) {
229       vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
230       prog_data->instances = nir->info.tess.tcs_vertices_out;
231       prog_data->include_primitive_id = has_primitive_id;
232    } else {
233       unsigned verts_per_thread = 8;
234       vue_prog_data->dispatch_mode = INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH;
235       prog_data->instances =
236          DIV_ROUND_UP(nir->info.tess.tcs_vertices_out, verts_per_thread);
237    }
238 
239    /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
240     * That divides up as follows:
241     *
242     *     32 bytes for the patch header (tessellation factors)
243     *    480 bytes for per-patch varyings (a varying component is 4 bytes and
244     *              gl_MaxTessPatchComponents = 120)
245     *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
246     *              gl_MaxPatchVertices = 32 and
247     *              gl_MaxTessControlOutputComponents = 128)
248     *
249     *  15808 bytes left for varying packing overhead
250     */
251    const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
252    const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
253    unsigned output_size_bytes = 0;
254    /* Note that the patch header is counted in num_per_patch_slots. */
255    output_size_bytes += num_per_patch_slots * 16;
256    output_size_bytes += nir->info.tess.tcs_vertices_out *
257                         num_per_vertex_slots * 16;
258 
259    assert(output_size_bytes >= 1);
260    if (output_size_bytes > GFX7_MAX_HS_URB_ENTRY_SIZE_BYTES)
261       return NULL;
262 
263    /* URB entry sizes are stored as a multiple of 64 bytes. */
264    vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
265 
266    /* HS does not use the usual payload pushing from URB to GRFs,
267     * because we don't have enough registers for a full-size payload, and
268     * the hardware is broken on Haswell anyway.
269     */
270    vue_prog_data->urb_read_length = 0;
271 
272    if (unlikely(debug_enabled)) {
273       fprintf(stderr, "TCS Input ");
274       brw_print_vue_map(stderr, &input_vue_map, MESA_SHADER_TESS_CTRL);
275       fprintf(stderr, "TCS Output ");
276       brw_print_vue_map(stderr, &vue_prog_data->vue_map, MESA_SHADER_TESS_CTRL);
277    }
278 
279    const unsigned dispatch_width = devinfo->ver >= 20 ? 16 : 8;
280    fs_visitor v(compiler, &params->base, &key->base,
281                 &prog_data->base.base, nir, dispatch_width,
282                 params->base.stats != NULL, debug_enabled);
283    if (!run_tcs(v)) {
284       params->base.error_str =
285          ralloc_strdup(params->base.mem_ctx, v.fail_msg);
286       return NULL;
287    }
288 
289    assert(v.payload().num_regs % reg_unit(devinfo) == 0);
290    prog_data->base.base.dispatch_grf_start_reg = v.payload().num_regs / reg_unit(devinfo);
291 
292    brw_generator g(compiler, &params->base,
293                   &prog_data->base.base, MESA_SHADER_TESS_CTRL);
294    if (unlikely(debug_enabled)) {
295       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
296                                      "%s tessellation control shader %s",
297                                      nir->info.label ? nir->info.label
298                                                      : "unnamed",
299                                      nir->info.name));
300    }
301 
302    g.generate_code(v.cfg, dispatch_width, v.shader_stats,
303                    v.performance_analysis.require(), params->base.stats);
304 
305    g.add_const_data(nir->constant_data, nir->constant_data_size);
306 
307    return g.get_assembly();
308 }
309