/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
#ifdef DEBUG
   /* DEBUG-only options: */
   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)
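
/* Usage sketch (assuming the usual util/debug flag parsing): the options
 * above are selected through the IR3_SHADER_DEBUG environment variable as a
 * comma-separated list of flag names, e.g.
 *
 *    IR3_SHADER_DEBUG=fs,disasm
 *
 * IR3_SHADER_OVERRIDE_PATH names a path to load shader overrides from; when
 * it is set, the shader cache is disabled (see ir3_compiler_create() below).
 */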

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

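/* Tear down a compiler instance created with ir3_compiler_create(): destroys
 * the on-disk shader cache handle and releases the ralloc context (and with
 * it everything allocated against the compiler).
 */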
void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

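/* NIR lowering/feature flags common to all generations; the per-generation
 * nir_shader_compiler_options structs below start from these and add or
 * override entries.
 */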
#define COMMON_OPTIONS \
   .lower_fpow = true, \
   .lower_scmp = true, \
   .lower_flrp16 = true, \
   .lower_flrp32 = true, \
   .lower_flrp64 = true, \
   .lower_ffract = true, \
   .lower_fmod = true, \
   .lower_fdiv = true, \
   .lower_isign = true, \
   .lower_ldexp = true, \
   .lower_uadd_carry = true, \
   .lower_usub_borrow = true, \
   .lower_mul_high = true, \
   .lower_mul_2x32_64 = true, \
   .fuse_ffma16 = true, \
   .fuse_ffma32 = true, \
   .fuse_ffma64 = true, \
   .vertex_id_zero_based = false, \
   .lower_extract_byte = true, \
   .lower_extract_word = true, \
   .lower_insert_byte = true, \
   .lower_insert_word = true, \
   .lower_helper_invocation = true, \
   .lower_bitfield_insert_to_shifts = true, \
   .lower_bitfield_extract_to_shifts = true, \
   .lower_pack_half_2x16 = true, \
   .lower_pack_snorm_4x8 = true, \
   .lower_pack_snorm_2x16 = true, \
   .lower_pack_unorm_4x8 = true, \
   .lower_pack_unorm_2x16 = true, \
   .lower_unpack_half_2x16 = true, \
   .lower_unpack_snorm_4x8 = true, \
   .lower_unpack_snorm_2x16 = true, \
   .lower_unpack_unorm_4x8 = true, \
   .lower_unpack_unorm_2x16 = true, \
   .lower_pack_split = true, \
   .use_interpolated_input_intrinsics = true, \
   .lower_rotate = true, \
   .lower_to_scalar = true, \
   .has_imul24 = true, \
   .has_fsub = true, \
   .has_isub = true, \
   .force_indirect_unrolling_sampler = true, \
   .lower_uniforms_to_ubo = true, \
   .use_scoped_barrier = true, \
   .max_unroll_iterations = 32

static const nir_shader_compiler_options nir_options = {
   COMMON_OPTIONS,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
};

/* we don't want to lower vertex_id to _zero_based on newer gpus: */
static const nir_shader_compiler_options nir_options_a6xx = {
   COMMON_OPTIONS,
   .vectorize_io = true,
   .force_indirect_unrolling = nir_var_all,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_device_index_to_zero = true,
   .has_udot_4x8 = true,
   .has_sudot_4x8 = true,
};

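/* Create and configure a compiler instance for the given device/GPU id. Most
 * of the fields set here are per-generation limits and workarounds that the
 * rest of ir3 consults (const-file sizes, register-file size, wave size,
 * instruction alignment, etc.).
 */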
struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;

   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->robust_buffer_access2 = options->robust_buffer_access2;

   /* All known GPUs have 32k local memory (aka shared) */
   compiler->local_mem_size = 32 * 1024;
   /* TODO see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = 2;
   compiler->max_waves = 16;

   compiler->max_variable_workgroup_size = 1024;

   const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything else,
       * and separate limits. There seems to be a shared limit, but it's
       * higher than the vert or frag limits.
       *
       * Also, based on observations on a630/a650/a660, max_const_pipeline
       * has to be 512 when all geometry stages are present, otherwise a GPU
       * hang happens. Accordingly, the maximum safe size for each stage
       * should stay under (max_const_pipeline / 5 stages), taking the 4-vec4
       * alignment of const files into account.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to be different on different models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute = 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      /* TODO: implement private memory on earlier gens */
      compiler->has_pvtmem = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->storage_16bit = dev_info->a6xx.storage_16bit;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;

      compiler->shared_consts_base_offset = 504;
      compiler->shared_consts_size = 8;
      compiler->geom_shared_consts_size_quirk = 16;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;
   }

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   if (compiler->gen >= 6) {
      compiler->threadsize_base = 64;
   } else if (compiler->gen >= 4) {
      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
       * 1.1 subgroupSize which is 32.
       */
      compiler->threadsize_base = 32;
   } else {
      compiler->threadsize_base = 8;
   }

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;

   compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   if (compiler->gen >= 6) {
      compiler->nir_options = nir_options_a6xx;
      compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
      compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
   } else {
      compiler->nir_options = nir_options;
      /* a2xx compiler doesn't handle indirect: */
      if (compiler->gen <= 2)
         compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of 16-bit
    * operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

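/* Return the NIR options the frontend should compile with for this compiler
 * instance (per-generation, with the runtime adjustments made in
 * ir3_compiler_create()).
 */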
const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}
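
/* Usage sketch (assumptions: a valid fd_device/fd_dev_id pair from the
 * winsys, error handling omitted). A driver typically creates one compiler
 * per device, hands its NIR options to the frontend, and destroys it on
 * teardown:
 *
 *    struct ir3_compiler_options opts = {
 *       .disable_cache = false,
 *    };
 *    struct ir3_compiler *c = ir3_compiler_create(dev, dev_id, &opts);
 *    const nir_shader_compiler_options *nir_opts = ir3_get_compiler_options(c);
 *    ... compile shaders against nir_opts ...
 *    ir3_compiler_destroy(c);
 */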