/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
   {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
   {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
   {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
   {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
   {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
   {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
   {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
   {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
   {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
   {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
#ifdef DEBUG
   /* DEBUG-only options: */
   {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
   {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)
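
/* Usage sketch (assumed invocation; the app name is a placeholder): the
 * IR3_SHADER_DEBUG env var is parsed as a comma-separated list of the flags
 * above, e.g.
 *
 *    IR3_SHADER_DEBUG=fs,disasm ./my_gl_app
 *
 * Setting IR3_SHADER_OVERRIDE_PATH also forces "nocache", as done in
 * ir3_compiler_create() below.
 */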

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

#define COMMON_OPTIONS                                                        \
   .lower_fpow = true,                                                        \
   .lower_scmp = true,                                                        \
   .lower_flrp16 = true,                                                      \
   .lower_flrp32 = true,                                                      \
   .lower_flrp64 = true,                                                      \
   .lower_ffract = true,                                                      \
   .lower_fmod = true,                                                        \
   .lower_fdiv = true,                                                        \
   .lower_isign = true,                                                       \
   .lower_ldexp = true,                                                       \
   .lower_uadd_carry = true,                                                  \
   .lower_usub_borrow = true,                                                 \
   .lower_mul_high = true,                                                    \
   .lower_mul_2x32_64 = true,                                                 \
   .fuse_ffma16 = true,                                                       \
   .fuse_ffma32 = true,                                                       \
   .fuse_ffma64 = true,                                                       \
   .vertex_id_zero_based = false,                                             \
   .lower_extract_byte = true,                                                \
   .lower_extract_word = true,                                                \
   .lower_insert_byte = true,                                                 \
   .lower_insert_word = true,                                                 \
   .lower_helper_invocation = true,                                           \
   .lower_bitfield_insert_to_shifts = true,                                   \
   .lower_bitfield_extract_to_shifts = true,                                  \
   .lower_pack_half_2x16 = true,                                              \
   .lower_pack_snorm_4x8 = true,                                              \
   .lower_pack_snorm_2x16 = true,                                             \
   .lower_pack_unorm_4x8 = true,                                              \
   .lower_pack_unorm_2x16 = true,                                             \
   .lower_unpack_half_2x16 = true,                                            \
   .lower_unpack_snorm_4x8 = true,                                            \
   .lower_unpack_snorm_2x16 = true,                                           \
   .lower_unpack_unorm_4x8 = true,                                            \
   .lower_unpack_unorm_2x16 = true,                                           \
   .lower_pack_split = true,                                                  \
   .use_interpolated_input_intrinsics = true,                                 \
   .lower_rotate = true,                                                      \
   .lower_to_scalar = true,                                                   \
   .has_imul24 = true,                                                        \
   .has_fsub = true,                                                          \
   .has_isub = true,                                                          \
   .force_indirect_unrolling_sampler = true,                                  \
   .lower_uniforms_to_ubo = true,                                             \
   .use_scoped_barrier = true,                                                \
   .max_unroll_iterations = 32

static const nir_shader_compiler_options nir_options = {
   COMMON_OPTIONS,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
};

/* we don't want to lower vertex_id to _zero_based on newer gpus: */
static const nir_shader_compiler_options nir_options_a6xx = {
   COMMON_OPTIONS,
   .vectorize_io = true,
   .force_indirect_unrolling = nir_var_all,
   .lower_wpos_pntc = true,
   .lower_cs_local_index_to_id = true,

   /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c
    * but that should be harmless for GL since 64b is not
    * supported there.
    */
   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_device_index_to_zero = true,
   .has_udot_4x8 = true,
   .has_sudot_4x8 = true,
};

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      !__check_suid() ? debug_get_option_ir3_shader_override_path() : NULL;

   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->robust_buffer_access2 = options->robust_buffer_access2;

   /* All known GPUs have 32k of local memory (aka shared) */
   compiler->local_mem_size = 32 * 1024;
   /* TODO see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = 2;
   compiler->max_waves = 16;

   compiler->max_variable_workgroup_size = 1024;

   const struct fd_dev_info *dev_info = fd_dev_info(compiler->dev_id);

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything
       * else, and separate limits. There seems to be a shared limit, but
       * it's higher than the vert or frag limits.
       *
       * Also, based on observations on a630/a650/a660, max_const_pipeline
       * has to be 512 when all geometry stages are present; otherwise the
       * GPU hangs. Accordingly, the maximum safe size for each stage should
       * stay under (max_const_pipeline / 5 stages), aligned down to 4 vec4s
       * for the const files.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to differ between models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;
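
      /* A worked check of max_const_safe against the rule above (an assumed
       * derivation, not from hw docs): 512 / 5 stages = 102.4 vec4s per
       * stage, which aligned down to a multiple of 4 vec4s gives 100.
       */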

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute = 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      /* TODO: implement private memory on earlier gens */
      compiler->has_pvtmem = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->storage_16bit = dev_info->a6xx.storage_16bit;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;

      compiler->shared_consts_base_offset = 504;
      compiler->shared_consts_size = 8;
      compiler->geom_shared_consts_size_quirk = 16;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;
   }

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   if (compiler->gen >= 6) {
      compiler->threadsize_base = 64;
   } else if (compiler->gen >= 4) {
      /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
       * 1.1 subgroupSize, which is 32.
       */
      compiler->threadsize_base = 32;
   } else {
      compiler->threadsize_base = 8;
   }

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;

   compiler->push_ubo_with_preamble = options->push_ubo_with_preamble;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   if (compiler->gen >= 6) {
      compiler->nir_options = nir_options_a6xx;
      compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
      compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
   } else {
      compiler->nir_options = nir_options;
      /* a2xx compiler doesn't handle indirect: */
      if (compiler->gen <= 2)
         compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but
    * this core NIR option enables some optimizations of 16-bit operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}
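
/* A minimal usage sketch of this API (illustrative only; "dev" and "dev_id"
 * are assumed to come from the freedreno winsys layer, and the option values
 * are placeholders):
 *
 *    struct ir3_compiler_options opts = {
 *       .robust_buffer_access2 = false,
 *       .push_ubo_with_preamble = false,
 *       .disable_cache = false,
 *    };
 *    struct ir3_compiler *compiler = ir3_compiler_create(dev, dev_id, &opts);
 *    const nir_shader_compiler_options *nir_opts =
 *       ir3_get_compiler_options(compiler);
 *    ... build NIR with nir_opts and compile shaders ...
 *    ir3_compiler_destroy(compiler);
 */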