• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
26 
27 #include "util/ralloc.h"
28 
29 #include "freedreno_dev_info.h"
30 
31 #include "ir3_compiler.h"
32 
33 static const struct debug_named_value shader_debug_options[] = {
34    /* clang-format off */
35    {"vs",         IR3_DBG_SHADER_VS,  "Print shader disasm for vertex shaders"},
36    {"tcs",        IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
37    {"tes",        IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
38    {"gs",         IR3_DBG_SHADER_GS,  "Print shader disasm for geometry shaders"},
39    {"fs",         IR3_DBG_SHADER_FS,  "Print shader disasm for fragment shaders"},
40    {"cs",         IR3_DBG_SHADER_CS,  "Print shader disasm for compute shaders"},
41    {"internal",   IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
42    {"disasm",     IR3_DBG_DISASM,     "Dump NIR and adreno shader disassembly"},
43    {"optmsgs",    IR3_DBG_OPTMSGS,    "Enable optimizer debug messages"},
44    {"forces2en",  IR3_DBG_FORCES2EN,  "Force s2en mode for tex sampler instructions"},
45    {"nouboopt",   IR3_DBG_NOUBOOPT,   "Disable lowering UBO to uniform"},
46    {"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
47    {"nocache",    IR3_DBG_NOCACHE,    "Disable shader cache"},
48    {"spillall",   IR3_DBG_SPILLALL,   "Spill as much as possible to test the spiller"},
49    {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
50    {"fullsync",   IR3_DBG_FULLSYNC,   "Add (sy) + (ss) after each cat5/cat6"},
51    {"fullnop",    IR3_DBG_FULLNOP,    "Add nops before each instruction"},
52 #ifdef DEBUG
53    /* DEBUG-only options: */
54    {"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
55    {"ramsgs",     IR3_DBG_RAMSGS,     "Enable register-allocation debug messages"},
56 #endif
57    DEBUG_NAMED_VALUE_END
58    /* clang-format on */
59 };
60 
61 DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
62                             shader_debug_options, 0)
63 DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
64                       NULL)
65 
66 enum ir3_shader_debug ir3_shader_debug = 0;
67 const char *ir3_shader_override_path = NULL;
68 
69 void
ir3_compiler_destroy(struct ir3_compiler * compiler)70 ir3_compiler_destroy(struct ir3_compiler *compiler)
71 {
72    disk_cache_destroy(compiler->disk_cache);
73    ralloc_free(compiler);
74 }
75 
76 static const nir_shader_compiler_options ir3_base_options = {
77    .lower_fpow = true,
78    .lower_scmp = true,
79    .lower_flrp16 = true,
80    .lower_flrp32 = true,
81    .lower_flrp64 = true,
82    .lower_ffract = true,
83    .lower_fmod = true,
84    .lower_fdiv = true,
85    .lower_isign = true,
86    .lower_ldexp = true,
87    .lower_uadd_carry = true,
88    .lower_usub_borrow = true,
89    .lower_mul_high = true,
90    .lower_mul_2x32_64 = true,
91    .fuse_ffma16 = true,
92    .fuse_ffma32 = true,
93    .fuse_ffma64 = true,
94    .vertex_id_zero_based = false,
95    .lower_extract_byte = true,
96    .lower_extract_word = true,
97    .lower_insert_byte = true,
98    .lower_insert_word = true,
99    .lower_helper_invocation = true,
100    .lower_bitfield_insert = true,
101    .lower_bitfield_extract = true,
102    .lower_pack_half_2x16 = true,
103    .lower_pack_snorm_4x8 = true,
104    .lower_pack_snorm_2x16 = true,
105    .lower_pack_unorm_4x8 = true,
106    .lower_pack_unorm_2x16 = true,
107    .lower_unpack_half_2x16 = true,
108    .lower_unpack_snorm_4x8 = true,
109    .lower_unpack_snorm_2x16 = true,
110    .lower_unpack_unorm_4x8 = true,
111    .lower_unpack_unorm_2x16 = true,
112    .lower_pack_split = true,
113    .use_interpolated_input_intrinsics = true,
114    .lower_to_scalar = true,
115    .has_imul24 = true,
116    .has_fsub = true,
117    .has_isub = true,
118    .force_indirect_unrolling_sampler = true,
119    .lower_uniforms_to_ubo = true,
120    .max_unroll_iterations = 32,
121 
122    .lower_cs_local_index_to_id = true,
123    .lower_wpos_pntc = true,
124 
125    .lower_int64_options = (nir_lower_int64_options)~0,
126    .lower_doubles_options = (nir_lower_doubles_options)~0,
127 };
128 
129 struct ir3_compiler *
ir3_compiler_create(struct fd_device * dev,const struct fd_dev_id * dev_id,const struct fd_dev_info * dev_info,const struct ir3_compiler_options * options)130 ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
131                     const struct fd_dev_info *dev_info,
132                     const struct ir3_compiler_options *options)
133 {
134    struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);
135 
136    ir3_shader_debug = debug_get_option_ir3_shader_debug();
137    ir3_shader_override_path =
138       __normal_user() ? debug_get_option_ir3_shader_override_path() : NULL;
139 
140    if (ir3_shader_override_path) {
141       ir3_shader_debug |= IR3_DBG_NOCACHE;
142    }
143 
144    compiler->dev = dev;
145    compiler->dev_id = dev_id;
146    compiler->gen = fd_dev_gen(dev_id);
147    compiler->is_64bit = fd_dev_64b(dev_id);
148    compiler->options = *options;
149 
150    /* TODO see if older GPU's were different here */
151    compiler->branchstack_size = 64;
152    compiler->wave_granularity = dev_info->wave_granularity;
153    compiler->max_waves = 16;
154 
155    compiler->max_variable_workgroup_size = 1024;
156 
157    compiler->local_mem_size = dev_info->cs_shared_mem_size;
158 
159    if (compiler->gen >= 6) {
160       compiler->samgq_workaround = true;
161       /* a6xx split the pipeline state into geometry and fragment state, in
162        * order to let the VS run ahead of the FS. As a result there are now
163        * separate const files for the the fragment shader and everything
164        * else, and separate limits. There seems to be a shared limit, but
165        * it's higher than the vert or frag limits.
166        *
167        * Also, according to the observation on a630/a650/a660, max_const_pipeline
168        * has to be 512 when all geometry stages are present. Otherwise a gpu hang
169        * happens. Accordingly maximum safe size for each stage should be under
170        * (max_const_pipeline / 5 (stages)) with 4 vec4's alignment considered for
171        * const files.
172        *
173        * Only when VS and FS stages are present, the limit is 640.
174        *
175        * TODO: The shared limit seems to be different on different models.
176        */
177       compiler->max_const_pipeline = 512;
178       compiler->max_const_frag = 512;
179       compiler->max_const_geom = 512;
180       compiler->max_const_safe = 100;
181 
182       /* Compute shaders don't share a const file with the FS. Instead they
183        * have their own file, which is smaller than the FS one.
184        *
185        * TODO: is this true on earlier gen's?
186        */
187       compiler->max_const_compute = 256;
188 
189       /* TODO: implement clip+cull distances on earlier gen's */
190       compiler->has_clip_cull = true;
191 
192       compiler->has_preamble = true;
193 
194       compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;
195 
196       compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;
197 
198       compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
199       compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
200 
201       if (compiler->gen == 6 && options->shared_push_consts) {
202          compiler->shared_consts_base_offset = 504;
203          compiler->shared_consts_size = 8;
204          compiler->geom_shared_consts_size_quirk = 16;
205       } else {
206          compiler->shared_consts_base_offset = -1;
207          compiler->shared_consts_size = 0;
208          compiler->geom_shared_consts_size_quirk = 0;
209       }
210 
211       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
212       compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
213       compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
214       compiler->load_inline_uniforms_via_preamble_ldgk = dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
215    } else {
216       compiler->max_const_pipeline = 512;
217       compiler->max_const_geom = 512;
218       compiler->max_const_frag = 512;
219       compiler->max_const_compute = 512;
220 
221       /* Note: this will have to change if/when we support tess+GS on
222        * earlier gen's.
223        */
224       compiler->max_const_safe = 256;
225    }
226 
227    /* This is just a guess for a4xx. */
228    compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
229    /* TODO: implement private memory on earlier gen's */
230    compiler->has_pvtmem = compiler->gen >= 5;
231 
232    compiler->has_isam_ssbo = compiler->gen >= 6;
233 
234    if (compiler->gen >= 6) {
235       compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
236    } else if (compiler->gen >= 4) {
237       /* On a4xx-a5xx, using r24.x and above requires using the smallest
238        * threadsize.
239        */
240       compiler->reg_size_vec4 = 48;
241    } else {
242       /* TODO: confirm this */
243       compiler->reg_size_vec4 = 96;
244    }
245 
246    if (compiler->gen >= 6) {
247       compiler->threadsize_base = 64;
248    } else if (compiler->gen >= 4) {
249       /* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
250        * 1.1 subgroupSize which is 32.
251        */
252       compiler->threadsize_base = 32;
253    } else {
254       compiler->threadsize_base = 8;
255    }
256 
257    if (compiler->gen >= 4) {
258       /* need special handling for "flat" */
259       compiler->flat_bypass = true;
260       compiler->levels_add_one = false;
261       compiler->unminify_coords = false;
262       compiler->txf_ms_with_isaml = false;
263       compiler->array_index_add_half = true;
264       compiler->instr_align = 16;
265       compiler->const_upload_unit = 4;
266    } else {
267       /* no special handling for "flat" */
268       compiler->flat_bypass = false;
269       compiler->levels_add_one = true;
270       compiler->unminify_coords = true;
271       compiler->txf_ms_with_isaml = true;
272       compiler->array_index_add_half = false;
273       compiler->instr_align = 4;
274       compiler->const_upload_unit = 8;
275    }
276 
277    compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
278    compiler->has_shared_regfile = compiler->gen >= 5;
279 
280    /* The driver can't request this unless preambles are supported. */
281    if (options->push_ubo_with_preamble)
282       assert(compiler->has_preamble);
283 
284    /* Set up nir shader compiler options, using device-specific overrides of our base settings. */
285    compiler->nir_options = ir3_base_options;
286 
287    if (compiler->gen >= 6) {
288       compiler->nir_options.vectorize_io = true,
289       compiler->nir_options.force_indirect_unrolling = nir_var_all,
290 
291       compiler->nir_options.lower_device_index_to_zero = true,
292       compiler->nir_options.has_udot_4x8 = true,
293       compiler->nir_options.has_sudot_4x8 = true,
294       compiler->nir_options.has_udot_4x8 = dev_info->a6xx.has_dp2acc;
295       compiler->nir_options.has_sudot_4x8 = dev_info->a6xx.has_dp2acc;
296       compiler->nir_options.has_udot_4x8_sat = dev_info->a6xx.has_dp2acc;
297       compiler->nir_options.has_sudot_4x8_sat = dev_info->a6xx.has_dp2acc;
298    } else if (compiler->gen >= 3 && compiler->gen <= 5) {
299       compiler->nir_options.vertex_id_zero_based = true;
300    } else if (compiler->gen <= 2) {
301       /* a2xx compiler doesn't handle indirect: */
302       compiler->nir_options.force_indirect_unrolling = nir_var_all;
303    }
304 
305    if (options->lower_base_vertex) {
306       compiler->nir_options.lower_base_vertex = true;
307    }
308 
309    /* 16-bit ALU op generation is mostly controlled by frontend compiler options, but
310     * this core NIR option enables some optimizations of 16-bit operations.
311     */
312    if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
313       compiler->nir_options.support_16bit_alu = true;
314 
315    if (!options->disable_cache)
316       ir3_disk_cache_init(compiler);
317 
318    return compiler;
319 }
320 
321 const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler * compiler)322 ir3_get_compiler_options(struct ir3_compiler *compiler)
323 {
324    return &compiler->nir_options;
325 }
326