• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28 
29 /**
30  * Return a value that is equal to the given i32 \p index if it lies in [0,num)
31  * or an undefined value in the same interval otherwise.
32  */
si_llvm_bound_index(struct si_shader_context * ctx,LLVMValueRef index,unsigned num)33 static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
34                                         unsigned num)
35 {
36    LLVMBuilderRef builder = ctx->ac.builder;
37    LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
38    LLVMValueRef cc;
39 
40    if (util_is_power_of_two_or_zero(num)) {
41       index = LLVMBuildAnd(builder, index, c_max, "");
42    } else {
43       /* In theory, this MAX pattern should result in code that is
44        * as good as the bit-wise AND above.
45        *
46        * In practice, LLVM generates worse code (at the time of
47        * writing), because its value tracking is not strong enough.
48        */
49       cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
50       index = LLVMBuildSelect(builder, cc, index, c_max, "");
51    }
52 
53    return index;
54 }
55 
load_const_buffer_desc_fast_path(struct si_shader_context * ctx)56 static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
57 {
58    LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
59    struct si_shader_selector *sel = ctx->shader->selector;
60 
61    /* Do the bounds checking with a descriptor, because
62     * doing computation and manual bounds checking of 64-bit
63     * addresses generates horrible VALU code with very high
64     * VGPR usage and very low SIMD occupancy.
65     */
66    ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");
67 
68    LLVMValueRef desc0, desc1;
69    desc0 = ptr;
70    desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);
71 
72    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
73                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
74 
75    if (ctx->screen->info.gfx_level >= GFX11)
76       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
77                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
78    else if (ctx->screen->info.gfx_level >= GFX10)
79       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
80                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
81    else
82       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
83                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
84 
85    LLVMValueRef desc_elems[] = {desc0, desc1,
86                                 LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
87                                 LLVMConstInt(ctx->ac.i32, rsrc3, false)};
88 
89    return ac_build_gather_values(&ctx->ac, desc_elems, 4);
90 }
91 
/**
 * ABI callback returning the descriptor of the constant buffer selected by
 * \p index.
 *
 * When the shader uses exactly one UBO and no SSBOs, the descriptor is built
 * by load_const_buffer_desc_fast_path(). Otherwise \p index is clamped to
 * ctx->num_const_buffers, offset by SI_NUM_SHADER_BUFFERS, and the descriptor
 * is loaded from the const_and_shader_buffers list.
 */
static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;
   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);

   const bool single_ubo_only = sel->info.base.num_ubos == 1 && sel->info.base.num_ssbos == 0;

   if (single_ubo_only)
      return load_const_buffer_desc_fast_path(ctx);

   LLVMValueRef first_ubo_slot = LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0);
   LLVMValueRef slot = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);

   slot = LLVMBuildAdd(ctx->ac.builder, slot, first_ubo_slot, "");

   return ac_build_load_to_sgpr(&ctx->ac, list, slot);
}
109 
/**
 * ABI callback returning the descriptor of the shader buffer selected by
 * \p index.
 *
 * \p write and \p non_uniform are not used by this implementation.
 */
static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write, bool non_uniform)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   /* A constant index that names a buffer passed in user SGPRs needs no
    * memory load at all: return the shader argument directly.
    */
   if (LLVMIsConstant(index)) {
      unsigned long long slot = LLVMConstIntGetZExtValue(index);

      if (slot < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs)
         return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[slot]);
   }

   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
   LLVMValueRef last_slot = LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0);

   /* Clamp the index, then mirror it: the slot actually loaded is
    * SI_NUM_SHADER_BUFFERS - 1 - index.
    */
   LLVMValueRef bounded = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
   LLVMValueRef mirrored = LLVMBuildSub(ctx->ac.builder, last_slot, bounded, "");

   return ac_build_load_to_sgpr(&ctx->ac, list, mirrored);
}
127 
128 /**
129  * Given a 256-bit resource descriptor, force the DCC enable bit to off.
130  *
131  * At least on Tonga, executing image stores on images with DCC enabled and
132  * non-trivial can eventually lead to lockups. This can occur when an
133  * application binds an image as read-only but then uses a shader that writes
134  * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
135  * program termination) in this case, but it doesn't cost much to be a bit
136  * nicer: disabling DCC in the shader still leads to undefined results but
137  * avoids the lockup.
138  */
force_dcc_off(struct si_shader_context * ctx,LLVMValueRef rsrc)139 static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
140 {
141    if (ctx->screen->info.gfx_level <= GFX7) {
142       return rsrc;
143    } else {
144       LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
145       LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
146       LLVMValueRef tmp;
147 
148       tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
149       tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
150       return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
151    }
152 }
153 
/**
 * Return \p rsrc with element 6 ANDed with the C_00A018_WRITE_COMPRESS_ENABLE
 * mask; all other elements are left untouched.
 */
static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
{
   LLVMBuilderRef b = ctx->ac.builder;
   LLVMValueRef elem_idx = LLVMConstInt(ctx->ac.i32, 6, 0);
   LLVMValueRef keep_mask = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0);

   LLVMValueRef dword = LLVMBuildExtractElement(b, rsrc, elem_idx, "");

   dword = LLVMBuildAnd(b, dword, keep_mask, "");

   return LLVMBuildInsertElement(b, rsrc, dword, elem_idx, "");
}
164 
/**
 * Patch an image descriptor according to how the shader uses it.
 *
 * - Stores on GFX9 and older: apply force_dcc_off().
 * - Non-store accesses when the screen reports has_image_load_dcc_bug and
 *   always_allow_dcc_stores: apply force_write_compress_off().
 *
 * In every other case the descriptor is returned unchanged.
 */
static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc,
                                     bool uses_store)
{
   if (uses_store) {
      if (ctx->ac.gfx_level <= GFX9)
         rsrc = force_dcc_off(ctx, rsrc);
   } else if (ctx->screen->info.has_image_load_dcc_bug &&
              ctx->screen->always_allow_dcc_stores) {
      rsrc = force_write_compress_off(ctx, rsrc);
   }

   return rsrc;
}
177 
178 /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
179  * adjust "index" to point to FMASK. */
si_load_image_desc(struct si_shader_context * ctx,LLVMValueRef list,LLVMValueRef index,enum ac_descriptor_type desc_type,bool uses_store,bool bindless)180 static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
181                                        LLVMValueRef index, enum ac_descriptor_type desc_type,
182                                        bool uses_store, bool bindless)
183 {
184    LLVMBuilderRef builder = ctx->ac.builder;
185    LLVMValueRef rsrc;
186 
187    if (desc_type == AC_DESC_BUFFER) {
188       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
189       list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
190    } else {
191       assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
192    }
193 
194    if (bindless)
195       rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
196    else
197       rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
198 
199    if (desc_type == AC_DESC_IMAGE)
200       rsrc = fixup_image_desc(ctx, rsrc, uses_store);
201 
202    return rsrc;
203 }
204 
205 /**
206  * Load an image view, fmask view. or sampler state descriptor.
207  */
si_load_sampler_desc(struct si_shader_context * ctx,LLVMValueRef list,LLVMValueRef index,enum ac_descriptor_type type)208 static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
209                                          LLVMValueRef index, enum ac_descriptor_type type)
210 {
211    LLVMBuilderRef builder = ctx->ac.builder;
212 
213    switch (type) {
214    case AC_DESC_IMAGE:
215       /* The image is at [0:7]. */
216       index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
217       break;
218    case AC_DESC_BUFFER:
219       /* The buffer is in [4:7]. */
220       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
221       list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
222       break;
223    case AC_DESC_FMASK:
224       /* The FMASK is at [8:15]. */
225       assert(ctx->screen->info.gfx_level < GFX11);
226       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
227       break;
228    case AC_DESC_SAMPLER:
229       /* The sampler state is at [12:15]. */
230       index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
231                             LLVMConstInt(ctx->ac.i32, 3, 0));
232       list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
233       break;
234    case AC_DESC_PLANE_0:
235    case AC_DESC_PLANE_1:
236    case AC_DESC_PLANE_2:
237       /* Only used for the multiplane image support for Vulkan. Should
238        * never be reached in radeonsi.
239        */
240       unreachable("Plane descriptor requested in radeonsi.");
241    }
242 
243    return ac_build_load_to_sgpr(&ctx->ac, list, index);
244 }
245 
/**
 * ABI callback that loads an image, FMASK, buffer or sampler descriptor.
 *
 * \param descriptor_set  must be 0 (asserted)
 * \param base_index      first slot of the resource (array)
 * \param constant_index  constant offset added to \p base_index
 * \param dynamic_index   for bindless access: the bindless handle; otherwise
 *                        an optional (may be NULL) dynamic offset added to
 *                        the constant index
 * \param desc_type       requested descriptor type; must not exceed
 *                        AC_DESC_BUFFER (asserted)
 * \param image           true to address image slots, false for sampler slots
 * \param write           forwarded as "uses_store" for image descriptor fixups
 * \param bindless        true to load from the bindless descriptor list
 * \return the loaded (and, for images, fixed-up) descriptor
 */
static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
                                             unsigned base_index, unsigned constant_index,
                                             LLVMValueRef dynamic_index,
                                             enum ac_descriptor_type desc_type, bool image,
                                             bool write, bool bindless)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned const_index = base_index + constant_index;

   assert(!descriptor_set);
   assert(desc_type <= AC_DESC_BUFFER);

   if (bindless) {
      LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);

      /* dynamic_index is the bindless handle */
      if (image) {
         /* Bindless image descriptors use 16-dword slots. */
         dynamic_index =
            LLVMBuildMul(builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
         /* FMASK is right after the image.
          *
          * The index was just scaled with an i64 constant, so the increment
          * must be i64 as well: LLVM integer arithmetic requires both
          * operands to have the same type.
          */
         if (desc_type == AC_DESC_FMASK) {
            dynamic_index =
               LLVMBuildAdd(builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 1, 0), "");
         }

         return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
      }

      /* Since bindless handle arithmetic can contain an unsigned integer
       * wraparound and si_load_sampler_desc assumes there isn't any,
       * use GEP without "inbounds" (inside ac_build_pointer_add)
       * to prevent incorrect code generation and hangs.
       */
      dynamic_index =
         LLVMBuildMul(builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
      list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
      return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
   }

   unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;

   /* Redirect invalid resource indices to the first array element. */
   if (const_index >= num_slots)
      const_index = base_index;

   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);

   if (dynamic_index) {
      index = LLVMBuildAdd(builder, index, dynamic_index, "");

      /* From the GL_ARB_shader_image_load_store extension spec:
       *
       *    If a shader performs an image load, store, or atomic
       *    operation using an image variable declared as an array,
       *    and if the index used to select an individual element is
       *    negative or greater than or equal to the size of the
       *    array, the results of the operation are undefined but may
       *    not lead to termination.
       */
      index = si_llvm_bound_index(ctx, index, num_slots);
   }

   if (image) {
      /* Fast path if the image is in user SGPRs. */
      if (!dynamic_index &&
          const_index < ctx->shader->selector->cs_num_images_in_user_sgprs &&
          (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) {
         LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->cs_image[const_index]);

         if (desc_type == AC_DESC_IMAGE)
            rsrc = fixup_image_desc(ctx, rsrc, write);
         return rsrc;
      }

      /* FMASKs are separate from images. */
      if (desc_type == AC_DESC_FMASK) {
         index = LLVMBuildAdd(builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
      }
      /* Mirror the index: the slot loaded is SI_NUM_IMAGE_SLOTS - 1 - index. */
      index = LLVMBuildSub(builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
                           index, "");
      return si_load_image_desc(ctx, list, index, desc_type, write, false);
   }

   /* Sampler slots are offset by SI_NUM_IMAGE_SLOTS / 2 within the list. */
   index = LLVMBuildAdd(builder, index,
                        LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
   return si_load_sampler_desc(ctx, list, index, desc_type);
}
336 
si_llvm_init_resource_callbacks(struct si_shader_context * ctx)337 void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
338 {
339    ctx->abi.load_ubo = load_ubo;
340    ctx->abi.load_ssbo = load_ssbo;
341    ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
342 }
343