/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "meta/radv_meta.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "nir/nir_serialize.h"
#include "nir/radv_nir.h"
#include "spirv/nir_spirv.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/os_time.h"
#include "util/u_atomic.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_private.h"
#include "radv_shader.h"
#include "radv_shader_args.h"
#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"
#include "vk_render_pass.h"
#include "vk_util.h"

#include "util/u_debug.h"
#include "ac_binary.h"
#include "ac_nir.h"
#include "ac_shader_util.h"
#include "aco_interface.h"
#include "sid.h"
#include "vk_format.h"

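/* Emit the PM4 packets that bind a compute shader: program address, RSRC
 * registers, compute resource limits and threadgroup dimensions.
 */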
void
radv_emit_compute_shader(const struct radv_physical_device *pdevice, struct radeon_cmdbuf *cs,
                         const struct radv_shader *shader)
{
   uint64_t va = radv_shader_get_va(shader);
   unsigned threads_per_threadgroup;
   unsigned threadgroups_per_cu = 1;
   unsigned waves_per_threadgroup;
   unsigned max_waves_per_sh = 0;

   radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);

   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
   radeon_emit(cs, shader->config.rsrc1);
   radeon_emit(cs, shader->config.rsrc2);
   if (pdevice->rad_info.gfx_level >= GFX10) {
      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
   }

   /* Calculate best compute resource limits. */
   threads_per_threadgroup =
      shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);

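   /* On GFX10+, let two single-wave threadgroups share a CU, which presumably
    * improves latency hiding when each threadgroup is only one wave wide.
    */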
   if (pdevice->rad_info.gfx_level >= GFX10 && waves_per_threadgroup == 1)
      threadgroups_per_cu = 2;

   radeon_set_sh_reg(
      cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
      ac_get_compute_resource_limits(&pdevice->rad_info, waves_per_threadgroup, max_waves_per_sh, threadgroups_per_cu));

   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
}

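/* Record the PM4 packets for a compute pipeline once at creation time into a
 * small heap-allocated command stream that is replayed when the pipeline is
 * bound.
 */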
static void
radv_compute_generate_pm4(const struct radv_device *device, struct radv_compute_pipeline *pipeline,
                          struct radv_shader *shader)
{
   struct radv_physical_device *pdevice = device->physical_device;
   struct radeon_cmdbuf *cs = &pipeline->base.cs;

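   /* Reserve enough dwords for the packets emitted by radv_emit_compute_shader();
    * the three extra dwords on GFX10+ cover the COMPUTE_PGM_RSRC3 packet.
    */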
   cs->reserved_dw = cs->max_dw = pdevice->rad_info.gfx_level >= GFX10 ? 19 : 16;
   cs->buf = malloc(cs->max_dw * 4);

   radv_emit_compute_shader(pdevice, cs, shader);

   assert(pipeline->base.cs.cdw <= pipeline->base.cs.max_dw);
}

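/* Initialize the pipeline state derived from the pipeline layout and the
 * compiled compute shader, then pre-record the PM4 state.
 */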
void
radv_compute_pipeline_init(const struct radv_device *device, struct radv_compute_pipeline *pipeline,
                           const struct radv_pipeline_layout *layout, struct radv_shader *shader)
{
   pipeline->base.need_indirect_descriptor_sets |= radv_shader_need_indirect_descriptor_sets(shader);

   pipeline->base.push_constant_size = layout->push_constant_size;
   pipeline->base.dynamic_offset_count = layout->dynamic_offset_count;

   pipeline->base.shader_upload_seq = shader->upload_seq;

   radv_compute_generate_pm4(device, pipeline, shader);
}

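/* Compile a compute shader all the way from SPIR-V to hardware code: translate
 * to NIR, optimize, gather shader info, declare the shader arguments and
 * assemble. On success the caller owns *cs_binary and the returned shader
 * reference.
 */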
struct radv_shader *
radv_compile_cs(struct radv_device *device, struct vk_pipeline_cache *cache, struct radv_shader_stage *cs_stage,
                bool keep_executable_info, bool keep_statistic_info, bool is_internal,
                struct radv_shader_binary **cs_binary)
{
   struct radv_shader *cs_shader;

   /* Compile SPIR-V shader to NIR. */
   cs_stage->nir = radv_shader_spirv_to_nir(device, cs_stage, NULL, is_internal);

   radv_optimize_nir(cs_stage->nir, cs_stage->key.optimisations_disabled);

   /* Gather info again; information such as outputs_read can be out-of-date. */
   nir_shader_gather_info(cs_stage->nir, nir_shader_get_entrypoint(cs_stage->nir));

   /* Run the shader info pass. */
   radv_nir_shader_info_init(cs_stage->stage, MESA_SHADER_NONE, &cs_stage->info);
   radv_nir_shader_info_pass(device, cs_stage->nir, &cs_stage->layout, &cs_stage->key, NULL, RADV_PIPELINE_COMPUTE,
                             false, &cs_stage->info);

   radv_declare_shader_args(device, NULL, &cs_stage->info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &cs_stage->args);

   cs_stage->info.user_sgprs_locs = cs_stage->args.user_sgprs_locs;
   cs_stage->info.inline_push_constant_mask = cs_stage->args.ac.inline_push_const_mask;

   /* Postprocess NIR. */
   radv_postprocess_nir(device, NULL, cs_stage);

   bool dump_shader = radv_can_dump_shader(device, cs_stage->nir, false);
   if (dump_shader)
      nir_print_shader(cs_stage->nir, stderr);

   /* Compile NIR shader to AMD assembly. */
   *cs_binary =
      radv_shader_nir_to_asm(device, cs_stage, &cs_stage->nir, 1, NULL, keep_executable_info, keep_statistic_info);

   cs_shader = radv_shader_create(device, cache, *cs_binary, keep_executable_info || dump_shader);

   radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, *cs_binary, cs_shader, &cs_stage->nir, 1,
                                   &cs_stage->info);

   if (keep_executable_info && cs_stage->spirv.size) {
      cs_shader->spirv = malloc(cs_stage->spirv.size);
      memcpy(cs_shader->spirv, cs_stage->spirv.data, cs_stage->spirv.size);
      cs_shader->spirv_size = cs_stage->spirv.size;
   }

   return cs_shader;
}

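/* Look the pipeline up in the pipeline cache by the SHA-1 of its stage and
 * layout, compile the compute shader only on a cache miss, and fill in the
 * VK_EXT_pipeline_creation_feedback data either way.
 */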
static VkResult
radv_compute_pipeline_compile(struct radv_compute_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
                              struct radv_device *device, struct vk_pipeline_cache *cache,
                              const struct radv_shader_stage_key *stage_key,
                              const VkPipelineShaderStageCreateInfo *pStage,
                              const VkPipelineCreationFeedbackCreateInfo *creation_feedback)
{
   struct radv_shader_binary *cs_binary = NULL;
   unsigned char hash[20];
   bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.create_flags);
   bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.create_flags);
   struct radv_shader_stage cs_stage = {0};
   VkPipelineCreationFeedback pipeline_feedback = {
      .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
   };
   VkResult result = VK_SUCCESS;

   int64_t pipeline_start = os_time_get_nano();

   radv_pipeline_stage_init(pStage, pipeline_layout, stage_key, &cs_stage);

   radv_hash_shaders(device, hash, &cs_stage, 1, pipeline_layout, NULL);

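   /* The first 8 bytes of the SHA-1 double as the 64-bit pipeline hash. */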
   pipeline->base.pipeline_hash = *(uint64_t *)hash;

   bool found_in_application_cache = true;
   if (!keep_executable_info &&
       radv_pipeline_cache_search(device, cache, &pipeline->base, hash, &found_in_application_cache)) {
      if (found_in_application_cache)
         pipeline_feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
      result = VK_SUCCESS;
      goto done;
   }

   if (pipeline->base.create_flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)
      return VK_PIPELINE_COMPILE_REQUIRED;

   int64_t stage_start = os_time_get_nano();

   pipeline->base.shaders[MESA_SHADER_COMPUTE] = radv_compile_cs(
      device, cache, &cs_stage, keep_executable_info, keep_statistic_info, pipeline->base.is_internal, &cs_binary);

   cs_stage.feedback.duration += os_time_get_nano() - stage_start;

   if (!keep_executable_info) {
      radv_pipeline_cache_insert(device, cache, &pipeline->base, hash);
   }

   free(cs_binary);
   if (radv_can_dump_shader_stats(device, cs_stage.nir)) {
      radv_dump_shader_stats(device, &pipeline->base, pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE,
                             stderr);
   }
   ralloc_free(cs_stage.nir);

done:
   pipeline_feedback.duration = os_time_get_nano() - pipeline_start;

   if (creation_feedback) {
      *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;

      if (creation_feedback->pipelineStageCreationFeedbackCount) {
         assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
         creation_feedback->pPipelineStageCreationFeedbacks[0] = cs_stage.feedback;
      }
   }

   return result;
}

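/* Create a single compute pipeline object: allocate it, compile (or fetch from
 * the cache) its shader, and record the PM4 state used at bind time.
 */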
VkResult
radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkComputePipelineCreateInfo *pCreateInfo,
                             const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(vk_pipeline_cache, cache, _cache);
   RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
   struct radv_compute_pipeline *pipeline;
   VkResult result;

   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL) {
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   radv_pipeline_init(device, &pipeline->base, RADV_PIPELINE_COMPUTE);
   pipeline->base.create_flags = vk_compute_pipeline_create_flags(pCreateInfo);
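   /* Pipelines created through the meta cache are driver-internal helpers. */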
   pipeline->base.is_internal = _cache == device->meta_state.cache;

   const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
      vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);

   struct radv_shader_stage_key stage_key =
      radv_pipeline_get_shader_key(device, &pCreateInfo->stage, pipeline->base.create_flags, pCreateInfo->pNext);

   result = radv_compute_pipeline_compile(pipeline, pipeline_layout, device, cache, &stage_key, &pCreateInfo->stage,
                                          creation_feedback);
   if (result != VK_SUCCESS) {
      radv_pipeline_destroy(device, &pipeline->base, pAllocator);
      return result;
   }

   radv_compute_pipeline_init(device, pipeline, pipeline_layout, pipeline->base.shaders[MESA_SHADER_COMPUTE]);

   *pPipeline = radv_pipeline_to_handle(&pipeline->base);
   radv_rmv_log_compute_pipeline_create(device, &pipeline->base, pipeline->base.is_internal);
   return VK_SUCCESS;
}

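/* Batch-create pipelines with the error semantics the Vulkan spec requires:
 * keep going after a failure unless EARLY_RETURN_ON_FAILURE is set, return an
 * error if any creation failed, and set every failed or unattempted entry to
 * VK_NULL_HANDLE.
 */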
static VkResult
radv_create_compute_pipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
                              const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator,
                              VkPipeline *pPipelines)
{
   VkResult result = VK_SUCCESS;

   unsigned i = 0;
   for (; i < count; i++) {
      VkResult r;
      r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, &pPipelines[i]);
      if (r != VK_SUCCESS) {
         result = r;
         pPipelines[i] = VK_NULL_HANDLE;

         VkPipelineCreateFlagBits2KHR create_flags = vk_compute_pipeline_create_flags(&pCreateInfos[i]);
         if (create_flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
            break;
      }
   }

   for (; i < count; ++i)
      pPipelines[i] = VK_NULL_HANDLE;

   return result;
}

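/* Release the compute shader reference held by the pipeline. */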
void
radv_destroy_compute_pipeline(struct radv_device *device, struct radv_compute_pipeline *pipeline)
{
   if (pipeline->base.shaders[MESA_SHADER_COMPUTE])
      radv_shader_unref(device, pipeline->base.shaders[MESA_SHADER_COMPUTE]);
}

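/* Vulkan entry point; a thin wrapper around radv_create_compute_pipelines(). */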
VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
                            const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipelines)
{
   return radv_create_compute_pipelines(_device, pipelineCache, count, pCreateInfos, pAllocator, pPipelines);
}