1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * based in part on v3dv driver which is:
5 * Copyright © 2019 Raspberry Pi
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26
27 #include <assert.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <vulkan/vulkan.h>
32
33 #include "compiler/shader_enums.h"
34 #include "hwdef/rogue_hw_utils.h"
35 #include "nir/nir.h"
36 #include "pvr_bo.h"
37 #include "pvr_csb.h"
38 #include "pvr_csb_enum_helpers.h"
39 #include "pvr_hardcode.h"
40 #include "pvr_pds.h"
41 #include "pvr_private.h"
42 #include "pvr_shader.h"
43 #include "pvr_types.h"
44 #include "rogue/rogue.h"
45 #include "rogue/rogue_build_data.h"
46 #include "util/log.h"
47 #include "util/macros.h"
48 #include "util/ralloc.h"
49 #include "util/u_math.h"
50 #include "vk_alloc.h"
51 #include "vk_log.h"
52 #include "vk_object.h"
53 #include "vk_util.h"
54
55 /*****************************************************************************
56 PDS functions
57 *****************************************************************************/
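/* The helpers below share a common pattern: the PDS generator is first run
 * without a destination buffer (e.g. with PDS_GENERATE_SIZES) to learn the
 * code/data segment sizes, a host staging buffer of that size is allocated,
 * the real segments are then generated into it and uploaded to the GPU with
 * pvr_gpu_upload_pds(), and finally the staging buffer is freed.
 */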
58
59 /* If allocator == NULL, the internal one will be used. */
60 static VkResult pvr_pds_coeff_program_create_and_upload(
61 struct pvr_device *device,
62 const VkAllocationCallbacks *allocator,
63 const uint32_t *fpu_iterators,
64 uint32_t fpu_iterators_count,
65 const uint32_t *destinations,
66 struct pvr_pds_upload *const pds_upload_out)
67 {
68 struct pvr_pds_coeff_loading_program program = {
69 .num_fpu_iterators = fpu_iterators_count,
70 };
71 uint32_t staging_buffer_size;
72 uint32_t *staging_buffer;
73 VkResult result;
74
75 assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);
76
77 /* Get the size of the program and then allocate that much memory. */
78 pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);
79
80 staging_buffer_size =
81 (program.code_size + program.data_size) * sizeof(*staging_buffer);
82
83 staging_buffer = vk_alloc2(&device->vk.alloc,
84 allocator,
85 staging_buffer_size,
86 8,
87 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
88 if (!staging_buffer)
89 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
90
91 /* FIXME: Should we save pointers when we redesign the pds gen api ? */
92 typed_memcpy(program.FPU_iterators,
93 fpu_iterators,
94 program.num_fpu_iterators);
95
96 typed_memcpy(program.destination, destinations, program.num_fpu_iterators);
97
98    /* Generate the program into the staging_buffer. */
99 pvr_pds_coefficient_loading(&program,
100 staging_buffer,
101 PDS_GENERATE_CODEDATA_SEGMENTS);
102
103 /* FIXME: Figure out the define for alignment of 16. */
104 result = pvr_gpu_upload_pds(device,
105 &staging_buffer[0],
106 program.data_size,
107 16,
108 &staging_buffer[program.data_size],
109 program.code_size,
110 16,
111 16,
112 pds_upload_out);
113 if (result != VK_SUCCESS) {
114 vk_free2(&device->vk.alloc, allocator, staging_buffer);
115 return result;
116 }
117
118 vk_free2(&device->vk.alloc, allocator, staging_buffer);
119
120 return VK_SUCCESS;
121 }
122
123 /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
124 /* If allocator == NULL, the internal one will be used. */
125 VkResult pvr_pds_fragment_program_create_and_upload(
126 struct pvr_device *device,
127 const VkAllocationCallbacks *allocator,
128 const struct pvr_bo *fragment_shader_bo,
129 uint32_t fragment_temp_count,
130 enum rogue_msaa_mode msaa_mode,
131 bool has_phase_rate_change,
132 struct pvr_pds_upload *const pds_upload_out)
133 {
134 const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
135 sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
136 struct pvr_pds_kickusc_program program = { 0 };
137 uint32_t staging_buffer_size;
138 uint32_t *staging_buffer;
139 VkResult result;
140
141 /* FIXME: Should it be passing in the USC offset rather than address here?
142 */
143 /* Note this is not strictly required to be done before calculating the
144 * staging_buffer_size in this particular case. It can also be done after
145 * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
146 */
147 pvr_pds_setup_doutu(&program.usc_task_control,
148 fragment_shader_bo->vma->dev_addr.addr,
149 fragment_temp_count,
150 sample_rate,
151 has_phase_rate_change);
152
153 pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
154
155 staging_buffer_size =
156 (program.code_size + program.data_size) * sizeof(*staging_buffer);
157
158 staging_buffer = vk_alloc2(&device->vk.alloc,
159 allocator,
160 staging_buffer_size,
161 8,
162 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
163 if (!staging_buffer)
164 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
165
166 pvr_pds_kick_usc(&program,
167 staging_buffer,
168 0,
169 false,
170 PDS_GENERATE_CODEDATA_SEGMENTS);
171
172 /* FIXME: Figure out the define for alignment of 16. */
173 result = pvr_gpu_upload_pds(device,
174 &staging_buffer[0],
175 program.data_size,
176 16,
177 &staging_buffer[program.data_size],
178 program.code_size,
179 16,
180 16,
181 pds_upload_out);
182 if (result != VK_SUCCESS) {
183 vk_free2(&device->vk.alloc, allocator, staging_buffer);
184 return result;
185 }
186
187 vk_free2(&device->vk.alloc, allocator, staging_buffer);
188
189 return VK_SUCCESS;
190 }
191
192 static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
193 const struct pvr_device_info *dev_info,
194 bool robust_buffer_access)
195 {
196    /* FIXME: Use more local variables to improve formatting. */
197
198 /* Maximum memory allocation needed for const map entries in
199 * pvr_pds_generate_vertex_primary_program().
200 * When robustBufferAccess is disabled, it must be >= 410.
201 * When robustBufferAccess is enabled, it must be >= 570.
202 *
203 * 1. Size of entry for base instance
204 * (pvr_const_map_entry_base_instance)
205 *
206 * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
207 * if (!robustBufferAccess)
208 * size of vertex attribute entry
209 * (pvr_const_map_entry_vertex_attribute_address) +
210 * else
211 * size of robust vertex attribute entry
212 * (pvr_const_map_entry_robust_vertex_attribute_address) +
213 * size of entry for max attribute index
214 * (pvr_const_map_entry_vertex_attribute_max_index) +
215 * fi
216 * size of Unified Store burst entry
217 * (pvr_const_map_entry_literal32) +
218 * size of entry for vertex stride
219 * (pvr_const_map_entry_literal32) +
220 * size of entries for DDMAD control word
221 * (num_ddmad_literals * pvr_const_map_entry_literal32))
222 *
223 * 3. Size of entry for DOUTW vertex/instance control word
224 * (pvr_const_map_entry_literal32)
225 *
226 * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
227 */
228
229 const size_t attribute_size =
230 (!robust_buffer_access)
231 ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
232 : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
233 sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
234
235 /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
236 * and is increased by one DWORD to contain the data for the DDMADT's
237 * out-of-bounds check.
238 */
239 const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
240 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
241
242 return (sizeof(struct pvr_const_map_entry_base_instance) +
243 PVR_MAX_VERTEX_INPUT_BINDINGS *
244 (attribute_size +
245 (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
246 sizeof(struct pvr_const_map_entry_literal32)) +
247 sizeof(struct pvr_const_map_entry_literal32) +
248 sizeof(struct pvr_const_map_entry_doutu_address));
249 }
250
251 /* This is a const pointer to an array of pvr_pds_vertex_dma structs.
252 * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
253 */
254 typedef struct pvr_pds_vertex_dma (
255 *const
256 pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
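/* A minimal usage sketch for the typedef above (variable names are
 * illustrative only). Passing the address of the fixed-size array preserves
 * its bound, so ARRAY_SIZE() keeps working on the pointee:
 *
 *    struct pvr_pds_vertex_dma dmas[PVR_MAX_VERTEX_ATTRIB_DMAS];
 *    pvr_pds_attrib_dma_descriptions_array_ptr ptr = &dmas;
 *
 *    assert(ARRAY_SIZE(*ptr) == PVR_MAX_VERTEX_ATTRIB_DMAS);
 */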
257
258 /* dma_descriptions_out_ptr is a pointer to the array used as output.
259 * The whole array might not be filled so dma_count_out indicates how many
260 * elements were used.
261 */
262 static void pvr_pds_vertex_attrib_init_dma_descriptions(
263 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
264 const struct rogue_vs_build_data *vs_data,
265 pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
266 uint32_t *const dma_count_out)
267 {
268 struct pvr_pds_vertex_dma *const dma_descriptions =
269 *dma_descriptions_out_ptr;
270 uint32_t dma_count = 0;
271
272 if (!vertex_input_state) {
273 *dma_count_out = 0;
274 return;
275 }
276
277 for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
278 i++) {
279 const VkVertexInputAttributeDescription *const attrib_desc =
280 &vertex_input_state->pVertexAttributeDescriptions[i];
281 const VkVertexInputBindingDescription *binding_desc = NULL;
282
283       /* Find the matching binding description. */
284 for (uint32_t j = 0;
285 j < vertex_input_state->vertexBindingDescriptionCount;
286 j++) {
287 const VkVertexInputBindingDescription *const current_binding_desc =
288 &vertex_input_state->pVertexBindingDescriptions[j];
289
290 if (current_binding_desc->binding == attrib_desc->binding) {
291 binding_desc = current_binding_desc;
292 break;
293 }
294 }
295
296 /* From the Vulkan 1.2.195 spec for
297 * VkPipelineVertexInputStateCreateInfo:
298 *
299 * "For every binding specified by each element of
300 * pVertexAttributeDescriptions, a
301 * VkVertexInputBindingDescription must exist in
302 * pVertexBindingDescriptions with the same value of binding"
303 *
304 * So we don't check if we found the matching binding description
305 * or not.
306 */
307
308 struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];
309
310 size_t location = attrib_desc->location;
311 assert(location < vs_data->inputs.num_input_vars);
312
313 dma_desc->offset = attrib_desc->offset;
314 dma_desc->stride = binding_desc->stride;
315
316 dma_desc->flags = 0;
317
318 if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
319 dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
320
321 dma_desc->size_in_dwords = vs_data->inputs.components[location];
322 /* TODO: This will be different when other types are supported.
323 * Store in vs_data with base and components?
324 */
325 /* TODO: Use attrib_desc->format. */
326 dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
327 dma_desc->destination = vs_data->inputs.base[location];
328 dma_desc->binding_index = attrib_desc->binding;
329 dma_desc->divisor = 1;
330 dma_desc->robustness_buffer_offset = 0;
331
332 ++dma_count;
333 }
334
335 *dma_count_out = dma_count;
336 }
337
338 static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
339 struct pvr_device *const device,
340 const VkAllocationCallbacks *const allocator,
341 struct pvr_pds_vertex_primary_program_input *const input,
342 struct pvr_pds_attrib_program *const program_out)
343 {
344 const size_t const_entries_size_in_bytes =
345 pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
346 &device->pdevice->dev_info,
347 device->features.robustBufferAccess);
348 struct pvr_pds_upload *const program = &program_out->program;
349 struct pvr_pds_info *const info = &program_out->info;
350 struct pvr_const_map_entry *entries_buffer;
351 ASSERTED uint32_t code_size_in_dwords;
352 size_t staging_buffer_size;
353 uint32_t *staging_buffer;
354 VkResult result;
355
356 memset(info, 0, sizeof(*info));
357
358 entries_buffer = vk_alloc2(&device->vk.alloc,
359 allocator,
360 const_entries_size_in_bytes,
361 8,
362 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
363 if (!entries_buffer)
364 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
365
366 info->entries = entries_buffer;
367 info->entries_size_in_bytes = const_entries_size_in_bytes;
368
369 pvr_pds_generate_vertex_primary_program(input,
370 NULL,
371 info,
372 device->features.robustBufferAccess,
373 &device->pdevice->dev_info);
374
375 code_size_in_dwords = info->code_size_in_dwords;
376 staging_buffer_size = info->code_size_in_dwords * sizeof(*staging_buffer);
377
378 staging_buffer = vk_alloc2(&device->vk.alloc,
379 allocator,
380 staging_buffer_size,
381 8,
382 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
383 if (!staging_buffer) {
384 vk_free2(&device->vk.alloc, allocator, entries_buffer);
385 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
386 }
387
388 /* This also fills in info->entries. */
389 pvr_pds_generate_vertex_primary_program(input,
390 staging_buffer,
391 info,
392 device->features.robustBufferAccess,
393 &device->pdevice->dev_info);
394
395 assert(info->code_size_in_dwords <= code_size_in_dwords);
396
397 /* FIXME: Add a vk_realloc2() ? */
398 entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
399 entries_buffer,
400 info->entries_written_size_in_bytes,
401 8,
402 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
403 if (!entries_buffer) {
404 vk_free2(&device->vk.alloc, allocator, staging_buffer);
405 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
406 }
407
408 info->entries = entries_buffer;
409 info->entries_size_in_bytes = info->entries_written_size_in_bytes;
410
411 /* FIXME: Figure out the define for alignment of 16. */
412 result = pvr_gpu_upload_pds(device,
413 NULL,
414 0,
415 0,
416 staging_buffer,
417 info->code_size_in_dwords,
418 16,
419 16,
420 program);
421 if (result != VK_SUCCESS) {
422 vk_free2(&device->vk.alloc, allocator, entries_buffer);
423 vk_free2(&device->vk.alloc, allocator, staging_buffer);
424
425 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
426 }
427
428 vk_free2(&device->vk.alloc, allocator, staging_buffer);
429
430 return VK_SUCCESS;
431 }
432
433 static inline void pvr_pds_vertex_attrib_program_destroy(
434 struct pvr_device *const device,
435 const struct VkAllocationCallbacks *const allocator,
436 struct pvr_pds_attrib_program *const program)
437 {
438 pvr_bo_free(device, program->program.pvr_bo);
439 vk_free2(&device->vk.alloc, allocator, program->info.entries);
440 }
441
442 /* This is a const pointer to an array of pvr_pds_attrib_program structs.
443 * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
444 */
445 typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
446 [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
447
448 /* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
449 * inputs. This will bake the code segment and create a template of the data
450 * segment for the command buffer to fill in.
451 */
452 /* If allocator == NULL, the internal one will be used.
453 *
454 * programs_out_ptr is a pointer to the array where the outputs will be placed.
455  */
456 static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
457 struct pvr_device *device,
458 const VkAllocationCallbacks *const allocator,
459 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
460 uint32_t usc_temp_count,
461 const struct rogue_vs_build_data *vs_data,
462 pvr_pds_attrib_programs_array_ptr programs_out_ptr)
463 {
464 struct pvr_pds_vertex_dma dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
465 struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
466 struct pvr_pds_vertex_primary_program_input input = {
467 .dma_list = dma_descriptions,
468 };
469 VkResult result;
470
471 pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
472 vs_data,
473 &dma_descriptions,
474 &input.dma_count);
475
476 pvr_pds_setup_doutu(&input.usc_task_control,
477 0,
478 usc_temp_count,
479 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
480 false);
481
482 /* TODO: If statements for all the "bRequired"s + ui32ExtraFlags. */
483
484 /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
485 * typedef.
486 */
487 for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
488 switch (i) {
489 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
490 input.flags = 0;
491 break;
492
493 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
494 input.flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
495 break;
496
497 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
498 /* We unset INSTANCE and set INDIRECT. */
499 input.flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
500 break;
501
502 default:
503 unreachable("Invalid vertex attrib program type.");
504 }
505
506 result =
507 pvr_pds_vertex_attrib_program_create_and_upload(device,
508 allocator,
509 &input,
510 &programs_out[i]);
511 if (result != VK_SUCCESS) {
512 for (uint32_t j = 0; j < i; j++) {
513 pvr_pds_vertex_attrib_program_destroy(device,
514 allocator,
515 &programs_out[j]);
516 }
517
518 return result;
519 }
520 }
521
522 return VK_SUCCESS;
523 }
524
525 static size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
526 {
527 /* Maximum memory allocation needed for const map entries in
528 * pvr_pds_generate_descriptor_upload_program().
529 * It must be >= 688 bytes. This size is calculated as the sum of:
530 *
531 * 1. Max. number of descriptor sets (8) * (
532 * size of descriptor entry
533 * (pvr_const_map_entry_descriptor_set) +
534 * size of Common Store burst entry
535 * (pvr_const_map_entry_literal32))
536 *
537 * 2. Max. number of PDS program buffers (24) * (
538 * size of the largest buffer structure
539 * (pvr_const_map_entry_constant_buffer) +
540 * size of Common Store burst entry
541 * (pvr_const_map_entry_literal32)
542 *
543 * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
544 */
545
546 /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
547 * say that it should be 8.
548     * Figure out a define for this, or is the comment wrong?
549 */
550 return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
551 sizeof(struct pvr_const_map_entry_literal32)) +
552 PVR_PDS_MAX_BUFFERS *
553 (sizeof(struct pvr_const_map_entry_constant_buffer) +
554 sizeof(struct pvr_const_map_entry_literal32)) +
555 sizeof(struct pvr_const_map_entry_doutu_address));
556 }
557
558 /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
559 * structs.
560 */
561 typedef struct pvr_pds_buffer (
562 *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];
563
564 /**
565 * \brief Setup buffers for the PDS descriptor program.
566 *
567 * Sets up buffers required by the PDS gen api based on compiler info.
568 *
569  * For compile-time static constants that need DMAing, this uploads them and
570  * returns the upload in \p static_consts_pvr_bo_out.
571 */
572 static VkResult pvr_pds_descriptor_program_setup_buffers(
573 struct pvr_device *device,
574 bool robust_buffer_access,
575 const struct rogue_compile_time_consts_data *compile_time_consts_data,
576 const struct rogue_ubo_data *ubo_data,
577 pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
578 uint32_t *const buffer_count_out,
579 struct pvr_bo **const static_consts_pvr_bo_out)
580 {
581 struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
582 uint32_t buffer_count = 0;
583
584 for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
585 struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];
586
587 /* This is fine since buffers_out_ptr is a pointer to an array. */
588 assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
589
590 current_buffer->type = PVR_BUFFER_TYPE_UBO;
591 current_buffer->size_in_dwords = ubo_data->size[i];
592 current_buffer->destination = ubo_data->dest[i];
593
594 current_buffer->buffer_id = buffer_count;
595 current_buffer->desc_set = ubo_data->desc_set[i];
596 current_buffer->binding = ubo_data->binding[i];
597 /* TODO: Is this always the case?
598 * E.g. can multiple UBOs have the same base buffer?
599 */
600 current_buffer->source_offset = 0;
601
602 buffer_count++;
603 }
604
605 if (compile_time_consts_data->static_consts.num > 0) {
606 VkResult result;
607
608 assert(compile_time_consts_data->static_consts.num <=
609 ARRAY_SIZE(compile_time_consts_data->static_consts.value));
610
611 /* This is fine since buffers_out_ptr is a pointer to an array. */
612 assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
613
614       /* TODO: Is it possible to have multiple static consts buffers where the
615        * destinations are not adjoining? If so we need to handle that.
616 * Currently we're only setting up a single buffer.
617 */
618 buffers[buffer_count++] = (struct pvr_pds_buffer){
619 .type = PVR_BUFFER_TYPES_COMPILE_TIME,
620 .size_in_dwords = compile_time_consts_data->static_consts.num,
621 .destination = compile_time_consts_data->static_consts.dest,
622 };
623
624 result = pvr_gpu_upload(device,
625 device->heaps.general_heap,
626 compile_time_consts_data->static_consts.value,
627 compile_time_consts_data->static_consts.num *
628 ROGUE_REG_SIZE_BYTES,
629 ROGUE_REG_SIZE_BYTES,
630 static_consts_pvr_bo_out);
631 if (result != VK_SUCCESS)
632 return result;
633 } else {
634 *static_consts_pvr_bo_out = NULL;
635 }
636
637 *buffer_count_out = buffer_count;
638
639 return VK_SUCCESS;
640 }
641
642 static VkResult pvr_pds_descriptor_program_create_and_upload(
643 struct pvr_device *const device,
644 const VkAllocationCallbacks *const allocator,
645 const struct rogue_compile_time_consts_data *const compile_time_consts_data,
646 const struct rogue_ubo_data *const ubo_data,
647 const struct pvr_explicit_constant_usage *const explicit_const_usage,
648 const struct pvr_pipeline_layout *const layout,
649 enum pvr_stage_allocation stage,
650 struct pvr_stage_allocation_descriptor_state *const descriptor_state)
651 {
652 const size_t const_entries_size_in_bytes =
653 pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
654 struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
655 struct pvr_descriptor_program_input program = { 0 };
656 struct pvr_const_map_entry *entries_buffer;
657 ASSERTED uint32_t code_size_in_dwords;
658 uint32_t staging_buffer_size;
659 uint32_t *staging_buffer;
660 VkResult result;
661
662 assert(stage != PVR_STAGE_ALLOCATION_COUNT);
663
664 *pds_info = (struct pvr_pds_info){ 0 };
665
666 result = pvr_pds_descriptor_program_setup_buffers(
667 device,
668 device->features.robustBufferAccess,
669 compile_time_consts_data,
670 ubo_data,
671 &program.buffers,
672 &program.buffer_count,
673 &descriptor_state->static_consts);
674 if (result != VK_SUCCESS)
675 return result;
676
677 if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
678 assert(!"Unimplemented");
679
680 for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
681 const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
682 &layout->register_layout_in_dwords_per_stage[stage][set_num];
683 const uint32_t start_offset = explicit_const_usage->start_offset;
684
685 /* TODO: Use compiler usage info to optimize this? */
686
687 /* Only dma primaries if they are actually required. */
688 if (reg_layout->primary_size) {
689 program.descriptor_sets[program.descriptor_set_count++] =
690 (struct pvr_pds_descriptor_set){
691 .descriptor_set = set_num,
692 .size_in_dwords = reg_layout->primary_size,
693 .destination = reg_layout->primary_offset + start_offset,
694 .primary = true,
695 };
696 }
697
698 /* Only dma secondaries if they are actually required. */
699 if (!reg_layout->secondary_size)
700 continue;
701
702 program.descriptor_sets[program.descriptor_set_count++] =
703 (struct pvr_pds_descriptor_set){
704 .descriptor_set = set_num,
705 .size_in_dwords = reg_layout->secondary_size,
706 .destination = reg_layout->secondary_offset + start_offset,
707 };
708 }
709
710 entries_buffer = vk_alloc2(&device->vk.alloc,
711 allocator,
712 const_entries_size_in_bytes,
713 8,
714 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
715 if (!entries_buffer) {
716 pvr_bo_free(device, descriptor_state->static_consts);
717
718 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
719 }
720
721 pds_info->entries = entries_buffer;
722 pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
723
724 pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
725
726 code_size_in_dwords = pds_info->code_size_in_dwords;
727 staging_buffer_size =
728 pds_info->code_size_in_dwords * sizeof(*staging_buffer);
729
730 if (!staging_buffer_size) {
731 vk_free2(&device->vk.alloc, allocator, entries_buffer);
732
733 *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
734
735 return VK_SUCCESS;
736 }
737
738 staging_buffer = vk_alloc2(&device->vk.alloc,
739 allocator,
740 staging_buffer_size,
741 8,
742 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
743 if (!staging_buffer) {
744 pvr_bo_free(device, descriptor_state->static_consts);
745 vk_free2(&device->vk.alloc, allocator, entries_buffer);
746
747 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
748 }
749
750 pvr_pds_generate_descriptor_upload_program(&program,
751 staging_buffer,
752 pds_info);
753
754 assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
755
756 /* FIXME: use vk_realloc2() ? */
757 entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
758 entries_buffer,
759 pds_info->entries_written_size_in_bytes,
760 8,
761 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
762 if (!entries_buffer) {
763 pvr_bo_free(device, descriptor_state->static_consts);
764 vk_free2(&device->vk.alloc, allocator, staging_buffer);
765
766 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
767 }
768
769 pds_info->entries = entries_buffer;
770 pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
771
772 /* FIXME: Figure out the define for alignment of 16. */
773 result = pvr_gpu_upload_pds(device,
774 NULL,
775 0,
776 0,
777 staging_buffer,
778 pds_info->code_size_in_dwords,
779 16,
780 16,
781 &descriptor_state->pds_code);
782 if (result != VK_SUCCESS) {
783 pvr_bo_free(device, descriptor_state->static_consts);
784 vk_free2(&device->vk.alloc, allocator, entries_buffer);
785 vk_free2(&device->vk.alloc, allocator, staging_buffer);
786
787 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
788 }
789
790 vk_free2(&device->vk.alloc, allocator, staging_buffer);
791
792 return VK_SUCCESS;
793 }
794
795 static void pvr_pds_descriptor_program_destroy(
796 struct pvr_device *const device,
797 const struct VkAllocationCallbacks *const allocator,
798 struct pvr_stage_allocation_descriptor_state *const descriptor_state)
799 {
800 pvr_bo_free(device, descriptor_state->pds_code.pvr_bo);
801 vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
802 pvr_bo_free(device, descriptor_state->static_consts);
803 }
804
805 static void pvr_pds_compute_program_setup(
806 const struct pvr_device_info *dev_info,
807 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
808 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
809 uint32_t barrier_coefficient,
810 bool add_base_workgroup,
811 uint32_t usc_temps,
812 pvr_dev_addr_t usc_shader_dev_addr,
813 struct pvr_pds_compute_shader_program *const program)
814 {
815 *program = (struct pvr_pds_compute_shader_program){
816 /* clang-format off */
817 .local_input_regs = {
818 local_input_regs[0],
819 local_input_regs[1],
820 local_input_regs[2]
821 },
822 .work_group_input_regs = {
823 work_group_input_regs[0],
824 work_group_input_regs[1],
825 work_group_input_regs[2]
826 },
827 .global_input_regs = {
828 [0 ... (PVR_WORKGROUP_DIMENSIONS - 1)] =
829 PVR_PDS_COMPUTE_INPUT_REG_UNUSED
830 },
831 /* clang-format on */
832 .barrier_coefficient = barrier_coefficient,
833 .flattened_work_groups = true,
834 .clear_pds_barrier = false,
835 .add_base_workgroup = add_base_workgroup,
836 .kick_usc = true,
837 };
838
839 STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
840 PVR_WORKGROUP_DIMENSIONS);
841 STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
842 PVR_WORKGROUP_DIMENSIONS);
843 STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
844 PVR_WORKGROUP_DIMENSIONS);
845
846 pvr_pds_setup_doutu(&program->usc_task_control,
847 usc_shader_dev_addr.addr,
848 usc_temps,
849 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
850 false);
851
852 pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
853 }
854
855 /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
856 */
857 static VkResult pvr_pds_compute_program_create_and_upload(
858 struct pvr_device *const device,
859 const VkAllocationCallbacks *const allocator,
860 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
861 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
862 uint32_t barrier_coefficient,
863 uint32_t usc_temps,
864 pvr_dev_addr_t usc_shader_dev_addr,
865 struct pvr_pds_upload *const pds_upload_out,
866 struct pvr_pds_info *const pds_info_out)
867 {
868 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
869 struct pvr_pds_compute_shader_program program;
870 uint32_t staging_buffer_size;
871 uint32_t *staging_buffer;
872 VkResult result;
873
874 pvr_pds_compute_program_setup(dev_info,
875 local_input_regs,
876 work_group_input_regs,
877 barrier_coefficient,
878 false,
879 usc_temps,
880 usc_shader_dev_addr,
881 &program);
882
883 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
884 * is in bytes. Investigate this.
885 */
886 staging_buffer_size =
887 (program.code_size + program.data_size) * sizeof(*staging_buffer);
888
889 staging_buffer = vk_alloc2(&device->vk.alloc,
890 allocator,
891 staging_buffer_size,
892 8,
893 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
894 if (!staging_buffer)
895 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
896
897 /* FIXME: pvr_pds_compute_shader doesn't implement
898 * PDS_GENERATE_CODEDATA_SEGMENTS.
899 */
900 pvr_pds_compute_shader(&program,
901 &staging_buffer[0],
902 PDS_GENERATE_CODE_SEGMENT,
903 dev_info);
904
905 pvr_pds_compute_shader(&program,
906 &staging_buffer[program.code_size],
907 PDS_GENERATE_DATA_SEGMENT,
908 dev_info);
909
910 /* FIXME: Figure out the define for alignment of 16. */
911 result = pvr_gpu_upload_pds(device,
912 &staging_buffer[program.code_size],
913 program.data_size,
914 16,
915 &staging_buffer[0],
916 program.code_size,
917 16,
918 16,
919 pds_upload_out);
920 if (result != VK_SUCCESS) {
921 vk_free2(&device->vk.alloc, allocator, staging_buffer);
922 return result;
923 }
924
925 *pds_info_out = (struct pvr_pds_info){
926 .temps_required = program.highest_temp,
927 .code_size_in_dwords = program.code_size,
928 .data_size_in_dwords = program.data_size,
929 };
930
931 vk_free2(&device->vk.alloc, allocator, staging_buffer);
932
933 return VK_SUCCESS;
934 }
935
936 static void pvr_pds_compute_program_destroy(
937 struct pvr_device *const device,
938 const struct VkAllocationCallbacks *const allocator,
939 struct pvr_pds_upload *const pds_program,
940 struct pvr_pds_info *const pds_info)
941 {
942 /* We don't allocate an entries buffer so we don't need to free it */
943 pvr_bo_free(device, pds_program->pvr_bo);
944 }
945
946 /* This only uploads the code segment. The data segment will need to be patched
947 * with the base workgroup before uploading.
948 */
949 static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
950 struct pvr_device *const device,
951 const VkAllocationCallbacks *const allocator,
952 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
953 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
954 uint32_t barrier_coefficient,
955 uint32_t usc_temps,
956 pvr_dev_addr_t usc_shader_dev_addr,
957 struct pvr_pds_base_workgroup_program *program_out)
958 {
959 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
960 struct pvr_pds_compute_shader_program program;
961 uint32_t buffer_size;
962 uint32_t *buffer;
963 VkResult result;
964
965 pvr_pds_compute_program_setup(dev_info,
966 local_input_regs,
967 work_group_input_regs,
968 barrier_coefficient,
969 true,
970 usc_temps,
971 usc_shader_dev_addr,
972 &program);
973
974 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
975 * is in bytes. Investigate this.
976 */
977 buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer);
978
979 buffer = vk_alloc2(&device->vk.alloc,
980 allocator,
981 buffer_size,
982 8,
983 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
984 if (!buffer)
985 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
986
987 pvr_pds_compute_shader(&program,
988 &buffer[0],
989 PDS_GENERATE_CODE_SEGMENT,
990 dev_info);
991
992 /* FIXME: Figure out the define for alignment of 16. */
993 result = pvr_gpu_upload_pds(device,
994 NULL,
995 0,
996 0,
997 buffer,
998 program.code_size,
999 16,
1000 16,
1001 &program_out->code_upload);
1002 if (result != VK_SUCCESS) {
1003 vk_free2(&device->vk.alloc, allocator, buffer);
1004 return result;
1005 }
1006
1007 pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
1008
1009 program_out->data_section = buffer;
1010
1011 /* We'll need to patch the base workgroup in the PDS data section before
1012 * dispatch so we save the offsets at which to patch. We only need to save
1013 * the offset for the first workgroup id since the workgroup ids are stored
1014 * contiguously in the data segment.
1015 */
1016 program_out->base_workgroup_data_patching_offset =
1017 program.base_workgroup_constant_offset_in_dwords[0];
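   /* A sketch of the dispatch-time patch that this offset enables (variable
    * names are illustrative, not the actual command buffer code):
    *
    *    uint32_t *const data = program_out->data_section;
    *    const uint32_t off = program_out->base_workgroup_data_patching_offset;
    *
    *    data[off + 0] = base_workgroup_id_x;
    *    data[off + 1] = base_workgroup_id_y;
    *    data[off + 2] = base_workgroup_id_z;
    *
    * i.e. the three ids are written contiguously starting at the saved
    * offset, as described above.
    */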
1018
1019 program_out->info = (struct pvr_pds_info){
1020 .temps_required = program.highest_temp,
1021 .code_size_in_dwords = program.code_size,
1022 .data_size_in_dwords = program.data_size,
1023 };
1024
1025 return VK_SUCCESS;
1026 }
1027
1028 static void pvr_pds_compute_base_workgroup_variant_program_finish(
1029 struct pvr_device *device,
1030 const VkAllocationCallbacks *const allocator,
1031 struct pvr_pds_base_workgroup_program *const state)
1032 {
1033 pvr_bo_free(device, state->code_upload.pvr_bo);
1034 vk_free2(&device->vk.alloc, allocator, state->data_section);
1035 }
1036
1037 /******************************************************************************
1038 Generic pipeline functions
1039 ******************************************************************************/
1040
1041 static void pvr_pipeline_init(struct pvr_device *device,
1042 enum pvr_pipeline_type type,
1043 struct pvr_pipeline *const pipeline)
1044 {
1045 assert(!pipeline->layout);
1046
1047 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
1048
1049 pipeline->type = type;
1050 }
1051
1052 static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
1053 {
1054 vk_object_base_finish(&pipeline->base);
1055 }
1056
1057 /******************************************************************************
1058 Compute pipeline functions
1059 ******************************************************************************/
1060
1061 /* Compiles and uploads shaders and PDS programs. */
1062 static VkResult pvr_compute_pipeline_compile(
1063 struct pvr_device *const device,
1064 struct pvr_pipeline_cache *pipeline_cache,
1065 const VkComputePipelineCreateInfo *pCreateInfo,
1066 const VkAllocationCallbacks *const allocator,
1067 struct pvr_compute_pipeline *const compute_pipeline)
1068 {
1069 struct rogue_compile_time_consts_data compile_time_consts_data;
1070 uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
1071 struct pvr_explicit_constant_usage explicit_const_usage;
1072 uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
1073 struct rogue_ubo_data ubo_data;
1074 uint32_t barrier_coefficient;
1075 uint32_t usc_temps;
1076 VkResult result;
1077
1078 if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
1079 struct pvr_hard_code_compute_build_info build_info;
1080
1081 result = pvr_hard_code_compute_pipeline(device,
1082 &compute_pipeline->state.shader,
1083 &build_info);
1084 if (result != VK_SUCCESS)
1085 return result;
1086
1087 ubo_data = build_info.ubo_data;
1088 compile_time_consts_data = build_info.compile_time_consts_data;
1089
1090 /* We make sure that the compiler's unused reg value is compatible with
1091 * the pds api.
1092 */
1093 STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);
1094
1095 barrier_coefficient = build_info.barrier_reg;
1096
1097 /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
1098 local_input_regs[0] = build_info.local_invocation_regs[0];
1099 local_input_regs[1] = build_info.local_invocation_regs[1];
1100       /* This is not a mistake. We want to assign element 1 to element 2. */
1101 local_input_regs[2] = build_info.local_invocation_regs[1];
1102
1103 STATIC_ASSERT(
1104 __same_type(work_group_input_regs, build_info.work_group_regs));
1105 typed_memcpy(work_group_input_regs,
1106 build_info.work_group_regs,
1107 PVR_WORKGROUP_DIMENSIONS);
1108
1109 usc_temps = build_info.usc_temps;
1110
1111 explicit_const_usage = build_info.explicit_conts_usage;
1112
1113 } else {
1114 /* FIXME: Compile and upload the shader. */
1115 /* FIXME: Initialize the shader state and setup build info. */
1116 abort();
1117    }
1118
1119 result = pvr_pds_descriptor_program_create_and_upload(
1120 device,
1121 allocator,
1122 &compile_time_consts_data,
1123 &ubo_data,
1124 &explicit_const_usage,
1125 compute_pipeline->base.layout,
1126 PVR_STAGE_ALLOCATION_COMPUTE,
1127 &compute_pipeline->state.descriptor);
1128 if (result != VK_SUCCESS)
1129 goto err_free_shader;
1130
1131 result = pvr_pds_compute_program_create_and_upload(
1132 device,
1133 allocator,
1134 local_input_regs,
1135 work_group_input_regs,
1136 barrier_coefficient,
1137 usc_temps,
1138 compute_pipeline->state.shader.bo->vma->dev_addr,
1139 &compute_pipeline->state.primary_program,
1140 &compute_pipeline->state.primary_program_info);
1141 if (result != VK_SUCCESS)
1142 goto err_free_descriptor_program;
1143
1144 /* If the workgroup ID is required, then we require the base workgroup
1145 * variant of the PDS compute program as well.
1146 */
1147 compute_pipeline->state.flags.base_workgroup =
1148 work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1149 work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1150 work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
1151
1152 if (compute_pipeline->state.flags.base_workgroup) {
1153 result = pvr_pds_compute_base_workgroup_variant_program_init(
1154 device,
1155 allocator,
1156 local_input_regs,
1157 work_group_input_regs,
1158 barrier_coefficient,
1159 usc_temps,
1160 compute_pipeline->state.shader.bo->vma->dev_addr,
1161 &compute_pipeline->state.primary_base_workgroup_variant_program);
1162 if (result != VK_SUCCESS)
1163 goto err_destroy_compute_program;
1164 }
1165
1166 return VK_SUCCESS;
1167
1168 err_destroy_compute_program:
1169 pvr_pds_compute_program_destroy(
1170 device,
1171 allocator,
1172 &compute_pipeline->state.primary_program,
1173 &compute_pipeline->state.primary_program_info);
1174
1175 err_free_descriptor_program:
1176 pvr_bo_free(device, compute_pipeline->state.descriptor.pds_code.pvr_bo);
1177
1178 err_free_shader:
1179 pvr_bo_free(device, compute_pipeline->state.shader.bo);
1180
1181 return result;
1182 }
1183
1184 static VkResult
1185 pvr_compute_pipeline_init(struct pvr_device *device,
1186 struct pvr_pipeline_cache *pipeline_cache,
1187 const VkComputePipelineCreateInfo *pCreateInfo,
1188 const VkAllocationCallbacks *allocator,
1189 struct pvr_compute_pipeline *compute_pipeline)
1190 {
1191 VkResult result;
1192
1193 pvr_pipeline_init(device,
1194 PVR_PIPELINE_TYPE_COMPUTE,
1195 &compute_pipeline->base);
1196
1197 compute_pipeline->base.layout =
1198 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1199
1200 result = pvr_compute_pipeline_compile(device,
1201 pipeline_cache,
1202 pCreateInfo,
1203 allocator,
1204 compute_pipeline);
1205 if (result != VK_SUCCESS) {
1206 pvr_pipeline_finish(&compute_pipeline->base);
1207 return result;
1208 }
1209
1210 return VK_SUCCESS;
1211 }
1212
1213 static VkResult
1214 pvr_compute_pipeline_create(struct pvr_device *device,
1215 struct pvr_pipeline_cache *pipeline_cache,
1216 const VkComputePipelineCreateInfo *pCreateInfo,
1217 const VkAllocationCallbacks *allocator,
1218 VkPipeline *const pipeline_out)
1219 {
1220 struct pvr_compute_pipeline *compute_pipeline;
1221 VkResult result;
1222
1223 compute_pipeline = vk_zalloc2(&device->vk.alloc,
1224 allocator,
1225 sizeof(*compute_pipeline),
1226 8,
1227 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1228 if (!compute_pipeline)
1229 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1230
1231 /* Compiles and uploads shaders and PDS programs. */
1232 result = pvr_compute_pipeline_init(device,
1233 pipeline_cache,
1234 pCreateInfo,
1235 allocator,
1236 compute_pipeline);
1237 if (result != VK_SUCCESS) {
1238 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1239 return result;
1240 }
1241
1242 *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
1243
1244 return VK_SUCCESS;
1245 }
1246
1247 static void pvr_compute_pipeline_destroy(
1248 struct pvr_device *const device,
1249 const VkAllocationCallbacks *const allocator,
1250 struct pvr_compute_pipeline *const compute_pipeline)
1251 {
1252 if (compute_pipeline->state.flags.base_workgroup) {
1253 pvr_pds_compute_base_workgroup_variant_program_finish(
1254 device,
1255 allocator,
1256 &compute_pipeline->state.primary_base_workgroup_variant_program);
1257 }
1258
1259 pvr_pds_compute_program_destroy(
1260 device,
1261 allocator,
1262 &compute_pipeline->state.primary_program,
1263 &compute_pipeline->state.primary_program_info);
1264 pvr_pds_descriptor_program_destroy(device,
1265 allocator,
1266 &compute_pipeline->state.descriptor);
1267 pvr_bo_free(device, compute_pipeline->state.shader.bo);
1268
1269 pvr_pipeline_finish(&compute_pipeline->base);
1270
1271 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1272 }
1273
1274 VkResult
1275 pvr_CreateComputePipelines(VkDevice _device,
1276 VkPipelineCache pipelineCache,
1277 uint32_t createInfoCount,
1278 const VkComputePipelineCreateInfo *pCreateInfos,
1279 const VkAllocationCallbacks *pAllocator,
1280 VkPipeline *pPipelines)
1281 {
1282 PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
1283 PVR_FROM_HANDLE(pvr_device, device, _device);
1284 VkResult result = VK_SUCCESS;
1285
1286 for (uint32_t i = 0; i < createInfoCount; i++) {
1287 const VkResult local_result =
1288 pvr_compute_pipeline_create(device,
1289 pipeline_cache,
1290 &pCreateInfos[i],
1291 pAllocator,
1292 &pPipelines[i]);
1293 if (local_result != VK_SUCCESS) {
1294 result = local_result;
1295 pPipelines[i] = VK_NULL_HANDLE;
1296 }
1297 }
1298
1299 return result;
1300 }
1301
1302 /******************************************************************************
1303 Graphics pipeline functions
1304 ******************************************************************************/
1305
1306 static inline uint32_t pvr_dynamic_state_bit_from_vk(VkDynamicState state)
1307 {
1308 switch (state) {
1309 case VK_DYNAMIC_STATE_VIEWPORT:
1310 return PVR_DYNAMIC_STATE_BIT_VIEWPORT;
1311 case VK_DYNAMIC_STATE_SCISSOR:
1312 return PVR_DYNAMIC_STATE_BIT_SCISSOR;
1313 case VK_DYNAMIC_STATE_LINE_WIDTH:
1314 return PVR_DYNAMIC_STATE_BIT_LINE_WIDTH;
1315 case VK_DYNAMIC_STATE_DEPTH_BIAS:
1316 return PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS;
1317 case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1318 return PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS;
1319 case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1320 return PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK;
1321 case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1322 return PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK;
1323 case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1324 return PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE;
1325 default:
1326 unreachable("Unsupported state.");
1327 }
1328 }
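/* A minimal sketch of how a helper like the one above is typically consumed
 * when accumulating the pipeline's dynamic state mask (illustrative only;
 * the create-info walk is not part of this excerpt):
 *
 *    uint32_t dynamic_states = 0;
 *
 *    if (create_info->pDynamicState) {
 *       const VkPipelineDynamicStateCreateInfo *state_info =
 *          create_info->pDynamicState;
 *
 *       for (uint32_t i = 0; i < state_info->dynamicStateCount; i++) {
 *          dynamic_states |=
 *             pvr_dynamic_state_bit_from_vk(state_info->pDynamicStates[i]);
 *       }
 *    }
 */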
1329
1330 static void
1331 pvr_graphics_pipeline_destroy(struct pvr_device *const device,
1332 const VkAllocationCallbacks *const allocator,
1333 struct pvr_graphics_pipeline *const gfx_pipeline)
1334 {
1335 const uint32_t num_vertex_attrib_programs =
1336 ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1337
1338 pvr_pds_descriptor_program_destroy(
1339 device,
1340 allocator,
1341 &gfx_pipeline->fragment_shader_state.descriptor_state);
1342
1343 pvr_pds_descriptor_program_destroy(
1344 device,
1345 allocator,
1346 &gfx_pipeline->vertex_shader_state.descriptor_state);
1347
1348 for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1349 struct pvr_pds_attrib_program *const attrib_program =
1350 &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];
1351
1352 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1353 }
1354
1355 pvr_bo_free(device,
1356 gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
1357 pvr_bo_free(device,
1358 gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
1359
1360 pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
1361 pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
1362
1363 pvr_pipeline_finish(&gfx_pipeline->base);
1364
1365 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1366 }
1367
1368 static void
1369 pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1370 const struct rogue_common_build_data *common_data,
1371 const struct rogue_vs_build_data *vs_data)
1372 {
1373 struct pvr_vertex_shader_state *vertex_state =
1374 &gfx_pipeline->vertex_shader_state;
1375
1376 /* TODO: Hard coding these for now. These should be populated based on the
1377 * information returned by the compiler.
1378 */
1379 vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
1380 vertex_state->stage_state.const_shared_reg_offset = 0;
1381 vertex_state->stage_state.temps_count = common_data->temps;
1382 vertex_state->stage_state.coefficient_size = common_data->coeffs;
1383 vertex_state->stage_state.uses_atomic_ops = false;
1384 vertex_state->stage_state.uses_texture_rw = false;
1385 vertex_state->stage_state.uses_barrier = false;
1386 vertex_state->stage_state.has_side_effects = false;
1387 vertex_state->stage_state.empty_program = false;
1388
1389 vertex_state->vertex_input_size = vs_data->num_vertex_input_regs;
1390 vertex_state->vertex_output_size =
1391 vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
1392 vertex_state->user_clip_planes_mask = 0;
1393 vertex_state->entry_offset = 0;
1394
1395 /* TODO: The number of varyings should be checked against the fragment
1396 * shader inputs and assigned in the place where that happens.
1397 * There will also be an opportunity to cull unused fs inputs/vs outputs.
1398 */
1399 pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[0],
1400 TA_STATE_VARYING0,
1401 varying0) {
1402 varying0.f32_linear = vs_data->num_varyings;
1403 varying0.f32_flat = 0;
1404 varying0.f32_npc = 0;
1405 }
1406
1407 pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[1],
1408 TA_STATE_VARYING1,
1409 varying1) {
1410 varying1.f16_linear = 0;
1411 varying1.f16_flat = 0;
1412 varying1.f16_npc = 0;
1413 }
1414 }
1415
1416 static void
1417 pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1418 const struct rogue_common_build_data *common_data)
1419 {
1420 struct pvr_fragment_shader_state *fragment_state =
1421 &gfx_pipeline->fragment_shader_state;
1422
1423 /* TODO: Hard coding these for now. These should be populated based on the
1424 * information returned by the compiler.
1425 */
1426 fragment_state->stage_state.const_shared_reg_count = 0;
1427 fragment_state->stage_state.const_shared_reg_offset = 0;
1428 fragment_state->stage_state.temps_count = common_data->temps;
1429 fragment_state->stage_state.coefficient_size = common_data->coeffs;
1430 fragment_state->stage_state.uses_atomic_ops = false;
1431 fragment_state->stage_state.uses_texture_rw = false;
1432 fragment_state->stage_state.uses_barrier = false;
1433 fragment_state->stage_state.has_side_effects = false;
1434 fragment_state->stage_state.empty_program = false;
1435
1436 fragment_state->pass_type = 0;
1437 fragment_state->entry_offset = 0;
1438 }
1439
1440 /* Compiles and uploads shaders and PDS programs. */
1441 static VkResult
1442 pvr_graphics_pipeline_compile(struct pvr_device *const device,
1443 struct pvr_pipeline_cache *pipeline_cache,
1444 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1445 const VkAllocationCallbacks *const allocator,
1446 struct pvr_graphics_pipeline *const gfx_pipeline)
1447 {
1448 /* FIXME: Remove this hard coding. */
1449 struct pvr_explicit_constant_usage vert_explicit_const_usage = {
1450 .start_offset = 16,
1451 };
1452 struct pvr_explicit_constant_usage frag_explicit_const_usage = {
1453 .start_offset = 0,
1454 };
1455 static uint32_t hard_code_pipeline_n = 0;
1456
1457 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
1458 pCreateInfo->pVertexInputState;
1459 const uint32_t cache_line_size =
1460 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
1461 struct rogue_compiler *compiler = device->pdevice->compiler;
1462 struct rogue_build_ctx *ctx;
1463 VkResult result;
1464
1465 /* Setup shared build context. */
1466 ctx = rogue_create_build_context(compiler);
1467 if (!ctx)
1468 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1469
1470 /* NIR middle-end translation. */
1471 for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
1472 stage--) {
1473 const VkPipelineShaderStageCreateInfo *create_info;
1474 size_t stage_index = gfx_pipeline->stage_indices[stage];
1475
1476 if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
1477 if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1478 BITFIELD_BIT(stage)) {
1479 continue;
1480 }
1481 }
1482
1483 /* Skip unused/inactive stages. */
1484 if (stage_index == ~0)
1485 continue;
1486
1487 create_info = &pCreateInfo->pStages[stage_index];
1488
1489 /* SPIR-V to NIR. */
1490 ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
1491 if (!ctx->nir[stage]) {
1492 ralloc_free(ctx);
1493 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1494 }
1495 }
1496
1497 /* Pre-back-end analysis and optimization, driver data extraction. */
1498 /* TODO: Analyze and cull unused I/O between stages. */
1499 /* TODO: Allocate UBOs between stages;
1500 * pipeline->layout->set_{count,layout}.
1501 */
1502
1503 /* Back-end translation. */
1504 for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
1505 stage--) {
1506 if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1507 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1508 BITFIELD_BIT(stage)) {
1509 const struct pvr_device_info *const dev_info =
1510 &device->pdevice->dev_info;
1511 struct pvr_explicit_constant_usage *explicit_const_usage;
1512
1513 switch (stage) {
1514 case MESA_SHADER_VERTEX:
1515 explicit_const_usage = &vert_explicit_const_usage;
1516 break;
1517
1518 case MESA_SHADER_FRAGMENT:
1519 explicit_const_usage = &frag_explicit_const_usage;
1520 break;
1521
1522 default:
1523 unreachable("Unsupported stage.");
1524 }
1525
1526 pvr_hard_code_graphics_shader(dev_info,
1527 hard_code_pipeline_n,
1528 stage,
1529 &ctx->binary[stage]);
1530
1531 pvr_hard_code_graphics_get_build_info(dev_info,
1532 hard_code_pipeline_n,
1533 stage,
1534 &ctx->common_data[stage],
1535 &ctx->stage_data,
1536 explicit_const_usage);
1537
1538 continue;
1539 }
1540
1541 if (!ctx->nir[stage])
1542 continue;
1543
1544 ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
1545 if (!ctx->rogue[stage]) {
1546 ralloc_free(ctx);
1547 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1548 }
1549
1550 ctx->binary[stage] = pvr_rogue_to_binary(ctx, ctx->rogue[stage]);
1551 if (!ctx->binary[stage]) {
1552 ralloc_free(ctx);
1553 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1554 }
1555 }
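/* At this point each active stage has gone NIR -> Rogue IR -> USC binary
 * (or, for hard-coded pipelines, had its binary and build info filled in
 * directly); the binaries are uploaded and wired up to PDS programs below.
 */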
1556
1557 if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1558 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1559 BITFIELD_BIT(MESA_SHADER_VERTEX)) {
1560 pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
1561 hard_code_pipeline_n,
1562 &gfx_pipeline->vertex_shader_state);
1563 } else {
1564 pvr_vertex_state_init(gfx_pipeline,
1565 &ctx->common_data[MESA_SHADER_VERTEX],
1566 &ctx->stage_data.vs);
1567 }
1568
1569 result = pvr_gpu_upload_usc(device,
1570 ctx->binary[MESA_SHADER_VERTEX]->data,
1571 ctx->binary[MESA_SHADER_VERTEX]->size,
1572 cache_line_size,
1573 &gfx_pipeline->vertex_shader_state.bo);
1574 if (result != VK_SUCCESS)
1575 goto err_free_build_context;
1576
1577 if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1578 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1579 BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
1580 pvr_hard_code_graphics_fragment_state(
1581 &device->pdevice->dev_info,
1582 hard_code_pipeline_n,
1583 &gfx_pipeline->fragment_shader_state);
1584 } else {
1585 pvr_fragment_state_init(gfx_pipeline,
1586 &ctx->common_data[MESA_SHADER_FRAGMENT]);
1587 }
1588
1589 result = pvr_gpu_upload_usc(device,
1590 ctx->binary[MESA_SHADER_FRAGMENT]->data,
1591 ctx->binary[MESA_SHADER_FRAGMENT]->size,
1592 cache_line_size,
1593 &gfx_pipeline->fragment_shader_state.bo);
1594 if (result != VK_SUCCESS)
1595 goto err_free_vertex_bo;
1596
1597 /* TODO: powervr has an optimization where it attempts to recompile
1598 * shaders; see PipelineCompileNoISPFeedbackFragmentStage. It is not
1599 * implemented here since the optimization does not apply in our case.
1600 */
1601
1602 /* TODO: The programs we use are hard coded for now, but these should be
1603 * selected dynamically.
1604 */
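/* The uploads below create one PDS program per role: a coefficient loading
 * program, a fragment launch program, the vertex attribute fetch programs,
 * and a descriptor program for each of the vertex and fragment stages.
 */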
1605
1606 result = pvr_pds_coeff_program_create_and_upload(
1607 device,
1608 allocator,
1609 ctx->stage_data.fs.iterator_args.fpu_iterators,
1610 ctx->stage_data.fs.iterator_args.num_fpu_iterators,
1611 ctx->stage_data.fs.iterator_args.destination,
1612 &gfx_pipeline->fragment_shader_state.pds_coeff_program);
1613 if (result != VK_SUCCESS)
1614 goto err_free_fragment_bo;
1615
1616 result = pvr_pds_fragment_program_create_and_upload(
1617 device,
1618 allocator,
1619 gfx_pipeline->fragment_shader_state.bo,
1620 ctx->common_data[MESA_SHADER_FRAGMENT].temps,
1621 ctx->stage_data.fs.msaa_mode,
1622 ctx->stage_data.fs.phas,
1623 &gfx_pipeline->fragment_shader_state.pds_fragment_program);
1624 if (result != VK_SUCCESS)
1625 goto err_free_coeff_program;
1626
1627 result = pvr_pds_vertex_attrib_programs_create_and_upload(
1628 device,
1629 allocator,
1630 vertex_input_state,
1631 ctx->common_data[MESA_SHADER_VERTEX].temps,
1632 &ctx->stage_data.vs,
1633 &gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1634 if (result != VK_SUCCESS)
1635 goto err_free_frag_program;
1636
1637 result = pvr_pds_descriptor_program_create_and_upload(
1638 device,
1639 allocator,
1640 &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
1641 &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
1642 &vert_explicit_const_usage,
1643 gfx_pipeline->base.layout,
1644 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
1645 &gfx_pipeline->vertex_shader_state.descriptor_state);
1646 if (result != VK_SUCCESS)
1647 goto err_free_vertex_attrib_program;
1648
1649 /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
1650 * scratch buffer for both the vertex and fragment stages.
1651 * Figure out the best place to do this.
1652 */
1653 /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
1654 /* TODO: Implement spilling with the above. */
1655
1656 /* TODO: Call pvr_pds_program_program_create_and_upload in a loop. */
1657 /* FIXME: For now we pass in the same explicit_const_usage since it contains
1658 * all invalid entries. Fix this by hooking it up to the compiler.
1659 */
1660 result = pvr_pds_descriptor_program_create_and_upload(
1661 device,
1662 allocator,
1663 &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
1664 &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
1665 &frag_explicit_const_usage,
1666 gfx_pipeline->base.layout,
1667 PVR_STAGE_ALLOCATION_FRAGMENT,
1668 &gfx_pipeline->fragment_shader_state.descriptor_state);
1669 if (result != VK_SUCCESS)
1670 goto err_free_vertex_descriptor_program;
1671
1672 ralloc_free(ctx);
1673
1674 hard_code_pipeline_n++;
1675
1676 return VK_SUCCESS;
1677
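/* Error paths below tear the pipeline state down in reverse order of
 * creation; the build context itself is always freed last.
 */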
1678 err_free_vertex_descriptor_program:
1679 pvr_pds_descriptor_program_destroy(
1680 device,
1681 allocator,
1682 &gfx_pipeline->vertex_shader_state.descriptor_state);
1683 err_free_vertex_attrib_program:
1684 for (uint32_t i = 0;
1685 i < ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1686 i++) {
1687 struct pvr_pds_attrib_program *const attrib_program =
1688 &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];
1689
1690 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1691 }
1692 err_free_frag_program:
1693 pvr_bo_free(device,
1694 gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
1695 err_free_coeff_program:
1696 pvr_bo_free(device,
1697 gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
1698 err_free_fragment_bo:
1699 pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
1700 err_free_vertex_bo:
1701 pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
1702 err_free_build_context:
1703 ralloc_free(ctx);
1704 return result;
1705 }
1706
1707 static void pvr_graphics_pipeline_init_depth_and_stencil_state(
1708 struct pvr_graphics_pipeline *gfx_pipeline,
1709 const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state)
1710 {
1711 const VkStencilOpState *front;
1712 const VkStencilOpState *back;
1713
1714 if (!depth_stencil_state)
1715 return;
1716
1717 front = &depth_stencil_state->front;
1718 back = &depth_stencil_state->back;
1719
1720 if (depth_stencil_state->depthTestEnable) {
1721 gfx_pipeline->depth_compare_op = depth_stencil_state->depthCompareOp;
1722 gfx_pipeline->depth_write_disable =
1723 !depth_stencil_state->depthWriteEnable;
1724 } else {
1725 gfx_pipeline->depth_compare_op = VK_COMPARE_OP_ALWAYS;
1726 gfx_pipeline->depth_write_disable = true;
1727 }
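/* When a test is disabled the corresponding state is still programmed, using
 * pass-through values (VK_COMPARE_OP_ALWAYS / VK_STENCIL_OP_KEEP) rather
 * than being left uninitialized, matching Vulkan's behaviour with the test
 * off.
 */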
1728
1729 if (depth_stencil_state->stencilTestEnable) {
1730 gfx_pipeline->stencil_front.compare_op = front->compareOp;
1731 gfx_pipeline->stencil_front.fail_op = front->failOp;
1732 gfx_pipeline->stencil_front.depth_fail_op = front->depthFailOp;
1733 gfx_pipeline->stencil_front.pass_op = front->passOp;
1734
1735 gfx_pipeline->stencil_back.compare_op = back->compareOp;
1736 gfx_pipeline->stencil_back.fail_op = back->failOp;
1737 gfx_pipeline->stencil_back.depth_fail_op = back->depthFailOp;
1738 gfx_pipeline->stencil_back.pass_op = back->passOp;
1739 } else {
1740 gfx_pipeline->stencil_front.compare_op = VK_COMPARE_OP_ALWAYS;
1741 gfx_pipeline->stencil_front.fail_op = VK_STENCIL_OP_KEEP;
1742 gfx_pipeline->stencil_front.depth_fail_op = VK_STENCIL_OP_KEEP;
1743 gfx_pipeline->stencil_front.pass_op = VK_STENCIL_OP_KEEP;
1744
1745 gfx_pipeline->stencil_back = gfx_pipeline->stencil_front;
1746 }
1747 }
1748
1749 static void pvr_graphics_pipeline_init_dynamic_state(
1750 struct pvr_graphics_pipeline *gfx_pipeline,
1751 const VkPipelineDynamicStateCreateInfo *dynamic_state,
1752 const VkPipelineViewportStateCreateInfo *viewport_state,
1753 const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state,
1754 const VkPipelineColorBlendStateCreateInfo *color_blend_state,
1755 const VkPipelineRasterizationStateCreateInfo *rasterization_state)
1756 {
1757 struct pvr_dynamic_state *const internal_dynamic_state =
1758 &gfx_pipeline->dynamic_state;
1759 uint32_t dynamic_states = 0;
1760
1761 if (dynamic_state) {
1762 for (uint32_t i = 0; i < dynamic_state->dynamicStateCount; i++) {
1763 dynamic_states |=
1764 pvr_dynamic_state_bit_from_vk(dynamic_state->pDynamicStates[i]);
1765 }
1766 }
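/* dynamic_states is a bitmask of PVR_DYNAMIC_STATE_BIT_* flags; e.g.
 * VK_DYNAMIC_STATE_LINE_WIDTH presumably maps to
 * PVR_DYNAMIC_STATE_BIT_LINE_WIDTH. Static values from the create info are
 * only copied below when the corresponding bit is not set.
 */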
1767
1768 /* TODO: Verify this.
1769 * We don't zero out the pieces of pipeline state that are dynamic, since
1770 * they are expected to be set later on via the command buffer.
1771 */
1772
1773 /* TODO: Handle rasterizerDiscardEnable. */
1774
1775 if (rasterization_state) {
1776 if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH))
1777 internal_dynamic_state->line_width = rasterization_state->lineWidth;
1778
1779 /* TODO: Do we need the depthBiasEnable check? */
1780 if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
1781 internal_dynamic_state->depth_bias.constant_factor =
1782 rasterization_state->depthBiasConstantFactor;
1783 internal_dynamic_state->depth_bias.clamp =
1784 rasterization_state->depthBiasClamp;
1785 internal_dynamic_state->depth_bias.slope_factor =
1786 rasterization_state->depthBiasSlopeFactor;
1787 }
1788 }
1789
1790 /* TODO: handle viewport state flags. */
1791
1792 /* TODO: handle static viewport state. */
1793 /* We assume the viewport state to be dynamic for now. */
1794
1795 /* TODO: handle static scissor state. */
1796 /* We assume the scissor state to be dynamic for now. */
1797
1798 if (depth_stencil_state) {
1799 const VkStencilOpState *const front = &depth_stencil_state->front;
1800 const VkStencilOpState *const back = &depth_stencil_state->back;
1801
1802 /* VkPhysicalDeviceFeatures->depthBounds is false. */
1803 assert(depth_stencil_state->depthBoundsTestEnable == VK_FALSE);
1804
1805 if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
1806 internal_dynamic_state->compare_mask.front = front->compareMask;
1807 internal_dynamic_state->compare_mask.back = back->compareMask;
1808 }
1809
1810 if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
1811 internal_dynamic_state->write_mask.front = front->writeMask;
1812 internal_dynamic_state->write_mask.back = back->writeMask;
1813 }
1814
1815 if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
1816 internal_dynamic_state->reference.front = front->reference;
1817 internal_dynamic_state->reference.back = back->reference;
1818 }
1819 }
1820
1821 if (color_blend_state &&
1822 !(dynamic_states & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
1823 STATIC_ASSERT(__same_type(internal_dynamic_state->blend_constants,
1824 color_blend_state->blendConstants));
1825
1826 typed_memcpy(internal_dynamic_state->blend_constants,
1827 color_blend_state->blendConstants,
1828 ARRAY_SIZE(internal_dynamic_state->blend_constants));
1829 }
1830
1831 /* TODO: handle STATIC_STATE_DEPTH_BOUNDS ? */
1832
1833 internal_dynamic_state->mask = dynamic_states;
1834 }
1835
1836 static VkResult
1837 pvr_graphics_pipeline_init(struct pvr_device *device,
1838 struct pvr_pipeline_cache *pipeline_cache,
1839 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1840 const VkAllocationCallbacks *allocator,
1841 struct pvr_graphics_pipeline *gfx_pipeline)
1842 {
1843 /* If rasterization is not enabled, various CreateInfo structs must be
1844 * ignored.
1845 */
1846 const bool raster_discard_enabled =
1847 pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1848 const VkPipelineViewportStateCreateInfo *vs_info =
1849 !raster_discard_enabled ? pCreateInfo->pViewportState : NULL;
1850 const VkPipelineDepthStencilStateCreateInfo *dss_info =
1851 !raster_discard_enabled ? pCreateInfo->pDepthStencilState : NULL;
1852 const VkPipelineRasterizationStateCreateInfo *rs_info =
1853 !raster_discard_enabled ? pCreateInfo->pRasterizationState : NULL;
1854 const VkPipelineColorBlendStateCreateInfo *cbs_info =
1855 !raster_discard_enabled ? pCreateInfo->pColorBlendState : NULL;
1856 const VkPipelineMultisampleStateCreateInfo *ms_info =
1857 !raster_discard_enabled ? pCreateInfo->pMultisampleState : NULL;
1858 VkResult result;
1859
1860 pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
1861
1862 pvr_finishme("ignoring pCreateInfo flags.");
1863 pvr_finishme("ignoring pipeline cache.");
1864
1865 gfx_pipeline->raster_state.discard_enable = raster_discard_enabled;
1866 gfx_pipeline->raster_state.cull_mode =
1867 pCreateInfo->pRasterizationState->cullMode;
1868 gfx_pipeline->raster_state.front_face =
1869 pCreateInfo->pRasterizationState->frontFace;
1870 gfx_pipeline->raster_state.depth_bias_enable =
1871 pCreateInfo->pRasterizationState->depthBiasEnable;
1872 gfx_pipeline->raster_state.depth_clamp_enable =
1873 pCreateInfo->pRasterizationState->depthClampEnable;
1874
1875 /* FIXME: Handle depthClampEnable. */
1876
1877 pvr_graphics_pipeline_init_depth_and_stencil_state(gfx_pipeline, dss_info);
1878 pvr_graphics_pipeline_init_dynamic_state(gfx_pipeline,
1879 pCreateInfo->pDynamicState,
1880 vs_info,
1881 dss_info,
1882 cbs_info,
1883 rs_info);
1884
1885 if (pCreateInfo->pInputAssemblyState) {
1886 gfx_pipeline->input_asm_state.topology =
1887 pCreateInfo->pInputAssemblyState->topology;
1888 gfx_pipeline->input_asm_state.primitive_restart =
1889 pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
1890 }
1891
1892 memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
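/* ~0 marks a stage as not present; pvr_graphics_pipeline_compile skips any
 * stage whose index is still ~0 after the loop below.
 */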
1893
1894 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
1895 VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
1896 gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
1897 /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
1898 *
1899 * "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
1900 * or VK_SHADER_STAGE_ALL."
1901 *
1902 * So we don't handle that.
1903 *
1904 * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
1905 * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
1906 * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
1907 * structure returned by the driver.
1908 */
1909 switch (pCreateInfo->pStages[i].stage) {
1910 case VK_SHADER_STAGE_VERTEX_BIT:
1911 case VK_SHADER_STAGE_FRAGMENT_BIT:
1912 gfx_pipeline->stage_indices[gl_stage] = i;
1913 break;
1914 default:
1915 unreachable("Unsupported stage.");
1916 }
1917 }
1918
1919 gfx_pipeline->base.layout =
1920 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1921
1922 if (ms_info) {
1923 gfx_pipeline->rasterization_samples = ms_info->rasterizationSamples;
1924 gfx_pipeline->sample_mask =
1925 (ms_info->pSampleMask) ? ms_info->pSampleMask[0] : 0xFFFFFFFF;
1926 } else {
1927 gfx_pipeline->rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
1928 gfx_pipeline->sample_mask = 0xFFFFFFFF;
1929 }
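/* Only the first word of pSampleMask is consulted; a single 32-bit word is
 * assumed to be enough to cover the sample counts exposed by the driver.
 */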
1930
1931 /* Compiles and uploads shaders and PDS programs. */
1932 result = pvr_graphics_pipeline_compile(device,
1933 pipeline_cache,
1934 pCreateInfo,
1935 allocator,
1936 gfx_pipeline);
1937 if (result != VK_SUCCESS) {
1938 pvr_pipeline_finish(&gfx_pipeline->base);
1939 return result;
1940 }
1941
1942 return VK_SUCCESS;
1943 }
1944
1945 /* If allocator == NULL, the internal one will be used. */
1946 static VkResult
1947 pvr_graphics_pipeline_create(struct pvr_device *device,
1948 struct pvr_pipeline_cache *pipeline_cache,
1949 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1950 const VkAllocationCallbacks *allocator,
1951 VkPipeline *const pipeline_out)
1952 {
1953 struct pvr_graphics_pipeline *gfx_pipeline;
1954 VkResult result;
1955
1956 gfx_pipeline = vk_zalloc2(&device->vk.alloc,
1957 allocator,
1958 sizeof(*gfx_pipeline),
1959 8,
1960 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1961 if (!gfx_pipeline)
1962 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1963
1964 /* Compiles and uploads shaders and PDS programs too. */
1965 result = pvr_graphics_pipeline_init(device,
1966 pipeline_cache,
1967 pCreateInfo,
1968 allocator,
1969 gfx_pipeline);
1970 if (result != VK_SUCCESS) {
1971 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1972 return result;
1973 }
1974
1975 *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
1976
1977 return VK_SUCCESS;
1978 }
1979
1980 VkResult
1981 pvr_CreateGraphicsPipelines(VkDevice _device,
1982 VkPipelineCache pipelineCache,
1983 uint32_t createInfoCount,
1984 const VkGraphicsPipelineCreateInfo *pCreateInfos,
1985 const VkAllocationCallbacks *pAllocator,
1986 VkPipeline *pPipelines)
1987 {
1988 PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
1989 PVR_FROM_HANDLE(pvr_device, device, _device);
1990 VkResult result = VK_SUCCESS;
1991
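/* Attempt to create every requested pipeline: entries that fail are set to
 * VK_NULL_HANDLE and an error from a failed creation is returned, while
 * successfully created entries remain valid, as expected for
 * vkCreateGraphicsPipelines.
 */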
1992 for (uint32_t i = 0; i < createInfoCount; i++) {
1993 const VkResult local_result =
1994 pvr_graphics_pipeline_create(device,
1995 pipeline_cache,
1996 &pCreateInfos[i],
1997 pAllocator,
1998 &pPipelines[i]);
1999 if (local_result != VK_SUCCESS) {
2000 result = local_result;
2001 pPipelines[i] = VK_NULL_HANDLE;
2002 }
2003 }
2004
2005 return result;
2006 }
2007
2008 /*****************************************************************************
2009 Other functions
2010 *****************************************************************************/
2011
2012 void pvr_DestroyPipeline(VkDevice _device,
2013 VkPipeline _pipeline,
2014 const VkAllocationCallbacks *pAllocator)
2015 {
2016 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2017 PVR_FROM_HANDLE(pvr_device, device, _device);
2018
2019 if (!pipeline)
2020 return;
2021
2022 switch (pipeline->type) {
2023 case PVR_PIPELINE_TYPE_GRAPHICS: {
2024 struct pvr_graphics_pipeline *const gfx_pipeline =
2025 to_pvr_graphics_pipeline(pipeline);
2026
2027 pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
2028 break;
2029 }
2030
2031 case PVR_PIPELINE_TYPE_COMPUTE: {
2032 struct pvr_compute_pipeline *const compute_pipeline =
2033 to_pvr_compute_pipeline(pipeline);
2034
2035 pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
2036 break;
2037 }
2038
2039 default:
2040 unreachable("Unknown pipeline type.");
2041 }
2042 }
2043