1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * based in part on v3dv driver which is:
5 * Copyright © 2019 Raspberry Pi
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26
27 #include <assert.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <vulkan/vulkan.h>
32
33 #include "compiler/shader_enums.h"
34 #include "hwdef/rogue_hw_utils.h"
35 #include "nir/nir.h"
36 #include "pco/pco.h"
37 #include "pco/pco_data.h"
38 #include "pvr_bo.h"
39 #include "pvr_csb.h"
40 #include "pvr_csb_enum_helpers.h"
41 #include "pvr_hardcode.h"
42 #include "pvr_nir.h"
43 #include "pvr_pds.h"
44 #include "pvr_private.h"
45 #include "pvr_robustness.h"
46 #include "pvr_shader.h"
47 #include "pvr_types.h"
48 #include "rogue/rogue.h"
49 #include "util/log.h"
50 #include "util/macros.h"
51 #include "util/ralloc.h"
52 #include "util/u_dynarray.h"
53 #include "util/u_math.h"
54 #include "vk_alloc.h"
55 #include "vk_format.h"
56 #include "vk_graphics_state.h"
57 #include "vk_log.h"
58 #include "vk_object.h"
59 #include "vk_pipeline_cache.h"
60 #include "vk_render_pass.h"
61 #include "vk_util.h"
62 #include "vulkan/runtime/vk_pipeline.h"
63
64 /*****************************************************************************
65 PDS functions
66 *****************************************************************************/
67
68 /* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   struct pvr_pds_coeff_loading_program *program,
   struct pvr_fragment_shader_state *fragment_state)
74 {
75 uint32_t staging_buffer_size;
76 uint32_t *staging_buffer;
77 VkResult result;
78
79 assert(program->num_fpu_iterators < PVR_MAXIMUM_ITERATIONS);
80
81 /* Get the size of the program and then allocate that much memory. */
82 pvr_pds_coefficient_loading(program, NULL, PDS_GENERATE_SIZES);
83
84 if (!program->code_size) {
85 fragment_state->pds_coeff_program.pvr_bo = NULL;
86 fragment_state->pds_coeff_program.code_size = 0;
87 fragment_state->pds_coeff_program.data_size = 0;
88 fragment_state->stage_state.pds_temps_count = 0;
89
90 return VK_SUCCESS;
91 }
92
93 staging_buffer_size =
94 PVR_DW_TO_BYTES(program->code_size + program->data_size);
95
96 staging_buffer = vk_alloc2(&device->vk.alloc,
97 allocator,
98 staging_buffer_size,
99 8,
100 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
101 if (!staging_buffer)
102 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
103
/* Generate the program into the staging_buffer. */
105 pvr_pds_coefficient_loading(program,
106 staging_buffer,
107 PDS_GENERATE_CODEDATA_SEGMENTS);
108
109 /* FIXME: Figure out the define for alignment of 16. */
110 result = pvr_gpu_upload_pds(device,
111 &staging_buffer[0],
112 program->data_size,
113 16,
114 &staging_buffer[program->data_size],
115 program->code_size,
116 16,
117 16,
118 &fragment_state->pds_coeff_program);
119 if (result != VK_SUCCESS) {
120 vk_free2(&device->vk.alloc, allocator, staging_buffer);
121 return result;
122 }
123
124 vk_free2(&device->vk.alloc, allocator, staging_buffer);
125
126 fragment_state->stage_state.pds_temps_count = program->temps_used;
127
128 return VK_SUCCESS;
129 }
130
131 /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
132 /* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   pco_shader *fs,
   struct pvr_fragment_shader_state *fragment_state)
138 {
139 /* TODO: remove the below + revert the pvr_pds_setup_doutu
140 * args and make sure fs isn't NULL instead;
141 * temporarily in place for hardcoded load ops in
142 * pvr_pass.c:pvr_generate_load_op_shader()
143 */
144 unsigned temps = 0;
145 bool has_phase_rate_change = false;
146 unsigned entry_offset = 0;
147
148 if (fs) {
149 pco_data *fs_data = pco_shader_data(fs);
150 temps = fs_data->common.temps;
151 has_phase_rate_change = fs_data->fs.uses.phase_change;
152 entry_offset = fs_data->common.entry_offset;
153 }
154
155 struct pvr_pds_kickusc_program program = { 0 };
156 uint32_t staging_buffer_size;
157 uint32_t *staging_buffer;
158 VkResult result;
159
160 const pvr_dev_addr_t exec_addr =
161 PVR_DEV_ADDR_OFFSET(fragment_state->bo->dev_addr,
162 /* fs_data->common.entry_offset */ entry_offset);
163
164 /* Note this is not strictly required to be done before calculating the
165 * staging_buffer_size in this particular case. It can also be done after
166 * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
167 */
168 pvr_pds_setup_doutu(
169 &program.usc_task_control,
170 exec_addr.addr,
171 /* fs_data->common.temps */ temps,
172 fragment_state->sample_rate,
173 /* fs_data->fs.uses.phase_change */ has_phase_rate_change);
174
175 pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
176
177 staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
178
179 staging_buffer = vk_alloc2(&device->vk.alloc,
180 allocator,
181 staging_buffer_size,
182 8,
183 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
184 if (!staging_buffer)
185 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
186
187 pvr_pds_kick_usc(&program,
188 staging_buffer,
189 0,
190 false,
191 PDS_GENERATE_CODEDATA_SEGMENTS);
192
193 /* FIXME: Figure out the define for alignment of 16. */
194 result = pvr_gpu_upload_pds(device,
195 &staging_buffer[0],
196 program.data_size,
197 16,
198 &staging_buffer[program.data_size],
199 program.code_size,
200 16,
201 16,
202 &fragment_state->pds_fragment_program);
203 if (result != VK_SUCCESS) {
204 vk_free2(&device->vk.alloc, allocator, staging_buffer);
205 return result;
206 }
207
208 vk_free2(&device->vk.alloc, allocator, staging_buffer);
209
210 return VK_SUCCESS;
211 }
212
static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
   const struct pvr_device_info *dev_info,
   bool robust_buffer_access)
216 {
/* FIXME: Use more local variables to improve formatting. */
218
219 /* Maximum memory allocation needed for const map entries in
220 * pvr_pds_generate_vertex_primary_program().
221 * When robustBufferAccess is disabled, it must be >= 410.
222 * When robustBufferAccess is enabled, it must be >= 570.
223 *
224 * 1. Size of entry for base instance
225 * (pvr_const_map_entry_base_instance)
226 *
227 * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
228 * if (!robustBufferAccess)
229 * size of vertex attribute entry
230 * (pvr_const_map_entry_vertex_attribute_address) +
231 * else
232 * size of robust vertex attribute entry
233 * (pvr_const_map_entry_robust_vertex_attribute_address) +
234 * size of entry for max attribute index
235 * (pvr_const_map_entry_vertex_attribute_max_index) +
236 * fi
237 * size of Unified Store burst entry
238 * (pvr_const_map_entry_literal32) +
239 * size of entry for vertex stride
240 * (pvr_const_map_entry_literal32) +
241 * size of entries for DDMAD control word
242 * (num_ddmad_literals * pvr_const_map_entry_literal32))
243 *
244 * 3. Size of entry for DOUTW vertex/instance control word
245 * (pvr_const_map_entry_literal32)
246 *
247 * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
248 */
249
250 const size_t attribute_size =
251 (!robust_buffer_access)
252 ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
253 : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
254 sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
255
256 /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
257 * and is increased by one DWORD to contain the data for the DDMADT's
258 * out-of-bounds check.
259 */
260 const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
261 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
262
263 return (sizeof(struct pvr_const_map_entry_base_instance) +
264 PVR_MAX_VERTEX_INPUT_BINDINGS *
265 (attribute_size +
266 (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
267 sizeof(struct pvr_const_map_entry_literal32)) +
268 sizeof(struct pvr_const_map_entry_literal32) +
269 sizeof(struct pvr_const_map_entry_doutu_address));
270 }
271
static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_vertex_primary_program_input *const input,
   struct pvr_pds_attrib_program *const program_out)
277 {
278 const size_t const_entries_size_in_bytes =
279 pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
280 &device->pdevice->dev_info,
281 device->vk.enabled_features.robustBufferAccess);
282 struct pvr_pds_upload *const program = &program_out->program;
283 struct pvr_pds_info *const info = &program_out->info;
284 struct pvr_const_map_entry *new_entries;
285 ASSERTED uint32_t code_size_in_dwords;
286 size_t staging_buffer_size;
287 uint32_t *staging_buffer;
288 VkResult result;
289
290 memset(info, 0, sizeof(*info));
291
292 info->entries = vk_alloc2(&device->vk.alloc,
293 allocator,
294 const_entries_size_in_bytes,
295 8,
296 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
297 if (!info->entries) {
298 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
299 goto err_out;
300 }
301
302 info->entries_size_in_bytes = const_entries_size_in_bytes;
303
304 pvr_pds_generate_vertex_primary_program(
305 input,
306 NULL,
307 info,
308 device->vk.enabled_features.robustBufferAccess,
309 &device->pdevice->dev_info);
310
311 code_size_in_dwords = info->code_size_in_dwords;
312 staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);
313
314 staging_buffer = vk_alloc2(&device->vk.alloc,
315 allocator,
316 staging_buffer_size,
317 8,
318 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
319 if (!staging_buffer) {
320 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
321 goto err_free_entries;
322 }
323
324 /* This also fills in info->entries. */
325 pvr_pds_generate_vertex_primary_program(
326 input,
327 staging_buffer,
328 info,
329 device->vk.enabled_features.robustBufferAccess,
330 &device->pdevice->dev_info);
331
332 assert(info->code_size_in_dwords <= code_size_in_dwords);
333
334 /* FIXME: Add a vk_realloc2() ? */
335 new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
336 info->entries,
337 info->entries_written_size_in_bytes,
338 8,
339 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
340 if (!new_entries) {
341 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
342 goto err_free_staging_buffer;
343 }
344
345 info->entries = new_entries;
346 info->entries_size_in_bytes = info->entries_written_size_in_bytes;
347
348 /* FIXME: Figure out the define for alignment of 16. */
349 result = pvr_gpu_upload_pds(device,
350 NULL,
351 0,
352 0,
353 staging_buffer,
354 info->code_size_in_dwords,
355 16,
356 16,
357 program);
358 if (result != VK_SUCCESS)
359 goto err_free_staging_buffer;
360
361 vk_free2(&device->vk.alloc, allocator, staging_buffer);
362
363 return VK_SUCCESS;
364
365 err_free_staging_buffer:
366 vk_free2(&device->vk.alloc, allocator, staging_buffer);
367
368 err_free_entries:
369 vk_free2(&device->vk.alloc, allocator, info->entries);
370
371 err_out:
372 return result;
373 }
374
static inline void pvr_pds_vertex_attrib_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_attrib_program *const program)
379 {
380 pvr_bo_suballoc_free(program->program.pvr_bo);
381 vk_free2(&device->vk.alloc, allocator, program->info.entries);
382 }
383
384 /* This is a const pointer to an array of pvr_pds_attrib_program structs.
385 * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
386 */
387 typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
388 [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
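/* Illustrative usage of the typedef above (a sketch, not part of the driver
 * logic): given
 *    struct pvr_pds_attrib_program programs[PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
 * a caller forms the pointer-to-array with
 *    pvr_pds_attrib_programs_array_ptr ptr = &programs;
 * and ARRAY_SIZE(*ptr) still evaluates to PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT,
 * which is what the loop in
 * pvr_pds_vertex_attrib_programs_create_and_upload() below relies on.
 */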
389
/* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
391 * inputs. This will bake the code segment and create a template of the data
392 * segment for the command buffer to fill in.
393 */
394 /* If allocator == NULL, the internal one will be used.
395 *
396 * programs_out_ptr is a pointer to the array where the outputs will be placed.
397 */
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   pco_data *shader_data,
   const struct pvr_pds_vertex_dma
      dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
   uint32_t dma_count,
   pvr_pds_attrib_programs_array_ptr programs_out_ptr)
406 {
407 struct pvr_pds_vertex_primary_program_input input = {
408 .dma_list = dma_descriptions,
409 .dma_count = dma_count,
410 };
411 uint32_t usc_temp_count = shader_data->common.temps;
412 struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
413 VkResult result;
414
415 pco_range *sys_vals = shader_data->common.sys_vals;
416 if (sys_vals[SYSTEM_VALUE_VERTEX_ID].count > 0) {
417 input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
418 input.vertex_id_register = sys_vals[SYSTEM_VALUE_VERTEX_ID].start;
419 }
420
421 if (sys_vals[SYSTEM_VALUE_INSTANCE_ID].count > 0) {
422 input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
423 input.instance_id_register = sys_vals[SYSTEM_VALUE_INSTANCE_ID].start;
424 }
425
426 if (sys_vals[SYSTEM_VALUE_BASE_INSTANCE].count > 0) {
427 input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_REQUIRED;
428 input.base_instance_register = sys_vals[SYSTEM_VALUE_BASE_INSTANCE].start;
429 }
430
431 if (sys_vals[SYSTEM_VALUE_BASE_VERTEX].count > 0) {
432 input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_VERTEX_REQUIRED;
433 input.base_vertex_register = sys_vals[SYSTEM_VALUE_BASE_VERTEX].start;
434 }
435
436 if (sys_vals[SYSTEM_VALUE_DRAW_ID].count > 0) {
437 input.flags |= PVR_PDS_VERTEX_FLAGS_DRAW_INDEX_REQUIRED;
438 input.draw_index_register = sys_vals[SYSTEM_VALUE_DRAW_ID].start;
439 }
440
441 pvr_pds_setup_doutu(&input.usc_task_control,
442 0,
443 usc_temp_count,
444 ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
445 false);
446
447 /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
448 * typedef.
449 */
450 for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
451 uint32_t extra_flags;
452
453 switch (i) {
454 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
455 extra_flags = 0;
456 break;
457
458 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
459 extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
460 break;
461
462 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
463 extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
464 break;
465
466 default:
467 unreachable("Invalid vertex attrib program type.");
468 }
469
470 input.flags |= extra_flags;
471
472 result =
473 pvr_pds_vertex_attrib_program_create_and_upload(device,
474 allocator,
475 &input,
476 &programs_out[i]);
477 if (result != VK_SUCCESS) {
478 for (uint32_t j = 0; j < i; j++) {
479 pvr_pds_vertex_attrib_program_destroy(device,
480 allocator,
481 &programs_out[j]);
482 }
483
484 return result;
485 }
486
487 input.flags &= ~extra_flags;
488 }
489
490 return VK_SUCCESS;
491 }
492
size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
494 {
495 /* Maximum memory allocation needed for const map entries in
496 * pvr_pds_generate_descriptor_upload_program().
497 * It must be >= 688 bytes. This size is calculated as the sum of:
498 *
499 * 1. Max. number of descriptor sets (8) * (
500 * size of descriptor entry
501 * (pvr_const_map_entry_descriptor_set) +
502 * size of Common Store burst entry
503 * (pvr_const_map_entry_literal32))
504 *
505 * 2. Max. number of PDS program buffers (24) * (
506 * size of the largest buffer structure
507 * (pvr_const_map_entry_constant_buffer) +
508 * size of Common Store burst entry
509 * (pvr_const_map_entry_literal32)
510 *
511 * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
512 *
513 * 4. Max. number of PDS address literals (8) * (
514 * size of entry
515 * (pvr_const_map_entry_descriptor_set_addrs_table)
516 *
* 5. Max. number of address literals with single buffer entry to DOUTD:
*    size of entry
*    (pvr_pds_const_map_entry_addr_literal_buffer) +
*    8 * size of entry (pvr_pds_const_map_entry_addr_literal)
521 */
522
523 /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
524 * say that it should be 8.
* Figure out a define for this, or is the comment wrong?
526 */
527 return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
528 sizeof(struct pvr_const_map_entry_literal32)) +
529 PVR_PDS_MAX_BUFFERS *
530 (sizeof(struct pvr_const_map_entry_constant_buffer) +
531 sizeof(struct pvr_const_map_entry_literal32)) +
532 sizeof(struct pvr_const_map_entry_doutu_address) +
533 sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
534 8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
535 }
536
static VkResult pvr_pds_descriptor_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const struct pvr_pipeline_layout *const layout,
   enum pvr_stage_allocation stage,
   const struct pvr_sh_reg_layout *sh_reg_layout,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
544 {
545 const size_t const_entries_size_in_bytes =
546 pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
547 struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
548 struct pvr_pds_descriptor_program_input program = { 0 };
549 struct pvr_const_map_entry *new_entries;
550 ASSERTED uint32_t code_size_in_dwords;
551 uint32_t staging_buffer_size;
552 uint32_t addr_literals = 0;
553 uint32_t *staging_buffer;
554 VkResult result;
555
556 assert(stage != PVR_STAGE_ALLOCATION_COUNT);
557
558 *pds_info = (struct pvr_pds_info){ 0 };
559
560 if (sh_reg_layout->descriptor_set_addrs_table.present) {
561 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
562 .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
563 .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
564 };
565 addr_literals++;
566 }
567
568 if (sh_reg_layout->push_consts.present) {
569 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
570 .type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS,
571 .destination = sh_reg_layout->push_consts.offset,
572 };
573 addr_literals++;
574 }
575
576 if (sh_reg_layout->blend_consts.present) {
577 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
578 .type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS,
579 .destination = sh_reg_layout->blend_consts.offset,
580 };
581 addr_literals++;
582 }
583
584 program.addr_literal_count = addr_literals;
585
586 pds_info->entries = vk_alloc2(&device->vk.alloc,
587 allocator,
588 const_entries_size_in_bytes,
589 8,
590 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
591 if (!pds_info->entries) {
592 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
593 goto err_free_static_consts;
594 }
595
596 pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
597
598 pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
599
600 code_size_in_dwords = pds_info->code_size_in_dwords;
601 staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);
602
603 if (!staging_buffer_size) {
604 vk_free2(&device->vk.alloc, allocator, pds_info->entries);
605
606 *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
607
608 return VK_SUCCESS;
609 }
610
611 staging_buffer = vk_alloc2(&device->vk.alloc,
612 allocator,
613 staging_buffer_size,
614 8,
615 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
616 if (!staging_buffer) {
617 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
618 goto err_free_entries;
619 }
620
621 pvr_pds_generate_descriptor_upload_program(&program,
622 staging_buffer,
623 pds_info);
624
625 assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
626
627 /* FIXME: use vk_realloc2() ? */
628 new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
629 pds_info->entries,
630 pds_info->entries_written_size_in_bytes,
631 8,
632 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
633 if (!new_entries) {
634 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
635 goto err_free_staging_buffer;
636 }
637
638 pds_info->entries = new_entries;
639 pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
640
641 /* FIXME: Figure out the define for alignment of 16. */
642 result = pvr_gpu_upload_pds(device,
643 NULL,
644 0,
645 0,
646 staging_buffer,
647 pds_info->code_size_in_dwords,
648 16,
649 16,
650 &descriptor_state->pds_code);
651 if (result != VK_SUCCESS)
652 goto err_free_staging_buffer;
653
654 vk_free2(&device->vk.alloc, allocator, staging_buffer);
655
656 return VK_SUCCESS;
657
658 err_free_staging_buffer:
659 vk_free2(&device->vk.alloc, allocator, staging_buffer);
660
661 err_free_entries:
662 vk_free2(&device->vk.alloc, allocator, pds_info->entries);
663
664 err_free_static_consts:
665 pvr_bo_suballoc_free(descriptor_state->static_consts);
666
667 return result;
668 }
669
static void pvr_pds_descriptor_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
674 {
675 if (!descriptor_state)
676 return;
677
678 pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
679 vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
680 pvr_bo_suballoc_free(descriptor_state->static_consts);
681 }
682
static void pvr_pds_compute_program_setup(
   const struct pvr_device_info *dev_info,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   bool add_base_workgroup,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_compute_shader_program *const program)
692 {
693 pvr_pds_compute_shader_program_init(program);
694 program->local_input_regs[0] = local_input_regs[0];
695 program->local_input_regs[1] = local_input_regs[1];
696 program->local_input_regs[2] = local_input_regs[2];
697 program->work_group_input_regs[0] = work_group_input_regs[0];
698 program->work_group_input_regs[1] = work_group_input_regs[1];
699 program->work_group_input_regs[2] = work_group_input_regs[2];
700 program->barrier_coefficient = barrier_coefficient;
701 program->add_base_workgroup = add_base_workgroup;
702 program->flattened_work_groups = true;
703 program->kick_usc = true;
704
705 STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
706 PVR_WORKGROUP_DIMENSIONS);
707 STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
708 PVR_WORKGROUP_DIMENSIONS);
709 STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
710 PVR_WORKGROUP_DIMENSIONS);
711
712 pvr_pds_setup_doutu(&program->usc_task_control,
713 usc_shader_dev_addr.addr,
714 usc_temps,
715 ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
716 false);
717
718 pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
719 }
720
721 /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
722 */
static VkResult pvr_pds_compute_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_upload *const pds_upload_out,
   struct pvr_pds_info *const pds_info_out)
733 {
734 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
735 struct pvr_pds_compute_shader_program program;
736 uint32_t staging_buffer_size;
737 uint32_t *staging_buffer;
738 VkResult result;
739
740 pvr_pds_compute_program_setup(dev_info,
741 local_input_regs,
742 work_group_input_regs,
743 barrier_coefficient,
744 false,
745 usc_temps,
746 usc_shader_dev_addr,
747 &program);
748
749 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
750 * is in bytes. Investigate this.
751 */
752 staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
753
754 staging_buffer = vk_alloc2(&device->vk.alloc,
755 allocator,
756 staging_buffer_size,
757 8,
758 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
759 if (!staging_buffer)
760 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
761
762 /* FIXME: pvr_pds_compute_shader doesn't implement
763 * PDS_GENERATE_CODEDATA_SEGMENTS.
764 */
765 pvr_pds_compute_shader(&program,
766 &staging_buffer[0],
767 PDS_GENERATE_CODE_SEGMENT,
768 dev_info);
769
770 pvr_pds_compute_shader(&program,
771 &staging_buffer[program.code_size],
772 PDS_GENERATE_DATA_SEGMENT,
773 dev_info);
774
775 /* FIXME: Figure out the define for alignment of 16. */
776 result = pvr_gpu_upload_pds(device,
777 &staging_buffer[program.code_size],
778 program.data_size,
779 16,
780 &staging_buffer[0],
781 program.code_size,
782 16,
783 16,
784 pds_upload_out);
785 if (result != VK_SUCCESS) {
786 vk_free2(&device->vk.alloc, allocator, staging_buffer);
787 return result;
788 }
789
790 *pds_info_out = (struct pvr_pds_info){
791 .temps_required = program.highest_temp,
792 .code_size_in_dwords = program.code_size,
793 .data_size_in_dwords = program.data_size,
794 };
795
796 vk_free2(&device->vk.alloc, allocator, staging_buffer);
797
798 return VK_SUCCESS;
}
800
static void pvr_pds_compute_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_upload *const pds_program,
   struct pvr_pds_info *const pds_info)
806 {
807 /* We don't allocate an entries buffer so we don't need to free it */
808 pvr_bo_suballoc_free(pds_program->pvr_bo);
809 }
810
811 /* This only uploads the code segment. The data segment will need to be patched
812 * with the base workgroup before uploading.
813 */
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_base_workgroup_program *program_out)
823 {
824 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
825 struct pvr_pds_compute_shader_program program;
826 uint32_t buffer_size;
827 uint32_t *buffer;
828 VkResult result;
829
830 pvr_pds_compute_program_setup(dev_info,
831 local_input_regs,
832 work_group_input_regs,
833 barrier_coefficient,
834 true,
835 usc_temps,
836 usc_shader_dev_addr,
837 &program);
838
839 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
840 * is in bytes. Investigate this.
841 */
842 buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
843
844 buffer = vk_alloc2(&device->vk.alloc,
845 allocator,
846 buffer_size,
847 8,
848 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
849 if (!buffer)
850 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
851
852 pvr_pds_compute_shader(&program,
853 &buffer[0],
854 PDS_GENERATE_CODE_SEGMENT,
855 dev_info);
856
857 /* FIXME: Figure out the define for alignment of 16. */
858 result = pvr_gpu_upload_pds(device,
859 NULL,
860 0,
861 0,
862 buffer,
863 program.code_size,
864 16,
865 16,
866 &program_out->code_upload);
867 if (result != VK_SUCCESS) {
868 vk_free2(&device->vk.alloc, allocator, buffer);
869 return result;
870 }
871
872 pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
873
874 program_out->data_section = buffer;
875
876 /* We'll need to patch the base workgroup in the PDS data section before
877 * dispatch so we save the offsets at which to patch. We only need to save
878 * the offset for the first workgroup id since the workgroup ids are stored
879 * contiguously in the data segment.
880 */
881 program_out->base_workgroup_data_patching_offset =
882 program.base_workgroup_constant_offset_in_dwords[0];
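/* A minimal sketch of the patch the dispatch path would apply, assuming a
 * hypothetical uint32_t base_workgroup[PVR_WORKGROUP_DIMENSIONS] holding the
 * base workgroup ids (the real patching code lives with the dispatch
 * handling, not here):
 *
 *    for (uint32_t i = 0; i < PVR_WORKGROUP_DIMENSIONS; i++) {
 *       program_out->data_section[
 *          program_out->base_workgroup_data_patching_offset + i] =
 *          base_workgroup[i];
 *    }
 */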
883
884 program_out->info = (struct pvr_pds_info){
885 .temps_required = program.highest_temp,
886 .code_size_in_dwords = program.code_size,
887 .data_size_in_dwords = program.data_size,
888 };
889
890 return VK_SUCCESS;
891 }
892
static void pvr_pds_compute_base_workgroup_variant_program_finish(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_base_workgroup_program *const state)
897 {
898 pvr_bo_suballoc_free(state->code_upload.pvr_bo);
899 vk_free2(&device->vk.alloc, allocator, state->data_section);
900 }
901
902 /******************************************************************************
903 Generic pipeline functions
904 ******************************************************************************/
905
static void pvr_pipeline_init(struct pvr_device *device,
                              enum pvr_pipeline_type type,
                              struct pvr_pipeline *const pipeline)
909 {
910 assert(!pipeline->layout);
911
912 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
913
914 pipeline->type = type;
915 }
916
static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
918 {
919 vk_object_base_finish(&pipeline->base);
920 }
921
922 /* How many shared regs it takes to store a pvr_dev_addr_t.
923 * Each shared reg is 32 bits.
924 */
925 #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
926 DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
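/* For example: pvr_dev_addr_t wraps a 64-bit device address, so with 32-bit
 * shared registers this evaluates to DIV_ROUND_UP(8, 4) = 2 sh regs per
 * address (illustrative arithmetic only; the macro itself is what gets used
 * below).
 */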
927
928 /**
929 * \brief Allocates shared registers.
930 *
931 * \return How many sh regs are required.
932 */
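/* Illustrative example derived from the allocation logic below (not a fixed
 * layout): if a stage uses both the descriptor set address table and push
 * constants, the table occupies sh regs [0..1], push constants occupy
 * [2..3], and the function returns 4.
 */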
933 static uint32_t
pvr_pipeline_alloc_shareds(const struct pvr_device *device,
                           const struct pvr_pipeline_layout *layout,
                           enum pvr_stage_allocation stage,
                           struct pvr_sh_reg_layout *const sh_reg_layout_out)
938 {
939 ASSERTED const uint64_t reserved_shared_size =
940 device->pdevice->dev_runtime_info.reserved_shared_size;
941 ASSERTED const uint64_t max_coeff =
942 device->pdevice->dev_runtime_info.max_coeffs;
943
944 struct pvr_sh_reg_layout reg_layout = { 0 };
945 uint32_t next_free_sh_reg = 0;
946
947 reg_layout.descriptor_set_addrs_table.present =
948 !!(layout->shader_stage_mask & BITFIELD_BIT(stage));
949
950 if (reg_layout.descriptor_set_addrs_table.present) {
951 reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
952 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
953 }
954
955 reg_layout.push_consts.present =
956 !!(layout->push_constants_shader_stages & BITFIELD_BIT(stage));
957
958 if (reg_layout.push_consts.present) {
959 reg_layout.push_consts.offset = next_free_sh_reg;
960 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
961 }
962
963 *sh_reg_layout_out = reg_layout;
964
965 /* FIXME: We might need to take more things into consideration.
966 * See pvr_calc_fscommon_size_and_tiles_in_flight().
967 */
968 assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
969
970 return next_free_sh_reg;
971 }
972
973 /******************************************************************************
974 Compute pipeline functions
975 ******************************************************************************/
976
977 /* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
   struct pvr_device *const device,
   struct vk_pipeline_cache *cache,
   const VkComputePipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
984 {
985 struct pvr_pipeline_layout *layout = compute_pipeline->base.layout;
986 struct pvr_sh_reg_layout *sh_reg_layout =
987 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE];
988 uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
989 uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
990 uint32_t barrier_coefficient;
991 uint32_t usc_temps;
992 uint32_t sh_count;
993 VkResult result;
994
995 sh_count = pvr_pipeline_alloc_shareds(device,
996 layout,
997 PVR_STAGE_ALLOCATION_COMPUTE,
998 sh_reg_layout);
999
1000 compute_pipeline->shader_state.const_shared_reg_count = sh_count;
1001
1002 /* FIXME: Compile and upload the shader. */
1003 /* FIXME: Initialize the shader state and setup build info. */
1004 unreachable("finishme: compute support");
1005
1006 result = pvr_pds_descriptor_program_create_and_upload(
1007 device,
1008 allocator,
1009 layout,
1010 PVR_STAGE_ALLOCATION_COMPUTE,
1011 sh_reg_layout,
1012 &compute_pipeline->descriptor_state);
1013 if (result != VK_SUCCESS)
1014 goto err_free_shader;
1015
1016 result = pvr_pds_compute_program_create_and_upload(
1017 device,
1018 allocator,
1019 local_input_regs,
1020 work_group_input_regs,
1021 barrier_coefficient,
1022 usc_temps,
1023 compute_pipeline->shader_state.bo->dev_addr,
1024 &compute_pipeline->primary_program,
1025 &compute_pipeline->primary_program_info);
1026 if (result != VK_SUCCESS)
1027 goto err_free_descriptor_program;
1028
1029 /* If the workgroup ID is required, then we require the base workgroup
1030 * variant of the PDS compute program as well.
1031 */
1032 compute_pipeline->flags.base_workgroup =
1033 work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
1034 work_group_input_regs[1] != PVR_PDS_REG_UNUSED ||
1035 work_group_input_regs[2] != PVR_PDS_REG_UNUSED;
1036
1037 if (compute_pipeline->flags.base_workgroup) {
1038 result = pvr_pds_compute_base_workgroup_variant_program_init(
1039 device,
1040 allocator,
1041 local_input_regs,
1042 work_group_input_regs,
1043 barrier_coefficient,
1044 usc_temps,
1045 compute_pipeline->shader_state.bo->dev_addr,
1046 &compute_pipeline->primary_base_workgroup_variant_program);
1047 if (result != VK_SUCCESS)
1048 goto err_destroy_compute_program;
1049 }
1050
1051 return VK_SUCCESS;
1052
1053 err_destroy_compute_program:
1054 pvr_pds_compute_program_destroy(device,
1055 allocator,
1056 &compute_pipeline->primary_program,
1057 &compute_pipeline->primary_program_info);
1058
1059 err_free_descriptor_program:
1060 pvr_pds_descriptor_program_destroy(device,
1061 allocator,
1062 &compute_pipeline->descriptor_state);
1063
1064 err_free_shader:
1065 pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1066
1067 return result;
1068 }
1069
1070 static VkResult
pvr_compute_pipeline_init(struct pvr_device *device,
                          struct vk_pipeline_cache *cache,
                          const VkComputePipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *allocator,
                          struct pvr_compute_pipeline *compute_pipeline)
1076 {
1077 VkResult result;
1078
1079 pvr_pipeline_init(device,
1080 PVR_PIPELINE_TYPE_COMPUTE,
1081 &compute_pipeline->base);
1082
1083 compute_pipeline->base.layout =
1084 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1085
1086 result = pvr_compute_pipeline_compile(device,
1087 cache,
1088 pCreateInfo,
1089 allocator,
1090 compute_pipeline);
1091 if (result != VK_SUCCESS) {
1092 pvr_pipeline_finish(&compute_pipeline->base);
1093 return result;
1094 }
1095
1096 return VK_SUCCESS;
1097 }
1098
1099 static VkResult
pvr_compute_pipeline_create(struct pvr_device *device,
                            struct vk_pipeline_cache *cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *allocator,
                            VkPipeline *const pipeline_out)
1105 {
1106 struct pvr_compute_pipeline *compute_pipeline;
1107 VkResult result;
1108
1109 compute_pipeline = vk_zalloc2(&device->vk.alloc,
1110 allocator,
1111 sizeof(*compute_pipeline),
1112 8,
1113 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1114 if (!compute_pipeline)
1115 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1116
1117 /* Compiles and uploads shaders and PDS programs. */
1118 result = pvr_compute_pipeline_init(device,
1119 cache,
1120 pCreateInfo,
1121 allocator,
1122 compute_pipeline);
1123 if (result != VK_SUCCESS) {
1124 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1125 return result;
1126 }
1127
1128 *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
1129
1130 return VK_SUCCESS;
1131 }
1132
static void pvr_compute_pipeline_destroy(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
1137 {
1138 if (compute_pipeline->flags.base_workgroup) {
1139 pvr_pds_compute_base_workgroup_variant_program_finish(
1140 device,
1141 allocator,
1142 &compute_pipeline->primary_base_workgroup_variant_program);
1143 }
1144
1145 pvr_pds_compute_program_destroy(device,
1146 allocator,
1147 &compute_pipeline->primary_program,
1148 &compute_pipeline->primary_program_info);
1149 pvr_pds_descriptor_program_destroy(device,
1150 allocator,
1151 &compute_pipeline->descriptor_state);
1152 pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1153
1154 pvr_pipeline_finish(&compute_pipeline->base);
1155
1156 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1157 }
1158
1159 VkResult
pvr_CreateComputePipelines(VkDevice _device,
                           VkPipelineCache pipelineCache,
                           uint32_t createInfoCount,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
1166 {
1167 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
1168 PVR_FROM_HANDLE(pvr_device, device, _device);
1169 VkResult result = VK_SUCCESS;
1170
1171 for (uint32_t i = 0; i < createInfoCount; i++) {
1172 const VkResult local_result =
1173 pvr_compute_pipeline_create(device,
1174 cache,
1175 &pCreateInfos[i],
1176 pAllocator,
1177 &pPipelines[i]);
1178 if (local_result != VK_SUCCESS) {
1179 result = local_result;
1180 pPipelines[i] = VK_NULL_HANDLE;
1181 }
1182 }
1183
1184 return result;
1185 }
1186
1187 /******************************************************************************
1188 Graphics pipeline functions
1189 ******************************************************************************/
1190
1191 static void
pvr_graphics_pipeline_destroy(struct pvr_device *const device,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
1195 {
1196 const uint32_t num_vertex_attrib_programs =
1197 ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
1198
1199 pvr_pds_descriptor_program_destroy(
1200 device,
1201 allocator,
1202 &gfx_pipeline->shader_state.fragment.descriptor_state);
1203
1204 pvr_pds_descriptor_program_destroy(
1205 device,
1206 allocator,
1207 &gfx_pipeline->shader_state.vertex.descriptor_state);
1208
1209 for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1210 struct pvr_pds_attrib_program *const attrib_program =
1211 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
1212
1213 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1214 }
1215
1216 pvr_bo_suballoc_free(
1217 gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
1218 pvr_bo_suballoc_free(
1219 gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
1220
1221 pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
1222 pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
1223
1224 pvr_pipeline_finish(&gfx_pipeline->base);
1225
1226 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1227 }
1228
static void pvr_vertex_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
                                  pco_shader *vs)
1231 {
1232 struct pvr_vertex_shader_state *vertex_state =
1233 &gfx_pipeline->shader_state.vertex;
1234
1235 const pco_data *shader_data = pco_shader_data(vs);
1236 memcpy(&gfx_pipeline->vs_data, shader_data, sizeof(*shader_data));
1237
1238 /* This ends up unused since we'll use the temp_usage for the PDS program we
1239 * end up selecting, and the descriptor PDS program doesn't use any temps.
1240 * Let's set it to ~0 in case it ever gets used.
1241 */
1242 vertex_state->stage_state.pds_temps_count = ~0;
1243 }
1244
static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
                                    pco_shader *fs)
1247 {
1248 struct pvr_fragment_shader_state *fragment_state =
1249 &gfx_pipeline->shader_state.fragment;
1250
1251 const pco_data *shader_data = pco_shader_data(fs);
1252 memcpy(&gfx_pipeline->fs_data, shader_data, sizeof(*shader_data));
1253
1254 /* TODO: add selection for other values of pass type and sample rate. */
1255 fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE;
1256 fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;
1257
1258 /* We can't initialize it yet since we still need to generate the PDS
1259 * programs so set it to `~0` to make sure that we set this up later on.
1260 */
1261 fragment_state->stage_state.pds_temps_count = ~0;
1262 }
1263
static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
1265 {
1266 switch (factor) {
1267 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1268 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1269 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1270 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1271 return true;
1272
1273 default:
1274 return false;
1275 }
1276 }
1277
1278 /**
1279 * \brief Indicates whether dynamic blend constants are needed.
1280 *
1281 * If the user has specified the blend constants to be dynamic, they might not
1282 * necessarily be using them. This function makes sure that they are being used
1283 * in order to determine whether we need to upload them later on for the shader
1284 * to access them.
1285 */
static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
   const struct pvr_graphics_pipeline *gfx_pipeline)
1288 {
1289 const struct vk_dynamic_graphics_state *const state =
1290 &gfx_pipeline->dynamic_state;
1291
1292 if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
1293 return false;
1294
1295 for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
1296 const struct vk_color_blend_attachment_state *attachment =
1297 &state->cb.attachments[i];
1298
1299 const bool has_color_write =
1300 attachment->write_mask &
1301 (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
1302 VK_COLOR_COMPONENT_B_BIT);
1303 const bool has_alpha_write = attachment->write_mask &
1304 VK_COLOR_COMPONENT_A_BIT;
1305
1306 if (!attachment->blend_enable || attachment->write_mask == 0)
1307 continue;
1308
1309 if (has_color_write) {
1310 const uint8_t src_color_blend_factor =
1311 attachment->src_color_blend_factor;
1312 const uint8_t dst_color_blend_factor =
1313 attachment->dst_color_blend_factor;
1314
1315 if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
1316 pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
1317 return true;
1318 }
1319 }
1320
1321 if (has_alpha_write) {
1322 const uint8_t src_alpha_blend_factor =
1323 attachment->src_alpha_blend_factor;
1324 const uint8_t dst_alpha_blend_factor =
1325 attachment->dst_alpha_blend_factor;
1326
1327 if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
1328 pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
1329 return true;
1330 }
1331 }
1332 }
1333
1334 return false;
1335 }
1336
static uint32_t pvr_graphics_pipeline_alloc_shareds(
   const struct pvr_device *device,
   const struct pvr_graphics_pipeline *gfx_pipeline,
   enum pvr_stage_allocation stage,
   struct pvr_sh_reg_layout *const sh_reg_layout_out)
1342 {
1343 ASSERTED const uint64_t reserved_shared_size =
1344 device->pdevice->dev_runtime_info.reserved_shared_size;
1345 ASSERTED const uint64_t max_coeff =
1346 device->pdevice->dev_runtime_info.max_coeffs;
1347
1348 const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
1349 struct pvr_sh_reg_layout reg_layout = { 0 };
1350 uint32_t next_free_sh_reg = 0;
1351
1352 next_free_sh_reg =
pvr_pipeline_alloc_shareds(device, layout, stage, &reg_layout);
1354
1355 reg_layout.blend_consts.present =
1356 (stage == PVR_STAGE_ALLOCATION_FRAGMENT &&
1357 pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline));
1358 if (reg_layout.blend_consts.present) {
1359 reg_layout.blend_consts.offset = next_free_sh_reg;
1360 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
1361 }
1362
1363 *sh_reg_layout_out = reg_layout;
1364
1365 /* FIXME: We might need to take more things into consideration.
1366 * See pvr_calc_fscommon_size_and_tiles_in_flight().
1367 */
1368 assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
1369
1370 return next_free_sh_reg;
1371 }
1372
1373 #undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
1374
static void pvr_graphics_pipeline_setup_vertex_dma(
   pco_shader *vs,
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   struct pvr_pds_vertex_dma *const dma_descriptions,
   uint32_t *const dma_count)
1380 {
1381 pco_vs_data *vs_data = &pco_shader_data(vs)->vs;
1382
1383 const VkVertexInputBindingDescription
1384 *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1385 const VkVertexInputAttributeDescription
1386 *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1387
1388 /* Vertex attributes map to the `layout(location = x)` annotation in the
1389 * shader where `x` is the attribute's location.
1390 * Vertex bindings have NO relation to the shader. They have nothing to do
1391 * with the `layout(set = x, binding = y)` notation. They instead indicate
1392 * where the data for a collection of vertex attributes comes from. The
1393 * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
1394 * binding number and based on that we'll know which buffer to DMA the data
1395 * from, to fill in the collection of vertex attributes.
1396 */
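/* A small illustrative example (hypothetical shader and descriptions, not
 * taken from any particular pipeline): a shader input declared as
 *    layout(location = 1) in vec4 a_color;
 * matches the VkVertexInputAttributeDescription with location == 1, and that
 * description's .binding selects the VkVertexInputBindingDescription, i.e.
 * the VkBuffer bound at that binding index via vkCmdBindVertexBuffers(),
 * that the DMA will read the attribute data from.
 */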
1397
1398 for (uint32_t i = 0; i < vertex_input_state->vertexBindingDescriptionCount;
1399 i++) {
1400 const VkVertexInputBindingDescription *binding_desc =
1401 &vertex_input_state->pVertexBindingDescriptions[i];
1402
1403 sorted_bindings[binding_desc->binding] = binding_desc;
1404 }
1405
1406 for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
1407 i++) {
1408 const VkVertexInputAttributeDescription *attribute_desc =
1409 &vertex_input_state->pVertexAttributeDescriptions[i];
1410
1411 sorted_attributes[attribute_desc->location] = attribute_desc;
1412 }
1413
1414 for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
1415 i++) {
1416 const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
1417 if (!attribute)
1418 continue;
1419
1420 gl_vert_attrib location = attribute->location + VERT_ATTRIB_GENERIC0;
1421 const VkVertexInputBindingDescription *binding =
1422 sorted_bindings[attribute->binding];
1423 struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[*dma_count];
1424 const struct util_format_description *fmt_description =
1425 vk_format_description(attribute->format);
1426
1427 const pco_range *attrib_range = &vs_data->attribs[location];
1428
1429 /* Skip unused attributes. */
1430 if (!attrib_range->count)
1431 continue;
1432
1433 /* DMA setup. */
1434
1435 /* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
1436 *
1437 * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
1438 * DMA source addr = src0 * src1 + src2
1439 * DMA params = src3
1440 *
1441 * In the PDS program we setup src0 with the binding's stride and src1
1442 * with either the instance id or vertex id (both of which get filled by
1443 * the hardware). We setup src2 later on once we know which VkBuffer to
1444 * DMA the data from so it's saved for later when we patch the data
1445 * section.
1446 */
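/* Worked example with made-up numbers: for a per-vertex attribute in a
 * binding with a 16 byte stride, vertex index 3 and a patched base address
 * B, the DDMAD computes 16 * 3 + B = B + 48 as the DMA source address,
 * while src3 carries the burst parameters (see size_in_dwords below).
 */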
1447
1448 /* TODO: Right now we're setting up a DMA per attribute. In a case where
1449 * there are multiple attributes packed into a single binding with
1450 * adjacent locations we'd still be DMAing them separately. This is not
1451 * great so the DMA setup should be smarter and could do with some
1452 * optimization.
1453 */
1454
1455 *dma_desc = (struct pvr_pds_vertex_dma){ 0 };
1456
1457 /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1458 * this corresponds to `attribDesc.offset`.
* The PDS program doesn't do anything with it other than save it in the
* PDS program entry.
1461 */
1462 dma_desc->offset = attribute->offset;
1463
1464 /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1465 * this corresponds to `bindingDesc.stride`.
1466 * The PDS program will calculate the `effectiveVertexOffset` with this
1467 * and add it to the address provided in the patched data segment.
1468 */
1469 dma_desc->stride = binding->stride;
1470
1471 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1472 dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
1473 else
1474 dma_desc->flags = 0;
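/* For reference, the address calculation from Vulkan spec section 22.4 that
 * these fields feed into is roughly (paraphrased here; the spec text is
 * authoritative):
 *
 *    effectiveVertexOffset = vertexIndex * bindingDesc.stride    (per-vertex)
 *                            instanceIndex * bindingDesc.stride  (per-instance)
 *    attribAddress = bufferBindingAddress + attribDesc.offset +
 *                    effectiveVertexOffset
 */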
1475
1476 /* Size to DMA per vertex attribute. Used to setup src3 in the DDMAD. */
1477 /* TODO: what if not all components are used */
1478 assert(attrib_range->count == fmt_description->block.bits / 32);
1479 dma_desc->size_in_dwords = attrib_range->count;
1480
1481 /* Vtxin reg offset to start DMAing into. */
1482 dma_desc->destination = attrib_range->start;
1483
1484 /* Will be used by the driver to figure out buffer address to patch in the
1485 * data section. I.e. which binding we should DMA from.
1486 */
1487 dma_desc->binding_index = attribute->binding;
1488
/* We don't currently support VK_EXT_vertex_attribute_divisor, so no
* repeating of instance-rate vertex attributes is needed. We should always
1491 * move on to the next vertex attribute.
1492 */
1493 assert(binding->inputRate != VK_VERTEX_INPUT_RATE_INSTANCE);
1494 dma_desc->divisor = 1;
1495
1496 /* Will be used to generate PDS code that takes care of robust buffer
1497 * access, and later on by the driver to write the correct robustness
1498 * buffer address to DMA the fallback values from.
1499 */
1500 dma_desc->robustness_buffer_offset =
1501 pvr_get_robustness_buffer_format_offset(attribute->format);
1502
/* Used later on by the driver to figure out if the buffer is being
1504 * accessed out of bounds, for robust buffer access.
1505 */
1506 dma_desc->component_size_in_bytes =
1507 fmt_description->block.bits / fmt_description->nr_channels / 8;
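/* Illustrative example: VK_FORMAT_R16G16B16A16_SFLOAT has block.bits = 64
 * and nr_channels = 4, giving a component size of 2 bytes.
 */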
1508
1509 ++*dma_count;
1510 }
1511 }
1512
1513 static void pvr_graphics_pipeline_setup_fragment_coeff_program(
1514 pco_fs_data *fs_data,
1515 pco_vs_data *vs_data,
1516 nir_shader *fs,
1517 struct pvr_pds_coeff_loading_program *frag_coeff_program)
1518 {
1519 uint64_t varyings_used = fs->info.inputs_read &
1520 BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
1521
1522 unsigned fpu = 0;
1523 unsigned dest = 0;
1524
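/* Reading of the packing below (not a statement from the hardware docs):
 * DOUTI source offsets are expressed in 16-bit units, hence the
 * sizeof(uint16_t) scaling, and when both Z and W are iterated, W sits at
 * source offset 0 with Z immediately after it.
 */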
1525 if (fs_data->uses.z) {
1526 pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1527 PDSINST_DOUT_FIELDS_DOUTI_SRC,
1528 douti_src) {
1529 /* TODO: define instead of sizeof(uint16_t). */
1530 douti_src.f32_offset = fs_data->uses.w ? 1 * sizeof(uint16_t) : 0;
1531 douti_src.f16_offset = douti_src.f32_offset;
1532 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1533 douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
1534 }
1535
1536 frag_coeff_program->destination[fpu++] = dest++;
1537 }
1538
1539 if (fs_data->uses.w) {
1540 pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1541 PDSINST_DOUT_FIELDS_DOUTI_SRC,
1542 douti_src) {
1543 douti_src.f32_offset = 0;
1544 douti_src.f16_offset = douti_src.f32_offset;
1545 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1546 douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
1547 }
1548
1549 frag_coeff_program->destination[fpu++] = dest++;
1550 }
1551
1552 if (fs_data->uses.pntc) {
1553 pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1554 PDSINST_DOUT_FIELDS_DOUTI_SRC,
1555 douti_src) {
1556 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1557 douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_2D;
1558 douti_src.pointsprite = true;
1559 }
1560
1561 frag_coeff_program->destination[fpu++] = dest;
1562 dest += 2;
1563 }
1564
1565 u_foreach_bit64 (varying, varyings_used) {
1566 nir_variable *var =
1567 nir_find_variable_with_location(fs, nir_var_shader_in, varying);
1568 assert(var);
1569
1570 pco_range *cf_range = &fs_data->varyings[varying];
1571 assert(cf_range->count > 0);
1572 assert(!(cf_range->start % ROGUE_USC_COEFFICIENT_SET_SIZE));
1573 assert(!(cf_range->count % ROGUE_USC_COEFFICIENT_SET_SIZE));
1574
1575 pco_range *vtxout_range = &vs_data->varyings[varying];
1576 assert(vtxout_range->count > 0);
1577 assert(vtxout_range->start >= 4);
1578
1579 assert(vtxout_range->count ==
1580 cf_range->count / ROGUE_USC_COEFFICIENT_SET_SIZE);
1581
1582 unsigned count = vtxout_range->count;
1583
1584 unsigned vtxout = vtxout_range->start;
1585
1586 /* pos.x, pos.y unused. */
1587 vtxout -= 2;
1588
1589 /* pos.z unused. */
1590 if (!fs_data->uses.z)
1591 vtxout -= 1;
1592
1593 /* pos.w unused. */
1594 if (!fs_data->uses.w)
1595 vtxout -= 1;
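/* Illustrative example of the adjustment above: if the fragment shader
 * uses neither Z nor W, a varying the vertex shader placed at vtxout 4 is
 * iterated from source index 4 - 2 - 1 - 1 = 0.
 */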
1596
1597 pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1598 PDSINST_DOUT_FIELDS_DOUTI_SRC,
1599 douti_src) {
1600 /* TODO: define instead of sizeof(uint16_t). */
1601 douti_src.f32_offset = vtxout * sizeof(uint16_t);
1602 /* TODO: f16 support. */
1603 douti_src.f16 = false;
1604 douti_src.f16_offset = douti_src.f32_offset;
1605
1606 switch (var->data.interpolation) {
1607 case INTERP_MODE_SMOOTH:
1608 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1609 douti_src.perspective = true;
1610 break;
1611
1612 case INTERP_MODE_NOPERSPECTIVE:
1613 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1614 break;
1615
1616 case INTERP_MODE_FLAT:
1617 /* TODO: triangle fan, provoking vertex last. */
1618 douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0;
1619 break;
1620
1621 default:
1622 unreachable("Unimplemented interpolation type.");
1623 }
1624
1625 douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D + count - 1;
1626 }
1627
1628 frag_coeff_program->destination[fpu++] =
1629 cf_range->start / ROGUE_USC_COEFFICIENT_SET_SIZE;
1630 }
1631
1632 frag_coeff_program->num_fpu_iterators = fpu;
1633 }
1634
1635 static void set_var(pco_range *allocation_list,
1636 unsigned to,
1637 nir_variable *var,
1638 unsigned dwords_each)
1639 {
1640 unsigned slots = glsl_count_dword_slots(var->type, false);
1641
1642 allocation_list[var->data.location] = (pco_range){
1643 .start = to,
1644 .count = slots * dwords_each,
1645 };
1646 }
1647
1648 static void allocate_var(pco_range *allocation_list,
1649 unsigned *counter,
1650 nir_variable *var,
1651 unsigned dwords_each)
1652 {
1653 unsigned slots = glsl_count_dword_slots(var->type, false);
1654
1655 allocation_list[var->data.location] = (pco_range){
1656 .start = *counter,
1657 .count = slots * dwords_each,
1658 };
1659
1660 *counter += slots * dwords_each;
1661 }
1662
1663 static void try_allocate_var(pco_range *allocation_list,
1664 unsigned *counter,
1665 nir_shader *nir,
1666 uint64_t bitset,
1667 nir_variable_mode mode,
1668 int location,
1669 unsigned dwords_each)
1670 {
1671 nir_variable *var = nir_find_variable_with_location(nir, mode, location);
1672
1673 if (!(bitset & BITFIELD64_BIT(location)))
1674 return;
1675
1676 assert(var);
1677
1678 allocate_var(allocation_list, counter, var, dwords_each);
1679 }
1680
1681 static void try_allocate_vars(pco_range *allocation_list,
1682 unsigned *counter,
1683 nir_shader *nir,
1684 uint64_t *bitset,
1685 nir_variable_mode mode,
1686 bool f16,
1687 enum glsl_interp_mode interp_mode,
1688 unsigned dwords_each)
1689 {
1690 uint64_t skipped = 0;
1691
1692 while (*bitset) {
1693 int location = u_bit_scan64(bitset);
1694
1695 nir_variable *var = nir_find_variable_with_location(nir, mode, location);
1696 assert(var);
1697
1698 if (glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)) != f16 ||
1699 var->data.interpolation != interp_mode) {
1700 skipped |= BITFIELD64_BIT(location);
1701 continue;
1702 }
1703
1704 allocate_var(allocation_list, counter, var, dwords_each);
1705 }
1706
1707 *bitset |= skipped;
1708 }
1709
1710 static void allocate_val(pco_range *allocation_list,
1711 unsigned *counter,
1712 unsigned location,
1713 unsigned dwords_each)
1714 {
1715 allocation_list[location] = (pco_range){
1716 .start = *counter,
1717 .count = dwords_each,
1718 };
1719
1720 *counter += dwords_each;
1721 }
1722
1723 static void pvr_alloc_vs_sysvals(pco_data *data, nir_shader *nir)
1724 {
1725 BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX);
1726 BITSET_COPY(system_values_read, nir->info.system_values_read);
1727
1728 gl_system_value sys_vals[] = {
1729 SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_INSTANCE_ID,
1730 SYSTEM_VALUE_BASE_INSTANCE, SYSTEM_VALUE_BASE_VERTEX,
1731 SYSTEM_VALUE_DRAW_ID,
1732 };
1733
1734 for (unsigned u = 0; u < ARRAY_SIZE(sys_vals); ++u) {
1735 if (BITSET_TEST(system_values_read, sys_vals[u])) {
1736 allocate_val(data->common.sys_vals,
1737 &data->common.vtxins,
1738 sys_vals[u],
1739 1);
1740
1741 BITSET_CLEAR(system_values_read, sys_vals[u]);
1742 }
1743 }
1744
1745 assert(BITSET_IS_EMPTY(system_values_read));
1746 }
1747
1748 static void pvr_init_vs_attribs(
1749 pco_data *data,
1750 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state)
1751 {
1752 for (unsigned u = 0; u < vertex_input_state->vertexAttributeDescriptionCount;
1753 ++u) {
1754 const VkVertexInputAttributeDescription *attrib =
1755 &vertex_input_state->pVertexAttributeDescriptions[u];
1756
1757 gl_vert_attrib location = attrib->location + VERT_ATTRIB_GENERIC0;
1758
1759 data->vs.attrib_formats[location] =
1760 vk_format_to_pipe_format(attrib->format);
1761 }
1762 }
1763
1764 static void pvr_alloc_vs_attribs(pco_data *data, nir_shader *nir)
1765 {
1766 /* TODO NEXT: this should be based on the format size. */
1767 nir_foreach_shader_in_variable (var, nir) {
1768 allocate_var(data->vs.attribs, &data->common.vtxins, var, 1);
1769 }
1770 }
1771
1772 static void pvr_alloc_vs_varyings(pco_data *data, nir_shader *nir)
1773 {
1774 uint64_t vars_mask = nir->info.outputs_written &
1775 BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
1776
1777 /* Output position must be present. */
1778 assert(nir_find_variable_with_location(nir,
1779 nir_var_shader_out,
1780 VARYING_SLOT_POS));
1781
1782 /* Varying ordering is specific: position is always allocated first. */
1783 try_allocate_var(data->vs.varyings,
1784 &data->vs.vtxouts,
1785 nir,
1786 nir->info.outputs_written,
1787 nir_var_shader_out,
1788 VARYING_SLOT_POS,
1789 1);
1790
1791 /* Save varying counts. */
1792 u_foreach_bit64 (location, vars_mask) {
1793 nir_variable *var =
1794 nir_find_variable_with_location(nir, nir_var_shader_out, location);
1795 assert(var);
1796
1797 /* TODO: f16 support. */
1798 bool f16 = glsl_type_is_16bit(glsl_without_array_or_matrix(var->type));
1799 assert(!f16);
1800 unsigned components = glsl_get_components(var->type);
1801
1802 switch (var->data.interpolation) {
1803 case INTERP_MODE_SMOOTH:
1804 if (f16)
1805 data->vs.f16_smooth += components;
1806 else
1807 data->vs.f32_smooth += components;
1808
1809 break;
1810
1811 case INTERP_MODE_FLAT:
1812 if (f16)
1813 data->vs.f16_flat += components;
1814 else
1815 data->vs.f32_flat += components;
1816
1817 break;
1818
1819 case INTERP_MODE_NOPERSPECTIVE:
1820 if (f16)
1821 data->vs.f16_npc += components;
1822 else
1823 data->vs.f32_npc += components;
1824
1825 break;
1826
1827 default:
1828 unreachable("Unsupported interpolation mode.");
1829 }
1830 }
1831
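/* Allocate the remaining varyings grouped by precision then interpolation
 * mode (f32 before f16; smooth, flat, noperspective), presumably so the
 * layout matches the grouped varying counts recorded above. This is a
 * reading of the loops below rather than a hardware requirement statement.
 */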
1832 for (unsigned f16 = 0; f16 <= 1; ++f16) {
1833 for (enum glsl_interp_mode interp_mode = INTERP_MODE_SMOOTH;
1834 interp_mode <= INTERP_MODE_NOPERSPECTIVE;
1835 ++interp_mode) {
1836 try_allocate_vars(data->vs.varyings,
1837 &data->vs.vtxouts,
1838 nir,
1839 &vars_mask,
1840 nir_var_shader_out,
1841 f16,
1842 interp_mode,
1843 1);
1844 }
1845 }
1846
1847 assert(!vars_mask);
1848
1849 const gl_varying_slot last_slots[] = {
1850 VARYING_SLOT_PSIZ,
1851 VARYING_SLOT_VIEWPORT,
1852 VARYING_SLOT_LAYER,
1853 };
1854
1855 for (unsigned u = 0; u < ARRAY_SIZE(last_slots); ++u) {
1856 try_allocate_var(data->vs.varyings,
1857 &data->vs.vtxouts,
1858 nir,
1859 nir->info.outputs_written,
1860 nir_var_shader_out,
1861 last_slots[u],
1862 1);
1863 }
1864 }
1865
1866 static void pvr_alloc_fs_sysvals(pco_data *data, nir_shader *nir)
1867 {
1868 /* TODO */
1869 }
1870
1871 static void pvr_alloc_fs_varyings(pco_data *data, nir_shader *nir)
1872 {
1873 assert(!data->common.coeffs);
1874
1875 /* Save the z/w locations. */
1876 unsigned zw_count = !!data->fs.uses.z + !!data->fs.uses.w;
1877 allocate_val(data->fs.varyings,
1878 &data->common.coeffs,
1879 VARYING_SLOT_POS,
1880 zw_count * ROGUE_USC_COEFFICIENT_SET_SIZE);
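/* Note (a reading of the allocation here, not a hardware spec statement):
 * each iterated scalar occupies a whole coefficient set, i.e. the plane
 * equation coefficients written by the iterators, so counts are scaled by
 * ROGUE_USC_COEFFICIENT_SET_SIZE rather than one dword per component.
 */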
1881
1882 /* If point coords are used, they come after z/w (if present). */
1883 nir_variable *var = nir_find_variable_with_location(nir,
1884 nir_var_shader_in,
1885 VARYING_SLOT_PNTC);
1886 if (var) {
1887 assert(!var->data.location_frac);
1888 unsigned count = glsl_get_components(var->type);
1889 assert(count == 2);
1890
1891 allocate_var(data->fs.varyings,
1892 &data->common.coeffs,
1893 var,
1894 ROGUE_USC_COEFFICIENT_SET_SIZE);
1895
1896 data->fs.uses.pntc = true;
1897 }
1898
1899 /* Allocate the rest of the input varyings. */
1900 nir_foreach_shader_in_variable (var, nir) {
1901 /* Already handled. */
1902 if (var->data.location == VARYING_SLOT_POS ||
1903 var->data.location == VARYING_SLOT_PNTC)
1904 continue;
1905
1906 allocate_var(data->fs.varyings,
1907 &data->common.coeffs,
1908 var,
1909 ROGUE_USC_COEFFICIENT_SET_SIZE);
1910 }
1911 }
1912
1913 static void
1914 pvr_init_fs_outputs(pco_data *data,
1915 const struct pvr_render_pass *pass,
1916 const struct pvr_render_subpass *const subpass,
1917 const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1918 {
1919 for (unsigned u = 0; u < subpass->color_count; ++u) {
1920 unsigned idx = subpass->color_attachments[u];
1921 if (idx == VK_ATTACHMENT_UNUSED)
1922 continue;
1923
1924 gl_frag_result location = FRAG_RESULT_DATA0 + u;
1925 VkFormat vk_format = pass->attachments[idx].vk_format;
1926 data->fs.output_formats[location] = vk_format_to_pipe_format(vk_format);
1927 }
1928
1929 /* TODO: z-replicate. */
1930 }
1931
1932 static void
1933 pvr_setup_fs_outputs(pco_data *data,
1934 nir_shader *nir,
1935 const struct pvr_render_subpass *const subpass,
1936 const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1937 {
1938 ASSERTED unsigned num_outputs = hw_subpass->setup.num_render_targets;
1939 assert(num_outputs == subpass->color_count);
1940
1941 uint64_t outputs_written = nir->info.outputs_written;
1942 assert(util_bitcount64(outputs_written) == num_outputs);
1943
1944 for (unsigned u = 0; u < subpass->color_count; ++u) {
1945 gl_frag_result location = FRAG_RESULT_DATA0 + u;
1946 unsigned idx = subpass->color_attachments[u];
1947 const struct usc_mrt_resource *mrt_resource;
1948 ASSERTED bool output_reg;
1949 enum pipe_format format;
1950 unsigned format_bits;
1951 nir_variable *var;
1952
1953 if (idx == VK_ATTACHMENT_UNUSED)
1954 continue;
1955
1956 assert(u == idx); /* TODO: confirm this always holds. */
1957
1958 mrt_resource = &hw_subpass->setup.mrt_resources[u];
1959 output_reg = mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1960
1961 assert(output_reg);
1962 /* TODO: tile buffer support. */
1963
1964 var = nir_find_variable_with_location(nir, nir_var_shader_out, location);
1965 assert(var);
1966
1967 format = data->fs.output_formats[location];
1968 format_bits = util_format_get_blocksizebits(format);
1969 /* TODO: other sized formats. */
1970 assert(!(format_bits % 32));
1971
1972 assert(mrt_resource->intermediate_size == format_bits / 8);
1973
1974 set_var(data->fs.outputs,
1975 mrt_resource->reg.output_reg,
1976 var,
1977 format_bits / 32);
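/* Illustrative example: a VK_FORMAT_B8G8R8A8_UNORM render target packs to
 * 32 bits, so format_bits / 32 = 1 dword, whereas a 128-bit format such as
 * VK_FORMAT_R32G32B32A32_SFLOAT would give 4.
 */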
1978 data->fs.output_reg[location] = output_reg;
1979
1980 outputs_written &= ~BITFIELD64_BIT(location);
1981 }
1982
1983 /* TODO: z-replicate. */
1984
1985 assert(!outputs_written);
1986 }
1987
1988 static void pvr_init_fs_input_attachments(
1989 pco_data *data,
1990 const struct pvr_render_subpass *const subpass,
1991 const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1992 {
1993 pvr_finishme("pvr_init_fs_input_attachments");
1994 }
1995
1996 static void pvr_setup_fs_input_attachments(
1997 pco_data *data,
1998 nir_shader *nir,
1999 const struct pvr_render_subpass *const subpass,
2000 const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
2001 {
2002 pvr_finishme("pvr_setup_fs_input_attachments");
2003 }
2004
2005 static void
2006 pvr_preprocess_shader_data(pco_data *data,
2007 nir_shader *nir,
2008 const VkGraphicsPipelineCreateInfo *pCreateInfo)
2009 {
2010 switch (nir->info.stage) {
2011 case MESA_SHADER_VERTEX: {
2012 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
2013 pCreateInfo->pVertexInputState;
2014
2015 pvr_init_vs_attribs(data, vertex_input_state);
2016 break;
2017 }
2018
2019 case MESA_SHADER_FRAGMENT: {
2020 PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
2021 const struct pvr_render_subpass *const subpass =
2022 &pass->subpasses[pCreateInfo->subpass];
2023 const struct pvr_renderpass_hw_map *subpass_map =
2024 &pass->hw_setup->subpass_map[pCreateInfo->subpass];
2025 const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
2026 &pass->hw_setup->renders[subpass_map->render]
2027 .subpasses[subpass_map->subpass];
2028
2029 pvr_init_fs_outputs(data, pass, subpass, hw_subpass);
2030 pvr_init_fs_input_attachments(data, subpass, hw_subpass);
2031
2032 /* TODO: push consts, blend consts, dynamic state, etc. */
2033 break;
2034 }
2035
2036 default:
2037 unreachable("Unsupported shader stage.");
2038 }
2039
2040 /* TODO: common things, like large constants being put into shareds. */
2041 }
2042
2043 static void
2044 pvr_postprocess_shader_data(pco_data *data,
2045 nir_shader *nir,
2046 const VkGraphicsPipelineCreateInfo *pCreateInfo)
2047 {
2048 switch (nir->info.stage) {
2049 case MESA_SHADER_VERTEX: {
2050 pvr_alloc_vs_sysvals(data, nir);
2051 pvr_alloc_vs_attribs(data, nir);
2052 pvr_alloc_vs_varyings(data, nir);
2053 break;
2054 }
2055
2056 case MESA_SHADER_FRAGMENT: {
2057 PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
2058 const struct pvr_render_subpass *const subpass =
2059 &pass->subpasses[pCreateInfo->subpass];
2060 const struct pvr_renderpass_hw_map *subpass_map =
2061 &pass->hw_setup->subpass_map[pCreateInfo->subpass];
2062 const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
2063 &pass->hw_setup->renders[subpass_map->render]
2064 .subpasses[subpass_map->subpass];
2065
2066 pvr_alloc_fs_sysvals(data, nir);
2067 pvr_alloc_fs_varyings(data, nir);
2068 pvr_setup_fs_outputs(data, nir, subpass, hw_subpass);
2069 pvr_setup_fs_input_attachments(data, nir, subpass, hw_subpass);
2070
2071 /* TODO: push consts, blend consts, dynamic state, etc. */
2072 break;
2073 }
2074
2075 default:
2076 unreachable("Unsupported shader stage.");
2077 }
2078
2079 /* TODO: common things, like large constants being put into shareds. */
2080 }
2081
2082 /* Compiles and uploads shaders and PDS programs. */
2083 static VkResult
2084 pvr_graphics_pipeline_compile(struct pvr_device *const device,
2085 struct vk_pipeline_cache *cache,
2086 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2087 const VkAllocationCallbacks *const allocator,
2088 struct pvr_graphics_pipeline *const gfx_pipeline)
2089 {
2090 struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
2091 struct pvr_sh_reg_layout *sh_reg_layout_vert =
2092 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
2093 struct pvr_sh_reg_layout *sh_reg_layout_frag =
2094 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
2095 const uint32_t cache_line_size =
2096 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
2097 VkResult result;
2098
2099 struct pvr_vertex_shader_state *vertex_state =
2100 &gfx_pipeline->shader_state.vertex;
2101 struct pvr_fragment_shader_state *fragment_state =
2102 &gfx_pipeline->shader_state.fragment;
2103
2104 pco_ctx *pco_ctx = device->pdevice->pco_ctx;
2105 const struct spirv_to_nir_options *spirv_options =
2106 pco_spirv_options(pco_ctx);
2107 const nir_shader_compiler_options *nir_options = pco_nir_options(pco_ctx);
2108
2109 nir_shader *producer = NULL;
2110 nir_shader *consumer = NULL;
2111 pco_data shader_data[MESA_SHADER_STAGES] = { 0 };
2112 nir_shader *nir_shaders[MESA_SHADER_STAGES] = { 0 };
2113 pco_shader *pco_shaders[MESA_SHADER_STAGES] = { 0 };
2114 pco_shader **vs = &pco_shaders[MESA_SHADER_VERTEX];
2115 pco_shader **fs = &pco_shaders[MESA_SHADER_FRAGMENT];
2116 void *shader_mem_ctx = ralloc_context(NULL);
2117
2118 struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
2119 uint32_t vtx_dma_count = 0;
2120
2121 struct pvr_pds_coeff_loading_program frag_coeff_program = { 0 };
2122
2123 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2124 size_t stage_index = gfx_pipeline->stage_indices[stage];
2125
2126 /* Skip unused/inactive stages. */
2127 if (stage_index == ~0)
2128 continue;
2129
2130 result =
2131 vk_pipeline_shader_stage_to_nir(&device->vk,
2132 gfx_pipeline->base.pipeline_flags,
2133 &pCreateInfo->pStages[stage_index],
2134 spirv_options,
2135 nir_options,
2136 shader_mem_ctx,
2137 &nir_shaders[stage]);
2138 if (result != VK_SUCCESS)
2139 goto err_free_build_context;
2140
2141 pco_preprocess_nir(pco_ctx, nir_shaders[stage]);
2142 }
2143
2144 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2145 if (!nir_shaders[stage])
2146 continue;
2147
2148 if (producer)
2149 pco_link_nir(pco_ctx, producer, nir_shaders[stage]);
2150
2151 producer = nir_shaders[stage];
2152 }
2153
2154 for (gl_shader_stage stage = MESA_SHADER_STAGES; stage-- > 0;) {
2155 if (!nir_shaders[stage])
2156 continue;
2157
2158 if (consumer)
2159 pco_rev_link_nir(pco_ctx, nir_shaders[stage], consumer);
2160
2161 consumer = nir_shaders[stage];
2162 }
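/* With only vertex and fragment stages supported, the two passes above
 * effectively reduce to pco_link_nir(pco_ctx, vs, fs) followed by
 * pco_rev_link_nir(pco_ctx, vs, fs); the loops are just written
 * generically over all stages.
 */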
2163
2164 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2165 if (!nir_shaders[stage])
2166 continue;
2167
2168 pvr_preprocess_shader_data(&shader_data[stage],
2169 nir_shaders[stage],
2170 pCreateInfo);
2171
2172 pco_lower_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
2173 pvr_lower_nir(pco_ctx, layout, nir_shaders[stage]);
2174
2175 pco_postprocess_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
2176
2177 pvr_postprocess_shader_data(&shader_data[stage],
2178 nir_shaders[stage],
2179 pCreateInfo);
2180 }
2181
2182 /* TODO NEXT: set up shareds for descriptors, here or in
2183 * pvr_{pre,post}process_shader_data.
2184 */
2185 memset(sh_reg_layout_vert, 0, sizeof(*sh_reg_layout_vert));
2186 memset(sh_reg_layout_frag, 0, sizeof(*sh_reg_layout_frag));
2187
2188 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2189 pco_shader **pco = &pco_shaders[stage];
2190
2191 /* Skip unused/inactive stages. */
2192 if (!nir_shaders[stage])
2193 continue;
2194
2195 *pco = pco_trans_nir(pco_ctx,
2196 nir_shaders[stage],
2197 &shader_data[stage],
2198 shader_mem_ctx);
2199 if (!*pco) {
2200 result = VK_ERROR_INITIALIZATION_FAILED;
2201 goto err_free_build_context;
2202 }
2203
2204 pco_process_ir(pco_ctx, *pco);
2205 pco_encode_ir(pco_ctx, *pco);
2206 pco_shader_finalize(pco_ctx, *pco);
2207 }
2208
2209 pvr_graphics_pipeline_setup_vertex_dma(*vs,
2210 pCreateInfo->pVertexInputState,
2211 vtx_dma_descriptions,
2212 &vtx_dma_count);
2213
2214 pvr_vertex_state_save(gfx_pipeline, *vs);
2215
2216 result = pvr_gpu_upload_usc(
2217 device,
2218 pco_shader_binary_data(pco_shaders[MESA_SHADER_VERTEX]),
2219 pco_shader_binary_size(pco_shaders[MESA_SHADER_VERTEX]),
2220 cache_line_size,
2221 &vertex_state->bo);
2222 if (result != VK_SUCCESS)
2223 goto err_free_build_context;
2224
2225 if (pco_shaders[MESA_SHADER_FRAGMENT]) {
2226 pvr_graphics_pipeline_setup_fragment_coeff_program(
2227 &pco_shader_data(pco_shaders[MESA_SHADER_FRAGMENT])->fs,
2228 &pco_shader_data(pco_shaders[MESA_SHADER_VERTEX])->vs,
2229 nir_shaders[MESA_SHADER_FRAGMENT],
2230 &frag_coeff_program);
2231
2232 pvr_fragment_state_save(gfx_pipeline, *fs);
2233
2234 result = pvr_gpu_upload_usc(
2235 device,
2236 pco_shader_binary_data(pco_shaders[MESA_SHADER_FRAGMENT]),
2237 pco_shader_binary_size(pco_shaders[MESA_SHADER_FRAGMENT]),
2238 cache_line_size,
2239 &fragment_state->bo);
2240 if (result != VK_SUCCESS)
2241 goto err_free_vertex_bo;
2242
2243 /* TODO: powervr has an optimization where it attempts to recompile
2244 * shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
2245 * since in our case the optimization doesn't happen.
2246 */
2247
2248 result = pvr_pds_coeff_program_create_and_upload(device,
2249 allocator,
2250 &frag_coeff_program,
2251 fragment_state);
2252 if (result != VK_SUCCESS)
2253 goto err_free_fragment_bo;
2254
2255 result = pvr_pds_fragment_program_create_and_upload(device,
2256 allocator,
2257 *fs,
2258 fragment_state);
2259 if (result != VK_SUCCESS)
2260 goto err_free_coeff_program;
2261
2262 result = pvr_pds_descriptor_program_create_and_upload(
2263 device,
2264 allocator,
2265 layout,
2266 PVR_STAGE_ALLOCATION_FRAGMENT,
2267 sh_reg_layout_frag,
2268 &fragment_state->descriptor_state);
2269 if (result != VK_SUCCESS)
2270 goto err_free_frag_program;
2271
2272 /* The descriptor program is expected to need no temps; if it ever does,
2273 * we need to MAX2() it into `fragment_state->stage_state.pds_temps_count`.
2274 */
2275 assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
2276 }
2277
2278 result = pvr_pds_vertex_attrib_programs_create_and_upload(
2279 device,
2280 allocator,
2281 pco_shader_data(pco_shaders[MESA_SHADER_VERTEX]),
2282 vtx_dma_descriptions,
2283 vtx_dma_count,
2284 &vertex_state->pds_attrib_programs);
2285 if (result != VK_SUCCESS)
2286 goto err_free_frag_descriptor_program;
2287
2288 result = pvr_pds_descriptor_program_create_and_upload(
2289 device,
2290 allocator,
2291 layout,
2292 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
2293 sh_reg_layout_vert,
2294 &vertex_state->descriptor_state);
2295 if (result != VK_SUCCESS)
2296 goto err_free_vertex_attrib_program;
2297
2298 /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
2299 * scratch buffer for both the vertex and fragment stages.
2300 * Figure out the best place to do this.
2301 */
2302 /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
2303 /* TODO: Implement spilling with the above. */
2304
2305 ralloc_free(shader_mem_ctx);
2306
2307 return VK_SUCCESS;
2308
2309 err_free_vertex_attrib_program:
2310 for (uint32_t i = 0; i < ARRAY_SIZE(vertex_state->pds_attrib_programs);
2311 i++) {
2312 struct pvr_pds_attrib_program *const attrib_program =
2313 &vertex_state->pds_attrib_programs[i];
2314
2315 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
2316 }
2317 err_free_frag_descriptor_program:
2318 pvr_pds_descriptor_program_destroy(device,
2319 allocator,
2320 &fragment_state->descriptor_state);
2321 err_free_frag_program:
2322 pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo);
2323 err_free_coeff_program:
2324 pvr_bo_suballoc_free(fragment_state->pds_coeff_program.pvr_bo);
2325 err_free_fragment_bo:
2326 pvr_bo_suballoc_free(fragment_state->bo);
2327 err_free_vertex_bo:
2328 pvr_bo_suballoc_free(vertex_state->bo);
2329 err_free_build_context:
2330 ralloc_free(shader_mem_ctx);
2331 return result;
2332 }
2333
2334 static struct vk_render_pass_state
2335 pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
2336 {
2337 PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
2338 const struct pvr_render_subpass *const subpass =
2339 &pass->subpasses[info->subpass];
2340
2341 enum vk_rp_attachment_flags attachments = 0;
2342
2343 assert(info->subpass < pass->subpass_count);
2344
2345 for (uint32_t i = 0; i < subpass->color_count; i++) {
2346 if (pass->attachments[subpass->color_attachments[i]].aspects)
2347 attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
2348 }
2349
2350 if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
2351 VkImageAspectFlags ds_aspects =
2352 pass->attachments[subpass->depth_stencil_attachment].aspects;
2353 if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2354 attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2355 if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
2356 attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2357 }
2358
2359 return (struct vk_render_pass_state){
2360 .attachments = attachments,
2361
2362 /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
2363 * which is not currently supported.
2364 */
2365 .view_mask = 0,
2366 };
2367 }
2368
2369 static VkResult
2370 pvr_graphics_pipeline_init(struct pvr_device *device,
2371 struct vk_pipeline_cache *cache,
2372 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2373 const VkAllocationCallbacks *allocator,
2374 struct pvr_graphics_pipeline *gfx_pipeline)
2375 {
2376 struct vk_dynamic_graphics_state *const dynamic_state =
2377 &gfx_pipeline->dynamic_state;
2378 const struct vk_render_pass_state rp_state =
2379 pvr_create_renderpass_state(pCreateInfo);
2380
2381 struct vk_graphics_pipeline_all_state all_state;
2382 struct vk_graphics_pipeline_state state = { 0 };
2383
2384 VkResult result;
2385
2386 pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
2387
2388 result = vk_graphics_pipeline_state_fill(&device->vk,
2389 &state,
2390 pCreateInfo,
2391 &rp_state,
2392 0,
2393 &all_state,
2394 NULL,
2395 0,
2396 NULL);
2397 if (result != VK_SUCCESS)
2398 goto err_pipeline_finish;
2399
2400 vk_dynamic_graphics_state_init(dynamic_state);
2401
2402 /* Load static state into base dynamic state holder. */
2403 vk_dynamic_graphics_state_fill(dynamic_state, &state);
2404
2405 /* The value of ms.rasterization_samples is undefined when
2406 * rasterizer_discard_enable is set, but we need a specific value.
2407 * Fill that in here.
2408 */
2409 if (state.rs->rasterizer_discard_enable)
2410 dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
2411
2412 memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
2413
2414 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
2415 VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
2416 gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
2417 /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
2418 *
2419 * "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
2420 * or VK_SHADER_STAGE_ALL."
2421 *
2422 * So we don't handle that.
2423 *
2424 * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
2425 * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
2426 * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
2427 * structure returned by the driver.
2428 */
2429 switch (pCreateInfo->pStages[i].stage) {
2430 case VK_SHADER_STAGE_VERTEX_BIT:
2431 case VK_SHADER_STAGE_FRAGMENT_BIT:
2432 gfx_pipeline->stage_indices[gl_stage] = i;
2433 break;
2434 default:
2435 unreachable("Unsupported stage.");
2436 }
2437 }
2438
2439 gfx_pipeline->base.layout =
2440 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
2441
2442 /* Compiles and uploads shaders and PDS programs. */
2443 result = pvr_graphics_pipeline_compile(device,
2444 cache,
2445 pCreateInfo,
2446 allocator,
2447 gfx_pipeline);
2448 if (result != VK_SUCCESS)
2449 goto err_pipeline_finish;
2450
2451 return VK_SUCCESS;
2452
2453 err_pipeline_finish:
2454 pvr_pipeline_finish(&gfx_pipeline->base);
2455
2456 return result;
2457 }
2458
2459 /* If allocator == NULL, the internal one will be used. */
2460 static VkResult
2461 pvr_graphics_pipeline_create(struct pvr_device *device,
2462 struct vk_pipeline_cache *cache,
2463 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2464 const VkAllocationCallbacks *allocator,
2465 VkPipeline *const pipeline_out)
2466 {
2467 struct pvr_graphics_pipeline *gfx_pipeline;
2468 VkResult result;
2469
2470 gfx_pipeline = vk_zalloc2(&device->vk.alloc,
2471 allocator,
2472 sizeof(*gfx_pipeline),
2473 8,
2474 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2475 if (!gfx_pipeline)
2476 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2477
2478 /* Compiles and uploads shaders and PDS programs too. */
2479 result = pvr_graphics_pipeline_init(device,
2480 cache,
2481 pCreateInfo,
2482 allocator,
2483 gfx_pipeline);
2484 if (result != VK_SUCCESS) {
2485 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
2486 return result;
2487 }
2488
2489 *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
2490
2491 return VK_SUCCESS;
2492 }
2493
2494 VkResult
2495 pvr_CreateGraphicsPipelines(VkDevice _device,
2496 VkPipelineCache pipelineCache,
2497 uint32_t createInfoCount,
2498 const VkGraphicsPipelineCreateInfo *pCreateInfos,
2499 const VkAllocationCallbacks *pAllocator,
2500 VkPipeline *pPipelines)
2501 {
2502 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
2503 PVR_FROM_HANDLE(pvr_device, device, _device);
2504 VkResult result = VK_SUCCESS;
2505
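/* Each create info is attempted even if an earlier one failed; failed
 * entries are left as VK_NULL_HANDLE (as the spec requires) and the last
 * failing VkResult is returned.
 */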
2506 for (uint32_t i = 0; i < createInfoCount; i++) {
2507 const VkResult local_result =
2508 pvr_graphics_pipeline_create(device,
2509 cache,
2510 &pCreateInfos[i],
2511 pAllocator,
2512 &pPipelines[i]);
2513 if (local_result != VK_SUCCESS) {
2514 result = local_result;
2515 pPipelines[i] = VK_NULL_HANDLE;
2516 }
2517 }
2518
2519 return result;
2520 }
2521
2522 /*****************************************************************************
2523 Other functions
2524 *****************************************************************************/
2525
2526 void pvr_DestroyPipeline(VkDevice _device,
2527 VkPipeline _pipeline,
2528 const VkAllocationCallbacks *pAllocator)
2529 {
2530 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2531 PVR_FROM_HANDLE(pvr_device, device, _device);
2532
2533 if (!pipeline)
2534 return;
2535
2536 switch (pipeline->type) {
2537 case PVR_PIPELINE_TYPE_GRAPHICS: {
2538 struct pvr_graphics_pipeline *const gfx_pipeline =
2539 to_pvr_graphics_pipeline(pipeline);
2540
2541 pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
2542 break;
2543 }
2544
2545 case PVR_PIPELINE_TYPE_COMPUTE: {
2546 struct pvr_compute_pipeline *const compute_pipeline =
2547 to_pvr_compute_pipeline(pipeline);
2548
2549 pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
2550 break;
2551 }
2552
2553 default:
2554 unreachable("Unknown pipeline type.");
2555 }
2556 }
2557