/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * based in part on v3dv driver which is:
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pco/pco.h"
#include "pco/pco_data.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_nir.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_robustness.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_format.h"
#include "vk_graphics_state.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_pipeline_cache.h"
#include "vk_render_pass.h"
#include "vk_util.h"
#include "vulkan/runtime/vk_pipeline.h"

/*****************************************************************************
   PDS functions
*****************************************************************************/

/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   struct pvr_pds_coeff_loading_program *program,
   struct pvr_fragment_shader_state *fragment_state)
{
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(program->num_fpu_iterators < PVR_MAXIMUM_ITERATIONS);

   /* Get the size of the program and then allocate that much memory. */
   pvr_pds_coefficient_loading(program, NULL, PDS_GENERATE_SIZES);

   if (!program->code_size) {
      fragment_state->pds_coeff_program.pvr_bo = NULL;
      fragment_state->pds_coeff_program.code_size = 0;
      fragment_state->pds_coeff_program.data_size = 0;
      fragment_state->stage_state.pds_temps_count = 0;

      return VK_SUCCESS;
   }

   staging_buffer_size =
      PVR_DW_TO_BYTES(program->code_size + program->data_size);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Generate the program into the staging_buffer. */
   pvr_pds_coefficient_loading(program,
                               staging_buffer,
                               PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16.
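    *
    * For reference, the pvr_gpu_upload_pds() call below takes the data
    * segment from the start of the staging buffer (data_size dwords) and the
    * code segment from &staging_buffer[data_size] (code_size dwords), both
    * with the hardcoded 16-byte alignment this FIXME refers to.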
*/ result = pvr_gpu_upload_pds(device, &staging_buffer[0], program->data_size, 16, &staging_buffer[program->data_size], program->code_size, 16, 16, &fragment_state->pds_coeff_program); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } vk_free2(&device->vk.alloc, allocator, staging_buffer); fragment_state->stage_state.pds_temps_count = program->temps_used; return VK_SUCCESS; } /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */ /* If allocator == NULL, the internal one will be used. */ VkResult pvr_pds_fragment_program_create_and_upload( struct pvr_device *device, const VkAllocationCallbacks *allocator, pco_shader *fs, struct pvr_fragment_shader_state *fragment_state) { /* TODO: remove the below + revert the pvr_pds_setup_doutu * args and make sure fs isn't NULL instead; * temporarily in place for hardcoded load ops in * pvr_pass.c:pvr_generate_load_op_shader() */ unsigned temps = 0; bool has_phase_rate_change = false; unsigned entry_offset = 0; if (fs) { pco_data *fs_data = pco_shader_data(fs); temps = fs_data->common.temps; has_phase_rate_change = fs_data->fs.uses.phase_change; entry_offset = fs_data->common.entry_offset; } struct pvr_pds_kickusc_program program = { 0 }; uint32_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; const pvr_dev_addr_t exec_addr = PVR_DEV_ADDR_OFFSET(fragment_state->bo->dev_addr, /* fs_data->common.entry_offset */ entry_offset); /* Note this is not strictly required to be done before calculating the * staging_buffer_size in this particular case. It can also be done after * allocating the buffer. The size from pvr_pds_kick_usc() is constant. */ pvr_pds_setup_doutu( &program.usc_task_control, exec_addr.addr, /* fs_data->common.temps */ temps, fragment_state->sample_rate, /* fs_data->fs.uses.phase_change */ has_phase_rate_change); pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES); staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pvr_pds_kick_usc(&program, staging_buffer, 0, false, PDS_GENERATE_CODEDATA_SEGMENTS); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, &staging_buffer[0], program.data_size, 16, &staging_buffer[program.data_size], program.code_size, 16, 16, &fragment_state->pds_fragment_program); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; } static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes( const struct pvr_device_info *dev_info, bool robust_buffer_access) { /* FIXME: Use more local variable to improve formatting. */ /* Maximum memory allocation needed for const map entries in * pvr_pds_generate_vertex_primary_program(). * When robustBufferAccess is disabled, it must be >= 410. * When robustBufferAccess is enabled, it must be >= 570. * * 1. Size of entry for base instance * (pvr_const_map_entry_base_instance) * * 2. Max. 
number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * ( * if (!robustBufferAccess) * size of vertex attribute entry * (pvr_const_map_entry_vertex_attribute_address) + * else * size of robust vertex attribute entry * (pvr_const_map_entry_robust_vertex_attribute_address) + * size of entry for max attribute index * (pvr_const_map_entry_vertex_attribute_max_index) + * fi * size of Unified Store burst entry * (pvr_const_map_entry_literal32) + * size of entry for vertex stride * (pvr_const_map_entry_literal32) + * size of entries for DDMAD control word * (num_ddmad_literals * pvr_const_map_entry_literal32)) * * 3. Size of entry for DOUTW vertex/instance control word * (pvr_const_map_entry_literal32) * * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address) */ const size_t attribute_size = (!robust_buffer_access) ? sizeof(struct pvr_const_map_entry_vertex_attribute_address) : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) + sizeof(struct pvr_const_map_entry_vertex_attribute_max_index); /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word * and is increased by one DWORD to contain the data for the DDMADT's * out-of-bounds check. */ const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals = 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt); return (sizeof(struct pvr_const_map_entry_base_instance) + PVR_MAX_VERTEX_INPUT_BINDINGS * (attribute_size + (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) * sizeof(struct pvr_const_map_entry_literal32)) + sizeof(struct pvr_const_map_entry_literal32) + sizeof(struct pvr_const_map_entry_doutu_address)); } static VkResult pvr_pds_vertex_attrib_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_pds_vertex_primary_program_input *const input, struct pvr_pds_attrib_program *const program_out) { const size_t const_entries_size_in_bytes = pvr_pds_get_max_vertex_program_const_map_size_in_bytes( &device->pdevice->dev_info, device->vk.enabled_features.robustBufferAccess); struct pvr_pds_upload *const program = &program_out->program; struct pvr_pds_info *const info = &program_out->info; struct pvr_const_map_entry *new_entries; ASSERTED uint32_t code_size_in_dwords; size_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; memset(info, 0, sizeof(*info)); info->entries = vk_alloc2(&device->vk.alloc, allocator, const_entries_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!info->entries) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_out; } info->entries_size_in_bytes = const_entries_size_in_bytes; pvr_pds_generate_vertex_primary_program( input, NULL, info, device->vk.enabled_features.robustBufferAccess, &device->pdevice->dev_info); code_size_in_dwords = info->code_size_in_dwords; staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_free_entries; } /* This also fills in info->entries. */ pvr_pds_generate_vertex_primary_program( input, staging_buffer, info, device->vk.enabled_features.robustBufferAccess, &device->pdevice->dev_info); assert(info->code_size_in_dwords <= code_size_in_dwords); /* FIXME: Add a vk_realloc2() ? */ new_entries = vk_realloc((!allocator) ? 
&device->vk.alloc : allocator, info->entries, info->entries_written_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!new_entries) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_free_staging_buffer; } info->entries = new_entries; info->entries_size_in_bytes = info->entries_written_size_in_bytes; /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, NULL, 0, 0, staging_buffer, info->code_size_in_dwords, 16, 16, program); if (result != VK_SUCCESS) goto err_free_staging_buffer; vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; err_free_staging_buffer: vk_free2(&device->vk.alloc, allocator, staging_buffer); err_free_entries: vk_free2(&device->vk.alloc, allocator, info->entries); err_out: return result; } static inline void pvr_pds_vertex_attrib_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_pds_attrib_program *const program) { pvr_bo_suballoc_free(program->program.pvr_bo); vk_free2(&device->vk.alloc, allocator, program->info.entries); } /* This is a const pointer to an array of pvr_pds_attrib_program structs. * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size. */ typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr) [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT]; /* Generate and uploads a PDS program for DMAing vertex attribs into USC vertex * inputs. This will bake the code segment and create a template of the data * segment for the command buffer to fill in. */ /* If allocator == NULL, the internal one will be used. * * programs_out_ptr is a pointer to the array where the outputs will be placed. */ static VkResult pvr_pds_vertex_attrib_programs_create_and_upload( struct pvr_device *device, const VkAllocationCallbacks *const allocator, pco_data *shader_data, const struct pvr_pds_vertex_dma dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS], uint32_t dma_count, pvr_pds_attrib_programs_array_ptr programs_out_ptr) { struct pvr_pds_vertex_primary_program_input input = { .dma_list = dma_descriptions, .dma_count = dma_count, }; uint32_t usc_temp_count = shader_data->common.temps; struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr; VkResult result; pco_range *sys_vals = shader_data->common.sys_vals; if (sys_vals[SYSTEM_VALUE_VERTEX_ID].count > 0) { input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED; input.vertex_id_register = sys_vals[SYSTEM_VALUE_VERTEX_ID].start; } if (sys_vals[SYSTEM_VALUE_INSTANCE_ID].count > 0) { input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED; input.instance_id_register = sys_vals[SYSTEM_VALUE_INSTANCE_ID].start; } if (sys_vals[SYSTEM_VALUE_BASE_INSTANCE].count > 0) { input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_REQUIRED; input.base_instance_register = sys_vals[SYSTEM_VALUE_BASE_INSTANCE].start; } if (sys_vals[SYSTEM_VALUE_BASE_VERTEX].count > 0) { input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_VERTEX_REQUIRED; input.base_vertex_register = sys_vals[SYSTEM_VALUE_BASE_VERTEX].start; } if (sys_vals[SYSTEM_VALUE_DRAW_ID].count > 0) { input.flags |= PVR_PDS_VERTEX_FLAGS_DRAW_INDEX_REQUIRED; input.draw_index_register = sys_vals[SYSTEM_VALUE_DRAW_ID].start; } pvr_pds_setup_doutu(&input.usc_task_control, 0, usc_temp_count, ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, false); /* Note: programs_out_ptr is a pointer to an array so this is fine. See the * typedef. 
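    *
    * Because the typedef is a pointer to an array of
    * PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT elements, *programs_out_ptr is a
    * true array type, so ARRAY_SIZE(*programs_out_ptr) evaluates to
    * PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT instead of the pointer decaying and
    * breaking the sizeof-based element count.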
*/ for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) { uint32_t extra_flags; switch (i) { case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC: extra_flags = 0; break; case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE: extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT; break; case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT: extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT; break; default: unreachable("Invalid vertex attrib program type."); } input.flags |= extra_flags; result = pvr_pds_vertex_attrib_program_create_and_upload(device, allocator, &input, &programs_out[i]); if (result != VK_SUCCESS) { for (uint32_t j = 0; j < i; j++) { pvr_pds_vertex_attrib_program_destroy(device, allocator, &programs_out[j]); } return result; } input.flags &= ~extra_flags; } return VK_SUCCESS; } size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void) { /* Maximum memory allocation needed for const map entries in * pvr_pds_generate_descriptor_upload_program(). * It must be >= 688 bytes. This size is calculated as the sum of: * * 1. Max. number of descriptor sets (8) * ( * size of descriptor entry * (pvr_const_map_entry_descriptor_set) + * size of Common Store burst entry * (pvr_const_map_entry_literal32)) * * 2. Max. number of PDS program buffers (24) * ( * size of the largest buffer structure * (pvr_const_map_entry_constant_buffer) + * size of Common Store burst entry * (pvr_const_map_entry_literal32) * * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address) * * 4. Max. number of PDS address literals (8) * ( * size of entry * (pvr_const_map_entry_descriptor_set_addrs_table) * * 5. Max. number of address literals with single buffer entry to DOUTD size of entry (pvr_pds_const_map_entry_addr_literal_buffer) + 8 * size of entry (pvr_pds_const_map_entry_addr_literal) */ /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to * say that it should be 8. * Figure our a define for this or is the comment wrong? 
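    *
    * For reference, the terms of the return expression below line up with the
    * numbered items above: 8 * (descriptor set entry + literal32) is item 1,
    * PVR_PDS_MAX_BUFFERS * (constant buffer entry + literal32) is item 2, the
    * DOUTU address entry is item 3, and the addr literal buffer entry plus
    * 8 * addr literal entries cover items 4 and 5.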
*/ return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) + sizeof(struct pvr_const_map_entry_literal32)) + PVR_PDS_MAX_BUFFERS * (sizeof(struct pvr_const_map_entry_constant_buffer) + sizeof(struct pvr_const_map_entry_literal32)) + sizeof(struct pvr_const_map_entry_doutu_address) + sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) + 8 * sizeof(struct pvr_pds_const_map_entry_addr_literal)); } static VkResult pvr_pds_descriptor_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const struct pvr_pipeline_layout *const layout, enum pvr_stage_allocation stage, const struct pvr_sh_reg_layout *sh_reg_layout, struct pvr_stage_allocation_descriptor_state *const descriptor_state) { const size_t const_entries_size_in_bytes = pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(); struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; struct pvr_pds_descriptor_program_input program = { 0 }; struct pvr_const_map_entry *new_entries; ASSERTED uint32_t code_size_in_dwords; uint32_t staging_buffer_size; uint32_t addr_literals = 0; uint32_t *staging_buffer; VkResult result; assert(stage != PVR_STAGE_ALLOCATION_COUNT); *pds_info = (struct pvr_pds_info){ 0 }; if (sh_reg_layout->descriptor_set_addrs_table.present) { program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){ .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE, .destination = sh_reg_layout->descriptor_set_addrs_table.offset, }; addr_literals++; } if (sh_reg_layout->push_consts.present) { program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){ .type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS, .destination = sh_reg_layout->push_consts.offset, }; addr_literals++; } if (sh_reg_layout->blend_consts.present) { program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){ .type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS, .destination = sh_reg_layout->blend_consts.offset, }; addr_literals++; } program.addr_literal_count = addr_literals; pds_info->entries = vk_alloc2(&device->vk.alloc, allocator, const_entries_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!pds_info->entries) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_free_static_consts; } pds_info->entries_size_in_bytes = const_entries_size_in_bytes; pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info); code_size_in_dwords = pds_info->code_size_in_dwords; staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords); if (!staging_buffer_size) { vk_free2(&device->vk.alloc, allocator, pds_info->entries); *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 }; return VK_SUCCESS; } staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_free_entries; } pvr_pds_generate_descriptor_upload_program(&program, staging_buffer, pds_info); assert(pds_info->code_size_in_dwords <= code_size_in_dwords); /* FIXME: use vk_realloc2() ? */ new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator, pds_info->entries, pds_info->entries_written_size_in_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!new_entries) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto err_free_staging_buffer; } pds_info->entries = new_entries; pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes; /* FIXME: Figure out the define for alignment of 16. 
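    *
    * Note that only a code segment is uploaded below (the data arguments are
    * NULL/0); the const map entries kept in pds_info describe how the
    * matching data segment is filled in later by the command buffer.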
*/ result = pvr_gpu_upload_pds(device, NULL, 0, 0, staging_buffer, pds_info->code_size_in_dwords, 16, 16, &descriptor_state->pds_code); if (result != VK_SUCCESS) goto err_free_staging_buffer; vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; err_free_staging_buffer: vk_free2(&device->vk.alloc, allocator, staging_buffer); err_free_entries: vk_free2(&device->vk.alloc, allocator, pds_info->entries); err_free_static_consts: pvr_bo_suballoc_free(descriptor_state->static_consts); return result; } static void pvr_pds_descriptor_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_stage_allocation_descriptor_state *const descriptor_state) { if (!descriptor_state) return; pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo); vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries); pvr_bo_suballoc_free(descriptor_state->static_consts); } static void pvr_pds_compute_program_setup( const struct pvr_device_info *dev_info, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, bool add_base_workgroup, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_compute_shader_program *const program) { pvr_pds_compute_shader_program_init(program); program->local_input_regs[0] = local_input_regs[0]; program->local_input_regs[1] = local_input_regs[1]; program->local_input_regs[2] = local_input_regs[2]; program->work_group_input_regs[0] = work_group_input_regs[0]; program->work_group_input_regs[1] = work_group_input_regs[1]; program->work_group_input_regs[2] = work_group_input_regs[2]; program->barrier_coefficient = barrier_coefficient; program->add_base_workgroup = add_base_workgroup; program->flattened_work_groups = true; program->kick_usc = true; STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) == PVR_WORKGROUP_DIMENSIONS); STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) == PVR_WORKGROUP_DIMENSIONS); STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) == PVR_WORKGROUP_DIMENSIONS); pvr_pds_setup_doutu(&program->usc_task_control, usc_shader_dev_addr.addr, usc_temps, ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, false); pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info); } /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged. */ static VkResult pvr_pds_compute_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_upload *const pds_upload_out, struct pvr_pds_info *const pds_info_out) { struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_pds_compute_shader_program program; uint32_t staging_buffer_size; uint32_t *staging_buffer; VkResult result; pvr_pds_compute_program_setup(dev_info, local_input_regs, work_group_input_regs, barrier_coefficient, false, usc_temps, usc_shader_dev_addr, &program); /* FIXME: According to pvr_device_init_compute_pds_program() the code size * is in bytes. Investigate this. 
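    *
    * In this function the sizes are used as dword counts: the allocation
    * goes through PVR_DW_TO_BYTES() and the data segment is generated at the
    * dword offset staging_buffer[program.code_size].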
*/ staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, staging_buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!staging_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* FIXME: pvr_pds_compute_shader doesn't implement * PDS_GENERATE_CODEDATA_SEGMENTS. */ pvr_pds_compute_shader(&program, &staging_buffer[0], PDS_GENERATE_CODE_SEGMENT, dev_info); pvr_pds_compute_shader(&program, &staging_buffer[program.code_size], PDS_GENERATE_DATA_SEGMENT, dev_info); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, &staging_buffer[program.code_size], program.data_size, 16, &staging_buffer[0], program.code_size, 16, 16, pds_upload_out); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, staging_buffer); return result; } *pds_info_out = (struct pvr_pds_info){ .temps_required = program.highest_temp, .code_size_in_dwords = program.code_size, .data_size_in_dwords = program.data_size, }; vk_free2(&device->vk.alloc, allocator, staging_buffer); return VK_SUCCESS; }; static void pvr_pds_compute_program_destroy( struct pvr_device *const device, const struct VkAllocationCallbacks *const allocator, struct pvr_pds_upload *const pds_program, struct pvr_pds_info *const pds_info) { /* We don't allocate an entries buffer so we don't need to free it */ pvr_bo_suballoc_free(pds_program->pvr_bo); } /* This only uploads the code segment. The data segment will need to be patched * with the base workgroup before uploading. */ static VkResult pvr_pds_compute_base_workgroup_variant_program_init( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], uint32_t barrier_coefficient, uint32_t usc_temps, pvr_dev_addr_t usc_shader_dev_addr, struct pvr_pds_base_workgroup_program *program_out) { struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_pds_compute_shader_program program; uint32_t buffer_size; uint32_t *buffer; VkResult result; pvr_pds_compute_program_setup(dev_info, local_input_regs, work_group_input_regs, barrier_coefficient, true, usc_temps, usc_shader_dev_addr, &program); /* FIXME: According to pvr_device_init_compute_pds_program() the code size * is in bytes. Investigate this. */ buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size)); buffer = vk_alloc2(&device->vk.alloc, allocator, buffer_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pvr_pds_compute_shader(&program, &buffer[0], PDS_GENERATE_CODE_SEGMENT, dev_info); /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, NULL, 0, 0, buffer, program.code_size, 16, 16, &program_out->code_upload); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, buffer); return result; } pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info); program_out->data_section = buffer; /* We'll need to patch the base workgroup in the PDS data section before * dispatch so we save the offsets at which to patch. We only need to save * the offset for the first workgroup id since the workgroup ids are stored * contiguously in the data segment. 
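    *
    * A rough sketch of the patching done at dispatch time (illustrative
    * only, the base_group_* names are hypothetical):
    *
    *    uint32_t *data = program_out->data_section;
    *    data[program_out->base_workgroup_data_patching_offset + 0] = base_group_x;
    *    data[program_out->base_workgroup_data_patching_offset + 1] = base_group_y;
    *    data[program_out->base_workgroup_data_patching_offset + 2] = base_group_z;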
*/ program_out->base_workgroup_data_patching_offset = program.base_workgroup_constant_offset_in_dwords[0]; program_out->info = (struct pvr_pds_info){ .temps_required = program.highest_temp, .code_size_in_dwords = program.code_size, .data_size_in_dwords = program.data_size, }; return VK_SUCCESS; } static void pvr_pds_compute_base_workgroup_variant_program_finish( struct pvr_device *device, const VkAllocationCallbacks *const allocator, struct pvr_pds_base_workgroup_program *const state) { pvr_bo_suballoc_free(state->code_upload.pvr_bo); vk_free2(&device->vk.alloc, allocator, state->data_section); } /****************************************************************************** Generic pipeline functions ******************************************************************************/ static void pvr_pipeline_init(struct pvr_device *device, enum pvr_pipeline_type type, struct pvr_pipeline *const pipeline) { assert(!pipeline->layout); vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); pipeline->type = type; } static void pvr_pipeline_finish(struct pvr_pipeline *pipeline) { vk_object_base_finish(&pipeline->base); } /* How many shared regs it takes to store a pvr_dev_addr_t. * Each shared reg is 32 bits. */ #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \ DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t)) /** * \brief Allocates shared registers. * * \return How many sh regs are required. */ static uint32_t pvr_pipeline_alloc_shareds(const struct pvr_device *device, const struct pvr_pipeline_layout *layout, enum pvr_stage_allocation stage, struct pvr_sh_reg_layout *const sh_reg_layout_out) { ASSERTED const uint64_t reserved_shared_size = device->pdevice->dev_runtime_info.reserved_shared_size; ASSERTED const uint64_t max_coeff = device->pdevice->dev_runtime_info.max_coeffs; struct pvr_sh_reg_layout reg_layout = { 0 }; uint32_t next_free_sh_reg = 0; reg_layout.descriptor_set_addrs_table.present = !!(layout->shader_stage_mask & BITFIELD_BIT(stage)); if (reg_layout.descriptor_set_addrs_table.present) { reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg; next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS; } reg_layout.push_consts.present = !!(layout->push_constants_shader_stages & BITFIELD_BIT(stage)); if (reg_layout.push_consts.present) { reg_layout.push_consts.offset = next_free_sh_reg; next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS; } *sh_reg_layout_out = reg_layout; /* FIXME: We might need to take more things into consideration. * See pvr_calc_fscommon_size_and_tiles_in_flight(). */ assert(next_free_sh_reg <= reserved_shared_size - max_coeff); return next_free_sh_reg; } /****************************************************************************** Compute pipeline functions ******************************************************************************/ /* Compiles and uploads shaders and PDS programs. 
*/ static VkResult pvr_compute_pipeline_compile( struct pvr_device *const device, struct vk_pipeline_cache *cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *const allocator, struct pvr_compute_pipeline *const compute_pipeline) { struct pvr_pipeline_layout *layout = compute_pipeline->base.layout; struct pvr_sh_reg_layout *sh_reg_layout = &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE]; uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS]; uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS]; uint32_t barrier_coefficient; uint32_t usc_temps; uint32_t sh_count; VkResult result; sh_count = pvr_pipeline_alloc_shareds(device, layout, PVR_STAGE_ALLOCATION_COMPUTE, sh_reg_layout); compute_pipeline->shader_state.const_shared_reg_count = sh_count; /* FIXME: Compile and upload the shader. */ /* FIXME: Initialize the shader state and setup build info. */ unreachable("finishme: compute support"); result = pvr_pds_descriptor_program_create_and_upload( device, allocator, layout, PVR_STAGE_ALLOCATION_COMPUTE, sh_reg_layout, &compute_pipeline->descriptor_state); if (result != VK_SUCCESS) goto err_free_shader; result = pvr_pds_compute_program_create_and_upload( device, allocator, local_input_regs, work_group_input_regs, barrier_coefficient, usc_temps, compute_pipeline->shader_state.bo->dev_addr, &compute_pipeline->primary_program, &compute_pipeline->primary_program_info); if (result != VK_SUCCESS) goto err_free_descriptor_program; /* If the workgroup ID is required, then we require the base workgroup * variant of the PDS compute program as well. */ compute_pipeline->flags.base_workgroup = work_group_input_regs[0] != PVR_PDS_REG_UNUSED || work_group_input_regs[1] != PVR_PDS_REG_UNUSED || work_group_input_regs[2] != PVR_PDS_REG_UNUSED; if (compute_pipeline->flags.base_workgroup) { result = pvr_pds_compute_base_workgroup_variant_program_init( device, allocator, local_input_regs, work_group_input_regs, barrier_coefficient, usc_temps, compute_pipeline->shader_state.bo->dev_addr, &compute_pipeline->primary_base_workgroup_variant_program); if (result != VK_SUCCESS) goto err_destroy_compute_program; } return VK_SUCCESS; err_destroy_compute_program: pvr_pds_compute_program_destroy(device, allocator, &compute_pipeline->primary_program, &compute_pipeline->primary_program_info); err_free_descriptor_program: pvr_pds_descriptor_program_destroy(device, allocator, &compute_pipeline->descriptor_state); err_free_shader: pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); return result; } static VkResult pvr_compute_pipeline_init(struct pvr_device *device, struct vk_pipeline_cache *cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, struct pvr_compute_pipeline *compute_pipeline) { VkResult result; pvr_pipeline_init(device, PVR_PIPELINE_TYPE_COMPUTE, &compute_pipeline->base); compute_pipeline->base.layout = pvr_pipeline_layout_from_handle(pCreateInfo->layout); result = pvr_compute_pipeline_compile(device, cache, pCreateInfo, allocator, compute_pipeline); if (result != VK_SUCCESS) { pvr_pipeline_finish(&compute_pipeline->base); return result; } return VK_SUCCESS; } static VkResult pvr_compute_pipeline_create(struct pvr_device *device, struct vk_pipeline_cache *cache, const VkComputePipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *allocator, VkPipeline *const pipeline_out) { struct pvr_compute_pipeline *compute_pipeline; VkResult result; compute_pipeline = vk_zalloc2(&device->vk.alloc, allocator, 
sizeof(*compute_pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (!compute_pipeline) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* Compiles and uploads shaders and PDS programs. */ result = pvr_compute_pipeline_init(device, cache, pCreateInfo, allocator, compute_pipeline); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, allocator, compute_pipeline); return result; } *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base); return VK_SUCCESS; } static void pvr_compute_pipeline_destroy( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_compute_pipeline *const compute_pipeline) { if (compute_pipeline->flags.base_workgroup) { pvr_pds_compute_base_workgroup_variant_program_finish( device, allocator, &compute_pipeline->primary_base_workgroup_variant_program); } pvr_pds_compute_program_destroy(device, allocator, &compute_pipeline->primary_program, &compute_pipeline->primary_program_info); pvr_pds_descriptor_program_destroy(device, allocator, &compute_pipeline->descriptor_state); pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); pvr_pipeline_finish(&compute_pipeline->base); vk_free2(&device->vk.alloc, allocator, compute_pipeline); } VkResult pvr_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t createInfoCount, const VkComputePipelineCreateInfo *pCreateInfos, const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) { VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache); PVR_FROM_HANDLE(pvr_device, device, _device); VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < createInfoCount; i++) { const VkResult local_result = pvr_compute_pipeline_create(device, cache, &pCreateInfos[i], pAllocator, &pPipelines[i]); if (local_result != VK_SUCCESS) { result = local_result; pPipelines[i] = VK_NULL_HANDLE; } } return result; } /****************************************************************************** Graphics pipeline functions ******************************************************************************/ static void pvr_graphics_pipeline_destroy(struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_graphics_pipeline *const gfx_pipeline) { const uint32_t num_vertex_attrib_programs = ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs); pvr_pds_descriptor_program_destroy( device, allocator, &gfx_pipeline->shader_state.fragment.descriptor_state); pvr_pds_descriptor_program_destroy( device, allocator, &gfx_pipeline->shader_state.vertex.descriptor_state); for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) { struct pvr_pds_attrib_program *const attrib_program = &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i]; pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); } pvr_bo_suballoc_free( gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo); pvr_bo_suballoc_free( gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo); pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo); pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo); pvr_pipeline_finish(&gfx_pipeline->base); vk_free2(&device->vk.alloc, allocator, gfx_pipeline); } static void pvr_vertex_state_save(struct pvr_graphics_pipeline *gfx_pipeline, pco_shader *vs) { struct pvr_vertex_shader_state *vertex_state = &gfx_pipeline->shader_state.vertex; const pco_data *shader_data = pco_shader_data(vs); memcpy(&gfx_pipeline->vs_data, shader_data, sizeof(*shader_data)); /* This ends up unused since we'll use the temp_usage for the 
PDS program we * end up selecting, and the descriptor PDS program doesn't use any temps. * Let's set it to ~0 in case it ever gets used. */ vertex_state->stage_state.pds_temps_count = ~0; } static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline, pco_shader *fs) { struct pvr_fragment_shader_state *fragment_state = &gfx_pipeline->shader_state.fragment; const pco_data *shader_data = pco_shader_data(fs); memcpy(&gfx_pipeline->fs_data, shader_data, sizeof(*shader_data)); /* TODO: add selection for other values of pass type and sample rate. */ fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE; fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE; /* We can't initialize it yet since we still need to generate the PDS * programs so set it to `~0` to make sure that we set this up later on. */ fragment_state->stage_state.pds_temps_count = ~0; } static bool pvr_blend_factor_requires_consts(VkBlendFactor factor) { switch (factor) { case VK_BLEND_FACTOR_CONSTANT_COLOR: case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR: case VK_BLEND_FACTOR_CONSTANT_ALPHA: case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA: return true; default: return false; } } /** * \brief Indicates whether dynamic blend constants are needed. * * If the user has specified the blend constants to be dynamic, they might not * necessarily be using them. This function makes sure that they are being used * in order to determine whether we need to upload them later on for the shader * to access them. */ static bool pvr_graphics_pipeline_requires_dynamic_blend_consts( const struct pvr_graphics_pipeline *gfx_pipeline) { const struct vk_dynamic_graphics_state *const state = &gfx_pipeline->dynamic_state; if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) return false; for (uint32_t i = 0; i < state->cb.attachment_count; i++) { const struct vk_color_blend_attachment_state *attachment = &state->cb.attachments[i]; const bool has_color_write = attachment->write_mask & (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT); const bool has_alpha_write = attachment->write_mask & VK_COLOR_COMPONENT_A_BIT; if (!attachment->blend_enable || attachment->write_mask == 0) continue; if (has_color_write) { const uint8_t src_color_blend_factor = attachment->src_color_blend_factor; const uint8_t dst_color_blend_factor = attachment->dst_color_blend_factor; if (pvr_blend_factor_requires_consts(src_color_blend_factor) || pvr_blend_factor_requires_consts(dst_color_blend_factor)) { return true; } } if (has_alpha_write) { const uint8_t src_alpha_blend_factor = attachment->src_alpha_blend_factor; const uint8_t dst_alpha_blend_factor = attachment->dst_alpha_blend_factor; if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) || pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) { return true; } } } return false; } static uint32_t pvr_graphics_pipeline_alloc_shareds( const struct pvr_device *device, const struct pvr_graphics_pipeline *gfx_pipeline, enum pvr_stage_allocation stage, struct pvr_sh_reg_layout *const sh_reg_layout_out) { ASSERTED const uint64_t reserved_shared_size = device->pdevice->dev_runtime_info.reserved_shared_size; ASSERTED const uint64_t max_coeff = device->pdevice->dev_runtime_info.max_coeffs; const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout; struct pvr_sh_reg_layout reg_layout = { 0 }; uint32_t next_free_sh_reg = 0; next_free_sh_reg = pvr_pipeline_alloc_shareds(device, layout, stage, ®_layout); reg_layout.blend_consts.present = (stage == 
PVR_STAGE_ALLOCATION_FRAGMENT && pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline)); if (reg_layout.blend_consts.present) { reg_layout.blend_consts.offset = next_free_sh_reg; next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS; } *sh_reg_layout_out = reg_layout; /* FIXME: We might need to take more things into consideration. * See pvr_calc_fscommon_size_and_tiles_in_flight(). */ assert(next_free_sh_reg <= reserved_shared_size - max_coeff); return next_free_sh_reg; } #undef PVR_DEV_ADDR_SIZE_IN_SH_REGS static void pvr_graphics_pipeline_setup_vertex_dma( pco_shader *vs, const VkPipelineVertexInputStateCreateInfo *const vertex_input_state, struct pvr_pds_vertex_dma *const dma_descriptions, uint32_t *const dma_count) { pco_vs_data *vs_data = &pco_shader_data(vs)->vs; const VkVertexInputBindingDescription *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 }; const VkVertexInputAttributeDescription *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 }; /* Vertex attributes map to the `layout(location = x)` annotation in the * shader where `x` is the attribute's location. * Vertex bindings have NO relation to the shader. They have nothing to do * with the `layout(set = x, binding = y)` notation. They instead indicate * where the data for a collection of vertex attributes comes from. The * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific * binding number and based on that we'll know which buffer to DMA the data * from, to fill in the collection of vertex attributes. */ for (uint32_t i = 0; i < vertex_input_state->vertexBindingDescriptionCount; i++) { const VkVertexInputBindingDescription *binding_desc = &vertex_input_state->pVertexBindingDescriptions[i]; sorted_bindings[binding_desc->binding] = binding_desc; } for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount; i++) { const VkVertexInputAttributeDescription *attribute_desc = &vertex_input_state->pVertexAttributeDescriptions[i]; sorted_attributes[attribute_desc->location] = attribute_desc; } for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount; i++) { const VkVertexInputAttributeDescription *attribute = sorted_attributes[i]; if (!attribute) continue; gl_vert_attrib location = attribute->location + VERT_ATTRIB_GENERIC0; const VkVertexInputBindingDescription *binding = sorted_bindings[attribute->binding]; struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[*dma_count]; const struct util_format_description *fmt_description = vk_format_description(attribute->format); const pco_range *attrib_range = &vs_data->attribs[location]; /* Skip unused attributes. */ if (!attrib_range->count) continue; /* DMA setup. */ /* The PDS program sets up DDMADs to DMA attributes into vtxin regs. * * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address). * DMA source addr = src0 * src1 + src2 * DMA params = src3 * * In the PDS program we setup src0 with the binding's stride and src1 * with either the instance id or vertex id (both of which get filled by * the hardware). We setup src2 later on once we know which VkBuffer to * DMA the data from so it's saved for later when we patch the data * section. */ /* TODO: Right now we're setting up a DMA per attribute. In a case where * there are multiple attributes packed into a single binding with * adjacent locations we'd still be DMAing them separately. This is not * great so the DMA setup should be smarter and could do with some * optimization. 
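       *
       * For example (hypothetical case), two R32G32B32A32_SFLOAT attributes
       * sharing one binding at offsets 0 and 16 currently end up as two
       * separate 4-dword DMAs, where a single 8-dword DMA from offset 0
       * would do.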
*/ *dma_desc = (struct pvr_pds_vertex_dma){ 0 }; /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation * this corresponds to `attribDesc.offset`. * The PDS program doesn't do anything with it but just save it in the * PDS program entry. */ dma_desc->offset = attribute->offset; /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation * this corresponds to `bindingDesc.stride`. * The PDS program will calculate the `effectiveVertexOffset` with this * and add it to the address provided in the patched data segment. */ dma_desc->stride = binding->stride; if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE; else dma_desc->flags = 0; /* Size to DMA per vertex attribute. Used to setup src3 in the DDMAD. */ /* TODO: what if not all components are used */ assert(attrib_range->count == fmt_description->block.bits / 32); dma_desc->size_in_dwords = attrib_range->count; /* Vtxin reg offset to start DMAing into. */ dma_desc->destination = attrib_range->start; /* Will be used by the driver to figure out buffer address to patch in the * data section. I.e. which binding we should DMA from. */ dma_desc->binding_index = attribute->binding; /* We don't currently support VK_EXT_vertex_attribute_divisor so no * repeating of instance-rate vertex attributes needed. We should always * move on to the next vertex attribute. */ assert(binding->inputRate != VK_VERTEX_INPUT_RATE_INSTANCE); dma_desc->divisor = 1; /* Will be used to generate PDS code that takes care of robust buffer * access, and later on by the driver to write the correct robustness * buffer address to DMA the fallback values from. */ dma_desc->robustness_buffer_offset = pvr_get_robustness_buffer_format_offset(attribute->format); /* Used by later on by the driver to figure out if the buffer is being * accessed out of bounds, for robust buffer access. */ dma_desc->component_size_in_bytes = fmt_description->block.bits / fmt_description->nr_channels / 8; ++*dma_count; } } static void pvr_graphics_pipeline_setup_fragment_coeff_program( pco_fs_data *fs_data, pco_vs_data *vs_data, nir_shader *fs, struct pvr_pds_coeff_loading_program *frag_coeff_program) { uint64_t varyings_used = fs->info.inputs_read & BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING); unsigned fpu = 0; unsigned dest = 0; if (fs_data->uses.z) { pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], PDSINST_DOUT_FIELDS_DOUTI_SRC, douti_src) { /* TODO: define instead of sizeof(uint16_t). */ douti_src.f32_offset = fs_data->uses.w ? 
1 * sizeof(uint16_t) : 0; douti_src.f16_offset = douti_src.f32_offset; douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D; } frag_coeff_program->destination[fpu++] = dest++; } if (fs_data->uses.w) { pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], PDSINST_DOUT_FIELDS_DOUTI_SRC, douti_src) { douti_src.f32_offset = 0; douti_src.f16_offset = douti_src.f32_offset; douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D; } frag_coeff_program->destination[fpu++] = dest++; } if (fs_data->uses.pntc) { pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], PDSINST_DOUT_FIELDS_DOUTI_SRC, douti_src) { douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_2D; douti_src.pointsprite = true; } frag_coeff_program->destination[fpu++] = dest; dest += 2; } u_foreach_bit64 (varying, varyings_used) { nir_variable *var = nir_find_variable_with_location(fs, nir_var_shader_in, varying); assert(var); pco_range *cf_range = &fs_data->varyings[varying]; assert(cf_range->count > 0); assert(!(cf_range->start % ROGUE_USC_COEFFICIENT_SET_SIZE)); assert(!(cf_range->count % ROGUE_USC_COEFFICIENT_SET_SIZE)); pco_range *vtxout_range = &vs_data->varyings[varying]; assert(vtxout_range->count > 0); assert(vtxout_range->start >= 4); assert(vtxout_range->count == cf_range->count / ROGUE_USC_COEFFICIENT_SET_SIZE); unsigned count = vtxout_range->count; unsigned vtxout = vtxout_range->start; /* pos.x, pos.y unused. */ vtxout -= 2; /* pos.z unused. */ if (!fs_data->uses.z) vtxout -= 1; /* pos.w unused. */ if (!fs_data->uses.w) vtxout -= 1; pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu], PDSINST_DOUT_FIELDS_DOUTI_SRC, douti_src) { /* TODO: define instead of sizeof(uint16_t). */ douti_src.f32_offset = vtxout * sizeof(uint16_t); /* TODO: f16 support. */ douti_src.f16 = false; douti_src.f16_offset = douti_src.f32_offset; switch (var->data.interpolation) { case INTERP_MODE_SMOOTH: douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; douti_src.perspective = true; break; case INTERP_MODE_NOPERSPECTIVE: douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD; break; case INTERP_MODE_FLAT: /* TODO: triangle fan, provoking vertex last. 
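          *
          * FLAT_VERTEX0 presumably sources the attribute from vertex 0 of
          * the primitive, which matches Vulkan's default first-vertex
          * provoking convention for lists and strips; triangle fans and
          * last-vertex provoking mode would need a different vertex
          * selection here, hence the TODO.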
*/ douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0; break; default: unreachable("Unimplemented interpolation type."); } douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D + count - 1; } frag_coeff_program->destination[fpu++] = cf_range->start / ROGUE_USC_COEFFICIENT_SET_SIZE; } frag_coeff_program->num_fpu_iterators = fpu; } static void set_var(pco_range *allocation_list, unsigned to, nir_variable *var, unsigned dwords_each) { unsigned slots = glsl_count_dword_slots(var->type, false); allocation_list[var->data.location] = (pco_range){ .start = to, .count = slots * dwords_each, }; } static void allocate_var(pco_range *allocation_list, unsigned *counter, nir_variable *var, unsigned dwords_each) { unsigned slots = glsl_count_dword_slots(var->type, false); allocation_list[var->data.location] = (pco_range){ .start = *counter, .count = slots * dwords_each, }; *counter += slots * dwords_each; } static void try_allocate_var(pco_range *allocation_list, unsigned *counter, nir_shader *nir, uint64_t bitset, nir_variable_mode mode, int location, unsigned dwords_each) { nir_variable *var = nir_find_variable_with_location(nir, mode, location); if (!(bitset & BITFIELD64_BIT(location))) return; assert(var); allocate_var(allocation_list, counter, var, dwords_each); } static void try_allocate_vars(pco_range *allocation_list, unsigned *counter, nir_shader *nir, uint64_t *bitset, nir_variable_mode mode, bool f16, enum glsl_interp_mode interp_mode, unsigned dwords_each) { uint64_t skipped = 0; while (*bitset) { int location = u_bit_scan64(bitset); nir_variable *var = nir_find_variable_with_location(nir, mode, location); assert(var); if (glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)) != f16 || var->data.interpolation != interp_mode) { skipped |= BITFIELD64_BIT(location); continue; } allocate_var(allocation_list, counter, var, dwords_each); } *bitset |= skipped; } static void allocate_val(pco_range *allocation_list, unsigned *counter, unsigned location, unsigned dwords_each) { allocation_list[location] = (pco_range){ .start = *counter, .count = dwords_each, }; *counter += dwords_each; } static void pvr_alloc_vs_sysvals(pco_data *data, nir_shader *nir) { BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX); BITSET_COPY(system_values_read, nir->info.system_values_read); gl_system_value sys_vals[] = { SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_INSTANCE_ID, SYSTEM_VALUE_BASE_INSTANCE, SYSTEM_VALUE_BASE_VERTEX, SYSTEM_VALUE_DRAW_ID, }; for (unsigned u = 0; u < ARRAY_SIZE(sys_vals); ++u) { if (BITSET_TEST(system_values_read, sys_vals[u])) { allocate_val(data->common.sys_vals, &data->common.vtxins, sys_vals[u], 1); BITSET_CLEAR(system_values_read, sys_vals[u]); } } assert(BITSET_IS_EMPTY(system_values_read)); } static void pvr_init_vs_attribs( pco_data *data, const VkPipelineVertexInputStateCreateInfo *const vertex_input_state) { for (unsigned u = 0; u < vertex_input_state->vertexAttributeDescriptionCount; ++u) { const VkVertexInputAttributeDescription *attrib = &vertex_input_state->pVertexAttributeDescriptions[u]; gl_vert_attrib location = attrib->location + VERT_ATTRIB_GENERIC0; data->vs.attrib_formats[location] = vk_format_to_pipe_format(attrib->format); } } static void pvr_alloc_vs_attribs(pco_data *data, nir_shader *nir) { /* TODO NEXT: this should be based on the format size. 
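    *
    * i.e. the loop below currently reserves one vtxin dword per dword slot
    * of the shader-declared input type, regardless of the size of the
    * VkFormat actually bound to the attribute.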
*/ nir_foreach_shader_in_variable (var, nir) { allocate_var(data->vs.attribs, &data->common.vtxins, var, 1); } } static void pvr_alloc_vs_varyings(pco_data *data, nir_shader *nir) { uint64_t vars_mask = nir->info.outputs_written & BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING); /* Output position must be present. */ assert(nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS)); /* Varying ordering is specific. */ try_allocate_var(data->vs.varyings, &data->vs.vtxouts, nir, nir->info.outputs_written, nir_var_shader_out, VARYING_SLOT_POS, 1); /* Save varying counts. */ u_foreach_bit64 (location, vars_mask) { nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out, location); assert(var); /* TODO: f16 support. */ bool f16 = glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)); assert(!f16); unsigned components = glsl_get_components(var->type); switch (var->data.interpolation) { case INTERP_MODE_SMOOTH: if (f16) data->vs.f16_smooth += components; else data->vs.f32_smooth += components; break; case INTERP_MODE_FLAT: if (f16) data->vs.f16_flat += components; else data->vs.f32_flat += components; break; case INTERP_MODE_NOPERSPECTIVE: if (f16) data->vs.f16_npc += components; else data->vs.f32_npc += components; break; default: unreachable(); } } for (unsigned f16 = 0; f16 <= 1; ++f16) { for (enum glsl_interp_mode interp_mode = INTERP_MODE_SMOOTH; interp_mode <= INTERP_MODE_NOPERSPECTIVE; ++interp_mode) { try_allocate_vars(data->vs.varyings, &data->vs.vtxouts, nir, &vars_mask, nir_var_shader_out, f16, interp_mode, 1); } } assert(!vars_mask); const gl_varying_slot last_slots[] = { VARYING_SLOT_PSIZ, VARYING_SLOT_VIEWPORT, VARYING_SLOT_LAYER, }; for (unsigned u = 0; u < ARRAY_SIZE(last_slots); ++u) { try_allocate_var(data->vs.varyings, &data->vs.vtxouts, nir, nir->info.outputs_written, nir_var_shader_out, last_slots[u], 1); } } static void pvr_alloc_fs_sysvals(pco_data *data, nir_shader *nir) { /* TODO */ } static void pvr_alloc_fs_varyings(pco_data *data, nir_shader *nir) { assert(!data->common.coeffs); /* Save the z/w locations. */ unsigned zw_count = !!data->fs.uses.z + !!data->fs.uses.w; allocate_val(data->fs.varyings, &data->common.coeffs, VARYING_SLOT_POS, zw_count * ROGUE_USC_COEFFICIENT_SET_SIZE); /* If point coords are used, they come after z/w (if present). */ nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_in, VARYING_SLOT_PNTC); if (var) { assert(!var->data.location_frac); unsigned count = glsl_get_components(var->type); assert(count == 2); allocate_var(data->fs.varyings, &data->common.coeffs, var, ROGUE_USC_COEFFICIENT_SET_SIZE); data->fs.uses.pntc = true; } /* Allocate the rest of the input varyings. */ nir_foreach_shader_in_variable (var, nir) { /* Already handled. */ if (var->data.location == VARYING_SLOT_POS || var->data.location == VARYING_SLOT_PNTC) continue; allocate_var(data->fs.varyings, &data->common.coeffs, var, ROGUE_USC_COEFFICIENT_SET_SIZE); } } static void pvr_init_fs_outputs(pco_data *data, const struct pvr_render_pass *pass, const struct pvr_render_subpass *const subpass, const struct pvr_renderpass_hwsetup_subpass *hw_subpass) { for (unsigned u = 0; u < subpass->color_count; ++u) { unsigned idx = subpass->color_attachments[u]; if (idx == VK_ATTACHMENT_UNUSED) continue; gl_frag_result location = FRAG_RESULT_DATA0 + u; VkFormat vk_format = pass->attachments[idx].vk_format; data->fs.output_formats[location] = vk_format_to_pipe_format(vk_format); } /* TODO: z-replicate. 
*/ } static void pvr_setup_fs_outputs(pco_data *data, nir_shader *nir, const struct pvr_render_subpass *const subpass, const struct pvr_renderpass_hwsetup_subpass *hw_subpass) { ASSERTED unsigned num_outputs = hw_subpass->setup.num_render_targets; assert(num_outputs == subpass->color_count); uint64_t outputs_written = nir->info.outputs_written; assert(util_bitcount64(outputs_written) == num_outputs); for (unsigned u = 0; u < subpass->color_count; ++u) { gl_frag_result location = FRAG_RESULT_DATA0 + u; unsigned idx = subpass->color_attachments[u]; const struct usc_mrt_resource *mrt_resource; ASSERTED bool output_reg; enum pipe_format format; unsigned format_bits; nir_variable *var; if (idx == VK_ATTACHMENT_UNUSED) continue; assert(u == idx); /* TODO: not sure if this is true or not... */ mrt_resource = &hw_subpass->setup.mrt_resources[u]; output_reg = mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG; assert(output_reg); /* TODO: tile buffer support. */ var = nir_find_variable_with_location(nir, nir_var_shader_out, location); assert(var); format = data->fs.output_formats[location]; format_bits = util_format_get_blocksizebits(format); /* TODO: other sized formats. */ assert(!(format_bits % 32)); assert(mrt_resource->intermediate_size == format_bits / 8); set_var(data->fs.outputs, mrt_resource->reg.output_reg, var, format_bits / 32); data->fs.output_reg[location] = output_reg; outputs_written &= ~BITFIELD64_BIT(location); } /* TODO: z-replicate. */ assert(!outputs_written); } static void pvr_init_fs_input_attachments( pco_data *data, const struct pvr_render_subpass *const subpass, const struct pvr_renderpass_hwsetup_subpass *hw_subpass) { pvr_finishme("pvr_init_fs_input_attachments"); } static void pvr_setup_fs_input_attachments( pco_data *data, nir_shader *nir, const struct pvr_render_subpass *const subpass, const struct pvr_renderpass_hwsetup_subpass *hw_subpass) { pvr_finishme("pvr_setup_fs_input_attachments"); } static void pvr_preprocess_shader_data(pco_data *data, nir_shader *nir, const VkGraphicsPipelineCreateInfo *pCreateInfo) { switch (nir->info.stage) { case MESA_SHADER_VERTEX: { const VkPipelineVertexInputStateCreateInfo *const vertex_input_state = pCreateInfo->pVertexInputState; pvr_init_vs_attribs(data, vertex_input_state); break; } case MESA_SHADER_FRAGMENT: { PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass); const struct pvr_render_subpass *const subpass = &pass->subpasses[pCreateInfo->subpass]; const struct pvr_renderpass_hw_map *subpass_map = &pass->hw_setup->subpass_map[pCreateInfo->subpass]; const struct pvr_renderpass_hwsetup_subpass *hw_subpass = &pass->hw_setup->renders[subpass_map->render] .subpasses[subpass_map->subpass]; pvr_init_fs_outputs(data, pass, subpass, hw_subpass); pvr_init_fs_input_attachments(data, subpass, hw_subpass); /* TODO: push consts, blend consts, dynamic state, etc. */ break; } default: unreachable(); } /* TODO: common things, like large constants being put into shareds. 
static void pvr_postprocess_shader_data(
   pco_data *data,
   nir_shader *nir,
   const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX: {
      pvr_alloc_vs_sysvals(data, nir);
      pvr_alloc_vs_attribs(data, nir);
      pvr_alloc_vs_varyings(data, nir);
      break;
   }

   case MESA_SHADER_FRAGMENT: {
      PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
      const struct pvr_render_subpass *const subpass =
         &pass->subpasses[pCreateInfo->subpass];
      const struct pvr_renderpass_hw_map *subpass_map =
         &pass->hw_setup->subpass_map[pCreateInfo->subpass];
      const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
         &pass->hw_setup->renders[subpass_map->render]
             .subpasses[subpass_map->subpass];

      pvr_alloc_fs_sysvals(data, nir);
      pvr_alloc_fs_varyings(data, nir);
      pvr_setup_fs_outputs(data, nir, subpass, hw_subpass);
      pvr_setup_fs_input_attachments(data, nir, subpass, hw_subpass);

      /* TODO: push consts, blend consts, dynamic state, etc. */
      break;
   }

   default:
      unreachable();
   }

   /* TODO: common things, like large constants being put into shareds. */
}

/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_graphics_pipeline_compile(
   struct pvr_device *const device,
   struct vk_pipeline_cache *cache,
   const VkGraphicsPipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *const allocator,
   struct pvr_graphics_pipeline *const gfx_pipeline)
{
   struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
   struct pvr_sh_reg_layout *sh_reg_layout_vert =
      &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
   struct pvr_sh_reg_layout *sh_reg_layout_frag =
      &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   VkResult result;

   struct pvr_vertex_shader_state *vertex_state =
      &gfx_pipeline->shader_state.vertex;
   struct pvr_fragment_shader_state *fragment_state =
      &gfx_pipeline->shader_state.fragment;

   pco_ctx *pco_ctx = device->pdevice->pco_ctx;
   const struct spirv_to_nir_options *spirv_options =
      pco_spirv_options(pco_ctx);
   const nir_shader_compiler_options *nir_options = pco_nir_options(pco_ctx);

   nir_shader *producer = NULL;
   nir_shader *consumer = NULL;

   pco_data shader_data[MESA_SHADER_STAGES] = { 0 };
   nir_shader *nir_shaders[MESA_SHADER_STAGES] = { 0 };
   pco_shader *pco_shaders[MESA_SHADER_STAGES] = { 0 };
   pco_shader **vs = &pco_shaders[MESA_SHADER_VERTEX];
   pco_shader **fs = &pco_shaders[MESA_SHADER_FRAGMENT];
   void *shader_mem_ctx = ralloc_context(NULL);

   struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
   uint32_t vtx_dma_count = 0;
   struct pvr_pds_coeff_loading_program frag_coeff_program = { 0 };
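
   /* Convert each active stage from SPIR-V to NIR and run the PCO
    * preprocessing passes on it.
    */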
   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      size_t stage_index = gfx_pipeline->stage_indices[stage];

      /* Skip unused/inactive stages. */
      if (stage_index == ~0)
         continue;

      result = vk_pipeline_shader_stage_to_nir(&device->vk,
                                               gfx_pipeline->base.pipeline_flags,
                                               &pCreateInfo->pStages[stage_index],
                                               spirv_options,
                                               nir_options,
                                               shader_mem_ctx,
                                               &nir_shaders[stage]);
      if (result != VK_SUCCESS)
         goto err_free_build_context;

      pco_preprocess_nir(pco_ctx, nir_shaders[stage]);
   }

   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      if (!nir_shaders[stage])
         continue;

      if (producer)
         pco_link_nir(pco_ctx, producer, nir_shaders[stage]);

      producer = nir_shaders[stage];
   }

   for (gl_shader_stage stage = MESA_SHADER_STAGES; stage-- > 0;) {
      if (!nir_shaders[stage])
         continue;

      if (consumer)
         pco_rev_link_nir(pco_ctx, nir_shaders[stage], consumer);

      consumer = nir_shaders[stage];
   }

   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      if (!nir_shaders[stage])
         continue;

      pvr_preprocess_shader_data(&shader_data[stage],
                                 nir_shaders[stage],
                                 pCreateInfo);

      pco_lower_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
      pvr_lower_nir(pco_ctx, layout, nir_shaders[stage]);
      pco_postprocess_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);

      pvr_postprocess_shader_data(&shader_data[stage],
                                  nir_shaders[stage],
                                  pCreateInfo);
   }

   /* TODO NEXT: set up shareds for descriptors, here or in
    * pvr_{pre,post}process_shader_data.
    */
   memset(sh_reg_layout_vert, 0, sizeof(*sh_reg_layout_vert));
   memset(sh_reg_layout_frag, 0, sizeof(*sh_reg_layout_frag));

   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      pco_shader **pco = &pco_shaders[stage];

      /* Skip unused/inactive stages. */
      if (!nir_shaders[stage])
         continue;

      *pco = pco_trans_nir(pco_ctx,
                           nir_shaders[stage],
                           &shader_data[stage],
                           shader_mem_ctx);
      if (!*pco) {
         result = VK_ERROR_INITIALIZATION_FAILED;
         goto err_free_build_context;
      }

      pco_process_ir(pco_ctx, *pco);
      pco_encode_ir(pco_ctx, *pco);
      pco_shader_finalize(pco_ctx, *pco);
   }

   pvr_graphics_pipeline_setup_vertex_dma(*vs,
                                          pCreateInfo->pVertexInputState,
                                          vtx_dma_descriptions,
                                          &vtx_dma_count);

   pvr_vertex_state_save(gfx_pipeline, *vs);

   result = pvr_gpu_upload_usc(
      device,
      pco_shader_binary_data(pco_shaders[MESA_SHADER_VERTEX]),
      pco_shader_binary_size(pco_shaders[MESA_SHADER_VERTEX]),
      cache_line_size,
      &vertex_state->bo);
   if (result != VK_SUCCESS)
      goto err_free_build_context;

   if (pco_shaders[MESA_SHADER_FRAGMENT]) {
      pvr_graphics_pipeline_setup_fragment_coeff_program(
         &pco_shader_data(pco_shaders[MESA_SHADER_FRAGMENT])->fs,
         &pco_shader_data(pco_shaders[MESA_SHADER_VERTEX])->vs,
         nir_shaders[MESA_SHADER_FRAGMENT],
         &frag_coeff_program);

      pvr_fragment_state_save(gfx_pipeline, *fs);

      result = pvr_gpu_upload_usc(
         device,
         pco_shader_binary_data(pco_shaders[MESA_SHADER_FRAGMENT]),
         pco_shader_binary_size(pco_shaders[MESA_SHADER_FRAGMENT]),
         cache_line_size,
         &fragment_state->bo);
      if (result != VK_SUCCESS)
         goto err_free_vertex_bo;

      /* TODO: powervr has an optimization where it attempts to recompile
       * shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
       * since in our case the optimization doesn't happen.
       */
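
      /* Create and upload the PDS programs for the fragment stage: the
       * coefficient loading program, the fragment (USC kick) program and the
       * descriptor program.
       */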
      result = pvr_pds_coeff_program_create_and_upload(device,
                                                       allocator,
                                                       &frag_coeff_program,
                                                       fragment_state);
      if (result != VK_SUCCESS)
         goto err_free_fragment_bo;

      result = pvr_pds_fragment_program_create_and_upload(device,
                                                          allocator,
                                                          *fs,
                                                          fragment_state);
      if (result != VK_SUCCESS)
         goto err_free_coeff_program;

      result = pvr_pds_descriptor_program_create_and_upload(
         device,
         allocator,
         layout,
         PVR_STAGE_ALLOCATION_FRAGMENT,
         sh_reg_layout_frag,
         &fragment_state->descriptor_state);
      if (result != VK_SUCCESS)
         goto err_free_frag_program;

      /* The fragment descriptor program is not expected to require any temps;
       * if that ever changes we need to MAX2() and set
       * `fragment_state->stage_state.pds_temps_count` appropriately.
       */
      assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
   }

   result = pvr_pds_vertex_attrib_programs_create_and_upload(
      device,
      allocator,
      pco_shader_data(pco_shaders[MESA_SHADER_VERTEX]),
      vtx_dma_descriptions,
      vtx_dma_count,
      &vertex_state->pds_attrib_programs);
   if (result != VK_SUCCESS)
      goto err_free_frag_descriptor_program;

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      layout,
      PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
      sh_reg_layout_vert,
      &vertex_state->descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_attrib_program;

   /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
    * scratch buffer for both vertex and fragment stage.
    * Figure out the best place to do this.
    */
   /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
   /* TODO: Implement spilling with the above. */

   ralloc_free(shader_mem_ctx);

   return VK_SUCCESS;

err_free_vertex_attrib_program:
   for (uint32_t i = 0; i < ARRAY_SIZE(vertex_state->pds_attrib_programs);
        i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &vertex_state->pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }

err_free_frag_descriptor_program:
   pvr_pds_descriptor_program_destroy(device,
                                      allocator,
                                      &fragment_state->descriptor_state);

err_free_frag_program:
   pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo);

err_free_coeff_program:
   pvr_bo_suballoc_free(fragment_state->pds_coeff_program.pvr_bo);

err_free_fragment_bo:
   pvr_bo_suballoc_free(fragment_state->bo);

err_free_vertex_bo:
   pvr_bo_suballoc_free(vertex_state->bo);

err_free_build_context:
   ralloc_free(shader_mem_ctx);

   return result;
}
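
/* Builds the vk_render_pass_state consumed by the runtime graphics pipeline
 * state helpers, with attachment flags derived from the subpass' color and
 * depth/stencil attachments.
 */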
static struct vk_render_pass_state
pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
{
   PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
   const struct pvr_render_subpass *const subpass =
      &pass->subpasses[info->subpass];

   enum vk_rp_attachment_flags attachments = 0;

   assert(info->subpass < pass->subpass_count);

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      if (pass->attachments[subpass->color_attachments[i]].aspects)
         attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
   }

   if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
      VkImageAspectFlags ds_aspects =
         pass->attachments[subpass->depth_stencil_attachment].aspects;

      if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
         attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;

      if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
         attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
   }

   return (struct vk_render_pass_state){
      .attachments = attachments,

      /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core
       * 1.2), which is not currently supported.
       */
      .view_mask = 0,
   };
}

static VkResult pvr_graphics_pipeline_init(
   struct pvr_device *device,
   struct vk_pipeline_cache *cache,
   const VkGraphicsPipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *allocator,
   struct pvr_graphics_pipeline *gfx_pipeline)
{
   struct vk_dynamic_graphics_state *const dynamic_state =
      &gfx_pipeline->dynamic_state;
   const struct vk_render_pass_state rp_state =
      pvr_create_renderpass_state(pCreateInfo);

   struct vk_graphics_pipeline_all_state all_state;
   struct vk_graphics_pipeline_state state = { 0 };

   VkResult result;

   pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);

   result = vk_graphics_pipeline_state_fill(&device->vk,
                                            &state,
                                            pCreateInfo,
                                            &rp_state,
                                            0,
                                            &all_state,
                                            NULL,
                                            0,
                                            NULL);
   if (result != VK_SUCCESS)
      goto err_pipeline_finish;

   vk_dynamic_graphics_state_init(dynamic_state);

   /* Load static state into base dynamic state holder. */
   vk_dynamic_graphics_state_fill(dynamic_state, &state);

   /* The value of ms.rasterization_samples is undefined when
    * rasterizer_discard_enable is set, but we need a specific value.
    * Fill that in here.
    */
   if (state.rs->rasterizer_discard_enable)
      dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;

   memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));

   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
      gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);

      /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
       *
       *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
       *    or VK_SHADER_STAGE_ALL."
       *
       * So we don't handle that.
       *
       * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
       * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
       * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
       * structure returned by the driver.
       */
      switch (pCreateInfo->pStages[i].stage) {
      case VK_SHADER_STAGE_VERTEX_BIT:
      case VK_SHADER_STAGE_FRAGMENT_BIT:
         gfx_pipeline->stage_indices[gl_stage] = i;
         break;
      default:
         unreachable("Unsupported stage.");
      }
   }

   gfx_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_graphics_pipeline_compile(device,
                                          cache,
                                          pCreateInfo,
                                          allocator,
                                          gfx_pipeline);
   if (result != VK_SUCCESS)
      goto err_pipeline_finish;

   return VK_SUCCESS;

err_pipeline_finish:
   pvr_pipeline_finish(&gfx_pipeline->base);

   return result;
}

/* If allocator == NULL, the internal one will be used. */
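/* Allocates and initializes a pvr_graphics_pipeline object, compiling and
 * uploading its shaders and PDS programs; the object is freed again if
 * initialization fails.
 */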
static VkResult pvr_graphics_pipeline_create(
   struct pvr_device *device,
   struct vk_pipeline_cache *cache,
   const VkGraphicsPipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *allocator,
   VkPipeline *const pipeline_out)
{
   struct pvr_graphics_pipeline *gfx_pipeline;
   VkResult result;

   gfx_pipeline = vk_zalloc2(&device->vk.alloc,
                             allocator,
                             sizeof(*gfx_pipeline),
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!gfx_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs too. */
   result = pvr_graphics_pipeline_init(device,
                                       cache,
                                       pCreateInfo,
                                       allocator,
                                       gfx_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);

   return VK_SUCCESS;
}

VkResult
pvr_CreateGraphicsPipelines(VkDevice _device,
                            VkPipelineCache pipelineCache,
                            uint32_t createInfoCount,
                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipelines)
{
   VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_graphics_pipeline_create(device,
                                      cache,
                                      &pCreateInfos[i],
                                      pAllocator,
                                      &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/*****************************************************************************
   Other functions
*****************************************************************************/

void pvr_DestroyPipeline(VkDevice _device,
                         VkPipeline _pipeline,
                         const VkAllocationCallbacks *pAllocator)
{
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
   PVR_FROM_HANDLE(pvr_device, device, _device);

   if (!pipeline)
      return;

   switch (pipeline->type) {
   case PVR_PIPELINE_TYPE_GRAPHICS: {
      struct pvr_graphics_pipeline *const gfx_pipeline =
         to_pvr_graphics_pipeline(pipeline);

      pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
      break;
   }

   case PVR_PIPELINE_TYPE_COMPUTE: {
      struct pvr_compute_pipeline *const compute_pipeline =
         to_pvr_compute_pipeline(pipeline);

      pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
      break;
   }

   default:
      unreachable("Unknown pipeline type.");
   }
}