1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * based in part on v3dv driver which is:
5  * Copyright © 2019 Raspberry Pi
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24  * SOFTWARE.
25  */
26 
27 #include <assert.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <vulkan/vulkan.h>
32 
33 #include "compiler/shader_enums.h"
34 #include "hwdef/rogue_hw_utils.h"
35 #include "nir/nir.h"
36 #include "pco/pco.h"
37 #include "pco/pco_data.h"
38 #include "pvr_bo.h"
39 #include "pvr_csb.h"
40 #include "pvr_csb_enum_helpers.h"
41 #include "pvr_hardcode.h"
42 #include "pvr_nir.h"
43 #include "pvr_pds.h"
44 #include "pvr_private.h"
45 #include "pvr_robustness.h"
46 #include "pvr_shader.h"
47 #include "pvr_types.h"
48 #include "rogue/rogue.h"
49 #include "util/log.h"
50 #include "util/macros.h"
51 #include "util/ralloc.h"
52 #include "util/u_dynarray.h"
53 #include "util/u_math.h"
54 #include "vk_alloc.h"
55 #include "vk_format.h"
56 #include "vk_graphics_state.h"
57 #include "vk_log.h"
58 #include "vk_object.h"
59 #include "vk_pipeline_cache.h"
60 #include "vk_render_pass.h"
61 #include "vk_util.h"
62 #include "vulkan/runtime/vk_pipeline.h"
63 
64 /*****************************************************************************
65    PDS functions
66 *****************************************************************************/
67 
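/* Most of the helpers below follow the same pattern: call the PDS generator
 * with a NULL buffer first to query the code/data segment sizes, allocate a
 * host staging buffer of that size, call the generator again to emit the
 * segments into it, upload them to device memory with pvr_gpu_upload_pds(),
 * and then free the staging buffer.
 */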
68 /* If allocator == NULL, the internal one will be used. */
69 static VkResult pvr_pds_coeff_program_create_and_upload(
70    struct pvr_device *device,
71    const VkAllocationCallbacks *allocator,
72    struct pvr_pds_coeff_loading_program *program,
73    struct pvr_fragment_shader_state *fragment_state)
74 {
75    uint32_t staging_buffer_size;
76    uint32_t *staging_buffer;
77    VkResult result;
78 
79    assert(program->num_fpu_iterators < PVR_MAXIMUM_ITERATIONS);
80 
81    /* Get the size of the program and then allocate that much memory. */
82    pvr_pds_coefficient_loading(program, NULL, PDS_GENERATE_SIZES);
83 
84    if (!program->code_size) {
85       fragment_state->pds_coeff_program.pvr_bo = NULL;
86       fragment_state->pds_coeff_program.code_size = 0;
87       fragment_state->pds_coeff_program.data_size = 0;
88       fragment_state->stage_state.pds_temps_count = 0;
89 
90       return VK_SUCCESS;
91    }
92 
93    staging_buffer_size =
94       PVR_DW_TO_BYTES(program->code_size + program->data_size);
95 
96    staging_buffer = vk_alloc2(&device->vk.alloc,
97                               allocator,
98                               staging_buffer_size,
99                               8,
100                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
101    if (!staging_buffer)
102       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
103 
104    /* Generate the program into the staging_buffer. */
105    pvr_pds_coefficient_loading(program,
106                                staging_buffer,
107                                PDS_GENERATE_CODEDATA_SEGMENTS);
108 
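   /* The staging buffer now holds the data segment at dword offset 0 followed
    * by the code segment at dword offset program->data_size, matching the
    * pointers passed to pvr_gpu_upload_pds() below.
    */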
109    /* FIXME: Figure out the define for alignment of 16. */
110    result = pvr_gpu_upload_pds(device,
111                                &staging_buffer[0],
112                                program->data_size,
113                                16,
114                                &staging_buffer[program->data_size],
115                                program->code_size,
116                                16,
117                                16,
118                                &fragment_state->pds_coeff_program);
119    if (result != VK_SUCCESS) {
120       vk_free2(&device->vk.alloc, allocator, staging_buffer);
121       return result;
122    }
123 
124    vk_free2(&device->vk.alloc, allocator, staging_buffer);
125 
126    fragment_state->stage_state.pds_temps_count = program->temps_used;
127 
128    return VK_SUCCESS;
129 }
130 
131 /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
132 /* If allocator == NULL, the internal one will be used. */
133 VkResult pvr_pds_fragment_program_create_and_upload(
134    struct pvr_device *device,
135    const VkAllocationCallbacks *allocator,
136    pco_shader *fs,
137    struct pvr_fragment_shader_state *fragment_state)
138 {
139    /* TODO: remove the below + revert the pvr_pds_setup_doutu
140     * args and make sure fs isn't NULL instead;
141     * temporarily in place for hardcoded load ops in
142     * pvr_pass.c:pvr_generate_load_op_shader()
143     */
144    unsigned temps = 0;
145    bool has_phase_rate_change = false;
146    unsigned entry_offset = 0;
147 
148    if (fs) {
149       pco_data *fs_data = pco_shader_data(fs);
150       temps = fs_data->common.temps;
151       has_phase_rate_change = fs_data->fs.uses.phase_change;
152       entry_offset = fs_data->common.entry_offset;
153    }
154 
155    struct pvr_pds_kickusc_program program = { 0 };
156    uint32_t staging_buffer_size;
157    uint32_t *staging_buffer;
158    VkResult result;
159 
160    const pvr_dev_addr_t exec_addr =
161       PVR_DEV_ADDR_OFFSET(fragment_state->bo->dev_addr,
162                           /* fs_data->common.entry_offset */ entry_offset);
163 
164    /* Note this is not strictly required to be done before calculating the
165     * staging_buffer_size in this particular case. It can also be done after
166     * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
167     */
168    pvr_pds_setup_doutu(
169       &program.usc_task_control,
170       exec_addr.addr,
171       /* fs_data->common.temps */ temps,
172       fragment_state->sample_rate,
173       /* fs_data->fs.uses.phase_change */ has_phase_rate_change);
174 
175    pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
176 
177    staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
178 
179    staging_buffer = vk_alloc2(&device->vk.alloc,
180                               allocator,
181                               staging_buffer_size,
182                               8,
183                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
184    if (!staging_buffer)
185       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
186 
187    pvr_pds_kick_usc(&program,
188                     staging_buffer,
189                     0,
190                     false,
191                     PDS_GENERATE_CODEDATA_SEGMENTS);
192 
193    /* FIXME: Figure out the define for alignment of 16. */
194    result = pvr_gpu_upload_pds(device,
195                                &staging_buffer[0],
196                                program.data_size,
197                                16,
198                                &staging_buffer[program.data_size],
199                                program.code_size,
200                                16,
201                                16,
202                                &fragment_state->pds_fragment_program);
203    if (result != VK_SUCCESS) {
204       vk_free2(&device->vk.alloc, allocator, staging_buffer);
205       return result;
206    }
207 
208    vk_free2(&device->vk.alloc, allocator, staging_buffer);
209 
210    return VK_SUCCESS;
211 }
212 
213 static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
214    const struct pvr_device_info *dev_info,
215    bool robust_buffer_access)
216 {
217    /* FIXME: Use more local variables to improve formatting. */
218 
219    /* Maximum memory allocation needed for const map entries in
220     * pvr_pds_generate_vertex_primary_program().
221     * When robustBufferAccess is disabled, it must be >= 410.
222     * When robustBufferAccess is enabled, it must be >= 570.
223     *
224     * 1. Size of entry for base instance
225     *        (pvr_const_map_entry_base_instance)
226     *
227     * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
228     *     if (!robustBufferAccess)
229     *         size of vertex attribute entry
230     *             (pvr_const_map_entry_vertex_attribute_address) +
231     *     else
232     *         size of robust vertex attribute entry
233     *             (pvr_const_map_entry_robust_vertex_attribute_address) +
234     *         size of entry for max attribute index
235     *             (pvr_const_map_entry_vertex_attribute_max_index) +
236     *     fi
237     *     size of Unified Store burst entry
238     *         (pvr_const_map_entry_literal32) +
239     *     size of entry for vertex stride
240     *         (pvr_const_map_entry_literal32) +
241     *     size of entries for DDMAD control word
242     *         (num_ddmad_literals * pvr_const_map_entry_literal32))
243     *
244     * 3. Size of entry for DOUTW vertex/instance control word
245     *     (pvr_const_map_entry_literal32)
246     *
247     * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
248     */
249 
250    const size_t attribute_size =
251       (!robust_buffer_access)
252          ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
253          : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
254               sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
255 
256    /* If the device has the pds_ddmadt feature, the DDMAD control word becomes
257     * a DDMADT control word, which is one DWORD larger to hold the data for the
258     * DDMADT's out-of-bounds check.
259     */
260    const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
261       1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
262 
263    return (sizeof(struct pvr_const_map_entry_base_instance) +
264            PVR_MAX_VERTEX_INPUT_BINDINGS *
265               (attribute_size +
266                (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
267                   sizeof(struct pvr_const_map_entry_literal32)) +
268            sizeof(struct pvr_const_map_entry_literal32) +
269            sizeof(struct pvr_const_map_entry_doutu_address));
270 }
271 
272 static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
273    struct pvr_device *const device,
274    const VkAllocationCallbacks *const allocator,
275    struct pvr_pds_vertex_primary_program_input *const input,
276    struct pvr_pds_attrib_program *const program_out)
277 {
278    const size_t const_entries_size_in_bytes =
279       pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
280          &device->pdevice->dev_info,
281          device->vk.enabled_features.robustBufferAccess);
282    struct pvr_pds_upload *const program = &program_out->program;
283    struct pvr_pds_info *const info = &program_out->info;
284    struct pvr_const_map_entry *new_entries;
285    ASSERTED uint32_t code_size_in_dwords;
286    size_t staging_buffer_size;
287    uint32_t *staging_buffer;
288    VkResult result;
289 
290    memset(info, 0, sizeof(*info));
291 
292    info->entries = vk_alloc2(&device->vk.alloc,
293                              allocator,
294                              const_entries_size_in_bytes,
295                              8,
296                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
297    if (!info->entries) {
298       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
299       goto err_out;
300    }
301 
302    info->entries_size_in_bytes = const_entries_size_in_bytes;
303 
304    pvr_pds_generate_vertex_primary_program(
305       input,
306       NULL,
307       info,
308       device->vk.enabled_features.robustBufferAccess,
309       &device->pdevice->dev_info);
310 
311    code_size_in_dwords = info->code_size_in_dwords;
312    staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);
313 
314    staging_buffer = vk_alloc2(&device->vk.alloc,
315                               allocator,
316                               staging_buffer_size,
317                               8,
318                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
319    if (!staging_buffer) {
320       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
321       goto err_free_entries;
322    }
323 
324    /* This also fills in info->entries. */
325    pvr_pds_generate_vertex_primary_program(
326       input,
327       staging_buffer,
328       info,
329       device->vk.enabled_features.robustBufferAccess,
330       &device->pdevice->dev_info);
331 
332    assert(info->code_size_in_dwords <= code_size_in_dwords);
333 
334    /* FIXME: Add a vk_realloc2() ? */
335    new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
336                             info->entries,
337                             info->entries_written_size_in_bytes,
338                             8,
339                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
340    if (!new_entries) {
341       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
342       goto err_free_staging_buffer;
343    }
344 
345    info->entries = new_entries;
346    info->entries_size_in_bytes = info->entries_written_size_in_bytes;
347 
348    /* FIXME: Figure out the define for alignment of 16. */
349    result = pvr_gpu_upload_pds(device,
350                                NULL,
351                                0,
352                                0,
353                                staging_buffer,
354                                info->code_size_in_dwords,
355                                16,
356                                16,
357                                program);
358    if (result != VK_SUCCESS)
359       goto err_free_staging_buffer;
360 
361    vk_free2(&device->vk.alloc, allocator, staging_buffer);
362 
363    return VK_SUCCESS;
364 
365 err_free_staging_buffer:
366    vk_free2(&device->vk.alloc, allocator, staging_buffer);
367 
368 err_free_entries:
369    vk_free2(&device->vk.alloc, allocator, info->entries);
370 
371 err_out:
372    return result;
373 }
374 
375 static inline void pvr_pds_vertex_attrib_program_destroy(
376    struct pvr_device *const device,
377    const struct VkAllocationCallbacks *const allocator,
378    struct pvr_pds_attrib_program *const program)
379 {
380    pvr_bo_suballoc_free(program->program.pvr_bo);
381    vk_free2(&device->vk.alloc, allocator, program->info.entries);
382 }
383 
384 /* This is a const pointer to an array of pvr_pds_attrib_program structs.
385  * The array being pointed to has PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT elements.
386  */
387 typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
388    [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
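/* For example, a caller declares
 *    struct pvr_pds_attrib_program
 *       programs[PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
 * and passes &programs; the callee then indexes it as (*programs_out_ptr)[i].
 */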
389 
390 /* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
391  * inputs. This will bake the code segment and create a template of the data
392  * segment for the command buffer to fill in.
393  */
394 /* If allocator == NULL, the internal one will be used.
395  *
396  * programs_out_ptr is a pointer to the array where the outputs will be placed.
397  */
398 static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
399    struct pvr_device *device,
400    const VkAllocationCallbacks *const allocator,
401    pco_data *shader_data,
402    const struct pvr_pds_vertex_dma
403       dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
404    uint32_t dma_count,
405    pvr_pds_attrib_programs_array_ptr programs_out_ptr)
406 {
407    struct pvr_pds_vertex_primary_program_input input = {
408       .dma_list = dma_descriptions,
409       .dma_count = dma_count,
410    };
411    uint32_t usc_temp_count = shader_data->common.temps;
412    struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
413    VkResult result;
414 
415    pco_range *sys_vals = shader_data->common.sys_vals;
416    if (sys_vals[SYSTEM_VALUE_VERTEX_ID].count > 0) {
417       input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
418       input.vertex_id_register = sys_vals[SYSTEM_VALUE_VERTEX_ID].start;
419    }
420 
421    if (sys_vals[SYSTEM_VALUE_INSTANCE_ID].count > 0) {
422       input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
423       input.instance_id_register = sys_vals[SYSTEM_VALUE_INSTANCE_ID].start;
424    }
425 
426    if (sys_vals[SYSTEM_VALUE_BASE_INSTANCE].count > 0) {
427       input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_REQUIRED;
428       input.base_instance_register = sys_vals[SYSTEM_VALUE_BASE_INSTANCE].start;
429    }
430 
431    if (sys_vals[SYSTEM_VALUE_BASE_VERTEX].count > 0) {
432       input.flags |= PVR_PDS_VERTEX_FLAGS_BASE_VERTEX_REQUIRED;
433       input.base_vertex_register = sys_vals[SYSTEM_VALUE_BASE_VERTEX].start;
434    }
435 
436    if (sys_vals[SYSTEM_VALUE_DRAW_ID].count > 0) {
437       input.flags |= PVR_PDS_VERTEX_FLAGS_DRAW_INDEX_REQUIRED;
438       input.draw_index_register = sys_vals[SYSTEM_VALUE_DRAW_ID].start;
439    }
440 
441    pvr_pds_setup_doutu(&input.usc_task_control,
442                        0,
443                        usc_temp_count,
444                        ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
445                        false);
446 
447    /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
448     * typedef.
449     */
450    for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
451       uint32_t extra_flags;
452 
453       switch (i) {
454       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
455          extra_flags = 0;
456          break;
457 
458       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
459          extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
460          break;
461 
462       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
463          extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
464          break;
465 
466       default:
467          unreachable("Invalid vertex attrib program type.");
468       }
469 
470       input.flags |= extra_flags;
471 
472       result =
473          pvr_pds_vertex_attrib_program_create_and_upload(device,
474                                                          allocator,
475                                                          &input,
476                                                          &programs_out[i]);
477       if (result != VK_SUCCESS) {
478          for (uint32_t j = 0; j < i; j++) {
479             pvr_pds_vertex_attrib_program_destroy(device,
480                                                   allocator,
481                                                   &programs_out[j]);
482          }
483 
484          return result;
485       }
486 
487       input.flags &= ~extra_flags;
488    }
489 
490    return VK_SUCCESS;
491 }
492 
493 size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
494 {
495    /* Maximum memory allocation needed for const map entries in
496     * pvr_pds_generate_descriptor_upload_program().
497     * It must be >= 688 bytes. This size is calculated as the sum of:
498     *
499     *  1. Max. number of descriptor sets (8) * (
500     *         size of descriptor entry
501     *             (pvr_const_map_entry_descriptor_set) +
502     *         size of Common Store burst entry
503     *             (pvr_const_map_entry_literal32))
504     *
505     *  2. Max. number of PDS program buffers (24) * (
506     *         size of the largest buffer structure
507     *             (pvr_const_map_entry_constant_buffer) +
508     *         size of Common Store burst entry
509     *             (pvr_const_map_entry_literal32)
510     *
511     *  3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
512     *
513     *  4. Max. number of PDS address literals (8) * (
514     *         size of entry
515     *             (pvr_const_map_entry_descriptor_set_addrs_table)
516     *
517     *  5. Max. number of address literals with single buffer entry to DOUTD
518     *         size of entry
519     *             (pvr_pds_const_map_entry_addr_literal_buffer) +
520     *         8 * size of entry (pvr_pds_const_map_entry_addr_literal)
521     */
522 
523    /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
524     * say that it should be 8.
525     * Figure out a define for this, or is the comment wrong?
526     */
527    return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
528                 sizeof(struct pvr_const_map_entry_literal32)) +
529            PVR_PDS_MAX_BUFFERS *
530               (sizeof(struct pvr_const_map_entry_constant_buffer) +
531                sizeof(struct pvr_const_map_entry_literal32)) +
532            sizeof(struct pvr_const_map_entry_doutu_address) +
533            sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
534            8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
535 }
536 
537 static VkResult pvr_pds_descriptor_program_create_and_upload(
538    struct pvr_device *const device,
539    const VkAllocationCallbacks *const allocator,
540    const struct pvr_pipeline_layout *const layout,
541    enum pvr_stage_allocation stage,
542    const struct pvr_sh_reg_layout *sh_reg_layout,
543    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
544 {
545    const size_t const_entries_size_in_bytes =
546       pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
547    struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
548    struct pvr_pds_descriptor_program_input program = { 0 };
549    struct pvr_const_map_entry *new_entries;
550    ASSERTED uint32_t code_size_in_dwords;
551    uint32_t staging_buffer_size;
552    uint32_t addr_literals = 0;
553    uint32_t *staging_buffer;
554    VkResult result;
555 
556    assert(stage != PVR_STAGE_ALLOCATION_COUNT);
557 
558    *pds_info = (struct pvr_pds_info){ 0 };
559 
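   /* Gather the PDS address literals: one entry for each feature present in
    * the shared register layout (descriptor set address table, push constants,
    * blend constants), each recording the sh reg offset it targets.
    */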
560    if (sh_reg_layout->descriptor_set_addrs_table.present) {
561       program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
562          .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
563          .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
564       };
565       addr_literals++;
566    }
567 
568    if (sh_reg_layout->push_consts.present) {
569       program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
570          .type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS,
571          .destination = sh_reg_layout->push_consts.offset,
572       };
573       addr_literals++;
574    }
575 
576    if (sh_reg_layout->blend_consts.present) {
577       program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
578          .type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS,
579          .destination = sh_reg_layout->blend_consts.offset,
580       };
581       addr_literals++;
582    }
583 
584    program.addr_literal_count = addr_literals;
585 
586    pds_info->entries = vk_alloc2(&device->vk.alloc,
587                                  allocator,
588                                  const_entries_size_in_bytes,
589                                  8,
590                                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
591    if (!pds_info->entries) {
592       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
593       goto err_free_static_consts;
594    }
595 
596    pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
597 
598    pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
599 
600    code_size_in_dwords = pds_info->code_size_in_dwords;
601    staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);
602 
603    if (!staging_buffer_size) {
604       vk_free2(&device->vk.alloc, allocator, pds_info->entries);
605 
606       *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
607 
608       return VK_SUCCESS;
609    }
610 
611    staging_buffer = vk_alloc2(&device->vk.alloc,
612                               allocator,
613                               staging_buffer_size,
614                               8,
615                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
616    if (!staging_buffer) {
617       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
618       goto err_free_entries;
619    }
620 
621    pvr_pds_generate_descriptor_upload_program(&program,
622                                               staging_buffer,
623                                               pds_info);
624 
625    assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
626 
627    /* FIXME: use vk_realloc2() ? */
628    new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
629                             pds_info->entries,
630                             pds_info->entries_written_size_in_bytes,
631                             8,
632                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
633    if (!new_entries) {
634       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
635       goto err_free_staging_buffer;
636    }
637 
638    pds_info->entries = new_entries;
639    pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
640 
641    /* FIXME: Figure out the define for alignment of 16. */
642    result = pvr_gpu_upload_pds(device,
643                                NULL,
644                                0,
645                                0,
646                                staging_buffer,
647                                pds_info->code_size_in_dwords,
648                                16,
649                                16,
650                                &descriptor_state->pds_code);
651    if (result != VK_SUCCESS)
652       goto err_free_staging_buffer;
653 
654    vk_free2(&device->vk.alloc, allocator, staging_buffer);
655 
656    return VK_SUCCESS;
657 
658 err_free_staging_buffer:
659    vk_free2(&device->vk.alloc, allocator, staging_buffer);
660 
661 err_free_entries:
662    vk_free2(&device->vk.alloc, allocator, pds_info->entries);
663 
664 err_free_static_consts:
665    pvr_bo_suballoc_free(descriptor_state->static_consts);
666 
667    return result;
668 }
669 
670 static void pvr_pds_descriptor_program_destroy(
671    struct pvr_device *const device,
672    const struct VkAllocationCallbacks *const allocator,
673    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
674 {
675    if (!descriptor_state)
676       return;
677 
678    pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
679    vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
680    pvr_bo_suballoc_free(descriptor_state->static_consts);
681 }
682 
683 static void pvr_pds_compute_program_setup(
684    const struct pvr_device_info *dev_info,
685    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
686    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
687    uint32_t barrier_coefficient,
688    bool add_base_workgroup,
689    uint32_t usc_temps,
690    pvr_dev_addr_t usc_shader_dev_addr,
691    struct pvr_pds_compute_shader_program *const program)
692 {
693    pvr_pds_compute_shader_program_init(program);
694    program->local_input_regs[0] = local_input_regs[0];
695    program->local_input_regs[1] = local_input_regs[1];
696    program->local_input_regs[2] = local_input_regs[2];
697    program->work_group_input_regs[0] = work_group_input_regs[0];
698    program->work_group_input_regs[1] = work_group_input_regs[1];
699    program->work_group_input_regs[2] = work_group_input_regs[2];
700    program->barrier_coefficient = barrier_coefficient;
701    program->add_base_workgroup = add_base_workgroup;
702    program->flattened_work_groups = true;
703    program->kick_usc = true;
704 
705    STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
706                  PVR_WORKGROUP_DIMENSIONS);
707    STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
708                  PVR_WORKGROUP_DIMENSIONS);
709    STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
710                  PVR_WORKGROUP_DIMENSIONS);
711 
712    pvr_pds_setup_doutu(&program->usc_task_control,
713                        usc_shader_dev_addr.addr,
714                        usc_temps,
715                        ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
716                        false);
717 
718    pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
719 }
720 
721 /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
722  */
723 static VkResult pvr_pds_compute_program_create_and_upload(
724    struct pvr_device *const device,
725    const VkAllocationCallbacks *const allocator,
726    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
727    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
728    uint32_t barrier_coefficient,
729    uint32_t usc_temps,
730    pvr_dev_addr_t usc_shader_dev_addr,
731    struct pvr_pds_upload *const pds_upload_out,
732    struct pvr_pds_info *const pds_info_out)
733 {
734    struct pvr_device_info *dev_info = &device->pdevice->dev_info;
735    struct pvr_pds_compute_shader_program program;
736    uint32_t staging_buffer_size;
737    uint32_t *staging_buffer;
738    VkResult result;
739 
740    pvr_pds_compute_program_setup(dev_info,
741                                  local_input_regs,
742                                  work_group_input_regs,
743                                  barrier_coefficient,
744                                  false,
745                                  usc_temps,
746                                  usc_shader_dev_addr,
747                                  &program);
748 
749    /* FIXME: According to pvr_device_init_compute_pds_program() the code size
750     * is in bytes. Investigate this.
751     */
752    staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
753 
754    staging_buffer = vk_alloc2(&device->vk.alloc,
755                               allocator,
756                               staging_buffer_size,
757                               8,
758                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
759    if (!staging_buffer)
760       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
761 
762    /* FIXME: pvr_pds_compute_shader doesn't implement
763     * PDS_GENERATE_CODEDATA_SEGMENTS.
764     */
765    pvr_pds_compute_shader(&program,
766                           &staging_buffer[0],
767                           PDS_GENERATE_CODE_SEGMENT,
768                           dev_info);
769 
770    pvr_pds_compute_shader(&program,
771                           &staging_buffer[program.code_size],
772                           PDS_GENERATE_DATA_SEGMENT,
773                           dev_info);
774 
775    /* FIXME: Figure out the define for alignment of 16. */
776    result = pvr_gpu_upload_pds(device,
777                                &staging_buffer[program.code_size],
778                                program.data_size,
779                                16,
780                                &staging_buffer[0],
781                                program.code_size,
782                                16,
783                                16,
784                                pds_upload_out);
785    if (result != VK_SUCCESS) {
786       vk_free2(&device->vk.alloc, allocator, staging_buffer);
787       return result;
788    }
789 
790    *pds_info_out = (struct pvr_pds_info){
791       .temps_required = program.highest_temp,
792       .code_size_in_dwords = program.code_size,
793       .data_size_in_dwords = program.data_size,
794    };
795 
796    vk_free2(&device->vk.alloc, allocator, staging_buffer);
797 
798    return VK_SUCCESS;
799 };
800 
801 static void pvr_pds_compute_program_destroy(
802    struct pvr_device *const device,
803    const struct VkAllocationCallbacks *const allocator,
804    struct pvr_pds_upload *const pds_program,
805    struct pvr_pds_info *const pds_info)
806 {
807    /* We don't allocate an entries buffer, so there is nothing to free here. */
808    pvr_bo_suballoc_free(pds_program->pvr_bo);
809 }
810 
811 /* This only uploads the code segment. The data segment will need to be patched
812  * with the base workgroup before uploading.
813  */
814 static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
815    struct pvr_device *const device,
816    const VkAllocationCallbacks *const allocator,
817    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
818    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
819    uint32_t barrier_coefficient,
820    uint32_t usc_temps,
821    pvr_dev_addr_t usc_shader_dev_addr,
822    struct pvr_pds_base_workgroup_program *program_out)
823 {
824    struct pvr_device_info *dev_info = &device->pdevice->dev_info;
825    struct pvr_pds_compute_shader_program program;
826    uint32_t buffer_size;
827    uint32_t *buffer;
828    VkResult result;
829 
830    pvr_pds_compute_program_setup(dev_info,
831                                  local_input_regs,
832                                  work_group_input_regs,
833                                  barrier_coefficient,
834                                  true,
835                                  usc_temps,
836                                  usc_shader_dev_addr,
837                                  &program);
838 
839    /* FIXME: According to pvr_device_init_compute_pds_program() the code size
840     * is in bytes. Investigate this.
841     */
842    buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
843 
844    buffer = vk_alloc2(&device->vk.alloc,
845                       allocator,
846                       buffer_size,
847                       8,
848                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
849    if (!buffer)
850       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
851 
852    pvr_pds_compute_shader(&program,
853                           &buffer[0],
854                           PDS_GENERATE_CODE_SEGMENT,
855                           dev_info);
856 
857    /* FIXME: Figure out the define for alignment of 16. */
858    result = pvr_gpu_upload_pds(device,
859                                NULL,
860                                0,
861                                0,
862                                buffer,
863                                program.code_size,
864                                16,
865                                16,
866                                &program_out->code_upload);
867    if (result != VK_SUCCESS) {
868       vk_free2(&device->vk.alloc, allocator, buffer);
869       return result;
870    }
871 
872    pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
873 
874    program_out->data_section = buffer;
875 
876    /* We'll need to patch the base workgroup in the PDS data section before
877     * dispatch so we save the offsets at which to patch. We only need to save
878     * the offset for the first workgroup id since the workgroup ids are stored
879     * contiguously in the data segment.
880     */
881    program_out->base_workgroup_data_patching_offset =
882       program.base_workgroup_constant_offset_in_dwords[0];
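   /* A caller would then patch the data section along the lines of this
    * hypothetical sketch:
    *    uint32_t *data = program_out->data_section;
    *    uint32_t off = program_out->base_workgroup_data_patching_offset;
    *    data[off + 0] = base_workgroup_x;
    *    data[off + 1] = base_workgroup_y;
    *    data[off + 2] = base_workgroup_z;
    * before uploading the data segment for a dispatch.
    */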
883 
884    program_out->info = (struct pvr_pds_info){
885       .temps_required = program.highest_temp,
886       .code_size_in_dwords = program.code_size,
887       .data_size_in_dwords = program.data_size,
888    };
889 
890    return VK_SUCCESS;
891 }
892 
893 static void pvr_pds_compute_base_workgroup_variant_program_finish(
894    struct pvr_device *device,
895    const VkAllocationCallbacks *const allocator,
896    struct pvr_pds_base_workgroup_program *const state)
897 {
898    pvr_bo_suballoc_free(state->code_upload.pvr_bo);
899    vk_free2(&device->vk.alloc, allocator, state->data_section);
900 }
901 
902 /******************************************************************************
903    Generic pipeline functions
904  ******************************************************************************/
905 
906 static void pvr_pipeline_init(struct pvr_device *device,
907                               enum pvr_pipeline_type type,
908                               struct pvr_pipeline *const pipeline)
909 {
910    assert(!pipeline->layout);
911 
912    vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
913 
914    pipeline->type = type;
915 }
916 
917 static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
918 {
919    vk_object_base_finish(&pipeline->base);
920 }
921 
922 /* How many shared regs it takes to store a pvr_dev_addr_t.
923  * Each shared reg is 32 bits.
924  */
925 #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
926    DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
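/* pvr_dev_addr_t holds a 64-bit device virtual address, so this works out to
 * two 32-bit shared registers.
 */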
927 
928 /**
929  * \brief Allocates the shared registers a stage needs for its descriptor
930  *        set address table and push constants, when present.
931  * \return How many sh regs are required.
932  */
933 static uint32_t
934 pvr_pipeline_alloc_shareds(const struct pvr_device *device,
935                            const struct pvr_pipeline_layout *layout,
936                            enum pvr_stage_allocation stage,
937                            struct pvr_sh_reg_layout *const sh_reg_layout_out)
938 {
939    ASSERTED const uint64_t reserved_shared_size =
940       device->pdevice->dev_runtime_info.reserved_shared_size;
941    ASSERTED const uint64_t max_coeff =
942       device->pdevice->dev_runtime_info.max_coeffs;
943 
944    struct pvr_sh_reg_layout reg_layout = { 0 };
945    uint32_t next_free_sh_reg = 0;
946 
947    reg_layout.descriptor_set_addrs_table.present =
948       !!(layout->shader_stage_mask & BITFIELD_BIT(stage));
949 
950    if (reg_layout.descriptor_set_addrs_table.present) {
951       reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
952       next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
953    }
954 
955    reg_layout.push_consts.present =
956       !!(layout->push_constants_shader_stages & BITFIELD_BIT(stage));
957 
958    if (reg_layout.push_consts.present) {
959       reg_layout.push_consts.offset = next_free_sh_reg;
960       next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
961    }
962 
963    *sh_reg_layout_out = reg_layout;
964 
965    /* FIXME: We might need to take more things into consideration.
966     * See pvr_calc_fscommon_size_and_tiles_in_flight().
967     */
968    assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
969 
970    return next_free_sh_reg;
971 }
972 
973 /******************************************************************************
974    Compute pipeline functions
975  ******************************************************************************/
976 
977 /* Compiles and uploads shaders and PDS programs. */
978 static VkResult pvr_compute_pipeline_compile(
979    struct pvr_device *const device,
980    struct vk_pipeline_cache *cache,
981    const VkComputePipelineCreateInfo *pCreateInfo,
982    const VkAllocationCallbacks *const allocator,
983    struct pvr_compute_pipeline *const compute_pipeline)
984 {
985    struct pvr_pipeline_layout *layout = compute_pipeline->base.layout;
986    struct pvr_sh_reg_layout *sh_reg_layout =
987       &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE];
988    uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
989    uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
990    uint32_t barrier_coefficient;
991    uint32_t usc_temps;
992    uint32_t sh_count;
993    VkResult result;
994 
995    sh_count = pvr_pipeline_alloc_shareds(device,
996                                          layout,
997                                          PVR_STAGE_ALLOCATION_COMPUTE,
998                                          sh_reg_layout);
999 
1000    compute_pipeline->shader_state.const_shared_reg_count = sh_count;
1001 
1002    /* FIXME: Compile and upload the shader. */
1003    /* FIXME: Initialize the shader state and setup build info. */
1004    unreachable("finishme: compute support");
1005 
1006    result = pvr_pds_descriptor_program_create_and_upload(
1007       device,
1008       allocator,
1009       layout,
1010       PVR_STAGE_ALLOCATION_COMPUTE,
1011       sh_reg_layout,
1012       &compute_pipeline->descriptor_state);
1013    if (result != VK_SUCCESS)
1014       goto err_free_shader;
1015 
1016    result = pvr_pds_compute_program_create_and_upload(
1017       device,
1018       allocator,
1019       local_input_regs,
1020       work_group_input_regs,
1021       barrier_coefficient,
1022       usc_temps,
1023       compute_pipeline->shader_state.bo->dev_addr,
1024       &compute_pipeline->primary_program,
1025       &compute_pipeline->primary_program_info);
1026    if (result != VK_SUCCESS)
1027       goto err_free_descriptor_program;
1028 
1029    /* If the workgroup ID is required, then we require the base workgroup
1030     * variant of the PDS compute program as well.
1031     */
1032    compute_pipeline->flags.base_workgroup =
1033       work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
1034       work_group_input_regs[1] != PVR_PDS_REG_UNUSED ||
1035       work_group_input_regs[2] != PVR_PDS_REG_UNUSED;
1036 
1037    if (compute_pipeline->flags.base_workgroup) {
1038       result = pvr_pds_compute_base_workgroup_variant_program_init(
1039          device,
1040          allocator,
1041          local_input_regs,
1042          work_group_input_regs,
1043          barrier_coefficient,
1044          usc_temps,
1045          compute_pipeline->shader_state.bo->dev_addr,
1046          &compute_pipeline->primary_base_workgroup_variant_program);
1047       if (result != VK_SUCCESS)
1048          goto err_destroy_compute_program;
1049    }
1050 
1051    return VK_SUCCESS;
1052 
1053 err_destroy_compute_program:
1054    pvr_pds_compute_program_destroy(device,
1055                                    allocator,
1056                                    &compute_pipeline->primary_program,
1057                                    &compute_pipeline->primary_program_info);
1058 
1059 err_free_descriptor_program:
1060    pvr_pds_descriptor_program_destroy(device,
1061                                       allocator,
1062                                       &compute_pipeline->descriptor_state);
1063 
1064 err_free_shader:
1065    pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1066 
1067    return result;
1068 }
1069 
1070 static VkResult
1071 pvr_compute_pipeline_init(struct pvr_device *device,
1072                           struct vk_pipeline_cache *cache,
1073                           const VkComputePipelineCreateInfo *pCreateInfo,
1074                           const VkAllocationCallbacks *allocator,
1075                           struct pvr_compute_pipeline *compute_pipeline)
1076 {
1077    VkResult result;
1078 
1079    pvr_pipeline_init(device,
1080                      PVR_PIPELINE_TYPE_COMPUTE,
1081                      &compute_pipeline->base);
1082 
1083    compute_pipeline->base.layout =
1084       pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1085 
1086    result = pvr_compute_pipeline_compile(device,
1087                                          cache,
1088                                          pCreateInfo,
1089                                          allocator,
1090                                          compute_pipeline);
1091    if (result != VK_SUCCESS) {
1092       pvr_pipeline_finish(&compute_pipeline->base);
1093       return result;
1094    }
1095 
1096    return VK_SUCCESS;
1097 }
1098 
1099 static VkResult
1100 pvr_compute_pipeline_create(struct pvr_device *device,
1101                             struct vk_pipeline_cache *cache,
1102                             const VkComputePipelineCreateInfo *pCreateInfo,
1103                             const VkAllocationCallbacks *allocator,
1104                             VkPipeline *const pipeline_out)
1105 {
1106    struct pvr_compute_pipeline *compute_pipeline;
1107    VkResult result;
1108 
1109    compute_pipeline = vk_zalloc2(&device->vk.alloc,
1110                                  allocator,
1111                                  sizeof(*compute_pipeline),
1112                                  8,
1113                                  VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1114    if (!compute_pipeline)
1115       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1116 
1117    /* Compiles and uploads shaders and PDS programs. */
1118    result = pvr_compute_pipeline_init(device,
1119                                       cache,
1120                                       pCreateInfo,
1121                                       allocator,
1122                                       compute_pipeline);
1123    if (result != VK_SUCCESS) {
1124       vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1125       return result;
1126    }
1127 
1128    *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
1129 
1130    return VK_SUCCESS;
1131 }
1132 
1133 static void pvr_compute_pipeline_destroy(
1134    struct pvr_device *const device,
1135    const VkAllocationCallbacks *const allocator,
1136    struct pvr_compute_pipeline *const compute_pipeline)
1137 {
1138    if (compute_pipeline->flags.base_workgroup) {
1139       pvr_pds_compute_base_workgroup_variant_program_finish(
1140          device,
1141          allocator,
1142          &compute_pipeline->primary_base_workgroup_variant_program);
1143    }
1144 
1145    pvr_pds_compute_program_destroy(device,
1146                                    allocator,
1147                                    &compute_pipeline->primary_program,
1148                                    &compute_pipeline->primary_program_info);
1149    pvr_pds_descriptor_program_destroy(device,
1150                                       allocator,
1151                                       &compute_pipeline->descriptor_state);
1152    pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1153 
1154    pvr_pipeline_finish(&compute_pipeline->base);
1155 
1156    vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1157 }
1158 
1159 VkResult
1160 pvr_CreateComputePipelines(VkDevice _device,
1161                            VkPipelineCache pipelineCache,
1162                            uint32_t createInfoCount,
1163                            const VkComputePipelineCreateInfo *pCreateInfos,
1164                            const VkAllocationCallbacks *pAllocator,
1165                            VkPipeline *pPipelines)
1166 {
1167    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
1168    PVR_FROM_HANDLE(pvr_device, device, _device);
1169    VkResult result = VK_SUCCESS;
1170 
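   /* Creation is attempted for every element of pCreateInfos; any entry that
    * fails is set to VK_NULL_HANDLE and an error result is returned, as the
    * Vulkan spec requires for vkCreateComputePipelines.
    */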
1171    for (uint32_t i = 0; i < createInfoCount; i++) {
1172       const VkResult local_result =
1173          pvr_compute_pipeline_create(device,
1174                                      cache,
1175                                      &pCreateInfos[i],
1176                                      pAllocator,
1177                                      &pPipelines[i]);
1178       if (local_result != VK_SUCCESS) {
1179          result = local_result;
1180          pPipelines[i] = VK_NULL_HANDLE;
1181       }
1182    }
1183 
1184    return result;
1185 }
1186 
1187 /******************************************************************************
1188    Graphics pipeline functions
1189  ******************************************************************************/
1190 
1191 static void
1192 pvr_graphics_pipeline_destroy(struct pvr_device *const device,
1193                               const VkAllocationCallbacks *const allocator,
1194                               struct pvr_graphics_pipeline *const gfx_pipeline)
1195 {
1196    const uint32_t num_vertex_attrib_programs =
1197       ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
1198 
1199    pvr_pds_descriptor_program_destroy(
1200       device,
1201       allocator,
1202       &gfx_pipeline->shader_state.fragment.descriptor_state);
1203 
1204    pvr_pds_descriptor_program_destroy(
1205       device,
1206       allocator,
1207       &gfx_pipeline->shader_state.vertex.descriptor_state);
1208 
1209    for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1210       struct pvr_pds_attrib_program *const attrib_program =
1211          &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
1212 
1213       pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1214    }
1215 
1216    pvr_bo_suballoc_free(
1217       gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
1218    pvr_bo_suballoc_free(
1219       gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
1220 
1221    pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
1222    pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
1223 
1224    pvr_pipeline_finish(&gfx_pipeline->base);
1225 
1226    vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1227 }
1228 
1229 static void pvr_vertex_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
1230                                   pco_shader *vs)
1231 {
1232    struct pvr_vertex_shader_state *vertex_state =
1233       &gfx_pipeline->shader_state.vertex;
1234 
1235    const pco_data *shader_data = pco_shader_data(vs);
1236    memcpy(&gfx_pipeline->vs_data, shader_data, sizeof(*shader_data));
1237 
1238    /* This ends up unused since we'll use the temp_usage for the PDS program we
1239     * end up selecting, and the descriptor PDS program doesn't use any temps.
1240     * Let's set it to ~0 in case it ever gets used.
1241     */
1242    vertex_state->stage_state.pds_temps_count = ~0;
1243 }
1244 
1245 static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
1246                                     pco_shader *fs)
1247 {
1248    struct pvr_fragment_shader_state *fragment_state =
1249       &gfx_pipeline->shader_state.fragment;
1250 
1251    const pco_data *shader_data = pco_shader_data(fs);
1252    memcpy(&gfx_pipeline->fs_data, shader_data, sizeof(*shader_data));
1253 
1254    /* TODO: add selection for other values of pass type and sample rate. */
1255    fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE;
1256    fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;
1257 
1258    /* We can't initialize it yet since we still need to generate the PDS
1259     * programs so set it to `~0` to make sure that we set this up later on.
1260     */
1261    fragment_state->stage_state.pds_temps_count = ~0;
1262 }
1263 
1264 static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
1265 {
1266    switch (factor) {
1267    case VK_BLEND_FACTOR_CONSTANT_COLOR:
1268    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1269    case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1270    case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1271       return true;
1272 
1273    default:
1274       return false;
1275    }
1276 }
1277 
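/* Illustrative example (not part of the driver source; values are assumed):
 * an attachment whose blend factors reference the constant color makes
 * pvr_blend_factor_requires_consts() return true, e.g.
 *
 *    const VkPipelineColorBlendAttachmentState att = {
 *       .blendEnable = VK_TRUE,
 *       .srcColorBlendFactor = VK_BLEND_FACTOR_CONSTANT_COLOR,
 *       .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA,
 *       .colorBlendOp = VK_BLEND_OP_ADD,
 *       .srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE,
 *       .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
 *       .alphaBlendOp = VK_BLEND_OP_ADD,
 *       .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
 *                         VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT,
 *    };
 *
 * Only the CONSTANT_COLOR/CONSTANT_ALPHA factors (and their ONE_MINUS_*
 * variants) read the blend constants; factors such as SRC_ALPHA do not.
 */
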
1278 /**
1279  * \brief Indicates whether dynamic blend constants are needed.
1280  *
1281  * If the user has specified the blend constants as dynamic, they might not
1282  * actually be using them. This function checks whether any enabled blend
1283  * attachment reads the constants, which tells us whether we need to upload
1284  * them later on for the shader to access.
1285  */
1286 static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
1287    const struct pvr_graphics_pipeline *gfx_pipeline)
1288 {
1289    const struct vk_dynamic_graphics_state *const state =
1290       &gfx_pipeline->dynamic_state;
1291 
1292    if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
1293       return false;
1294 
1295    for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
1296       const struct vk_color_blend_attachment_state *attachment =
1297          &state->cb.attachments[i];
1298 
1299       const bool has_color_write =
1300          attachment->write_mask &
1301          (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
1302           VK_COLOR_COMPONENT_B_BIT);
1303       const bool has_alpha_write = attachment->write_mask &
1304                                    VK_COLOR_COMPONENT_A_BIT;
1305 
1306       if (!attachment->blend_enable || attachment->write_mask == 0)
1307          continue;
1308 
1309       if (has_color_write) {
1310          const uint8_t src_color_blend_factor =
1311             attachment->src_color_blend_factor;
1312          const uint8_t dst_color_blend_factor =
1313             attachment->dst_color_blend_factor;
1314 
1315          if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
1316              pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
1317             return true;
1318          }
1319       }
1320 
1321       if (has_alpha_write) {
1322          const uint8_t src_alpha_blend_factor =
1323             attachment->src_alpha_blend_factor;
1324          const uint8_t dst_alpha_blend_factor =
1325             attachment->dst_alpha_blend_factor;
1326 
1327          if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
1328              pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
1329             return true;
1330          }
1331       }
1332    }
1333 
1334    return false;
1335 }
1336 
1337 static uint32_t pvr_graphics_pipeline_alloc_shareds(
1338    const struct pvr_device *device,
1339    const struct pvr_graphics_pipeline *gfx_pipeline,
1340    enum pvr_stage_allocation stage,
1341    struct pvr_sh_reg_layout *const sh_reg_layout_out)
1342 {
1343    ASSERTED const uint64_t reserved_shared_size =
1344       device->pdevice->dev_runtime_info.reserved_shared_size;
1345    ASSERTED const uint64_t max_coeff =
1346       device->pdevice->dev_runtime_info.max_coeffs;
1347 
1348    const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
1349    struct pvr_sh_reg_layout reg_layout = { 0 };
1350    uint32_t next_free_sh_reg = 0;
1351 
1352    next_free_sh_reg =
1353       pvr_pipeline_alloc_shareds(device, layout, stage, &reg_layout);
1354 
1355    reg_layout.blend_consts.present =
1356       (stage == PVR_STAGE_ALLOCATION_FRAGMENT &&
1357        pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline));
1358    if (reg_layout.blend_consts.present) {
1359       reg_layout.blend_consts.offset = next_free_sh_reg;
1360       next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
1361    }
1362 
1363    *sh_reg_layout_out = reg_layout;
1364 
1365    /* FIXME: We might need to take more things into consideration.
1366     * See pvr_calc_fscommon_size_and_tiles_in_flight().
1367     */
1368    assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
1369 
1370    return next_free_sh_reg;
1371 }
1372 
1373 #undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
1374 
1375 static void pvr_graphics_pipeline_setup_vertex_dma(
1376    pco_shader *vs,
1377    const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
1378    struct pvr_pds_vertex_dma *const dma_descriptions,
1379    uint32_t *const dma_count)
1380 {
1381    pco_vs_data *vs_data = &pco_shader_data(vs)->vs;
1382 
1383    const VkVertexInputBindingDescription
1384       *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1385    const VkVertexInputAttributeDescription
1386       *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1387 
1388    /* Vertex attributes map to the `layout(location = x)` annotation in the
1389     * shader where `x` is the attribute's location.
1390     * Vertex bindings have NO relation to the shader. They have nothing to do
1391     * with the `layout(set = x, binding = y)` notation. They instead indicate
1392     * where the data for a collection of vertex attributes comes from. The
1393     * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
1394     * binding number and based on that we'll know which buffer to DMA the data
1395     * from, to fill in the collection of vertex attributes.
1396     */
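   /* Illustrative example (values assumed, not taken from this driver): a
    * vertex shader input
    *
    *    layout(location = 1) in vec2 in_uv;
    *
    * could be fed from binding 0 described as
    *
    *    const VkVertexInputBindingDescription binding = {
    *       .binding = 0,
    *       .stride = 24,
    *       .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
    *    };
    *    const VkVertexInputAttributeDescription attribute = {
    *       .location = 1,
    *       .binding = 0,
    *       .format = VK_FORMAT_R32G32_SFLOAT,
    *       .offset = 16,
    *    };
    *
    * with the actual VkBuffer for binding 0 only known once the application
    * calls vkCmdBindVertexBuffers(cmd_buffer, 0, 1, &buffer, &offset).
    */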
1397 
1398    for (uint32_t i = 0; i < vertex_input_state->vertexBindingDescriptionCount;
1399         i++) {
1400       const VkVertexInputBindingDescription *binding_desc =
1401          &vertex_input_state->pVertexBindingDescriptions[i];
1402 
1403       sorted_bindings[binding_desc->binding] = binding_desc;
1404    }
1405 
1406    for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
1407         i++) {
1408       const VkVertexInputAttributeDescription *attribute_desc =
1409          &vertex_input_state->pVertexAttributeDescriptions[i];
1410 
1411       sorted_attributes[attribute_desc->location] = attribute_desc;
1412    }
1413 
1414    for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
1415         i++) {
1416       const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
1417       if (!attribute)
1418          continue;
1419 
1420       gl_vert_attrib location = attribute->location + VERT_ATTRIB_GENERIC0;
1421       const VkVertexInputBindingDescription *binding =
1422          sorted_bindings[attribute->binding];
1423       struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[*dma_count];
1424       const struct util_format_description *fmt_description =
1425          vk_format_description(attribute->format);
1426 
1427       const pco_range *attrib_range = &vs_data->attribs[location];
1428 
1429       /* Skip unused attributes. */
1430       if (!attrib_range->count)
1431          continue;
1432 
1433       /* DMA setup. */
1434 
1435       /* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
1436        *
1437        * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
1438        *          DMA source addr = src0 * src1 + src2
1439        *          DMA params = src3
1440        *
1441        * In the PDS program we setup src0 with the binding's stride and src1
1442        * with either the instance id or vertex id (both of which get filled by
1443        * the hardware). We setup src2 later on once we know which VkBuffer to
1444        * DMA the data from so it's saved for later when we patch the data
1445        * section.
1446        */
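      /* Worked example (illustrative numbers only). The Vulkan spec formula
       *
       *    attribAddress = bufferBindingAddress + vertexOffset * stride +
       *                    attribDesc.offset
       *
       * with vertex index 5, stride 24 and attribute offset 16 gives
       *
       *    effectiveVertexOffset = 5 * 24          = 120
       *    attribAddress         = base + 120 + 16 = base + 136
       *
       * which roughly maps onto the DDMAD operands as src0 = stride (24) and
       * src1 = vertex id (5), with the base + offset term supplied through
       * src2 once the data section is patched with the bound buffer address.
       */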
1447 
1448       /* TODO: Right now we're setting up a DMA per attribute. In a case where
1449        * there are multiple attributes packed into a single binding with
1450        * adjacent locations we'd still be DMAing them separately. This is not
1451        * great so the DMA setup should be smarter and could do with some
1452        * optimization.
1453        */
1454 
1455       *dma_desc = (struct pvr_pds_vertex_dma){ 0 };
1456 
1457       /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1458        * this corresponds to `attribDesc.offset`.
1459        * The PDS program doesn't do anything with it; it's just saved in the
1460        * PDS program entry.
1461        */
1462       dma_desc->offset = attribute->offset;
1463 
1464       /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1465        * this corresponds to `bindingDesc.stride`.
1466        * The PDS program will calculate the `effectiveVertexOffset` with this
1467        * and add it to the address provided in the patched data segment.
1468        */
1469       dma_desc->stride = binding->stride;
1470 
1471       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1472          dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
1473       else
1474          dma_desc->flags = 0;
1475 
1476       /* Size to DMA per vertex attribute. Used to setup src3 in the DDMAD. */
1477       /* TODO: what if not all components are used */
1478       assert(attrib_range->count == fmt_description->block.bits / 32);
1479       dma_desc->size_in_dwords = attrib_range->count;
1480 
1481       /* Vtxin reg offset to start DMAing into. */
1482       dma_desc->destination = attrib_range->start;
1483 
1484       /* Will be used by the driver to figure out which buffer address to
1485        * patch into the data section, i.e. which binding we should DMA from.
1486        */
1487       dma_desc->binding_index = attribute->binding;
1488 
1489       /* We don't currently support VK_EXT_vertex_attribute_divisor so no
1490        * repeating of instance-rate vertex attributes needed. We should always
1491        * move on to the next vertex attribute.
1492        */
1493       assert(binding->inputRate != VK_VERTEX_INPUT_RATE_INSTANCE);
1494       dma_desc->divisor = 1;
1495 
1496       /* Will be used to generate PDS code that takes care of robust buffer
1497        * access, and later on by the driver to write the correct robustness
1498        * buffer address to DMA the fallback values from.
1499        */
1500       dma_desc->robustness_buffer_offset =
1501          pvr_get_robustness_buffer_format_offset(attribute->format);
1502 
1503       /* Used later on by the driver to figure out if the buffer is being
1504        * accessed out of bounds, for robust buffer access.
1505        */
1506       dma_desc->component_size_in_bytes =
1507          fmt_description->block.bits / fmt_description->nr_channels / 8;
1508 
1509       ++*dma_count;
1510    }
1511 }
1512 
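/* Summary of the function below: it builds the PDS coefficient-loading
 * program for the fragment stage by emitting one DOUTI iterator for Z (if
 * read), one for W (if read), a 2D iterator for point coordinates (if read),
 * and then one iterator per user varying. Each iterator's destination is
 * expressed in USC coefficient-set units (ROGUE_USC_COEFFICIENT_SET_SIZE
 * dwords per set).
 */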
1513 static void pvr_graphics_pipeline_setup_fragment_coeff_program(
1514    pco_fs_data *fs_data,
1515    pco_vs_data *vs_data,
1516    nir_shader *fs,
1517    struct pvr_pds_coeff_loading_program *frag_coeff_program)
1518 {
1519    uint64_t varyings_used = fs->info.inputs_read &
1520                             BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
1521 
1522    unsigned fpu = 0;
1523    unsigned dest = 0;
1524 
1525    if (fs_data->uses.z) {
1526       pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1527                     PDSINST_DOUT_FIELDS_DOUTI_SRC,
1528                     douti_src) {
1529          /* TODO: use a define instead of sizeof(uint16_t). */
1530          douti_src.f32_offset = fs_data->uses.w ? 1 * sizeof(uint16_t) : 0;
1531          douti_src.f16_offset = douti_src.f32_offset;
1532          douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1533          douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
1534       }
1535 
1536       frag_coeff_program->destination[fpu++] = dest++;
1537    }
1538 
1539    if (fs_data->uses.w) {
1540       pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1541                     PDSINST_DOUT_FIELDS_DOUTI_SRC,
1542                     douti_src) {
1543          douti_src.f32_offset = 0;
1544          douti_src.f16_offset = douti_src.f32_offset;
1545          douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1546          douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D;
1547       }
1548 
1549       frag_coeff_program->destination[fpu++] = dest++;
1550    }
1551 
1552    if (fs_data->uses.pntc) {
1553       pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1554                     PDSINST_DOUT_FIELDS_DOUTI_SRC,
1555                     douti_src) {
1556          douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1557          douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_2D;
1558          douti_src.pointsprite = true;
1559       }
1560 
1561       frag_coeff_program->destination[fpu++] = dest;
1562       dest += 2;
1563    }
1564 
1565    u_foreach_bit64 (varying, varyings_used) {
1566       nir_variable *var =
1567          nir_find_variable_with_location(fs, nir_var_shader_in, varying);
1568       assert(var);
1569 
1570       pco_range *cf_range = &fs_data->varyings[varying];
1571       assert(cf_range->count > 0);
1572       assert(!(cf_range->start % ROGUE_USC_COEFFICIENT_SET_SIZE));
1573       assert(!(cf_range->count % ROGUE_USC_COEFFICIENT_SET_SIZE));
1574 
1575       pco_range *vtxout_range = &vs_data->varyings[varying];
1576       assert(vtxout_range->count > 0);
1577       assert(vtxout_range->start >= 4);
1578 
1579       assert(vtxout_range->count ==
1580              cf_range->count / ROGUE_USC_COEFFICIENT_SET_SIZE);
1581 
1582       unsigned count = vtxout_range->count;
1583 
1584       unsigned vtxout = vtxout_range->start;
1585 
1586       /* pos.x, pos.y unused. */
1587       vtxout -= 2;
1588 
1589       /* pos.z unused. */
1590       if (!fs_data->uses.z)
1591          vtxout -= 1;
1592 
1593       /* pos.w unused. */
1594       if (!fs_data->uses.w)
1595          vtxout -= 1;
1596 
1597       pvr_csb_pack (&frag_coeff_program->FPU_iterators[fpu],
1598                     PDSINST_DOUT_FIELDS_DOUTI_SRC,
1599                     douti_src) {
1600          /* TODO: use a define instead of sizeof(uint16_t). */
1601          douti_src.f32_offset = vtxout * sizeof(uint16_t);
1602          /* TODO: f16 support. */
1603          douti_src.f16 = false;
1604          douti_src.f16_offset = douti_src.f32_offset;
1605 
1606          switch (var->data.interpolation) {
1607          case INTERP_MODE_SMOOTH:
1608             douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1609             douti_src.perspective = true;
1610             break;
1611 
1612          case INTERP_MODE_NOPERSPECTIVE:
1613             douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_GOURUAD;
1614             break;
1615 
1616          case INTERP_MODE_FLAT:
1617             /* TODO: triangle fan, provoking vertex last. */
1618             douti_src.shademodel = ROGUE_PDSINST_DOUTI_SHADEMODEL_FLAT_VERTEX0;
1619             break;
1620 
1621          default:
1622             unreachable("Unimplemented interpolation type.");
1623          }
1624 
1625          douti_src.size = ROGUE_PDSINST_DOUTI_SIZE_1D + count - 1;
1626       }
1627 
1628       frag_coeff_program->destination[fpu++] =
1629          cf_range->start / ROGUE_USC_COEFFICIENT_SET_SIZE;
1630    }
1631 
1632    frag_coeff_program->num_fpu_iterators = fpu;
1633 }
1634 
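/* The allocation helpers below record where a shader variable (or a plain
 * location) lives by filling a pco_range { start, count } entry in a
 * per-location allocation list, optionally advancing a running register
 * counter. For instance, a vec4 variable counts as 4 dword slots, so with
 * dwords_each = 1 it occupies a range of 4 registers starting at the current
 * counter value.
 */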
1635 static void set_var(pco_range *allocation_list,
1636                     unsigned to,
1637                     nir_variable *var,
1638                     unsigned dwords_each)
1639 {
1640    unsigned slots = glsl_count_dword_slots(var->type, false);
1641 
1642    allocation_list[var->data.location] = (pco_range){
1643       .start = to,
1644       .count = slots * dwords_each,
1645    };
1646 }
1647 
1648 static void allocate_var(pco_range *allocation_list,
1649                          unsigned *counter,
1650                          nir_variable *var,
1651                          unsigned dwords_each)
1652 {
1653    unsigned slots = glsl_count_dword_slots(var->type, false);
1654 
1655    allocation_list[var->data.location] = (pco_range){
1656       .start = *counter,
1657       .count = slots * dwords_each,
1658    };
1659 
1660    *counter += slots * dwords_each;
1661 }
1662 
1663 static void try_allocate_var(pco_range *allocation_list,
1664                              unsigned *counter,
1665                              nir_shader *nir,
1666                              uint64_t bitset,
1667                              nir_variable_mode mode,
1668                              int location,
1669                              unsigned dwords_each)
1670 {
1671    nir_variable *var = nir_find_variable_with_location(nir, mode, location);
1672 
1673    if (!(bitset & BITFIELD64_BIT(location)))
1674       return;
1675 
1676    assert(var);
1677 
1678    allocate_var(allocation_list, counter, var, dwords_each);
1679 }
1680 
1681 static void try_allocate_vars(pco_range *allocation_list,
1682                               unsigned *counter,
1683                               nir_shader *nir,
1684                               uint64_t *bitset,
1685                               nir_variable_mode mode,
1686                               bool f16,
1687                               enum glsl_interp_mode interp_mode,
1688                               unsigned dwords_each)
1689 {
1690    uint64_t skipped = 0;
1691 
1692    while (*bitset) {
1693       int location = u_bit_scan64(bitset);
1694 
1695       nir_variable *var = nir_find_variable_with_location(nir, mode, location);
1696       assert(var);
1697 
1698       if (glsl_type_is_16bit(glsl_without_array_or_matrix(var->type)) != f16 ||
1699           var->data.interpolation != interp_mode) {
1700          skipped |= BITFIELD64_BIT(location);
1701          continue;
1702       }
1703 
1704       allocate_var(allocation_list, counter, var, dwords_each);
1705    }
1706 
1707    *bitset |= skipped;
1708 }
1709 
1710 static void allocate_val(pco_range *allocation_list,
1711                          unsigned *counter,
1712                          unsigned location,
1713                          unsigned dwords_each)
1714 {
1715    allocation_list[location] = (pco_range){
1716       .start = *counter,
1717       .count = dwords_each,
1718    };
1719 
1720    *counter += dwords_each;
1721 }
1722 
1723 static void pvr_alloc_vs_sysvals(pco_data *data, nir_shader *nir)
1724 {
1725    BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX);
1726    BITSET_COPY(system_values_read, nir->info.system_values_read);
1727 
1728    gl_system_value sys_vals[] = {
1729       SYSTEM_VALUE_VERTEX_ID,     SYSTEM_VALUE_INSTANCE_ID,
1730       SYSTEM_VALUE_BASE_INSTANCE, SYSTEM_VALUE_BASE_VERTEX,
1731       SYSTEM_VALUE_DRAW_ID,
1732    };
1733 
1734    for (unsigned u = 0; u < ARRAY_SIZE(sys_vals); ++u) {
1735       if (BITSET_TEST(system_values_read, sys_vals[u])) {
1736          allocate_val(data->common.sys_vals,
1737                       &data->common.vtxins,
1738                       sys_vals[u],
1739                       1);
1740 
1741          BITSET_CLEAR(system_values_read, sys_vals[u]);
1742       }
1743    }
1744 
1745    assert(BITSET_IS_EMPTY(system_values_read));
1746 }
1747 
1748 static void pvr_init_vs_attribs(
1749    pco_data *data,
1750    const VkPipelineVertexInputStateCreateInfo *const vertex_input_state)
1751 {
1752    for (unsigned u = 0; u < vertex_input_state->vertexAttributeDescriptionCount;
1753         ++u) {
1754       const VkVertexInputAttributeDescription *attrib =
1755          &vertex_input_state->pVertexAttributeDescriptions[u];
1756 
1757       gl_vert_attrib location = attrib->location + VERT_ATTRIB_GENERIC0;
1758 
1759       data->vs.attrib_formats[location] =
1760          vk_format_to_pipe_format(attrib->format);
1761    }
1762 }
1763 
1764 static void pvr_alloc_vs_attribs(pco_data *data, nir_shader *nir)
1765 {
1766    /* TODO NEXT: this should be based on the format size. */
1767    nir_foreach_shader_in_variable (var, nir) {
1768       allocate_var(data->vs.attribs, &data->common.vtxins, var, 1);
1769    }
1770 }
1771 
1772 static void pvr_alloc_vs_varyings(pco_data *data, nir_shader *nir)
1773 {
1774    uint64_t vars_mask = nir->info.outputs_written &
1775                         BITFIELD64_RANGE(VARYING_SLOT_VAR0, MAX_VARYING);
1776 
1777    /* Output position must be present. */
1778    assert(nir_find_variable_with_location(nir,
1779                                           nir_var_shader_out,
1780                                           VARYING_SLOT_POS));
1781 
1782    /* Varying ordering is specific: position first, then the user varyings, then PSIZ/VIEWPORT/LAYER. */
1783    try_allocate_var(data->vs.varyings,
1784                     &data->vs.vtxouts,
1785                     nir,
1786                     nir->info.outputs_written,
1787                     nir_var_shader_out,
1788                     VARYING_SLOT_POS,
1789                     1);
1790 
1791    /* Save varying counts. */
1792    u_foreach_bit64 (location, vars_mask) {
1793       nir_variable *var =
1794          nir_find_variable_with_location(nir, nir_var_shader_out, location);
1795       assert(var);
1796 
1797       /* TODO: f16 support. */
1798       bool f16 = glsl_type_is_16bit(glsl_without_array_or_matrix(var->type));
1799       assert(!f16);
1800       unsigned components = glsl_get_components(var->type);
1801 
1802       switch (var->data.interpolation) {
1803       case INTERP_MODE_SMOOTH:
1804          if (f16)
1805             data->vs.f16_smooth += components;
1806          else
1807             data->vs.f32_smooth += components;
1808 
1809          break;
1810 
1811       case INTERP_MODE_FLAT:
1812          if (f16)
1813             data->vs.f16_flat += components;
1814          else
1815             data->vs.f32_flat += components;
1816 
1817          break;
1818 
1819       case INTERP_MODE_NOPERSPECTIVE:
1820          if (f16)
1821             data->vs.f16_npc += components;
1822          else
1823             data->vs.f32_npc += components;
1824 
1825          break;
1826 
1827       default:
1828          unreachable("Unsupported interpolation mode.");
1829       }
1830    }
1831 
1832    for (unsigned f16 = 0; f16 <= 1; ++f16) {
1833       for (enum glsl_interp_mode interp_mode = INTERP_MODE_SMOOTH;
1834            interp_mode <= INTERP_MODE_NOPERSPECTIVE;
1835            ++interp_mode) {
1836          try_allocate_vars(data->vs.varyings,
1837                            &data->vs.vtxouts,
1838                            nir,
1839                            &vars_mask,
1840                            nir_var_shader_out,
1841                            f16,
1842                            interp_mode,
1843                            1);
1844       }
1845    }
1846 
1847    assert(!vars_mask);
1848 
1849    const gl_varying_slot last_slots[] = {
1850       VARYING_SLOT_PSIZ,
1851       VARYING_SLOT_VIEWPORT,
1852       VARYING_SLOT_LAYER,
1853    };
1854 
1855    for (unsigned u = 0; u < ARRAY_SIZE(last_slots); ++u) {
1856       try_allocate_var(data->vs.varyings,
1857                        &data->vs.vtxouts,
1858                        nir,
1859                        nir->info.outputs_written,
1860                        nir_var_shader_out,
1861                        last_slots[u],
1862                        1);
1863    }
1864 }
1865 
1866 static void pvr_alloc_fs_sysvals(pco_data *data, nir_shader *nir)
1867 {
1868    /* TODO */
1869 }
1870 
1871 static void pvr_alloc_fs_varyings(pco_data *data, nir_shader *nir)
1872 {
1873    assert(!data->common.coeffs);
1874 
1875    /* Save the z/w locations. */
1876    unsigned zw_count = !!data->fs.uses.z + !!data->fs.uses.w;
1877    allocate_val(data->fs.varyings,
1878                 &data->common.coeffs,
1879                 VARYING_SLOT_POS,
1880                 zw_count * ROGUE_USC_COEFFICIENT_SET_SIZE);
1881 
1882    /* If point coords are used, they come after z/w (if present). */
1883    nir_variable *var = nir_find_variable_with_location(nir,
1884                                                        nir_var_shader_in,
1885                                                        VARYING_SLOT_PNTC);
1886    if (var) {
1887       assert(!var->data.location_frac);
1888       unsigned count = glsl_get_components(var->type);
1889       assert(count == 2);
1890 
1891       allocate_var(data->fs.varyings,
1892                    &data->common.coeffs,
1893                    var,
1894                    ROGUE_USC_COEFFICIENT_SET_SIZE);
1895 
1896       data->fs.uses.pntc = true;
1897    }
1898 
1899    /* Allocate the rest of the input varyings. */
1900    nir_foreach_shader_in_variable (var, nir) {
1901       /* Already handled. */
1902       if (var->data.location == VARYING_SLOT_POS ||
1903           var->data.location == VARYING_SLOT_PNTC)
1904          continue;
1905 
1906       allocate_var(data->fs.varyings,
1907                    &data->common.coeffs,
1908                    var,
1909                    ROGUE_USC_COEFFICIENT_SET_SIZE);
1910    }
1911 }
1912 
1913 static void
1914 pvr_init_fs_outputs(pco_data *data,
1915                     const struct pvr_render_pass *pass,
1916                     const struct pvr_render_subpass *const subpass,
1917                     const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1918 {
1919    for (unsigned u = 0; u < subpass->color_count; ++u) {
1920       unsigned idx = subpass->color_attachments[u];
1921       if (idx == VK_ATTACHMENT_UNUSED)
1922          continue;
1923 
1924       gl_frag_result location = FRAG_RESULT_DATA0 + u;
1925       VkFormat vk_format = pass->attachments[idx].vk_format;
1926       data->fs.output_formats[location] = vk_format_to_pipe_format(vk_format);
1927    }
1928 
1929    /* TODO: z-replicate. */
1930 }
1931 
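/* Summary of the function below: each fragment shader color output is mapped
 * onto the USC output registers chosen by the render-pass hardware setup
 * (tile buffers are not handled yet). As an illustrative example, a
 * VK_FORMAT_R8G8B8A8_UNORM attachment is 32 bits per pixel, so
 * format_bits / 32 == 1 and the output starts at mrt_resource->reg.output_reg.
 */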
1932 static void
1933 pvr_setup_fs_outputs(pco_data *data,
1934                      nir_shader *nir,
1935                      const struct pvr_render_subpass *const subpass,
1936                      const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1937 {
1938    ASSERTED unsigned num_outputs = hw_subpass->setup.num_render_targets;
1939    assert(num_outputs == subpass->color_count);
1940 
1941    uint64_t outputs_written = nir->info.outputs_written;
1942    assert(util_bitcount64(outputs_written) == num_outputs);
1943 
1944    for (unsigned u = 0; u < subpass->color_count; ++u) {
1945       gl_frag_result location = FRAG_RESULT_DATA0 + u;
1946       unsigned idx = subpass->color_attachments[u];
1947       const struct usc_mrt_resource *mrt_resource;
1948       ASSERTED bool output_reg;
1949       enum pipe_format format;
1950       unsigned format_bits;
1951       nir_variable *var;
1952 
1953       if (idx == VK_ATTACHMENT_UNUSED)
1954          continue;
1955 
1956       assert(u == idx); /* TODO: not sure if this is true or not... */
1957 
1958       mrt_resource = &hw_subpass->setup.mrt_resources[u];
1959       output_reg = mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1960 
1961       assert(output_reg);
1962       /* TODO: tile buffer support. */
1963 
1964       var = nir_find_variable_with_location(nir, nir_var_shader_out, location);
1965       assert(var);
1966 
1967       format = data->fs.output_formats[location];
1968       format_bits = util_format_get_blocksizebits(format);
1969       /* TODO: other sized formats. */
1970       assert(!(format_bits % 32));
1971 
1972       assert(mrt_resource->intermediate_size == format_bits / 8);
1973 
1974       set_var(data->fs.outputs,
1975               mrt_resource->reg.output_reg,
1976               var,
1977               format_bits / 32);
1978       data->fs.output_reg[location] = output_reg;
1979 
1980       outputs_written &= ~BITFIELD64_BIT(location);
1981    }
1982 
1983    /* TODO: z-replicate. */
1984 
1985    assert(!outputs_written);
1986 }
1987 
1988 static void pvr_init_fs_input_attachments(
1989    pco_data *data,
1990    const struct pvr_render_subpass *const subpass,
1991    const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
1992 {
1993    pvr_finishme("pvr_init_fs_input_attachments");
1994 }
1995 
1996 static void pvr_setup_fs_input_attachments(
1997    pco_data *data,
1998    nir_shader *nir,
1999    const struct pvr_render_subpass *const subpass,
2000    const struct pvr_renderpass_hwsetup_subpass *hw_subpass)
2001 {
2002    pvr_finishme("pvr_setup_fs_input_attachments");
2003 }
2004 
2005 static void
2006 pvr_preprocess_shader_data(pco_data *data,
2007                            nir_shader *nir,
2008                            const VkGraphicsPipelineCreateInfo *pCreateInfo)
2009 {
2010    switch (nir->info.stage) {
2011    case MESA_SHADER_VERTEX: {
2012       const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
2013          pCreateInfo->pVertexInputState;
2014 
2015       pvr_init_vs_attribs(data, vertex_input_state);
2016       break;
2017    }
2018 
2019    case MESA_SHADER_FRAGMENT: {
2020       PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
2021       const struct pvr_render_subpass *const subpass =
2022          &pass->subpasses[pCreateInfo->subpass];
2023       const struct pvr_renderpass_hw_map *subpass_map =
2024          &pass->hw_setup->subpass_map[pCreateInfo->subpass];
2025       const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
2026          &pass->hw_setup->renders[subpass_map->render]
2027              .subpasses[subpass_map->subpass];
2028 
2029       pvr_init_fs_outputs(data, pass, subpass, hw_subpass);
2030       pvr_init_fs_input_attachments(data, subpass, hw_subpass);
2031 
2032       /* TODO: push consts, blend consts, dynamic state, etc. */
2033       break;
2034    }
2035 
2036    default:
2037       unreachable("Unsupported shader stage.");
2038    }
2039 
2040    /* TODO: common things, like large constants being put into shareds. */
2041 }
2042 
2043 static void
2044 pvr_postprocess_shader_data(pco_data *data,
2045                             nir_shader *nir,
2046                             const VkGraphicsPipelineCreateInfo *pCreateInfo)
2047 {
2048    switch (nir->info.stage) {
2049    case MESA_SHADER_VERTEX: {
2050       pvr_alloc_vs_sysvals(data, nir);
2051       pvr_alloc_vs_attribs(data, nir);
2052       pvr_alloc_vs_varyings(data, nir);
2053       break;
2054    }
2055 
2056    case MESA_SHADER_FRAGMENT: {
2057       PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
2058       const struct pvr_render_subpass *const subpass =
2059          &pass->subpasses[pCreateInfo->subpass];
2060       const struct pvr_renderpass_hw_map *subpass_map =
2061          &pass->hw_setup->subpass_map[pCreateInfo->subpass];
2062       const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
2063          &pass->hw_setup->renders[subpass_map->render]
2064              .subpasses[subpass_map->subpass];
2065 
2066       pvr_alloc_fs_sysvals(data, nir);
2067       pvr_alloc_fs_varyings(data, nir);
2068       pvr_setup_fs_outputs(data, nir, subpass, hw_subpass);
2069       pvr_setup_fs_input_attachments(data, nir, subpass, hw_subpass);
2070 
2071       /* TODO: push consts, blend consts, dynamic state, etc. */
2072       break;
2073    }
2074 
2075    default:
2076       unreachable("Unsupported shader stage.");
2077    }
2078 
2079    /* TODO: common things, like large constants being put into shareds. */
2080 }
2081 
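/* Overview of the compile flow implemented below: each active stage is
 * translated from SPIR-V to NIR (vk_pipeline_shader_stage_to_nir), the stages
 * are linked forwards and backwards, per-stage pco_data is pre-processed,
 * lowered and post-processed, the NIR is translated to PCO IR, processed and
 * encoded, the resulting USC binaries are uploaded, and finally the PDS
 * coefficient, fragment, vertex-attribute and descriptor programs are
 * generated and uploaded.
 */
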
2082 /* Compiles and uploads shaders and PDS programs. */
2083 static VkResult
2084 pvr_graphics_pipeline_compile(struct pvr_device *const device,
2085                               struct vk_pipeline_cache *cache,
2086                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
2087                               const VkAllocationCallbacks *const allocator,
2088                               struct pvr_graphics_pipeline *const gfx_pipeline)
2089 {
2090    struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
2091    struct pvr_sh_reg_layout *sh_reg_layout_vert =
2092       &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
2093    struct pvr_sh_reg_layout *sh_reg_layout_frag =
2094       &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
2095    const uint32_t cache_line_size =
2096       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
2097    VkResult result;
2098 
2099    struct pvr_vertex_shader_state *vertex_state =
2100       &gfx_pipeline->shader_state.vertex;
2101    struct pvr_fragment_shader_state *fragment_state =
2102       &gfx_pipeline->shader_state.fragment;
2103 
2104    pco_ctx *pco_ctx = device->pdevice->pco_ctx;
2105    const struct spirv_to_nir_options *spirv_options =
2106       pco_spirv_options(pco_ctx);
2107    const nir_shader_compiler_options *nir_options = pco_nir_options(pco_ctx);
2108 
2109    nir_shader *producer = NULL;
2110    nir_shader *consumer = NULL;
2111    pco_data shader_data[MESA_SHADER_STAGES] = { 0 };
2112    nir_shader *nir_shaders[MESA_SHADER_STAGES] = { 0 };
2113    pco_shader *pco_shaders[MESA_SHADER_STAGES] = { 0 };
2114    pco_shader **vs = &pco_shaders[MESA_SHADER_VERTEX];
2115    pco_shader **fs = &pco_shaders[MESA_SHADER_FRAGMENT];
2116    void *shader_mem_ctx = ralloc_context(NULL);
2117 
2118    struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
2119    uint32_t vtx_dma_count = 0;
2120 
2121    struct pvr_pds_coeff_loading_program frag_coeff_program = { 0 };
2122 
2123    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2124       size_t stage_index = gfx_pipeline->stage_indices[stage];
2125 
2126       /* Skip unused/inactive stages. */
2127       if (stage_index == ~0)
2128          continue;
2129 
2130       result =
2131          vk_pipeline_shader_stage_to_nir(&device->vk,
2132                                          gfx_pipeline->base.pipeline_flags,
2133                                          &pCreateInfo->pStages[stage_index],
2134                                          spirv_options,
2135                                          nir_options,
2136                                          shader_mem_ctx,
2137                                          &nir_shaders[stage]);
2138       if (result != VK_SUCCESS)
2139          goto err_free_build_context;
2140 
2141       pco_preprocess_nir(pco_ctx, nir_shaders[stage]);
2142    }
2143 
2144    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2145       if (!nir_shaders[stage])
2146          continue;
2147 
2148       if (producer)
2149          pco_link_nir(pco_ctx, producer, nir_shaders[stage]);
2150 
2151       producer = nir_shaders[stage];
2152    }
2153 
2154    for (gl_shader_stage stage = MESA_SHADER_STAGES; stage-- > 0;) {
2155       if (!nir_shaders[stage])
2156          continue;
2157 
2158       if (consumer)
2159          pco_rev_link_nir(pco_ctx, nir_shaders[stage], consumer);
2160 
2161       consumer = nir_shaders[stage];
2162    }
2163 
2164    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2165       if (!nir_shaders[stage])
2166          continue;
2167 
2168       pvr_preprocess_shader_data(&shader_data[stage],
2169                                  nir_shaders[stage],
2170                                  pCreateInfo);
2171 
2172       pco_lower_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
2173       pvr_lower_nir(pco_ctx, layout, nir_shaders[stage]);
2174 
2175       pco_postprocess_nir(pco_ctx, nir_shaders[stage], &shader_data[stage]);
2176 
2177       pvr_postprocess_shader_data(&shader_data[stage],
2178                                   nir_shaders[stage],
2179                                   pCreateInfo);
2180    }
2181 
2182    /* TODO NEXT: set up shareds for descriptors, here or in
2183     * pvr_{pre,post}process_shader_data.
2184     */
2185    memset(sh_reg_layout_vert, 0, sizeof(*sh_reg_layout_vert));
2186    memset(sh_reg_layout_frag, 0, sizeof(*sh_reg_layout_frag));
2187 
2188    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
2189       pco_shader **pco = &pco_shaders[stage];
2190 
2191       /* Skip unused/inactive stages. */
2192       if (!nir_shaders[stage])
2193          continue;
2194 
2195       *pco = pco_trans_nir(pco_ctx,
2196                            nir_shaders[stage],
2197                            &shader_data[stage],
2198                            shader_mem_ctx);
2199       if (!*pco) {
2200          result = VK_ERROR_INITIALIZATION_FAILED;
2201          goto err_free_build_context;
2202       }
2203 
2204       pco_process_ir(pco_ctx, *pco);
2205       pco_encode_ir(pco_ctx, *pco);
2206       pco_shader_finalize(pco_ctx, *pco);
2207    }
2208 
2209    pvr_graphics_pipeline_setup_vertex_dma(*vs,
2210                                           pCreateInfo->pVertexInputState,
2211                                           vtx_dma_descriptions,
2212                                           &vtx_dma_count);
2213 
2214    pvr_vertex_state_save(gfx_pipeline, *vs);
2215 
2216    result = pvr_gpu_upload_usc(
2217       device,
2218       pco_shader_binary_data(pco_shaders[MESA_SHADER_VERTEX]),
2219       pco_shader_binary_size(pco_shaders[MESA_SHADER_VERTEX]),
2220       cache_line_size,
2221       &vertex_state->bo);
2222    if (result != VK_SUCCESS)
2223       goto err_free_build_context;
2224 
2225    if (pco_shaders[MESA_SHADER_FRAGMENT]) {
2226       pvr_graphics_pipeline_setup_fragment_coeff_program(
2227          &pco_shader_data(pco_shaders[MESA_SHADER_FRAGMENT])->fs,
2228          &pco_shader_data(pco_shaders[MESA_SHADER_VERTEX])->vs,
2229          nir_shaders[MESA_SHADER_FRAGMENT],
2230          &frag_coeff_program);
2231 
2232       pvr_fragment_state_save(gfx_pipeline, *fs);
2233 
2234       result = pvr_gpu_upload_usc(
2235          device,
2236          pco_shader_binary_data(pco_shaders[MESA_SHADER_FRAGMENT]),
2237          pco_shader_binary_size(pco_shaders[MESA_SHADER_FRAGMENT]),
2238          cache_line_size,
2239          &fragment_state->bo);
2240       if (result != VK_SUCCESS)
2241          goto err_free_vertex_bo;
2242 
2243       /* TODO: powervr has an optimization where it attempts to recompile
2244        * shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
2245        * since in our case the optimization doesn't happen.
2246        */
2247 
2248       result = pvr_pds_coeff_program_create_and_upload(device,
2249                                                        allocator,
2250                                                        &frag_coeff_program,
2251                                                        fragment_state);
2252       if (result != VK_SUCCESS)
2253          goto err_free_fragment_bo;
2254 
2255       result = pvr_pds_fragment_program_create_and_upload(device,
2256                                                           allocator,
2257                                                           *fs,
2258                                                           fragment_state);
2259       if (result != VK_SUCCESS)
2260          goto err_free_coeff_program;
2261 
2262       result = pvr_pds_descriptor_program_create_and_upload(
2263          device,
2264          allocator,
2265          layout,
2266          PVR_STAGE_ALLOCATION_FRAGMENT,
2267          sh_reg_layout_frag,
2268          &fragment_state->descriptor_state);
2269       if (result != VK_SUCCESS)
2270          goto err_free_frag_program;
2271 
2272       /* The descriptor program isn't expected to need any temps; if it ever
2273        * does, we must MAX2() and set `fragment_state->stage_state.pds_temps_count`.
2274        */
2275       assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
2276    }
2277 
2278    result = pvr_pds_vertex_attrib_programs_create_and_upload(
2279       device,
2280       allocator,
2281       pco_shader_data(pco_shaders[MESA_SHADER_VERTEX]),
2282       vtx_dma_descriptions,
2283       vtx_dma_count,
2284       &vertex_state->pds_attrib_programs);
2285    if (result != VK_SUCCESS)
2286       goto err_free_frag_descriptor_program;
2287 
2288    result = pvr_pds_descriptor_program_create_and_upload(
2289       device,
2290       allocator,
2291       layout,
2292       PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
2293       sh_reg_layout_vert,
2294       &vertex_state->descriptor_state);
2295    if (result != VK_SUCCESS)
2296       goto err_free_vertex_attrib_program;
2297 
2298    /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
2299     * scratch buffer for both vertex and fragment stage.
2300     * Figure out the best place to do this.
2301     */
2302    /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
2303    /* TODO: Implement spilling with the above. */
2304 
2305    ralloc_free(shader_mem_ctx);
2306 
2307    return VK_SUCCESS;
2308 
2309 err_free_vertex_attrib_program:
2310    for (uint32_t i = 0; i < ARRAY_SIZE(vertex_state->pds_attrib_programs);
2311         i++) {
2312       struct pvr_pds_attrib_program *const attrib_program =
2313          &vertex_state->pds_attrib_programs[i];
2314 
2315       pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
2316    }
2317 err_free_frag_descriptor_program:
2318    pvr_pds_descriptor_program_destroy(device,
2319                                       allocator,
2320                                       &fragment_state->descriptor_state);
2321 err_free_frag_program:
2322    pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo);
2323 err_free_coeff_program:
2324    pvr_bo_suballoc_free(fragment_state->pds_coeff_program.pvr_bo);
2325 err_free_fragment_bo:
2326    pvr_bo_suballoc_free(fragment_state->bo);
2327 err_free_vertex_bo:
2328    pvr_bo_suballoc_free(vertex_state->bo);
2329 err_free_build_context:
2330    ralloc_free(shader_mem_ctx);
2331    return result;
2332 }
2333 
2334 static struct vk_render_pass_state
2335 pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
2336 {
2337    PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
2338    const struct pvr_render_subpass *const subpass =
2339       &pass->subpasses[info->subpass];
2340 
2341    enum vk_rp_attachment_flags attachments = 0;
2342 
2343    assert(info->subpass < pass->subpass_count);
2344 
2345    for (uint32_t i = 0; i < subpass->color_count; i++) {
2346       if (pass->attachments[subpass->color_attachments[i]].aspects)
2347          attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
2348    }
2349 
2350    if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
2351       VkImageAspectFlags ds_aspects =
2352          pass->attachments[subpass->depth_stencil_attachment].aspects;
2353       if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2354          attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2355       if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
2356          attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2357    }
2358 
2359    return (struct vk_render_pass_state){
2360       .attachments = attachments,
2361 
2362       /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
2363        * which is not currently supported.
2364        */
2365       .view_mask = 0,
2366    };
2367 }
2368 
2369 static VkResult
2370 pvr_graphics_pipeline_init(struct pvr_device *device,
2371                            struct vk_pipeline_cache *cache,
2372                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
2373                            const VkAllocationCallbacks *allocator,
2374                            struct pvr_graphics_pipeline *gfx_pipeline)
2375 {
2376    struct vk_dynamic_graphics_state *const dynamic_state =
2377       &gfx_pipeline->dynamic_state;
2378    const struct vk_render_pass_state rp_state =
2379       pvr_create_renderpass_state(pCreateInfo);
2380 
2381    struct vk_graphics_pipeline_all_state all_state;
2382    struct vk_graphics_pipeline_state state = { 0 };
2383 
2384    VkResult result;
2385 
2386    pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
2387 
2388    result = vk_graphics_pipeline_state_fill(&device->vk,
2389                                             &state,
2390                                             pCreateInfo,
2391                                             &rp_state,
2392                                             0,
2393                                             &all_state,
2394                                             NULL,
2395                                             0,
2396                                             NULL);
2397    if (result != VK_SUCCESS)
2398       goto err_pipeline_finish;
2399 
2400    vk_dynamic_graphics_state_init(dynamic_state);
2401 
2402    /* Load static state into base dynamic state holder. */
2403    vk_dynamic_graphics_state_fill(dynamic_state, &state);
2404 
2405    /* The value of ms.rasterization_samples is undefined when
2406     * rasterizer_discard_enable is set, but we need a specific value.
2407     * Fill that in here.
2408     */
2409    if (state.rs->rasterizer_discard_enable)
2410       dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
2411 
2412    memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
2413 
2414    for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
2415       VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
2416       gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
2417       /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
2418        *
2419        *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
2420        *    or VK_SHADER_STAGE_ALL."
2421        *
2422        * So we don't handle that.
2423        *
2424        * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
2425        * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
2426        * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
2427        * structure returned by the driver.
2428        */
2429       switch (pCreateInfo->pStages[i].stage) {
2430       case VK_SHADER_STAGE_VERTEX_BIT:
2431       case VK_SHADER_STAGE_FRAGMENT_BIT:
2432          gfx_pipeline->stage_indices[gl_stage] = i;
2433          break;
2434       default:
2435          unreachable("Unsupported stage.");
2436       }
2437    }
2438 
2439    gfx_pipeline->base.layout =
2440       pvr_pipeline_layout_from_handle(pCreateInfo->layout);
2441 
2442    /* Compiles and uploads shaders and PDS programs. */
2443    result = pvr_graphics_pipeline_compile(device,
2444                                           cache,
2445                                           pCreateInfo,
2446                                           allocator,
2447                                           gfx_pipeline);
2448    if (result != VK_SUCCESS)
2449       goto err_pipeline_finish;
2450 
2451    return VK_SUCCESS;
2452 
2453 err_pipeline_finish:
2454    pvr_pipeline_finish(&gfx_pipeline->base);
2455 
2456    return result;
2457 }
2458 
2459 /* If allocator == NULL, the internal one will be used. */
2460 static VkResult
2461 pvr_graphics_pipeline_create(struct pvr_device *device,
2462                              struct vk_pipeline_cache *cache,
2463                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
2464                              const VkAllocationCallbacks *allocator,
2465                              VkPipeline *const pipeline_out)
2466 {
2467    struct pvr_graphics_pipeline *gfx_pipeline;
2468    VkResult result;
2469 
2470    gfx_pipeline = vk_zalloc2(&device->vk.alloc,
2471                              allocator,
2472                              sizeof(*gfx_pipeline),
2473                              8,
2474                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2475    if (!gfx_pipeline)
2476       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2477 
2478    /* Compiles and uploads shaders and PDS programs too. */
2479    result = pvr_graphics_pipeline_init(device,
2480                                        cache,
2481                                        pCreateInfo,
2482                                        allocator,
2483                                        gfx_pipeline);
2484    if (result != VK_SUCCESS) {
2485       vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
2486       return result;
2487    }
2488 
2489    *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
2490 
2491    return VK_SUCCESS;
2492 }
2493 
2494 VkResult
2495 pvr_CreateGraphicsPipelines(VkDevice _device,
2496                             VkPipelineCache pipelineCache,
2497                             uint32_t createInfoCount,
2498                             const VkGraphicsPipelineCreateInfo *pCreateInfos,
2499                             const VkAllocationCallbacks *pAllocator,
2500                             VkPipeline *pPipelines)
2501 {
2502    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
2503    PVR_FROM_HANDLE(pvr_device, device, _device);
2504    VkResult result = VK_SUCCESS;
2505 
2506    for (uint32_t i = 0; i < createInfoCount; i++) {
2507       const VkResult local_result =
2508          pvr_graphics_pipeline_create(device,
2509                                       cache,
2510                                       &pCreateInfos[i],
2511                                       pAllocator,
2512                                       &pPipelines[i]);
2513       if (local_result != VK_SUCCESS) {
2514          result = local_result;
2515          pPipelines[i] = VK_NULL_HANDLE;
2516       }
2517    }
2518 
2519    return result;
2520 }
2521 
2522 /*****************************************************************************
2523    Other functions
2524 *****************************************************************************/
2525 
2526 void pvr_DestroyPipeline(VkDevice _device,
2527                          VkPipeline _pipeline,
2528                          const VkAllocationCallbacks *pAllocator)
2529 {
2530    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2531    PVR_FROM_HANDLE(pvr_device, device, _device);
2532 
2533    if (!pipeline)
2534       return;
2535 
2536    switch (pipeline->type) {
2537    case PVR_PIPELINE_TYPE_GRAPHICS: {
2538       struct pvr_graphics_pipeline *const gfx_pipeline =
2539          to_pvr_graphics_pipeline(pipeline);
2540 
2541       pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
2542       break;
2543    }
2544 
2545    case PVR_PIPELINE_TYPE_COMPUTE: {
2546       struct pvr_compute_pipeline *const compute_pipeline =
2547          to_pvr_compute_pipeline(pipeline);
2548 
2549       pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
2550       break;
2551    }
2552 
2553    default:
2554       unreachable("Unknown pipeline type.");
2555    }
2556 }
2557