1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * based in part on v3dv driver which is:
5  * Copyright © 2019 Raspberry Pi
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24  * SOFTWARE.
25  */
26 
27 #include <assert.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <vulkan/vulkan.h>
32 
33 #include "compiler/shader_enums.h"
34 #include "hwdef/rogue_hw_utils.h"
35 #include "nir/nir.h"
36 #include "pvr_bo.h"
37 #include "pvr_csb.h"
38 #include "pvr_csb_enum_helpers.h"
39 #include "pvr_hardcode.h"
40 #include "pvr_pds.h"
41 #include "pvr_private.h"
42 #include "pvr_shader.h"
43 #include "pvr_types.h"
44 #include "rogue/rogue.h"
45 #include "rogue/rogue_build_data.h"
46 #include "util/log.h"
47 #include "util/macros.h"
48 #include "util/ralloc.h"
49 #include "util/u_math.h"
50 #include "vk_alloc.h"
51 #include "vk_log.h"
52 #include "vk_object.h"
53 #include "vk_util.h"
54 
55 /*****************************************************************************
56    PDS functions
57 *****************************************************************************/
58 
59 /* If allocator == NULL, the internal one will be used. */
60 static VkResult pvr_pds_coeff_program_create_and_upload(
61    struct pvr_device *device,
62    const VkAllocationCallbacks *allocator,
63    const uint32_t *fpu_iterators,
64    uint32_t fpu_iterators_count,
65    const uint32_t *destinations,
66    struct pvr_pds_upload *const pds_upload_out)
67 {
68    struct pvr_pds_coeff_loading_program program = {
69       .num_fpu_iterators = fpu_iterators_count,
70    };
71    uint32_t staging_buffer_size;
72    uint32_t *staging_buffer;
73    VkResult result;
74 
75    assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);
76 
77    /* Get the size of the program and then allocate that much memory. */
78    pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);
79 
80    staging_buffer_size =
81       (program.code_size + program.data_size) * sizeof(*staging_buffer);
82 
83    staging_buffer = vk_alloc2(&device->vk.alloc,
84                               allocator,
85                               staging_buffer_size,
86                               8,
87                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
88    if (!staging_buffer)
89       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
90 
91    /* FIXME: Should we save pointers when we redesign the pds gen api? */
92    typed_memcpy(program.FPU_iterators,
93                 fpu_iterators,
94                 program.num_fpu_iterators);
95 
96    typed_memcpy(program.destination, destinations, program.num_fpu_iterators);
97 
98    /* Generate the program into the staging_buffer. */
99    pvr_pds_coefficient_loading(&program,
100                                staging_buffer,
101                                PDS_GENERATE_CODEDATA_SEGMENTS);
102 
103    /* FIXME: Figure out the define for alignment of 16. */
104    result = pvr_gpu_upload_pds(device,
105                                &staging_buffer[0],
106                                program.data_size,
107                                16,
108                                &staging_buffer[program.data_size],
109                                program.code_size,
110                                16,
111                                16,
112                                pds_upload_out);
113    if (result != VK_SUCCESS) {
114       vk_free2(&device->vk.alloc, allocator, staging_buffer);
115       return result;
116    }
117 
118    vk_free2(&device->vk.alloc, allocator, staging_buffer);
119 
120    return VK_SUCCESS;
121 }
122 
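/* Note: the helper above shows the two-pass pattern used by most helpers in
 * this file: run the PDS generator once with PDS_GENERATE_SIZES (or a NULL
 * buffer) to learn the data/code segment sizes, allocate a staging buffer of
 * (code_size + data_size) dwords, generate the segments into it, upload them
 * with pvr_gpu_upload_pds() and then free the staging buffer.
 */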
123 /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
124 /* If allocator == NULL, the internal one will be used. */
125 VkResult pvr_pds_fragment_program_create_and_upload(
126    struct pvr_device *device,
127    const VkAllocationCallbacks *allocator,
128    const struct pvr_bo *fragment_shader_bo,
129    uint32_t fragment_temp_count,
130    enum rogue_msaa_mode msaa_mode,
131    bool has_phase_rate_change,
132    struct pvr_pds_upload *const pds_upload_out)
133 {
134    const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
135       sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
136    struct pvr_pds_kickusc_program program = { 0 };
137    uint32_t staging_buffer_size;
138    uint32_t *staging_buffer;
139    VkResult result;
140 
141    /* FIXME: Should it be passing in the USC offset rather than address here?
142     */
143    /* Note this is not strictly required to be done before calculating the
144     * staging_buffer_size in this particular case. It can also be done after
145     * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
146     */
147    pvr_pds_setup_doutu(&program.usc_task_control,
148                        fragment_shader_bo->vma->dev_addr.addr,
149                        fragment_temp_count,
150                        sample_rate,
151                        has_phase_rate_change);
152 
153    pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
154 
155    staging_buffer_size =
156       (program.code_size + program.data_size) * sizeof(*staging_buffer);
157 
158    staging_buffer = vk_alloc2(&device->vk.alloc,
159                               allocator,
160                               staging_buffer_size,
161                               8,
162                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
163    if (!staging_buffer)
164       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
165 
166    pvr_pds_kick_usc(&program,
167                     staging_buffer,
168                     0,
169                     false,
170                     PDS_GENERATE_CODEDATA_SEGMENTS);
171 
172    /* FIXME: Figure out the define for alignment of 16. */
173    result = pvr_gpu_upload_pds(device,
174                                &staging_buffer[0],
175                                program.data_size,
176                                16,
177                                &staging_buffer[program.data_size],
178                                program.code_size,
179                                16,
180                                16,
181                                pds_upload_out);
182    if (result != VK_SUCCESS) {
183       vk_free2(&device->vk.alloc, allocator, staging_buffer);
184       return result;
185    }
186 
187    vk_free2(&device->vk.alloc, allocator, staging_buffer);
188 
189    return VK_SUCCESS;
190 }
191 
192 static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
193    const struct pvr_device_info *dev_info,
194    bool robust_buffer_access)
195 {
196    /* FIXME: Use more local variables to improve formatting. */
197 
198    /* Maximum memory allocation needed for const map entries in
199     * pvr_pds_generate_vertex_primary_program().
200     * When robustBufferAccess is disabled, it must be >= 410.
201     * When robustBufferAccess is enabled, it must be >= 570.
202     *
203     * 1. Size of entry for base instance
204     *        (pvr_const_map_entry_base_instance)
205     *
206     * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
207     *     if (!robustBufferAccess)
208     *         size of vertex attribute entry
209     *             (pvr_const_map_entry_vertex_attribute_address) +
210     *     else
211     *         size of robust vertex attribute entry
212     *             (pvr_const_map_entry_robust_vertex_attribute_address) +
213     *         size of entry for max attribute index
214     *             (pvr_const_map_entry_vertex_attribute_max_index) +
215     *     fi
216     *     size of Unified Store burst entry
217     *         (pvr_const_map_entry_literal32) +
218     *     size of entry for vertex stride
219     *         (pvr_const_map_entry_literal32) +
220     *     size of entries for DDMAD control word
221     *         (num_ddmad_literals * pvr_const_map_entry_literal32))
222     *
223     * 3. Size of entry for DOUTW vertex/instance control word
224     *     (pvr_const_map_entry_literal32)
225     *
226     * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
227     */
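   /* Restated as one expression (this mirrors the return value below; struct
    * names are shortened for readability):
    *
    *    sizeof(base_instance)
    *       + PVR_MAX_VERTEX_INPUT_BINDINGS
    *            * (attribute_size
    *               + (2 + num_ddmad_literals) * sizeof(literal32))
    *       + sizeof(literal32)
    *       + sizeof(doutu_address)
    */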
228 
229    const size_t attribute_size =
230       (!robust_buffer_access)
231          ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
232          : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
233               sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
234 
235    /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
236     * and is increased by one DWORD to contain the data for the DDMADT's
237     * out-of-bounds check.
238     */
239    const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
240       1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
241 
242    return (sizeof(struct pvr_const_map_entry_base_instance) +
243            PVR_MAX_VERTEX_INPUT_BINDINGS *
244               (attribute_size +
245                (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
246                   sizeof(struct pvr_const_map_entry_literal32)) +
247            sizeof(struct pvr_const_map_entry_literal32) +
248            sizeof(struct pvr_const_map_entry_doutu_address));
249 }
250 
251 /* This is a const pointer to an array of pvr_pds_vertex_dma structs.
252  * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
253  */
254 typedef struct pvr_pds_vertex_dma (
255       *const
256          pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
257 
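/* For illustration only (local names here are hypothetical), a caller such as
 * pvr_pds_vertex_attrib_programs_create_and_upload() below does roughly:
 *
 *    struct pvr_pds_vertex_dma dmas[PVR_MAX_VERTEX_ATTRIB_DMAS];
 *    uint32_t count;
 *
 *    pvr_pds_vertex_attrib_init_dma_descriptions(vtx_state, vs_data,
 *                                                &dmas, &count);
 *
 * so the array bound stays visible in the callee's parameter type.
 */
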
258 /* dma_descriptions_out_ptr is a pointer to the array used as output.
259  * The whole array might not be filled so dma_count_out indicates how many
260  * elements were used.
261  */
262 static void pvr_pds_vertex_attrib_init_dma_descriptions(
263    const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
264    const struct rogue_vs_build_data *vs_data,
265    pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
266    uint32_t *const dma_count_out)
267 {
268    struct pvr_pds_vertex_dma *const dma_descriptions =
269       *dma_descriptions_out_ptr;
270    uint32_t dma_count = 0;
271 
272    if (!vertex_input_state) {
273       *dma_count_out = 0;
274       return;
275    }
276 
277    for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
278         i++) {
279       const VkVertexInputAttributeDescription *const attrib_desc =
280          &vertex_input_state->pVertexAttributeDescriptions[i];
281       const VkVertexInputBindingDescription *binding_desc = NULL;
282 
283       /* Find the matching binding description. */
284       for (uint32_t j = 0;
285            j < vertex_input_state->vertexBindingDescriptionCount;
286            j++) {
287          const VkVertexInputBindingDescription *const current_binding_desc =
288             &vertex_input_state->pVertexBindingDescriptions[j];
289 
290          if (current_binding_desc->binding == attrib_desc->binding) {
291             binding_desc = current_binding_desc;
292             break;
293          }
294       }
295 
296       /* From the Vulkan 1.2.195 spec for
297        * VkPipelineVertexInputStateCreateInfo:
298        *
299        *    "For every binding specified by each element of
300        *    pVertexAttributeDescriptions, a
301        *    VkVertexInputBindingDescription must exist in
302        *    pVertexBindingDescriptions with the same value of binding"
303        *
304        * So we don't check if we found the matching binding description
305        * or not.
306        */
307 
308       struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];
309 
310       size_t location = attrib_desc->location;
311       assert(location < vs_data->inputs.num_input_vars);
312 
313       dma_desc->offset = attrib_desc->offset;
314       dma_desc->stride = binding_desc->stride;
315 
316       dma_desc->flags = 0;
317 
318       if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
319          dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
320 
321       dma_desc->size_in_dwords = vs_data->inputs.components[location];
322       /* TODO: This will be different when other types are supported.
323        * Store in vs_data with base and components?
324        */
325       /* TODO: Use attrib_desc->format. */
326       dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
327       dma_desc->destination = vs_data->inputs.base[location];
328       dma_desc->binding_index = attrib_desc->binding;
329       dma_desc->divisor = 1;
330       dma_desc->robustness_buffer_offset = 0;
331 
332       ++dma_count;
333    }
334 
335    *dma_count_out = dma_count;
336 }
337 
338 static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
339    struct pvr_device *const device,
340    const VkAllocationCallbacks *const allocator,
341    struct pvr_pds_vertex_primary_program_input *const input,
342    struct pvr_pds_attrib_program *const program_out)
343 {
344    const size_t const_entries_size_in_bytes =
345       pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
346          &device->pdevice->dev_info,
347          device->features.robustBufferAccess);
348    struct pvr_pds_upload *const program = &program_out->program;
349    struct pvr_pds_info *const info = &program_out->info;
350    struct pvr_const_map_entry *entries_buffer;
351    ASSERTED uint32_t code_size_in_dwords;
352    size_t staging_buffer_size;
353    uint32_t *staging_buffer;
354    VkResult result;
355 
356    memset(info, 0, sizeof(*info));
357 
358    entries_buffer = vk_alloc2(&device->vk.alloc,
359                               allocator,
360                               const_entries_size_in_bytes,
361                               8,
362                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
363    if (!entries_buffer)
364       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
365 
366    info->entries = entries_buffer;
367    info->entries_size_in_bytes = const_entries_size_in_bytes;
368 
369    pvr_pds_generate_vertex_primary_program(input,
370                                            NULL,
371                                            info,
372                                            device->features.robustBufferAccess,
373                                            &device->pdevice->dev_info);
374 
375    code_size_in_dwords = info->code_size_in_dwords;
376    staging_buffer_size = info->code_size_in_dwords * sizeof(*staging_buffer);
377 
378    staging_buffer = vk_alloc2(&device->vk.alloc,
379                               allocator,
380                               staging_buffer_size,
381                               8,
382                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
383    if (!staging_buffer) {
384       vk_free2(&device->vk.alloc, allocator, entries_buffer);
385       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
386    }
387 
388    /* This also fills in info->entries. */
389    pvr_pds_generate_vertex_primary_program(input,
390                                            staging_buffer,
391                                            info,
392                                            device->features.robustBufferAccess,
393                                            &device->pdevice->dev_info);
394 
395    assert(info->code_size_in_dwords <= code_size_in_dwords);
396 
397    /* FIXME: Add a vk_realloc2()? */
398    entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
399                                entries_buffer,
400                                info->entries_written_size_in_bytes,
401                                8,
402                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
403    if (!entries_buffer) {
404       vk_free2(&device->vk.alloc, allocator, staging_buffer);
405       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
406    }
407 
408    info->entries = entries_buffer;
409    info->entries_size_in_bytes = info->entries_written_size_in_bytes;
410 
411    /* FIXME: Figure out the define for alignment of 16. */
412    result = pvr_gpu_upload_pds(device,
413                                NULL,
414                                0,
415                                0,
416                                staging_buffer,
417                                info->code_size_in_dwords,
418                                16,
419                                16,
420                                program);
421    if (result != VK_SUCCESS) {
422       vk_free2(&device->vk.alloc, allocator, entries_buffer);
423       vk_free2(&device->vk.alloc, allocator, staging_buffer);
424 
425       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
426    }
427 
428    vk_free2(&device->vk.alloc, allocator, staging_buffer);
429 
430    return VK_SUCCESS;
431 }
432 
433 static inline void pvr_pds_vertex_attrib_program_destroy(
434    struct pvr_device *const device,
435    const struct VkAllocationCallbacks *const allocator,
436    struct pvr_pds_attrib_program *const program)
437 {
438    pvr_bo_free(device, program->program.pvr_bo);
439    vk_free2(&device->vk.alloc, allocator, program->info.entries);
440 }
441 
442 /* This is a const pointer to an array of pvr_pds_attrib_program structs.
443  * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
444  */
445 typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
446    [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
447 
448 /* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
449  * inputs. This will bake the code segment and create a template of the data
450  * segment for the command buffer to fill in.
451  */
452 /* If allocator == NULL, the internal one will be used.
453  *
454  * programs_out_ptr is a pointer to the array where the outputs will be placed.
455  */
456 static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
457    struct pvr_device *device,
458    const VkAllocationCallbacks *const allocator,
459    const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
460    uint32_t usc_temp_count,
461    const struct rogue_vs_build_data *vs_data,
462    pvr_pds_attrib_programs_array_ptr programs_out_ptr)
463 {
464    struct pvr_pds_vertex_dma dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
465    struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
466    struct pvr_pds_vertex_primary_program_input input = {
467       .dma_list = dma_descriptions,
468    };
469    VkResult result;
470 
471    pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
472                                                vs_data,
473                                                &dma_descriptions,
474                                                &input.dma_count);
475 
476    pvr_pds_setup_doutu(&input.usc_task_control,
477                        0,
478                        usc_temp_count,
479                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
480                        false);
481 
482    /* TODO: If statements for all the "bRequired"s + ui32ExtraFlags. */
483 
484    /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
485     * typedef.
486     */
487    for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
488       switch (i) {
489       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
490          input.flags = 0;
491          break;
492 
493       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
494          input.flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
495          break;
496 
497       case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
498          /* We unset INSTANCE and set INDIRECT. */
499          input.flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
500          break;
501 
502       default:
503          unreachable("Invalid vertex attrib program type.");
504       }
505 
506       result =
507          pvr_pds_vertex_attrib_program_create_and_upload(device,
508                                                          allocator,
509                                                          &input,
510                                                          &programs_out[i]);
511       if (result != VK_SUCCESS) {
512          for (uint32_t j = 0; j < i; j++) {
513             pvr_pds_vertex_attrib_program_destroy(device,
514                                                   allocator,
515                                                   &programs_out[j]);
516          }
517 
518          return result;
519       }
520    }
521 
522    return VK_SUCCESS;
523 }
524 
525 static size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes()
526 {
527    /* Maximum memory allocation needed for const map entries in
528     * pvr_pds_generate_descriptor_upload_program().
529     * It must be >= 688 bytes. This size is calculated as the sum of:
530     *
531     *  1. Max. number of descriptor sets (8) * (
532     *         size of descriptor entry
533     *             (pvr_const_map_entry_descriptor_set) +
534     *         size of Common Store burst entry
535     *             (pvr_const_map_entry_literal32))
536     *
537     *  2. Max. number of PDS program buffers (24) * (
538     *         size of the largest buffer structure
539     *             (pvr_const_map_entry_constant_buffer) +
540     *         size of Common Store burst entry
541     *             (pvr_const_map_entry_literal32))
542     *
543     *  3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
544     */
545 
546    /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
547     * say that it should be 8.
548     * Figure out a define for this, or is the comment wrong?
549     */
550    return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
551                 sizeof(struct pvr_const_map_entry_literal32)) +
552            PVR_PDS_MAX_BUFFERS *
553               (sizeof(struct pvr_const_map_entry_constant_buffer) +
554                sizeof(struct pvr_const_map_entry_literal32)) +
555            sizeof(struct pvr_const_map_entry_doutu_address));
556 }
557 
558 /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
559  * structs.
560  */
561 typedef struct pvr_pds_buffer (
562       *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];
563 
564 /**
565  * \brief Setup buffers for the PDS descriptor program.
566  *
567  * Sets up buffers required by the PDS gen api based on compiler info.
568  *
569  * For compile-time static constants that need DMAing, it uploads them and
570  * returns the upload in \p static_consts_pvr_bo_out.
571  */
572 static VkResult pvr_pds_descriptor_program_setup_buffers(
573    struct pvr_device *device,
574    bool robust_buffer_access,
575    const struct rogue_compile_time_consts_data *compile_time_consts_data,
576    const struct rogue_ubo_data *ubo_data,
577    pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
578    uint32_t *const buffer_count_out,
579    struct pvr_bo **const static_consts_pvr_bo_out)
580 {
581    struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
582    uint32_t buffer_count = 0;
583 
584    for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
585       struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];
586 
587       /* This is fine since buffers_out_ptr is a pointer to an array. */
588       assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
589 
590       current_buffer->type = PVR_BUFFER_TYPE_UBO;
591       current_buffer->size_in_dwords = ubo_data->size[i];
592       current_buffer->destination = ubo_data->dest[i];
593 
594       current_buffer->buffer_id = buffer_count;
595       current_buffer->desc_set = ubo_data->desc_set[i];
596       current_buffer->binding = ubo_data->binding[i];
597       /* TODO: Is this always the case?
598        * E.g. can multiple UBOs have the same base buffer?
599        */
600       current_buffer->source_offset = 0;
601 
602       buffer_count++;
603    }
604 
605    if (compile_time_consts_data->static_consts.num > 0) {
606       VkResult result;
607 
608       assert(compile_time_consts_data->static_consts.num <=
609              ARRAY_SIZE(compile_time_consts_data->static_consts.value));
610 
611       /* This is fine since buffers_out_ptr is a pointer to an array. */
612       assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
613 
614       /* TODO: Is it possible to have multiple static consts buffers where the
615        * destinations are not adjoining? If so we need to handle that.
616        * Currently we're only setting up a single buffer.
617        */
618       buffers[buffer_count++] = (struct pvr_pds_buffer){
619          .type = PVR_BUFFER_TYPES_COMPILE_TIME,
620          .size_in_dwords = compile_time_consts_data->static_consts.num,
621          .destination = compile_time_consts_data->static_consts.dest,
622       };
623 
624       result = pvr_gpu_upload(device,
625                               device->heaps.general_heap,
626                               compile_time_consts_data->static_consts.value,
627                               compile_time_consts_data->static_consts.num *
628                                  ROGUE_REG_SIZE_BYTES,
629                               ROGUE_REG_SIZE_BYTES,
630                               static_consts_pvr_bo_out);
631       if (result != VK_SUCCESS)
632          return result;
633    } else {
634       *static_consts_pvr_bo_out = NULL;
635    }
636 
637    *buffer_count_out = buffer_count;
638 
639    return VK_SUCCESS;
640 }
641 
642 static VkResult pvr_pds_descriptor_program_create_and_upload(
643    struct pvr_device *const device,
644    const VkAllocationCallbacks *const allocator,
645    const struct rogue_compile_time_consts_data *const compile_time_consts_data,
646    const struct rogue_ubo_data *const ubo_data,
647    const struct pvr_explicit_constant_usage *const explicit_const_usage,
648    const struct pvr_pipeline_layout *const layout,
649    enum pvr_stage_allocation stage,
650    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
651 {
652    const size_t const_entries_size_in_bytes =
653       pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
654    struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
655    struct pvr_descriptor_program_input program = { 0 };
656    struct pvr_const_map_entry *entries_buffer;
657    ASSERTED uint32_t code_size_in_dwords;
658    uint32_t staging_buffer_size;
659    uint32_t *staging_buffer;
660    VkResult result;
661 
662    assert(stage != PVR_STAGE_ALLOCATION_COUNT);
663 
664    *pds_info = (struct pvr_pds_info){ 0 };
665 
666    result = pvr_pds_descriptor_program_setup_buffers(
667       device,
668       device->features.robustBufferAccess,
669       compile_time_consts_data,
670       ubo_data,
671       &program.buffers,
672       &program.buffer_count,
673       &descriptor_state->static_consts);
674    if (result != VK_SUCCESS)
675       return result;
676 
677    if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
678       assert(!"Unimplemented");
679 
680    for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
681       const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
682          &layout->register_layout_in_dwords_per_stage[stage][set_num];
683       const uint32_t start_offset = explicit_const_usage->start_offset;
684 
685       /* TODO: Use compiler usage info to optimize this? */
686 
687       /* Only dma primaries if they are actually required. */
688       if (reg_layout->primary_size) {
689          program.descriptor_sets[program.descriptor_set_count++] =
690             (struct pvr_pds_descriptor_set){
691                .descriptor_set = set_num,
692                .size_in_dwords = reg_layout->primary_size,
693                .destination = reg_layout->primary_offset + start_offset,
694                .primary = true,
695             };
696       }
697 
698       /* Only dma secondaries if they are actually required. */
699       if (!reg_layout->secondary_size)
700          continue;
701 
702       program.descriptor_sets[program.descriptor_set_count++] =
703          (struct pvr_pds_descriptor_set){
704             .descriptor_set = set_num,
705             .size_in_dwords = reg_layout->secondary_size,
706             .destination = reg_layout->secondary_offset + start_offset,
707          };
708    }
709 
710    entries_buffer = vk_alloc2(&device->vk.alloc,
711                               allocator,
712                               const_entries_size_in_bytes,
713                               8,
714                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
715    if (!entries_buffer) {
716       pvr_bo_free(device, descriptor_state->static_consts);
717 
718       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
719    }
720 
721    pds_info->entries = entries_buffer;
722    pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
723 
724    pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
725 
726    code_size_in_dwords = pds_info->code_size_in_dwords;
727    staging_buffer_size =
728       pds_info->code_size_in_dwords * sizeof(*staging_buffer);
729 
730    if (!staging_buffer_size) {
731       vk_free2(&device->vk.alloc, allocator, entries_buffer);
732 
733       *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
734 
735       return VK_SUCCESS;
736    }
737 
738    staging_buffer = vk_alloc2(&device->vk.alloc,
739                               allocator,
740                               staging_buffer_size,
741                               8,
742                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
743    if (!staging_buffer) {
744       pvr_bo_free(device, descriptor_state->static_consts);
745       vk_free2(&device->vk.alloc, allocator, entries_buffer);
746 
747       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
748    }
749 
750    pvr_pds_generate_descriptor_upload_program(&program,
751                                               staging_buffer,
752                                               pds_info);
753 
754    assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
755 
756    /* FIXME: Use vk_realloc2()? */
757    entries_buffer = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
758                                entries_buffer,
759                                pds_info->entries_written_size_in_bytes,
760                                8,
761                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
762    if (!entries_buffer) {
763       pvr_bo_free(device, descriptor_state->static_consts);
764       vk_free2(&device->vk.alloc, allocator, staging_buffer);
765 
766       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
767    }
768 
769    pds_info->entries = entries_buffer;
770    pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
771 
772    /* FIXME: Figure out the define for alignment of 16. */
773    result = pvr_gpu_upload_pds(device,
774                                NULL,
775                                0,
776                                0,
777                                staging_buffer,
778                                pds_info->code_size_in_dwords,
779                                16,
780                                16,
781                                &descriptor_state->pds_code);
782    if (result != VK_SUCCESS) {
783       pvr_bo_free(device, descriptor_state->static_consts);
784       vk_free2(&device->vk.alloc, allocator, entries_buffer);
785       vk_free2(&device->vk.alloc, allocator, staging_buffer);
786 
787       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
788    }
789 
790    vk_free2(&device->vk.alloc, allocator, staging_buffer);
791 
792    return VK_SUCCESS;
793 }
794 
795 static void pvr_pds_descriptor_program_destroy(
796    struct pvr_device *const device,
797    const struct VkAllocationCallbacks *const allocator,
798    struct pvr_stage_allocation_descriptor_state *const descriptor_state)
799 {
800    pvr_bo_free(device, descriptor_state->pds_code.pvr_bo);
801    vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
802    pvr_bo_free(device, descriptor_state->static_consts);
803 }
804 
805 static void pvr_pds_compute_program_setup(
806    const struct pvr_device_info *dev_info,
807    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
808    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
809    uint32_t barrier_coefficient,
810    bool add_base_workgroup,
811    uint32_t usc_temps,
812    pvr_dev_addr_t usc_shader_dev_addr,
813    struct pvr_pds_compute_shader_program *const program)
814 {
815    *program = (struct pvr_pds_compute_shader_program){
816       /* clang-format off */
817       .local_input_regs = {
818          local_input_regs[0],
819          local_input_regs[1],
820          local_input_regs[2]
821       },
822       .work_group_input_regs = {
823          work_group_input_regs[0],
824          work_group_input_regs[1],
825          work_group_input_regs[2]
826       },
827       .global_input_regs = {
828          [0 ... (PVR_WORKGROUP_DIMENSIONS - 1)] =
829             PVR_PDS_COMPUTE_INPUT_REG_UNUSED
830       },
831       /* clang-format on */
832       .barrier_coefficient = barrier_coefficient,
833       .flattened_work_groups = true,
834       .clear_pds_barrier = false,
835       .add_base_workgroup = add_base_workgroup,
836       .kick_usc = true,
837    };
838 
839    STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
840                  PVR_WORKGROUP_DIMENSIONS);
841    STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
842                  PVR_WORKGROUP_DIMENSIONS);
843    STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
844                  PVR_WORKGROUP_DIMENSIONS);
845 
846    pvr_pds_setup_doutu(&program->usc_task_control,
847                        usc_shader_dev_addr.addr,
848                        usc_temps,
849                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
850                        false);
851 
852    pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
853 }
854 
855 /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
856  */
857 static VkResult pvr_pds_compute_program_create_and_upload(
858    struct pvr_device *const device,
859    const VkAllocationCallbacks *const allocator,
860    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
861    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
862    uint32_t barrier_coefficient,
863    uint32_t usc_temps,
864    pvr_dev_addr_t usc_shader_dev_addr,
865    struct pvr_pds_upload *const pds_upload_out,
866    struct pvr_pds_info *const pds_info_out)
867 {
868    struct pvr_device_info *dev_info = &device->pdevice->dev_info;
869    struct pvr_pds_compute_shader_program program;
870    uint32_t staging_buffer_size;
871    uint32_t *staging_buffer;
872    VkResult result;
873 
874    pvr_pds_compute_program_setup(dev_info,
875                                  local_input_regs,
876                                  work_group_input_regs,
877                                  barrier_coefficient,
878                                  false,
879                                  usc_temps,
880                                  usc_shader_dev_addr,
881                                  &program);
882 
883    /* FIXME: According to pvr_device_init_compute_pds_program() the code size
884     * is in bytes. Investigate this.
885     */
886    staging_buffer_size =
887       (program.code_size + program.data_size) * sizeof(*staging_buffer);
888 
889    staging_buffer = vk_alloc2(&device->vk.alloc,
890                               allocator,
891                               staging_buffer_size,
892                               8,
893                               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
894    if (!staging_buffer)
895       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
896 
897    /* FIXME: pvr_pds_compute_shader doesn't implement
898     * PDS_GENERATE_CODEDATA_SEGMENTS.
899     */
900    pvr_pds_compute_shader(&program,
901                           &staging_buffer[0],
902                           PDS_GENERATE_CODE_SEGMENT,
903                           dev_info);
904 
905    pvr_pds_compute_shader(&program,
906                           &staging_buffer[program.code_size],
907                           PDS_GENERATE_DATA_SEGMENT,
908                           dev_info);
909 
910    /* FIXME: Figure out the define for alignment of 16. */
911    result = pvr_gpu_upload_pds(device,
912                                &staging_buffer[program.code_size],
913                                program.data_size,
914                                16,
915                                &staging_buffer[0],
916                                program.code_size,
917                                16,
918                                16,
919                                pds_upload_out);
920    if (result != VK_SUCCESS) {
921       vk_free2(&device->vk.alloc, allocator, staging_buffer);
922       return result;
923    }
924 
925    *pds_info_out = (struct pvr_pds_info){
926       .temps_required = program.highest_temp,
927       .code_size_in_dwords = program.code_size,
928       .data_size_in_dwords = program.data_size,
929    };
930 
931    vk_free2(&device->vk.alloc, allocator, staging_buffer);
932 
933    return VK_SUCCESS;
934 }
935 
936 static void pvr_pds_compute_program_destroy(
937    struct pvr_device *const device,
938    const struct VkAllocationCallbacks *const allocator,
939    struct pvr_pds_upload *const pds_program,
940    struct pvr_pds_info *const pds_info)
941 {
942    /* We don't allocate an entries buffer so we don't need to free it */
943    pvr_bo_free(device, pds_program->pvr_bo);
944 }
945 
946 /* This only uploads the code segment. The data segment will need to be patched
947  * with the base workgroup before uploading.
948  */
949 static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
950    struct pvr_device *const device,
951    const VkAllocationCallbacks *const allocator,
952    const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
953    const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
954    uint32_t barrier_coefficient,
955    uint32_t usc_temps,
956    pvr_dev_addr_t usc_shader_dev_addr,
957    struct pvr_pds_base_workgroup_program *program_out)
958 {
959    struct pvr_device_info *dev_info = &device->pdevice->dev_info;
960    struct pvr_pds_compute_shader_program program;
961    uint32_t buffer_size;
962    uint32_t *buffer;
963    VkResult result;
964 
965    pvr_pds_compute_program_setup(dev_info,
966                                  local_input_regs,
967                                  work_group_input_regs,
968                                  barrier_coefficient,
969                                  true,
970                                  usc_temps,
971                                  usc_shader_dev_addr,
972                                  &program);
973 
974    /* FIXME: According to pvr_device_init_compute_pds_program() the code size
975     * is in bytes. Investigate this.
976     */
977    buffer_size = MAX2(program.code_size, program.data_size) * sizeof(*buffer);
978 
979    buffer = vk_alloc2(&device->vk.alloc,
980                       allocator,
981                       buffer_size,
982                       8,
983                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
984    if (!buffer)
985       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
986 
987    pvr_pds_compute_shader(&program,
988                           &buffer[0],
989                           PDS_GENERATE_CODE_SEGMENT,
990                           dev_info);
991 
992    /* FIXME: Figure out the define for alignment of 16. */
993    result = pvr_gpu_upload_pds(device,
994                                NULL,
995                                0,
996                                0,
997                                buffer,
998                                program.code_size,
999                                16,
1000                                16,
1001                                &program_out->code_upload);
1002    if (result != VK_SUCCESS) {
1003       vk_free2(&device->vk.alloc, allocator, buffer);
1004       return result;
1005    }
1006 
1007    pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
1008 
1009    program_out->data_section = buffer;
1010 
1011    /* We'll need to patch the base workgroup in the PDS data section before
1012     * dispatch so we save the offsets at which to patch. We only need to save
1013     * the offset for the first workgroup id since the workgroup ids are stored
1014     * contiguously in the data segment.
1015     */
1016    program_out->base_workgroup_data_patching_offset =
1017       program.base_workgroup_constant_offset_in_dwords[0];
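   /* A dispatch-time patch would then look roughly like the following sketch
    * (illustrative only; "base_workgroup" stands for the dispatch's base
    * workgroup IDs, which are stored contiguously per the comment above):
    *
    *    for (uint32_t i = 0; i < PVR_WORKGROUP_DIMENSIONS; i++) {
    *       program_out->data_section
    *          [program_out->base_workgroup_data_patching_offset + i] =
    *             base_workgroup[i];
    *    }
    */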
1018 
1019    program_out->info = (struct pvr_pds_info){
1020       .temps_required = program.highest_temp,
1021       .code_size_in_dwords = program.code_size,
1022       .data_size_in_dwords = program.data_size,
1023    };
1024 
1025    return VK_SUCCESS;
1026 }
1027 
1028 static void pvr_pds_compute_base_workgroup_variant_program_finish(
1029    struct pvr_device *device,
1030    const VkAllocationCallbacks *const allocator,
1031    struct pvr_pds_base_workgroup_program *const state)
1032 {
1033    pvr_bo_free(device, state->code_upload.pvr_bo);
1034    vk_free2(&device->vk.alloc, allocator, state->data_section);
1035 }
1036 
1037 /******************************************************************************
1038    Generic pipeline functions
1039  ******************************************************************************/
1040 
1041 static void pvr_pipeline_init(struct pvr_device *device,
1042                               enum pvr_pipeline_type type,
1043                               struct pvr_pipeline *const pipeline)
1044 {
1045    assert(!pipeline->layout);
1046 
1047    vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
1048 
1049    pipeline->type = type;
1050 }
1051 
1052 static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
1053 {
1054    vk_object_base_finish(&pipeline->base);
1055 }
1056 
1057 /******************************************************************************
1058    Compute pipeline functions
1059  ******************************************************************************/
1060 
1061 /* Compiles and uploads shaders and PDS programs. */
1062 static VkResult pvr_compute_pipeline_compile(
1063    struct pvr_device *const device,
1064    struct pvr_pipeline_cache *pipeline_cache,
1065    const VkComputePipelineCreateInfo *pCreateInfo,
1066    const VkAllocationCallbacks *const allocator,
1067    struct pvr_compute_pipeline *const compute_pipeline)
1068 {
1069    struct rogue_compile_time_consts_data compile_time_consts_data;
1070    uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
1071    struct pvr_explicit_constant_usage explicit_const_usage;
1072    uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
1073    struct rogue_ubo_data ubo_data;
1074    uint32_t barrier_coefficient;
1075    uint32_t usc_temps;
1076    VkResult result;
1077 
1078    if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
1079       struct pvr_hard_code_compute_build_info build_info;
1080 
1081       result = pvr_hard_code_compute_pipeline(device,
1082                                               &compute_pipeline->state.shader,
1083                                               &build_info);
1084       if (result != VK_SUCCESS)
1085          return result;
1086 
1087       ubo_data = build_info.ubo_data;
1088       compile_time_consts_data = build_info.compile_time_consts_data;
1089 
1090       /* We make sure that the compiler's unused reg value is compatible with
1091        * the pds api.
1092        */
1093       STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);
1094 
1095       barrier_coefficient = build_info.barrier_reg;
1096 
1097       /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
1098       local_input_regs[0] = build_info.local_invocation_regs[0];
1099       local_input_regs[1] = build_info.local_invocation_regs[1];
1100       /* This is not a mistake. We want to assign element 1 to 2. */
1101       local_input_regs[2] = build_info.local_invocation_regs[1];
1102 
1103       STATIC_ASSERT(
1104          __same_type(work_group_input_regs, build_info.work_group_regs));
1105       typed_memcpy(work_group_input_regs,
1106                    build_info.work_group_regs,
1107                    PVR_WORKGROUP_DIMENSIONS);
1108 
1109       usc_temps = build_info.usc_temps;
1110 
1111       explicit_const_usage = build_info.explicit_conts_usage;
1112 
1113    } else {
1114       /* FIXME: Compile and upload the shader. */
1115       /* FIXME: Initialize the shader state and setup build info. */
1116       abort();
1117    }
1118 
1119    result = pvr_pds_descriptor_program_create_and_upload(
1120       device,
1121       allocator,
1122       &compile_time_consts_data,
1123       &ubo_data,
1124       &explicit_const_usage,
1125       compute_pipeline->base.layout,
1126       PVR_STAGE_ALLOCATION_COMPUTE,
1127       &compute_pipeline->state.descriptor);
1128    if (result != VK_SUCCESS)
1129       goto err_free_shader;
1130 
1131    result = pvr_pds_compute_program_create_and_upload(
1132       device,
1133       allocator,
1134       local_input_regs,
1135       work_group_input_regs,
1136       barrier_coefficient,
1137       usc_temps,
1138       compute_pipeline->state.shader.bo->vma->dev_addr,
1139       &compute_pipeline->state.primary_program,
1140       &compute_pipeline->state.primary_program_info);
1141    if (result != VK_SUCCESS)
1142       goto err_free_descriptor_program;
1143 
1144    /* If the workgroup ID is required, then we require the base workgroup
1145     * variant of the PDS compute program as well.
1146     */
1147    compute_pipeline->state.flags.base_workgroup =
1148       work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1149       work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1150       work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
1151 
1152    if (compute_pipeline->state.flags.base_workgroup) {
1153       result = pvr_pds_compute_base_workgroup_variant_program_init(
1154          device,
1155          allocator,
1156          local_input_regs,
1157          work_group_input_regs,
1158          barrier_coefficient,
1159          usc_temps,
1160          compute_pipeline->state.shader.bo->vma->dev_addr,
1161          &compute_pipeline->state.primary_base_workgroup_variant_program);
1162       if (result != VK_SUCCESS)
1163          goto err_destroy_compute_program;
1164    }
1165 
1166    return VK_SUCCESS;
1167 
1168 err_destroy_compute_program:
1169    pvr_pds_compute_program_destroy(
1170       device,
1171       allocator,
1172       &compute_pipeline->state.primary_program,
1173       &compute_pipeline->state.primary_program_info);
1174 
1175 err_free_descriptor_program:
1176    pvr_bo_free(device, compute_pipeline->state.descriptor.pds_code.pvr_bo);
1177 
1178 err_free_shader:
1179    pvr_bo_free(device, compute_pipeline->state.shader.bo);
1180 
1181    return result;
1182 }
1183 
1184 static VkResult
1185 pvr_compute_pipeline_init(struct pvr_device *device,
1186                           struct pvr_pipeline_cache *pipeline_cache,
1187                           const VkComputePipelineCreateInfo *pCreateInfo,
1188                           const VkAllocationCallbacks *allocator,
1189                           struct pvr_compute_pipeline *compute_pipeline)
1190 {
1191    VkResult result;
1192 
1193    pvr_pipeline_init(device,
1194                      PVR_PIPELINE_TYPE_COMPUTE,
1195                      &compute_pipeline->base);
1196 
1197    compute_pipeline->base.layout =
1198       pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1199 
1200    result = pvr_compute_pipeline_compile(device,
1201                                          pipeline_cache,
1202                                          pCreateInfo,
1203                                          allocator,
1204                                          compute_pipeline);
1205    if (result != VK_SUCCESS) {
1206       pvr_pipeline_finish(&compute_pipeline->base);
1207       return result;
1208    }
1209 
1210    return VK_SUCCESS;
1211 }
1212 
1213 static VkResult
1214 pvr_compute_pipeline_create(struct pvr_device *device,
1215                             struct pvr_pipeline_cache *pipeline_cache,
1216                             const VkComputePipelineCreateInfo *pCreateInfo,
1217                             const VkAllocationCallbacks *allocator,
1218                             VkPipeline *const pipeline_out)
1219 {
1220    struct pvr_compute_pipeline *compute_pipeline;
1221    VkResult result;
1222 
1223    compute_pipeline = vk_zalloc2(&device->vk.alloc,
1224                                  allocator,
1225                                  sizeof(*compute_pipeline),
1226                                  8,
1227                                  VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1228    if (!compute_pipeline)
1229       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1230 
1231    /* Compiles and uploads shaders and PDS programs. */
1232    result = pvr_compute_pipeline_init(device,
1233                                       pipeline_cache,
1234                                       pCreateInfo,
1235                                       allocator,
1236                                       compute_pipeline);
1237    if (result != VK_SUCCESS) {
1238       vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1239       return result;
1240    }
1241 
1242    *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
1243 
1244    return VK_SUCCESS;
1245 }
1246 
1247 static void pvr_compute_pipeline_destroy(
1248    struct pvr_device *const device,
1249    const VkAllocationCallbacks *const allocator,
1250    struct pvr_compute_pipeline *const compute_pipeline)
1251 {
1252    if (compute_pipeline->state.flags.base_workgroup) {
1253       pvr_pds_compute_base_workgroup_variant_program_finish(
1254          device,
1255          allocator,
1256          &compute_pipeline->state.primary_base_workgroup_variant_program);
1257    }
1258 
1259    pvr_pds_compute_program_destroy(
1260       device,
1261       allocator,
1262       &compute_pipeline->state.primary_program,
1263       &compute_pipeline->state.primary_program_info);
1264    pvr_pds_descriptor_program_destroy(device,
1265                                       allocator,
1266                                       &compute_pipeline->state.descriptor);
1267    pvr_bo_free(device, compute_pipeline->state.shader.bo);
1268 
1269    pvr_pipeline_finish(&compute_pipeline->base);
1270 
1271    vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1272 }
1273 
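/* Creation carries on even if an individual pipeline fails: the failing
 * entries are set to VK_NULL_HANDLE and the most recent error code is
 * returned, while successfully created entries remain valid.
 */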
1274 VkResult
1275 pvr_CreateComputePipelines(VkDevice _device,
1276                            VkPipelineCache pipelineCache,
1277                            uint32_t createInfoCount,
1278                            const VkComputePipelineCreateInfo *pCreateInfos,
1279                            const VkAllocationCallbacks *pAllocator,
1280                            VkPipeline *pPipelines)
1281 {
1282    PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
1283    PVR_FROM_HANDLE(pvr_device, device, _device);
1284    VkResult result = VK_SUCCESS;
1285 
1286    for (uint32_t i = 0; i < createInfoCount; i++) {
1287       const VkResult local_result =
1288          pvr_compute_pipeline_create(device,
1289                                      pipeline_cache,
1290                                      &pCreateInfos[i],
1291                                      pAllocator,
1292                                      &pPipelines[i]);
1293       if (local_result != VK_SUCCESS) {
1294          result = local_result;
1295          pPipelines[i] = VK_NULL_HANDLE;
1296       }
1297    }
1298 
1299    return result;
1300 }
1301 
1302 /******************************************************************************
1303    Graphics pipeline functions
1304  ******************************************************************************/
1305 
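/* Translates a Vulkan dynamic state enum into the driver's internal
 * PVR_DYNAMIC_STATE_BIT_* flag so the set of dynamic states can be tracked
 * in a single bitmask.
 */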
1306 static inline uint32_t pvr_dynamic_state_bit_from_vk(VkDynamicState state)
1307 {
1308    switch (state) {
1309    case VK_DYNAMIC_STATE_VIEWPORT:
1310       return PVR_DYNAMIC_STATE_BIT_VIEWPORT;
1311    case VK_DYNAMIC_STATE_SCISSOR:
1312       return PVR_DYNAMIC_STATE_BIT_SCISSOR;
1313    case VK_DYNAMIC_STATE_LINE_WIDTH:
1314       return PVR_DYNAMIC_STATE_BIT_LINE_WIDTH;
1315    case VK_DYNAMIC_STATE_DEPTH_BIAS:
1316       return PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS;
1317    case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
1318       return PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS;
1319    case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
1320       return PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK;
1321    case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
1322       return PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK;
1323    case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
1324       return PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE;
1325    default:
1326       unreachable("Unsupported state.");
1327    }
1328 }
1329 
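/* Tears down a graphics pipeline: the per-stage PDS descriptor programs, the
 * vertex attribute PDS programs, the fragment PDS program and coefficient
 * program buffers, both shader buffer objects and finally the pipeline
 * object itself.
 */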
1330 static void
1331 pvr_graphics_pipeline_destroy(struct pvr_device *const device,
1332                               const VkAllocationCallbacks *const allocator,
1333                               struct pvr_graphics_pipeline *const gfx_pipeline)
1334 {
1335    const uint32_t num_vertex_attrib_programs =
1336       ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1337 
1338    pvr_pds_descriptor_program_destroy(
1339       device,
1340       allocator,
1341       &gfx_pipeline->fragment_shader_state.descriptor_state);
1342 
1343    pvr_pds_descriptor_program_destroy(
1344       device,
1345       allocator,
1346       &gfx_pipeline->vertex_shader_state.descriptor_state);
1347 
1348    for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1349       struct pvr_pds_attrib_program *const attrib_program =
1350          &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];
1351 
1352       pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1353    }
1354 
1355    pvr_bo_free(device,
1356                gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
1357    pvr_bo_free(device,
1358                gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
1359 
1360    pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
1361    pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
1362 
1363    pvr_pipeline_finish(&gfx_pipeline->base);
1364 
1365    vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1366 }
1367 
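/* Populates the vertex shader stage state and the TA_STATE_VARYING words
 * from the common and vertex-stage build data produced by the compiler.
 */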
1368 static void
1369 pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1370                       const struct rogue_common_build_data *common_data,
1371                       const struct rogue_vs_build_data *vs_data)
1372 {
1373    struct pvr_vertex_shader_state *vertex_state =
1374       &gfx_pipeline->vertex_shader_state;
1375 
1376    /* TODO: Hard coding these for now. These should be populated based on the
1377     * information returned by the compiler.
1378     */
1379    vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
1380    vertex_state->stage_state.const_shared_reg_offset = 0;
1381    vertex_state->stage_state.temps_count = common_data->temps;
1382    vertex_state->stage_state.coefficient_size = common_data->coeffs;
1383    vertex_state->stage_state.uses_atomic_ops = false;
1384    vertex_state->stage_state.uses_texture_rw = false;
1385    vertex_state->stage_state.uses_barrier = false;
1386    vertex_state->stage_state.has_side_effects = false;
1387    vertex_state->stage_state.empty_program = false;
1388 
1389    vertex_state->vertex_input_size = vs_data->num_vertex_input_regs;
1390    vertex_state->vertex_output_size =
1391       vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
1392    vertex_state->user_clip_planes_mask = 0;
1393    vertex_state->entry_offset = 0;
1394 
1395    /* TODO: The number of varyings should be checked against the fragment
1396     * shader inputs and assigned in the place where that happens.
1397     * There will also be an opportunity to cull unused fs inputs/vs outputs.
1398     */
1399    pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[0],
1400                  TA_STATE_VARYING0,
1401                  varying0) {
1402       varying0.f32_linear = vs_data->num_varyings;
1403       varying0.f32_flat = 0;
1404       varying0.f32_npc = 0;
1405    }
1406 
1407    pvr_csb_pack (&gfx_pipeline->vertex_shader_state.varying[1],
1408                  TA_STATE_VARYING1,
1409                  varying1) {
1410       varying1.f16_linear = 0;
1411       varying1.f16_flat = 0;
1412       varying1.f16_npc = 0;
1413    }
1414 }
1415 
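/* Populates the fragment shader stage state from the common build data; most
 * fields are currently hard coded, as noted in the TODO below.
 */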
1416 static void
1417 pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1418                         const struct rogue_common_build_data *common_data)
1419 {
1420    struct pvr_fragment_shader_state *fragment_state =
1421       &gfx_pipeline->fragment_shader_state;
1422 
1423    /* TODO: Hard coding these for now. These should be populated based on the
1424     * information returned by the compiler.
1425     */
1426    fragment_state->stage_state.const_shared_reg_count = 0;
1427    fragment_state->stage_state.const_shared_reg_offset = 0;
1428    fragment_state->stage_state.temps_count = common_data->temps;
1429    fragment_state->stage_state.coefficient_size = common_data->coeffs;
1430    fragment_state->stage_state.uses_atomic_ops = false;
1431    fragment_state->stage_state.uses_texture_rw = false;
1432    fragment_state->stage_state.uses_barrier = false;
1433    fragment_state->stage_state.has_side_effects = false;
1434    fragment_state->stage_state.empty_program = false;
1435 
1436    fragment_state->pass_type = 0;
1437    fragment_state->entry_offset = 0;
1438 }
1439 
1440 /* Compiles and uploads shaders and PDS programs. */
1441 static VkResult
1442 pvr_graphics_pipeline_compile(struct pvr_device *const device,
1443                               struct pvr_pipeline_cache *pipeline_cache,
1444                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
1445                               const VkAllocationCallbacks *const allocator,
1446                               struct pvr_graphics_pipeline *const gfx_pipeline)
1447 {
1448    /* FIXME: Remove this hard coding. */
1449    struct pvr_explicit_constant_usage vert_explicit_const_usage = {
1450       .start_offset = 16,
1451    };
1452    struct pvr_explicit_constant_usage frag_explicit_const_usage = {
1453       .start_offset = 0,
1454    };
1455    static uint32_t hard_code_pipeline_n = 0;
1456 
1457    const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
1458       pCreateInfo->pVertexInputState;
1459    const uint32_t cache_line_size =
1460       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
1461    struct rogue_compiler *compiler = device->pdevice->compiler;
1462    struct rogue_build_ctx *ctx;
1463    VkResult result;
1464 
1465    /* Setup shared build context. */
1466    ctx = rogue_create_build_context(compiler);
1467    if (!ctx)
1468       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1469 
1470    /* NIR middle-end translation. */
1471    for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
1472         stage--) {
1473       const VkPipelineShaderStageCreateInfo *create_info;
1474       size_t stage_index = gfx_pipeline->stage_indices[stage];
1475 
1476       if (pvr_hard_code_shader_required(&device->pdevice->dev_info)) {
1477          if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1478              BITFIELD_BIT(stage)) {
1479             continue;
1480          }
1481       }
1482 
1483       /* Skip unused/inactive stages. */
1484       if (stage_index == ~0)
1485          continue;
1486 
1487       create_info = &pCreateInfo->pStages[stage_index];
1488 
1489       /* SPIR-V to NIR. */
1490       ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
1491       if (!ctx->nir[stage]) {
1492          ralloc_free(ctx);
1493          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1494       }
1495    }
1496 
1497    /* Pre-back-end analysis and optimization, driver data extraction. */
1498    /* TODO: Analyze and cull unused I/O between stages. */
1499    /* TODO: Allocate UBOs between stages;
1500     * pipeline->layout->set_{count,layout}.
1501     */
1502 
1503    /* Back-end translation. */
1504    for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
1505         stage--) {
1506       if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1507           pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1508              BITFIELD_BIT(stage)) {
1509          const struct pvr_device_info *const dev_info =
1510             &device->pdevice->dev_info;
1511          struct pvr_explicit_constant_usage *explicit_const_usage;
1512 
1513          switch (stage) {
1514          case MESA_SHADER_VERTEX:
1515             explicit_const_usage = &vert_explicit_const_usage;
1516             break;
1517 
1518          case MESA_SHADER_FRAGMENT:
1519             explicit_const_usage = &frag_explicit_const_usage;
1520             break;
1521 
1522          default:
1523             unreachable("Unsupported stage.");
1524          }
1525 
1526          pvr_hard_code_graphics_shader(dev_info,
1527                                        hard_code_pipeline_n,
1528                                        stage,
1529                                        &ctx->binary[stage]);
1530 
1531          pvr_hard_code_graphics_get_build_info(dev_info,
1532                                                hard_code_pipeline_n,
1533                                                stage,
1534                                                &ctx->common_data[stage],
1535                                                &ctx->stage_data,
1536                                                explicit_const_usage);
1537 
1538          continue;
1539       }
1540 
1541       if (!ctx->nir[stage])
1542          continue;
1543 
1544       ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
1545       if (!ctx->rogue[stage]) {
1546          ralloc_free(ctx);
1547          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1548       }
1549 
1550       ctx->binary[stage] = pvr_rogue_to_binary(ctx, ctx->rogue[stage]);
1551       if (!ctx->binary[stage]) {
1552          ralloc_free(ctx);
1553          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1554       }
1555    }
1556 
1557    if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1558        pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1559           BITFIELD_BIT(MESA_SHADER_VERTEX)) {
1560       pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
1561                                           hard_code_pipeline_n,
1562                                           &gfx_pipeline->vertex_shader_state);
1563    } else {
1564       pvr_vertex_state_init(gfx_pipeline,
1565                             &ctx->common_data[MESA_SHADER_VERTEX],
1566                             &ctx->stage_data.vs);
1567    }
1568 
1569    result = pvr_gpu_upload_usc(device,
1570                                ctx->binary[MESA_SHADER_VERTEX]->data,
1571                                ctx->binary[MESA_SHADER_VERTEX]->size,
1572                                cache_line_size,
1573                                &gfx_pipeline->vertex_shader_state.bo);
1574    if (result != VK_SUCCESS)
1575       goto err_free_build_context;
1576 
1577    if (pvr_hard_code_shader_required(&device->pdevice->dev_info) &&
1578        pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1579           BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
1580       pvr_hard_code_graphics_fragment_state(
1581          &device->pdevice->dev_info,
1582          hard_code_pipeline_n,
1583          &gfx_pipeline->fragment_shader_state);
1584    } else {
1585       pvr_fragment_state_init(gfx_pipeline,
1586                               &ctx->common_data[MESA_SHADER_FRAGMENT]);
1587    }
1588 
1589    result = pvr_gpu_upload_usc(device,
1590                                ctx->binary[MESA_SHADER_FRAGMENT]->data,
1591                                ctx->binary[MESA_SHADER_FRAGMENT]->size,
1592                                cache_line_size,
1593                                &gfx_pipeline->fragment_shader_state.bo);
1594    if (result != VK_SUCCESS)
1595       goto err_free_vertex_bo;
1596 
1597    /* TODO: powervr has an optimization where it attempts to recompile shaders.
1598     * See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented since in our
1599     * case the optimization doesn't happen.
1600     */
1601 
1602    /* TODO: The programs we use are hard coded for now, but these should be
1603     * selected dynamically.
1604     */
1605 
1606    result = pvr_pds_coeff_program_create_and_upload(
1607       device,
1608       allocator,
1609       ctx->stage_data.fs.iterator_args.fpu_iterators,
1610       ctx->stage_data.fs.iterator_args.num_fpu_iterators,
1611       ctx->stage_data.fs.iterator_args.destination,
1612       &gfx_pipeline->fragment_shader_state.pds_coeff_program);
1613    if (result != VK_SUCCESS)
1614       goto err_free_fragment_bo;
1615 
1616    result = pvr_pds_fragment_program_create_and_upload(
1617       device,
1618       allocator,
1619       gfx_pipeline->fragment_shader_state.bo,
1620       ctx->common_data[MESA_SHADER_FRAGMENT].temps,
1621       ctx->stage_data.fs.msaa_mode,
1622       ctx->stage_data.fs.phas,
1623       &gfx_pipeline->fragment_shader_state.pds_fragment_program);
1624    if (result != VK_SUCCESS)
1625       goto err_free_coeff_program;
1626 
1627    result = pvr_pds_vertex_attrib_programs_create_and_upload(
1628       device,
1629       allocator,
1630       vertex_input_state,
1631       ctx->common_data[MESA_SHADER_VERTEX].temps,
1632       &ctx->stage_data.vs,
1633       &gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1634    if (result != VK_SUCCESS)
1635       goto err_free_frag_program;
1636 
1637    result = pvr_pds_descriptor_program_create_and_upload(
1638       device,
1639       allocator,
1640       &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
1641       &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
1642       &vert_explicit_const_usage,
1643       gfx_pipeline->base.layout,
1644       PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
1645       &gfx_pipeline->vertex_shader_state.descriptor_state);
1646    if (result != VK_SUCCESS)
1647       goto err_free_vertex_attrib_program;
1648 
1649    /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
1650     * scratch buffer for both vertex and fragment stage.
1651     * Figure out the best place to do this.
1652     */
1653    /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
1654    /* TODO: Implement spilling with the above. */
1655 
1656    /* TODO: Call pvr_pds_program_program_create_and_upload in a loop. */
1657    /* FIXME: For now we pass in the same explicit_const_usage since it contains
1658     * all invalid entries. Fix this by hooking it up to the compiler.
1659     */
1660    result = pvr_pds_descriptor_program_create_and_upload(
1661       device,
1662       allocator,
1663       &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
1664       &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
1665       &frag_explicit_const_usage,
1666       gfx_pipeline->base.layout,
1667       PVR_STAGE_ALLOCATION_FRAGMENT,
1668       &gfx_pipeline->fragment_shader_state.descriptor_state);
1669    if (result != VK_SUCCESS)
1670       goto err_free_vertex_descriptor_program;
1671 
1672    ralloc_free(ctx);
1673 
1674    hard_code_pipeline_n++;
1675 
1676    return VK_SUCCESS;
1677 
1678 err_free_vertex_descriptor_program:
1679    pvr_pds_descriptor_program_destroy(
1680       device,
1681       allocator,
1682       &gfx_pipeline->vertex_shader_state.descriptor_state);
1683 err_free_vertex_attrib_program:
1684    for (uint32_t i = 0;
1685         i < ARRAY_SIZE(gfx_pipeline->vertex_shader_state.pds_attrib_programs);
1686         i++) {
1687       struct pvr_pds_attrib_program *const attrib_program =
1688          &gfx_pipeline->vertex_shader_state.pds_attrib_programs[i];
1689 
1690       pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1691    }
1692 err_free_frag_program:
1693    pvr_bo_free(device,
1694                gfx_pipeline->fragment_shader_state.pds_fragment_program.pvr_bo);
1695 err_free_coeff_program:
1696    pvr_bo_free(device,
1697                gfx_pipeline->fragment_shader_state.pds_coeff_program.pvr_bo);
1698 err_free_fragment_bo:
1699    pvr_bo_free(device, gfx_pipeline->fragment_shader_state.bo);
1700 err_free_vertex_bo:
1701    pvr_bo_free(device, gfx_pipeline->vertex_shader_state.bo);
1702 err_free_build_context:
1703    ralloc_free(ctx);
1704    return result;
1705 }
1706 
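/* Captures the depth and stencil create info in the pipeline. When the depth
 * or stencil test is disabled, the corresponding state falls back to
 * pass-through defaults (VK_COMPARE_OP_ALWAYS / VK_STENCIL_OP_KEEP).
 */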
1707 static void pvr_graphics_pipeline_init_depth_and_stencil_state(
1708    struct pvr_graphics_pipeline *gfx_pipeline,
1709    const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state)
1710 {
1711    const VkStencilOpState *front;
1712    const VkStencilOpState *back;
1713 
1714    if (!depth_stencil_state)
1715       return;
1716 
1717    front = &depth_stencil_state->front;
1718    back = &depth_stencil_state->back;
1719 
1720    if (depth_stencil_state->depthTestEnable) {
1721       gfx_pipeline->depth_compare_op = depth_stencil_state->depthCompareOp;
1722       gfx_pipeline->depth_write_disable =
1723          !depth_stencil_state->depthWriteEnable;
1724    } else {
1725       gfx_pipeline->depth_compare_op = VK_COMPARE_OP_ALWAYS;
1726       gfx_pipeline->depth_write_disable = true;
1727    }
1728 
1729    if (depth_stencil_state->stencilTestEnable) {
1730       gfx_pipeline->stencil_front.compare_op = front->compareOp;
1731       gfx_pipeline->stencil_front.fail_op = front->failOp;
1732       gfx_pipeline->stencil_front.depth_fail_op = front->depthFailOp;
1733       gfx_pipeline->stencil_front.pass_op = front->passOp;
1734 
1735       gfx_pipeline->stencil_back.compare_op = back->compareOp;
1736       gfx_pipeline->stencil_back.fail_op = back->failOp;
1737       gfx_pipeline->stencil_back.depth_fail_op = back->depthFailOp;
1738       gfx_pipeline->stencil_back.pass_op = back->passOp;
1739    } else {
1740       gfx_pipeline->stencil_front.compare_op = VK_COMPARE_OP_ALWAYS;
1741       gfx_pipeline->stencil_front.fail_op = VK_STENCIL_OP_KEEP;
1742       gfx_pipeline->stencil_front.depth_fail_op = VK_STENCIL_OP_KEEP;
1743       gfx_pipeline->stencil_front.pass_op = VK_STENCIL_OP_KEEP;
1744 
1745       gfx_pipeline->stencil_back = gfx_pipeline->stencil_front;
1746    }
1747 }
1748 
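/* Builds the dynamic state bitmask and captures static values for line
 * width, depth bias, stencil compare/write masks, stencil references and
 * blend constants whenever the corresponding state is not dynamic.
 */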
1749 static void pvr_graphics_pipeline_init_dynamic_state(
1750    struct pvr_graphics_pipeline *gfx_pipeline,
1751    const VkPipelineDynamicStateCreateInfo *dynamic_state,
1752    const VkPipelineViewportStateCreateInfo *viewport_state,
1753    const VkPipelineDepthStencilStateCreateInfo *depth_stencil_state,
1754    const VkPipelineColorBlendStateCreateInfo *color_blend_state,
1755    const VkPipelineRasterizationStateCreateInfo *rasterization_state)
1756 {
1757    struct pvr_dynamic_state *const internal_dynamic_state =
1758       &gfx_pipeline->dynamic_state;
1759    uint32_t dynamic_states = 0;
1760 
1761    if (dynamic_state) {
1762       for (uint32_t i = 0; i < dynamic_state->dynamicStateCount; i++) {
1763          dynamic_states |=
1764             pvr_dynamic_state_bit_from_vk(dynamic_state->pDynamicStates[i]);
1765       }
1766    }
1767 
1768    /* TODO: Verify this.
1769     * We don't zero out the pipeline's static state values when they are
1770     * marked dynamic, since they will be set later on via the command buffer.
1771     */
1772 
1773    /* TODO: Handle rasterizerDiscardEnable. */
1774 
1775    if (rasterization_state) {
1776       if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_LINE_WIDTH))
1777          internal_dynamic_state->line_width = rasterization_state->lineWidth;
1778 
1779       /* TODO: Do we need the depthBiasEnable check? */
1780       if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_DEPTH_BIAS)) {
1781          internal_dynamic_state->depth_bias.constant_factor =
1782             rasterization_state->depthBiasConstantFactor;
1783          internal_dynamic_state->depth_bias.clamp =
1784             rasterization_state->depthBiasClamp;
1785          internal_dynamic_state->depth_bias.slope_factor =
1786             rasterization_state->depthBiasSlopeFactor;
1787       }
1788    }
1789 
1790    /* TODO: handle viewport state flags. */
1791 
1792    /* TODO: handle static viewport state. */
1793    /* We assume the viewport state to be dynamic for now. */
1794 
1795    /* TODO: handle static scissor state. */
1796    /* We assume the scissor state to be dynamic for now. */
1797 
1798    if (depth_stencil_state) {
1799       const VkStencilOpState *const front = &depth_stencil_state->front;
1800       const VkStencilOpState *const back = &depth_stencil_state->back;
1801 
1802       /* VkPhysicalDeviceFeatures->depthBounds is false. */
1803       assert(depth_stencil_state->depthBoundsTestEnable == VK_FALSE);
1804 
1805       if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_COMPARE_MASK)) {
1806          internal_dynamic_state->compare_mask.front = front->compareMask;
1807          internal_dynamic_state->compare_mask.back = back->compareMask;
1808       }
1809 
1810       if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_WRITE_MASK)) {
1811          internal_dynamic_state->write_mask.front = front->writeMask;
1812          internal_dynamic_state->write_mask.back = back->writeMask;
1813       }
1814 
1815       if (!(dynamic_states & PVR_DYNAMIC_STATE_BIT_STENCIL_REFERENCE)) {
1816          internal_dynamic_state->reference.front = front->reference;
1817          internal_dynamic_state->reference.back = back->reference;
1818       }
1819    }
1820 
1821    if (color_blend_state &&
1822        !(dynamic_states & PVR_DYNAMIC_STATE_BIT_BLEND_CONSTANTS)) {
1823       STATIC_ASSERT(__same_type(internal_dynamic_state->blend_constants,
1824                                 color_blend_state->blendConstants));
1825 
1826       typed_memcpy(internal_dynamic_state->blend_constants,
1827                    color_blend_state->blendConstants,
1828                    ARRAY_SIZE(internal_dynamic_state->blend_constants));
1829    }
1830 
1831    /* TODO: handle STATIC_STATE_DEPTH_BOUNDS ? */
1832 
1833    internal_dynamic_state->mask = dynamic_states;
1834 }
1835 
1836 static VkResult
1837 pvr_graphics_pipeline_init(struct pvr_device *device,
1838                            struct pvr_pipeline_cache *pipeline_cache,
1839                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
1840                            const VkAllocationCallbacks *allocator,
1841                            struct pvr_graphics_pipeline *gfx_pipeline)
1842 {
1843    /* When rasterizer discard is enabled, several of the create info structs
1844     * below must be ignored, so treat them as if they were not provided.
1845     */
1846    const bool raster_discard_enabled =
1847       pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1848    const VkPipelineViewportStateCreateInfo *vs_info =
1849       !raster_discard_enabled ? pCreateInfo->pViewportState : NULL;
1850    const VkPipelineDepthStencilStateCreateInfo *dss_info =
1851       !raster_discard_enabled ? pCreateInfo->pDepthStencilState : NULL;
1852    const VkPipelineRasterizationStateCreateInfo *rs_info =
1853       !raster_discard_enabled ? pCreateInfo->pRasterizationState : NULL;
1854    const VkPipelineColorBlendStateCreateInfo *cbs_info =
1855       !raster_discard_enabled ? pCreateInfo->pColorBlendState : NULL;
1856    const VkPipelineMultisampleStateCreateInfo *ms_info =
1857       !raster_discard_enabled ? pCreateInfo->pMultisampleState : NULL;
1858    VkResult result;
1859 
1860    pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
1861 
1862    pvr_finishme("ignoring pCreateInfo flags.");
1863    pvr_finishme("ignoring pipeline cache.");
1864 
1865    gfx_pipeline->raster_state.discard_enable = raster_discard_enabled;
1866    gfx_pipeline->raster_state.cull_mode =
1867       pCreateInfo->pRasterizationState->cullMode;
1868    gfx_pipeline->raster_state.front_face =
1869       pCreateInfo->pRasterizationState->frontFace;
1870    gfx_pipeline->raster_state.depth_bias_enable =
1871       pCreateInfo->pRasterizationState->depthBiasEnable;
1872    gfx_pipeline->raster_state.depth_clamp_enable =
1873       pCreateInfo->pRasterizationState->depthClampEnable;
1874 
1875    /* FIXME: Handle depthClampEnable. */
1876 
1877    pvr_graphics_pipeline_init_depth_and_stencil_state(gfx_pipeline, dss_info);
1878    pvr_graphics_pipeline_init_dynamic_state(gfx_pipeline,
1879                                             pCreateInfo->pDynamicState,
1880                                             vs_info,
1881                                             dss_info,
1882                                             cbs_info,
1883                                             rs_info);
1884 
1885    if (pCreateInfo->pInputAssemblyState) {
1886       gfx_pipeline->input_asm_state.topology =
1887          pCreateInfo->pInputAssemblyState->topology;
1888       gfx_pipeline->input_asm_state.primitive_restart =
1889          pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
1890    }
1891 
1892    memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
1893 
1894    for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
1895       VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
1896       gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
1897       /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
1898        *
1899        *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
1900        *    or VK_SHADER_STAGE_ALL."
1901        *
1902        * So we don't handle that.
1903        *
1904        * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
1905        * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
1906        * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
1907        * structure returned by the driver.
1908        */
1909       switch (pCreateInfo->pStages[i].stage) {
1910       case VK_SHADER_STAGE_VERTEX_BIT:
1911       case VK_SHADER_STAGE_FRAGMENT_BIT:
1912          gfx_pipeline->stage_indices[gl_stage] = i;
1913          break;
1914       default:
1915          unreachable("Unsupported stage.");
1916       }
1917    }
1918 
1919    gfx_pipeline->base.layout =
1920       pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1921 
1922    if (ms_info) {
1923       gfx_pipeline->rasterization_samples = ms_info->rasterizationSamples;
1924       gfx_pipeline->sample_mask =
1925          (ms_info->pSampleMask) ? ms_info->pSampleMask[0] : 0xFFFFFFFF;
1926    } else {
1927       gfx_pipeline->rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
1928       gfx_pipeline->sample_mask = 0xFFFFFFFF;
1929    }
1930 
1931    /* Compiles and uploads shaders and PDS programs. */
1932    result = pvr_graphics_pipeline_compile(device,
1933                                           pipeline_cache,
1934                                           pCreateInfo,
1935                                           allocator,
1936                                           gfx_pipeline);
1937    if (result != VK_SUCCESS) {
1938       pvr_pipeline_finish(&gfx_pipeline->base);
1939       return result;
1940    }
1941 
1942    return VK_SUCCESS;
1943 }
1944 
1945 /* If allocator == NULL, the internal one will be used. */
1946 static VkResult
1947 pvr_graphics_pipeline_create(struct pvr_device *device,
1948                              struct pvr_pipeline_cache *pipeline_cache,
1949                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1950                              const VkAllocationCallbacks *allocator,
1951                              VkPipeline *const pipeline_out)
1952 {
1953    struct pvr_graphics_pipeline *gfx_pipeline;
1954    VkResult result;
1955 
1956    gfx_pipeline = vk_zalloc2(&device->vk.alloc,
1957                              allocator,
1958                              sizeof(*gfx_pipeline),
1959                              8,
1960                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1961    if (!gfx_pipeline)
1962       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1963 
1964    /* Compiles and uploads shaders and PDS programs too. */
1965    result = pvr_graphics_pipeline_init(device,
1966                                        pipeline_cache,
1967                                        pCreateInfo,
1968                                        allocator,
1969                                        gfx_pipeline);
1970    if (result != VK_SUCCESS) {
1971       vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1972       return result;
1973    }
1974 
1975    *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
1976 
1977    return VK_SUCCESS;
1978 }
1979 
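/* As with pvr_CreateComputePipelines above, a failure for one create info
 * does not abort the loop: the failing entry is set to VK_NULL_HANDLE and
 * the most recent error code is returned.
 */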
1980 VkResult
1981 pvr_CreateGraphicsPipelines(VkDevice _device,
1982                             VkPipelineCache pipelineCache,
1983                             uint32_t createInfoCount,
1984                             const VkGraphicsPipelineCreateInfo *pCreateInfos,
1985                             const VkAllocationCallbacks *pAllocator,
1986                             VkPipeline *pPipelines)
1987 {
1988    PVR_FROM_HANDLE(pvr_pipeline_cache, pipeline_cache, pipelineCache);
1989    PVR_FROM_HANDLE(pvr_device, device, _device);
1990    VkResult result = VK_SUCCESS;
1991 
1992    for (uint32_t i = 0; i < createInfoCount; i++) {
1993       const VkResult local_result =
1994          pvr_graphics_pipeline_create(device,
1995                                       pipeline_cache,
1996                                       &pCreateInfos[i],
1997                                       pAllocator,
1998                                       &pPipelines[i]);
1999       if (local_result != VK_SUCCESS) {
2000          result = local_result;
2001          pPipelines[i] = VK_NULL_HANDLE;
2002       }
2003    }
2004 
2005    return result;
2006 }
2007 
2008 /*****************************************************************************
2009    Other functions
2010 *****************************************************************************/
2011 
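/* Common entry point for pipeline destruction; dispatches to the graphics or
 * compute specific destroy routine based on the pipeline type.
 */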
2012 void pvr_DestroyPipeline(VkDevice _device,
2013                          VkPipeline _pipeline,
2014                          const VkAllocationCallbacks *pAllocator)
2015 {
2016    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2017    PVR_FROM_HANDLE(pvr_device, device, _device);
2018 
2019    if (!pipeline)
2020       return;
2021 
2022    switch (pipeline->type) {
2023    case PVR_PIPELINE_TYPE_GRAPHICS: {
2024       struct pvr_graphics_pipeline *const gfx_pipeline =
2025          to_pvr_graphics_pipeline(pipeline);
2026 
2027       pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
2028       break;
2029    }
2030 
2031    case PVR_PIPELINE_TYPE_COMPUTE: {
2032       struct pvr_compute_pipeline *const compute_pipeline =
2033          to_pvr_compute_pipeline(pipeline);
2034 
2035       pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
2036       break;
2037    }
2038 
2039    default:
2040       unreachable("Unknown pipeline type.");
2041    }
2042 }
2043