1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <stdint.h>
25 #include <vulkan/vulkan_core.h>
26 
27 #include "hwdef/rogue_hw_utils.h"
28 #include "pvr_clear.h"
29 #include "pvr_hardcode.h"
30 #include "pvr_pds.h"
31 #include "pvr_private.h"
32 #include "usc/programs/pvr_shader_factory.h"
33 #include "usc/programs/pvr_static_shaders.h"
34 #include "pvr_types.h"
35 #include "vk_alloc.h"
36 #include "vk_log.h"
37 
pvr_device_setup_graphics_static_clear_ppp_base(struct pvr_static_clear_ppp_base * const base)38 static void pvr_device_setup_graphics_static_clear_ppp_base(
39    struct pvr_static_clear_ppp_base *const base)
40 {
41    pvr_csb_pack (&base->wclamp, TA_WCLAMP, wclamp) {
42       wclamp.val = fui(0.00001f);
43    }
44 
45    /* clang-format off */
46    pvr_csb_pack (&base->varying_word[0], TA_STATE_VARYING0, varying0);
47    pvr_csb_pack (&base->varying_word[1], TA_STATE_VARYING1, varying1);
48    pvr_csb_pack (&base->varying_word[2], TA_STATE_VARYING2, varying2);
49    /* clang-format on */
50 
51    pvr_csb_pack (&base->ppp_ctrl, TA_STATE_PPP_CTRL, ppp_ctrl) {
52       ppp_ctrl.pretransform = true;
53       ppp_ctrl.cullmode = ROGUE_TA_CULLMODE_NO_CULLING;
54    }
55 
56    /* clang-format off */
57    pvr_csb_pack (&base->stream_out0, TA_STATE_STREAM_OUT0, stream_out0);
58    /* clang-format on */
59 }
60 
pvr_device_setup_graphics_static_clear_ppp_templates(struct pvr_static_clear_ppp_template templates[static PVR_STATIC_CLEAR_VARIANT_COUNT])61 static void pvr_device_setup_graphics_static_clear_ppp_templates(
62    struct pvr_static_clear_ppp_template
63       templates[static PVR_STATIC_CLEAR_VARIANT_COUNT])
64 {
65    for (uint32_t i = 0; i < PVR_STATIC_CLEAR_VARIANT_COUNT; i++) {
66       const bool has_color = !!(i & VK_IMAGE_ASPECT_COLOR_BIT);
67       const bool has_depth = !!(i & VK_IMAGE_ASPECT_DEPTH_BIT);
68       const bool has_stencil = !!(i & VK_IMAGE_ASPECT_STENCIL_BIT);
69 
70       struct pvr_static_clear_ppp_template *const template = &templates[i];
71 
72       template->requires_pds_state = has_color;
73 
74       pvr_csb_pack (&template->header, TA_STATE_HEADER, header) {
75          header.pres_stream_out_size = true;
76          header.pres_ppp_ctrl = true;
77          header.pres_varying_word2 = true;
78          header.pres_varying_word1 = true;
79          header.pres_varying_word0 = true;
80          header.pres_outselects = true;
81          header.pres_wclamp = true;
82          header.pres_region_clip = true;
83          header.pres_pds_state_ptr2 = template->requires_pds_state;
84          header.pres_pds_state_ptr1 = template->requires_pds_state;
85          header.pres_pds_state_ptr0 = template->requires_pds_state;
86          header.pres_ispctl_fb = true;
87          header.pres_ispctl_fa = true;
88          header.pres_ispctl = true;
89       }
90 
91 #define CS_HEADER(cs)    \
92    (struct ROGUE_##cs)   \
93    {                     \
94       pvr_cmd_header(cs) \
95    }
96 
97       template->config.ispctl = CS_HEADER(TA_STATE_ISPCTL);
98       template->config.ispctl.tagwritedisable = !has_color;
99       template->config.ispctl.bpres = true;
100 
101       template->config.ispa = CS_HEADER(TA_STATE_ISPA);
102       template->config.ispa.objtype = ROGUE_TA_OBJTYPE_TRIANGLE;
103       template->config.ispa.passtype = ROGUE_TA_PASSTYPE_TRANSLUCENT;
104       template->config.ispa.dwritedisable = !has_depth;
105       template->config.ispa.dcmpmode = (i == 0) ? ROGUE_TA_CMPMODE_NEVER
106                                                 : ROGUE_TA_CMPMODE_ALWAYS;
107       template->config.ispa.sref =
108          has_stencil ? ROGUE_TA_STATE_ISPA_SREF_SIZE_MAX : 0;
109 
110       pvr_csb_pack (&template->ispb, TA_STATE_ISPB, ispb) {
111          ispb.scmpmode = ROGUE_TA_CMPMODE_ALWAYS;
112          ispb.sop1 = ROGUE_TA_ISPB_STENCILOP_KEEP;
113          ispb.sop2 = ROGUE_TA_ISPB_STENCILOP_KEEP;
114 
115          ispb.sop3 = has_stencil ? ROGUE_TA_ISPB_STENCILOP_REPLACE
116                                  : ROGUE_TA_ISPB_STENCILOP_KEEP;
117 
118          ispb.swmask = has_stencil ? 0xFF : 0;
119       }
120 
121       template->config.pds_state = NULL;
122 
123       template->config.region_clip0 = CS_HEADER(TA_REGION_CLIP0);
124       template->config.region_clip0.mode = ROGUE_TA_REGION_CLIP_MODE_OUTSIDE;
125       template->config.region_clip0.left = 0;
126       template->config.region_clip0.right = ROGUE_TA_REGION_CLIP_MAX;
127 
128       template->config.region_clip1 = CS_HEADER(TA_REGION_CLIP1);
129       template->config.region_clip1.top = 0;
130       template->config.region_clip1.bottom = ROGUE_TA_REGION_CLIP_MAX;
131 
132       template->config.output_sel = CS_HEADER(TA_OUTPUT_SEL);
133       template->config.output_sel.vtxsize = 4;
134       template->config.output_sel.rhw_pres = true;
135 
136 #undef CS_HEADER
137    }
138 }
139 
140 /**
141  * \brief Emit geom state from a configurable template.
142  *
143  * Note that the state is emitted by joining the template with a base so the
144  * base must have been setup before calling this.
145  *
146  * \param[in] csb          Control stream to emit to.
147  * \param[in] template     The configured template.
148  * \param[out] pvr_bo_out  Uploaded state's pvr_bo object.
149  *
150  * \return   VK_SUCCESS if the state was successfully uploaded.
151  */
pvr_emit_ppp_from_template(struct pvr_csb * const csb,const struct pvr_static_clear_ppp_template * const template,struct pvr_suballoc_bo ** const pvr_bo_out)152 VkResult pvr_emit_ppp_from_template(
153    struct pvr_csb *const csb,
154    const struct pvr_static_clear_ppp_template *const template,
155    struct pvr_suballoc_bo **const pvr_bo_out)
156 {
157    const uint32_t dword_count =
158       pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPCTL) +
159       pvr_cmd_length(TA_STATE_ISPA) + pvr_cmd_length(TA_STATE_ISPB) +
160       (template->requires_pds_state ? PVR_STATIC_CLEAR_PDS_STATE_COUNT : 0) +
161       pvr_cmd_length(TA_REGION_CLIP0) + pvr_cmd_length(TA_REGION_CLIP1) +
162       pvr_cmd_length(TA_WCLAMP) + pvr_cmd_length(TA_OUTPUT_SEL) +
163       pvr_cmd_length(TA_STATE_VARYING0) + pvr_cmd_length(TA_STATE_VARYING1) +
164       pvr_cmd_length(TA_STATE_VARYING2) + pvr_cmd_length(TA_STATE_PPP_CTRL) +
165       pvr_cmd_length(TA_STATE_STREAM_OUT0);
166 
167    struct pvr_device *const device = csb->device;
168    const uint32_t cache_line_size =
169       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
170    const struct pvr_static_clear_ppp_base *const base =
171       &device->static_clear_state.ppp_base;
172    struct pvr_suballoc_bo *pvr_bo;
173    uint32_t *stream;
174    VkResult result;
175 
176    result = pvr_bo_suballoc(&device->suballoc_general,
177                             PVR_DW_TO_BYTES(dword_count),
178                             cache_line_size,
179                             false,
180                             &pvr_bo);
181    if (result != VK_SUCCESS) {
182       *pvr_bo_out = NULL;
183       return result;
184    }
185 
186    stream = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
187 
188    pvr_csb_write_value(stream, TA_STATE_HEADER, template->header);
189    pvr_csb_write_struct(stream, TA_STATE_ISPCTL, &template->config.ispctl);
190    pvr_csb_write_struct(stream, TA_STATE_ISPA, &template->config.ispa);
191    pvr_csb_write_value(stream, TA_STATE_ISPB, template->ispb);
192 
193    if (template->requires_pds_state) {
194       static_assert(sizeof(*stream) == sizeof((*template->config.pds_state)[0]),
195                     "Size mismatch");
196       for (uint32_t i = 0; i < PVR_STATIC_CLEAR_PDS_STATE_COUNT; i++)
197          *stream++ = (*template->config.pds_state)[i];
198    }
199 
200    pvr_csb_write_struct(stream,
201                         TA_REGION_CLIP0,
202                         &template->config.region_clip0);
203    pvr_csb_write_struct(stream,
204                         TA_REGION_CLIP1,
205                         &template->config.region_clip1);
206    pvr_csb_write_value(stream, TA_WCLAMP, base->wclamp);
207    pvr_csb_write_struct(stream, TA_OUTPUT_SEL, &template->config.output_sel);
208    pvr_csb_write_value(stream, TA_STATE_VARYING0, base->varying_word[0]);
209    pvr_csb_write_value(stream, TA_STATE_VARYING1, base->varying_word[1]);
210    pvr_csb_write_value(stream, TA_STATE_VARYING2, base->varying_word[2]);
211    pvr_csb_write_value(stream, TA_STATE_PPP_CTRL, base->ppp_ctrl);
212    pvr_csb_write_value(stream, TA_STATE_STREAM_OUT0, base->stream_out0);
213 
214    assert((uint64_t)(stream - (uint32_t *)pvr_bo_suballoc_get_map_addr(
215                                  pvr_bo)) == dword_count);
216 
217    stream = NULL;
218 
219    pvr_csb_set_relocation_mark(csb);
220 
221    pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state) {
222       state.word_count = dword_count;
223       state.addrmsb = pvr_bo->dev_addr;
224    }
225 
226    pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state) {
227       state.addrlsb = pvr_bo->dev_addr;
228    }
229 
230    pvr_csb_clear_relocation_mark(csb);
231 
232    *pvr_bo_out = pvr_bo;
233 
234    return VK_SUCCESS;
235 }
236 
/**
 * \brief Upload the clear attachment USC shaders and their PDS programs.
 *
 * Every entry of clear_attachment_collection that has a non-NULL `info` gets
 * its USC code copied into a single USC suballocation, and a texture PDS
 * program plus a pixel PDS program generated into a single PDS suballocation.
 * The resulting offsets and sizes are recorded in
 * device->static_clear_state.pds_clear_attachment_program_info.
 *
 * Two indices are used throughout: `i` walks the whole collection, while
 * `offset_idx` only counts the entries that have an `info` and indexes the
 * local offset arrays. The program info array is indexed by `i`; entries
 * without an `info` get a zeroed program info.
 *
 * \param[in] device  Device to upload the programs for.
 * \return VK_SUCCESS on success, otherwise the error returned by the failing
 *         suballocation. If the PDS suballocation fails, the USC suballocation
 *         is freed before returning.
 */
static VkResult
pvr_device_init_clear_attachment_programs(struct pvr_device *device)
{
   const uint32_t pds_prog_alignment =
      MAX2(ROGUE_TA_STATE_PDS_TEXUNICODEBASE_ADDR_ALIGNMENT,
           ROGUE_TA_STATE_PDS_SHADERBASE_ADDR_ALIGNMENT);
   struct pvr_device_static_clear_state *clear_state =
      &device->static_clear_state;
   const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   /* Byte offsets of each program within its suballocation, indexed by
    * `offset_idx`.
    */
   uint32_t pds_texture_program_offsets[PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT];
   uint32_t pds_pixel_program_offsets[PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT];
   uint32_t usc_program_offsets[PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT];
   uint64_t usc_upload_offset;
   uint64_t pds_upload_offset;
   uint32_t alloc_size = 0;
   VkResult result;
   uint8_t *ptr;

#if !defined(NDEBUG)
   /* Check that the number of collection entries with an `info` matches the
    * size of the offset arrays above.
    */
   uint32_t clear_attachment_info_count = 0;

   for (uint32_t i = 0; i < ARRAY_SIZE(clear_attachment_collection); i++) {
      if (!clear_attachment_collection[i].info)
         continue;

      clear_attachment_info_count++;
   }

   assert(clear_attachment_info_count == PVR_CLEAR_ATTACHMENT_PROGRAM_COUNT);
#endif

   /* Upload USC fragment shaders. */

   /* First pass: assign each shader an offset and accumulate the total size. */
   for (uint32_t i = 0, offset_idx = 0;
        i < ARRAY_SIZE(clear_attachment_collection);
        i++) {
      if (!clear_attachment_collection[i].info)
         continue;

      usc_program_offsets[offset_idx] = alloc_size;
      /* TODO: The compiler will likely give us a pre-aligned size for the USC
       * shader so don't bother aligning here when it's hooked up.
       */
      alloc_size += ALIGN_POT(clear_attachment_collection[i].size, 4);

      offset_idx++;
   }

   result = pvr_bo_suballoc(&device->suballoc_usc,
                            alloc_size,
                            4,
                            false,
                            &clear_state->usc_clear_attachment_programs);
   if (result != VK_SUCCESS)
      return result;

   /* Offset of the suballocation relative to the USC heap base. */
   usc_upload_offset =
      clear_state->usc_clear_attachment_programs->dev_addr.addr -
      device->heaps.usc_heap->base_addr.addr;
   ptr = (uint8_t *)pvr_bo_suballoc_get_map_addr(
      clear_state->usc_clear_attachment_programs);

   /* Second pass: copy each shader's code to its assigned offset. */
   for (uint32_t i = 0, offset_idx = 0;
        i < ARRAY_SIZE(clear_attachment_collection);
        i++) {
      if (!clear_attachment_collection[i].info)
         continue;

      memcpy(ptr + usc_program_offsets[offset_idx],
             clear_attachment_collection[i].code,
             clear_attachment_collection[i].size);

      offset_idx++;
   }

   /* Upload PDS programs. */

   alloc_size = 0;

   /* Size pass: work out where each PDS program goes within a single
    * suballocation. Nothing is written to GPU memory in this loop.
    */
   for (uint32_t i = 0, offset_idx = 0;
        i < ARRAY_SIZE(clear_attachment_collection);
        i++) {
      struct pvr_pds_pixel_shader_sa_program texture_pds_program;
      struct pvr_pds_kickusc_program pixel_shader_pds_program;
      uint32_t program_size;

      if (!clear_attachment_collection[i].info)
         continue;

      /* Texture program to load colors. */

      texture_pds_program = (struct pvr_pds_pixel_shader_sa_program){
         .num_texture_dma_kicks = 1,
      };

      pvr_pds_set_sizes_pixel_shader_uniform_texture_code(&texture_pds_program);

      pds_texture_program_offsets[offset_idx] = alloc_size;
      alloc_size += ALIGN_POT(PVR_DW_TO_BYTES(texture_pds_program.code_size),
                              pds_prog_alignment);

      /* Pixel program to load fragment shader. */

      pixel_shader_pds_program = (struct pvr_pds_kickusc_program){ 0 };

      pvr_pds_setup_doutu(&pixel_shader_pds_program.usc_task_control,
                          usc_upload_offset + usc_program_offsets[offset_idx],
                          clear_attachment_collection[i].info->temps_required,
                          ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
                          false);

      pvr_pds_set_sizes_pixel_shader(&pixel_shader_pds_program);

      /* The pixel program's space covers both its code and its data. */
      program_size = pixel_shader_pds_program.code_size +
                     pixel_shader_pds_program.data_size;
      program_size = PVR_DW_TO_BYTES(program_size);

      pds_pixel_program_offsets[offset_idx] = alloc_size;
      alloc_size += ALIGN_POT(program_size, pds_prog_alignment);

      offset_idx++;
   }

   result = pvr_bo_suballoc(&device->suballoc_pds,
                            alloc_size,
                            pds_prog_alignment,
                            false,
                            &clear_state->pds_clear_attachment_programs);
   if (result != VK_SUCCESS) {
      pvr_bo_suballoc_free(clear_state->usc_clear_attachment_programs);
      return result;
   }

   /* Offset of the suballocation relative to the PDS heap base. */
   pds_upload_offset =
      clear_state->pds_clear_attachment_programs->dev_addr.addr -
      device->heaps.pds_heap->base_addr.addr;
   ptr =
      pvr_bo_suballoc_get_map_addr(clear_state->pds_clear_attachment_programs);

   /* Generate pass: set the programs up again the same way as in the size
    * pass, this time generating them at their assigned offsets, and record
    * the per-program info.
    */
   for (uint32_t i = 0, offset_idx = 0;
        i < ARRAY_SIZE(clear_attachment_collection);
        i++) {
      struct pvr_pds_pixel_shader_sa_program texture_pds_program;
      struct pvr_pds_kickusc_program pixel_shader_pds_program;

      if (!clear_attachment_collection[i].info) {
         /* Note that the info array is indexed by `i`, not `offset_idx`. */
         clear_state->pds_clear_attachment_program_info[i] =
            (struct pvr_pds_clear_attachment_program_info){ 0 };

         continue;
      }

      /* Texture program to load colors. */

      texture_pds_program = (struct pvr_pds_pixel_shader_sa_program){
         .num_texture_dma_kicks = 1,
      };

      pvr_pds_generate_pixel_shader_sa_code_segment(
         &texture_pds_program,
         (uint32_t *)(ptr + pds_texture_program_offsets[offset_idx]));

      /* Pixel program to load fragment shader. */

      pixel_shader_pds_program = (struct pvr_pds_kickusc_program){ 0 };

      pvr_pds_setup_doutu(&pixel_shader_pds_program.usc_task_control,
                          usc_upload_offset + usc_program_offsets[offset_idx],
                          clear_attachment_collection[i].info->temps_required,
                          ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
                          false);

      pvr_pds_generate_pixel_shader_program(
         &pixel_shader_pds_program,
         (uint32_t *)(ptr + pds_pixel_program_offsets[offset_idx]));

      /* Setup the PDS program info. */

      pvr_pds_set_sizes_pixel_shader_sa_texture_data(&texture_pds_program,
                                                     dev_info);

      /* The recorded program offsets are relative to the PDS heap base. */
      clear_state->pds_clear_attachment_program_info[i] =
         (struct pvr_pds_clear_attachment_program_info){
            .texture_program_offset = PVR_DEV_ADDR(
               pds_upload_offset + pds_texture_program_offsets[offset_idx]),
            .pixel_program_offset = PVR_DEV_ADDR(
               pds_upload_offset + pds_pixel_program_offsets[offset_idx]),

            .texture_program_pds_temps_count = texture_pds_program.temps_used,
            .texture_program_data_size = texture_pds_program.data_size,
         };

      offset_idx++;
   }

   return VK_SUCCESS;
}
434 
435 static void
pvr_device_finish_clear_attachment_programs(struct pvr_device * device)436 pvr_device_finish_clear_attachment_programs(struct pvr_device *device)
437 {
438    struct pvr_device_static_clear_state *clear_state =
439       &device->static_clear_state;
440 
441    pvr_bo_suballoc_free(clear_state->usc_clear_attachment_programs);
442    pvr_bo_suballoc_free(clear_state->pds_clear_attachment_programs);
443 }
444 
445 /**
446  * \brief Generate and uploads vertices required to clear the rect area.
447  *
448  * We use the triangle strip topology for clears so this functions generates 4
449  * vertices to represent the rect. Note that the coordinates are in screen space
450  * and not NDC.
451  *
452  * \param[in]  device      Device to upload to.
453  * \param[in]  rect        Area to clear.
454  * \param[in]  depth       Depth (i.e. Z coordinate) of the area to clear.
455  * \param[out] pvr_bo_out  BO upload object.
456  * \return VK_SUCCESS if the upload succeeded.
457  */
pvr_clear_vertices_upload(struct pvr_device * device,const VkRect2D * rect,float depth,struct pvr_suballoc_bo ** const pvr_bo_out)458 VkResult pvr_clear_vertices_upload(struct pvr_device *device,
459                                    const VkRect2D *rect,
460                                    float depth,
461                                    struct pvr_suballoc_bo **const pvr_bo_out)
462 {
463    const float y1 = (float)(rect->offset.y + rect->extent.height);
464    const float x1 = (float)(rect->offset.x + rect->extent.width);
465    const float y0 = (float)rect->offset.y;
466    const float x0 = (float)rect->offset.x;
467 
468    const float vertices[PVR_CLEAR_VERTEX_COUNT][PVR_CLEAR_VERTEX_COORDINATES] = {
469       [0] = { [0] = x0, [1] = y0, [2] = depth },
470       [1] = { [0] = x0, [1] = y1, [2] = depth },
471       [2] = { [0] = x1, [1] = y0, [2] = depth },
472       [3] = { [0] = x1, [1] = y1, [2] = depth }
473    };
474 
475    return pvr_gpu_upload(device,
476                          device->heaps.general_heap,
477                          vertices,
478                          sizeof(vertices),
479                          4,
480                          pvr_bo_out);
481 }
482 
pvr_device_init_graphics_static_clear_state(struct pvr_device * device)483 VkResult pvr_device_init_graphics_static_clear_state(struct pvr_device *device)
484 {
485    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
486    const VkRect2D vf_rect = {
487       .offset = { .x = 0, .y = 0 },
488       .extent = { .width = rogue_get_param_vf_max_x(dev_info),
489                   .height = rogue_get_param_vf_max_y(dev_info) }
490    };
491 
492    const uint32_t vdm_state_size_in_dw =
493       pvr_clear_vdm_state_get_size_in_dw(dev_info, 1);
494    struct pvr_device_static_clear_state *state = &device->static_clear_state;
495    const uint32_t cache_line_size = rogue_get_slc_cache_line_size(dev_info);
496    struct pvr_pds_vertex_shader_program pds_program;
497    struct util_dynarray passthrough_vert_shader;
498    uint32_t *state_buffer;
499    VkResult result;
500 
501    if (PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
502       struct util_dynarray passthrough_rta_vert_shader;
503 
504       util_dynarray_init(&passthrough_rta_vert_shader, NULL);
505       pvr_hard_code_get_passthrough_rta_vertex_shader(
506          dev_info,
507          &passthrough_rta_vert_shader);
508 
509       result = pvr_gpu_upload_usc(device,
510                                   passthrough_rta_vert_shader.data,
511                                   passthrough_rta_vert_shader.size,
512                                   cache_line_size,
513                                   &state->usc_multi_layer_vertex_shader_bo);
514       if (result != VK_SUCCESS) {
515          util_dynarray_fini(&passthrough_rta_vert_shader);
516          return result;
517       }
518 
519       util_dynarray_fini(&passthrough_rta_vert_shader);
520    } else {
521       state->usc_multi_layer_vertex_shader_bo = NULL;
522    }
523 
524    util_dynarray_init(&passthrough_vert_shader, NULL);
525    pvr_hard_code_get_passthrough_vertex_shader(dev_info,
526                                                &passthrough_vert_shader);
527 
528    result = pvr_gpu_upload_usc(device,
529                                passthrough_vert_shader.data,
530                                passthrough_vert_shader.size,
531                                cache_line_size,
532                                &state->usc_vertex_shader_bo);
533    util_dynarray_fini(&passthrough_vert_shader);
534    if (result != VK_SUCCESS)
535       goto err_free_usc_multi_layer_shader;
536 
537    result =
538       pvr_clear_vertices_upload(device, &vf_rect, 0.0f, &state->vertices_bo);
539    if (result != VK_SUCCESS)
540       goto err_free_usc_shader;
541 
542    pvr_pds_clear_vertex_shader_program_init_base(&pds_program,
543                                                  state->usc_vertex_shader_bo);
544 
545    result =
546       pvr_pds_clear_vertex_shader_program_create_and_upload(&pds_program,
547                                                             device,
548                                                             state->vertices_bo,
549                                                             &state->pds);
550    if (result != VK_SUCCESS)
551       goto err_free_vertices_buffer;
552 
553    pvr_device_setup_graphics_static_clear_ppp_base(&state->ppp_base);
554    pvr_device_setup_graphics_static_clear_ppp_templates(state->ppp_templates);
555 
556    assert(pds_program.code_size <= state->pds.code_size);
557 
558    state_buffer = vk_alloc(&device->vk.alloc,
559                            PVR_DW_TO_BYTES(vdm_state_size_in_dw * 2),
560                            8,
561                            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
562    if (state_buffer == NULL) {
563       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
564       goto err_free_pds_program;
565    }
566 
567    /* TODO: The difference between the large and normal words is only the last
568     * word. The value is 3 or 4 depending on the amount of indices. Should we
569     * dedup this?
570     */
571 
572    /* The large clear state words cover the max framebuffer. The normal clear
573     * state words cover only half (since 3 indices are passed, forming a single
574     * triangle, instead of 4) and are used when the render area fits within a
575     * quarter of the max framebuffer, i.e. fit within the single triangle.
576     */
577    /* 4 * sizeof(uint32_t) because of the 4 pixel output regs. */
578    /* TODO: Replace 4 * sizeof(uint32_t) with a defines from the compiler or
579     * hook up the value directly to it using some compiler info.
580     */
581    pvr_pack_clear_vdm_state(&device->pdevice->dev_info,
582                             &state->pds,
583                             pds_program.temps_used,
584                             3,
585                             4 * sizeof(uint32_t),
586                             1,
587                             state_buffer);
588    state->vdm_words = state_buffer;
589    state_buffer += vdm_state_size_in_dw;
590 
591    pvr_pack_clear_vdm_state(&device->pdevice->dev_info,
592                             &state->pds,
593                             pds_program.temps_used,
594                             4,
595                             4 * sizeof(uint32_t),
596                             1,
597                             state_buffer);
598    state->large_clear_vdm_words = state_buffer;
599 
600    result = pvr_device_init_clear_attachment_programs(device);
601    if (result != VK_SUCCESS)
602       goto err_free_vdm_state;
603 
604    return VK_SUCCESS;
605 
606 err_free_vdm_state:
607    /* Cast away the const :( */
608    vk_free(&device->vk.alloc, (void *)state->vdm_words);
609 
610 err_free_pds_program:
611    pvr_bo_suballoc_free(state->pds.pvr_bo);
612 
613 err_free_vertices_buffer:
614    pvr_bo_suballoc_free(state->vertices_bo);
615 
616 err_free_usc_shader:
617    pvr_bo_suballoc_free(state->usc_vertex_shader_bo);
618 
619 err_free_usc_multi_layer_shader:
620    pvr_bo_suballoc_free(state->usc_multi_layer_vertex_shader_bo);
621 
622    return result;
623 }
624 
pvr_device_finish_graphics_static_clear_state(struct pvr_device * device)625 void pvr_device_finish_graphics_static_clear_state(struct pvr_device *device)
626 {
627    struct pvr_device_static_clear_state *state = &device->static_clear_state;
628 
629    pvr_device_finish_clear_attachment_programs(device);
630 
631    /* Don't free `large_clear_vdm_words` since it was allocated together with
632     * `vdm_words`.
633     */
634    /* Cast away the const :( */
635    vk_free(&device->vk.alloc, (void *)state->vdm_words);
636 
637    pvr_bo_suballoc_free(state->pds.pvr_bo);
638    pvr_bo_suballoc_free(state->vertices_bo);
639    pvr_bo_suballoc_free(state->usc_vertex_shader_bo);
640    pvr_bo_suballoc_free(state->usc_multi_layer_vertex_shader_bo);
641 }
642 
pvr_pds_clear_vertex_shader_program_init_base(struct pvr_pds_vertex_shader_program * program,const struct pvr_suballoc_bo * usc_shader_bo)643 void pvr_pds_clear_vertex_shader_program_init_base(
644    struct pvr_pds_vertex_shader_program *program,
645    const struct pvr_suballoc_bo *usc_shader_bo)
646 {
647    *program = (struct pvr_pds_vertex_shader_program){
648       .num_streams = 1,
649       .streams = {
650          [0] = {
651             /* We'll get this from this interface's client when generating the
652              * data segment. This will be the address of the vertex buffer.
653              */
654             .address = 0,
655             .stride = PVR_CLEAR_VERTEX_COORDINATES * sizeof(uint32_t),
656             .num_elements = 1,
657             .elements = {
658                [0] = {
659                   .size = PVR_CLEAR_VERTEX_COUNT * PVR_CLEAR_VERTEX_COORDINATES,
660                },
661             },
662          },
663       },
664    };
665 
666    pvr_pds_setup_doutu(&program->usc_task_control,
667                        usc_shader_bo->dev_addr.addr,
668                        0,
669                        ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
670                        false);
671 }
672 
pvr_pds_clear_vertex_shader_program_create_and_upload(struct pvr_pds_vertex_shader_program * program,struct pvr_device * device,const struct pvr_suballoc_bo * vertices_bo,struct pvr_pds_upload * const upload_out)673 VkResult pvr_pds_clear_vertex_shader_program_create_and_upload(
674    struct pvr_pds_vertex_shader_program *program,
675    struct pvr_device *device,
676    const struct pvr_suballoc_bo *vertices_bo,
677    struct pvr_pds_upload *const upload_out)
678 {
679    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
680    uint32_t staging_buffer_size;
681    uint32_t *staging_buffer;
682    VkResult result;
683 
684    program->streams[0].address = vertices_bo->dev_addr.addr;
685 
686    pvr_pds_vertex_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
687 
688    staging_buffer_size =
689       PVR_DW_TO_BYTES(program->code_size + program->data_size);
690 
691    staging_buffer = vk_alloc(&device->vk.alloc,
692                              staging_buffer_size,
693                              8,
694                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
695    if (!staging_buffer) {
696       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
697       goto err_exit;
698    }
699 
700    pvr_pds_vertex_shader(program,
701                          staging_buffer,
702                          PDS_GENERATE_DATA_SEGMENT,
703                          dev_info);
704    pvr_pds_vertex_shader(program,
705                          &staging_buffer[program->data_size],
706                          PDS_GENERATE_CODE_SEGMENT,
707                          dev_info);
708 
709    /* FIXME: Figure out the define for alignment of 16. */
710    result = pvr_gpu_upload_pds(device,
711                                &staging_buffer[0],
712                                program->data_size,
713                                16,
714                                &staging_buffer[program->data_size],
715                                program->code_size,
716                                16,
717                                16,
718                                upload_out);
719    if (result != VK_SUCCESS)
720       goto err_free_staging_buffer;
721 
722    vk_free(&device->vk.alloc, staging_buffer);
723    return VK_SUCCESS;
724 
725 err_free_staging_buffer:
726    vk_free(&device->vk.alloc, staging_buffer);
727 
728 err_exit:
729    *upload_out = (struct pvr_pds_upload){ 0 };
730    return result;
731 }
732 
pvr_pds_clear_vertex_shader_program_create_and_upload_data(struct pvr_pds_vertex_shader_program * program,struct pvr_cmd_buffer * cmd_buffer,struct pvr_suballoc_bo * vertices_bo,struct pvr_pds_upload * const pds_upload_out)733 VkResult pvr_pds_clear_vertex_shader_program_create_and_upload_data(
734    struct pvr_pds_vertex_shader_program *program,
735    struct pvr_cmd_buffer *cmd_buffer,
736    struct pvr_suballoc_bo *vertices_bo,
737    struct pvr_pds_upload *const pds_upload_out)
738 {
739    struct pvr_device_info *dev_info = &cmd_buffer->device->pdevice->dev_info;
740    uint32_t staging_buffer_size;
741    uint32_t *staging_buffer;
742    VkResult result;
743 
744    program->streams[0].address = vertices_bo->dev_addr.addr;
745 
746    pvr_pds_vertex_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
747 
748    staging_buffer_size = PVR_DW_TO_BYTES(program->data_size);
749 
750    staging_buffer = vk_alloc(&cmd_buffer->device->vk.alloc,
751                              staging_buffer_size,
752                              8,
753                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
754    if (!staging_buffer) {
755       *pds_upload_out = (struct pvr_pds_upload){ 0 };
756 
757       return vk_command_buffer_set_error(&cmd_buffer->vk,
758                                          VK_ERROR_OUT_OF_HOST_MEMORY);
759    }
760 
761    pvr_pds_vertex_shader(program,
762                          staging_buffer,
763                          PDS_GENERATE_DATA_SEGMENT,
764                          dev_info);
765 
766    result = pvr_cmd_buffer_upload_pds(cmd_buffer,
767                                       staging_buffer,
768                                       program->data_size,
769                                       4,
770                                       NULL,
771                                       0,
772                                       0,
773                                       4,
774                                       pds_upload_out);
775    if (result != VK_SUCCESS) {
776       vk_free(&cmd_buffer->device->vk.alloc, staging_buffer);
777 
778       *pds_upload_out = (struct pvr_pds_upload){ 0 };
779 
780       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
781    }
782 
783    vk_free(&cmd_buffer->device->vk.alloc, staging_buffer);
784 
785    return VK_SUCCESS;
786 }
787 
pvr_pds_clear_rta_vertex_shader_program_init_base(struct pvr_pds_vertex_shader_program * program,const struct pvr_suballoc_bo * usc_shader_bo)788 void pvr_pds_clear_rta_vertex_shader_program_init_base(
789    struct pvr_pds_vertex_shader_program *program,
790    const struct pvr_suballoc_bo *usc_shader_bo)
791 {
792    pvr_pds_clear_vertex_shader_program_init_base(program, usc_shader_bo);
793 
794    /* We'll set the render target index to be the instance id + base array
795     * layer. Since the base array layer can change in between clear rects, we
796     * don't set it here and ask for it when generating the code and data
797     * section.
798     */
799    /* This is 3 because the instance id register will follow the xyz coordinate
800     * registers in the register file.
801     * TODO: Maybe we want this to be hooked up to the compiler?
802     */
803    program->iterate_instance_id = true;
804    program->instance_id_register = 3;
805 }
806 
pvr_pds_clear_rta_vertex_shader_program_create_and_upload_code(struct pvr_pds_vertex_shader_program * program,struct pvr_cmd_buffer * cmd_buffer,uint32_t base_array_layer,struct pvr_pds_upload * const pds_upload_out)807 VkResult pvr_pds_clear_rta_vertex_shader_program_create_and_upload_code(
808    struct pvr_pds_vertex_shader_program *program,
809    struct pvr_cmd_buffer *cmd_buffer,
810    uint32_t base_array_layer,
811    struct pvr_pds_upload *const pds_upload_out)
812 {
813    struct pvr_device_info *dev_info = &cmd_buffer->device->pdevice->dev_info;
814    uint32_t staging_buffer_size;
815    uint32_t *staging_buffer;
816    VkResult result;
817 
818    program->instance_id_modifier = base_array_layer;
819 
820    pvr_pds_vertex_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
821 
822    staging_buffer_size = PVR_DW_TO_BYTES(program->code_size);
823 
824    staging_buffer = vk_alloc(&cmd_buffer->device->vk.alloc,
825                              staging_buffer_size,
826                              8,
827                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
828    if (!staging_buffer) {
829       *pds_upload_out = (struct pvr_pds_upload){ 0 };
830 
831       return vk_command_buffer_set_error(&cmd_buffer->vk,
832                                          VK_ERROR_OUT_OF_HOST_MEMORY);
833    }
834 
835    pvr_pds_vertex_shader(program,
836                          staging_buffer,
837                          PDS_GENERATE_CODE_SEGMENT,
838                          dev_info);
839 
840    result = pvr_cmd_buffer_upload_pds(cmd_buffer,
841                                       NULL,
842                                       0,
843                                       0,
844                                       staging_buffer,
845                                       program->code_size,
846                                       4,
847                                       4,
848                                       pds_upload_out);
849    if (result != VK_SUCCESS) {
850       vk_free(&cmd_buffer->device->vk.alloc, staging_buffer);
851 
852       *pds_upload_out = (struct pvr_pds_upload){ 0 };
853 
854       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
855    }
856 
857    vk_free(&cmd_buffer->device->vk.alloc, staging_buffer);
858 
859    return VK_SUCCESS;
860 }
861 
862 /**
863  * Pack VDM control stream words for clear.
864  *
865  * The size of the `state_buffer` provided is expected to point to a buffer of
866  * size equal to what is returned by `pvr_clear_vdm_state_get_size_in_dw()`.
867  */
pvr_pack_clear_vdm_state(const struct pvr_device_info * const dev_info,const struct pvr_pds_upload * const program,uint32_t temps,uint32_t index_count,uint32_t vs_output_size_in_bytes,uint32_t layer_count,uint32_t * const state_buffer)868 void pvr_pack_clear_vdm_state(const struct pvr_device_info *const dev_info,
869                               const struct pvr_pds_upload *const program,
870                               uint32_t temps,
871                               uint32_t index_count,
872                               uint32_t vs_output_size_in_bytes,
873                               uint32_t layer_count,
874                               uint32_t *const state_buffer)
875 {
876    const uint32_t vs_output_size =
877       DIV_ROUND_UP(vs_output_size_in_bytes,
878                    ROGUE_VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE);
879    const bool needs_instance_count =
880       !PVR_HAS_FEATURE(dev_info, gs_rta_support) && layer_count > 1;
881    uint32_t *stream = state_buffer;
882    uint32_t max_instances;
883    uint32_t cam_size;
884 
885    /* The layer count should at least be 1. For vkCmdClearAttachment() the spec.
886     * guarantees that the layer count is not 0.
887     */
888    assert(layer_count != 0);
889 
890    pvr_calculate_vertex_cam_size(dev_info,
891                                  vs_output_size,
892                                  true,
893                                  &cam_size,
894                                  &max_instances);
895 
896    pvr_csb_pack (stream, VDMCTRL_VDM_STATE0, state0) {
897       state0.vs_data_addr_present = true;
898       state0.vs_other_present = true;
899       state0.cam_size = cam_size;
900       state0.uvs_scratch_size_select =
901          ROGUE_VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE;
902       state0.flatshade_control = ROGUE_VDMCTRL_FLATSHADE_CONTROL_VERTEX_0;
903    }
904    stream += pvr_cmd_length(VDMCTRL_VDM_STATE0);
905 
906    pvr_csb_pack (stream, VDMCTRL_VDM_STATE2, state2) {
907       state2.vs_pds_data_base_addr = PVR_DEV_ADDR(program->data_offset);
908    }
909    stream += pvr_cmd_length(VDMCTRL_VDM_STATE2);
910 
911    pvr_csb_pack (stream, VDMCTRL_VDM_STATE3, state3) {
912       state3.vs_pds_code_base_addr = PVR_DEV_ADDR(program->code_offset);
913    }
914    stream += pvr_cmd_length(VDMCTRL_VDM_STATE3);
915 
916    pvr_csb_pack (stream, VDMCTRL_VDM_STATE4, state4) {
917       state4.vs_output_size = vs_output_size;
918    }
919    stream += pvr_cmd_length(VDMCTRL_VDM_STATE4);
920 
921    pvr_csb_pack (stream, VDMCTRL_VDM_STATE5, state5) {
922       state5.vs_max_instances = max_instances;
923       /* This is the size of the input vertex. The hw manages the USC
924        * temporaries separately so we don't need to include them here.
925        */
926       state5.vs_usc_unified_size =
927          DIV_ROUND_UP(PVR_CLEAR_VERTEX_COORDINATES * sizeof(uint32_t),
928                       ROGUE_VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE);
929       state5.vs_pds_temp_size =
930          DIV_ROUND_UP(temps,
931                       ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE);
932       state5.vs_pds_data_size =
933          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
934                       ROGUE_VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE);
935    }
936    stream += pvr_cmd_length(VDMCTRL_VDM_STATE5);
937 
938    /* TODO: Here we're doing another state update. If emitting directly to the
939     * control stream, we don't mark them as separate state updates by setting
940     * the relocation mark so we might be wasting a little bit of memory. See if
941     * it's worth changing the code to use the relocation mark.
942     */
943 
944    pvr_csb_pack (stream, VDMCTRL_INDEX_LIST0, index_list0) {
945       index_list0.index_count_present = true;
946       index_list0.index_instance_count_present = needs_instance_count;
947       index_list0.primitive_topology =
948          ROGUE_VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP;
949    }
950    stream += pvr_cmd_length(VDMCTRL_INDEX_LIST0);
951 
952    pvr_csb_pack (stream, VDMCTRL_INDEX_LIST2, index_list3) {
953       index_list3.index_count = index_count;
954    }
955    stream += pvr_cmd_length(VDMCTRL_INDEX_LIST2);
956 
957    if (needs_instance_count) {
958       pvr_csb_pack (stream, VDMCTRL_INDEX_LIST3, index_list3) {
959          index_list3.instance_count = layer_count - 1;
960       }
961       stream += pvr_cmd_length(VDMCTRL_INDEX_LIST3);
962    }
963 
964    assert((uint64_t)(stream - state_buffer) ==
965           pvr_clear_vdm_state_get_size_in_dw(dev_info, layer_count));
966 }
967