/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include "vk_util.h"
#include "vk_format.h"
32 /* Return true if we have to fallback to sysmem rendering because the
33  * dependency can't be satisfied with tiled rendering.
34  */
35 
36 static bool
dep_invalid_for_gmem(const VkSubpassDependency2 * dep)37 dep_invalid_for_gmem(const VkSubpassDependency2 *dep)
38 {
39    /* External dependencies don't matter here. */
40    if (dep->srcSubpass == VK_SUBPASS_EXTERNAL ||
41        dep->dstSubpass == VK_SUBPASS_EXTERNAL)
42       return false;
43 
44    /* We can conceptually break down the process of rewriting a sysmem
45     * renderpass into a gmem one into two parts:
46     *
47     * 1. Split each draw and multisample resolve into N copies, one for each
48     * bin. (If hardware binning, add one more copy where the FS is disabled
49     * for the binning pass). This is always allowed because the vertex stage
50     * is allowed to run an arbitrary number of times and there are no extra
51     * ordering constraints within a draw.
52     * 2. Take the last copy of the second-to-last draw and slide it down to
53     * before the last copy of the last draw. Repeat for each earlier draw
54     * until the draw pass for the last bin is complete, then repeat for each
55     * earlier bin until we finish with the first bin.
56     *
57     * During this rearranging process, we can't slide draws past each other in
58     * a way that breaks the subpass dependencies. For each draw, we must slide
59     * it past (copies of) the rest of the draws in the renderpass. We can
60     * slide a draw past another if there isn't a dependency between them, or
61     * if the dependenc(ies) are dependencies between framebuffer-space stages
62     * only with the BY_REGION bit set. Note that this includes
63     * self-dependencies, since these may result in pipeline barriers that also
64     * break the rearranging process.
65     */
66 
67    /* This is straight from the Vulkan 1.2 spec, section 6.1.4 "Framebuffer
68     * Region Dependencies":
69     */
70    const VkPipelineStageFlags framebuffer_space_stages =
71       VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
72       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
73       VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
74       VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
75 
76    return
77       (dep->srcStageMask & ~framebuffer_space_stages) ||
78       (dep->dstStageMask & ~framebuffer_space_stages) ||
79       !(dep->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT);
80 }
81 
82 static void
tu_render_pass_add_subpass_dep(struct tu_render_pass * pass,const VkSubpassDependency2 * dep)83 tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
84                                const VkSubpassDependency2 *dep)
85 {
86    uint32_t src = dep->srcSubpass;
87    uint32_t dst = dep->dstSubpass;
88 
89    /* Ignore subpass self-dependencies as they allow the app to call
90     * vkCmdPipelineBarrier() inside the render pass and the driver should only
91     * do the barrier when called, not when starting the render pass.
92     *
93     * We cannot decide whether to allow gmem rendering before a barrier
94     * is actually emitted, so we delay the decision until then.
95     */
96    if (src == dst)
97       return;
98 
99    if (dep_invalid_for_gmem(dep))
100       pass->gmem_pixels = 0;
101 
102    struct tu_subpass_barrier *dst_barrier;
103    if (dst == VK_SUBPASS_EXTERNAL) {
104       dst_barrier = &pass->end_barrier;
105    } else {
106       dst_barrier = &pass->subpasses[dst].start_barrier;
107    }
108 
109    dst_barrier->src_stage_mask |= dep->srcStageMask;
110    dst_barrier->dst_stage_mask |= dep->dstStageMask;
111    dst_barrier->src_access_mask |= dep->srcAccessMask;
112    dst_barrier->dst_access_mask |= dep->dstAccessMask;
113 }
114 
115 /* We currently only care about undefined layouts, because we have to
116  * flush/invalidate CCU for those. PREINITIALIZED is the same thing as
117  * UNDEFINED for anything not linear tiled, but we don't know yet whether the
118  * images used are tiled, so just assume they are.
119  */
120 
121 static bool
layout_undefined(VkImageLayout layout)122 layout_undefined(VkImageLayout layout)
123 {
124    return layout == VK_IMAGE_LAYOUT_UNDEFINED ||
125           layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
126 }
127 
128 /* This implements the following bit of spec text:
129  *
130  *    If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the
131  *    first subpass that uses an attachment, then an implicit subpass
132  *    dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is
133  *    used in. The implicit subpass dependency only exists if there
134  *    exists an automatic layout transition away from initialLayout.
135  *    The subpass dependency operates as if defined with the
136  *    following parameters:
137  *
138  *    VkSubpassDependency implicitDependency = {
139  *        .srcSubpass = VK_SUBPASS_EXTERNAL;
140  *        .dstSubpass = firstSubpass; // First subpass attachment is used in
141  *        .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
142  *        .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
143  *        .srcAccessMask = 0;
144  *        .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
145  *                         VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
146  *                         VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
147  *                         VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
148  *                         VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
149  *        .dependencyFlags = 0;
150  *    };
151  *
152  *    Similarly, if there is no subpass dependency from the last subpass
153  *    that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit
154  *    subpass dependency exists from the last subpass it is used in to
155  *    VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists
156  *    if there exists an automatic layout transition into finalLayout.
157  *    The subpass dependency operates as if defined with the following
158  *    parameters:
159  *
160  *    VkSubpassDependency implicitDependency = {
161  *        .srcSubpass = lastSubpass; // Last subpass attachment is used in
162  *        .dstSubpass = VK_SUBPASS_EXTERNAL;
163  *        .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
164  *        .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
165  *        .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
166  *                         VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
167  *                         VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
168  *                         VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
169  *                         VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
170  *        .dstAccessMask = 0;
171  *        .dependencyFlags = 0;
172  *    };
173  *
174  * Note: currently this is the only use we have for layout transitions,
175  * besides needing to invalidate CCU at the beginning, so we also flag
176  * transitions from UNDEFINED here.
177  */
178 static void
tu_render_pass_add_implicit_deps(struct tu_render_pass * pass,const VkRenderPassCreateInfo2 * info)179 tu_render_pass_add_implicit_deps(struct tu_render_pass *pass,
180                                  const VkRenderPassCreateInfo2 *info)
181 {
182    const VkAttachmentDescription2* att = info->pAttachments;
183    bool has_external_src[info->subpassCount];
184    bool has_external_dst[info->subpassCount];
185    bool att_used[pass->attachment_count];
186 
187    memset(has_external_src, 0, sizeof(has_external_src));
188    memset(has_external_dst, 0, sizeof(has_external_dst));
189 
190    for (uint32_t i = 0; i < info->dependencyCount; i++) {
191       uint32_t src = info->pDependencies[i].srcSubpass;
192       uint32_t dst = info->pDependencies[i].dstSubpass;
193 
194       if (src == dst)
195          continue;
196 
197       if (src == VK_SUBPASS_EXTERNAL)
198          has_external_src[dst] = true;
199       if (dst == VK_SUBPASS_EXTERNAL)
200          has_external_dst[src] = true;
201    }
202 
203    memset(att_used, 0, sizeof(att_used));
204 
205    for (unsigned i = 0; i < info->subpassCount; i++) {
206       const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
207       bool src_implicit_dep = false;
208 
209       for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
210          uint32_t a = subpass->pInputAttachments[j].attachment;
211          if (a == VK_ATTACHMENT_UNUSED)
212             continue;
213          if (att[a].initialLayout != subpass->pInputAttachments[j].layout &&
214              !att_used[a] && !has_external_src[i])
215             src_implicit_dep = true;
216          att_used[a] = true;
217       }
218 
219       for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
220          uint32_t a = subpass->pColorAttachments[j].attachment;
221          if (a == VK_ATTACHMENT_UNUSED)
222             continue;
223          if (att[a].initialLayout != subpass->pColorAttachments[j].layout &&
224              !att_used[a] && !has_external_src[i])
225             src_implicit_dep = true;
226          att_used[a] = true;
227       }
228 
229       if (subpass->pDepthStencilAttachment &&
230           subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
231          uint32_t a = subpass->pDepthStencilAttachment->attachment;
232          if (att[a].initialLayout != subpass->pDepthStencilAttachment->layout &&
233              !att_used[a] && !has_external_src[i])
234             src_implicit_dep = true;
235          att_used[a] = true;
236       }
237 
238       if (subpass->pResolveAttachments) {
239          for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
240             uint32_t a = subpass->pResolveAttachments[j].attachment;
241             if (a == VK_ATTACHMENT_UNUSED)
242                continue;
243             if (att[a].initialLayout != subpass->pResolveAttachments[j].layout &&
244                !att_used[a] && !has_external_src[i])
245                src_implicit_dep = true;
246             att_used[a] = true;
247          }
248       }
249 
250       const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
251          vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
252 
253       if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
254           ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
255             uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
256             if (att[a].initialLayout != subpass->pDepthStencilAttachment->layout &&
257                !att_used[a] && !has_external_src[i])
258                src_implicit_dep = true;
259             att_used[a] = true;
260       }
261 
262       if (src_implicit_dep) {
263          tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) {
264             .srcSubpass = VK_SUBPASS_EXTERNAL,
265             .dstSubpass = i,
266             .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
267             .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
268             .srcAccessMask = 0,
269             .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
270                              VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
271                              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
272                              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
273                              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
274             .dependencyFlags = 0,
275          });
276       }
277    }
278 
279    memset(att_used, 0, sizeof(att_used));
280 
281    for (int i = info->subpassCount - 1; i >= 0; i--) {
282       const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
283       bool dst_implicit_dep = false;
284 
285       for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
286          uint32_t a = subpass->pInputAttachments[j].attachment;
287          if (a == VK_ATTACHMENT_UNUSED)
288             continue;
289          if (att[a].finalLayout != subpass->pInputAttachments[j].layout &&
290              !att_used[a] && !has_external_dst[i])
291             dst_implicit_dep = true;
292          att_used[a] = true;
293       }
294 
295       for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
296          uint32_t a = subpass->pColorAttachments[j].attachment;
297          if (a == VK_ATTACHMENT_UNUSED)
298             continue;
299          if (att[a].finalLayout != subpass->pColorAttachments[j].layout &&
300              !att_used[a] && !has_external_dst[i])
301             dst_implicit_dep = true;
302          att_used[a] = true;
303       }
304 
305       if (subpass->pDepthStencilAttachment &&
306           subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
307          uint32_t a = subpass->pDepthStencilAttachment->attachment;
308          if (att[a].finalLayout != subpass->pDepthStencilAttachment->layout &&
309              !att_used[a] && !has_external_dst[i])
310             dst_implicit_dep = true;
311          att_used[a] = true;
312       }
313 
314       if (subpass->pResolveAttachments) {
315          for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
316             uint32_t a = subpass->pResolveAttachments[j].attachment;
317             if (a == VK_ATTACHMENT_UNUSED)
318                continue;
319             if (att[a].finalLayout != subpass->pResolveAttachments[j].layout &&
320                 !att_used[a] && !has_external_dst[i])
321                dst_implicit_dep = true;
322             att_used[a] = true;
323          }
324       }
325 
326       const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
327          vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
328 
329       if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
330           ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
331             uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
332             if (att[a].finalLayout != subpass->pDepthStencilAttachment->layout &&
333                !att_used[a] && !has_external_dst[i])
334                dst_implicit_dep = true;
335             att_used[a] = true;
336       }
337 
338       if (dst_implicit_dep) {
339          tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) {
340             .srcSubpass = i,
341             .dstSubpass = VK_SUBPASS_EXTERNAL,
342             .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
343             .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
344             .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
345                              VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
346                              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
347                              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
348                              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
349             .dstAccessMask = 0,
350             .dependencyFlags = 0,
351          });
352       }
353    }
354 
355    /* Handle UNDEFINED transitions, similar to the handling in tu_barrier().
356     * Assume that if an attachment has an initial layout of UNDEFINED, it gets
357     * transitioned eventually.
358     */
359    for (unsigned i = 0; i < info->attachmentCount; i++) {
360       if (layout_undefined(att[i].initialLayout)) {
361          if (vk_format_is_depth_or_stencil(att[i].format)) {
362             pass->subpasses[0].start_barrier.incoherent_ccu_depth = true;
363          } else {
364             pass->subpasses[0].start_barrier.incoherent_ccu_color = true;
365          }
366       }
367    }
368 }
369 
370 /* If an input attachment is used without an intervening write to the same
371  * attachment, then we can just use the original image, even in GMEM mode.
372  * This is an optimization, but it's also important because it allows us to
373  * avoid having to invalidate UCHE at the beginning of each tile due to it
374  * becoming invalid. The only reads of GMEM via UCHE should be after an
375  * earlier subpass modified it, which only works if there's already an
376  * appropriate dependency that will add the CACHE_INVALIDATE anyway. We
377  * don't consider this in the dependency code, so this is also required for
378  * correctness.
379  */
380 static void
tu_render_pass_patch_input_gmem(struct tu_render_pass * pass)381 tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
382 {
383    bool written[pass->attachment_count];
384 
385    memset(written, 0, sizeof(written));
386 
387    for (unsigned i = 0; i < pass->subpass_count; i++) {
388       struct tu_subpass *subpass = &pass->subpasses[i];
389 
390       for (unsigned j = 0; j < subpass->input_count; j++) {
391          uint32_t a = subpass->input_attachments[j].attachment;
392          if (a == VK_ATTACHMENT_UNUSED)
393             continue;
394          subpass->input_attachments[j].patch_input_gmem = written[a];
395       }
396 
397       for (unsigned j = 0; j < subpass->color_count; j++) {
398          uint32_t a = subpass->color_attachments[j].attachment;
399          if (a == VK_ATTACHMENT_UNUSED)
400             continue;
401          written[a] = true;
402 
403          for (unsigned k = 0; k < subpass->input_count; k++) {
404             if (subpass->input_attachments[k].attachment == a &&
405                 !subpass->input_attachments[k].patch_input_gmem) {
406                /* For render feedback loops, we have no idea whether the use
407                 * as a color attachment or input attachment will come first,
408                 * so we have to always use GMEM in case the color attachment
409                 * comes first and defensively invalidate UCHE in case the
410                 * input attachment comes first.
411                 */
412                subpass->feedback_invalidate = true;
413                subpass->input_attachments[k].patch_input_gmem = true;
414             }
415          }
416       }
417 
418       for (unsigned j = 0; j < subpass->resolve_count; j++) {
419          uint32_t a = subpass->resolve_attachments[j].attachment;
420          if (a == VK_ATTACHMENT_UNUSED)
421             continue;
422          written[a] = true;
423       }
424 
425       if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
426          written[subpass->depth_stencil_attachment.attachment] = true;
427          for (unsigned k = 0; k < subpass->input_count; k++) {
428             if (subpass->input_attachments[k].attachment ==
429                 subpass->depth_stencil_attachment.attachment &&
430                 !subpass->input_attachments[k].patch_input_gmem) {
431                subpass->feedback_invalidate = true;
432                subpass->input_attachments[k].patch_input_gmem = true;
433             }
434          }
435       }
436    }
437 }
438 
439 static void
tu_render_pass_check_feedback_loop(struct tu_render_pass * pass)440 tu_render_pass_check_feedback_loop(struct tu_render_pass *pass)
441 {
442    for (unsigned i = 0; i < pass->subpass_count; i++) {
443       struct tu_subpass *subpass = &pass->subpasses[i];
444 
445       for (unsigned j = 0; j < subpass->color_count; j++) {
446          uint32_t a = subpass->color_attachments[j].attachment;
447          if (a == VK_ATTACHMENT_UNUSED)
448             continue;
449          for (unsigned k = 0; k < subpass->input_count; k++) {
450             if (subpass->input_attachments[k].attachment == a) {
451                subpass->feedback = true;
452                break;
453             }
454          }
455       }
456 
457       if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
458          for (unsigned k = 0; k < subpass->input_count; k++) {
459             if (subpass->input_attachments[k].attachment ==
460                 subpass->depth_stencil_attachment.attachment) {
461                subpass->feedback = true;
462                break;
463             }
464          }
465       }
466    }
467 }
468 
/* Record the sample count used by a subpass. All attachments of a subpass
 * must share one sample count; 0 means "not yet set", and the assert checks
 * that no conflicting count is ever recorded.
 */
static void
update_samples(struct tu_subpass *subpass, VkSampleCountFlagBits samples)
{
   assert(subpass->samples == 0 || subpass->samples == samples);
   subpass->samples = samples;
}
475 
/* Assign GMEM offsets to every attachment that needs GMEM and compute
 * pass->gmem_pixels, the number of pixels per tile that fit in GMEM.
 * GMEM is carved up in "blocks" of gmem_align bytes, distributed between
 * the attachments proportionally to their bytes-per-pixel (cpp). If the
 * attachments can't be packed, gmem_pixels is left at 0 (sysmem fallback).
 */
static void
tu_render_pass_gmem_config(struct tu_render_pass *pass,
                           const struct tu_physical_device *phys_dev)
{
   uint32_t block_align_shift = 3; /* log2(gmem_align/(tile_align_w*tile_align_h)) */
   uint32_t tile_align_w = phys_dev->info->tile_align_w;
   uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h;

   /* calculate total bytes per pixel */
   uint32_t cpp_total = 0;
   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];
      bool cpp1 = (att->cpp == 1);
      /* gmem_offset >= 0 marks attachments that need GMEM storage. */
      if (att->gmem_offset >= 0) {
         cpp_total += att->cpp;

         /* take into account the separate stencil: */
         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
            /* For D32S8 the stencil plane's cpp is att->samples (1 byte per
             * sample); cpp1 then tracks whether the stencil plane is 1 bpp.
             */
            cpp1 = (att->samples == 1);
            cpp_total += att->samples;
         }

         /* texture pitch must be aligned to 64, use a tile_align_w that is
          * a multiple of 64 for cpp==1 attachment to work as input attachment
          */
         if (cpp1 && tile_align_w % 64 != 0) {
            /* Doubling tile_align_w keeps gmem_align constant, so halve the
             * per-block shift to compensate.
             */
            tile_align_w *= 2;
            block_align_shift -= 1;
         }
      }
   }

   pass->tile_align_w = tile_align_w;

   /* no gmem attachments */
   if (cpp_total == 0) {
      /* any value non-zero value so tiling config works with no attachments */
      pass->gmem_pixels = 1024*1024;
      return;
   }

   /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path
    * doesn't break things. maybe there is a better solution?
    * TODO: this algorithm isn't optimal
    * for example, two attachments with cpp = {1, 4}
    * result:  nblocks = {12, 52}, pixels = 196608
    * optimal: nblocks = {13, 51}, pixels = 208896
    */
   uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align;
   uint32_t offset = 0, pixels = ~0u, i;
   for (i = 0; i < pass->attachment_count; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];
      if (att->gmem_offset < 0)
         continue;

      att->gmem_offset = offset;

      /* Give each attachment a share of the remaining blocks proportional to
       * its cpp, rounded down to its block alignment (at least one aligned
       * unit).
       */
      uint32_t align = MAX2(1, att->cpp >> block_align_shift);
      uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);

      if (nblocks > gmem_blocks)
         break;

      gmem_blocks -= nblocks;
      cpp_total -= att->cpp;
      offset += nblocks * gmem_align;
      /* pixels is limited by the attachment with the smallest allocation. */
      pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);

      /* repeat the same for separate stencil */
      if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         att->gmem_offset_stencil = offset;

         /* note: for s8_uint, block align is always 1 */
         uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
         if (nblocks > gmem_blocks)
            break;

         gmem_blocks -= nblocks;
         cpp_total -= att->samples;
         offset += nblocks * gmem_align;
         pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
      }
   }

   /* if the loop didn't complete then the gmem config is impossible */
   if (i == pass->attachment_count)
      pass->gmem_pixels = pixels;
}
564 
/* Translate the Vulkan load/store ops (and stencil load/store ops) of an
 * attachment into the driver's clear_mask/load/store flags, taking the
 * format's aspects into account. For depth/stencil formats the stencil ops
 * are folded in (D24S8), replace the color ops entirely (S8), or are tracked
 * separately (D32S8, which uses a separate stencil plane in GMEM).
 */
static void
attachment_set_ops(struct tu_render_pass_attachment *att,
                   VkAttachmentLoadOp load_op,
                   VkAttachmentLoadOp stencil_load_op,
                   VkAttachmentStoreOp store_op,
                   VkAttachmentStoreOp stencil_store_op)
{
   /* load/store ops */
   att->clear_mask =
      (load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
   att->load = (load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
   att->store = (store_op == VK_ATTACHMENT_STORE_OP_STORE);

   bool stencil_clear = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR);
   bool stencil_load = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
   bool stencil_store = (stencil_store_op == VK_ATTACHMENT_STORE_OP_STORE);

   switch (att->format) {
   case VK_FORMAT_D24_UNORM_S8_UINT: /* || stencil load/store */
      /* Packed depth+stencil: rewrite the COLOR aspect to DEPTH and OR the
       * stencil ops into the shared load/store flags.
       */
      if (att->clear_mask)
         att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
      if (stencil_clear)
         att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      if (stencil_load)
         att->load = true;
      if (stencil_store)
         att->store = true;
      break;
   case VK_FORMAT_S8_UINT: /* replace load/store with stencil load/store */
      att->clear_mask = stencil_clear ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
      att->load = stencil_load;
      att->store = stencil_store;
      break;
   case VK_FORMAT_D32_SFLOAT_S8_UINT: /* separate stencil */
      /* Depth and stencil live in separate planes; stencil ops get their
       * own load_stencil/store_stencil flags.
       */
      if (att->clear_mask)
         att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
      if (stencil_clear)
         att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      if (stencil_load)
         att->load_stencil = true;
      if (stencil_store)
         att->store_stencil = true;
      break;
   default:
      /* Color-only formats: the stencil ops are ignored per the spec. */
      break;
   }
}
612 
613 static bool
is_depth_stencil_resolve_enabled(const VkSubpassDescriptionDepthStencilResolve * depth_stencil_resolve)614 is_depth_stencil_resolve_enabled(const VkSubpassDescriptionDepthStencilResolve *depth_stencil_resolve)
615 {
616    if (depth_stencil_resolve &&
617        depth_stencil_resolve->pDepthStencilResolveAttachment &&
618        depth_stencil_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
619       return true;
620    }
621    return false;
622 }
623 
624 VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateRenderPass2(VkDevice _device,const VkRenderPassCreateInfo2KHR * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkRenderPass * pRenderPass)625 tu_CreateRenderPass2(VkDevice _device,
626                      const VkRenderPassCreateInfo2KHR *pCreateInfo,
627                      const VkAllocationCallbacks *pAllocator,
628                      VkRenderPass *pRenderPass)
629 {
630    TU_FROM_HANDLE(tu_device, device, _device);
631    struct tu_render_pass *pass;
632    size_t size;
633    size_t attachments_offset;
634 
635    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
636 
637    size = sizeof(*pass);
638    size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
639    attachments_offset = size;
640    size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);
641 
642    pass = vk_object_zalloc(&device->vk, pAllocator, size,
643                            VK_OBJECT_TYPE_RENDER_PASS);
644    if (pass == NULL)
645       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
646 
647    pass->attachment_count = pCreateInfo->attachmentCount;
648    pass->subpass_count = pCreateInfo->subpassCount;
649    pass->attachments = (void *) pass + attachments_offset;
650 
651    for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
652       struct tu_render_pass_attachment *att = &pass->attachments[i];
653 
654       att->format = pCreateInfo->pAttachments[i].format;
655       att->samples = pCreateInfo->pAttachments[i].samples;
656       /* for d32s8, cpp is for the depth image, and
657        * att->samples will be used as the cpp for the stencil image
658        */
659       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
660          att->cpp = 4 * att->samples;
661       else
662          att->cpp = vk_format_get_blocksize(att->format) * att->samples;
663       att->gmem_offset = -1;
664 
665       attachment_set_ops(att,
666                          pCreateInfo->pAttachments[i].loadOp,
667                          pCreateInfo->pAttachments[i].stencilLoadOp,
668                          pCreateInfo->pAttachments[i].storeOp,
669                          pCreateInfo->pAttachments[i].stencilStoreOp);
670    }
671    uint32_t subpass_attachment_count = 0;
672    struct tu_subpass_attachment *p;
673    for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
674       const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
675       const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
676          vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
677 
678       subpass_attachment_count +=
679          desc->inputAttachmentCount + desc->colorAttachmentCount +
680          (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
681          (is_depth_stencil_resolve_enabled(ds_resolve) ? 1 : 0);
682    }
683 
684    if (subpass_attachment_count) {
685       pass->subpass_attachments = vk_alloc2(
686          &device->vk.alloc, pAllocator,
687          subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8,
688          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
689       if (pass->subpass_attachments == NULL) {
690          vk_object_free(&device->vk, pAllocator, pass);
691          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
692       }
693    } else
694       pass->subpass_attachments = NULL;
695 
696    p = pass->subpass_attachments;
697    for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
698       const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
699       const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
700          vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
701       struct tu_subpass *subpass = &pass->subpasses[i];
702 
703       subpass->input_count = desc->inputAttachmentCount;
704       subpass->color_count = desc->colorAttachmentCount;
705       subpass->resolve_count = 0;
706       subpass->resolve_depth_stencil = is_depth_stencil_resolve_enabled(ds_resolve);
707       subpass->samples = 0;
708       subpass->srgb_cntl = 0;
709 
710       subpass->multiview_mask = desc->viewMask;
711 
712       if (desc->inputAttachmentCount > 0) {
713          subpass->input_attachments = p;
714          p += desc->inputAttachmentCount;
715 
716          for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
717             uint32_t a = desc->pInputAttachments[j].attachment;
718             subpass->input_attachments[j].attachment = a;
719             /* Note: attachments only used as input attachments will be read
720              * directly instead of through gmem, so we don't mark input
721              * attachments as needing gmem.
722              */
723          }
724       }
725 
726       if (desc->colorAttachmentCount > 0) {
727          subpass->color_attachments = p;
728          p += desc->colorAttachmentCount;
729 
730          for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
731             uint32_t a = desc->pColorAttachments[j].attachment;
732             subpass->color_attachments[j].attachment = a;
733 
734             if (a != VK_ATTACHMENT_UNUSED) {
735                pass->attachments[a].gmem_offset = 0;
736                update_samples(subpass, pCreateInfo->pAttachments[a].samples);
737 
738                if (vk_format_is_srgb(pass->attachments[a].format))
739                   subpass->srgb_cntl |= 1 << j;
740 
741                pass->attachments[a].clear_views |= subpass->multiview_mask;
742             }
743          }
744       }
745 
746       subpass->resolve_attachments = (desc->pResolveAttachments || subpass->resolve_depth_stencil) ? p : NULL;
747       if (desc->pResolveAttachments) {
748          p += desc->colorAttachmentCount;
749          subpass->resolve_count += desc->colorAttachmentCount;
750          for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
751             subpass->resolve_attachments[j].attachment =
752                   desc->pResolveAttachments[j].attachment;
753          }
754       }
755 
756       if (subpass->resolve_depth_stencil) {
757          p++;
758          subpass->resolve_count++;
759          uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
760          subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a;
761       }
762 
763       uint32_t a = desc->pDepthStencilAttachment ?
764          desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED;
765       subpass->depth_stencil_attachment.attachment = a;
766       if (a != VK_ATTACHMENT_UNUSED) {
767             pass->attachments[a].gmem_offset = 0;
768             update_samples(subpass, pCreateInfo->pAttachments[a].samples);
769 
770             pass->attachments[a].clear_views |= subpass->multiview_mask;
771       }
772    }
773 
774    tu_render_pass_patch_input_gmem(pass);
775 
776    tu_render_pass_check_feedback_loop(pass);
777 
778    /* disable unused attachments */
779    for (uint32_t i = 0; i < pass->attachment_count; i++) {
780       struct tu_render_pass_attachment *att = &pass->attachments[i];
781       if (att->gmem_offset < 0) {
782          att->clear_mask = 0;
783          att->load = false;
784       }
785    }
786 
787    /* From the VK_KHR_multiview spec:
788     *
789     *    Multiview is all-or-nothing for a render pass - that is, either all
790     *    subpasses must have a non-zero view mask (though some subpasses may
791     *    have only one view) or all must be zero.
792     *
793     * This means we only have to check one of the view masks.
794     */
795    if (pCreateInfo->pSubpasses[0].viewMask) {
796       /* It seems multiview must use sysmem rendering. */
797       pass->gmem_pixels = 0;
798    } else {
799       tu_render_pass_gmem_config(pass, device->physical_device);
800    }
801 
802    for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
803       tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
804    }
805 
806    tu_render_pass_add_implicit_deps(pass, pCreateInfo);
807 
808    *pRenderPass = tu_render_pass_to_handle(pass);
809 
810    return VK_SUCCESS;
811 }
812 
813 VKAPI_ATTR void VKAPI_CALL
tu_DestroyRenderPass(VkDevice _device,VkRenderPass _pass,const VkAllocationCallbacks * pAllocator)814 tu_DestroyRenderPass(VkDevice _device,
815                      VkRenderPass _pass,
816                      const VkAllocationCallbacks *pAllocator)
817 {
818    TU_FROM_HANDLE(tu_device, device, _device);
819    TU_FROM_HANDLE(tu_render_pass, pass, _pass);
820 
821    if (!_pass)
822       return;
823 
824    vk_free2(&device->vk.alloc, pAllocator, pass->subpass_attachments);
825    vk_object_free(&device->vk, pAllocator, pass);
826 }
827 
828 VKAPI_ATTR void VKAPI_CALL
tu_GetRenderAreaGranularity(VkDevice _device,VkRenderPass renderPass,VkExtent2D * pGranularity)829 tu_GetRenderAreaGranularity(VkDevice _device,
830                             VkRenderPass renderPass,
831                             VkExtent2D *pGranularity)
832 {
833    TU_FROM_HANDLE(tu_device, device, _device);
834    pGranularity->width = device->physical_device->info->gmem_align_w;
835    pGranularity->height = device->physical_device->info->gmem_align_h;
836 }
837 
838 uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass * subpass,uint32_t index)839 tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index)
840 {
841    if (subpass->resolve_depth_stencil &&
842        index == (subpass->resolve_count - 1))
843       return subpass->depth_stencil_attachment.attachment;
844 
845    return subpass->color_attachments[index].attachment;
846 }
847