/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include "vk_util.h"
#include "vk_format.h"

/* Return true if we have to fall back to sysmem rendering because the
 * dependency can't be satisfied with tiled rendering.
 */

static bool
dep_invalid_for_gmem(const VkSubpassDependency2 *dep)
{
   /* External dependencies don't matter here. */
   if (dep->srcSubpass == VK_SUBPASS_EXTERNAL ||
       dep->dstSubpass == VK_SUBPASS_EXTERNAL)
      return false;

   /* We can conceptually break down the process of rewriting a sysmem
    * renderpass into a gmem one into two parts:
    *
    * 1. Split each draw and multisample resolve into N copies, one for each
    * bin. (If hardware binning, add one more copy where the FS is disabled
    * for the binning pass). This is always allowed because the vertex stage
    * is allowed to run an arbitrary number of times and there are no extra
    * ordering constraints within a draw.
    * 2. Take the last copy of the second-to-last draw and slide it down to
    * before the last copy of the last draw. Repeat for each earlier draw
    * until the draw pass for the last bin is complete, then repeat for each
    * earlier bin until we finish with the first bin.
    *
    * During this rearranging process, we can't slide draws past each other
    * in a way that breaks the subpass dependencies. For each draw, we must
    * slide it past (copies of) the rest of the draws in the renderpass. We
    * can slide a draw past another if there isn't a dependency between them,
    * or if the dependency is only between framebuffer-space stages and has
    * the BY_REGION bit set. Note that this includes self-dependencies, since
    * these may result in pipeline barriers that also break the rearranging
    * process.
    */


   /* This is straight from the Vulkan 1.2 spec, section 6.1.4 "Framebuffer
    * Region Dependencies":
    */
   const VkPipelineStageFlags framebuffer_space_stages =
      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

   return
      (dep->srcStageMask & ~framebuffer_space_stages) ||
      (dep->dstStageMask & ~framebuffer_space_stages) ||
      !(dep->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT);
}
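
/* Illustrative example (not from the source tree): a dependency between two
 * framebuffer-space stages that lacks BY_REGION still forces the sysmem
 * fallback, because without BY_REGION it is framebuffer-global:
 *
 *    const VkSubpassDependency2 dep = {
 *       .sType = VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2,
 *       .srcSubpass = 0,
 *       .dstSubpass = 1,
 *       .srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
 *       .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,
 *       .dependencyFlags = 0,
 *    };
 *
 * dep_invalid_for_gmem() returns true here purely because of the missing
 * VK_DEPENDENCY_BY_REGION_BIT; with it set, gmem rendering would remain
 * allowed.
 */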

static void
tu_render_pass_add_subpass_dep(struct tu_render_pass *pass,
                               const VkSubpassDependency2 *dep)
{
   uint32_t src = dep->srcSubpass;
   uint32_t dst = dep->dstSubpass;

   /* Ignore subpass self-dependencies as they allow the app to call
    * vkCmdPipelineBarrier() inside the render pass and the driver should only
    * do the barrier when called, not when starting the render pass.
    *
    * We cannot decide whether to allow gmem rendering before a barrier
    * is actually emitted, so we delay the decision until then.
    */
   if (src == dst)
      return;

   if (dep_invalid_for_gmem(dep))
      pass->gmem_pixels = 0;

   struct tu_subpass_barrier *dst_barrier;
   if (dst == VK_SUBPASS_EXTERNAL) {
      dst_barrier = &pass->end_barrier;
   } else {
      dst_barrier = &pass->subpasses[dst].start_barrier;
   }

   dst_barrier->src_stage_mask |= dep->srcStageMask;
   dst_barrier->dst_stage_mask |= dep->dstStageMask;
   dst_barrier->src_access_mask |= dep->srcAccessMask;
   dst_barrier->dst_access_mask |= dep->dstAccessMask;
}

/* We currently only care about undefined layouts, because we have to
 * flush/invalidate CCU for those. PREINITIALIZED is the same thing as
 * UNDEFINED for anything not linear tiled, but we don't know yet whether the
 * images used are tiled, so just assume they are.
 */

static bool
layout_undefined(VkImageLayout layout)
{
   return layout == VK_IMAGE_LAYOUT_UNDEFINED ||
          layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
}

/* This implements the following bit of spec text:
 *
 *    If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the
 *    first subpass that uses an attachment, then an implicit subpass
 *    dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is
 *    used in. The implicit subpass dependency only exists if there
 *    exists an automatic layout transition away from initialLayout.
 *    The subpass dependency operates as if defined with the
 *    following parameters:
 *
 *    VkSubpassDependency implicitDependency = {
 *       .srcSubpass = VK_SUBPASS_EXTERNAL;
 *       .dstSubpass = firstSubpass; // First subpass attachment is used in
 *       .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 *       .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
 *       .srcAccessMask = 0;
 *       .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
 *                        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
 *       .dependencyFlags = 0;
 *    };
 *
 *    Similarly, if there is no subpass dependency from the last subpass
 *    that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit
 *    subpass dependency exists from the last subpass it is used in to
 *    VK_SUBPASS_EXTERNAL. The implicit subpass dependency only exists
 *    if there exists an automatic layout transition into finalLayout.
 *    The subpass dependency operates as if defined with the following
 *    parameters:
 *
 *    VkSubpassDependency implicitDependency = {
 *       .srcSubpass = lastSubpass; // Last subpass attachment is used in
 *       .dstSubpass = VK_SUBPASS_EXTERNAL;
 *       .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
 *       .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
 *       .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
 *                        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
 *                        VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
 *       .dstAccessMask = 0;
 *       .dependencyFlags = 0;
 *    };
 *
 * Note: currently this is the only use we have for layout transitions,
 * besides needing to invalidate CCU at the beginning, so we also flag
 * transitions from UNDEFINED here.
 */
static void
tu_render_pass_add_implicit_deps(struct tu_render_pass *pass,
                                 const VkRenderPassCreateInfo2 *info)
{
   const VkAttachmentDescription2* att = info->pAttachments;
   bool has_external_src[info->subpassCount];
   bool has_external_dst[info->subpassCount];
   bool att_used[pass->attachment_count];

   memset(has_external_src, 0, sizeof(has_external_src));
   memset(has_external_dst, 0, sizeof(has_external_dst));

   for (uint32_t i = 0; i < info->dependencyCount; i++) {
      uint32_t src = info->pDependencies[i].srcSubpass;
      uint32_t dst = info->pDependencies[i].dstSubpass;

      if (src == dst)
         continue;

      if (src == VK_SUBPASS_EXTERNAL)
         has_external_src[dst] = true;
      if (dst == VK_SUBPASS_EXTERNAL)
         has_external_dst[src] = true;
   }

   memset(att_used, 0, sizeof(att_used));

   for (unsigned i = 0; i < info->subpassCount; i++) {
      const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
      bool src_implicit_dep = false;

      for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
         uint32_t a = subpass->pInputAttachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         if (att[a].initialLayout != subpass->pInputAttachments[j].layout &&
             !att_used[a] && !has_external_src[i])
            src_implicit_dep = true;
         att_used[a] = true;
      }

      for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
         uint32_t a = subpass->pColorAttachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         if (att[a].initialLayout != subpass->pColorAttachments[j].layout &&
             !att_used[a] && !has_external_src[i])
            src_implicit_dep = true;
         att_used[a] = true;
      }

      if (subpass->pDepthStencilAttachment &&
          subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
         uint32_t a = subpass->pDepthStencilAttachment->attachment;
         if (att[a].initialLayout != subpass->pDepthStencilAttachment->layout &&
             !att_used[a] && !has_external_src[i])
            src_implicit_dep = true;
         att_used[a] = true;
      }

      if (subpass->pResolveAttachments) {
         for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
            uint32_t a = subpass->pResolveAttachments[j].attachment;
            if (a == VK_ATTACHMENT_UNUSED)
               continue;
            if (att[a].initialLayout != subpass->pResolveAttachments[j].layout &&
                !att_used[a] && !has_external_src[i])
               src_implicit_dep = true;
            att_used[a] = true;
         }
      }

      const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
         vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);

      if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
          ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
         uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
         if (att[a].initialLayout != subpass->pDepthStencilAttachment->layout &&
             !att_used[a] && !has_external_src[i])
            src_implicit_dep = true;
         att_used[a] = true;
      }

      if (src_implicit_dep) {
         tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) {
            .srcSubpass = VK_SUBPASS_EXTERNAL,
            .dstSubpass = i,
            .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
            .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
            .srcAccessMask = 0,
            .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
                             VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
                             VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                             VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
                             VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            .dependencyFlags = 0,
         });
      }
   }

   memset(att_used, 0, sizeof(att_used));

   for (int i = info->subpassCount - 1; i >= 0; i--) {
      const VkSubpassDescription2 *subpass = &info->pSubpasses[i];
      bool dst_implicit_dep = false;

      for (unsigned j = 0; j < subpass->inputAttachmentCount; j++) {
         uint32_t a = subpass->pInputAttachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         if (att[a].finalLayout != subpass->pInputAttachments[j].layout &&
             !att_used[a] && !has_external_dst[i])
            dst_implicit_dep = true;
         att_used[a] = true;
      }

      for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
         uint32_t a = subpass->pColorAttachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         if (att[a].finalLayout != subpass->pColorAttachments[j].layout &&
             !att_used[a] && !has_external_dst[i])
            dst_implicit_dep = true;
         att_used[a] = true;
      }

      if (subpass->pDepthStencilAttachment &&
          subpass->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
         uint32_t a = subpass->pDepthStencilAttachment->attachment;
         if (att[a].finalLayout != subpass->pDepthStencilAttachment->layout &&
             !att_used[a] && !has_external_dst[i])
            dst_implicit_dep = true;
         att_used[a] = true;
      }

      if (subpass->pResolveAttachments) {
         for (unsigned j = 0; j < subpass->colorAttachmentCount; j++) {
            uint32_t a = subpass->pResolveAttachments[j].attachment;
            if (a == VK_ATTACHMENT_UNUSED)
               continue;
            if (att[a].finalLayout != subpass->pResolveAttachments[j].layout &&
                !att_used[a] && !has_external_dst[i])
               dst_implicit_dep = true;
            att_used[a] = true;
         }
      }

      const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
         vk_find_struct_const(subpass->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);

      if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment &&
          ds_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
         uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
         if (att[a].finalLayout != subpass->pDepthStencilAttachment->layout &&
             !att_used[a] && !has_external_dst[i])
            dst_implicit_dep = true;
         att_used[a] = true;
      }

      if (dst_implicit_dep) {
         tu_render_pass_add_subpass_dep(pass, &(VkSubpassDependency2KHR) {
            .srcSubpass = i,
            .dstSubpass = VK_SUBPASS_EXTERNAL,
            .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
            .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
            .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
                             VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
                             VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                             VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
                             VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
            .dstAccessMask = 0,
            .dependencyFlags = 0,
         });
      }
   }

   /* Handle UNDEFINED transitions, similar to the handling in tu_barrier().
    * Assume that if an attachment has an initial layout of UNDEFINED, it gets
    * transitioned eventually.
    */
   for (unsigned i = 0; i < info->attachmentCount; i++) {
      if (layout_undefined(att[i].initialLayout)) {
         if (vk_format_is_depth_or_stencil(att[i].format)) {
            pass->subpasses[0].start_barrier.incoherent_ccu_depth = true;
         } else {
            pass->subpasses[0].start_barrier.incoherent_ccu_color = true;
         }
      }
   }
}

/* If an input attachment is used without an intervening write to the same
 * attachment, then we can just use the original image, even in GMEM mode.
 * This is an optimization, but it's also important because it allows us to
 * avoid having to invalidate UCHE at the beginning of each tile due to it
 * becoming invalid. The only reads of GMEM via UCHE should be after an
 * earlier subpass modified it, which only works if there's already an
 * appropriate dependency that will add the CACHE_INVALIDATE anyway. We
 * don't consider this in the dependency code, so this is also required for
 * correctness.
 */
static void
tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
{
   bool written[pass->attachment_count];

   memset(written, 0, sizeof(written));

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      struct tu_subpass *subpass = &pass->subpasses[i];

      for (unsigned j = 0; j < subpass->input_count; j++) {
         uint32_t a = subpass->input_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         subpass->input_attachments[j].patch_input_gmem = written[a];
      }

      for (unsigned j = 0; j < subpass->color_count; j++) {
         uint32_t a = subpass->color_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         written[a] = true;

         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment == a &&
                !subpass->input_attachments[k].patch_input_gmem) {
               /* For render feedback loops, we have no idea whether the use
                * as a color attachment or input attachment will come first,
                * so we have to always use GMEM in case the color attachment
                * comes first and defensively invalidate UCHE in case the
                * input attachment comes first.
                */
               subpass->feedback_invalidate = true;
               subpass->input_attachments[k].patch_input_gmem = true;
            }
         }
      }

      for (unsigned j = 0; j < subpass->resolve_count; j++) {
         uint32_t a = subpass->resolve_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         written[a] = true;
      }

      if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
         written[subpass->depth_stencil_attachment.attachment] = true;
         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment ==
                subpass->depth_stencil_attachment.attachment &&
                !subpass->input_attachments[k].patch_input_gmem) {
               subpass->feedback_invalidate = true;
               subpass->input_attachments[k].patch_input_gmem = true;
            }
         }
      }
   }
}
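
/* A sketch of the effect, assuming a hypothetical two-subpass pass: if
 * subpass 0 writes color attachment 0 and subpass 1 reads attachments 0 and
 * 1 as inputs, then the input use of attachment 0 gets
 * patch_input_gmem = true (read from GMEM), while attachment 1, never
 * written in the pass, is read directly from the original image. If instead
 * a single subpass binds attachment 0 as both color and input with no
 * earlier write, feedback_invalidate is set and the input use is patched to
 * GMEM defensively.
 */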

static void
tu_render_pass_check_feedback_loop(struct tu_render_pass *pass)
{
   for (unsigned i = 0; i < pass->subpass_count; i++) {
      struct tu_subpass *subpass = &pass->subpasses[i];

      for (unsigned j = 0; j < subpass->color_count; j++) {
         uint32_t a = subpass->color_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment == a) {
               subpass->feedback = true;
               break;
            }
         }
      }

      if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
         for (unsigned k = 0; k < subpass->input_count; k++) {
            if (subpass->input_attachments[k].attachment ==
                subpass->depth_stencil_attachment.attachment) {
               subpass->feedback = true;
               break;
            }
         }
      }
   }
}

static void update_samples(struct tu_subpass *subpass,
                           VkSampleCountFlagBits samples)
{
   assert(subpass->samples == 0 || subpass->samples == samples);
   subpass->samples = samples;
}

static void
tu_render_pass_gmem_config(struct tu_render_pass *pass,
                           const struct tu_physical_device *phys_dev)
{
   uint32_t block_align_shift = 3; /* log2(gmem_align/(tile_align_w*tile_align_h)) */
   uint32_t tile_align_w = phys_dev->info->tile_align_w;
   uint32_t gmem_align = (1 << block_align_shift) * tile_align_w * phys_dev->info->tile_align_h;
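
   /* Worked example with illustrative (not hardware-queried) values: for
    * tile_align_w = 64 and tile_align_h = 32, gmem_align = (1 << 3) * 64 * 32
    * = 16384 bytes, i.e. GMEM is carved up into 16 KB blocks, each covering
    * 8 tiles' worth of one byte per pixel.
    */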

   /* calculate total bytes per pixel */
   uint32_t cpp_total = 0;
   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];
      bool cpp1 = (att->cpp == 1);
      if (att->gmem_offset >= 0) {
         cpp_total += att->cpp;

         /* take into account the separate stencil: */
         if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
            cpp1 = (att->samples == 1);
            cpp_total += att->samples;
         }

         /* texture pitch must be aligned to 64, so use a tile_align_w that
          * is a multiple of 64 for cpp==1 attachments to work as input
          * attachments
          */
         if (cpp1 && tile_align_w % 64 != 0) {
            tile_align_w *= 2;
            block_align_shift -= 1;
         }
      }
   }

   pass->tile_align_w = tile_align_w;

   /* no gmem attachments */
   if (cpp_total == 0) {
      /* any non-zero value so tiling config works with no attachments */
      pass->gmem_pixels = 1024*1024;
      return;
   }

   /* TODO: using ccu_offset_gmem so that BLIT_OP_SCALE resolve path
    * doesn't break things. maybe there is a better solution?
    * TODO: this algorithm isn't optimal
    * for example, two attachments with cpp = {1, 4}
    * result: nblocks = {12, 52}, pixels = 196608
    * optimal: nblocks = {13, 51}, pixels = 208896
    */
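
   /* Spelling out the example above under assumed values gmem_blocks = 64
    * and gmem_align = 16384 (so cpp_total = 1 + 4 = 5):
    *   att0 (cpp=1): nblocks = 64 * 1 / 5 = 12, pixels = 12 * 16384 / 1 = 196608
    *   att1 (cpp=4): nblocks = 52 * 4 / 4 = 52, pixels = 52 * 16384 / 4 = 212992
    * giving gmem_pixels = MIN2(196608, 212992) = 196608, whereas the split
    * {13, 51} would give MIN2(212992, 208896) = 208896.
    */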
   uint32_t gmem_blocks = phys_dev->ccu_offset_gmem / gmem_align;
   uint32_t offset = 0, pixels = ~0u, i;
   for (i = 0; i < pass->attachment_count; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];
      if (att->gmem_offset < 0)
         continue;

      att->gmem_offset = offset;

      uint32_t align = MAX2(1, att->cpp >> block_align_shift);
      uint32_t nblocks = MAX2((gmem_blocks * att->cpp / cpp_total) & ~(align - 1), align);

      if (nblocks > gmem_blocks)
         break;

      gmem_blocks -= nblocks;
      cpp_total -= att->cpp;
      offset += nblocks * gmem_align;
      pixels = MIN2(pixels, nblocks * gmem_align / att->cpp);

      /* repeat the same for separate stencil */
      if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         att->gmem_offset_stencil = offset;

         /* note: for s8_uint, block align is always 1 */
         uint32_t nblocks = gmem_blocks * att->samples / cpp_total;
         if (nblocks > gmem_blocks)
            break;

         gmem_blocks -= nblocks;
         cpp_total -= att->samples;
         offset += nblocks * gmem_align;
         pixels = MIN2(pixels, nblocks * gmem_align / att->samples);
      }
   }

   /* only set gmem_pixels if the loop completed; if it broke out early, the
    * attachments don't fit and the gmem config is impossible
    */
   if (i == pass->attachment_count)
      pass->gmem_pixels = pixels;
}

static void
attachment_set_ops(struct tu_render_pass_attachment *att,
                   VkAttachmentLoadOp load_op,
                   VkAttachmentLoadOp stencil_load_op,
                   VkAttachmentStoreOp store_op,
                   VkAttachmentStoreOp stencil_store_op)
{
   /* load/store ops */
   att->clear_mask =
      (load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
   att->load = (load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
   att->store = (store_op == VK_ATTACHMENT_STORE_OP_STORE);

   bool stencil_clear = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR);
   bool stencil_load = (stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD);
   bool stencil_store = (stencil_store_op == VK_ATTACHMENT_STORE_OP_STORE);

   switch (att->format) {
   case VK_FORMAT_D24_UNORM_S8_UINT: /* || stencil load/store */
      if (att->clear_mask)
         att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
      if (stencil_clear)
         att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      if (stencil_load)
         att->load = true;
      if (stencil_store)
         att->store = true;
      break;
   case VK_FORMAT_S8_UINT: /* replace load/store with stencil load/store */
      att->clear_mask = stencil_clear ? VK_IMAGE_ASPECT_COLOR_BIT : 0;
      att->load = stencil_load;
      att->store = stencil_store;
      break;
   case VK_FORMAT_D32_SFLOAT_S8_UINT: /* separate stencil */
      if (att->clear_mask)
         att->clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
      if (stencil_clear)
         att->clear_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      if (stencil_load)
         att->load_stencil = true;
      if (stencil_store)
         att->store_stencil = true;
      break;
   default:
      break;
   }
}
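
/* Illustrative example: a VK_FORMAT_D32_SFLOAT_S8_UINT attachment created
 * with loadOp = CLEAR and stencilLoadOp = LOAD ends up with
 * clear_mask = VK_IMAGE_ASPECT_DEPTH_BIT and load_stencil = true: the depth
 * plane is cleared while the separate stencil plane is loaded.
 */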

static bool
is_depth_stencil_resolve_enabled(const VkSubpassDescriptionDepthStencilResolve *depth_stencil_resolve)
{
   if (depth_stencil_resolve &&
       depth_stencil_resolve->pDepthStencilResolveAttachment &&
       depth_stencil_resolve->pDepthStencilResolveAttachment->attachment != VK_ATTACHMENT_UNUSED) {
      return true;
   }
   return false;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateRenderPass2(VkDevice _device,
                     const VkRenderPassCreateInfo2KHR *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkRenderPass *pRenderPass)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   struct tu_render_pass *pass;
   size_t size;
   size_t attachments_offset;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);

   size = sizeof(*pass);
   size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
   attachments_offset = size;
   size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);

   pass = vk_object_zalloc(&device->vk, pAllocator, size,
                           VK_OBJECT_TYPE_RENDER_PASS);
   if (pass == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pass->attachment_count = pCreateInfo->attachmentCount;
   pass->subpass_count = pCreateInfo->subpassCount;
   pass->attachments = (void *) pass + attachments_offset;

   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];

      att->format = pCreateInfo->pAttachments[i].format;
      att->samples = pCreateInfo->pAttachments[i].samples;
      /* for d32s8, cpp is for the depth image, and
       * att->samples will be used as the cpp for the stencil image
       */
      if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT)
         att->cpp = 4 * att->samples;
      else
         att->cpp = vk_format_get_blocksize(att->format) * att->samples;
      att->gmem_offset = -1;
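
      /* e.g. (illustrative) a 4x MSAA D32_SFLOAT_S8_UINT attachment gets
       * cpp = 4 * 4 = 16 for the depth image, while the separate stencil
       * image uses att->samples = 4 bytes per pixel (1 byte of stencil
       * times 4 samples).
       */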

      attachment_set_ops(att,
                         pCreateInfo->pAttachments[i].loadOp,
                         pCreateInfo->pAttachments[i].stencilLoadOp,
                         pCreateInfo->pAttachments[i].storeOp,
                         pCreateInfo->pAttachments[i].stencilStoreOp);
   }
   uint32_t subpass_attachment_count = 0;
   struct tu_subpass_attachment *p;
   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
      const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
      const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
         vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);

      subpass_attachment_count +=
         desc->inputAttachmentCount + desc->colorAttachmentCount +
         (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
         (is_depth_stencil_resolve_enabled(ds_resolve) ? 1 : 0);
   }

   if (subpass_attachment_count) {
      pass->subpass_attachments = vk_alloc2(
         &device->vk.alloc, pAllocator,
         subpass_attachment_count * sizeof(struct tu_subpass_attachment), 8,
         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (pass->subpass_attachments == NULL) {
         vk_object_free(&device->vk, pAllocator, pass);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   } else
      pass->subpass_attachments = NULL;

   p = pass->subpass_attachments;
   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
      const VkSubpassDescription2 *desc = &pCreateInfo->pSubpasses[i];
      const VkSubpassDescriptionDepthStencilResolve *ds_resolve =
         vk_find_struct_const(desc->pNext, SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
      struct tu_subpass *subpass = &pass->subpasses[i];

      subpass->input_count = desc->inputAttachmentCount;
      subpass->color_count = desc->colorAttachmentCount;
      subpass->resolve_count = 0;
      subpass->resolve_depth_stencil = is_depth_stencil_resolve_enabled(ds_resolve);
      subpass->samples = 0;
      subpass->srgb_cntl = 0;

      subpass->multiview_mask = desc->viewMask;

      if (desc->inputAttachmentCount > 0) {
         subpass->input_attachments = p;
         p += desc->inputAttachmentCount;

         for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
            uint32_t a = desc->pInputAttachments[j].attachment;
            subpass->input_attachments[j].attachment = a;
            /* Note: attachments only used as input attachments will be read
             * directly instead of through gmem, so we don't mark input
             * attachments as needing gmem.
             */
         }
      }

      if (desc->colorAttachmentCount > 0) {
         subpass->color_attachments = p;
         p += desc->colorAttachmentCount;

         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
            uint32_t a = desc->pColorAttachments[j].attachment;
            subpass->color_attachments[j].attachment = a;

            if (a != VK_ATTACHMENT_UNUSED) {
               pass->attachments[a].gmem_offset = 0;
               update_samples(subpass, pCreateInfo->pAttachments[a].samples);

               if (vk_format_is_srgb(pass->attachments[a].format))
                  subpass->srgb_cntl |= 1 << j;

               pass->attachments[a].clear_views |= subpass->multiview_mask;
            }
         }
      }

      subpass->resolve_attachments = (desc->pResolveAttachments || subpass->resolve_depth_stencil) ? p : NULL;
      if (desc->pResolveAttachments) {
         p += desc->colorAttachmentCount;
         subpass->resolve_count += desc->colorAttachmentCount;
         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
            subpass->resolve_attachments[j].attachment =
               desc->pResolveAttachments[j].attachment;
         }
      }

      if (subpass->resolve_depth_stencil) {
         p++;
         subpass->resolve_count++;
         uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment;
         subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a;
      }

      uint32_t a = desc->pDepthStencilAttachment ?
         desc->pDepthStencilAttachment->attachment : VK_ATTACHMENT_UNUSED;
      subpass->depth_stencil_attachment.attachment = a;
      if (a != VK_ATTACHMENT_UNUSED) {
         pass->attachments[a].gmem_offset = 0;
         update_samples(subpass, pCreateInfo->pAttachments[a].samples);

         pass->attachments[a].clear_views |= subpass->multiview_mask;
      }
   }

   tu_render_pass_patch_input_gmem(pass);

   tu_render_pass_check_feedback_loop(pass);

   /* disable unused attachments */
   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      struct tu_render_pass_attachment *att = &pass->attachments[i];
      if (att->gmem_offset < 0) {
         att->clear_mask = 0;
         att->load = false;
      }
   }

   /* From the VK_KHR_multiview spec:
    *
    *    Multiview is all-or-nothing for a render pass - that is, either all
    *    subpasses must have a non-zero view mask (though some subpasses may
    *    have only one view) or all must be zero.
    *
    * This means we only have to check one of the view masks.
    */
   if (pCreateInfo->pSubpasses[0].viewMask) {
      /* It seems multiview must use sysmem rendering. */
      pass->gmem_pixels = 0;
   } else {
      tu_render_pass_gmem_config(pass, device->physical_device);
   }

   for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
      tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
   }

   tu_render_pass_add_implicit_deps(pass, pCreateInfo);

   *pRenderPass = tu_render_pass_to_handle(pass);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_DestroyRenderPass(VkDevice _device,
                     VkRenderPass _pass,
                     const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_render_pass, pass, _pass);

   if (!_pass)
      return;

   vk_free2(&device->vk.alloc, pAllocator, pass->subpass_attachments);
   vk_object_free(&device->vk, pAllocator, pass);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetRenderAreaGranularity(VkDevice _device,
                            VkRenderPass renderPass,
                            VkExtent2D *pGranularity)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   pGranularity->width = device->physical_device->info->gmem_align_w;
   pGranularity->height = device->physical_device->info->gmem_align_h;
}

uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index)
{
   if (subpass->resolve_depth_stencil &&
       index == (subpass->resolve_count - 1))
      return subpass->depth_stencil_attachment.attachment;

   return subpass->color_attachments[index].attachment;
}