1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33
34 #include "common/intel_l3_config.h"
35 #include "genxml/gen_macros.h"
36 #include "genxml/genX_pack.h"
37 #include "common/intel_guardband.h"
38 #include "compiler/elk/elk_prim.h"
39
40 #include "nir/nir_xfb_info.h"
41
42 #include "ds/intel_tracepoints.h"
43
/* We reserve:
45 * - GPR 14 for secondary command buffer returns
46 * - GPR 15 for conditional rendering
47 */
48 #define MI_BUILDER_NUM_ALLOC_GPRS 14
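/* Hook the generic MI builder up to the anv batch: these macros tell
 * mi_builder.h how to allocate batch dwords and how to resolve an
 * anv_address before the generic implementation is included below.
 */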
49 #define __gen_get_batch_dwords anv_batch_emit_dwords
50 #define __gen_address_offset anv_address_add
51 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
52 #include "common/mi_builder.h"
53
54 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
55 uint32_t pipeline);
56
57 static enum anv_pipe_bits
convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
59 enum anv_pipe_bits bits = 0;
60 bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
61 bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
62 bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
63 bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
64 bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
65 bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
66 bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
67 bits |= (pc->InstructionCacheInvalidateEnable) ? ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
68 bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
69 bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
70 bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
71 return bits;
72 }
73
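/* When the INTEL_DEBUG pipe-control flag is set, print the cache flush and
 * invalidation bits of the PIPE_CONTROL being emitted along with the name of
 * the emitting function.
 */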
74 #define anv_debug_dump_pc(pc) \
75 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
76 fputs("pc: emit PC=( ", stderr); \
77 anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
78 fprintf(stderr, ") reason: %s\n", __func__); \
79 }
80
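/* Returns true if this command buffer was allocated from a queue family that
 * advertises VK_QUEUE_GRAPHICS_BIT.
 */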
81 static bool
is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
83 {
84 struct anv_queue_family *queue_family = cmd_buffer->queue_family;
85 return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
86 }
87
88 void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
90 {
91 struct anv_device *device = cmd_buffer->device;
92 uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
93
94 /* If we are emitting a new state base address we probably need to re-emit
95 * binding tables.
96 */
97 cmd_buffer->state.descriptors_dirty |= ~0;
98
99 /* Emit a render target cache flush.
100 *
101 * This isn't documented anywhere in the PRM. However, it seems to be
102 * necessary prior to changing the surface state base address. Without
103 * this, we get GPU hangs when using multi-level command buffers which
104 * clear depth, reset state base address, and then go render stuff.
105 */
106 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
107 pc.DCFlushEnable = true;
108 pc.RenderTargetCacheFlushEnable = true;
109 pc.CommandStreamerStallEnable = true;
110 anv_debug_dump_pc(pc);
111 }
112
113 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
114 sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
115 sba.GeneralStateMOCS = mocs;
116 sba.GeneralStateBaseAddressModifyEnable = true;
117
118 sba.StatelessDataPortAccessMOCS = mocs;
119
120 sba.SurfaceStateBaseAddress =
121 anv_cmd_buffer_surface_base_address(cmd_buffer);
122 sba.SurfaceStateMOCS = mocs;
123 sba.SurfaceStateBaseAddressModifyEnable = true;
124
125 sba.DynamicStateBaseAddress =
126 (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
127 sba.DynamicStateMOCS = mocs;
128 sba.DynamicStateBaseAddressModifyEnable = true;
129
130 sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
131 sba.IndirectObjectMOCS = mocs;
132 sba.IndirectObjectBaseAddressModifyEnable = true;
133
134 sba.InstructionBaseAddress =
135 (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
136 sba.InstructionMOCS = mocs;
137 sba.InstructionBaseAddressModifyEnable = true;
138
139 # if (GFX_VER >= 8)
/* Broadwell requires that we specify a buffer size for a bunch of
 * these fields. However, since we will be growing the BOs live, we
 * just set them all to the maximum.
143 */
144 sba.GeneralStateBufferSize = 0xfffff;
145 sba.IndirectObjectBufferSize = 0xfffff;
146 if (anv_use_relocations(device->physical)) {
147 sba.DynamicStateBufferSize = 0xfffff;
148 sba.InstructionBufferSize = 0xfffff;
149 } else {
150 /* With softpin, we use fixed addresses so we actually know how big
151 * our base addresses are.
152 */
153 sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
154 sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
155 }
156 sba.GeneralStateBufferSizeModifyEnable = true;
157 sba.IndirectObjectBufferSizeModifyEnable = true;
158 sba.DynamicStateBufferSizeModifyEnable = true;
159 sba.InstructionBuffersizeModifyEnable = true;
160 # else
161 /* On gfx7, we have upper bounds instead. According to the docs,
162 * setting an upper bound of zero means that no bounds checking is
163 * performed so, in theory, we should be able to leave them zero.
164 * However, border color is broken and the GPU bounds-checks anyway.
165 * To avoid this and other potential problems, we may as well set it
166 * for everything.
167 */
168 sba.GeneralStateAccessUpperBound =
169 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
170 sba.GeneralStateAccessUpperBoundModifyEnable = true;
171 sba.DynamicStateAccessUpperBound =
172 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
173 sba.DynamicStateAccessUpperBoundModifyEnable = true;
174 sba.InstructionAccessUpperBound =
175 (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
176 sba.InstructionAccessUpperBoundModifyEnable = true;
177 # endif
178 }
179
180 /* After re-setting the surface state base address, we have to do some
181 * cache flushing so that the sampler engine will pick up the new
182 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
183 * Shared Function > 3D Sampler > State > State Caching (page 96):
184 *
185 * Coherency with system memory in the state cache, like the texture
186 * cache is handled partially by software. It is expected that the
187 * command stream or shader will issue Cache Flush operation or
188 * Cache_Flush sampler message to ensure that the L1 cache remains
189 * coherent with system memory.
190 *
191 * [...]
192 *
193 * Whenever the value of the Dynamic_State_Base_Addr,
194 * Surface_State_Base_Addr are altered, the L1 state cache must be
195 * invalidated to ensure the new surface or sampler state is fetched
196 * from system memory.
197 *
198 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
 * which, according to the PIPE_CONTROL instruction documentation in the
200 * Broadwell PRM:
201 *
202 * Setting this bit is independent of any other bit in this packet.
203 * This bit controls the invalidation of the L1 and L2 state caches
204 * at the top of the pipe i.e. at the parsing time.
205 *
 * Unfortunately, experimentation seems to indicate that state cache
 * invalidation through a PIPE_CONTROL does nothing whatsoever with
 * regard to surface state and binding tables. Instead, it seems that
 * invalidating the texture cache is what is actually needed.
 *
 * XXX: As far as we have been able to determine through
 * experimentation, flushing the texture cache appears to be
 * sufficient. The theory here is that all of the sampling/rendering
 * units cache the binding table in the texture cache. However, we have
 * yet to be able to actually confirm this.
216 *
217 * Wa_14013910100:
218 *
219 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
220 * or program pipe control with Instruction cache invalidate post
221 * STATE_BASE_ADDRESS command"
222 */
223 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
224 pc.TextureCacheInvalidationEnable = true;
225 pc.ConstantCacheInvalidationEnable = true;
226 pc.StateCacheInvalidationEnable = true;
227 anv_debug_dump_pc(pc);
228 }
229 }
230
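/* Track the BO referenced by a surface state: with relocations we record an
 * entry that patches the surface's address field, while with softpin we only
 * need to make sure the BO ends up in the execbuf list.
 */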
231 static void
add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
233 struct anv_state state, struct anv_address addr)
234 {
235 VkResult result;
236
237 if (anv_use_relocations(cmd_buffer->device->physical)) {
238 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
239 result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
240 &cmd_buffer->vk.pool->alloc,
241 state.offset + isl_dev->ss.addr_offset,
242 addr.bo, addr.offset, NULL);
243 } else {
244 result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
245 &cmd_buffer->vk.pool->alloc,
246 addr.bo);
247 }
248
249 if (unlikely(result != VK_SUCCESS))
250 anv_batch_set_error(&cmd_buffer->batch, result);
251 }
252
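/* Like add_surface_reloc(), but also records the auxiliary surface and clear
 * color addresses embedded in the surface state, when they are present.
 */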
253 static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
255 struct anv_surface_state state)
256 {
257 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
258
259 assert(!anv_address_is_null(state.address));
260 add_surface_reloc(cmd_buffer, state.state, state.address);
261
262 if (!anv_address_is_null(state.aux_address)) {
263 VkResult result =
264 anv_reloc_list_add(&cmd_buffer->surface_relocs,
265 &cmd_buffer->vk.pool->alloc,
266 state.state.offset + isl_dev->ss.aux_addr_offset,
267 state.aux_address.bo,
268 state.aux_address.offset,
269 NULL);
270 if (result != VK_SUCCESS)
271 anv_batch_set_error(&cmd_buffer->batch, result);
272 }
273
274 if (!anv_address_is_null(state.clear_address)) {
275 VkResult result =
276 anv_reloc_list_add(&cmd_buffer->surface_relocs,
277 &cmd_buffer->vk.pool->alloc,
278 state.state.offset +
279 isl_dev->ss.clear_color_state_offset,
280 state.clear_address.bo,
281 state.clear_address.offset,
282 NULL);
283 if (result != VK_SUCCESS)
284 anv_batch_set_error(&cmd_buffer->batch, result);
285 }
286 }
287
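/* Returns true if packing the clear color through the view's format and
 * swizzle yields different bits than packing it through the surface format,
 * i.e. a resolve would need a non-trivial format conversion.
 */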
288 static bool
isl_color_value_requires_conversion(union isl_color_value color,
290 const struct isl_surf *surf,
291 const struct isl_view *view)
292 {
293 if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
294 return false;
295
296 uint32_t surf_pack[4] = { 0, 0, 0, 0 };
297 isl_color_value_pack(&color, surf->format, surf_pack);
298
299 uint32_t view_pack[4] = { 0, 0, 0, 0 };
300 union isl_color_value swiz_color =
301 isl_color_value_swizzle_inv(color, view->swizzle);
302 isl_color_value_pack(&swiz_color, view->format, view_pack);
303
304 return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
305 }
306
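/* Decide whether a color attachment clear can be turned into a fast clear:
 * the layout must allow fast clears, the render area must cover the whole
 * first slice, and the clear color must survive any format reinterpretation
 * that a later resolve would perform.
 */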
307 static bool
anv_can_fast_clear_color_view(struct anv_device *device,
309 struct anv_image_view *iview,
310 VkImageLayout layout,
311 union isl_color_value clear_color,
312 uint32_t num_layers,
313 VkRect2D render_area)
314 {
315 if (iview->planes[0].isl.base_array_layer >=
316 anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
317 iview->planes[0].isl.base_level))
318 return false;
319
320 /* Start by getting the fast clear type. We use the first subpass
321 * layout here because we don't want to fast-clear if the first subpass
322 * to use the attachment can't handle fast-clears.
323 */
324 enum anv_fast_clear_type fast_clear_type =
325 anv_layout_to_fast_clear_type(device->info, iview->image,
326 VK_IMAGE_ASPECT_COLOR_BIT,
327 layout);
328 switch (fast_clear_type) {
329 case ANV_FAST_CLEAR_NONE:
330 return false;
331 case ANV_FAST_CLEAR_DEFAULT_VALUE:
332 if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
333 return false;
334 break;
335 case ANV_FAST_CLEAR_ANY:
336 break;
337 }
338
339 /* Potentially, we could do partial fast-clears but doing so has crazy
340 * alignment restrictions. It's easier to just restrict to full size
341 * fast clears for now.
342 */
343 if (render_area.offset.x != 0 ||
344 render_area.offset.y != 0 ||
345 render_area.extent.width != iview->vk.extent.width ||
346 render_area.extent.height != iview->vk.extent.height)
347 return false;
348
349 /* On Broadwell and earlier, we can only handle 0/1 clear colors */
350 if (!isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
351 return false;
352
353 /* If the clear color is one that would require non-trivial format
354 * conversion on resolve, we don't bother with the fast clear. This
355 * shouldn't be common as most clear colors are 0/1 and the most common
356 * format re-interpretation is for sRGB.
357 */
358 if (isl_color_value_requires_conversion(clear_color,
359 &iview->image->planes[0].primary_surface.isl,
360 &iview->planes[0].isl)) {
361 anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
362 "Cannot fast-clear to colors which would require "
363 "format conversion on resolve");
364 return false;
365 }
366
/* We only allow fast clears to the first slice of an image (level 0,
 * layer 0) and only for the entire slice. This guarantees that, at any
 * given time, there is only one clear color on any given image. At the
 * time of our testing (Jan 17, 2018), there were no known applications
 * which would benefit from fast-clearing more than just the first slice.
373 */
374 if (iview->planes[0].isl.base_level > 0 ||
375 iview->planes[0].isl.base_array_layer > 0) {
376 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
377 "Rendering with multi-lod or multi-layer framebuffer "
378 "with LOAD_OP_LOAD and baseMipLevel > 0 or "
379 "baseArrayLayer > 0. Not fast clearing.");
380 return false;
381 }
382
383 if (num_layers > 1) {
384 anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
385 "Rendering to a multi-layer framebuffer with "
386 "LOAD_OP_CLEAR. Only fast-clearing the first slice");
387 }
388
389 return true;
390 }
391
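/* Decide whether a depth/stencil attachment clear can be performed as a HiZ
 * fast clear, given the layout, clear value, and render area.
 */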
392 static bool
anv_can_hiz_clear_ds_view(struct anv_device *device,
394 const struct anv_image_view *iview,
395 VkImageLayout layout,
396 VkImageAspectFlags clear_aspects,
397 float depth_clear_value,
398 VkRect2D render_area)
399 {
400 /* We don't do any HiZ or depth fast-clears on gfx7 yet */
401 if (GFX_VER == 7)
402 return false;
403
404 /* If we're just clearing stencil, we can always HiZ clear */
405 if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
406 return true;
407
408 /* We must have depth in order to have HiZ */
409 if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
410 return false;
411
412 const enum isl_aux_usage clear_aux_usage =
413 anv_layout_to_aux_usage(device->info, iview->image,
414 VK_IMAGE_ASPECT_DEPTH_BIT,
415 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
416 layout);
417 if (!blorp_can_hiz_clear_depth(device->info,
418 &iview->image->planes[0].primary_surface.isl,
419 clear_aux_usage,
420 iview->planes[0].isl.base_level,
421 iview->planes[0].isl.base_array_layer,
422 render_area.offset.x,
423 render_area.offset.y,
424 render_area.offset.x +
425 render_area.extent.width,
426 render_area.offset.y +
427 render_area.extent.height))
428 return false;
429
430 if (depth_clear_value != ANV_HZ_FC_VAL)
431 return false;
432
433 /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
434 * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
435 * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
436 */
437 if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image))
438 return false;
439
440 /* If we got here, then we can fast clear */
441 return true;
442 }
443
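/* Read x through a volatile access so the compiler performs exactly one
 * load, much like the Linux kernel helper of the same name.
 */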
444 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
445
446 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
447 * the initial layout is undefined, the HiZ buffer and depth buffer will
448 * represent the same data at the end of this operation.
449 */
450 static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
452 const struct anv_image *image,
453 uint32_t base_layer, uint32_t layer_count,
454 VkImageLayout initial_layout,
455 VkImageLayout final_layout,
456 bool will_full_fast_clear)
457 {
458 const uint32_t depth_plane =
459 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
460 if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
461 return;
462
463 /* If will_full_fast_clear is set, the caller promises to fast-clear the
464 * largest portion of the specified range as it can. For depth images,
465 * that means the entire image because we don't support multi-LOD HiZ.
466 */
467 assert(image->planes[0].primary_surface.isl.levels == 1);
468 if (will_full_fast_clear)
469 return;
470
471 const enum isl_aux_state initial_state =
472 anv_layout_to_aux_state(cmd_buffer->device->info, image,
473 VK_IMAGE_ASPECT_DEPTH_BIT,
474 initial_layout);
475 const enum isl_aux_state final_state =
476 anv_layout_to_aux_state(cmd_buffer->device->info, image,
477 VK_IMAGE_ASPECT_DEPTH_BIT,
478 final_layout);
479
480 const bool initial_depth_valid =
481 isl_aux_state_has_valid_primary(initial_state);
482 const bool initial_hiz_valid =
483 isl_aux_state_has_valid_aux(initial_state);
484 const bool final_needs_depth =
485 isl_aux_state_has_valid_primary(final_state);
486 const bool final_needs_hiz =
487 isl_aux_state_has_valid_aux(final_state);
488
489 /* Getting into the pass-through state for Depth is tricky and involves
490 * both a resolve and an ambiguate. We don't handle that state right now
491 * as anv_layout_to_aux_state never returns it.
492 */
493 assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
494
495 if (final_needs_depth && !initial_depth_valid) {
496 assert(initial_hiz_valid);
497 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
498 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
499 } else if (final_needs_hiz && !initial_hiz_valid) {
500 assert(initial_depth_valid);
501 anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
502 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
503 }
504 }
505
506 #if GFX_VER == 7
507 static inline bool
vk_image_layout_stencil_write_optimal(VkImageLayout layout)
509 {
510 return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
511 layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
512 layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL ||
513 layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
514 }
515 #endif
516
/* Transitions a stencil buffer from one layout to another. On gfx7, this may
 * require keeping a texturable shadow copy of the stencil buffer up to date
 * with the real stencil data (see the comment in the function body).
520 */
521 static void
transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
523 const struct anv_image *image,
524 uint32_t base_level, uint32_t level_count,
525 uint32_t base_layer, uint32_t layer_count,
526 VkImageLayout initial_layout,
527 VkImageLayout final_layout,
528 bool will_full_fast_clear)
529 {
530 #if GFX_VER == 7
531 const uint32_t plane =
532 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
533
534 /* On gfx7, we have to store a texturable version of the stencil buffer in
535 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
 * forth at strategic points. Stencil writes are only allowed in the
 * following layouts:
538 *
539 * - VK_IMAGE_LAYOUT_GENERAL
540 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
541 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
542 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
543 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
544 * - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
545 *
546 * For general, we have no nice opportunity to transition so we do the copy
547 * to the shadow unconditionally at the end of the subpass. For transfer
548 * destinations, we can update it as part of the transfer op. For the other
549 * layouts, we delay the copy until a transition into some other layout.
550 */
551 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
552 vk_image_layout_stencil_write_optimal(initial_layout) &&
553 !vk_image_layout_stencil_write_optimal(final_layout)) {
554 anv_image_copy_to_shadow(cmd_buffer, image,
555 VK_IMAGE_ASPECT_STENCIL_BIT,
556 base_level, level_count,
557 base_layer, layer_count);
558 }
559 #endif
560 }
561
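/* MMIO offsets of the command streamer's MI_PREDICATE source and result
 * registers, used by the predicated-resolve helpers below.
 */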
562 #define MI_PREDICATE_SRC0 0x2400
563 #define MI_PREDICATE_SRC1 0x2408
564 #define MI_PREDICATE_RESULT 0x2418
565
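/* Record the given fast-clear type in the image's fast-clear state buffer
 * using an MI_STORE_DATA_IMM from the command streamer.
 */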
566 static void
set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
568 const struct anv_image *image,
569 VkImageAspectFlagBits aspect,
570 enum anv_fast_clear_type fast_clear)
571 {
572 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
573 sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
574 image, aspect);
575 sdi.ImmediateData = fast_clear;
576 }
577 }
578
/* This is only really practical on Haswell and above because it requires
580 * MI math in order to get it correct.
581 */
582 #if GFX_VERx10 >= 75
583 static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
585 const struct anv_image *image,
586 VkImageAspectFlagBits aspect,
587 uint32_t level, uint32_t array_layer,
588 enum isl_aux_op resolve_op,
589 enum anv_fast_clear_type fast_clear_supported)
590 {
591 struct mi_builder b;
592 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
593
594 const struct mi_value fast_clear_type =
595 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
596 image, aspect));
597
598 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
599 if (level == 0 && array_layer == 0) {
600 /* In this case, we are doing a partial resolve to get rid of fast-clear
601 * colors. We don't care about the compression state but we do care
602 * about how much fast clear is allowed by the final layout.
603 */
604 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
605 assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
606
607 /* We need to compute (fast_clear_supported < image->fast_clear) */
608 struct mi_value pred =
609 mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
610 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
611
612 /* If the predicate is true, we want to write 0 to the fast clear type
613 * and, if it's false, leave it alone. We can do this by writing
614 *
615 * clear_type = clear_type & ~predicate;
616 */
617 struct mi_value new_fast_clear_type =
618 mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
619 mi_store(&b, fast_clear_type, new_fast_clear_type);
620 } else {
621 /* In this case, we're trying to do a partial resolve on a slice that
622 * doesn't have clear color. There's nothing to do.
623 */
624 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
625 return;
626 }
627
628 /* Set src1 to 0 and use a != condition */
629 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
630
631 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
632 mip.LoadOperation = LOAD_LOADINV;
633 mip.CombineOperation = COMBINE_SET;
634 mip.CompareOperation = COMPARE_SRCS_EQUAL;
635 }
636 }
637 #endif /* GFX_VERx10 >= 75 */
638
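/* Set up MI_PREDICATE for a partial resolve using only the stored fast-clear
 * type: the resolve is predicated on that value being non-zero, and the
 * stored value is then cleared to zero. No MI math is required, so this also
 * works on Ivybridge and Bay Trail.
 */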
639 static void
anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
641 const struct anv_image *image,
642 VkImageAspectFlagBits aspect,
643 uint32_t level, uint32_t array_layer,
644 enum isl_aux_op resolve_op,
645 enum anv_fast_clear_type fast_clear_supported)
646 {
647 struct mi_builder b;
648 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
649
650 struct mi_value fast_clear_type_mem =
651 mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
652 image, aspect));
653
654 /* This only works for partial resolves and only when the clear color is
655 * all or nothing. On the upside, this emits less command streamer code
656 * and works on Ivybridge and Bay Trail.
657 */
658 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
659 assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
660
661 /* We don't support fast clears on anything other than the first slice. */
662 if (level > 0 || array_layer > 0)
663 return;
664
665 /* On gfx8, we don't have a concept of default clear colors because we
666 * can't sample from CCS surfaces. It's enough to just load the fast clear
667 * state into the predicate register.
668 */
669 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
670 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
671 mi_store(&b, fast_clear_type_mem, mi_imm(0));
672
673 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
674 mip.LoadOperation = LOAD_LOADINV;
675 mip.CombineOperation = COMBINE_SET;
676 mip.CompareOperation = COMPARE_SRCS_EQUAL;
677 }
678 }
679
680 static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
682 const struct anv_image *image,
683 enum isl_format format,
684 struct isl_swizzle swizzle,
685 VkImageAspectFlagBits aspect,
686 uint32_t level, uint32_t array_layer,
687 enum isl_aux_op resolve_op,
688 enum anv_fast_clear_type fast_clear_supported)
689 {
690 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
691
692 anv_cmd_simple_resolve_predicate(cmd_buffer, image,
693 aspect, level, array_layer,
694 resolve_op, fast_clear_supported);
695
696 /* CCS_D only supports full resolves and BLORP will assert on us if we try
697 * to do a partial resolve on a CCS_D surface.
698 */
699 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
700 image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
701 resolve_op = ISL_AUX_OP_FULL_RESOLVE;
702
703 anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
704 level, array_layer, 1, resolve_op, NULL, true);
705 }
706
707 static void
anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
709 const struct anv_image *image,
710 enum isl_format format,
711 struct isl_swizzle swizzle,
712 VkImageAspectFlagBits aspect,
713 uint32_t array_layer,
714 enum isl_aux_op resolve_op,
715 enum anv_fast_clear_type fast_clear_supported)
716 {
717 assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
718 assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
719
720 #if GFX_VERx10 >= 75
721 anv_cmd_compute_resolve_predicate(cmd_buffer, image,
722 aspect, 0, array_layer,
723 resolve_op, fast_clear_supported);
724
725 anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
726 array_layer, 1, resolve_op, NULL, true);
727 #else
728 unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
729 #endif
730 }
731
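/* On the gfx versions covered by this file there is no additional aux-state
 * tracking to update when an image is written, so this currently only
 * sanity-checks the aspect.
 */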
732 void
genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
734 const struct anv_image *image,
735 VkImageAspectFlagBits aspect,
736 enum isl_aux_usage aux_usage,
737 uint32_t level,
738 uint32_t base_layer,
739 uint32_t layer_count)
740 {
741 /* The aspect must be exactly one of the image aspects. */
742 assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
743 }
744
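/* Reset an image's fast-clear tracking to ANV_FAST_CLEAR_NONE and write an
 * initial clear-color dword that satisfies the hardware's requirements for
 * the other fields sharing that dword.
 */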
745 static void
init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
747 const struct anv_image *image,
748 VkImageAspectFlagBits aspect)
749 {
750 assert(cmd_buffer && image);
751 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
752
753 set_image_fast_clear_state(cmd_buffer, image, aspect,
754 ANV_FAST_CLEAR_NONE);
755
756 /* Initialize the struct fields that are accessed for fast-clears so that
757 * the HW restrictions on the field values are satisfied.
758 */
759 struct anv_address addr =
760 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
761
762 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
763 sdi.Address = addr;
764 if (GFX_VERx10 >= 75) {
765 /* Pre-SKL, the dword containing the clear values also contains
766 * other fields, so we need to initialize those fields to match the
767 * values that would be in a color attachment.
768 */
769 sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
770 ISL_CHANNEL_SELECT_GREEN << 22 |
771 ISL_CHANNEL_SELECT_BLUE << 19 |
772 ISL_CHANNEL_SELECT_ALPHA << 16;
773 } else if (GFX_VER == 7) {
774 /* On IVB, the dword containing the clear values also contains
775 * other fields that must be zero or can be zero.
776 */
777 sdi.ImmediateData = 0;
778 }
779 }
780 }
781
782 /* Copy the fast-clear value dword(s) between a surface state object and an
783 * image's fast clear state buffer.
784 */
785 static void
genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
787 struct anv_state surface_state,
788 const struct anv_image *image,
789 VkImageAspectFlagBits aspect,
790 bool copy_from_surface_state)
791 {
792 assert(cmd_buffer && image);
793 assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
794
795 struct anv_address ss_clear_addr = {
796 .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
797 .offset = surface_state.offset +
798 cmd_buffer->device->isl_dev.ss.clear_value_offset,
799 };
800 const struct anv_address entry_addr =
801 anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
802 unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
803
804 #if GFX_VER == 7
/* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
806 * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
807 * in-flight when they are issued even if the memory touched is not
808 * currently active for rendering. The weird bit is that it is not the
809 * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
810 * rendering hangs such that the next stalling command after the
811 * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
812 *
813 * It is unclear exactly why this hang occurs. Both MI commands come with
814 * warnings about the 3D pipeline but that doesn't seem to fully explain
815 * it. My (Faith's) best theory is that it has something to do with the
816 * fact that we're using a GPU state register as our temporary and that
817 * something with reading/writing it is causing problems.
818 *
819 * In order to work around this issue, we emit a PIPE_CONTROL with the
820 * command streamer stall bit set.
821 */
822 anv_add_pending_pipe_bits(cmd_buffer,
823 ANV_PIPE_CS_STALL_BIT,
824 "after copy_fast_clear_dwords. Avoid potential hang");
825 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
826 #endif
827
828 struct mi_builder b;
829 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
830
831 if (copy_from_surface_state) {
832 mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
833 } else {
834 mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
835
836 /* Updating a surface state object may require that the state cache be
837 * invalidated. From the SKL PRM, Shared Functions -> State -> State
838 * Caching:
839 *
840 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
841 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
842 * modified [...], the L1 state cache must be invalidated to ensure
843 * the new surface or sampler state is fetched from system memory.
844 *
845 * In testing, SKL doesn't actually seem to need this, but HSW does.
846 */
847 anv_add_pending_pipe_bits(cmd_buffer,
848 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
849 "after copy_fast_clear_dwords surface state update");
850 }
851 }
852
853 /**
854 * @brief Transitions a color buffer from one layout to another.
855 *
856 * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
857 * more information.
858 *
859 * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
860 * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
861 * this represents the maximum layers to transition at each
862 * specified miplevel.
863 */
864 static void
transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
866 const struct anv_image *image,
867 VkImageAspectFlagBits aspect,
868 const uint32_t base_level, uint32_t level_count,
869 uint32_t base_layer, uint32_t layer_count,
870 VkImageLayout initial_layout,
871 VkImageLayout final_layout,
872 uint32_t src_queue_family,
873 uint32_t dst_queue_family,
874 bool will_full_fast_clear)
875 {
876 struct anv_device *device = cmd_buffer->device;
877 const struct intel_device_info *devinfo = device->info;
878 /* Validate the inputs. */
879 assert(cmd_buffer);
880 assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
881 /* These values aren't supported for simplicity's sake. */
882 assert(level_count != VK_REMAINING_MIP_LEVELS &&
883 layer_count != VK_REMAINING_ARRAY_LAYERS);
884 /* Ensure the subresource range is valid. */
885 UNUSED uint64_t last_level_num = base_level + level_count;
886 const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
887 UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
888 assert((uint64_t)base_layer + layer_count <= image_layers);
889 assert(last_level_num <= image->vk.mip_levels);
/* If there is a layout transition, the final layout cannot be undefined or
891 * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
892 */
893 assert(initial_layout == final_layout ||
894 (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
895 final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
896 const struct isl_drm_modifier_info *isl_mod_info =
897 image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
898 ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
899 : NULL;
900
901 const bool src_queue_external =
902 src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
903 src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
904
905 const bool dst_queue_external =
906 dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
907 dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
908
909 /* Simultaneous acquire and release on external queues is illegal. */
910 assert(!src_queue_external || !dst_queue_external);
911
912 /* Ownership transition on an external queue requires special action if the
913 * image has a DRM format modifier because we store image data in
914 * a driver-private bo which is inaccessible to the external queue.
915 */
916 const bool private_binding_acquire =
917 src_queue_external &&
918 anv_image_is_externally_shared(image) &&
919 anv_image_has_private_binding(image);
920
921 const bool private_binding_release =
922 dst_queue_external &&
923 anv_image_is_externally_shared(image) &&
924 anv_image_has_private_binding(image);
925
926 if (initial_layout == final_layout &&
927 !private_binding_acquire && !private_binding_release) {
928 /* No work is needed. */
929 return;
930 }
931
932 const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
933
934 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
935 final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
936 /* This surface is a linear compressed image with a tiled shadow surface
937 * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
938 * we need to ensure the shadow copy is up-to-date.
939 */
940 assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
941 assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
942 assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
943 assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
944 assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
945 assert(plane == 0);
946 anv_image_copy_to_shadow(cmd_buffer, image,
947 VK_IMAGE_ASPECT_COLOR_BIT,
948 base_level, level_count,
949 base_layer, layer_count);
950 }
951
952 if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
953 return;
954
955 assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
956
957 /* The following layouts are equivalent for non-linear images. */
958 const bool initial_layout_undefined =
959 initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
960 initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
961
962 bool must_init_fast_clear_state = false;
963 bool must_init_aux_surface = false;
964
965 if (initial_layout_undefined) {
966 /* The subresource may have been aliased and populated with arbitrary
967 * data.
968 */
969 must_init_fast_clear_state = true;
970 must_init_aux_surface = true;
971 } else if (private_binding_acquire) {
972 /* The fast clear state lives in a driver-private bo, and therefore the
973 * external/foreign queue is unaware of it.
974 *
975 * If this is the first time we are accessing the image, then the fast
976 * clear state is uninitialized.
977 *
978 * If this is NOT the first time we are accessing the image, then the fast
979 * clear state may still be valid and correct due to the resolve during
980 * our most recent ownership release. However, we do not track the aux
981 * state with MI stores, and therefore must assume the worst-case: that
982 * this is the first time we are accessing the image.
983 */
984 assert(image->planes[plane].fast_clear_memory_range.binding ==
985 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
986 must_init_fast_clear_state = true;
987
988 /* The aux surface, like the fast clear state, lives in
989 * a driver-private bo. We must initialize the aux surface for the
990 * same reasons we must initialize the fast clear state.
991 */
992 assert(image->planes[plane].aux_surface.memory_range.binding ==
993 ANV_IMAGE_MEMORY_BINDING_PRIVATE);
994 must_init_aux_surface = true;
995 }
996
997 if (must_init_fast_clear_state) {
998 if (base_level == 0 && base_layer == 0)
999 init_fast_clear_color(cmd_buffer, image, aspect);
1000 }
1001
1002 if (must_init_aux_surface) {
1003 assert(must_init_fast_clear_state);
1004
1005 /* Initialize the aux buffers to enable correct rendering. In order to
1006 * ensure that things such as storage images work correctly, aux buffers
1007 * need to be initialized to valid data.
1008 *
1009 * Having an aux buffer with invalid data is a problem for two reasons:
1010 *
1011 * 1) Having an invalid value in the buffer can confuse the hardware.
1012 * For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1013 * invalid and leads to the hardware doing strange things. It
1014 * doesn't hang as far as we can tell but rendering corruption can
1015 * occur.
1016 *
1017 * 2) If this transition is into the GENERAL layout and we then use the
1018 * image as a storage image, then we must have the aux buffer in the
1019 * pass-through state so that, if we then go to texture from the
1020 * image, we get the results of our storage image writes and not the
1021 * fast clear color or other random data.
1022 *
1023 * For CCS both of the problems above are real demonstrable issues. In
1024 * that case, the only thing we can do is to perform an ambiguate to
1025 * transition the aux surface into the pass-through state.
1026 *
1027 * For MCS, (2) is never an issue because we don't support multisampled
1028 * storage images. In theory, issue (1) is a problem with MCS but we've
 * never seen it in the wild. For 4x and 16x, all bit patterns could, in
1030 * theory, be interpreted as something but we don't know that all bit
1031 * patterns are actually valid. For 2x and 8x, you could easily end up
1032 * with the MCS referring to an invalid plane because not all bits of
1033 * the MCS value are actually used. Even though we've never seen issues
1034 * in the wild, it's best to play it safe and initialize the MCS. We
1035 * can use a fast-clear for MCS because we only ever touch from render
1036 * and texture (no image load store).
1037 */
1038 if (image->vk.samples == 1) {
1039 for (uint32_t l = 0; l < level_count; l++) {
1040 const uint32_t level = base_level + l;
1041
1042 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1043 if (base_layer >= aux_layers)
1044 break; /* We will only get fewer layers as level increases */
1045 uint32_t level_layer_count =
1046 MIN2(layer_count, aux_layers - base_layer);
1047
1048 /* If will_full_fast_clear is set, the caller promises to
1049 * fast-clear the largest portion of the specified range as it can.
1050 * For color images, that means only the first LOD and array slice.
1051 */
1052 if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1053 base_layer++;
1054 level_layer_count--;
1055 if (level_layer_count == 0)
1056 continue;
1057 }
1058
1059 anv_image_ccs_op(cmd_buffer, image,
1060 image->planes[plane].primary_surface.isl.format,
1061 ISL_SWIZZLE_IDENTITY,
1062 aspect, level, base_layer, level_layer_count,
1063 ISL_AUX_OP_AMBIGUATE, NULL, false);
1064 }
1065 } else {
1066 if (image->vk.samples == 4 || image->vk.samples == 16) {
1067 anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1068 "Doing a potentially unnecessary fast-clear to "
1069 "define an MCS buffer.");
1070 }
1071
1072 /* If will_full_fast_clear is set, the caller promises to fast-clear
1073 * the largest portion of the specified range as it can.
1074 */
1075 if (will_full_fast_clear)
1076 return;
1077
1078 assert(base_level == 0 && level_count == 1);
1079 anv_image_mcs_op(cmd_buffer, image,
1080 image->planes[plane].primary_surface.isl.format,
1081 ISL_SWIZZLE_IDENTITY,
1082 aspect, base_layer, layer_count,
1083 ISL_AUX_OP_FAST_CLEAR, NULL, false);
1084 }
1085 return;
1086 }
1087
1088 enum isl_aux_usage initial_aux_usage =
1089 anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1090 enum isl_aux_usage final_aux_usage =
1091 anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1092 enum anv_fast_clear_type initial_fast_clear =
1093 anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1094 enum anv_fast_clear_type final_fast_clear =
1095 anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1096
1097 /* We must override the anv_layout_to_* functions because they are unaware of
1098 * acquire/release direction.
1099 */
1100 if (private_binding_acquire) {
1101 assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1102 initial_aux_usage = ISL_AUX_USAGE_NONE;
1103 initial_fast_clear = ANV_FAST_CLEAR_NONE;
1104 } else if (private_binding_release) {
1105 assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1106 final_aux_usage = ISL_AUX_USAGE_NONE;
1107 final_fast_clear = ANV_FAST_CLEAR_NONE;
1108 }
1109
1110 /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1111 * We can handle transitions between CCS_D/E to and from NONE. What we
1112 * don't yet handle is switching between CCS_E and CCS_D within a given
1113 * image. Doing so in a performant way requires more detailed aux state
1114 * tracking such as what is done in i965. For now, just assume that we
1115 * only have one type of compression.
1116 */
1117 assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1118 final_aux_usage == ISL_AUX_USAGE_NONE ||
1119 initial_aux_usage == final_aux_usage);
1120
1121 /* If initial aux usage is NONE, there is nothing to resolve */
1122 if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1123 return;
1124
1125 enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1126
1127 /* If the initial layout supports more fast clear than the final layout
1128 * then we need at least a partial resolve.
1129 */
1130 if (final_fast_clear < initial_fast_clear)
1131 resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1132
1133 if (resolve_op == ISL_AUX_OP_NONE)
1134 return;
1135
1136 /* Perform a resolve to synchronize data between the main and aux buffer.
1137 * Before we begin, we must satisfy the cache flushing requirement specified
1138 * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1139 *
1140 * Any transition from any value in {Clear, Render, Resolve} to a
1141 * different value in {Clear, Render, Resolve} requires end of pipe
1142 * synchronization.
1143 *
1144 * We perform a flush of the write cache before and after the clear and
1145 * resolve operations to meet this requirement.
1146 *
1147 * Unlike other drawing, fast clear operations are not properly
1148 * synchronized. The first PIPE_CONTROL here likely ensures that the
1149 * contents of the previous render or clear hit the render target before we
1150 * resolve and the second likely ensures that the resolve is complete before
1151 * we do any more rendering or clearing.
1152 */
1153 anv_add_pending_pipe_bits(cmd_buffer,
1154 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1155 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1156 "after transition RT");
1157
1158 for (uint32_t l = 0; l < level_count; l++) {
1159 uint32_t level = base_level + l;
1160
1161 uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1162 if (base_layer >= aux_layers)
1163 break; /* We will only get fewer layers as level increases */
1164 uint32_t level_layer_count =
1165 MIN2(layer_count, aux_layers - base_layer);
1166
1167 for (uint32_t a = 0; a < level_layer_count; a++) {
1168 uint32_t array_layer = base_layer + a;
1169
1170 /* If will_full_fast_clear is set, the caller promises to fast-clear
1171 * the largest portion of the specified range as it can. For color
1172 * images, that means only the first LOD and array slice.
1173 */
1174 if (level == 0 && array_layer == 0 && will_full_fast_clear)
1175 continue;
1176
1177 if (image->vk.samples == 1) {
1178 anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1179 image->planes[plane].primary_surface.isl.format,
1180 ISL_SWIZZLE_IDENTITY,
1181 aspect, level, array_layer, resolve_op,
1182 final_fast_clear);
1183 } else {
1184 /* We only support fast-clear on the first layer so partial
1185 * resolves should not be used on other layers as they will use
1186 * the clear color stored in memory that is only valid for layer0.
1187 */
1188 if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1189 array_layer != 0)
1190 continue;
1191
1192 anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1193 image->planes[plane].primary_surface.isl.format,
1194 ISL_SWIZZLE_IDENTITY,
1195 aspect, array_layer, resolve_op,
1196 final_fast_clear);
1197 }
1198 }
1199 }
1200
1201 anv_add_pending_pipe_bits(cmd_buffer,
1202 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1203 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1204 "after transition RT");
1205 }
1206
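/* Allocate surface-state slots from the command buffer's surface state
 * stream, one for the NULL surface plus one per color attachment, and reset
 * the depth and stencil attachment state.
 */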
1207 static MUST_CHECK VkResult
anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1209 uint32_t color_att_count)
1210 {
1211 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1212
1213 /* Reserve one for the NULL state. */
1214 unsigned num_states = 1 + color_att_count;
1215 const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1216 const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1217 gfx->att_states =
1218 anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1219 num_states * ss_stride, isl_dev->ss.align);
1220 if (gfx->att_states.map == NULL) {
1221 return anv_batch_set_error(&cmd_buffer->batch,
1222 VK_ERROR_OUT_OF_DEVICE_MEMORY);
1223 }
1224
1225 struct anv_state next_state = gfx->att_states;
1226 next_state.alloc_size = isl_dev->ss.size;
1227
1228 gfx->null_surface_state = next_state;
1229 next_state.offset += ss_stride;
1230 next_state.map += ss_stride;
1231
1232 gfx->color_att_count = color_att_count;
1233 for (uint32_t i = 0; i < color_att_count; i++) {
1234 gfx->color_att[i] = (struct anv_attachment) {
1235 .surface_state.state = next_state,
1236 };
1237 next_state.offset += ss_stride;
1238 next_state.map += ss_stride;
1239 }
1240 gfx->depth_att = (struct anv_attachment) { };
1241 gfx->stencil_att = (struct anv_attachment) { };
1242
1243 return VK_SUCCESS;
1244 }
1245
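/* Clear all of the dynamic-rendering state tracked in
 * anv_cmd_graphics_state before recording begins.
 */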
1246 static void
anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1248 {
1249 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1250
1251 gfx->render_area = (VkRect2D) { };
1252 gfx->layer_count = 0;
1253 gfx->samples = 0;
1254
1255 gfx->color_att_count = 0;
1256 gfx->depth_att = (struct anv_attachment) { };
1257 gfx->stencil_att = (struct anv_attachment) { };
1258 gfx->null_surface_state = ANV_STATE_NULL;
1259 }
1260
1261 VkResult
genX(BeginCommandBuffer)(
1263 VkCommandBuffer commandBuffer,
1264 const VkCommandBufferBeginInfo* pBeginInfo)
1265 {
1266 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1267 VkResult result;
1268
1269 /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1270 * command buffer's state. Otherwise, we must *reset* its state. In both
1271 * cases we reset it.
1272 *
1273 * From the Vulkan 1.0 spec:
1274 *
1275 * If a command buffer is in the executable state and the command buffer
1276 * was allocated from a command pool with the
1277 * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1278 * vkBeginCommandBuffer implicitly resets the command buffer, behaving
1279 * as if vkResetCommandBuffer had been called with
1280 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1281 * the command buffer in the recording state.
1282 */
1283 anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
1284 anv_cmd_buffer_reset_rendering(cmd_buffer);
1285
1286 cmd_buffer->usage_flags = pBeginInfo->flags;
1287
1288 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1289 * primary level command buffers.
1290 *
1291 * From the Vulkan 1.0 spec:
1292 *
1293 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1294 * secondary command buffer is considered to be entirely inside a render
1295 * pass. If this is a primary command buffer, then this bit is ignored.
1296 */
1297 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1298 cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1299
1300 trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1301
1302 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1303
1304 /* We sometimes store vertex data in the dynamic state buffer for blorp
1305 * operations and our dynamic state stream may re-use data from previous
1306 * command buffers. In order to prevent stale cache data, we flush the VF
1307 * cache. We could do this on every blorp call but that's not really
1308 * needed as all of the data will get written by the CPU prior to the GPU
1309 * executing anything. The chances are fairly high that they will use
1310 * blorp at least once per primary command buffer so it shouldn't be
1311 * wasted.
1312 *
1313 * There is also a workaround on gfx8 which requires us to invalidate the
1314 * VF cache occasionally. It's easier if we can assume we start with a
1315 * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1316 */
1317 anv_add_pending_pipe_bits(cmd_buffer,
1318 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1319 "new cmd buffer");
1320
1321 /* We send an "Indirect State Pointers Disable" packet at
1322 * EndCommandBuffer, so all push constant packets are ignored during a
1323 * context restore. Documentation says after that command, we need to
1324 * emit push constants again before any rendering operation. So we
1325 * flag them dirty here to make sure they get emitted.
1326 */
1327 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1328
1329 if (cmd_buffer->usage_flags &
1330 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1331 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1332
1333 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1334 const VkRenderingInfo *resume_info =
1335 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1336 pBeginInfo,
1337 gcbiar_data);
1338 if (resume_info != NULL) {
1339 genX(CmdBeginRendering)(commandBuffer, resume_info);
1340 } else {
1341 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1342 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1343 pBeginInfo);
1344 assert(inheritance_info);
1345
1346 gfx->rendering_flags = inheritance_info->flags;
1347 gfx->render_area = (VkRect2D) { };
1348 gfx->layer_count = 0;
1349 gfx->samples = inheritance_info->rasterizationSamples;
1350 gfx->view_mask = inheritance_info->viewMask;
1351
1352 uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1353 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1354 if (result != VK_SUCCESS)
1355 return result;
1356
1357 for (uint32_t i = 0; i < color_att_count; i++) {
1358 gfx->color_att[i].vk_format =
1359 inheritance_info->pColorAttachmentFormats[i];
1360 }
1361 gfx->depth_att.vk_format =
1362 inheritance_info->depthAttachmentFormat;
1363 gfx->stencil_att.vk_format =
1364 inheritance_info->stencilAttachmentFormat;
1365
1366 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1367
1368 anv_cmd_graphic_state_update_has_uint_rt(gfx);
1369 }
1370 }
1371
1372 #if GFX_VER >= 8
1373 /* Emit the sample pattern at the beginning of the batch because the
1374 * default locations emitted at the device initialization might have been
1375 * changed by a previous command buffer.
1376 *
1377 * Do not change that when we're continuing a previous renderpass.
1378 */
1379 if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1380 !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1381 genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1382 #endif
1383
1384 #if GFX_VERx10 >= 75
1385 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1386 const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1387 vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1388
1389 /* If the secondary buffer supports conditional rendering,
1390 * we should emit its commands as if conditional rendering were enabled.
1391 */
1392 cmd_buffer->state.conditional_render_enabled =
1393 conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1394 }
1395 #endif
1396
1397 return VK_SUCCESS;
1398 }
1399
1400 /* From the PRM, Volume 2a:
1401 *
1402 * "Indirect State Pointers Disable
1403 *
1404 * At the completion of the post-sync operation associated with this pipe
1405 * control packet, the indirect state pointers in the hardware are
1406 * considered invalid; the indirect pointers are not saved in the context.
1407 * If any new indirect state commands are executed in the command stream
1408 * while the pipe control is pending, the new indirect state commands are
1409 * preserved.
1410 *
1411 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1412 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1413 * commands are only considered as Indirect State Pointers. Once ISP is
1414 * issued in a context, SW must initialize by programming push constant
1415 * commands for all the shaders (at least to zero length) before attempting
1416 * any rendering operation for the same context."
1417 *
1418 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1419 * even though they point to a BO that has been already unreferenced at
1420 * the end of the previous batch buffer. This has been fine so far since
1421 * we are protected by the scratch page (every address not covered by
1422 * a BO should be pointing to the scratch page). But on CNL, it is
1423 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1424 * instruction.
1425 *
1426 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1427 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1428 * context restore, so the mentioned hang doesn't happen. However,
1429 * software must program push constant commands for all stages prior to
1430 * rendering anything. So we flag them dirty in BeginCommandBuffer.
1431 *
1432 * Finally, we also stall at the pixel scoreboard to make sure the
1433 * constants have been loaded into the EUs prior to disabling the push
1434 * constants, so that a previous 3DPRIMITIVE doesn't hang.
1435 */
1436 static void
1437 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1438 {
1439 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1440 pc.StallAtPixelScoreboard = true;
1441 pc.CommandStreamerStallEnable = true;
1442 anv_debug_dump_pc(pc);
1443 }
1444 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1445 pc.IndirectStatePointersDisable = true;
1446 pc.CommandStreamerStallEnable = true;
1447 anv_debug_dump_pc(pc);
1448 }
1449 }
1450
1451 VkResult
1452 genX(EndCommandBuffer)(
1453 VkCommandBuffer commandBuffer)
1454 {
1455 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1456
1457 if (anv_batch_has_error(&cmd_buffer->batch))
1458 return cmd_buffer->batch.status;
1459
1460 anv_measure_endcommandbuffer(cmd_buffer);
1461
1462 /* We want every command buffer to start with the PMA fix in a known state,
1463 * so we disable it at the end of the command buffer.
1464 */
1465 genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1466
1467 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1468
1469 emit_isp_disable(cmd_buffer);
1470
1471 trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1472
1473 anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1474
1475 return VK_SUCCESS;
1476 }
1477
1478 void
1479 genX(CmdExecuteCommands)(
1480 VkCommandBuffer commandBuffer,
1481 uint32_t commandBufferCount,
1482 const VkCommandBuffer* pCmdBuffers)
1483 {
1484 ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1485
1486 assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1487
1488 if (anv_batch_has_error(&primary->batch))
1489 return;
1490
1491 /* The secondary command buffers will assume that the PMA fix is disabled
1492 * when they begin executing. Make sure this is true.
1493 */
1494 genX(cmd_buffer_enable_pma_fix)(primary, false);
1495
1496 /* The secondary command buffer doesn't know which textures etc. have been
1497 * flushed prior to their execution. Apply those flushes now.
1498 */
1499 genX(cmd_buffer_apply_pipe_flushes)(primary);
1500
1501 for (uint32_t i = 0; i < commandBufferCount; i++) {
1502 ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1503
1504 assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1505 assert(!anv_batch_has_error(&secondary->batch));
1506
1507 #if GFX_VERx10 >= 75
1508 if (secondary->state.conditional_render_enabled) {
1509 if (!primary->state.conditional_render_enabled) {
1510 /* The secondary buffer was constructed as if it would be executed
1511 * with conditional rendering, so we must satisfy this dependency
1512 * regardless of whether conditional rendering is enabled in the primary.
1513 */
1514 struct mi_builder b;
1515 mi_builder_init(&b, primary->device->info, &primary->batch);
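            /* Assumption (not spelled out in the surrounding code): storing
             * all ones into ANV_PREDICATE_RESULT_REG makes the secondary's
             * MI_PREDICATE checks see a passing result, so its predicated
             * draws execute unconditionally.
             */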
1516 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1517 mi_imm(UINT64_MAX));
1518 }
1519 }
1520 #endif
1521
1522 if (secondary->usage_flags &
1523 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1524 /* If we're continuing a render pass from the primary, we need to
1525 * copy the surface states for the current subpass into the storage
1526 * we allocated for them in BeginCommandBuffer.
1527 */
1528 struct anv_bo *ss_bo =
1529 primary->device->surface_state_pool.block_pool.bo;
1530 struct anv_state src_state = primary->state.gfx.att_states;
1531 struct anv_state dst_state = secondary->state.gfx.att_states;
1532 assert(src_state.alloc_size == dst_state.alloc_size);
1533
1534 genX(cmd_buffer_so_memcpy)(primary,
1535 (struct anv_address) {
1536 .bo = ss_bo,
1537 .offset = dst_state.offset,
1538 },
1539 (struct anv_address) {
1540 .bo = ss_bo,
1541 .offset = src_state.offset,
1542 },
1543 src_state.alloc_size);
1544 }
1545
1546 anv_cmd_buffer_add_secondary(primary, secondary);
1547
1548 assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1549 secondary->perf_query_pool == primary->perf_query_pool);
1550 if (secondary->perf_query_pool)
1551 primary->perf_query_pool = secondary->perf_query_pool;
1552 }
1553
1554 /* The secondary isn't counted in our VF cache tracking so we need to
1555 * invalidate the whole thing.
1556 */
1557 if (GFX_VER == 8) {
1558 anv_add_pending_pipe_bits(primary,
1559 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1560 "Secondary cmd buffer not tracked in VF cache");
1561 }
1562
1563 /* The secondary may have selected a different pipeline (3D or compute) and
1564 * may have changed the current L3$ configuration. Reset our tracking
1565 * variables to invalid values to ensure that we re-emit these in the case
1566 * where we do any draws or compute dispatches from the primary after the
1567 * secondary has returned.
1568 */
1569 primary->state.current_pipeline = UINT32_MAX;
1570 primary->state.current_l3_config = NULL;
1571 primary->state.current_hash_scale = 0;
1572 primary->state.gfx.push_constant_stages = 0;
1573 vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1574
1575 /* Each of the secondary command buffers will use its own state base
1576 * address. We need to re-emit state base address for the primary after
1577 * all of the secondaries are done.
1578 *
1579 * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1580 * address calls?
1581 */
1582 genX(cmd_buffer_emit_state_base_address)(primary);
1583 }
1584
1585 /**
1586 * Program the hardware to use the specified L3 configuration.
1587 */
1588 void
1589 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1590 const struct intel_l3_config *cfg)
1591 {
1592 assert(cfg);
1593 if (cfg == cmd_buffer->state.current_l3_config)
1594 return;
1595
1596 if (INTEL_DEBUG(DEBUG_L3)) {
1597 mesa_logd("L3 config transition: ");
1598 intel_dump_l3_config(cfg, stderr);
1599 }
1600
1601 /* According to the hardware docs, the L3 partitioning can only be changed
1602 * while the pipeline is completely drained and the caches are flushed,
1603 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1604 */
1605 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1606 pc.DCFlushEnable = true;
1607 pc.PostSyncOperation = NoWrite;
1608 pc.CommandStreamerStallEnable = true;
1609 anv_debug_dump_pc(pc);
1610 }
1611
1612 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1613 * invalidation of the relevant caches. Note that because RO invalidation
1614 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1615 * command is processed by the CS) we cannot combine it with the previous
1616 * stalling flush as the hardware documentation suggests, because that
1617 * would cause the CS to stall on previous rendering *after* RO
1618 * invalidation and wouldn't prevent the RO caches from being polluted by
1619 * concurrent rendering before the stall completes. This intentionally
1620 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1621 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1622 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1623 * already guarantee that there is no concurrent GPGPU kernel execution
1624 * (see SKL HSD 2132585).
1625 */
1626 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1627 pc.TextureCacheInvalidationEnable = true;
1628 pc.ConstantCacheInvalidationEnable = true;
1629 pc.InstructionCacheInvalidateEnable = true;
1630 pc.StateCacheInvalidationEnable = true;
1631 pc.PostSyncOperation = NoWrite;
1632 anv_debug_dump_pc(pc);
1633 }
1634
1635 /* Now send a third stalling flush to make sure that invalidation is
1636 * complete when the L3 configuration registers are modified.
1637 */
1638 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1639 pc.DCFlushEnable = true;
1640 pc.PostSyncOperation = NoWrite;
1641 pc.CommandStreamerStallEnable = true;
1642 anv_debug_dump_pc(pc);
1643 }
1644
1645 genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1646 cmd_buffer->state.current_l3_config = cfg;
1647 }
1648
1649 ALWAYS_INLINE enum anv_pipe_bits
1650 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1651 struct anv_device *device,
1652 uint32_t current_pipeline,
1653 enum anv_pipe_bits bits)
1654 {
1655 /*
1656 * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1657 *
1658 * Write synchronization is a special case of end-of-pipe
1659 * synchronization that requires that the render cache and/or depth
1660 * related caches are flushed to memory, where the data will become
1661 * globally visible. This type of synchronization is required prior to
1662 * SW (CPU) actually reading the result data from memory, or initiating
1663 * an operation that will use as a read surface (such as a texture
1664 * surface) a previous render target and/or depth/stencil buffer
1665 *
1666 *
1667 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1668 *
1669 * Exercising the write cache flush bits (Render Target Cache Flush
1670 * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1671 * ensures the write caches are flushed and doesn't guarantee the data
1672 * is globally visible.
1673 *
1674 * SW can track the completion of the end-of-pipe-synchronization by
1675 * using "Notify Enable" and "PostSync Operation - Write Immediate
1676 * Data" in the PIPE_CONTROL command.
1677 *
1678 * In other words, flushes are pipelined while invalidations are handled
1679 * immediately. Therefore, if we're flushing anything then we need to
1680 * schedule an end-of-pipe sync before any invalidations can happen.
1681 */
1682 if (bits & ANV_PIPE_FLUSH_BITS)
1683 bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1684
1685 /* If we're going to do an invalidate and we have a pending end-of-pipe
1686 * sync that has yet to be resolved, we do the end-of-pipe sync now.
1687 */
1688 if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1689 (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1690 bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1691 bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1692 }
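   /* Illustration of the split above (hedged, not exhaustive): a barrier
    * that flushes the render target cache and invalidates the texture cache
    * should leave this function as two packets: a PIPE_CONTROL carrying the
    * RT flush, CS stall and end-of-pipe post-sync write, followed by a
    * separate PIPE_CONTROL carrying only the texture cache invalidation.
    */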
1693
1694 /* Project: SKL / Argument: LRI Post Sync Operation [23]
1695 *
1696 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1697 * programmed prior to programming a PIPECONTROL command with "LRI
1698 * Post Sync Operation" in GPGPU mode of operation (i.e when
1699 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
1700 *
1701 * The same text exists a few rows below for Post Sync Op.
1702 */
1703 if (bits & ANV_PIPE_POST_SYNC_BIT)
1704 bits &= ~ANV_PIPE_POST_SYNC_BIT;
1705
1706 if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1707 ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
1708 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1709 /* Flushing HDC pipeline requires DC Flush on earlier HW. */
1710 pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1711 pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
1712 pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1713 pipe.RenderTargetCacheFlushEnable =
1714 bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
1715
1716 pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
1717 #if GFX_VER == 8
1718 /* From Broadwell PRM, volume 2a:
1719 * PIPE_CONTROL: Command Streamer Stall Enable:
1720 *
1721 * "This bit must be always set when PIPE_CONTROL command is
1722 * programmed by GPGPU and MEDIA workloads, except for the cases
1723 * when only Read Only Cache Invalidation bits are set (State
1724 * Cache Invalidation Enable, Instruction cache Invalidation
1725 * Enable, Texture Cache Invalidation Enable, Constant Cache
1726 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
1727 * need not implemented when FF_DOP_CG is disabled."
1728 *
1729 * Since we do all the invalidation in the following PIPE_CONTROL,
1730 * if we got here, we need a stall.
1731 */
1732 pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
1733 #endif
1734
1735 pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
1736
1737 /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1738 *
1739 * "The most common action to perform upon reaching a
1740 * synchronization point is to write a value out to memory. An
1741 * immediate value (included with the synchronization command) may
1742 * be written."
1743 *
1744 *
1745 * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1746 *
1747 * "In case the data flushed out by the render engine is to be
1748 * read back in to the render engine in coherent manner, then the
1749 * render engine has to wait for the fence completion before
1750 * accessing the flushed data. This can be achieved by following
1751 * means on various products: PIPE_CONTROL command with CS Stall
1752 * and the required write caches flushed with Post-Sync-Operation
1753 * as Write Immediate Data.
1754 *
1755 * Example:
1756 * - Workload-1 (3D/GPGPU/MEDIA)
1757 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1758 * Immediate Data, Required Write Cache Flush bits set)
1759 * - Workload-2 (Can use the data produce or output by
1760 * Workload-1)
1761 */
1762 if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1763 pipe.CommandStreamerStallEnable = true;
1764 pipe.PostSyncOperation = WriteImmediateData;
1765 pipe.Address = device->workaround_address;
1766 }
1767
1768 /*
1769 * According to the Broadwell documentation, any PIPE_CONTROL with the
1770 * "Command Streamer Stall" bit set must also have another bit set,
1771 * with five different options:
1772 *
1773 * - Render Target Cache Flush
1774 * - Depth Cache Flush
1775 * - Stall at Pixel Scoreboard
1776 * - Post-Sync Operation
1777 * - Depth Stall
1778 * - DC Flush Enable
1779 *
1780 * I chose "Stall at Pixel Scoreboard" since that's what we use in
1781 * mesa and it seems to work fine. The choice is fairly arbitrary.
1782 */
1783 if (pipe.CommandStreamerStallEnable &&
1784 !pipe.RenderTargetCacheFlushEnable &&
1785 !pipe.DepthCacheFlushEnable &&
1786 !pipe.StallAtPixelScoreboard &&
1787 !pipe.PostSyncOperation &&
1788 !pipe.DepthStallEnable &&
1789 !pipe.DCFlushEnable)
1790 pipe.StallAtPixelScoreboard = true;
1791 anv_debug_dump_pc(pipe);
1792 }
1793
1794 /* If a render target flush was emitted, then we can toggle off the bit
1795 * saying that render target writes are ongoing.
1796 */
1797 if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
1798 bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
1799
1800 if (GFX_VERx10 == 75) {
1801 /* Haswell needs additional workarounds:
1802 *
1803 * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1804 *
1805 * Option 1:
1806 * PIPE_CONTROL command with the CS Stall and the required write
1807 * caches flushed with Post-SyncOperation as Write Immediate Data
1808 * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
1809 * space) commands.
1810 *
1811 * Example:
1812 * - Workload-1
1813 * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1814 * Immediate Data, Required Write Cache Flush bits set)
1815 * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
1816 * - Workload-2 (Can use the data produce or output by
1817 * Workload-1)
1818 *
1819 * Unfortunately, both the PRMs and the internal docs are a bit
1820 * out-of-date in this regard. What the windows driver does (and
1821 * this appears to actually work) is to emit a register read from the
1822 * memory address written by the pipe control above.
1823 *
1824 * What register we load into doesn't matter. We choose an indirect
1825 * rendering register because we know it always exists and it's one
1826 * of the first registers the command parser allows us to write. If
1827 * you don't have command parser support in your kernel (pre-4.2),
1828 * this will get turned into MI_NOOP and you won't get the
1829 * workaround. Unfortunately, there's just not much we can do in
1830 * that case. This register is perfectly safe to write since we
1831 * always re-load all of the indirect draw registers right before
1832 * 3DPRIMITIVE when needed anyway.
1833 */
1834 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1835 lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
1836 lrm.MemoryAddress = device->workaround_address;
1837 }
1838 }
1839
1840 bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1841 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1842 }
1843
1844 if (bits & ANV_PIPE_INVALIDATE_BITS) {
1845 anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1846 pipe.StateCacheInvalidationEnable =
1847 bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
1848 pipe.ConstantCacheInvalidationEnable =
1849 bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
1850 pipe.VFCacheInvalidationEnable =
1851 bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
1852 pipe.TextureCacheInvalidationEnable =
1853 bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
1854 pipe.InstructionCacheInvalidateEnable =
1855 bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
1856
1857 anv_debug_dump_pc(pipe);
1858 }
1859
1860 bits &= ~ANV_PIPE_INVALIDATE_BITS;
1861 }
1862
1863 return bits;
1864 }
1865
1866 ALWAYS_INLINE void
1867 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1868 {
1869 enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1870
1871 if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1872 bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1873 else if (bits == 0)
1874 return;
1875
1876 bool trace_flush =
1877 (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
1878 if (trace_flush)
1879 trace_intel_begin_stall(&cmd_buffer->trace);
1880
1881 if (GFX_VER == 8 &&
1882 (bits & ANV_PIPE_CS_STALL_BIT) &&
1883 (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1884 /* If we are doing a VF cache invalidate AND a CS stall (it must be
1885 * both) then we can reset our vertex cache tracking.
1886 */
1887 memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1888 sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1889 memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1890 sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1891 }
1892
1893 cmd_buffer->state.pending_pipe_bits =
1894 genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1895 cmd_buffer->device,
1896 cmd_buffer->state.current_pipeline,
1897 bits);
1898
1899 if (trace_flush) {
1900 trace_intel_end_stall(&cmd_buffer->trace, bits,
1901 anv_pipe_flush_bit_to_ds_stall_flag, NULL);
1902 }
1903 }
1904
1905 static void
1906 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
1907 const VkDependencyInfo *dep_info,
1908 const char *reason)
1909 {
1910 /* XXX: Right now, we're really dumb and just flush whatever categories
1911 * the app asks for. One of these days we may make this a bit better
1912 * but right now that's all the hardware allows for in most areas.
1913 */
1914 VkAccessFlags2 src_flags = 0;
1915 VkAccessFlags2 dst_flags = 0;
1916
1917 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
1918 src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
1919 dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
1920 }
1921
1922 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
1923 src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
1924 dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
1925 }
1926
1927 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
1928 const VkImageMemoryBarrier2 *img_barrier =
1929 &dep_info->pImageMemoryBarriers[i];
1930
1931 src_flags |= img_barrier->srcAccessMask;
1932 dst_flags |= img_barrier->dstAccessMask;
1933
1934 ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
1935 const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
1936
1937 uint32_t base_layer, layer_count;
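      /* For 3D images the depth slices act as the "layers" below. As an
       * illustrative example, a 3D image with extent.depth == 64
       * transitioned at baseMipLevel == 2 covers u_minify(64, 2) == 16
       * slices, since 3D depth shrinks with each mip level.
       */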
1938 if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
1939 base_layer = 0;
1940 layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
1941 } else {
1942 base_layer = range->baseArrayLayer;
1943 layer_count = vk_image_subresource_layer_count(&image->vk, range);
1944 }
1945 const uint32_t level_count =
1946 vk_image_subresource_level_count(&image->vk, range);
1947
1948 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1949 transition_depth_buffer(cmd_buffer, image,
1950 base_layer, layer_count,
1951 img_barrier->oldLayout,
1952 img_barrier->newLayout,
1953 false /* will_full_fast_clear */);
1954 }
1955
1956 if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1957 transition_stencil_buffer(cmd_buffer, image,
1958 range->baseMipLevel, level_count,
1959 base_layer, layer_count,
1960 img_barrier->oldLayout,
1961 img_barrier->newLayout,
1962 false /* will_full_fast_clear */);
1963
1964 /* If we are in a renderpass, the gfx7 stencil shadow may need to be
1965 * updated even if the layout doesn't change
1966 */
1967 if (cmd_buffer->state.gfx.samples &&
1968 (img_barrier->dstAccessMask & (VK_ACCESS_2_SHADER_READ_BIT |
1969 VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
1970 VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))) {
1971 const uint32_t plane =
1972 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
1973 if (anv_surface_is_valid(&image->planes[plane].shadow_surface))
1974 anv_image_copy_to_shadow(cmd_buffer, image,
1975 VK_IMAGE_ASPECT_STENCIL_BIT,
1976 range->baseMipLevel, level_count,
1977 base_layer, layer_count);
1978 }
1979 }
1980
1981 if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
1982 VkImageAspectFlags color_aspects =
1983 vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
1984 anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
1985 transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
1986 range->baseMipLevel, level_count,
1987 base_layer, layer_count,
1988 img_barrier->oldLayout,
1989 img_barrier->newLayout,
1990 img_barrier->srcQueueFamilyIndex,
1991 img_barrier->dstQueueFamilyIndex,
1992 false /* will_full_fast_clear */);
1993 }
1994 }
1995 }
1996
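   /* A hedged example of the mapping below: srcAccessMask containing
    * VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT typically turns into a render
    * target cache flush, while dstAccessMask containing
    * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT typically adds a texture cache
    * invalidation. The exact translation lives in
    * anv_pipe_flush_bits_for_access_flags() and
    * anv_pipe_invalidate_bits_for_access_flags().
    */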
1997 enum anv_pipe_bits bits =
1998 anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
1999 anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2000
2001 anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2002 }
2003
2004 void genX(CmdPipelineBarrier2)(
2005 VkCommandBuffer commandBuffer,
2006 const VkDependencyInfo* pDependencyInfo)
2007 {
2008 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2009
2010 cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2011 }
2012
2013 static void
2014 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2015 {
2016 VkShaderStageFlags stages =
2017 cmd_buffer->state.gfx.pipeline->active_stages;
2018
2019 /* In order to avoid thrashing, we assume that vertex and fragment stages
2020 * always exist. In the rare case where one is missing *and* the other
2021 * uses push constants, this may be suboptimal. However, avoiding stalls
2022 * seems more important.
2023 */
2024 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2025 if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2026 stages |= VK_SHADER_STAGE_VERTEX_BIT;
2027
2028 if (stages == cmd_buffer->state.gfx.push_constant_stages)
2029 return;
2030
2031 const unsigned push_constant_kb =
2032 cmd_buffer->device->info->max_constant_urb_size_kb;
2033
2034 const unsigned num_stages =
2035 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2036 unsigned size_per_stage = push_constant_kb / num_stages;
2037
2038 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2039 * units of 2KB. Incidentally, these are the same platforms that have
2040 * 32KB worth of push constant space.
2041 */
2042 if (push_constant_kb == 32)
2043 size_per_stage &= ~1u;
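   /* Worked example (illustrative): with 32 KB of push constant space and
    * three active stages (VS, GS, FS), size_per_stage = 32 / 3 = 10 KB,
    * which the 2 KB granularity mask above leaves at 10 KB; a fourth active
    * stage would give 8 KB per stage instead.
    */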
2044
2045 uint32_t kb_used = 0;
2046 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2047 unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2048 anv_batch_emit(&cmd_buffer->batch,
2049 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
2050 alloc._3DCommandSubOpcode = 18 + i;
2051 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2052 alloc.ConstantBufferSize = push_size;
2053 }
2054 kb_used += push_size;
2055 }
2056
2057 anv_batch_emit(&cmd_buffer->batch,
2058 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2059 alloc.ConstantBufferOffset = kb_used;
2060 alloc.ConstantBufferSize = push_constant_kb - kb_used;
2061 }
2062
2063 cmd_buffer->state.gfx.push_constant_stages = stages;
2064
2065 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2066 *
2067 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2068 * the next 3DPRIMITIVE command after programming the
2069 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2070 *
2071 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2072 * pipeline setup, we need to dirty push constants.
2073 */
2074 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2075 }
2076
2077 static VkResult
2078 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2079 struct anv_cmd_pipeline_state *pipe_state,
2080 struct anv_shader_bin *shader,
2081 struct anv_state *bt_state)
2082 {
2083 uint32_t state_offset;
2084
2085 struct anv_pipeline_bind_map *map = &shader->bind_map;
2086 if (map->surface_count == 0) {
2087 *bt_state = (struct anv_state) { 0, };
2088 return VK_SUCCESS;
2089 }
2090
2091 *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2092 map->surface_count,
2093 &state_offset);
2094 uint32_t *bt_map = bt_state->map;
2095
2096 if (bt_state->map == NULL)
2097 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2098
2099 /* We only need to emit relocs if we're not using softpin. If we are using
2100 * softpin then we always keep all user-allocated memory objects resident.
2101 */
2102 const bool need_client_mem_relocs =
2103 anv_use_relocations(cmd_buffer->device->physical);
2104 struct anv_push_constants *push = &pipe_state->push_constants;
2105
2106 for (uint32_t s = 0; s < map->surface_count; s++) {
2107 struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2108
2109 struct anv_state surface_state;
2110
2111 switch (binding->set) {
2112 case ANV_DESCRIPTOR_SET_NULL:
2113 bt_map[s] = 0;
2114 break;
2115
2116 case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2117 /* Color attachment binding */
2118 assert(shader->stage == MESA_SHADER_FRAGMENT);
2119 if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2120 const struct anv_attachment *att =
2121 &cmd_buffer->state.gfx.color_att[binding->index];
2122 surface_state = att->surface_state.state;
2123 } else {
2124 surface_state = cmd_buffer->state.gfx.null_surface_state;
2125 }
2126 assert(surface_state.map);
2127 bt_map[s] = surface_state.offset + state_offset;
2128 break;
2129
2130 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2131 struct anv_state surface_state =
2132 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2133
2134 struct anv_address constant_data = {
2135 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2136 .offset = shader->kernel.offset +
2137 shader->prog_data->const_data_offset,
2138 };
2139 unsigned constant_data_size = shader->prog_data->const_data_size;
2140
2141 const enum isl_format format =
2142 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2143 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2144 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2145 format, ISL_SWIZZLE_IDENTITY,
2146 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2147 constant_data, constant_data_size, 1);
2148
2149 assert(surface_state.map);
2150 bt_map[s] = surface_state.offset + state_offset;
2151 add_surface_reloc(cmd_buffer, surface_state, constant_data);
2152 break;
2153 }
2154
2155 case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2156 /* This is always the first binding for compute shaders */
2157 assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2158
2159 struct anv_state surface_state =
2160 anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2161
2162 const enum isl_format format =
2163 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2164 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2165 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2166 format, ISL_SWIZZLE_IDENTITY,
2167 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2168 cmd_buffer->state.compute.num_workgroups,
2169 12, 1);
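         /* The 12-byte size above covers the three 32-bit dispatch counts
          * (x, y, z) that make up the num_workgroups buffer.
          */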
2170
2171 assert(surface_state.map);
2172 bt_map[s] = surface_state.offset + state_offset;
2173 if (need_client_mem_relocs) {
2174 add_surface_reloc(cmd_buffer, surface_state,
2175 cmd_buffer->state.compute.num_workgroups);
2176 }
2177 break;
2178 }
2179
2180 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2181 /* This is a descriptor set buffer so the set index is actually
2182 * given by binding->binding. (Yes, that's confusing.)
2183 */
2184 struct anv_descriptor_set *set =
2185 pipe_state->descriptors[binding->index];
2186 assert(set->desc_mem.alloc_size);
2187 assert(set->desc_surface_state.alloc_size);
2188 bt_map[s] = set->desc_surface_state.offset + state_offset;
2189 add_surface_reloc(cmd_buffer, set->desc_surface_state,
2190 anv_descriptor_set_address(set));
2191 break;
2192 }
2193
2194 default: {
2195 assert(binding->set < MAX_SETS);
2196 const struct anv_descriptor_set *set =
2197 pipe_state->descriptors[binding->set];
2198 if (binding->index >= set->descriptor_count) {
2199 /* From the Vulkan spec section entitled "DescriptorSet and
2200 * Binding Assignment":
2201 *
2202 * "If the array is runtime-sized, then array elements greater
2203 * than or equal to the size of that binding in the bound
2204 * descriptor set must not be used."
2205 *
2206 * Unfortunately, the compiler isn't smart enough to figure out
2207 * when a dynamic binding isn't used so it may grab the whole
2208 * array and stick it in the binding table. In this case, it's
2209 * safe to just skip those bindings that are OOB.
2210 */
2211 assert(binding->index < set->layout->descriptor_count);
2212 continue;
2213 }
2214 const struct anv_descriptor *desc = &set->descriptors[binding->index];
2215
2216 switch (desc->type) {
2217 case VK_DESCRIPTOR_TYPE_SAMPLER:
2218 /* Nothing for us to do here */
2219 continue;
2220
2221 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2222 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2223 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2224 if (desc->image_view) {
2225 struct anv_surface_state sstate =
2226 (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2227 desc->image_view->planes[binding->plane].general_sampler_surface_state :
2228 desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2229 surface_state = sstate.state;
2230 assert(surface_state.alloc_size);
2231 if (need_client_mem_relocs)
2232 add_surface_state_relocs(cmd_buffer, sstate);
2233 } else {
2234 surface_state = cmd_buffer->device->null_surface_state;
2235 }
2236 break;
2237 }
2238
2239 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2240 if (desc->image_view) {
2241 struct anv_surface_state sstate =
2242 binding->lowered_storage_surface
2243 ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2244 : desc->image_view->planes[binding->plane].storage_surface_state;
2245 surface_state = sstate.state;
2246 assert(surface_state.alloc_size);
2247 if (surface_state.offset == 0) {
2248 mesa_loge("Bound a image to a descriptor where the "
2249 "descriptor does not have NonReadable "
2250 "set and the image does not have a "
2251 "corresponding SPIR-V format enum.");
2252 vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2253 VK_DEBUG_REPORT_ERROR_BIT_EXT,
2254 &desc->image_view->vk.base,
2255 __LINE__, 0, "anv",
2256 "Bound a image to a descriptor where the "
2257 "descriptor does not have NonReadable "
2258 "set and the image does not have a "
2259 "corresponding SPIR-V format enum.");
2260 }
2261 if (surface_state.offset && need_client_mem_relocs)
2262 add_surface_state_relocs(cmd_buffer, sstate);
2263 } else {
2264 surface_state = cmd_buffer->device->null_surface_state;
2265 }
2266 break;
2267 }
2268
2269 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2270 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2271 if (desc->set_buffer_view) {
2272 surface_state = desc->set_buffer_view->surface_state;
2273 assert(surface_state.alloc_size);
2274 if (need_client_mem_relocs) {
2275 add_surface_reloc(cmd_buffer, surface_state,
2276 desc->set_buffer_view->address);
2277 }
2278 } else {
2279 surface_state = cmd_buffer->device->null_surface_state;
2280 }
2281 break;
2282
2283 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2284 if (desc->buffer_view) {
2285 surface_state = desc->buffer_view->surface_state;
2286 assert(surface_state.alloc_size);
2287 if (need_client_mem_relocs) {
2288 add_surface_reloc(cmd_buffer, surface_state,
2289 desc->buffer_view->address);
2290 }
2291 } else {
2292 surface_state = cmd_buffer->device->null_surface_state;
2293 }
2294 break;
2295
2296 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2297 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2298 if (desc->buffer) {
2299 /* Compute the offset within the buffer */
2300 uint32_t dynamic_offset =
2301 push->dynamic_offsets[binding->dynamic_offset_index];
2302 uint64_t offset = desc->offset + dynamic_offset;
2303 /* Clamp to the buffer size */
2304 offset = MIN2(offset, desc->buffer->vk.size);
2305 /* Clamp the range to the buffer size */
2306 uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2307
2308 /* Align the range for consistency */
2309 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2310 range = align(range, ANV_UBO_ALIGNMENT);
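               /* Illustration, assuming ANV_UBO_ALIGNMENT is 64 bytes here:
                * a 100-byte range gets padded up to 128 bytes by the align()
                * above.
                */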
2311
2312 struct anv_address address =
2313 anv_address_add(desc->buffer->address, offset);
2314
2315 surface_state =
2316 anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2317 enum isl_format format =
2318 anv_isl_format_for_descriptor_type(cmd_buffer->device,
2319 desc->type);
2320
2321 isl_surf_usage_flags_t usage =
2322 desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2323 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2324 ISL_SURF_USAGE_STORAGE_BIT;
2325
2326 anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2327 format, ISL_SWIZZLE_IDENTITY,
2328 usage, address, range, 1);
2329 if (need_client_mem_relocs)
2330 add_surface_reloc(cmd_buffer, surface_state, address);
2331 } else {
2332 surface_state = cmd_buffer->device->null_surface_state;
2333 }
2334 break;
2335 }
2336
2337 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2338 if (desc->buffer_view) {
2339 surface_state = binding->lowered_storage_surface
2340 ? desc->buffer_view->lowered_storage_surface_state
2341 : desc->buffer_view->storage_surface_state;
2342 assert(surface_state.alloc_size);
2343 if (need_client_mem_relocs) {
2344 add_surface_reloc(cmd_buffer, surface_state,
2345 desc->buffer_view->address);
2346 }
2347 } else {
2348 surface_state = cmd_buffer->device->null_surface_state;
2349 }
2350 break;
2351
2352 default:
2353 assert(!"Invalid descriptor type");
2354 continue;
2355 }
2356 assert(surface_state.map);
2357 bt_map[s] = surface_state.offset + state_offset;
2358 break;
2359 }
2360 }
2361 }
2362
2363 return VK_SUCCESS;
2364 }
2365
2366 static VkResult
2367 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2368 struct anv_cmd_pipeline_state *pipe_state,
2369 struct anv_shader_bin *shader,
2370 struct anv_state *state)
2371 {
2372 struct anv_pipeline_bind_map *map = &shader->bind_map;
2373 if (map->sampler_count == 0) {
2374 *state = (struct anv_state) { 0, };
2375 return VK_SUCCESS;
2376 }
2377
2378 uint32_t size = map->sampler_count * 16;
2379 *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2380
2381 if (state->map == NULL)
2382 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2383
2384 for (uint32_t s = 0; s < map->sampler_count; s++) {
2385 struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2386 const struct anv_descriptor *desc =
2387 &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2388
2389 if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2390 desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2391 continue;
2392
2393 struct anv_sampler *sampler = desc->sampler;
2394
2395 /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2396 * happens to be zero.
2397 */
2398 if (sampler == NULL)
2399 continue;
2400
2401 memcpy(state->map + (s * 16),
2402 sampler->state[binding->plane], sizeof(sampler->state[0]));
2403 }
2404
2405 return VK_SUCCESS;
2406 }
2407
2408 static uint32_t
2409 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2410 struct anv_cmd_pipeline_state *pipe_state,
2411 const VkShaderStageFlags dirty,
2412 struct anv_shader_bin **shaders,
2413 uint32_t num_shaders)
2414 {
2415 VkShaderStageFlags flushed = 0;
2416
2417 VkResult result = VK_SUCCESS;
2418 for (uint32_t i = 0; i < num_shaders; i++) {
2419 if (!shaders[i])
2420 continue;
2421
2422 gl_shader_stage stage = shaders[i]->stage;
2423 VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2424 if ((vk_stage & dirty) == 0)
2425 continue;
2426
2427 assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2428 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2429 &cmd_buffer->state.samplers[stage]);
2430 if (result != VK_SUCCESS)
2431 break;
2432
2433 assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2434 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2435 &cmd_buffer->state.binding_tables[stage]);
2436 if (result != VK_SUCCESS)
2437 break;
2438
2439 flushed |= vk_stage;
2440 }
2441
2442 if (result != VK_SUCCESS) {
2443 assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2444
2445 result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2446 if (result != VK_SUCCESS)
2447 return 0;
2448
2449 /* Re-emit state base addresses so we get the new surface state base
2450 * address before we start emitting binding tables etc.
2451 */
2452 genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2453
2454 /* Re-emit all active binding tables */
2455 flushed = 0;
2456
2457 for (uint32_t i = 0; i < num_shaders; i++) {
2458 if (!shaders[i])
2459 continue;
2460
2461 gl_shader_stage stage = shaders[i]->stage;
2462
2463 result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2464 &cmd_buffer->state.samplers[stage]);
2465 if (result != VK_SUCCESS) {
2466 anv_batch_set_error(&cmd_buffer->batch, result);
2467 return 0;
2468 }
2469 result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2470 &cmd_buffer->state.binding_tables[stage]);
2471 if (result != VK_SUCCESS) {
2472 anv_batch_set_error(&cmd_buffer->batch, result);
2473 return 0;
2474 }
2475
2476 flushed |= mesa_to_vk_shader_stage(stage);
2477 }
2478 }
2479
2480 return flushed;
2481 }
2482
2483 static void
2484 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2485 uint32_t stages)
2486 {
2487 static const uint32_t sampler_state_opcodes[] = {
2488 [MESA_SHADER_VERTEX] = 43,
2489 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
2490 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
2491 [MESA_SHADER_GEOMETRY] = 46,
2492 [MESA_SHADER_FRAGMENT] = 47,
2493 };
2494
2495 static const uint32_t binding_table_opcodes[] = {
2496 [MESA_SHADER_VERTEX] = 38,
2497 [MESA_SHADER_TESS_CTRL] = 39,
2498 [MESA_SHADER_TESS_EVAL] = 40,
2499 [MESA_SHADER_GEOMETRY] = 41,
2500 [MESA_SHADER_FRAGMENT] = 42,
2501 };
2502
2503 anv_foreach_stage(s, stages) {
2504 assert(s < ARRAY_SIZE(binding_table_opcodes));
2505
2506 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2507 anv_batch_emit(&cmd_buffer->batch,
2508 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2509 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2510 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2511 }
2512 }
2513
2514 /* Always emit binding table pointers if we're asked to, since on SKL
2515 * this is what flushes push constants. */
2516 anv_batch_emit(&cmd_buffer->batch,
2517 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2518 btp._3DCommandSubOpcode = binding_table_opcodes[s];
2519 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2520 }
2521 }
2522 }
2523
2524 static struct anv_address
2525 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
2526 const struct anv_shader_bin *shader,
2527 const struct anv_push_range *range)
2528 {
2529 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2530 switch (range->set) {
2531 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2532 /* This is a descriptor set buffer so the set index is
2533 * actually given by binding->binding. (Yes, that's
2534 * confusing.)
2535 */
2536 struct anv_descriptor_set *set =
2537 gfx_state->base.descriptors[range->index];
2538 return anv_descriptor_set_address(set);
2539 }
2540
2541 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
2542 if (gfx_state->base.push_constants_state.alloc_size == 0) {
2543 gfx_state->base.push_constants_state =
2544 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
2545 }
2546 return (struct anv_address) {
2547 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
2548 .offset = gfx_state->base.push_constants_state.offset,
2549 };
2550 }
2551
2552 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2553 return (struct anv_address) {
2554 .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2555 .offset = shader->kernel.offset +
2556 shader->prog_data->const_data_offset,
2557 };
2558
2559 default: {
2560 assert(range->set < MAX_SETS);
2561 struct anv_descriptor_set *set =
2562 gfx_state->base.descriptors[range->set];
2563 const struct anv_descriptor *desc =
2564 &set->descriptors[range->index];
2565
2566 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2567 if (desc->buffer_view)
2568 return desc->buffer_view->address;
2569 } else {
2570 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2571 if (desc->buffer) {
2572 const struct anv_push_constants *push =
2573 &gfx_state->base.push_constants;
2574 uint32_t dynamic_offset =
2575 push->dynamic_offsets[range->dynamic_offset_index];
2576 return anv_address_add(desc->buffer->address,
2577 desc->offset + dynamic_offset);
2578 }
2579 }
2580
2581 /* For NULL UBOs, we just return an address in the workaround BO. We do
2582 * writes to it for workarounds but always at the bottom. The higher
2583 * bytes should be all zeros.
2584 */
2585 assert(range->length * 32 <= 2048);
2586 return (struct anv_address) {
2587 .bo = cmd_buffer->device->workaround_bo,
2588 .offset = 1024,
2589 };
2590 }
2591 }
2592 }
2593
2594
2595 /** Returns the size in bytes of the bound buffer
2596 *
2597 * The returned size is relative to the start of the buffer, not the start
2598 * of the range. It may be smaller than
2599 *
2600 * (range->start + range->length) * 32;
2601 */
2602 static uint32_t
2603 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
2604 const struct anv_shader_bin *shader,
2605 const struct anv_push_range *range)
2606 {
2607 assert(shader->stage != MESA_SHADER_COMPUTE);
2608 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2609 switch (range->set) {
2610 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2611 struct anv_descriptor_set *set =
2612 gfx_state->base.descriptors[range->index];
2613 assert(range->start * 32 < set->desc_mem.alloc_size);
2614 assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
2615 return set->desc_mem.alloc_size;
2616 }
2617
2618 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
2619 return (range->start + range->length) * 32;
2620
2621 case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2622 return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
2623
2624 default: {
2625 assert(range->set < MAX_SETS);
2626 struct anv_descriptor_set *set =
2627 gfx_state->base.descriptors[range->set];
2628 const struct anv_descriptor *desc =
2629 &set->descriptors[range->index];
2630
2631 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2632 /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection.
2633 * We use the descriptor set's internally allocated surface state to fill the binding table entry.
2634 */
2635 if (!desc->set_buffer_view)
2636 return 0;
2637
2638 if (range->start * 32 > desc->set_buffer_view->range)
2639 return 0;
2640
2641 return desc->set_buffer_view->range;
2642 } else {
2643 if (!desc->buffer)
2644 return 0;
2645
2646 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2647 /* Compute the offset within the buffer */
2648 const struct anv_push_constants *push =
2649 &gfx_state->base.push_constants;
2650 uint32_t dynamic_offset =
2651 push->dynamic_offsets[range->dynamic_offset_index];
2652 uint64_t offset = desc->offset + dynamic_offset;
2653 /* Clamp to the buffer size */
2654 offset = MIN2(offset, desc->buffer->vk.size);
2655 /* Clamp the range to the buffer size */
2656 uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
2657
2658 /* Align the range for consistency */
2659 bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
2660
2661 return bound_range;
2662 }
2663 }
2664 }
2665 }
2666
2667 static void
2668 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
2669 gl_shader_stage stage,
2670 struct anv_address *buffers,
2671 unsigned buffer_count)
2672 {
2673 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2674 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2675
2676 static const uint32_t push_constant_opcodes[] = {
2677 [MESA_SHADER_VERTEX] = 21,
2678 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
2679 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
2680 [MESA_SHADER_GEOMETRY] = 22,
2681 [MESA_SHADER_FRAGMENT] = 23,
2682 };
2683
2684 assert(stage < ARRAY_SIZE(push_constant_opcodes));
2685
2686 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
2687
2688 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
2689 c._3DCommandSubOpcode = push_constant_opcodes[stage];
2690
2691 /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
2692 *
2693 * "Constant Buffer Object Control State must be always
2694 * programmed to zero."
2695 *
2696 * This restriction does not exist on any newer platforms.
2697 *
2698 * We only have one MOCS field for the whole packet, not one per
2699 * buffer. We could go out of our way here to walk over all of
2700 * the buffers and see if any of them are used externally and use
2701 * the external MOCS. However, the notion that someone would use
2702 * the same bit of memory for both scanout and a UBO is nuts.
2703 *
2704 * Let's not bother and assume it's all internal.
2705 */
2706 #if GFX_VER != 8
2707 c.ConstantBody.MOCS = mocs;
2708 #endif
2709
2710 if (anv_pipeline_has_stage(pipeline, stage)) {
2711 const struct anv_pipeline_bind_map *bind_map =
2712 &pipeline->shaders[stage]->bind_map;
2713
2714 #if GFX_VERx10 >= 75
2715 /* The Skylake PRM contains the following restriction:
2716 *
2717 * "The driver must ensure The following case does not occur
2718 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
2719 * buffer 3 read length equal to zero committed followed by a
2720 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
2721 * zero committed."
2722 *
2723 * To avoid this, we program the buffers in the highest slots.
2724 * This way, slot 0 is only used if slot 3 is also used.
2725 */
2726 assert(buffer_count <= 4);
2727 const unsigned shift = 4 - buffer_count;
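         /* With buffer_count == 2, for instance, the two ranges land in
          * constant buffer slots 2 and 3, leaving slots 0 and 1 unused, so
          * the "buffer 0 without buffer 3" case quoted above can't occur.
          */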
2728 for (unsigned i = 0; i < buffer_count; i++) {
2729 const struct anv_push_range *range = &bind_map->push_ranges[i];
2730
2731 /* At this point we only have non-empty ranges */
2732 assert(range->length > 0);
2733
2734 /* For Ivy Bridge, make sure we only set the first range (actual
2735 * push constants)
2736 */
2737 assert((GFX_VERx10 >= 75) || i == 0);
2738
2739 c.ConstantBody.ReadLength[i + shift] = range->length;
2740 c.ConstantBody.Buffer[i + shift] =
2741 anv_address_add(buffers[i], range->start * 32);
2742 }
2743 #else
2744 /* For Ivy Bridge, push constants are relative to dynamic state
2745 * base address and we only ever push actual push constants.
2746 */
2747 if (bind_map->push_ranges[0].length > 0) {
2748 assert(buffer_count == 1);
2749 assert(bind_map->push_ranges[0].set ==
2750 ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
2751 assert(buffers[0].bo ==
2752 cmd_buffer->device->dynamic_state_pool.block_pool.bo);
2753 c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
2754 c.ConstantBody.Buffer[0].bo = NULL;
2755 c.ConstantBody.Buffer[0].offset = buffers[0].offset;
2756 }
2757 assert(bind_map->push_ranges[1].length == 0);
2758 assert(bind_map->push_ranges[2].length == 0);
2759 assert(bind_map->push_ranges[3].length == 0);
2760 #endif
2761 }
2762 }
2763 }
2764
2765 static void
2766 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
2767 VkShaderStageFlags dirty_stages)
2768 {
2769 VkShaderStageFlags flushed = 0;
2770 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2771 const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2772
2773 /* Compute robust pushed register access mask for each stage. */
2774 if (cmd_buffer->device->vk.enabled_features.robustBufferAccess) {
2775 anv_foreach_stage(stage, dirty_stages) {
2776 if (!anv_pipeline_has_stage(pipeline, stage))
2777 continue;
2778
2779 const struct anv_shader_bin *shader = pipeline->shaders[stage];
2780 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2781 struct anv_push_constants *push = &gfx_state->base.push_constants;
2782
2783 push->push_reg_mask[stage] = 0;
2784 /* Start of the current range in the shader, relative to the start of
2785 * push constants in the shader.
2786 */
2787 unsigned range_start_reg = 0;
2788 for (unsigned i = 0; i < 4; i++) {
2789 const struct anv_push_range *range = &bind_map->push_ranges[i];
2790 if (range->length == 0)
2791 continue;
2792
2793 unsigned bound_size =
2794 get_push_range_bound_size(cmd_buffer, shader, range);
2795 if (bound_size >= range->start * 32) {
2796 unsigned bound_regs =
2797 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
2798 range->length);
2799 assert(range_start_reg + bound_regs <= 64);
2800 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
2801 bound_regs);
2802 }
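            /* Worked example (illustrative): a range with start == 2 and
             * length == 8 whose buffer only has 192 bytes bound yields
             * DIV_ROUND_UP(192, 32) - 2 == 4 addressable registers, so only
             * 4 bits get set starting at range_start_reg.
             */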
2803
2804 cmd_buffer->state.push_constants_dirty |=
2805 mesa_to_vk_shader_stage(stage);
2806
2807 range_start_reg += range->length;
2808 }
2809 }
2810 }
2811
2812 /* Resets the push constant state so that we allocate a new one if
2813 * needed.
2814 */
2815 gfx_state->base.push_constants_state = ANV_STATE_NULL;
2816
2817 anv_foreach_stage(stage, dirty_stages) {
2818 unsigned buffer_count = 0;
2819 flushed |= mesa_to_vk_shader_stage(stage);
2820 UNUSED uint32_t max_push_range = 0;
2821
2822 struct anv_address buffers[4] = {};
2823 if (anv_pipeline_has_stage(pipeline, stage)) {
2824 const struct anv_shader_bin *shader = pipeline->shaders[stage];
2825 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2826
2827 /* We have to gather buffer addresses as a second step because the
2828 * loop above puts data into the push constant area and the call to
2829 * get_push_range_address is what locks our push constants and copies
2830 * them into the actual GPU buffer. If we did the two loops at the
2831 * same time, we'd risk only having some of the sizes in the push
2832 * constant buffer when we did the copy.
2833 */
2834 for (unsigned i = 0; i < 4; i++) {
2835 const struct anv_push_range *range = &bind_map->push_ranges[i];
2836 if (range->length == 0)
2837 break;
2838
2839 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
2840 max_push_range = MAX2(max_push_range, range->length);
2841 buffer_count++;
2842 }
2843
2844 /* We have at most 4 buffers but they should be tightly packed */
2845 for (unsigned i = buffer_count; i < 4; i++)
2846 assert(bind_map->push_ranges[i].length == 0);
2847 }
2848
2849 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
2850 }
2851
2852 cmd_buffer->state.push_constants_dirty &= ~flushed;
2853 }
2854
2855 static void
2856 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
2857 {
2858 const struct vk_dynamic_graphics_state *dyn =
2859 &cmd_buffer->vk.dynamic_graphics_state;
2860
2861 if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
2862 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
2863 #if GFX_VER <= 7
2864 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
2865 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
2866 #endif
2867 !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
2868 return;
2869
2870 /* Take dynamic primitive topology into account with
2871 * 3DSTATE_CLIP::ViewportXYClipTestEnable
2872 */
2873 VkPolygonMode dynamic_raster_mode =
2874 genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
2875 dyn->ia.primitive_topology);
2876 bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
2877
2878 struct GENX(3DSTATE_CLIP) clip = {
2879 GENX(3DSTATE_CLIP_header),
2880 #if GFX_VER <= 7
2881 .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
2882 .CullMode = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
2883 #endif
2884 .ViewportXYClipTestEnable = xy_clip_test_enable,
2885 };
2886 uint32_t dwords[GENX(3DSTATE_CLIP_length)];
2887
2888 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2889 if (anv_pipeline_is_primitive(pipeline)) {
2890 const struct elk_vue_prog_data *last =
2891 anv_pipeline_get_last_vue_prog_data(pipeline);
2892 if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2893 clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
2894 dyn->vp.viewport_count - 1 : 0;
2895 }
2896 }
2897
2898 GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
2899 anv_batch_emit_merge(&cmd_buffer->batch, dwords,
2900 pipeline->gfx7.clip);
2901 }
2902
2903 static void
2904 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
2905 {
2906 struct anv_instance *instance = cmd_buffer->device->physical->instance;
2907 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2908 const struct vk_dynamic_graphics_state *dyn =
2909 &cmd_buffer->vk.dynamic_graphics_state;
2910 uint32_t count = dyn->vp.viewport_count;
2911 const VkViewport *viewports = dyn->vp.viewports;
2912 struct anv_state sf_clip_state =
2913 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
2914
2915 bool negative_one_to_one =
2916 cmd_buffer->state.gfx.pipeline->negative_one_to_one;
2917
2918 float scale = negative_one_to_one ? 0.5f : 1.0f;
2919
2920 for (uint32_t i = 0; i < count; i++) {
2921 const VkViewport *vp = &viewports[i];
2922
2923 /* The gfx7 state struct has just the matrix and guardband fields, the
2924 * gfx8 struct adds the min/max viewport fields. */
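/* The viewport matrix maps NDC to window coordinates:
 *    x_window = m00 * x_ndc + m30
 *    y_window = m11 * y_ndc + m31
 *    z_window = m22 * z_ndc + m32
 * With a [-1, 1] depth range (negative_one_to_one), scale = 0.5 folds
 * the extra factor of one half into m22 and m32.
 */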
2925 struct GENX(SF_CLIP_VIEWPORT) sfv = {
2926 .ViewportMatrixElementm00 = vp->width / 2,
2927 .ViewportMatrixElementm11 = vp->height / 2,
2928 .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
2929 .ViewportMatrixElementm30 = vp->x + vp->width / 2,
2930 .ViewportMatrixElementm31 = vp->y + vp->height / 2,
2931 .ViewportMatrixElementm32 = negative_one_to_one ?
2932 (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
2933 .XMinClipGuardband = -1.0f,
2934 .XMaxClipGuardband = 1.0f,
2935 .YMinClipGuardband = -1.0f,
2936 .YMaxClipGuardband = 1.0f,
2937 #if GFX_VER >= 8
2938 .XMinViewPort = vp->x,
2939 .XMaxViewPort = vp->x + vp->width - 1,
2940 .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
2941 .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
2942 #endif
2943 };
2944
2945 /* Fix depth test misrenderings by lowering translated depth range */
2946 if (instance->lower_depth_range_rate != 1.0f)
2947 sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
2948
2949 const uint32_t fb_size_max = 1 << 14;
2950 uint32_t x_min = 0, x_max = fb_size_max;
2951 uint32_t y_min = 0, y_max = fb_size_max;
2952
2953 /* If we have a valid renderArea, include that */
2954 if (gfx->render_area.extent.width > 0 &&
2955 gfx->render_area.extent.height > 0) {
2956 x_min = MAX2(x_min, gfx->render_area.offset.x);
2957 x_max = MIN2(x_max, gfx->render_area.offset.x +
2958 gfx->render_area.extent.width);
2959 y_min = MAX2(y_min, gfx->render_area.offset.y);
2960 y_max = MIN2(y_max, gfx->render_area.offset.y +
2961 gfx->render_area.extent.height);
2962 }
2963
2964 /* The client is required to have enough scissors for whatever it sets
2965 * as ViewportIndex but it's possible that they've got more viewports
2966 * set from a previous command. Also, from the Vulkan 1.3.207 spec:
2967 *
2968 * "The application must ensure (using scissor if necessary) that
2969 * all rendering is contained within the render area."
2970 *
2971 * If the client doesn't set a scissor, that basically means it
2972 * guarantees everything is in-bounds already. If we end up using a
2973 * guardband of [-1, 1] in that case, there shouldn't be much loss.
2974 * It's theoretically possible that they could do all their clipping
2975 * with clip planes but that'd be a bit odd.
2976 */
2977 if (i < dyn->vp.scissor_count) {
2978 const VkRect2D *scissor = &dyn->vp.scissors[i];
2979 x_min = MAX2(x_min, scissor->offset.x);
2980 x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
2981 y_min = MAX2(y_min, scissor->offset.y);
2982 y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
2983 }
2984
2985 /* Only bother calculating the guardband if our known render area is
2986 * less than the maximum size. Otherwise, it will calculate [-1, 1]
2987 * anyway but possibly with precision loss.
2988 */
2989 if (x_min > 0 || x_max < fb_size_max ||
2990 y_min > 0 || y_max < fb_size_max) {
2991 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
2992 sfv.ViewportMatrixElementm00,
2993 sfv.ViewportMatrixElementm11,
2994 sfv.ViewportMatrixElementm30,
2995 sfv.ViewportMatrixElementm31,
2996 &sfv.XMinClipGuardband,
2997 &sfv.XMaxClipGuardband,
2998 &sfv.YMinClipGuardband,
2999 &sfv.YMaxClipGuardband);
3000 }
3001
3002 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3003 }
3004
3005 anv_batch_emit(&cmd_buffer->batch,
3006 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3007 clip.SFClipViewportPointer = sf_clip_state.offset;
3008 }
3009 }
3010
3011 static void
3012 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3013 bool depth_clamp_enable)
3014 {
3015 const struct vk_dynamic_graphics_state *dyn =
3016 &cmd_buffer->vk.dynamic_graphics_state;
3017 uint32_t count = dyn->vp.viewport_count;
3018 const VkViewport *viewports = dyn->vp.viewports;
3019 struct anv_state cc_state =
3020 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3021
3022 for (uint32_t i = 0; i < count; i++) {
3023 const VkViewport *vp = &viewports[i];
3024
3025 /* From the Vulkan spec:
3026 *
3027 * "It is valid for minDepth to be greater than or equal to
3028 * maxDepth."
3029 */
3030 float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3031 float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3032
3033 struct GENX(CC_VIEWPORT) cc_viewport = {
3034 .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3035 .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3036 };
3037
3038 GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3039 }
3040
3041 anv_batch_emit(&cmd_buffer->batch,
3042 GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3043 cc.CCViewportPointer = cc_state.offset;
3044 }
3045 }
3046
3047 static void
3048 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3049 {
3050 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3051 const struct vk_dynamic_graphics_state *dyn =
3052 &cmd_buffer->vk.dynamic_graphics_state;
3053 uint32_t count = dyn->vp.scissor_count;
3054 const VkRect2D *scissors = dyn->vp.scissors;
3055 const VkViewport *viewports = dyn->vp.viewports;
3056
3057 /* Wa_1409725701:
3058 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3059 * stored as an array of up to 16 elements. The location of first
3060 * element of the array, as specified by Pointer to SCISSOR_RECT, should
3061 * be aligned to a 64-byte boundary."
3062 */
3063 uint32_t alignment = 64;
3064 struct anv_state scissor_state =
3065 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3066
3067 for (uint32_t i = 0; i < count; i++) {
3068 const VkRect2D *s = &scissors[i];
3069 const VkViewport *vp = &viewports[i];
3070
3071 /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3072 * ymax < ymin for empty clips. In case clip x, y, width, height are all
3073 * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3074 * what we want. Just special case empty clips and produce a canonical
3075 * empty clip. */
3076 static const struct GENX(SCISSOR_RECT) empty_scissor = {
3077 .ScissorRectangleYMin = 1,
3078 .ScissorRectangleXMin = 1,
3079 .ScissorRectangleYMax = 0,
3080 .ScissorRectangleXMax = 0
3081 };
3082
3083 const int max = 0xffff;
3084
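/* Taking MIN2/MAX2 of vp->y and vp->y + vp->height handles negative
 * viewport heights (flipped viewports), which are legal since
 * VK_KHR_maintenance1.
 */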
3085 uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3086 uint32_t x_min = MAX2(s->offset.x, vp->x);
3087 int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3088 MAX2(vp->y, vp->y + vp->height) - 1);
3089 int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3090 vp->x + vp->width - 1);
3091
3092 y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
3093 x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
3094
3095 /* Do this math using int64_t so overflow gets clamped correctly. */
3096 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3097 y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
3098 x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
3099 y_max = CLAMP((uint64_t) y_max, 0,
3100 gfx->render_area.offset.y +
3101 gfx->render_area.extent.height - 1);
3102 x_max = CLAMP((uint64_t) x_max, 0,
3103 gfx->render_area.offset.x +
3104 gfx->render_area.extent.width - 1);
3105 }
3106
3107 struct GENX(SCISSOR_RECT) scissor = {
3108 .ScissorRectangleYMin = y_min,
3109 .ScissorRectangleXMin = x_min,
3110 .ScissorRectangleYMax = y_max,
3111 .ScissorRectangleXMax = x_max
3112 };
3113
3114 if (s->extent.width <= 0 || s->extent.height <= 0) {
3115 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3116 &empty_scissor);
3117 } else {
3118 GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3119 }
3120 }
3121
3122 anv_batch_emit(&cmd_buffer->batch,
3123 GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3124 ssp.ScissorRectPointer = scissor_state.offset;
3125 }
3126 }
3127
3128 static void
3129 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3130 {
3131 const struct vk_dynamic_graphics_state *dyn =
3132 &cmd_buffer->vk.dynamic_graphics_state;
3133 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3134
3135 #if GFX_VER == 7
3136 # define streamout_state_dw pipeline->gfx7.streamout_state
3137 #else
3138 # define streamout_state_dw pipeline->gfx8.streamout_state
3139 #endif
3140
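/* 3DSTATE_STREAMOUT is mostly baked into the pipeline; only the dynamic
 * rasterizer-discard bit is packed here and OR-ed in with
 * anv_batch_emit_merge().
 */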
3141 uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3142
3143 struct GENX(3DSTATE_STREAMOUT) so = {
3144 GENX(3DSTATE_STREAMOUT_header),
3145 .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3146 };
3147 GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3148 anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3149 }
3150
3151 ALWAYS_INLINE static void
3152 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
3153 {
3154 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3155 const struct vk_dynamic_graphics_state *dyn =
3156 &cmd_buffer->vk.dynamic_graphics_state;
3157 uint32_t *p;
3158
3159 assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3160
3161 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3162
3163 genX(flush_pipeline_select_3d)(cmd_buffer);
3164
3165 /* Apply any pending pipeline flushes we may have. We want to apply them
3166 * now because, if any of those flushes are for things like push constants,
3167 * the GPU will read the state at weird times.
3168 */
3169 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3170
3171 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3172 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3173 vb_emit |= pipeline->vb_used;
3174
3175 if (vb_emit) {
3176 const uint32_t num_buffers = __builtin_popcount(vb_emit);
3177 const uint32_t num_dwords = 1 + num_buffers * 4;
3178
3179 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3180 GENX(3DSTATE_VERTEX_BUFFERS));
3181 uint32_t i = 0;
3182 u_foreach_bit(vb, vb_emit) {
3183 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3184 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3185
3186 struct GENX(VERTEX_BUFFER_STATE) state;
3187 if (buffer) {
3188 uint32_t stride = dyn->vi_binding_strides[vb];
3189 UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3190
3191 #if GFX_VER <= 7
3192 bool per_instance = pipeline->vb[vb].instanced;
3193 uint32_t divisor = pipeline->vb[vb].instance_divisor *
3194 pipeline->instance_multiplier;
3195 #endif
3196
3197 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3198 .VertexBufferIndex = vb,
3199
3200 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3201 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3202 #if GFX_VER <= 7
3203 .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3204 .InstanceDataStepRate = per_instance ? divisor : 1,
3205 #endif
3206 .AddressModifyEnable = true,
3207 .BufferPitch = stride,
3208 .BufferStartingAddress = anv_address_add(buffer->address, offset),
3209 .NullVertexBuffer = offset >= buffer->vk.size,
3210
3211 #if GFX_VER >= 8
3212 .BufferSize = size,
3213 #else
3214 /* XXX: to handle dynamic offset for older gens we might want
3215 * to modify Endaddress, but there are issues when doing so:
3216 *
3217 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3218 */
3219 .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3220 #endif
3221 };
3222 } else {
3223 state = (struct GENX(VERTEX_BUFFER_STATE)) {
3224 .VertexBufferIndex = vb,
3225 .NullVertexBuffer = true,
3226 .MOCS = anv_mocs(cmd_buffer->device, NULL,
3227 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3228 };
3229 }
3230
3231 #if GFX_VER == 8
3232 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3233 state.BufferStartingAddress,
3234 state.BufferSize);
3235 #endif
3236
3237 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3238 i++;
3239 }
3240 }
3241
3242 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3243
3244 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3245 pipeline->active_stages;
3246 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3247 !vk_dynamic_graphics_state_any_dirty(dyn) &&
3248 !cmd_buffer->state.push_constants_dirty)
3249 return;
3250
3251 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3252 (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3253 ANV_CMD_DIRTY_PIPELINE))) {
3254 /* Wa_16011411144:
3255 *
3256 * SW must insert a PIPE_CONTROL cmd before and after the
3257 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3258 * state is not combined with other state changes.
3259 */
3260 if (intel_device_info_is_dg2(cmd_buffer->device->info)) {
3261 anv_add_pending_pipe_bits(cmd_buffer,
3262 ANV_PIPE_CS_STALL_BIT,
3263 "before SO_BUFFER change WA");
3264 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3265 }
3266
3267 /* We don't need any per-buffer dirty tracking because you're not
3268 * allowed to bind different XFB buffers while XFB is enabled.
3269 */
3270 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3271 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3272 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3273 sob.SOBufferIndex = idx;
3274
3275 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3276 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
3277 ISL_SURF_USAGE_STREAM_OUT_BIT);
3278 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3279 xfb->offset);
3280 #if GFX_VER >= 8
3281 sob.SOBufferEnable = true;
3282 sob.StreamOffsetWriteEnable = false;
3283 /* Size is in DWords - 1 */
3284 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3285 #else
3286 /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3287 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3288 * default for an empty SO_BUFFER packet) to disable them.
3289 */
3290 sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3291 sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3292 xfb->offset + xfb->size);
3293 #endif
3294 } else {
3295 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3296 }
3297 }
3298 }
3299 }
3300
3301 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3302 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3303
3304 /* If the pipeline changed, we may need to re-allocate push constant
3305 * space in the URB.
3306 */
3307 cmd_buffer_alloc_push_constants(cmd_buffer);
3308 }
3309
3310 #if GFX_VER <= 7
3311 if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3312 cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3313 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3314 *
3315 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3316 * stall needs to be sent just prior to any 3DSTATE_VS,
3317 * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3318 * 3DSTATE_BINDING_TABLE_POINTER_VS,
3319 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
3320 * PIPE_CONTROL needs to be sent before any combination of VS
3321 * associated 3DSTATE."
3322 */
3323 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3324 pc.DepthStallEnable = true;
3325 pc.PostSyncOperation = WriteImmediateData;
3326 pc.Address = cmd_buffer->device->workaround_address;
3327 anv_debug_dump_pc(pc);
3328 }
3329 }
3330 #endif
3331
3332 /* Render targets live in the same binding table as fragment descriptors */
3333 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3334 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3335
3336 /* We emit the binding tables and sampler tables first, then emit push
3337 * constants and then finally emit binding table and sampler table
3338 * pointers. It has to happen in this order, since emitting the binding
3339 * tables may change the push constants (in case of storage images). After
3340 * emitting push constants, on SKL+ we have to emit the corresponding
3341 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
3342 */
3343 uint32_t dirty = 0;
3344 if (descriptors_dirty) {
3345 dirty = flush_descriptor_sets(cmd_buffer,
3346 &cmd_buffer->state.gfx.base,
3347 descriptors_dirty,
3348 pipeline->shaders,
3349 ARRAY_SIZE(pipeline->shaders));
3350 cmd_buffer->state.descriptors_dirty &= ~dirty;
3351 }
3352
3353 if (dirty || cmd_buffer->state.push_constants_dirty) {
3354 /* Because we're pushing UBOs, we have to push whenever either
3355 * descriptors or push constants is dirty.
3356 */
3357 dirty |= cmd_buffer->state.push_constants_dirty;
3358 cmd_buffer_flush_push_constants(cmd_buffer,
3359 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3360 }
3361
3362 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
3363 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
3364 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3365 }
3366
3367 cmd_buffer_emit_clip(cmd_buffer);
3368
3369 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3370 ANV_CMD_DIRTY_XFB_ENABLE)) ||
3371 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
3372 cmd_buffer_emit_streamout(cmd_buffer);
3373
3374 if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3375 ANV_CMD_DIRTY_RENDER_TARGETS)) ||
3376 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
3377 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
3378 cmd_buffer_emit_viewport(cmd_buffer);
3379 cmd_buffer_emit_depth_viewport(cmd_buffer,
3380 pipeline->depth_clamp_enable);
3381 cmd_buffer_emit_scissor(cmd_buffer);
3382 }
3383
3384 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
3385 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
3386 uint32_t topology;
3387 if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
3388 topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
3389 else
3390 topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
3391
3392 cmd_buffer->state.gfx.primitive_topology = topology;
3393
3394 #if (GFX_VER >= 8)
3395 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
3396 vft.PrimitiveTopologyType = topology;
3397 }
3398 #endif
3399 }
3400
3401 genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
3402 }
3403
3404 static void
3405 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
3406 struct anv_address addr,
3407 uint32_t size, uint32_t index)
3408 {
3409 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
3410 GENX(3DSTATE_VERTEX_BUFFERS));
3411
3412 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
3413 &(struct GENX(VERTEX_BUFFER_STATE)) {
3414 .VertexBufferIndex = index,
3415 .AddressModifyEnable = true,
3416 .BufferPitch = 0,
3417 .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
3418 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3419 .NullVertexBuffer = size == 0,
3420 #if (GFX_VER >= 8)
3421 .BufferStartingAddress = addr,
3422 .BufferSize = size
3423 #else
3424 .BufferStartingAddress = addr,
3425 .EndAddress = anv_address_add(addr, size),
3426 #endif
3427 });
3428
3429 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
3430 index, addr, size);
3431 }
3432
3433 static void
3434 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
3435 struct anv_address addr)
3436 {
3437 emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
3438 }
3439
3440 static void
3441 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
3442 uint32_t base_vertex, uint32_t base_instance)
3443 {
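/* gl_BaseVertex/gl_BaseInstance are fetched as vertex attributes from a
 * small driver-owned buffer bound at ANV_SVGS_VB_INDEX; when both values
 * are zero a null vertex buffer is bound instead.
 */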
3444 if (base_vertex == 0 && base_instance == 0) {
3445 emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
3446 } else {
3447 struct anv_state id_state =
3448 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
3449
3450 ((uint32_t *)id_state.map)[0] = base_vertex;
3451 ((uint32_t *)id_state.map)[1] = base_instance;
3452
3453 struct anv_address addr = {
3454 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3455 .offset = id_state.offset,
3456 };
3457
3458 emit_base_vertex_instance_bo(cmd_buffer, addr);
3459 }
3460 }
3461
3462 static void
3463 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
3464 {
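/* gl_DrawID is delivered the same way, from a single-dword buffer bound
 * at ANV_DRAWID_VB_INDEX.
 */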
3465 struct anv_state state =
3466 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
3467
3468 ((uint32_t *)state.map)[0] = draw_index;
3469
3470 struct anv_address addr = {
3471 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3472 .offset = state.offset,
3473 };
3474
3475 emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
3476 }
3477
3478 static void
3479 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
3480 uint32_t access_type)
3481 {
3482 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3483 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3484
3485 uint64_t vb_used = pipeline->vb_used;
3486 if (vs_prog_data->uses_firstvertex ||
3487 vs_prog_data->uses_baseinstance)
3488 vb_used |= 1ull << ANV_SVGS_VB_INDEX;
3489 if (vs_prog_data->uses_drawid)
3490 vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
3491
3492 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
3493 access_type == RANDOM,
3494 vb_used);
3495 }
3496
3497 ALWAYS_INLINE static void
3498 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
3499 const struct elk_vs_prog_data *vs_prog_data,
3500 uint32_t base_vertex,
3501 uint32_t base_instance,
3502 uint32_t draw_id,
3503 bool force_flush)
3504 {
3505 bool emitted = false;
3506 if (vs_prog_data->uses_firstvertex ||
3507 vs_prog_data->uses_baseinstance) {
3508 emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
3509 emitted = true;
3510 }
3511 if (vs_prog_data->uses_drawid) {
3512 emit_draw_index(cmd_buffer, draw_id);
3513 emitted = true;
3514 }
3515 /* Emitting draw index or vertex index BOs may result in needing
3516 * additional VF cache flushes.
3517 */
3518 if (emitted || force_flush)
3519 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3520 }
3521
3522 void genX(CmdDraw)(
3523 VkCommandBuffer commandBuffer,
3524 uint32_t vertexCount,
3525 uint32_t instanceCount,
3526 uint32_t firstVertex,
3527 uint32_t firstInstance)
3528 {
3529 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3530 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3531 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3532
3533 if (anv_batch_has_error(&cmd_buffer->batch))
3534 return;
3535
3536 const uint32_t count =
3537 vertexCount * instanceCount * pipeline->instance_multiplier;
3538 anv_measure_snapshot(cmd_buffer,
3539 INTEL_SNAPSHOT_DRAW,
3540 "draw", count);
3541 trace_intel_begin_draw(&cmd_buffer->trace);
3542
3543 /* Select pipeline here to allow
3544 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3545 * cmd_buffer_flush_gfx_state().
3546 */
3547 genX(flush_pipeline_select_3d)(cmd_buffer);
3548
3549 if (cmd_buffer->state.conditional_render_enabled)
3550 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3551
3552 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3553 firstVertex, firstInstance, 0,
3554 false /* force_flush */);
3555
3556 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3557
3558 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3559 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3560 prim.VertexAccessType = SEQUENTIAL;
3561 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3562 prim.VertexCountPerInstance = vertexCount;
3563 prim.StartVertexLocation = firstVertex;
3564 prim.InstanceCount = instanceCount *
3565 pipeline->instance_multiplier;
3566 prim.StartInstanceLocation = firstInstance;
3567 prim.BaseVertexLocation = 0;
3568 }
3569
3570 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3571
3572 trace_intel_end_draw(&cmd_buffer->trace, count);
3573 }
3574
3575 void genX(CmdDrawMultiEXT)(
3576 VkCommandBuffer commandBuffer,
3577 uint32_t drawCount,
3578 const VkMultiDrawInfoEXT *pVertexInfo,
3579 uint32_t instanceCount,
3580 uint32_t firstInstance,
3581 uint32_t stride)
3582 {
3583 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3584 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3585 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3586
3587 if (anv_batch_has_error(&cmd_buffer->batch))
3588 return;
3589
3590 const uint32_t count =
3591 drawCount * instanceCount * pipeline->instance_multiplier;
3592 anv_measure_snapshot(cmd_buffer,
3593 INTEL_SNAPSHOT_DRAW,
3594 "draw_multi", count);
3595 trace_intel_begin_draw_multi(&cmd_buffer->trace);
3596
3597 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3598
3599 if (cmd_buffer->state.conditional_render_enabled)
3600 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3601
3602 uint32_t i = 0;
3603 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3604 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3605 draw->firstVertex,
3606 firstInstance, i, !i);
3607
3608 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3609 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3610 prim.VertexAccessType = SEQUENTIAL;
3611 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3612 prim.VertexCountPerInstance = draw->vertexCount;
3613 prim.StartVertexLocation = draw->firstVertex;
3614 prim.InstanceCount = instanceCount *
3615 pipeline->instance_multiplier;
3616 prim.StartInstanceLocation = firstInstance;
3617 prim.BaseVertexLocation = 0;
3618 }
3619 }
3620
3621 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3622
3623 trace_intel_end_draw_multi(&cmd_buffer->trace, count);
3624 }
3625
3626 void genX(CmdDrawIndexed)(
3627 VkCommandBuffer commandBuffer,
3628 uint32_t indexCount,
3629 uint32_t instanceCount,
3630 uint32_t firstIndex,
3631 int32_t vertexOffset,
3632 uint32_t firstInstance)
3633 {
3634 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3635 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3636 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3637
3638 if (anv_batch_has_error(&cmd_buffer->batch))
3639 return;
3640
3641 const uint32_t count =
3642 indexCount * instanceCount * pipeline->instance_multiplier;
3643 anv_measure_snapshot(cmd_buffer,
3644 INTEL_SNAPSHOT_DRAW,
3645 "draw indexed",
3646 count);
3647 trace_intel_begin_draw_indexed(&cmd_buffer->trace);
3648
3649 /* Select pipeline here to allow
3650 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3651 * cmd_buffer_flush_gfx_state().
3652 */
3653 genX(flush_pipeline_select_3d)(cmd_buffer);
3654
3655 if (cmd_buffer->state.conditional_render_enabled)
3656 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3657
3658 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3659 vertexOffset, firstInstance,
3660 0, false /* force_flush */);
3661
3662 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3663
3664 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3665 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3666 prim.VertexAccessType = RANDOM;
3667 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3668 prim.VertexCountPerInstance = indexCount;
3669 prim.StartVertexLocation = firstIndex;
3670 prim.InstanceCount = instanceCount *
3671 pipeline->instance_multiplier;
3672 prim.StartInstanceLocation = firstInstance;
3673 prim.BaseVertexLocation = vertexOffset;
3674 }
3675
3676 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3677
3678 trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
3679 }
3680
3681 void genX(CmdDrawMultiIndexedEXT)(
3682 VkCommandBuffer commandBuffer,
3683 uint32_t drawCount,
3684 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3685 uint32_t instanceCount,
3686 uint32_t firstInstance,
3687 uint32_t stride,
3688 const int32_t *pVertexOffset)
3689 {
3690 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3691 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3692 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3693
3694 if (anv_batch_has_error(&cmd_buffer->batch))
3695 return;
3696
3697 const uint32_t count =
3698 drawCount * instanceCount * pipeline->instance_multiplier;
3699 anv_measure_snapshot(cmd_buffer,
3700 INTEL_SNAPSHOT_DRAW,
3701 "draw indexed_multi",
3702 count);
3703 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
3704
3705 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3706
3707 if (cmd_buffer->state.conditional_render_enabled)
3708 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3709
3710 uint32_t i = 0;
3711 if (pVertexOffset) {
3712 if (vs_prog_data->uses_drawid) {
3713 bool emitted = true;
3714 if (vs_prog_data->uses_firstvertex ||
3715 vs_prog_data->uses_baseinstance) {
3716 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3717 emitted = true;
3718 }
3719 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3720 if (vs_prog_data->uses_drawid) {
3721 emit_draw_index(cmd_buffer, i);
3722 emitted = true;
3723 }
3724 /* Emitting draw index or vertex index BOs may result in needing
3725 * additional VF cache flushes.
3726 */
3727 if (emitted)
3728 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3729
3730 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3731 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3732 prim.VertexAccessType = RANDOM;
3733 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3734 prim.VertexCountPerInstance = draw->indexCount;
3735 prim.StartVertexLocation = draw->firstIndex;
3736 prim.InstanceCount = instanceCount *
3737 pipeline->instance_multiplier;
3738 prim.StartInstanceLocation = firstInstance;
3739 prim.BaseVertexLocation = *pVertexOffset;
3740 }
3741 emitted = false;
3742 }
3743 } else {
3744 if (vs_prog_data->uses_firstvertex ||
3745 vs_prog_data->uses_baseinstance) {
3746 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3747 /* Emitting draw index or vertex index BOs may result in needing
3748 * additional VF cache flushes.
3749 */
3750 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3751 }
3752 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3753 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3754 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3755 prim.VertexAccessType = RANDOM;
3756 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3757 prim.VertexCountPerInstance = draw->indexCount;
3758 prim.StartVertexLocation = draw->firstIndex;
3759 prim.InstanceCount = instanceCount *
3760 pipeline->instance_multiplier;
3761 prim.StartInstanceLocation = firstInstance;
3762 prim.BaseVertexLocation = *pVertexOffset;
3763 }
3764 }
3765 }
3766 } else {
3767 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3768 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3769 draw->vertexOffset,
3770 firstInstance, i, i != 0);
3771
3772 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3773 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3774 prim.VertexAccessType = RANDOM;
3775 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3776 prim.VertexCountPerInstance = draw->indexCount;
3777 prim.StartVertexLocation = draw->firstIndex;
3778 prim.InstanceCount = instanceCount *
3779 pipeline->instance_multiplier;
3780 prim.StartInstanceLocation = firstInstance;
3781 prim.BaseVertexLocation = draw->vertexOffset;
3782 }
3783 }
3784 }
3785
3786 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3787
3788 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
3789 }
3790
3791 /* Auto-Draw / Indirect Registers */
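/* 3DPRIMITIVE reads these registers when IndirectParameterEnable is set;
 * the MI commands in the indirect draw paths below load them from the
 * application's indirect buffer.
 */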
3792 #define GFX7_3DPRIM_END_OFFSET 0x2420
3793 #define GFX7_3DPRIM_START_VERTEX 0x2430
3794 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
3795 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
3796 #define GFX7_3DPRIM_START_INSTANCE 0x243C
3797 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
3798
3799 void genX(CmdDrawIndirectByteCountEXT)(
3800 VkCommandBuffer commandBuffer,
3801 uint32_t instanceCount,
3802 uint32_t firstInstance,
3803 VkBuffer counterBuffer,
3804 VkDeviceSize counterBufferOffset,
3805 uint32_t counterOffset,
3806 uint32_t vertexStride)
3807 {
3808 #if GFX_VERx10 >= 75
3809 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3810 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
3811 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3812 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3813
3814 /* firstVertex is always zero for this draw function */
3815 const uint32_t firstVertex = 0;
3816
3817 if (anv_batch_has_error(&cmd_buffer->batch))
3818 return;
3819
3820 anv_measure_snapshot(cmd_buffer,
3821 INTEL_SNAPSHOT_DRAW,
3822 "draw indirect byte count",
3823 instanceCount * pipeline->instance_multiplier);
3824 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
3825
3826 /* Select pipeline here to allow
3827 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3828 * emit_base_vertex_instance() & emit_draw_index().
3829 */
3830 genX(flush_pipeline_select_3d)(cmd_buffer);
3831
3832 if (cmd_buffer->state.conditional_render_enabled)
3833 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3834
3835 if (vs_prog_data->uses_firstvertex ||
3836 vs_prog_data->uses_baseinstance)
3837 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
3838 if (vs_prog_data->uses_drawid)
3839 emit_draw_index(cmd_buffer, 0);
3840
3841 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3842
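/* The vertex count is only known to the hardware, so compute
 * (counter - counterOffset) / vertexStride on the GPU and store the
 * result straight into the 3DPRIM vertex count register.
 */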
3843 struct mi_builder b;
3844 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3845 struct mi_value count =
3846 mi_mem32(anv_address_add(counter_buffer->address,
3847 counterBufferOffset));
3848 if (counterOffset)
3849 count = mi_isub(&b, count, mi_imm(counterOffset));
3850 count = mi_udiv32_imm(&b, count, vertexStride);
3851 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
3852
3853 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
3854 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
3855 mi_imm(instanceCount * pipeline->instance_multiplier));
3856 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
3857 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3858
3859 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3860 prim.IndirectParameterEnable = true;
3861 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3862 prim.VertexAccessType = SEQUENTIAL;
3863 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3864 }
3865
3866 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3867
3868 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
3869 instanceCount * pipeline->instance_multiplier);
3870 #endif /* GFX_VERx10 >= 75 */
3871 }
3872
3873 static void
3874 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
3875 struct anv_address addr,
3876 bool indexed)
3877 {
3878 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3879
3880 struct mi_builder b;
3881 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3882
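/* The indirect buffer is laid out as VkDrawIndirectCommand /
 * VkDrawIndexedIndirectCommand:
 *    +0  vertexCount / indexCount
 *    +4  instanceCount
 *    +8  firstVertex / firstIndex
 *    +12 firstInstance / vertexOffset
 *    +16 firstInstance (indexed only)
 */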
3883 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
3884 mi_mem32(anv_address_add(addr, 0)));
3885
3886 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
3887 if (pipeline->instance_multiplier > 1) {
3888 #if GFX_VERx10 >= 75
3889 instance_count = mi_imul_imm(&b, instance_count,
3890 pipeline->instance_multiplier);
3891 #else
3892 anv_finishme("Multiview + indirect draw requires MI_MATH; "
3893 "MI_MATH is not supported on Ivy Bridge");
3894 #endif
3895 }
3896 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
3897
3898 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
3899 mi_mem32(anv_address_add(addr, 8)));
3900
3901 if (indexed) {
3902 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
3903 mi_mem32(anv_address_add(addr, 12)));
3904 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3905 mi_mem32(anv_address_add(addr, 16)));
3906 } else {
3907 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3908 mi_mem32(anv_address_add(addr, 12)));
3909 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3910 }
3911 }
3912
3913 void genX(CmdDrawIndirect)(
3914 VkCommandBuffer commandBuffer,
3915 VkBuffer _buffer,
3916 VkDeviceSize offset,
3917 uint32_t drawCount,
3918 uint32_t stride)
3919 {
3920 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3921 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3922 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3923 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3924
3925 if (anv_batch_has_error(&cmd_buffer->batch))
3926 return;
3927
3928 anv_measure_snapshot(cmd_buffer,
3929 INTEL_SNAPSHOT_DRAW,
3930 "draw indirect",
3931 drawCount);
3932 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
3933
3934 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3935
3936 if (cmd_buffer->state.conditional_render_enabled)
3937 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3938
3939 for (uint32_t i = 0; i < drawCount; i++) {
3940 struct anv_address draw = anv_address_add(buffer->address, offset);
3941
3942 if (vs_prog_data->uses_firstvertex ||
3943 vs_prog_data->uses_baseinstance)
3944 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
3945 if (vs_prog_data->uses_drawid)
3946 emit_draw_index(cmd_buffer, i);
3947
3948 /* Emitting draw index or vertex index BOs may result in needing
3949 * additional VF cache flushes.
3950 */
3951 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3952
3953 load_indirect_parameters(cmd_buffer, draw, false);
3954
3955 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3956 prim.IndirectParameterEnable = true;
3957 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
3958 prim.VertexAccessType = SEQUENTIAL;
3959 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
3960 }
3961
3962 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3963
3964 offset += stride;
3965 }
3966
3967 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
3968 }
3969
3970 void genX(CmdDrawIndexedIndirect)(
3971 VkCommandBuffer commandBuffer,
3972 VkBuffer _buffer,
3973 VkDeviceSize offset,
3974 uint32_t drawCount,
3975 uint32_t stride)
3976 {
3977 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3978 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3979 struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3980 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3981
3982 if (anv_batch_has_error(&cmd_buffer->batch))
3983 return;
3984
3985 anv_measure_snapshot(cmd_buffer,
3986 INTEL_SNAPSHOT_DRAW,
3987 "draw indexed indirect",
3988 drawCount);
3989 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
3990
3991 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3992
3993 if (cmd_buffer->state.conditional_render_enabled)
3994 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3995
3996 for (uint32_t i = 0; i < drawCount; i++) {
3997 struct anv_address draw = anv_address_add(buffer->address, offset);
3998
3999 /* TODO: We need to stomp base vertex to 0 somehow */
4000 if (vs_prog_data->uses_firstvertex ||
4001 vs_prog_data->uses_baseinstance)
4002 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4003 if (vs_prog_data->uses_drawid)
4004 emit_draw_index(cmd_buffer, i);
4005
4006 /* Emitting draw index or vertex index BOs may result in needing
4007 * additional VF cache flushes.
4008 */
4009 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4010
4011 load_indirect_parameters(cmd_buffer, draw, true);
4012
4013 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4014 prim.IndirectParameterEnable = true;
4015 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
4016 prim.VertexAccessType = RANDOM;
4017 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4018 }
4019
4020 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4021
4022 offset += stride;
4023 }
4024
4025 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4026 }
4027
4028 static struct mi_value
4029 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4030 struct mi_builder *b,
4031 struct anv_buffer *count_buffer,
4032 uint64_t countBufferOffset)
4033 {
4034 struct anv_address count_address =
4035 anv_address_add(count_buffer->address, countBufferOffset);
4036
4037 struct mi_value ret = mi_imm(0);
4038
4039 if (cmd_buffer->state.conditional_render_enabled) {
4040 #if GFX_VERx10 >= 75
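/* Keep the draw count in a GPR so that each draw can compare its index
 * against it (see emit_draw_count_predicate_cond).
 */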
4041 ret = mi_new_gpr(b);
4042 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4043 #endif
4044 } else {
4045 /* Upload the current draw count from the draw parameters buffer to
4046 * MI_PREDICATE_SRC0.
4047 */
4048 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
4049 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4050 }
4051
4052 return ret;
4053 }
4054
4055 static void
4056 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4057 struct mi_builder *b,
4058 uint32_t draw_index)
4059 {
4060 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4061 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4062
4063 if (draw_index == 0) {
4064 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4065 mip.LoadOperation = LOAD_LOADINV;
4066 mip.CombineOperation = COMBINE_SET;
4067 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4068 }
4069 } else {
4070 /* While draw_index < draw_count the predicate's result will be
4071 * (draw_index == draw_count) ^ TRUE = TRUE
4072 * When draw_index == draw_count the result is
4073 * (TRUE) ^ TRUE = FALSE
4074 * After this all results will be:
4075 * (FALSE) ^ FALSE = FALSE
4076 */
4077 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4078 mip.LoadOperation = LOAD_LOAD;
4079 mip.CombineOperation = COMBINE_XOR;
4080 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4081 }
4082 }
4083 }
4084
4085 #if GFX_VERx10 >= 75
4086 static void
4087 emit_draw_count_predicate_with_conditional_render(
4088 struct anv_cmd_buffer *cmd_buffer,
4089 struct mi_builder *b,
4090 uint32_t draw_index,
4091 struct mi_value max)
4092 {
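/* The predicate is true only when draw_index < max (the draw count read
 * from the count buffer) and the conditional rendering predicate stored
 * in ANV_PREDICATE_RESULT_REG is also true.
 */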
4093 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4094 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4095
4096 #if GFX_VER >= 8
4097 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4098 #else
4099 /* MI_PREDICATE_RESULT is not whitelisted in the i915 command parser,
4100 * so we emit MI_PREDICATE to set it instead.
4101 */
4102
4103 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4104 mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4105
4106 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4107 mip.LoadOperation = LOAD_LOADINV;
4108 mip.CombineOperation = COMBINE_SET;
4109 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4110 }
4111 #endif
4112 }
4113 #endif
4114
4115 static void
4116 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4117 struct mi_builder *b,
4118 uint32_t draw_index,
4119 struct mi_value max)
4120 {
4121 #if GFX_VERx10 >= 75
4122 if (cmd_buffer->state.conditional_render_enabled) {
4123 emit_draw_count_predicate_with_conditional_render(
4124 cmd_buffer, b, draw_index, mi_value_ref(b, max));
4125 } else {
4126 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4127 }
4128 #else
4129 emit_draw_count_predicate(cmd_buffer, b, draw_index);
4130 #endif
4131 }
4132
4133 void genX(CmdDrawIndirectCount)(
4134 VkCommandBuffer commandBuffer,
4135 VkBuffer _buffer,
4136 VkDeviceSize offset,
4137 VkBuffer _countBuffer,
4138 VkDeviceSize countBufferOffset,
4139 uint32_t maxDrawCount,
4140 uint32_t stride)
4141 {
4142 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4143 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4144 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4145 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4146 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4147 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4148
4149 if (anv_batch_has_error(&cmd_buffer->batch))
4150 return;
4151
4152 anv_measure_snapshot(cmd_buffer,
4153 INTEL_SNAPSHOT_DRAW,
4154 "draw indirect count",
4155 0);
4156 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4157
4158 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4159
4160 struct mi_builder b;
4161 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4162 struct mi_value max =
4163 prepare_for_draw_count_predicate(cmd_buffer, &b,
4164 count_buffer, countBufferOffset);
4165
4166 for (uint32_t i = 0; i < maxDrawCount; i++) {
4167 struct anv_address draw = anv_address_add(buffer->address, offset);
4168
4169 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4170
4171 if (vs_prog_data->uses_firstvertex ||
4172 vs_prog_data->uses_baseinstance)
4173 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4174 if (vs_prog_data->uses_drawid)
4175 emit_draw_index(cmd_buffer, i);
4176
4177 /* Emitting draw index or vertex index BOs may result in needing
4178 * additional VF cache flushes.
4179 */
4180 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4181
4182 load_indirect_parameters(cmd_buffer, draw, false);
4183
4184 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4185 prim.IndirectParameterEnable = true;
4186 prim.PredicateEnable = true;
4187 prim.VertexAccessType = SEQUENTIAL;
4188 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4189 }
4190
4191 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4192
4193 offset += stride;
4194 }
4195
4196 mi_value_unref(&b, max);
4197
4198 trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
4199 }
4200
4201 void genX(CmdDrawIndexedIndirectCount)(
4202 VkCommandBuffer commandBuffer,
4203 VkBuffer _buffer,
4204 VkDeviceSize offset,
4205 VkBuffer _countBuffer,
4206 VkDeviceSize countBufferOffset,
4207 uint32_t maxDrawCount,
4208 uint32_t stride)
4209 {
4210 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4211 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4212 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4213 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4214 struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4215 const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4216
4217 if (anv_batch_has_error(&cmd_buffer->batch))
4218 return;
4219
4220 anv_measure_snapshot(cmd_buffer,
4221 INTEL_SNAPSHOT_DRAW,
4222 "draw indexed indirect count",
4223 0);
4224 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4225
4226 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4227
4228 struct mi_builder b;
4229 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4230 struct mi_value max =
4231 prepare_for_draw_count_predicate(cmd_buffer, &b,
4232 count_buffer, countBufferOffset);
4233
4234 for (uint32_t i = 0; i < maxDrawCount; i++) {
4235 struct anv_address draw = anv_address_add(buffer->address, offset);
4236
4237 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4238
4239 /* TODO: We need to stomp base vertex to 0 somehow */
4240 if (vs_prog_data->uses_firstvertex ||
4241 vs_prog_data->uses_baseinstance)
4242 emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4243 if (vs_prog_data->uses_drawid)
4244 emit_draw_index(cmd_buffer, i);
4245
4246 /* Emitting draw index or vertex index BOs may result in needing
4247 * additional VF cache flushes.
4248 */
4249 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4250
4251 load_indirect_parameters(cmd_buffer, draw, true);
4252
4253 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4254 prim.IndirectParameterEnable = true;
4255 prim.PredicateEnable = true;
4256 prim.VertexAccessType = RANDOM;
4257 prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
4258 }
4259
4260 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4261
4262 offset += stride;
4263 }
4264
4265 mi_value_unref(&b, max);
4266
4267 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
4268
4269 }
4270
4271 void genX(CmdBeginTransformFeedbackEXT)(
4272 VkCommandBuffer commandBuffer,
4273 uint32_t firstCounterBuffer,
4274 uint32_t counterBufferCount,
4275 const VkBuffer* pCounterBuffers,
4276 const VkDeviceSize* pCounterBufferOffsets)
4277 {
4278 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4279
4280 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4281 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4282 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4283
4284 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4285 *
4286 * "Ssoftware must ensure that no HW stream output operations can be in
4287 * process or otherwise pending at the point that the MI_LOAD/STORE
4288 * commands are processed. This will likely require a pipeline flush."
4289 */
4290 anv_add_pending_pipe_bits(cmd_buffer,
4291 ANV_PIPE_CS_STALL_BIT,
4292 "begin transform feedback");
4293 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4294
4295 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4296 /* If we have a counter buffer, this is a resume so we need to load the
4297 * value into the streamout offset register. Otherwise, this is a begin
4298 * and we need to reset it to zero.
4299 */
4300 if (pCounterBuffers &&
4301 idx >= firstCounterBuffer &&
4302 idx - firstCounterBuffer < counterBufferCount &&
4303 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4304 uint32_t cb_idx = idx - firstCounterBuffer;
4305 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4306 uint64_t offset = pCounterBufferOffsets ?
4307 pCounterBufferOffsets[cb_idx] : 0;
4308
4309 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4310 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4311 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
4312 offset);
4313 }
4314 } else {
4315 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4316 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4317 lri.DataDWord = 0;
4318 }
4319 }
4320 }
4321
4322 cmd_buffer->state.xfb_enabled = true;
4323 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4324 }
4325
4326 void genX(CmdEndTransformFeedbackEXT)(
4327 VkCommandBuffer commandBuffer,
4328 uint32_t firstCounterBuffer,
4329 uint32_t counterBufferCount,
4330 const VkBuffer* pCounterBuffers,
4331 const VkDeviceSize* pCounterBufferOffsets)
4332 {
4333 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4334
4335 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4336 assert(counterBufferCount <= MAX_XFB_BUFFERS);
4337 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4338
4339 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4340 *
4341 * "Ssoftware must ensure that no HW stream output operations can be in
4342 * process or otherwise pending at the point that the MI_LOAD/STORE
4343 * commands are processed. This will likely require a pipeline flush."
4344 */
4345 anv_add_pending_pipe_bits(cmd_buffer,
4346 ANV_PIPE_CS_STALL_BIT,
4347 "end transform feedback");
4348 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4349
4350 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
4351 unsigned idx = firstCounterBuffer + cb_idx;
4352
4353 /* If we have a counter buffer, this is a pause so we need to store the
4354 * current streamout offset from the register into the counter buffer so
4355 * that a future resume can reload it. Otherwise, there is nothing to save.
4356 */
4357 if (pCounterBuffers &&
4358 cb_idx < counterBufferCount &&
4359 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
4360 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4361 uint64_t offset = pCounterBufferOffsets ?
4362 pCounterBufferOffsets[cb_idx] : 0;
4363
4364 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
4365 srm.MemoryAddress = anv_address_add(counter_buffer->address,
4366 offset);
4367 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4368 }
4369 }
4370 }
4371
4372 cmd_buffer->state.xfb_enabled = false;
4373 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4374 }
4375
4376 static void
4377 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
4378 {
4379 struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4380 struct anv_compute_pipeline *pipeline = comp_state->pipeline;
4381
4382 assert(pipeline->cs);
4383
4384 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
4385
4386 genX(flush_pipeline_select_gpgpu)(cmd_buffer);
4387
4388 /* Apply any pending pipeline flushes we may have. We want to apply them
4389 * now because, if any of those flushes are for things like push constants,
4390 * the GPU will read the state at weird times.
4391 */
4392 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4393
4394 if (cmd_buffer->state.compute.pipeline_dirty) {
4395 /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4396 *
4397 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4398 * the only bits that are changed are scoreboard related: Scoreboard
4399        *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
4400 * these scoreboard related states, a MEDIA_STATE_FLUSH is
4401 * sufficient."
4402 */
4403 anv_add_pending_pipe_bits(cmd_buffer,
4404 ANV_PIPE_CS_STALL_BIT,
4405 "flush compute state");
4406 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4407
4408 anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
4409
4410 /* The workgroup size of the pipeline affects our push constant layout
4411 * so flag push constants as dirty if we change the pipeline.
4412 */
4413 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4414 }
4415
4416 if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
4417 cmd_buffer->state.compute.pipeline_dirty) {
4418 flush_descriptor_sets(cmd_buffer,
4419 &cmd_buffer->state.compute.base,
4420 VK_SHADER_STAGE_COMPUTE_BIT,
4421 &pipeline->cs, 1);
4422 cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4423
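      /* The interface descriptor mixes state baked into the pipeline with the
       * per-command-buffer binding table and sampler state pointers, so it has
       * to be re-packed and re-loaded whenever either the pipeline or the
       * compute descriptors change.
       */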
4424 uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
4425 struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
4426 .BindingTablePointer =
4427 cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4428 .SamplerStatePointer =
4429 cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4430 };
4431 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
4432
4433 struct anv_state state =
4434 anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
4435 pipeline->interface_descriptor_data,
4436 GENX(INTERFACE_DESCRIPTOR_DATA_length),
4437 64);
4438
4439 uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4440 anv_batch_emit(&cmd_buffer->batch,
4441 GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
4442 mid.InterfaceDescriptorTotalLength = size;
4443 mid.InterfaceDescriptorDataStartAddress = state.offset;
4444 }
4445 }
4446
4447 if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
4448 comp_state->push_data =
4449 anv_cmd_buffer_cs_push_constants(cmd_buffer);
4450
4451 if (comp_state->push_data.alloc_size) {
4452 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
4453 curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
4454 curbe.CURBEDataStartAddress = comp_state->push_data.offset;
4455 }
4456 }
4457
4458 cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4459 }
4460
4461 cmd_buffer->state.compute.pipeline_dirty = false;
4462
4463 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4464 }
4465
4466 #if GFX_VER == 7
4467
4468 static VkResult
4469 verify_cmd_parser(const struct anv_device *device,
4470 int required_version,
4471 const char *function)
4472 {
4473 if (device->physical->cmd_parser_version < required_version) {
4474 return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
4475 "cmd parser version %d is required for %s",
4476 required_version, function);
4477 } else {
4478 return VK_SUCCESS;
4479 }
4480 }
4481
4482 #endif
4483
4484 static void
4485 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
4486 uint32_t baseGroupX,
4487 uint32_t baseGroupY,
4488 uint32_t baseGroupZ)
4489 {
4490 if (anv_batch_has_error(&cmd_buffer->batch))
4491 return;
4492
4493 struct anv_push_constants *push =
4494 &cmd_buffer->state.compute.base.push_constants;
4495 if (push->cs.base_work_group_id[0] != baseGroupX ||
4496 push->cs.base_work_group_id[1] != baseGroupY ||
4497 push->cs.base_work_group_id[2] != baseGroupZ) {
4498 push->cs.base_work_group_id[0] = baseGroupX;
4499 push->cs.base_work_group_id[1] = baseGroupY;
4500 push->cs.base_work_group_id[2] = baseGroupZ;
4501
4502 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4503 }
4504 }
4505
4506 static inline void
4507 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
4508 const struct anv_compute_pipeline *pipeline, bool indirect,
4509 const struct elk_cs_prog_data *prog_data,
4510 uint32_t groupCountX, uint32_t groupCountY,
4511 uint32_t groupCountZ)
4512 {
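   /* On Gfx7, indirect dispatches are predicated so the walker can be skipped
    * when any of the indirect group counts is zero (the predicate is built in
    * CmdDispatchIndirect below). Conditional rendering reuses the same
    * predicate mechanism.
    */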
4513 bool predicate = (GFX_VER <= 7 && indirect) ||
4514 cmd_buffer->state.conditional_render_enabled;
4515
4516 const struct intel_device_info *devinfo = pipeline->base.device->info;
4517 const struct intel_cs_dispatch_info dispatch =
4518 elk_cs_get_dispatch_info(devinfo, prog_data, NULL);
4519
4520 anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
4521 ggw.IndirectParameterEnable = indirect;
4522 ggw.PredicateEnable = predicate;
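      /* SIMDSize is encoded as simd_size / 16: SIMD8 = 0, SIMD16 = 1, SIMD32 = 2. */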
4523 ggw.SIMDSize = dispatch.simd_size / 16;
4524 ggw.ThreadDepthCounterMaximum = 0;
4525 ggw.ThreadHeightCounterMaximum = 0;
4526 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
4527 ggw.ThreadGroupIDXDimension = groupCountX;
4528 ggw.ThreadGroupIDYDimension = groupCountY;
4529 ggw.ThreadGroupIDZDimension = groupCountZ;
4530 ggw.RightExecutionMask = dispatch.right_mask;
4531 ggw.BottomExecutionMask = 0xffffffff;
4532 }
4533
4534 anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
4535 }
4536
4537 static inline void
4538 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
4539 const struct anv_compute_pipeline *pipeline, bool indirect,
4540 const struct elk_cs_prog_data *prog_data,
4541 uint32_t groupCountX, uint32_t groupCountY,
4542 uint32_t groupCountZ)
4543 {
4544 emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4545 groupCountY, groupCountZ);
4546 }
4547
4548 void genX(CmdDispatchBase)(
4549 VkCommandBuffer commandBuffer,
4550 uint32_t baseGroupX,
4551 uint32_t baseGroupY,
4552 uint32_t baseGroupZ,
4553 uint32_t groupCountX,
4554 uint32_t groupCountY,
4555 uint32_t groupCountZ)
4556 {
4557 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4558 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4559 const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4560
4561 anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
4562 baseGroupY, baseGroupZ);
4563
4564 if (anv_batch_has_error(&cmd_buffer->batch))
4565 return;
4566
4567 anv_measure_snapshot(cmd_buffer,
4568 INTEL_SNAPSHOT_COMPUTE,
4569 "compute",
4570 groupCountX * groupCountY * groupCountZ *
4571 prog_data->local_size[0] * prog_data->local_size[1] *
4572 prog_data->local_size[2]);
4573
4574 trace_intel_begin_compute(&cmd_buffer->trace);
4575
4576 if (prog_data->uses_num_work_groups) {
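      /* The shader reads gl_NumWorkGroups from a buffer, so stash the three
       * group counts in dynamic state and point num_workgroups at it.
       */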
4577 struct anv_state state =
4578 anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
4579 uint32_t *sizes = state.map;
4580 sizes[0] = groupCountX;
4581 sizes[1] = groupCountY;
4582 sizes[2] = groupCountZ;
4583 cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
4584 .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4585 .offset = state.offset,
4586 };
4587
4588 /* The num_workgroups buffer goes in the binding table */
4589 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4590 }
4591
4592 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4593
4594 if (cmd_buffer->state.conditional_render_enabled)
4595 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4596
4597 emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
4598 groupCountY, groupCountZ);
4599
4600 trace_intel_end_compute(&cmd_buffer->trace,
4601 groupCountX, groupCountY, groupCountZ);
4602 }
4603
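/* MMIO offsets of the GPGPU dispatch dimension registers. GPGPU_WALKER reads
 * its thread group counts from these when IndirectParameterEnable is set.
 */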
4604 #define GPGPU_DISPATCHDIMX 0x2500
4605 #define GPGPU_DISPATCHDIMY 0x2504
4606 #define GPGPU_DISPATCHDIMZ 0x2508
4607
4608 void genX(CmdDispatchIndirect)(
4609 VkCommandBuffer commandBuffer,
4610 VkBuffer _buffer,
4611 VkDeviceSize offset)
4612 {
4613 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4614 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4615 struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4616 const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4617 struct anv_address addr = anv_address_add(buffer->address, offset);
4618 UNUSED struct anv_batch *batch = &cmd_buffer->batch;
4619
4620 anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
4621
4622 #if GFX_VER == 7
4623 /* Linux 4.4 added command parser version 5 which allows the GPGPU
4624 * indirect dispatch registers to be written.
4625 */
4626 if (verify_cmd_parser(cmd_buffer->device, 5,
4627 "vkCmdDispatchIndirect") != VK_SUCCESS)
4628 return;
4629 #endif
4630
4631 anv_measure_snapshot(cmd_buffer,
4632 INTEL_SNAPSHOT_COMPUTE,
4633 "compute indirect",
4634 0);
4635 trace_intel_begin_compute(&cmd_buffer->trace);
4636
4637 if (prog_data->uses_num_work_groups) {
4638 cmd_buffer->state.compute.num_workgroups = addr;
4639
4640 /* The num_workgroups buffer goes in the binding table */
4641 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4642 }
4643
4644 genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4645
4646 struct mi_builder b;
4647 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4648
4649 struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
4650 struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
4651 struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
4652
4653 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
4654 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
4655 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
4656
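   /* On Gfx7 and earlier, build an MI_PREDICATE that disables the walker below
    * whenever any of the indirect dispatch dimensions is zero, i.e.
    * predicate = !(x == 0 || y == 0 || z == 0).
    */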
4657 #if GFX_VER <= 7
4658 /* predicate = (compute_dispatch_indirect_x_size == 0); */
4659 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
4660 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4661 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4662 mip.LoadOperation = LOAD_LOAD;
4663 mip.CombineOperation = COMBINE_SET;
4664 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4665 }
4666
4667 /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4668 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
4669 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4670 mip.LoadOperation = LOAD_LOAD;
4671 mip.CombineOperation = COMBINE_OR;
4672 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4673 }
4674
4675 /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4676 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
4677 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4678 mip.LoadOperation = LOAD_LOAD;
4679 mip.CombineOperation = COMBINE_OR;
4680 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4681 }
4682
4683 /* predicate = !predicate; */
4684 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4685 mip.LoadOperation = LOAD_LOADINV;
4686 mip.CombineOperation = COMBINE_OR;
4687 mip.CompareOperation = COMPARE_FALSE;
4688 }
4689
4690 #if GFX_VERx10 == 75
4691 if (cmd_buffer->state.conditional_render_enabled) {
4692 /* predicate &= !(conditional_rendering_predicate == 0); */
4693 mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
4694 mi_reg32(ANV_PREDICATE_RESULT_REG));
4695 anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4696 mip.LoadOperation = LOAD_LOADINV;
4697 mip.CombineOperation = COMBINE_AND;
4698 mip.CompareOperation = COMPARE_SRCS_EQUAL;
4699 }
4700 }
4701 #endif
4702
4703 #else /* GFX_VER > 7 */
4704 if (cmd_buffer->state.conditional_render_enabled)
4705 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4706 #endif
4707
4708 emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
4709
4710 trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
4711 }
4712
4713 static void
4714 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4715 uint32_t pipeline)
4716 {
4717 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4718
4719 if (cmd_buffer->state.current_pipeline == pipeline)
4720 return;
4721
4722 #if GFX_VER >= 8
4723 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4724 *
4725 * Software must clear the COLOR_CALC_STATE Valid field in
4726 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4727 * with Pipeline Select set to GPGPU.
4728 *
4729 * The internal hardware docs recommend the same workaround for Gfx9
4730 * hardware too.
4731 */
4732 if (pipeline == GPGPU)
4733 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4734 #endif
4735
4736 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4737 * PIPELINE_SELECT [DevBWR+]":
4738 *
4739 * Project: DEVSNB+
4740 *
4741 * Software must ensure all the write caches are flushed through a
4742 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4743 * command to invalidate read only caches prior to programming
4744 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4745 *
4746     * Note that cmd_buffer_apply_pipe_flushes will split this into two
4747 * PIPE_CONTROLs.
4748 */
4749 anv_add_pending_pipe_bits(cmd_buffer,
4750 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4751 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4752 ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4753 ANV_PIPE_CS_STALL_BIT |
4754 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4755 ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4756 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4757 ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4758 ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT,
4759 "flush and invalidate for PIPELINE_SELECT");
4760 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4761
4762 anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
4763 ps.PipelineSelection = pipeline;
4764 }
4765
4766 cmd_buffer->state.current_pipeline = pipeline;
4767 }
4768
4769 void
4770 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4771 {
4772 genX(flush_pipeline_select)(cmd_buffer, _3D);
4773 }
4774
4775 void
4776 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4777 {
4778 genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4779 }
4780
4781 void
4782 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
4783 {
4784 if (GFX_VER >= 8)
4785 return;
4786
4787 /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
4788 *
4789 * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
4790 * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
4791 * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
4792 * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
4793 * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
4794 * Depth Flush Bit set, followed by another pipelined depth stall
4795 * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
4796 * guarantee that the pipeline from WM onwards is already flushed (e.g.,
4797 * via a preceding MI_FLUSH)."
4798 */
4799 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4800 pipe.DepthStallEnable = true;
4801 anv_debug_dump_pc(pipe);
4802 }
4803 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4804 pipe.DepthCacheFlushEnable = true;
4805 anv_debug_dump_pc(pipe);
4806 }
4807 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4808 pipe.DepthStallEnable = true;
4809 anv_debug_dump_pc(pipe);
4810 }
4811 }
4812
4813 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4814 *
4815 * "The VF cache needs to be invalidated before binding and then using
4816 * Vertex Buffers that overlap with any previously bound Vertex Buffer
4817 * (at a 64B granularity) since the last invalidation. A VF cache
4818 * invalidate is performed by setting the "VF Cache Invalidation Enable"
4819 * bit in PIPE_CONTROL."
4820 *
4821 * This is implemented by carefully tracking all vertex and index buffer
4822  * bindings and flushing if the cache could ever end up holding a range
4823  * that exceeds 4 GiB. This is implemented in three parts:
4824 *
4825 * 1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4826 * every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4827 * tracking code of the new binding. If this new binding would cause
4828 * the cache to have a too-large range on the next draw call, a pipeline
4829 * stall and VF cache invalidate are added to pending_pipeline_bits.
4830 *
4831 * 2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4832 * empty whenever we emit a VF invalidate.
4833 *
4834 * 3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4835 * after every 3DPRIMITIVE and copies the bound range into the dirty
4836 * range for each used buffer. This has to be a separate step because
4837 * we don't always re-bind all buffers and so 1. can't know which
4838 * buffers are actually bound.
4839 */
4840 void
4841 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4842 int vb_index,
4843 struct anv_address vb_address,
4844 uint32_t vb_size)
4845 {
4846 if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4847 return;
4848
4849 struct anv_vb_cache_range *bound, *dirty;
4850 if (vb_index == -1) {
4851 bound = &cmd_buffer->state.gfx.ib_bound_range;
4852 dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4853 } else {
4854 assert(vb_index >= 0);
4855 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4856 assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4857 bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4858 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4859 }
4860
4861 if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4862 vb_address,
4863 vb_size)) {
4864 anv_add_pending_pipe_bits(cmd_buffer,
4865 ANV_PIPE_CS_STALL_BIT |
4866 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4867 "vb > 32b range");
4868 }
4869 }
4870
4871 void
4872 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4873 uint32_t access_type,
4874 uint64_t vb_used)
4875 {
4876 if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4877 return;
4878
4879 if (access_type == RANDOM) {
4880 /* We have an index buffer */
4881 struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4882 struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4883
4884 anv_merge_vb_cache_range(dirty, bound);
4885 }
4886
4887 uint64_t mask = vb_used;
4888 while (mask) {
4889 int i = u_bit_scan64(&mask);
4890 assert(i >= 0);
4891 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4892 assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4893
4894 struct anv_vb_cache_range *bound, *dirty;
4895 bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4896 dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4897
4898 anv_merge_vb_cache_range(dirty, bound);
4899 }
4900 }
4901
4902 static void
4903 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4904 {
4905 struct anv_device *device = cmd_buffer->device;
4906 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4907
4908 /* FIXME: Width and Height are wrong */
4909
4910 genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
4911
4912 uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4913 device->isl_dev.ds.size / 4);
4914 if (dw == NULL)
4915 return;
4916
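   /* Pack 3DSTATE_DEPTH_BUFFER and friends through isl based on the current
    * depth/stencil attachments. If neither is bound, isl falls back to a null
    * depth buffer from the zero-initialized info below.
    */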
4917 struct isl_view isl_view = {};
4918 struct isl_depth_stencil_hiz_emit_info info = {
4919 .view = &isl_view,
4920 .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4921 };
4922
4923 if (gfx->depth_att.iview != NULL) {
4924 isl_view = gfx->depth_att.iview->planes[0].isl;
4925 } else if (gfx->stencil_att.iview != NULL) {
4926 isl_view = gfx->stencil_att.iview->planes[0].isl;
4927 }
4928
4929 if (gfx->view_mask) {
4930 assert(isl_view.array_len == 0 ||
4931 isl_view.array_len >= util_last_bit(gfx->view_mask));
4932 isl_view.array_len = util_last_bit(gfx->view_mask);
4933 } else {
4934 assert(isl_view.array_len == 0 ||
4935 isl_view.array_len >= util_last_bit(gfx->layer_count));
4936 isl_view.array_len = gfx->layer_count;
4937 }
4938
4939 if (gfx->depth_att.iview != NULL) {
4940 const struct anv_image_view *iview = gfx->depth_att.iview;
4941 const struct anv_image *image = iview->image;
4942
4943 const uint32_t depth_plane =
4944 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4945 const struct anv_surface *depth_surface =
4946 &image->planes[depth_plane].primary_surface;
4947 const struct anv_address depth_address =
4948 anv_image_address(image, &depth_surface->memory_range);
4949
4950 info.depth_surf = &depth_surface->isl;
4951
4952 info.depth_address =
4953 anv_batch_emit_reloc(&cmd_buffer->batch,
4954 dw + device->isl_dev.ds.depth_offset / 4,
4955 depth_address.bo, depth_address.offset);
4956 info.mocs =
4957 anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4958
4959 info.hiz_usage = gfx->depth_att.aux_usage;
4960 if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4961 assert(isl_aux_usage_has_hiz(info.hiz_usage));
4962
4963 const struct anv_surface *hiz_surface =
4964 &image->planes[depth_plane].aux_surface;
4965 const struct anv_address hiz_address =
4966 anv_image_address(image, &hiz_surface->memory_range);
4967
4968 info.hiz_surf = &hiz_surface->isl;
4969
4970 info.hiz_address =
4971 anv_batch_emit_reloc(&cmd_buffer->batch,
4972 dw + device->isl_dev.ds.hiz_offset / 4,
4973 hiz_address.bo, hiz_address.offset);
4974
4975 info.depth_clear_value = ANV_HZ_FC_VAL;
4976 }
4977 }
4978
4979 if (gfx->stencil_att.iview != NULL) {
4980 const struct anv_image_view *iview = gfx->stencil_att.iview;
4981 const struct anv_image *image = iview->image;
4982
4983 const uint32_t stencil_plane =
4984 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
4985 const struct anv_surface *stencil_surface =
4986 &image->planes[stencil_plane].primary_surface;
4987 const struct anv_address stencil_address =
4988 anv_image_address(image, &stencil_surface->memory_range);
4989
4990 info.stencil_surf = &stencil_surface->isl;
4991
4992 info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
4993 info.stencil_address =
4994 anv_batch_emit_reloc(&cmd_buffer->batch,
4995 dw + device->isl_dev.ds.stencil_offset / 4,
4996 stencil_address.bo, stencil_address.offset);
4997 info.mocs =
4998 anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
4999 }
5000
5001 isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5002
5003 cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5004 }
5005
5006 static VkImageLayout
5007 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5008 {
5009 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5010 vk_find_struct_const(att->pNext,
5011 RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5012 if (layout_info != NULL)
5013 return layout_info->initialLayout;
5014
5015 return att->imageLayout;
5016 }
5017
5018 void genX(CmdBeginRendering)(
5019 VkCommandBuffer commandBuffer,
5020 const VkRenderingInfo* pRenderingInfo)
5021 {
5022 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5023 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5024 VkResult result;
5025
5026 if (!is_render_queue_cmd_buffer(cmd_buffer)) {
5027 assert(!"Trying to start a render pass on non-render queue!");
5028 anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5029 return;
5030 }
5031
5032 anv_measure_beginrenderpass(cmd_buffer);
5033 trace_intel_begin_render_pass(&cmd_buffer->trace);
5034
5035 gfx->rendering_flags = pRenderingInfo->flags;
5036 gfx->render_area = pRenderingInfo->renderArea;
5037 gfx->view_mask = pRenderingInfo->viewMask;
5038 gfx->layer_count = pRenderingInfo->layerCount;
5039 gfx->samples = 0;
5040
5041 const bool is_multiview = gfx->view_mask != 0;
5042 const VkRect2D render_area = gfx->render_area;
5043 const uint32_t layers =
5044 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5045
5046 /* The framebuffer size is at least large enough to contain the render
5047 * area. Because a zero renderArea is possible, we MAX with 1.
5048 */
5049 struct isl_extent3d fb_size = {
5050 .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5051 .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5052 .d = layers,
5053 };
5054
5055 const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5056 result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5057 if (result != VK_SUCCESS)
5058 return;
5059
5060 genX(flush_pipeline_select_3d)(cmd_buffer);
5061
5062 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5063 if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5064 continue;
5065
5066 const VkRenderingAttachmentInfo *att =
5067 &pRenderingInfo->pColorAttachments[i];
5068 ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5069 const VkImageLayout initial_layout = attachment_initial_layout(att);
5070
5071 assert(render_area.offset.x + render_area.extent.width <=
5072 iview->vk.extent.width);
5073 assert(render_area.offset.y + render_area.extent.height <=
5074 iview->vk.extent.height);
5075 assert(layers <= iview->vk.layer_count);
5076
5077 fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5078 fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5079
5080 assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5081 gfx->samples |= iview->vk.image->samples;
5082
5083 enum isl_aux_usage aux_usage =
5084 anv_layout_to_aux_usage(cmd_buffer->device->info,
5085 iview->image,
5086 VK_IMAGE_ASPECT_COLOR_BIT,
5087 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5088 att->imageLayout);
5089
5090 union isl_color_value fast_clear_color = { .u32 = { 0, } };
5091
5092 if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5093 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5094 const union isl_color_value clear_color =
5095 vk_to_isl_color_with_format(att->clearValue.color,
5096 iview->planes[0].isl.format);
5097
5098 /* We only support fast-clears on the first layer */
5099 const bool fast_clear =
5100 (!is_multiview || (gfx->view_mask & 1)) &&
5101 anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5102 att->imageLayout, clear_color,
5103 layers, render_area);
5104
5105 if (att->imageLayout != initial_layout) {
5106 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5107 render_area.extent.width == iview->vk.extent.width &&
5108 render_area.extent.height == iview->vk.extent.height);
5109 if (is_multiview) {
5110 u_foreach_bit(view, gfx->view_mask) {
5111 transition_color_buffer(cmd_buffer, iview->image,
5112 VK_IMAGE_ASPECT_COLOR_BIT,
5113 iview->vk.base_mip_level, 1,
5114 iview->vk.base_array_layer + view,
5115 1, /* layer_count */
5116 initial_layout, att->imageLayout,
5117 VK_QUEUE_FAMILY_IGNORED,
5118 VK_QUEUE_FAMILY_IGNORED,
5119 fast_clear);
5120 }
5121 } else {
5122 transition_color_buffer(cmd_buffer, iview->image,
5123 VK_IMAGE_ASPECT_COLOR_BIT,
5124 iview->vk.base_mip_level, 1,
5125 iview->vk.base_array_layer,
5126 gfx->layer_count,
5127 initial_layout, att->imageLayout,
5128 VK_QUEUE_FAMILY_IGNORED,
5129 VK_QUEUE_FAMILY_IGNORED,
5130 fast_clear);
5131 }
5132 }
5133
5134 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5135 uint32_t base_clear_layer = iview->vk.base_array_layer;
5136 uint32_t clear_layer_count = gfx->layer_count;
5137 if (fast_clear) {
5138 /* We only support fast-clears on the first layer */
5139 assert(iview->vk.base_mip_level == 0 &&
5140 iview->vk.base_array_layer == 0);
5141
5142 fast_clear_color = clear_color;
5143
5144 if (iview->image->vk.samples == 1) {
5145 anv_image_ccs_op(cmd_buffer, iview->image,
5146 iview->planes[0].isl.format,
5147 iview->planes[0].isl.swizzle,
5148 VK_IMAGE_ASPECT_COLOR_BIT,
5149 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5150 &fast_clear_color,
5151 false);
5152 } else {
5153 anv_image_mcs_op(cmd_buffer, iview->image,
5154 iview->planes[0].isl.format,
5155 iview->planes[0].isl.swizzle,
5156 VK_IMAGE_ASPECT_COLOR_BIT,
5157 0, 1, ISL_AUX_OP_FAST_CLEAR,
5158 &fast_clear_color,
5159 false);
5160 }
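            /* The first layer/view was handled by the fast clear above, so
             * drop it from the slow-clear work done below.
             */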
5161 clear_view_mask &= ~1u;
5162 base_clear_layer++;
5163 clear_layer_count--;
5164
5165 if (isl_color_value_is_zero(clear_color,
5166 iview->planes[0].isl.format)) {
5167 /* This image has the auxiliary buffer enabled. We can mark the
5168 * subresource as not needing a resolve because the clear color
5169 * will match what's in every RENDER_SURFACE_STATE object when
5170 * it's being used for sampling.
5171 */
5172 set_image_fast_clear_state(cmd_buffer, iview->image,
5173 VK_IMAGE_ASPECT_COLOR_BIT,
5174 ANV_FAST_CLEAR_DEFAULT_VALUE);
5175 } else {
5176 set_image_fast_clear_state(cmd_buffer, iview->image,
5177 VK_IMAGE_ASPECT_COLOR_BIT,
5178 ANV_FAST_CLEAR_ANY);
5179 }
5180 }
5181
5182 if (is_multiview) {
5183 u_foreach_bit(view, clear_view_mask) {
5184 anv_image_clear_color(cmd_buffer, iview->image,
5185 VK_IMAGE_ASPECT_COLOR_BIT,
5186 aux_usage,
5187 iview->planes[0].isl.format,
5188 iview->planes[0].isl.swizzle,
5189 iview->vk.base_mip_level,
5190 iview->vk.base_array_layer + view, 1,
5191 render_area, clear_color);
5192 }
5193 } else {
5194 anv_image_clear_color(cmd_buffer, iview->image,
5195 VK_IMAGE_ASPECT_COLOR_BIT,
5196 aux_usage,
5197 iview->planes[0].isl.format,
5198 iview->planes[0].isl.swizzle,
5199 iview->vk.base_mip_level,
5200 base_clear_layer, clear_layer_count,
5201 render_area, clear_color);
5202 }
5203 } else {
5204 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5205 assert(att->imageLayout == initial_layout);
5206 }
5207
5208 gfx->color_att[i].vk_format = iview->vk.format;
5209 gfx->color_att[i].iview = iview;
5210 gfx->color_att[i].layout = att->imageLayout;
5211 gfx->color_att[i].aux_usage = aux_usage;
5212
5213 struct isl_view isl_view = iview->planes[0].isl;
5214 if (pRenderingInfo->viewMask) {
5215 assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5216 isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5217 } else {
5218 assert(isl_view.array_len >= pRenderingInfo->layerCount);
5219 isl_view.array_len = pRenderingInfo->layerCount;
5220 }
5221
5222 anv_image_fill_surface_state(cmd_buffer->device,
5223 iview->image,
5224 VK_IMAGE_ASPECT_COLOR_BIT,
5225 &isl_view,
5226 ISL_SURF_USAGE_RENDER_TARGET_BIT,
5227 aux_usage, &fast_clear_color,
5228 0, /* anv_image_view_state_flags */
5229 &gfx->color_att[i].surface_state,
5230 NULL);
5231
5232 add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
5233
5234 if ((att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5235 (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5236 iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5237 iview->planes[0].isl.base_level == 0 &&
5238 iview->planes[0].isl.base_array_layer == 0) {
5239 genX(copy_fast_clear_dwords)(cmd_buffer,
5240 gfx->color_att[i].surface_state.state,
5241 iview->image,
5242 VK_IMAGE_ASPECT_COLOR_BIT,
5243 false /* copy to ss */);
5244 }
5245
5246 if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5247 gfx->color_att[i].resolve_mode = att->resolveMode;
5248 gfx->color_att[i].resolve_iview =
5249 anv_image_view_from_handle(att->resolveImageView);
5250 gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5251 }
5252 }
5253
5254 anv_cmd_graphic_state_update_has_uint_rt(gfx);
5255
5256 const struct anv_image_view *ds_iview = NULL;
5257 const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5258 const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5259 if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5260 (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5261 const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5262 VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5263 VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5264 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5265 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5266 enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5267 enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5268 float depth_clear_value = 0;
5269 uint32_t stencil_clear_value = 0;
5270
5271 if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5272 d_iview = anv_image_view_from_handle(d_att->imageView);
5273 initial_depth_layout = attachment_initial_layout(d_att);
5274 depth_layout = d_att->imageLayout;
5275 depth_aux_usage =
5276 anv_layout_to_aux_usage(cmd_buffer->device->info,
5277 d_iview->image,
5278 VK_IMAGE_ASPECT_DEPTH_BIT,
5279 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5280 depth_layout);
5281 depth_clear_value = d_att->clearValue.depthStencil.depth;
5282 }
5283
5284 if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5285 s_iview = anv_image_view_from_handle(s_att->imageView);
5286 initial_stencil_layout = attachment_initial_layout(s_att);
5287 stencil_layout = s_att->imageLayout;
5288 stencil_aux_usage =
5289 anv_layout_to_aux_usage(cmd_buffer->device->info,
5290 s_iview->image,
5291 VK_IMAGE_ASPECT_STENCIL_BIT,
5292 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5293 stencil_layout);
5294 stencil_clear_value = s_att->clearValue.depthStencil.stencil;
5295 }
5296
5297 assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5298 ds_iview = d_iview != NULL ? d_iview : s_iview;
5299 assert(ds_iview != NULL);
5300
5301 assert(render_area.offset.x + render_area.extent.width <=
5302 ds_iview->vk.extent.width);
5303 assert(render_area.offset.y + render_area.extent.height <=
5304 ds_iview->vk.extent.height);
5305 assert(layers <= ds_iview->vk.layer_count);
5306
5307 fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5308 fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5309
5310 assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5311 gfx->samples |= ds_iview->vk.image->samples;
5312
5313 VkImageAspectFlags clear_aspects = 0;
5314 if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5315 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5316 clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5317 if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5318 !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5319 clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5320
5321 if (clear_aspects != 0) {
5322 const bool hiz_clear =
5323 anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5324 depth_layout, clear_aspects,
5325 depth_clear_value,
5326 render_area);
5327
5328 if (depth_layout != initial_depth_layout) {
5329 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5330 render_area.extent.width == d_iview->vk.extent.width &&
5331 render_area.extent.height == d_iview->vk.extent.height);
5332
5333 if (is_multiview) {
5334 u_foreach_bit(view, gfx->view_mask) {
5335 transition_depth_buffer(cmd_buffer, d_iview->image,
5336 d_iview->vk.base_array_layer + view,
5337 1 /* layer_count */,
5338 initial_depth_layout, depth_layout,
5339 hiz_clear);
5340 }
5341 } else {
5342 transition_depth_buffer(cmd_buffer, d_iview->image,
5343 d_iview->vk.base_array_layer,
5344 gfx->layer_count,
5345 initial_depth_layout, depth_layout,
5346 hiz_clear);
5347 }
5348 }
5349
5350 if (stencil_layout != initial_stencil_layout) {
5351 assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5352 render_area.extent.width == s_iview->vk.extent.width &&
5353 render_area.extent.height == s_iview->vk.extent.height);
5354
5355 if (is_multiview) {
5356 u_foreach_bit(view, gfx->view_mask) {
5357 transition_stencil_buffer(cmd_buffer, s_iview->image,
5358 s_iview->vk.base_mip_level, 1,
5359 s_iview->vk.base_array_layer + view,
5360 1 /* layer_count */,
5361 initial_stencil_layout,
5362 stencil_layout,
5363 hiz_clear);
5364 }
5365 } else {
5366 transition_stencil_buffer(cmd_buffer, s_iview->image,
5367 s_iview->vk.base_mip_level, 1,
5368 s_iview->vk.base_array_layer,
5369 gfx->layer_count,
5370 initial_stencil_layout,
5371 stencil_layout,
5372 hiz_clear);
5373 }
5374 }
5375
5376 if (is_multiview) {
5377 uint32_t clear_view_mask = pRenderingInfo->viewMask;
5378 while (clear_view_mask) {
5379 int view = u_bit_scan(&clear_view_mask);
5380
5381 uint32_t level = ds_iview->vk.base_mip_level;
5382 uint32_t layer = ds_iview->vk.base_array_layer + view;
5383
5384 if (hiz_clear) {
5385 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5386 clear_aspects,
5387 level, layer, 1,
5388 render_area,
5389 stencil_clear_value);
5390 } else {
5391 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5392 clear_aspects,
5393 depth_aux_usage,
5394 level, layer, 1,
5395 render_area,
5396 depth_clear_value,
5397 stencil_clear_value);
5398 }
5399 }
5400 } else {
5401 uint32_t level = ds_iview->vk.base_mip_level;
5402 uint32_t base_layer = ds_iview->vk.base_array_layer;
5403 uint32_t layer_count = gfx->layer_count;
5404
5405 if (hiz_clear) {
5406 anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5407 clear_aspects,
5408 level, base_layer, layer_count,
5409 render_area,
5410 stencil_clear_value);
5411 } else {
5412 anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5413 clear_aspects,
5414 depth_aux_usage,
5415 level, base_layer, layer_count,
5416 render_area,
5417 depth_clear_value,
5418 stencil_clear_value);
5419 }
5420 }
5421 } else {
5422 /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5423 assert(depth_layout == initial_depth_layout);
5424 assert(stencil_layout == initial_stencil_layout);
5425 }
5426
5427 if (d_iview != NULL) {
5428 gfx->depth_att.vk_format = d_iview->vk.format;
5429 gfx->depth_att.iview = d_iview;
5430 gfx->depth_att.layout = depth_layout;
5431 gfx->depth_att.aux_usage = depth_aux_usage;
5432 if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5433 assert(d_att->resolveImageView != VK_NULL_HANDLE);
5434 gfx->depth_att.resolve_mode = d_att->resolveMode;
5435 gfx->depth_att.resolve_iview =
5436 anv_image_view_from_handle(d_att->resolveImageView);
5437 gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5438 }
5439 }
5440
5441 if (s_iview != NULL) {
5442 gfx->stencil_att.vk_format = s_iview->vk.format;
5443 gfx->stencil_att.iview = s_iview;
5444 gfx->stencil_att.layout = stencil_layout;
5445 gfx->stencil_att.aux_usage = stencil_aux_usage;
5446 if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5447 assert(s_att->resolveImageView != VK_NULL_HANDLE);
5448 gfx->stencil_att.resolve_mode = s_att->resolveMode;
5449 gfx->stencil_att.resolve_iview =
5450 anv_image_view_from_handle(s_att->resolveImageView);
5451 gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5452 }
5453 }
5454 }
5455
5456 /* Finally, now that we know the right size, set up the null surface */
5457 assert(util_bitcount(gfx->samples) <= 1);
5458 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5459 gfx->null_surface_state.map,
5460 .size = fb_size);
5461
5462 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5463 if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5464 continue;
5465
5466 isl_null_fill_state(&cmd_buffer->device->isl_dev,
5467 gfx->color_att[i].surface_state.state.map,
5468 .size = fb_size);
5469 }
5470
5471 /****** We can now start emitting code to begin the render pass ******/
5472
5473 gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5474
5475 /* Our implementation of VK_KHR_multiview uses instancing to draw the
5476 * different views. If the client asks for instancing, we need to use the
5477 * Instance Data Step Rate to ensure that we repeat the client's
5478 * per-instance data once for each view. Since this bit is in
5479 * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
5480 * of each subpass.
5481 */
5482 if (GFX_VER == 7)
5483 gfx->vb_dirty |= ~0;
5484
5485 /* It is possible to start a render pass with an old pipeline. Because the
5486 * render pass and subpass index are both baked into the pipeline, this is
5487 * highly unlikely. In order to do so, it requires that you have a render
5488 * pass with a single subpass and that you use that render pass twice
5489 * back-to-back and use the same pipeline at the start of the second render
5490 * pass as at the end of the first. In order to avoid unpredictable issues
5491 * with this edge case, we just dirty the pipeline at the start of every
5492 * subpass.
5493 */
5494 gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5495
5496 cmd_buffer_emit_depth_stencil(cmd_buffer);
5497 }
5498
5499 static void
5500 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5501 struct anv_attachment *att,
5502 VkImageAspectFlagBits aspect)
5503 {
5504 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5505 const struct anv_image_view *iview = att->iview;
5506
5507 if (iview == NULL)
5508 return;
5509
5510 if (gfx->view_mask == 0) {
5511 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5512 aspect, att->aux_usage,
5513 iview->planes[0].isl.base_level,
5514 iview->planes[0].isl.base_array_layer,
5515 gfx->layer_count);
5516 } else {
5517 uint32_t res_view_mask = gfx->view_mask;
5518 while (res_view_mask) {
5519 int i = u_bit_scan(&res_view_mask);
5520
5521 const uint32_t level = iview->planes[0].isl.base_level;
5522 const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5523
5524 genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5525 aspect, att->aux_usage,
5526 level, layer, 1);
5527 }
5528 }
5529 }
5530
5531 static enum blorp_filter
5532 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
5533 {
5534 switch (vk_mode) {
5535 case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
5536 return BLORP_FILTER_SAMPLE_0;
5537 case VK_RESOLVE_MODE_AVERAGE_BIT:
5538 return BLORP_FILTER_AVERAGE;
5539 case VK_RESOLVE_MODE_MIN_BIT:
5540 return BLORP_FILTER_MIN_SAMPLE;
5541 case VK_RESOLVE_MODE_MAX_BIT:
5542 return BLORP_FILTER_MAX_SAMPLE;
5543 default:
5544 return BLORP_FILTER_NONE;
5545 }
5546 }
5547
5548 static void
5549 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
5550 const struct anv_attachment *att,
5551 VkImageLayout layout,
5552 VkImageAspectFlagBits aspect)
5553 {
5554 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5555 const struct anv_image_view *src_iview = att->iview;
5556 const struct anv_image_view *dst_iview = att->resolve_iview;
5557
5558 enum isl_aux_usage src_aux_usage =
5559 anv_layout_to_aux_usage(cmd_buffer->device->info,
5560 src_iview->image, aspect,
5561 VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
5562 layout);
5563
5564 enum isl_aux_usage dst_aux_usage =
5565 anv_layout_to_aux_usage(cmd_buffer->device->info,
5566 dst_iview->image, aspect,
5567 VK_IMAGE_USAGE_TRANSFER_DST_BIT,
5568 att->resolve_layout);
5569
5570 enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
5571
5572 const VkRect2D render_area = gfx->render_area;
5573 if (gfx->view_mask == 0) {
5574 anv_image_msaa_resolve(cmd_buffer,
5575 src_iview->image, src_aux_usage,
5576 src_iview->planes[0].isl.base_level,
5577 src_iview->planes[0].isl.base_array_layer,
5578 dst_iview->image, dst_aux_usage,
5579 dst_iview->planes[0].isl.base_level,
5580 dst_iview->planes[0].isl.base_array_layer,
5581 aspect,
5582 render_area.offset.x, render_area.offset.y,
5583 render_area.offset.x, render_area.offset.y,
5584 render_area.extent.width,
5585 render_area.extent.height,
5586 gfx->layer_count, filter);
5587 } else {
5588 uint32_t res_view_mask = gfx->view_mask;
5589 while (res_view_mask) {
5590 int i = u_bit_scan(&res_view_mask);
5591
5592 anv_image_msaa_resolve(cmd_buffer,
5593 src_iview->image, src_aux_usage,
5594 src_iview->planes[0].isl.base_level,
5595 src_iview->planes[0].isl.base_array_layer + i,
5596 dst_iview->image, dst_aux_usage,
5597 dst_iview->planes[0].isl.base_level,
5598 dst_iview->planes[0].isl.base_array_layer + i,
5599 aspect,
5600 render_area.offset.x, render_area.offset.y,
5601 render_area.offset.x, render_area.offset.y,
5602 render_area.extent.width,
5603 render_area.extent.height,
5604 1, filter);
5605 }
5606 }
5607 }
5608
5609 void genX(CmdEndRendering)(
5610 VkCommandBuffer commandBuffer)
5611 {
5612 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5613 struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5614
5615 if (anv_batch_has_error(&cmd_buffer->batch))
5616 return;
5617
5618 const bool is_multiview = gfx->view_mask != 0;
5619 const uint32_t layers =
5620 is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5621
5622 bool has_color_resolve = false;
5623 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5624 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5625 VK_IMAGE_ASPECT_COLOR_BIT);
5626
5627 /* Stash this off for later */
5628 if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
5629 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5630 has_color_resolve = true;
5631 }
5632
5633 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5634 VK_IMAGE_ASPECT_DEPTH_BIT);
5635
5636 cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5637 VK_IMAGE_ASPECT_STENCIL_BIT);
5638
5639 if (has_color_resolve) {
5640 /* We are about to do some MSAA resolves. We need to flush so that the
5641 * result of writes to the MSAA color attachments show up in the sampler
5642 * when we blit to the single-sampled resolve target.
5643 */
5644 anv_add_pending_pipe_bits(cmd_buffer,
5645 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5646 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5647 "MSAA resolve");
5648 }
5649
5650 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5651 gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
5652 /* We are about to do some MSAA resolves. We need to flush so that the
5653 * result of writes to the MSAA depth attachments show up in the sampler
5654 * when we blit to the single-sampled resolve target.
5655 */
5656 anv_add_pending_pipe_bits(cmd_buffer,
5657 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5658 ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5659 "MSAA resolve");
5660 }
5661
5662 for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5663 const struct anv_attachment *att = &gfx->color_att[i];
5664 if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5665 (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5666 continue;
5667
5668 cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5669 VK_IMAGE_ASPECT_COLOR_BIT);
5670 }
5671
5672 if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5673 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5674 const struct anv_image_view *src_iview = gfx->depth_att.iview;
5675
5676 /* MSAA resolves sample from the source attachment. Transition the
5677 * depth attachment first to get rid of any HiZ that we may not be
5678 * able to handle.
5679 */
5680 transition_depth_buffer(cmd_buffer, src_iview->image,
5681 src_iview->planes[0].isl.base_array_layer,
5682 layers,
5683 gfx->depth_att.layout,
5684 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5685 false /* will_full_fast_clear */);
5686
5687 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5688 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5689 VK_IMAGE_ASPECT_DEPTH_BIT);
5690
5691 /* Transition the source back to the original layout. This seems a bit
5692 * inefficient but, since HiZ resolves aren't destructive, going from
5693 * less HiZ to more is generally a no-op.
5694 */
5695 transition_depth_buffer(cmd_buffer, src_iview->image,
5696 src_iview->planes[0].isl.base_array_layer,
5697 layers,
5698 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5699 gfx->depth_att.layout,
5700 false /* will_full_fast_clear */);
5701 }
5702
5703 if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5704 !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5705 cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5706 gfx->stencil_att.layout,
5707 VK_IMAGE_ASPECT_STENCIL_BIT);
5708 }
5709
5710 #if GFX_VER == 7
5711 /* On gfx7, we have to store a texturable version of the stencil buffer in
5712 * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
5713 * forth at strategic points. Stencil writes are only allowed in following
5714 * layouts:
5715 *
5716 * - VK_IMAGE_LAYOUT_GENERAL
5717 * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
5718 * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
5719 * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
5720 * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
5721 * - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
5722 * - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT
5723 *
5724 * For general, we have no nice opportunity to transition so we do the copy
5725 * to the shadow unconditionally at the end of the subpass. For transfer
5726 * destinations, we can update it as part of the transfer op. For the other
5727 * layouts, we delay the copy until a transition into some other layout.
5728 */
5729 if (gfx->stencil_att.iview != NULL) {
5730 const struct anv_image_view *iview = gfx->stencil_att.iview;
5731 const struct anv_image *image = iview->image;
5732 const uint32_t plane =
5733 anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5734
5735 if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
5736 (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
5737 gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) {
5738 anv_image_copy_to_shadow(cmd_buffer, image,
5739 VK_IMAGE_ASPECT_STENCIL_BIT,
5740 iview->planes[plane].isl.base_level, 1,
5741 iview->planes[plane].isl.base_array_layer,
5742 layers);
5743 }
5744 }
5745 #endif
5746
5747 anv_cmd_buffer_reset_rendering(cmd_buffer);
5748 }
5749
5750 void
5751 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5752 {
5753 #if GFX_VERx10 >= 75
5754 struct mi_builder b;
5755 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5756
5757 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5758 mi_reg32(ANV_PREDICATE_RESULT_REG));
5759 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5760
5761 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5762 mip.LoadOperation = LOAD_LOADINV;
5763 mip.CombineOperation = COMBINE_SET;
5764 mip.CompareOperation = COMPARE_SRCS_EQUAL;
5765 }
5766 #endif
5767 }
5768
5769 #if GFX_VERx10 >= 75
5770 void genX(CmdBeginConditionalRenderingEXT)(
5771 VkCommandBuffer commandBuffer,
5772 const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
5773 {
5774 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5775 ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5776 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5777 struct anv_address value_address =
5778 anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5779
5780 const bool isInverted = pConditionalRenderingBegin->flags &
5781 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5782
5783 cmd_state->conditional_render_enabled = true;
5784
5785 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5786
5787 struct mi_builder b;
5788 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5789
5790 /* Section 19.4 of the Vulkan 1.1.85 spec says:
5791 *
5792 * If the value of the predicate in buffer memory changes
5793 * while conditional rendering is active, the rendering commands
5794 * may be discarded in an implementation-dependent way.
5795 * Some implementations may latch the value of the predicate
5796 * upon beginning conditional rendering while others
5797 * may read it before every rendering command.
5798 *
5799 * So it's perfectly fine to read a value from the buffer once.
5800 */
5801 struct mi_value value = mi_mem32(value_address);
5802
5803    /* Precompute the predicate result; this is necessary to support secondary
5804     * command buffers since it is unknown whether conditional rendering is
5805     * inverted at the time they are populated.
5806 */
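   /* mi_ult(0, value) is non-zero exactly when value != 0; mi_uge(0, value) is
    * non-zero exactly when value == 0, which handles the inverted case.
    */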
5807 mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5808 isInverted ? mi_uge(&b, mi_imm(0), value) :
5809 mi_ult(&b, mi_imm(0), value));
5810 }
5811
5812 void genX(CmdEndConditionalRenderingEXT)(
5813 VkCommandBuffer commandBuffer)
5814 {
5815 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5816 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5817
5818 cmd_state->conditional_render_enabled = false;
5819 }
5820 #endif
5821
5822 /* Set of stage bits that are pipelined, i.e. stages whose work gets queued
5823  * by the command streamer for later execution.
5824 */
5825 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5826 ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5827 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5828 VK_PIPELINE_STAGE_2_HOST_BIT | \
5829 VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
5830
5831 void genX(CmdSetEvent2)(
5832 VkCommandBuffer commandBuffer,
5833 VkEvent _event,
5834 const VkDependencyInfo* pDependencyInfo)
5835 {
5836 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5837 ANV_FROM_HANDLE(anv_event, event, _event);
5838
5839 VkPipelineStageFlags2 src_stages = 0;
5840
5841 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5842 src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5843 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5844 src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5845 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5846 src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5847
5848 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5849 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5850
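   /* Set the event with a PIPE_CONTROL post-sync write of VK_EVENT_SET to the
    * event's slot in the dynamic state pool. If any pipelined stage is in the
    * source mask, stall the command streamer and pixel scoreboard first so the
    * write lands only after that work has drained.
    */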
5851 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5852 if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5853 pc.StallAtPixelScoreboard = true;
5854 pc.CommandStreamerStallEnable = true;
5855 }
5856
5857       pc.DestinationAddressType = DAT_PPGTT;
5858       pc.PostSyncOperation = WriteImmediateData;
5859 pc.Address = (struct anv_address) {
5860 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5861 event->state.offset
5862 };
5863 pc.ImmediateData = VK_EVENT_SET;
5864 anv_debug_dump_pc(pc);
5865 }
5866 }
5867
5868 void genX(CmdResetEvent2)(
5869 VkCommandBuffer commandBuffer,
5870 VkEvent _event,
5871 VkPipelineStageFlags2 stageMask)
5872 {
5873 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5874 ANV_FROM_HANDLE(anv_event, event, _event);
5875
5876 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5877 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5878
5879 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5880 if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5881 pc.StallAtPixelScoreboard = true;
5882 pc.CommandStreamerStallEnable = true;
5883 }
5884
5885 pc.DestinationAddressType = DAT_PPGTT;
5886 pc.PostSyncOperation = WriteImmediateData;
5887 pc.Address = (struct anv_address) {
5888 cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5889 event->state.offset
5890 };
5891 pc.ImmediateData = VK_EVENT_RESET;
5892 anv_debug_dump_pc(pc);
5893 }
5894 }
5895
5896 void genX(CmdWaitEvents2)(
5897 VkCommandBuffer commandBuffer,
5898 uint32_t eventCount,
5899 const VkEvent* pEvents,
5900 const VkDependencyInfo* pDependencyInfos)
5901 {
5902 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5903
5904 #if GFX_VER >= 8
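   /* Have the command streamer poll each event's dword until it reads back
    * VK_EVENT_SET before executing anything that follows.
    */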
   for (uint32_t i = 0; i < eventCount; i++) {
      ANV_FROM_HANDLE(anv_event, event, pEvents[i]);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
         sem.WaitMode = PollingMode;
         sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
         sem.SemaphoreDataDword = VK_EVENT_SET;
         sem.SemaphoreAddress = (struct anv_address) {
            cmd_buffer->device->dynamic_state_pool.block_pool.bo,
            event->state.offset
         };
      }
   }
#else
   anv_finishme("Implement events on gfx7");
#endif

   cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
}

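/* Translate a Vulkan index type into the hardware index format enum used by
 * 3DSTATE_INDEX_BUFFER.
 */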
static uint32_t vk_to_intel_index_type(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_UINT8_EXT:
      return INDEX_BYTE;
   case VK_INDEX_TYPE_UINT16:
      return INDEX_WORD;
   case VK_INDEX_TYPE_UINT32:
      return INDEX_DWORD;
   default:
      unreachable("invalid index type");
   }
}

void genX(CmdBindIndexBuffer)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset,
    VkIndexType                                 indexType)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);

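   /* Only stage the state here; the dirty bit below makes the next draw
    * re-emit the index buffer setup with these values.
    */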
   cmd_buffer->state.gfx.restart_index = vk_index_to_restart(indexType);
   cmd_buffer->state.gfx.index_buffer = buffer;
   cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
   cmd_buffer->state.gfx.index_offset = offset;

   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
}

VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
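      /* INSTPM is a masked register: the *Mask fields must be set for the
       * corresponding disable bits to take effect.
       */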
      anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
         instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
         instpm.MediaInstructionDisable = pOverrideInfo->enable;
         instpm._3DRenderingInstructionDisableMask = true;
         instpm.MediaInstructionDisableMask = true;
      }
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         anv_add_pending_pipe_bits(cmd_buffer,
                                   ANV_PIPE_FLUSH_BITS |
                                   ANV_PIPE_INVALIDATE_BITS,
                                   "perf counter isolation");
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}

VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */

   return VK_SUCCESS;
}

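/* MMIO offset of the render command streamer's free-running TIMESTAMP
 * register, read below with MI_STORE_REGISTER_MEM for top-of-pipe captures.
 */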
#define TIMESTAMP 0x2358

void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                              struct anv_device *device,
                              struct anv_address addr,
                              enum anv_timestamp_capture_type type)
{
   switch (type) {
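   /* Top-of-pipe: store the current command streamer TIMESTAMP directly from
    * the register, without waiting for prior work to complete.
    */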
   case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
      struct mi_builder b;
      mi_builder_init(&b, device->info, batch);
      mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
      break;
   }

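   /* End-of-pipe: PIPE_CONTROL performs the timestamp write as a post-sync
    * operation, i.e. once the preceding work has reached the end of the
    * pipeline.
    */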
   case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = addr;
         anv_debug_dump_pc(pc);
      }
      break;

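   /* Like end-of-pipe, but also stall the command streamer on this
    * PIPE_CONTROL so following commands do not start until it completes.
    */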
   case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
      anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = addr;
         anv_debug_dump_pc(pc);
      }
      break;

   default:
      unreachable("invalid");
   }
}