1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15
16 #include "nil_format.h"
17 #include "util/bitpack_helpers.h"
18 #include "vulkan/runtime/vk_render_pass.h"
19 #include "vulkan/runtime/vk_standard_sample_locations.h"
20 #include "vulkan/util/vk_format.h"
21
22 #include "nouveau_context.h"
23
24 #include "nvk_cl902d.h"
25 #include "nvk_cl9097.h"
26 #include "nvk_cl90b5.h"
27 #include "nvk_cl90c0.h"
28 #include "nvk_cla097.h"
29 #include "nvk_clb097.h"
30 #include "nvk_clb197.h"
31 #include "nvk_clc397.h"
32 #include "nvk_clc597.h"
33 #include "drf.h"
34
35 static inline uint16_t
36 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
37 {
38 return nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d;
39 }
40
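/* MME macro for poking a privileged (PRIV) register through FALCON.  The
 * caller pushes three dwords: the first two land in MME shadow scratch 1
 * and 2 (after scratch 0 has been cleared) and the third goes to
 * SET_FALCON04.  We then either poll scratch 0 until the firmware flips it
 * or emit a fixed burst of NOPs, presumably to give the firmware time to
 * handle the write.  A minimal usage sketch, mirroring the call in
 * nvk_push_draw_state_init() below (value/mask is our reading of what the
 * first two dwords mean):
 *
 *    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
 *    P_INLINE_DATA(p, value);
 *    P_INLINE_DATA(p, mask);
 *    P_INLINE_DATA(p, reg);
 */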
41 void
42 nvk_mme_set_priv_reg(struct mme_builder *b)
43 {
44 mme_mthd(b, NV9097_WAIT_FOR_IDLE);
45 mme_emit(b, mme_zero());
46
47 mme_mthd(b, NV9097_SET_MME_SHADOW_SCRATCH(0));
48 mme_emit(b, mme_zero());
49 mme_emit(b, mme_load(b));
50 mme_emit(b, mme_load(b));
51
52 /* It's not clear whether this strictly has to go before SET_FALCON04, but
53 * it might. We also don't really know what that value indicates or when
54 * and how it gets set.
55 */
56 struct mme_value s26 = mme_state(b, NV9097_SET_MME_SHADOW_SCRATCH(26));
57 s26 = mme_merge(b, mme_zero(), s26, 0, 8, 0);
58
59 mme_mthd(b, NV9097_SET_FALCON04);
60 mme_emit(b, mme_load(b));
61
62 mme_if(b, ieq, s26, mme_imm(2)) {
63 struct mme_value loop_cond = mme_mov(b, mme_zero());
64 mme_while(b, ine, loop_cond, mme_imm(1)) {
65 mme_state_to(b, loop_cond, NV9097_SET_MME_SHADOW_SCRATCH(0));
66 mme_mthd(b, NV9097_NO_OPERATION);
67 mme_emit(b, mme_zero());
68 };
69 }
70
71 mme_if(b, ine, s26, mme_imm(2)) {
72 mme_loop(b, mme_imm(10)) {
73 mme_mthd(b, NV9097_NO_OPERATION);
74 mme_emit(b, mme_zero());
75 }
76 }
77 }
78
79 VkResult
80 nvk_push_draw_state_init(struct nvk_device *dev, struct nv_push *p)
81 {
82 struct nvk_physical_device *pdev = nvk_device_physical(dev);
83
84 /* 3D state */
85 P_MTHD(p, NV9097, SET_OBJECT);
86 P_NV9097_SET_OBJECT(p, {
87 .class_id = pdev->info.cls_eng3d,
88 .engine_id = 0,
89 });
90
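/* Upload all of our MME macros to macro instruction RAM, packed back to
 * back, recording each macro's start offset as we go.
 */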
91 for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
92 size_t size;
93 uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
94 if (dw == NULL)
95 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
96
97 assert(size % sizeof(uint32_t) == 0);
98 const uint32_t num_dw = size / sizeof(uint32_t);
99
100 P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
101 P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
102 P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
103
104 P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
105 P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
106 P_INLINE_ARRAY(p, dw, num_dw);
107
108 mme_pos += num_dw;
109
110 free(dw);
111 }
112
113 if (dev->pdev->info.cls_eng3d >= TURING_A)
114 P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
115
116 /* Enable FP helper invocation memory loads
117 *
118 * For generations with firmware support for our `SET_PRIV_REG` MME macro
119 * we simply use that. On older generations we'll let the kernel do it.
120 * Starting with GSP we have to do it via the firmware anyway.
121 */
122 if (dev->pdev->info.cls_eng3d >= MAXWELL_B) {
123 unsigned reg = dev->pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
124 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
125 P_INLINE_DATA(p, 0);
126 P_INLINE_DATA(p, BITFIELD_BIT(3));
127 P_INLINE_DATA(p, reg);
128 }
129
130 P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
131
132 P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
133 P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
134 for (unsigned i = 0; i < 8; i++)
135 P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
136
137 P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
138
139 // P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
140 // P_INLINE_DATA(cmd->push, 0);
141
142 P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
143
144 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
145
146 P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
147 P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
148 P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
149 P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
150
151 P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
152
153 P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
154
155 P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
156
157 P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
158 DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
159
160 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
161 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
162 .all_covered_all_hit_once = 0xff,
163 });
164 P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
165 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
166 .all_covered_all_hit_once = 0xff,
167 });
168 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
169 .all_covered_all_hit_once = 0xff,
170 });
171 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
172 .all_covered_all_hit_once = 0x3f,
173 });
174 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
175 .all_covered_all_hit_once = 0xff,
176 });
177 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
178 .all_covered_all_hit_once = 0xff,
179 });
180
181 if (dev->pdev->info.cls_eng3d < VOLTA_A)
182 P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
183
184 P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
185 .current = 3,
186 .oldest_supported = 3,
187 });
188 P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
189 .current = 2,
190 .oldest_supported = 2,
191 });
192
193 if (dev->pdev->info.cls_eng3d < MAXWELL_A)
194 P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
195
196 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
197 POLICY_EVICT_NORMAL);
198 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
199 POLICY_EVICT_NORMAL);
200 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
201 POLICY_EVICT_NORMAL);
202 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
203 POLICY_EVICT_NORMAL);
204 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
205 POLICY_EVICT_NORMAL);
206
207 P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
208
209 P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
210 .color_front_diffuse = COLOR_FRONT_DIFFUSE_VECTOR_0001,
211 .color_front_specular = COLOR_FRONT_SPECULAR_VECTOR_0001,
212 .generic_vector = GENERIC_VECTOR_VECTOR_0001,
213 .fixed_fnc_texture = FIXED_FNC_TEXTURE_VECTOR_0001,
214 .dx9_color0 = DX9_COLOR0_VECTOR_0001,
215 .dx9_color1_to_color15 = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
216 });
217
218 P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
219
220 P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
221 CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
222
223 P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
224 .enable = ENABLE_TRUE,
225 .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
226 });
227
228 if (dev->pdev->info.cls_eng3d < VOLTA_A)
229 P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
230
231 P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
232 P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
233 P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
234
235 if (dev->pdev->info.cls_eng3d < MAXWELL_A)
236 P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
237
238 if (dev->pdev->info.cls_eng3d >= KEPLER_A &&
239 dev->pdev->info.cls_eng3d < MAXWELL_A) {
240 P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
241 ORDERING_KEPLER_ORDER);
242 }
243
244 P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
245 P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
246 P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
247 P_IMMD(p, NV9097, SET_PS_SATURATE, {
248 .output0 = OUTPUT0_FALSE,
249 .output1 = OUTPUT1_FALSE,
250 .output2 = OUTPUT2_FALSE,
251 .output3 = OUTPUT3_FALSE,
252 .output4 = OUTPUT4_FALSE,
253 .output5 = OUTPUT5_FALSE,
254 .output6 = OUTPUT6_FALSE,
255 .output7 = OUTPUT7_FALSE,
256 });
257
258 P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
259
260 /* From the Vulkan spec's description of point rasterization:
261 * "Point rasterization produces a fragment for each fragment area group of
262 * framebuffer pixels with one or more sample points that intersect a region
263 * centered at the point’s (xf,yf).
264 * This region is a square with side equal to the current point size.
265 * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
266 * for the point"
267 *
268 * So it seems we always need square points with PointCoords like OpenGL
269 * point sprites.
270 *
271 * From OpenGL compatibility spec:
272 * Basic point rasterization:
273 * "If point sprites are enabled, then point rasterization produces a
274 * fragment for each framebuffer pixel whose center lies inside a square
275 * centered at the point’s (xw, yw), with side length equal to the current
276 * point size.
277 * ... and xw and yw are the exact, unrounded window coordinates of the
278 * vertex for the point"
279 *
280 * And Point multisample rasterization:
281 * "This region is a circle having diameter equal to the current point width
282 * if POINT_SPRITE is disabled, or a square with side equal to the current
283 * point width if POINT_SPRITE is enabled."
284 */
285 P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
286 P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
287 .rmode = RMODE_ZERO,
288 .origin = ORIGIN_TOP,
289 .texture0 = TEXTURE0_PASSTHROUGH,
290 .texture1 = TEXTURE1_PASSTHROUGH,
291 .texture2 = TEXTURE2_PASSTHROUGH,
292 .texture3 = TEXTURE3_PASSTHROUGH,
293 .texture4 = TEXTURE4_PASSTHROUGH,
294 .texture5 = TEXTURE5_PASSTHROUGH,
295 .texture6 = TEXTURE6_PASSTHROUGH,
296 .texture7 = TEXTURE7_PASSTHROUGH,
297 .texture8 = TEXTURE8_PASSTHROUGH,
298 .texture9 = TEXTURE9_PASSTHROUGH,
299 });
300
301 /* OpenGL's GL_POINT_SMOOTH */
302 P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
303
304 if (dev->pdev->info.cls_eng3d >= MAXWELL_B)
305 P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
306
307 P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
308
309 P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
310
311 P_IMMD(p, NV9097, SET_HYBRID_ANTI_ALIAS_CONTROL, {
312 .passes = 1,
313 .centroid = CENTROID_PER_FRAGMENT,
314 });
315
316 /* Enable multisample rasterization even for single-sample rasterization;
317 * this way we get strict lines and rectangular line support.
318 * See the DirectX rasterization rules for more detail.
319 */
320 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
321
322 if (dev->pdev->info.cls_eng3d >= MAXWELL_B) {
323 P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
324 BY_VIEWPORT_INDEX_FALSE);
325 }
326
327 /* TODO: Vertex runout */
328
329 P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
330 .mode = MODE_UPPER_LEFT,
331 .flip_y = FLIP_Y_FALSE,
332 });
333
334 P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
335 P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
336 P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
337
338 P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
339 P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
340 P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
341
342 // P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
343 // .respect_stencil_mask = RESPECT_STENCIL_MASK_FALSE,
344 // .use_clear_rect = USE_CLEAR_RECT_FALSE,
345 // });
346
347 P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
348
349 P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
350 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
351 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
352 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
353 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
354 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
355 .geometry_clip = GEOMETRY_CLIP_WZERO_CLIP,
356 .geometry_guardband_z = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
357 });
358
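/* Disable every scissor for now; nvk_flush_vp_state() enables the ones that
 * are actually in use.
 */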
359 for (unsigned i = 0; i < 16; i++)
360 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
361
362 P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
363
364 if (pdev->info.cls_eng3d < VOLTA_A) {
365 uint64_t shader_base_addr =
366 nvk_heap_contiguous_base_address(&dev->shader_heap);
367
368 P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
369 P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
370 P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
371 }
372
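/* Start with every constant buffer slot in all five bind groups marked
 * invalid.
 */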
373 for (uint32_t group = 0; group < 5; group++) {
374 for (uint32_t slot = 0; slot < 16; slot++) {
375 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
376 .valid = VALID_FALSE,
377 .shader_slot = slot,
378 });
379 }
380 }
381
382 // P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
383 // P_INLINE_DATA(cmd->push, 0x40);
384 P_IMMD(p, NV9097, SET_RT_LAYER, {
385 .v = 0,
386 .control = CONTROL_V_SELECTS_LAYER,
387 });
388 // P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT;
389 // P_INLINE_DATA(cmd->push, 0x30);
390
391 P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
392 P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
393 P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
394
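/* Point the vertex stream substitute at the device's zero page, presumably
 * so that fetches from unbound or out-of-bounds vertex streams return
 * zeros.
 */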
395 uint64_t zero_addr = dev->zero_page->offset;
396 P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
397 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
398 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
399
400 if (dev->pdev->info.cls_eng3d >= FERMI_A &&
401 dev->pdev->info.cls_eng3d < MAXWELL_A) {
402 assert(dev->vab_memory);
403 uint64_t vab_addr = dev->vab_memory->offset;
404 P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
405 P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
406 P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
407 P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
408 }
409
410 if (dev->pdev->info.cls_eng3d == MAXWELL_A)
411 P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
412
413 return VK_SUCCESS;
414 }
415
416 static void
417 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
418 {
419 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
420
421 /* These depend on color attachment count */
422 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
423 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
424 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
425 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
426
427 /* These depend on the depth/stencil format */
428 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
429 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
430 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
431 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
432
433 /* This may depend on render targets for ESO */
434 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
435 }
436
437 void
438 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
439 const VkCommandBufferBeginInfo *pBeginInfo)
440 {
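/* For primary command buffers, start by invalidating the sampler, texture
 * header, and shader constant caches since we can't know what was executed
 * before us.
 */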
441 if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
442 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
443 P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
444 P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
445 .lines = LINES_ALL,
446 });
447 P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
448 .lines = LINES_ALL,
449 });
450
451 P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
452 .constant = CONSTANT_TRUE,
453 });
454 }
455
456 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
457 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
458 char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
459 const VkRenderingInfo *resume_info =
460 vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
461 pBeginInfo,
462 gcbiar_data);
463 if (resume_info) {
464 nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
465 } else {
466 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
467 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
468 pBeginInfo);
469 assert(inheritance_info);
470
471 struct nvk_rendering_state *render = &cmd->state.gfx.render;
472 render->flags = inheritance_info->flags;
473 render->area = (VkRect2D) { };
474 render->layer_count = 0;
475 render->view_mask = inheritance_info->viewMask;
476 render->samples = inheritance_info->rasterizationSamples;
477
478 render->color_att_count = inheritance_info->colorAttachmentCount;
479 for (uint32_t i = 0; i < render->color_att_count; i++) {
480 render->color_att[i].vk_format =
481 inheritance_info->pColorAttachmentFormats[i];
482 }
483 render->depth_att.vk_format =
484 inheritance_info->depthAttachmentFormat;
485 render->stencil_att.vk_format =
486 inheritance_info->stencilAttachmentFormat;
487
488 nvk_cmd_buffer_dirty_render_pass(cmd);
489 }
490 }
491
492 cmd->state.gfx.shaders_dirty = ~0;
493 }
494
495 void
496 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
497 {
498 vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
499
500 /* From the Vulkan 1.3.275 spec:
501 *
502 * "...There is one exception to this rule - if the primary command
503 * buffer is inside a render pass instance, then the render pass and
504 * subpass state is not disturbed by executing secondary command
505 * buffers."
506 *
507 * We need to reset everything EXCEPT the render pass state.
508 */
509 struct nvk_rendering_state render_save = cmd->state.gfx.render;
510 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
511 cmd->state.gfx.render = render_save;
512
513 cmd->state.gfx.shaders_dirty = ~0;
514 }
515
516 static void
517 nvk_attachment_init(struct nvk_attachment *att,
518 const VkRenderingAttachmentInfo *info)
519 {
520 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
521 *att = (struct nvk_attachment) { .iview = NULL, };
522 return;
523 }
524
525 VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
526 *att = (struct nvk_attachment) {
527 .vk_format = iview->vk.format,
528 .iview = iview,
529 };
530
531 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
532 VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
533 att->resolve_mode = info->resolveMode;
534 att->resolve_iview = res_iview;
535 }
536 }
537
538 static uint32_t
539 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
540 {
541 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
542 uint16_t nil_to_nv9097[] = {
543 MODE(1X1),
544 MODE(2X1),
545 MODE(2X2),
546 MODE(4X2),
547 MODE(4X4),
548 };
549 #undef MODE
550 assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
551
552 return nil_to_nv9097[sample_layout];
553 }
554
555 VKAPI_ATTR void VKAPI_CALL
556 nvk_GetRenderingAreaGranularityKHR(
557 VkDevice device,
558 const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
559 VkExtent2D *pGranularity)
560 {
561 *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
562 }
563
564 VKAPI_ATTR void VKAPI_CALL
565 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
566 const VkRenderingInfo *pRenderingInfo)
567 {
568 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
569 struct nvk_rendering_state *render = &cmd->state.gfx.render;
570
571 memset(render, 0, sizeof(*render));
572
573 render->flags = pRenderingInfo->flags;
574 render->area = pRenderingInfo->renderArea;
575 render->view_mask = pRenderingInfo->viewMask;
576 render->layer_count = pRenderingInfo->layerCount;
577 render->samples = 0;
578
579 const uint32_t layer_count =
580 render->view_mask ? util_last_bit(render->view_mask) :
581 render->layer_count;
582
583 render->color_att_count = pRenderingInfo->colorAttachmentCount;
584 for (uint32_t i = 0; i < render->color_att_count; i++) {
585 nvk_attachment_init(&render->color_att[i],
586 &pRenderingInfo->pColorAttachments[i]);
587 }
588
589 nvk_attachment_init(&render->depth_att,
590 pRenderingInfo->pDepthAttachment);
591 nvk_attachment_init(&render->stencil_att,
592 pRenderingInfo->pStencilAttachment);
593
594 nvk_cmd_buffer_dirty_render_pass(cmd);
595
596 /* Always emit at least one color attachment, even if it's just a dummy. */
597 uint32_t color_att_count = MAX2(1, render->color_att_count);
598 struct nv_push *p = nvk_cmd_buffer_push(cmd, color_att_count * 10 + 27);
599
600 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
601 render->view_mask);
602
603 P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
604 P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
605 .x = render->area.offset.x,
606 .width = render->area.extent.width,
607 });
608 P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
609 .y = render->area.offset.y,
610 .height = render->area.extent.height,
611 });
612
613 enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
614 for (uint32_t i = 0; i < color_att_count; i++) {
615 if (render->color_att[i].iview) {
616 const struct nvk_image_view *iview = render->color_att[i].iview;
617 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
618 /* Rendering to a multi-planar image is only valid for a single specific
619 * plane, so assert that the view is single-plane, obtain that plane's
620 * index, and render to it.
621 */
622 assert(iview->plane_count == 1);
623 const uint8_t ip = iview->planes[0].image_plane;
624
625 const struct nil_image_level *level =
626 &image->planes[ip].nil.levels[iview->vk.base_mip_level];
627 struct nil_extent4d level_extent_sa =
628 nil_image_level_extent_sa(&image->planes[ip].nil, iview->vk.base_mip_level);
629
630 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
631 sample_layout == image->planes[ip].nil.sample_layout);
632 sample_layout = image->planes[ip].nil.sample_layout;
633 render->samples = image->vk.samples;
634
635 uint64_t addr = nvk_image_base_address(image, ip) + level->offset_B;
636
637 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
638 P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
639 P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
640
641 if (level->tiling.is_tiled) {
642 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, level_extent_sa.w);
643 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.h);
644 const enum pipe_format p_format =
645 vk_format_to_pipe_format(iview->vk.format);
646 const uint8_t ct_format = nil_format_to_color_target(p_format);
647 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
648
649 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
650 .block_width = BLOCK_WIDTH_ONE_GOB,
651 .block_height = level->tiling.y_log2,
652 .block_depth = level->tiling.z_log2,
653 .layout = LAYOUT_BLOCKLINEAR,
654 .third_dimension_control =
655 (image->planes[ip].nil.dim == NIL_IMAGE_DIM_3D) ?
656 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
657 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
658 });
659
660 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i,
661 iview->vk.base_array_layer + layer_count);
662 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
663 image->planes[ip].nil.array_stride_B >> 2);
664 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, iview->vk.base_array_layer);
665 } else {
666 /* NVIDIA can only render to 2D linear images */
667 assert(image->planes[ip].nil.dim == NIL_IMAGE_DIM_2D);
668 /* NVIDIA can only render to non-multisampled images */
669 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
670 /* NVIDIA doesn't support linear array images */
671 assert(iview->vk.base_array_layer == 0 && layer_count == 1);
672
673 uint32_t pitch = level->row_stride_B;
674 const enum pipe_format p_format =
675 vk_format_to_pipe_format(iview->vk.format);
676 /* When the memory layout is set to LAYOUT_PITCH, the WIDTH field
677 * holds the row pitch in bytes
678 */
679 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
680 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.h);
681
682 const uint8_t ct_format = nil_format_to_color_target(p_format);
683 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
684
685 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
686 .layout = LAYOUT_PITCH,
687 .third_dimension_control =
688 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
689 });
690
691 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
692 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
693 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
694 }
695 } else {
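/* No image view is bound for this slot, so program a disabled dummy color
 * target.
 */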
696 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
697 P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
698 P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
699 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
700 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
701 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
702 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
703 .layout = LAYOUT_BLOCKLINEAR,
704 });
705 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
706 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
707 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
708 }
709 }
710
711 P_IMMD(p, NV9097, SET_CT_SELECT, {
712 .target_count = color_att_count,
713 .target0 = 0,
714 .target1 = 1,
715 .target2 = 2,
716 .target3 = 3,
717 .target4 = 4,
718 .target5 = 5,
719 .target6 = 6,
720 .target7 = 7,
721 });
722
723 if (render->depth_att.iview || render->stencil_att.iview) {
724 struct nvk_image_view *iview = render->depth_att.iview ?
725 render->depth_att.iview :
726 render->stencil_att.iview;
727 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
728 /* Depth/stencil are always single-plane */
729 assert(iview->plane_count == 1);
730 const uint8_t ip = iview->planes[0].image_plane;
731 struct nil_image nil_image = image->planes[ip].nil;
732
733 uint64_t addr = nvk_image_base_address(image, ip);
734 uint32_t mip_level = iview->vk.base_mip_level;
735 uint32_t base_array_layer = iview->vk.base_array_layer;
736 uint32_t layer_count = iview->vk.layer_count;
737
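/* For 3D images, rebind the selected mip level as a 2D array image so the
 * depth slices can be treated as layers.
 */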
738 if (nil_image.dim == NIL_IMAGE_DIM_3D) {
739 uint64_t level_offset_B;
740 nil_image_3d_level_as_2d_array(&nil_image, mip_level,
741 &nil_image, &level_offset_B);
742 addr += level_offset_B;
743 mip_level = 0;
744 base_array_layer = 0;
745 layer_count = iview->vk.extent.depth;
746 }
747
748 const struct nil_image_level *level = &nil_image.levels[mip_level];
749 addr += level->offset_B;
750
751 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
752 sample_layout == nil_image.sample_layout);
753 sample_layout = nil_image.sample_layout;
754 render->samples = image->vk.samples;
755
756 P_MTHD(p, NV9097, SET_ZT_A);
757 P_NV9097_SET_ZT_A(p, addr >> 32);
758 P_NV9097_SET_ZT_B(p, addr);
759 const enum pipe_format p_format =
760 vk_format_to_pipe_format(iview->vk.format);
761 const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
762 P_NV9097_SET_ZT_FORMAT(p, zs_format);
763 assert(level->tiling.z_log2 == 0);
764 P_NV9097_SET_ZT_BLOCK_SIZE(p, {
765 .width = WIDTH_ONE_GOB,
766 .height = level->tiling.y_log2,
767 .depth = DEPTH_ONE_GOB,
768 });
769 P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
770
771 P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
772
773 struct nil_extent4d level_extent_sa =
774 nil_image_level_extent_sa(&nil_image, mip_level);
775
776 P_MTHD(p, NV9097, SET_ZT_SIZE_A);
777 P_NV9097_SET_ZT_SIZE_A(p, level_extent_sa.w);
778 P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.h);
779 P_NV9097_SET_ZT_SIZE_C(p, {
780 .third_dimension = base_array_layer + layer_count,
781 .control = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
782 });
783
784 P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
785
786 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
787 P_IMMD(p, NVC597, SET_ZT_SPARSE, {
788 .enable = ENABLE_FALSE,
789 });
790 }
791 } else {
792 P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
793 }
794
795 /* From the Vulkan 1.3.275 spec:
796 *
797 * "It is legal for a subpass to use no color or depth/stencil
798 * attachments, either because it has no attachment references or
799 * because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
800 * can use shader side effects such as image stores and atomics to
801 * produce an output. In this case, the subpass continues to use the
802 * width, height, and layers of the framebuffer to define the dimensions
803 * of the rendering area, and the rasterizationSamples from each
804 * pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
805 * of samples used in rasterization;"
806 *
807 * In the case where we have attachments, we emit SET_ANTI_ALIAS here
808 * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
809 * specifying the sample layout and we want to ensure it matches. When
810 * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
811 * where we base it on dynamic rasterizationSamples.
812 */
813 if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID) {
814 P_IMMD(p, NV9097, SET_ANTI_ALIAS,
815 nil_to_nv9097_samples_mode(sample_layout));
816 }
817
818 if (render->flags & VK_RENDERING_RESUMING_BIT)
819 return;
820
821 uint32_t clear_count = 0;
822 VkClearAttachment clear_att[NVK_MAX_RTS + 1];
823 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
824 const VkRenderingAttachmentInfo *att_info =
825 &pRenderingInfo->pColorAttachments[i];
826 if (att_info->imageView == VK_NULL_HANDLE ||
827 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
828 continue;
829
830 clear_att[clear_count++] = (VkClearAttachment) {
831 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
832 .colorAttachment = i,
833 .clearValue = att_info->clearValue,
834 };
835 }
836
837 clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
838 if (pRenderingInfo->pDepthAttachment != NULL &&
839 pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
840 pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
841 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
842 clear_att[clear_count].clearValue.depthStencil.depth =
843 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
844 }
845 if (pRenderingInfo->pStencilAttachment != NULL &&
846 pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
847 pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
848 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
849 clear_att[clear_count].clearValue.depthStencil.stencil =
850 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
851 }
852 if (clear_att[clear_count].aspectMask != 0)
853 clear_count++;
854
855 if (clear_count > 0) {
856 const VkClearRect clear_rect = {
857 .rect = render->area,
858 .baseArrayLayer = 0,
859 .layerCount = render->view_mask ? 1 : render->layer_count,
860 };
861
862 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
863 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
864
865 nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
866 clear_count, clear_att, 1, &clear_rect);
867 p = nvk_cmd_buffer_push(cmd, 2);
868 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
869 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
870 }
871
872 /* TODO: Attachment clears */
873 }
874
875 VKAPI_ATTR void VKAPI_CALL
876 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
877 {
878 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
879 struct nvk_rendering_state *render = &cmd->state.gfx.render;
880
881 bool need_resolve = false;
882
883 /* Translate render state back to VK for meta */
884 VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
885 for (uint32_t i = 0; i < render->color_att_count; i++) {
886 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
887 need_resolve = true;
888
889 vk_color_att[i] = (VkRenderingAttachmentInfo) {
890 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
891 .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
892 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
893 .resolveMode = render->color_att[i].resolve_mode,
894 .resolveImageView =
895 nvk_image_view_to_handle(render->color_att[i].resolve_iview),
896 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
897 };
898 }
899
900 const VkRenderingAttachmentInfo vk_depth_att = {
901 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
902 .imageView = nvk_image_view_to_handle(render->depth_att.iview),
903 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
904 .resolveMode = render->depth_att.resolve_mode,
905 .resolveImageView =
906 nvk_image_view_to_handle(render->depth_att.resolve_iview),
907 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
908 };
909 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
910 need_resolve = true;
911
912 const VkRenderingAttachmentInfo vk_stencil_att = {
913 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
914 .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
915 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
916 .resolveMode = render->stencil_att.resolve_mode,
917 .resolveImageView =
918 nvk_image_view_to_handle(render->stencil_att.resolve_iview),
919 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
920 };
921 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
922 need_resolve = true;
923
924 const VkRenderingInfo vk_render = {
925 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
926 .renderArea = render->area,
927 .layerCount = render->layer_count,
928 .viewMask = render->view_mask,
929 .colorAttachmentCount = render->color_att_count,
930 .pColorAttachments = vk_color_att,
931 .pDepthAttachment = &vk_depth_att,
932 .pStencilAttachment = &vk_stencil_att,
933 };
934
935 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
936 need_resolve = false;
937
938 memset(render, 0, sizeof(*render));
939
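/* Resolves go through the meta path using the VkRenderingInfo we just
 * reconstructed, with the texture data cache invalidated first, presumably
 * so the resolve sees the freshly rendered contents.
 */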
940 if (need_resolve) {
941 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
942 P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
943 .lines = LINES_ALL,
944 });
945
946 nvk_meta_resolve_rendering(cmd, &vk_render);
947 }
948 }
949
950 void
951 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
952 const gl_shader_stage stage,
953 struct nvk_shader *shader)
954 {
955 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
956
957 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
958 if (cmd->state.gfx.shaders[stage] == shader)
959 return;
960
961 cmd->state.gfx.shaders[stage] = shader;
962 cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
963
964 /* When a tessellation evaluation shader is bound we need to re-emit the
965 * tessellation parameters in nvk_flush_ts_state(), as the domain origin
966 * can be dynamic.
967 */
968 if (stage == MESA_SHADER_TESS_EVAL)
969 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN);
970
971 /* Emitting SET_HYBRID_ANTI_ALIAS_CONTROL requires the fragment shader */
972 if (stage == MESA_SHADER_FRAGMENT)
973 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
974 }
975
976 static uint32_t
977 mesa_to_nv9097_shader_type(gl_shader_stage stage)
978 {
979 static const uint32_t mesa_to_nv9097[] = {
980 [MESA_SHADER_VERTEX] = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
981 [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
982 [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
983 [MESA_SHADER_GEOMETRY] = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
984 [MESA_SHADER_FRAGMENT] = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
985 };
986 assert(stage < ARRAY_SIZE(mesa_to_nv9097));
987 return mesa_to_nv9097[stage];
988 }
989
990 static uint32_t
991 nvk_pipeline_bind_group(gl_shader_stage stage)
992 {
993 return stage;
994 }
995
996 static void
997 nvk_flush_shaders(struct nvk_cmd_buffer *cmd)
998 {
999 if (cmd->state.gfx.shaders_dirty == 0)
1000 return;
1001
1002 /* Map shader types to shaders */
1003 struct nvk_shader *type_shader[6] = { NULL, };
1004 uint32_t types_dirty = 0;
1005
1006 const uint32_t gfx_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1007 BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1008 BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1009 BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1010 BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1011
1012 u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1013 uint32_t type = mesa_to_nv9097_shader_type(stage);
1014 types_dirty |= BITFIELD_BIT(type);
1015
1016 /* Only copy non-NULL shaders because mesh/task alias with vertex and
1017 * tessellation stages.
1018 */
1019 if (cmd->state.gfx.shaders[stage] != NULL) {
1020 assert(type < ARRAY_SIZE(type_shader));
1021 assert(type_shader[type] == NULL);
1022 type_shader[type] = cmd->state.gfx.shaders[stage];
1023 }
1024 }
1025
1026 u_foreach_bit(type, types_dirty) {
1027 struct nvk_shader *shader = type_shader[type];
1028
1029 /* We always map index == type */
1030 const uint32_t idx = type;
1031
1032 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1033 P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
1034 .enable = shader != NULL,
1035 .type = type,
1036 });
1037
1038 if (shader == NULL)
1039 continue;
1040
1041 uint64_t addr = shader->hdr_addr;
1042 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1043 P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
1044 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
1045 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
1046 } else {
1047 assert(addr < 0xffffffff);
1048 P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
1049 }
1050
1051 P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
1052 P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
1053 P_NVC397_SET_PIPELINE_BINDING(p, idx,
1054 nvk_pipeline_bind_group(shader->info.stage));
1055
1056 if (shader->info.stage == MESA_SHADER_FRAGMENT) {
1057 p = nvk_cmd_buffer_push(cmd, 9);
1058
1059 P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
1060 P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
1061 .fraction_of_spm_register_file_per_subtile = 0x10,
1062 .fraction_of_spm_pixel_output_buffer_per_subtile = 0x40,
1063 .fraction_of_spm_triangle_ram_per_subtile = 0x16,
1064 .fraction_of_max_quads_per_subtile = 0x20,
1065 });
1066 P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);
1067
1068 P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
1069 shader->info.fs.early_fragment_tests);
1070
1071 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1072 P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
1073 shader->info.fs.post_depth_coverage);
1074 } else {
1075 assert(!shader->info.fs.post_depth_coverage);
1076 }
1077
1078 P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
1079 .z_min_unbounded_enable = shader->info.fs.writes_depth,
1080 .z_max_unbounded_enable = shader->info.fs.writes_depth,
1081 });
1082 }
1083 }
1084
1085 const uint32_t vtg_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1086 BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1087 BITFIELD_BIT(MESA_SHADER_GEOMETRY);
1088 const uint32_t vtgm_stages = vtg_stages | BITFIELD_BIT(MESA_SHADER_MESH);
1089
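/* Transform feedback state comes from the last enabled pre-rasterization
 * stage.  If no such stage is bound, zero the component counts to disable
 * streamout.
 */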
1090 if (cmd->state.gfx.shaders_dirty & vtg_stages) {
1091 struct nak_xfb_info *xfb = NULL;
1092 u_foreach_bit(stage, vtg_stages) {
1093 if (cmd->state.gfx.shaders[stage] != NULL)
1094 xfb = &cmd->state.gfx.shaders[stage]->info.vtg.xfb;
1095 }
1096
1097 if (xfb == NULL) {
1098 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1099 for (uint8_t b = 0; b < 4; b++)
1100 P_IMMD(p, NV9097, SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(b), 0);
1101 } else {
1102 for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
1103 const uint8_t attr_count = xfb->attr_count[b];
1104 /* upload packed varying indices in multiples of 4 bytes */
1105 const uint32_t n = DIV_ROUND_UP(attr_count, 4);
1106
1107 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 + n);
1108
1109 P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
1110 P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
1111 P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
1112 P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
1113
1114 if (n > 0) {
1115 P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
1116 P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
1117 }
1118 }
1119 }
1120 }
1121
1122 if (cmd->state.gfx.shaders_dirty & vtgm_stages) {
1123 struct nvk_shader *last_vtgm = NULL;
1124 u_foreach_bit(stage, vtgm_stages) {
1125 if (cmd->state.gfx.shaders[stage] != NULL)
1126 last_vtgm = cmd->state.gfx.shaders[stage];
1127 }
1128
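/* Pre-rasterization output state (RT layer selection, point size, and user
 * clip/cull planes) follows the last enabled geometry-pipeline stage.
 */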
1129 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1130
1131 P_IMMD(p, NV9097, SET_RT_LAYER, {
1132 .v = 0,
1133 .control = last_vtgm->info.vtg.writes_layer ?
1134 CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
1135 CONTROL_V_SELECTS_LAYER,
1136 });
1137
1138 P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, {
1139 .enable = last_vtgm->info.vtg.writes_point_size,
1140 .slot = 0,
1141 });
1142
1143 const uint8_t clip_enable = last_vtgm->info.vtg.clip_enable;
1144 const uint8_t cull_enable = last_vtgm->info.vtg.cull_enable;
1145 P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
1146 .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
1147 .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
1148 .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
1149 .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
1150 .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
1151 .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
1152 .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
1153 .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
1154 });
1155 P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
1156 .plane0 = (cull_enable >> 0) & 1,
1157 .plane1 = (cull_enable >> 1) & 1,
1158 .plane2 = (cull_enable >> 2) & 1,
1159 .plane3 = (cull_enable >> 3) & 1,
1160 .plane4 = (cull_enable >> 4) & 1,
1161 .plane5 = (cull_enable >> 5) & 1,
1162 .plane6 = (cull_enable >> 6) & 1,
1163 .plane7 = (cull_enable >> 7) & 1,
1164 });
1165 }
1166
1167 cmd->state.gfx.shaders_dirty = 0;
1168 }
1169
1170 static void
1171 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1172 {
1173 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1174 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1175 const struct vk_dynamic_graphics_state *dyn =
1176 &cmd->vk.dynamic_graphics_state;
1177
1178 struct nv_push *p = nvk_cmd_buffer_push(cmd, 256);
1179
1180 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1181 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1182 u_foreach_bit(a, dyn->vi->attributes_valid) {
1183 const struct nvk_va_format *fmt =
1184 nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1185
1186 P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1187 .stream = dyn->vi->attributes[a].binding,
1188 .offset = dyn->vi->attributes[a].offset,
1189 .component_bit_widths = fmt->bit_widths,
1190 .numerical_type = fmt->type,
1191 .swap_r_and_b = fmt->swap_rb,
1192 });
1193 }
1194
1195 u_foreach_bit(b, dyn->vi->bindings_valid) {
1196 const bool instanced = dyn->vi->bindings[b].input_rate ==
1197 VK_VERTEX_INPUT_RATE_INSTANCE;
1198 P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1199 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1200 dyn->vi->bindings[b].divisor);
1201 }
1202 }
1203
1204 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1205 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1206 for (uint32_t b = 0; b < 32; b++) {
1207 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
1208 .stride = dyn->vi_binding_strides[b],
1209 .enable = (dyn->vi->bindings_valid & BITFIELD_BIT(b)) != 0,
1210 });
1211 }
1212 }
1213 }
1214
1215 static void
1216 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1217 {
1218 const struct vk_dynamic_graphics_state *dyn =
1219 &cmd->vk.dynamic_graphics_state;
1220
1221 /* Nothing to do for MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY */
1222
1223 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1224 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1225 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1226 dyn->ia.primitive_restart_enable);
1227 }
1228 }
1229
1230 static void
1231 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1232 {
1233 const struct vk_dynamic_graphics_state *dyn =
1234 &cmd->vk.dynamic_graphics_state;
1235 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1236
1237 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1238 /* The hardware gets grumpy if we set this to 0 so make sure we set it
1239 * to at least 1 in case it's dirty but uninitialized.
1240 */
1241 P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1242 }
1243
1244 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1245 const struct nvk_shader *shader =
1246 cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
1247
1248 if (shader != NULL) {
1249 enum nak_ts_prims prims = shader->info.ts.prims;
1250 /* When the origin is lower-left, we have to flip the winding order */
1251 if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
1252 if (prims == NAK_TS_PRIMS_TRIANGLES_CW)
1253 prims = NAK_TS_PRIMS_TRIANGLES_CCW;
1254 else if (prims == NAK_TS_PRIMS_TRIANGLES_CCW)
1255 prims = NAK_TS_PRIMS_TRIANGLES_CW;
1256 }
1257 P_MTHD(p, NV9097, SET_TESSELLATION_PARAMETERS);
1258 P_NV9097_SET_TESSELLATION_PARAMETERS(p, {
1259 shader->info.ts.domain,
1260 shader->info.ts.spacing,
1261 prims
1262 });
1263 }
1264 }
1265 }
1266
1267 static void
1268 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1269 {
1270 const struct vk_dynamic_graphics_state *dyn =
1271 &cmd->vk.dynamic_graphics_state;
1272
1273 struct nv_push *p =
1274 nvk_cmd_buffer_push(cmd, 16 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1275
1276 /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1277
1278 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1279 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1280 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1281 const VkViewport *vp = &dyn->vp.viewports[i];
1282
1283 /* These exactly match the spec values. NVIDIA hardware oddities
1284 * are accounted for later.
1285 */
1286 const float o_x = vp->x + 0.5f * vp->width;
1287 const float o_y = vp->y + 0.5f * vp->height;
1288 const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1289 vp->minDepth :
1290 (vp->maxDepth + vp->minDepth) * 0.5f;
1291
1292 const float p_x = vp->width;
1293 const float p_y = vp->height;
1294 const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1295 vp->maxDepth - vp->minDepth :
1296 (vp->maxDepth - vp->minDepth) * 0.5f;
1297
1298 P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1299 P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1300 P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1301 P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1302
1303 P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1304 P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1305 P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1306
1307 float xmin = vp->x;
1308 float xmax = vp->x + vp->width;
1309 float ymin = MIN2(vp->y, vp->y + vp->height);
1310 float ymax = MAX2(vp->y, vp->y + vp->height);
1311 float zmin = MIN2(vp->minDepth, vp->maxDepth);
1312 float zmax = MAX2(vp->minDepth, vp->maxDepth);
1313 assert(xmin <= xmax && ymin <= ymax);
1314
1315 const float max_dim = (float)0xffff;
1316 xmin = CLAMP(xmin, 0, max_dim);
1317 xmax = CLAMP(xmax, 0, max_dim);
1318 ymin = CLAMP(ymin, 0, max_dim);
1319 ymax = CLAMP(ymax, 0, max_dim);
1320
1321 P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1322 P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1323 .x0 = xmin,
1324 .width = xmax - xmin,
1325 });
1326 P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1327 .y0 = ymin,
1328 .height = ymax - ymin,
1329 });
1330 P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1331 P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1332
1333 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1334 P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1335 .x = X_POS_X,
1336 .y = Y_POS_Y,
1337 .z = Z_POS_Z,
1338 .w = W_POS_W,
1339 });
1340 }
1341 }
1342 }
1343
1344 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1345 P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1346 dyn->vp.depth_clip_negative_one_to_one ?
1347 RANGE_NEGATIVE_W_TO_POSITIVE_W :
1348 RANGE_ZERO_TO_POSITIVE_W);
1349 }
1350
1351 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1352 for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1353 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1354 }
1355
1356 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1357 for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1358 const VkRect2D *s = &dyn->vp.scissors[i];
1359
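/* Clamp to 16384, which appears to be the maximum render area dimension
 * the scissor registers accept.
 */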
1360 const uint32_t xmin = MIN2(16384, s->offset.x);
1361 const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1362 const uint32_t ymin = MIN2(16384, s->offset.y);
1363 const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1364
1365 P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1366 P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1367 P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1368 .xmin = xmin,
1369 .xmax = xmax,
1370 });
1371 P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1372 .ymin = ymin,
1373 .ymax = ymax,
1374 });
1375 }
1376 }
1377 }
1378
1379 static uint32_t
1380 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1381 {
1382 ASSERTED uint16_t vk_to_nv9097[] = {
1383 [VK_POLYGON_MODE_FILL] = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1384 [VK_POLYGON_MODE_LINE] = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1385 [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1386 };
1387 assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1388
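/* The NV9097_SET_FRONT_POLYGON_MODE_V_* values are 0x1b00..0x1b02 in the
 * reverse order of VkPolygonMode, so the value can be computed
 * arithmetically; the table above only backs the asserts.
 */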
1389 uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1390 assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1391 return nv9097_mode;
1392 }
1393
1394 static uint32_t
1395 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1396 {
1397 static const uint16_t vk_to_nv9097[] = {
1398 [VK_CULL_MODE_FRONT_BIT] = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1399 [VK_CULL_MODE_BACK_BIT] = NV9097_OGL_SET_CULL_FACE_V_BACK,
1400 [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1401 };
1402 assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1403 return vk_to_nv9097[vk_cull_mode];
1404 }
1405
1406 static uint32_t
1407 vk_to_nv9097_front_face(VkFrontFace vk_face)
1408 {
1409 /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1410 * convention in which framebuffer coordinates always start in the upper
1411 * left while OpenGL has framebuffer coordinates starting in the lower
1412 * left. Therefore, we want the reverse of the hardware enum name.
1413 */
1414 ASSERTED static const uint16_t vk_to_nv9097[] = {
1415 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1416 [VK_FRONT_FACE_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CW,
1417 };
1418 assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1419
1420 uint32_t nv9097_face = 0x900 | (1 - vk_face);
1421 assert(nv9097_face == vk_to_nv9097[vk_face]);
1422 return nv9097_face;
1423 }
1424
1425 static uint32_t
1426 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1427 {
1428 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1429 NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1430 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1431 NV9097_SET_PROVOKING_VERTEX_V_LAST);
1432 return vk_mode;
1433 }
1434
1435 static void
1436 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
1437 {
1438 struct nv_push *p = nvk_cmd_buffer_push(cmd, 40);
1439
1440 const struct vk_dynamic_graphics_state *dyn =
1441 &cmd->vk.dynamic_graphics_state;
1442
1443 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
1444 P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
1445
1446 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
1447 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
1448 const bool z_clamp = dyn->rs.depth_clamp_enable;
1449 const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
1450 P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
1451 /* TODO: Fix pre-Volta
1452 *
1453 * This probably involves a few macros, one which stashes viewport
1454 * min/maxDepth in scratch states and one which goes here and
1455 * emits either min/maxDepth or -/+INF as needed.
1456 */
1457 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
1458 .z_clip_range = nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A
1459 ? ((z_clamp || z_clip)
1460 ? Z_CLIP_RANGE_MIN_Z_MAX_Z
1461 : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
1462 : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
1463
1464 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
1465 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
1466
1467 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
1468 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
1469 .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
1470 : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
1471
1472 /* We clip depth with the geometry clipper to ensure that it gets
1473 * clipped before depth bias is applied. If we leave it up to the
1474 * rasterizer clipper (pixel_min/max_z = CLIP), it will clip according
1475 * to the post-bias Z value which is wrong. In order to always get
1476 * the geometry clipper, we need to set a tight guardband
1477 * (geometry_guardband_z = SCALE_1).
1478 */
1479 .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
1480 : GEOMETRY_GUARDBAND_Z_SCALE_256,
1481 });
1482 }
1483
1484 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
1485 uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
1486 P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
1487 P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
1488 P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
1489 }
1490
1491 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
1492 P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
1493
1494 if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
1495 uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
1496 P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
1497 }
1498 }
1499
1500 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
1501 P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
1502 vk_to_nv9097_front_face(dyn->rs.front_face));
1503 }
1504
1505 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
1506 P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
1507 vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
1508 }
1509
1510 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
1511 P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
1512 P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
1513 P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
1514 P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
1515 }
1516
1517 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
1518 switch (dyn->rs.depth_bias.representation) {
1519 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
1520 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
1521 DEPTH_FORMAT_DEPENDENT_TRUE);
1522 break;
1523 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
1524 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
1525 DEPTH_FORMAT_DEPENDENT_FALSE);
1526 break;
1527 case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
1528 default:
1529 unreachable("Unsupported depth bias representation");
1530 }
1531 /* TODO: The blob multiplies by 2 for some reason. We don't. */
1532 P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant));
1533 P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope));
1534 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
1535 }
1536
1537 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
1538 P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
1539 P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
1540 P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
1541 }
1542
1543 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
1544 switch (dyn->rs.line.mode) {
1545 case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
1546 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
1547 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
1548 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
1549 break;
1550
1551 case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
1552 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
1553 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
1554 break;
1555
1556 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
1557 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
1558 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
1559 break;
1560
1561 default:
1562 unreachable("Invalid line rasterization mode");
1563 }
1564 }
1565
1566 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
1567 P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
1568
1569 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
1570 /* map factor from [1,256] to [0, 255] */
1571 uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
1572 P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
1573 .factor = stipple_factor,
1574 .pattern = dyn->rs.line.stipple.pattern,
1575 });
1576 }
1577
1578 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
1579 P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
1580 }
1581
1582 static VkSampleLocationEXT
1583 vk_sample_location(const struct vk_sample_locations_state *sl,
1584 uint32_t x, uint32_t y, uint32_t s)
1585 {
1586 x = x % sl->grid_size.width;
1587 y = y % sl->grid_size.height;
1588
1589 return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
1590 }
1591
1592 static struct nvk_sample_location
1593 vk_to_nvk_sample_location(VkSampleLocationEXT loc)
1594 {
1595 return (struct nvk_sample_location) {
1596 .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
1597 .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
1598 };
1599 }
1600
1601 static void
1602 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
1603 {
1604 struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1605 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
1606 const struct vk_dynamic_graphics_state *dyn =
1607 &cmd->vk.dynamic_graphics_state;
1608
1609 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
1610 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1611
1612 /* When we don't have any attachments, we can't know the sample count
1613 * from the render pass so we need to emit SET_ANTI_ALIAS here. See the
1614 * comment in nvk_BeginRendering() for more details.
1615 */
1616 if (render->samples == 0) {
1617 /* Multisample information MAY be missing (rasterizationSamples == 0)
1618 * if rasterizer discard is enabled. However, this isn't valid in
1619 * the hardware so always use at least one sample.
1620 */
1621 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
1622 enum nil_sample_layout layout = nil_choose_sample_layout(samples);
1623 P_IMMD(p, NV9097, SET_ANTI_ALIAS, nil_to_nv9097_samples_mode(layout));
1624 } else {
1625 /* Multisample information MAY be missing (rasterizationSamples == 0)
1626 * if rasterizer discard is enabled.
1627 */
1628 assert(dyn->ms.rasterization_samples == 0 ||
1629 dyn->ms.rasterization_samples == render->samples);
1630 }
1631
1632 struct nvk_shader *fs = cmd->state.gfx.shaders[MESA_SHADER_FRAGMENT];
1633 const float min_sample_shading = fs != NULL ? fs->min_sample_shading : 0;
1634 uint32_t min_samples = ceilf(dyn->ms.rasterization_samples *
1635 min_sample_shading);
1636 min_samples = util_next_power_of_two(MAX2(1, min_samples));
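      /* Worked example: with 4x MSAA and minSampleShading = 0.4, this gives
       * ceil(4 * 0.4) = 2, already a power of two, so the hardware runs two
       * shading passes with per-pass centroid interpolation.
       */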
1637
1638 P_IMMD(p, NV9097, SET_HYBRID_ANTI_ALIAS_CONTROL, {
1639 .passes = min_samples,
1640 .centroid = min_samples > 1 ? CENTROID_PER_PASS
1641 : CENTROID_PER_FRAGMENT,
1642 });
1643 }
1644
1645 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
1646 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
1647 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1648 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
1649 .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
1650 .alpha_to_one = dyn->ms.alpha_to_one_enable,
1651 });
1652 }
1653
1654 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
1655 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
1656 const struct vk_sample_locations_state *sl;
1657 if (dyn->ms.sample_locations_enable) {
1658 sl = dyn->ms.sample_locations;
1659 } else {
1660 sl = vk_standard_sample_locations_state(dyn->ms.rasterization_samples);
1661 }
1662
1663 for (uint32_t i = 0; i < sl->per_pixel; i++) {
1664 desc->root.draw.sample_locations[i] =
1665 vk_to_nvk_sample_location(sl->locations[i]);
1666 }
1667
1668 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1669 struct nvk_sample_location loc[16];
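         /* The hardware takes 16 programmable positions.  Pack all per-pixel
          * samples for one pixel together, walking a 2x2 pixel grid in
          * row-major order; vk_sample_location() wraps the grid so smaller
          * sample-location grids repeat as needed.
          */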
1670 for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
1671 const uint32_t s = n % sl->per_pixel;
1672 const uint32_t px = n / sl->per_pixel;
1673 const uint32_t x = px % 2;
1674 const uint32_t y = px / 2;
1675
1676 loc[n] = vk_to_nvk_sample_location(vk_sample_location(sl, x, y, s));
1677 }
1678
1679 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
1680
1681 P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
1682 for (uint32_t i = 0; i < 4; i++) {
1683 P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
1684 .x0 = loc[i * 4 + 0].x_u4,
1685 .y0 = loc[i * 4 + 0].y_u4,
1686 .x1 = loc[i * 4 + 1].x_u4,
1687 .y1 = loc[i * 4 + 1].y_u4,
1688 .x2 = loc[i * 4 + 2].x_u4,
1689 .y2 = loc[i * 4 + 2].y_u4,
1690 .x3 = loc[i * 4 + 3].x_u4,
1691 .y3 = loc[i * 4 + 3].y_u4,
1692 });
1693 }
1694 }
1695 }
1696
1697 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
1698 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
1699 P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
1700 P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
1701 P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
1702 P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
1703 P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
1704 }
1705 }
1706
1707 static uint32_t
1708 vk_to_nv9097_compare_op(VkCompareOp vk_op)
1709 {
1710 ASSERTED static const uint16_t vk_to_nv9097[] = {
1711 [VK_COMPARE_OP_NEVER] = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
1712 [VK_COMPARE_OP_LESS] = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
1713 [VK_COMPARE_OP_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
1714 [VK_COMPARE_OP_LESS_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
1715 [VK_COMPARE_OP_GREATER] = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
1716 [VK_COMPARE_OP_NOT_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
1717 [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
1718 [VK_COMPARE_OP_ALWAYS] = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
1719 };
1720 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1721
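   /* The NV9097 OGL_* depth funcs are 0x200 plus an offset that matches
    * VkCompareOp's numbering, so the value can be computed directly; the
    * table above only exists to back the assert.
    */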
1722 uint32_t nv9097_op = 0x200 | vk_op;
1723 assert(nv9097_op == vk_to_nv9097[vk_op]);
1724 return nv9097_op;
1725 }
1726
1727 static uint32_t
1728 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
1729 {
1730 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
1731 ASSERTED static const uint16_t vk_to_nv9097[] = {
1732 OP(KEEP, D3D_KEEP),
1733 OP(ZERO, D3D_ZERO),
1734 OP(REPLACE, D3D_REPLACE),
1735 OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
1736 OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
1737 OP(INVERT, D3D_INVERT),
1738 OP(INCREMENT_AND_WRAP, D3D_INCR),
1739 OP(DECREMENT_AND_WRAP, D3D_DECR),
1740 };
1741 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1742 #undef OP
1743
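   /* The D3D_* stencil ops are numbered 1 through 8 in the same order as
    * VkStencilOp, so a simple +1 does the translation; again, the table is
    * only there for the assert.
    */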
1744 uint32_t nv9097_op = vk_op + 1;
1745 assert(nv9097_op == vk_to_nv9097[vk_op]);
1746 return nv9097_op;
1747 }
1748
1749 static void
1750 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
1751 {
1752 struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
1753
1754 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
1755 const struct vk_dynamic_graphics_state *dyn =
1756 &cmd->vk.dynamic_graphics_state;
1757
1758 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
1759 bool enable = dyn->ds.depth.test_enable &&
1760 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1761 P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
1762 }
1763
1764 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
1765 bool enable = dyn->ds.depth.write_enable &&
1766 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1767 P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
1768 }
1769
1770 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
1771 const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
1772 P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
1773 }
1774
1775 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
1776 bool enable = dyn->ds.depth.bounds_test.enable &&
1777 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1778 P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
1779 }
1780
1781 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
1782 P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
1783 P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
1784 P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
1785 }
1786
1787 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
1788 bool enable = dyn->ds.stencil.test_enable &&
1789 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
1790 P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
1791 }
1792
1793 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
1794 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
1795 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
1796 P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
1797 P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
1798 P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
1799 P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
1800 P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
1801
1802 P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
1803 P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
1804 P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
1805 P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
1806 P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
1807 }
1808
1809 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
1810 P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
1811 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
1812 }
1813
1814 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
1815 P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
1816 P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
1817 }
1818
1819 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
1820 P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
1821 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
1822 }
1823 }
1824
1825 static uint32_t
1826 vk_to_nv9097_logic_op(VkLogicOp vk_op)
1827 {
1828 ASSERTED uint16_t vk_to_nv9097[] = {
1829 [VK_LOGIC_OP_CLEAR] = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
1830 [VK_LOGIC_OP_AND] = NV9097_SET_LOGIC_OP_FUNC_V_AND,
1831 [VK_LOGIC_OP_AND_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
1832 [VK_LOGIC_OP_COPY] = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
1833 [VK_LOGIC_OP_AND_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
1834 [VK_LOGIC_OP_NO_OP] = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
1835 [VK_LOGIC_OP_XOR] = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
1836 [VK_LOGIC_OP_OR] = NV9097_SET_LOGIC_OP_FUNC_V_OR,
1837 [VK_LOGIC_OP_NOR] = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
1838 [VK_LOGIC_OP_EQUIVALENT] = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
1839 [VK_LOGIC_OP_INVERT] = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
1840 [VK_LOGIC_OP_OR_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
1841 [VK_LOGIC_OP_COPY_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
1842 [VK_LOGIC_OP_OR_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
1843 [VK_LOGIC_OP_NAND] = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
1844 [VK_LOGIC_OP_SET] = NV9097_SET_LOGIC_OP_FUNC_V_SET,
1845 };
1846 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1847
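   /* The logic-op funcs reuse the GL enum values (GL_CLEAR = 0x1500, etc.),
    * and VkLogicOp follows the same ordering, so OR-ing in 0x1500 is enough.
    */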
1848 uint32_t nv9097_op = 0x1500 | vk_op;
1849 assert(nv9097_op == vk_to_nv9097[vk_op]);
1850 return nv9097_op;
1851 }
1852
1853 static uint32_t
1854 vk_to_nv9097_blend_op(VkBlendOp vk_op)
1855 {
1856 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
1857 ASSERTED uint16_t vk_to_nv9097[] = {
1858 OP(ADD, FUNC_ADD),
1859 OP(SUBTRACT, FUNC_SUBTRACT),
1860 OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
1861 OP(MIN, MIN),
1862 OP(MAX, MAX),
1863 };
1864 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1865 #undef OP
1866
1867 return vk_to_nv9097[vk_op];
1868 }
1869
1870 static uint32_t
1871 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
1872 {
1873 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
1874 NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
1875 ASSERTED uint16_t vk_to_nv9097[] = {
1876 FACTOR(ZERO, OGL_ZERO),
1877 FACTOR(ONE, OGL_ONE),
1878 FACTOR(SRC_COLOR, OGL_SRC_COLOR),
1879 FACTOR(ONE_MINUS_SRC_COLOR, OGL_ONE_MINUS_SRC_COLOR),
1880 FACTOR(DST_COLOR, OGL_DST_COLOR),
1881 FACTOR(ONE_MINUS_DST_COLOR, OGL_ONE_MINUS_DST_COLOR),
1882 FACTOR(SRC_ALPHA, OGL_SRC_ALPHA),
1883 FACTOR(ONE_MINUS_SRC_ALPHA, OGL_ONE_MINUS_SRC_ALPHA),
1884 FACTOR(DST_ALPHA, OGL_DST_ALPHA),
1885 FACTOR(ONE_MINUS_DST_ALPHA, OGL_ONE_MINUS_DST_ALPHA),
1886 FACTOR(CONSTANT_COLOR, OGL_CONSTANT_COLOR),
1887 FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
1888 FACTOR(CONSTANT_ALPHA, OGL_CONSTANT_ALPHA),
1889 FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
1890 FACTOR(SRC_ALPHA_SATURATE, OGL_SRC_ALPHA_SATURATE),
1891 FACTOR(SRC1_COLOR, OGL_SRC1COLOR),
1892 FACTOR(ONE_MINUS_SRC1_COLOR, OGL_INVSRC1COLOR),
1893 FACTOR(SRC1_ALPHA, OGL_SRC1ALPHA),
1894 FACTOR(ONE_MINUS_SRC1_ALPHA, OGL_INVSRC1ALPHA),
1895 };
1896 assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
1897 #undef FACTOR
1898
1899 return vk_to_nv9097[vk_factor];
1900 }
1901
1902 void
1903 nvk_mme_set_write_mask(struct mme_builder *b)
1904 {
1905 struct mme_value count = mme_load(b);
1906 struct mme_value mask = mme_load(b);
1907
1908 /*
1909 * mask is a bit field
1910 *
1911 * attachment index 88887777666655554444333322221111
1912 * component abgrabgrabgrabgrabgrabgrabgrabgr
1913 */
1914
1915 struct mme_value common_mask = mme_mov(b, mme_imm(1));
1916 struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
1917 struct mme_value i = mme_mov(b, mme_zero());
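   /* 'first' caches attachment 0's 4-bit mask; common_mask stays 1 only if
    * every attachment's mask matches it, which decides the
    * SINGLE_CT_WRITE_CONTROL value emitted at the end.
    */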
1918
1919 mme_while(b, ine, i, count) {
1920 /*
1921 We call NV9097_SET_CT_WRITE per attachment. It expects a value laid out as:
1922 0x0000 0000 0000 0000 000a 000b 000g 000r
1923
1924 So for i=0 a mask of
1925 0x0000 0000 0000 0000 0000 0000 0000 1111
1926 becomes
1927 0x0000 0000 0000 0000 0001 0001 0001 0001
1928 */
1929
1930 struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
1931 mme_merge_to(b, val, val, mask, 4, 1, 1);
1932 mme_merge_to(b, val, val, mask, 8, 1, 2);
1933 mme_merge_to(b, val, val, mask, 12, 1, 3);
1934
1935 mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
1936 mme_emit(b, val);
1937 mme_free_reg(b, val);
1938
1939 /* Check if all masks are common */
1940 struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
1941 mme_if(b, ine, first, temp) {
1942 mme_mov_to(b, common_mask, mme_zero());
1943 }
1944 mme_free_reg(b, temp);
1945
1946 mme_srl_to(b, mask, mask, mme_imm(4));
1947
1948 mme_add_to(b, i, i, mme_imm(1));
1949 }
1950
1951 mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
1952 mme_emit(b, common_mask);
1953 }
1954
1955 static void
1956 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
1957 {
1958 struct nvk_rendering_state *render = &cmd->state.gfx.render;
1959 const struct vk_dynamic_graphics_state *dyn =
1960 &cmd->vk.dynamic_graphics_state;
1961
1962 struct nv_push *p =
1963 nvk_cmd_buffer_push(cmd, 13 + 10 * render->color_att_count);
1964
1965 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
1966 P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
1967
1968 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
1969 const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
1970 P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
1971 }
1972
1973 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
1974 for (uint8_t a = 0; a < render->color_att_count; a++) {
1975 P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
1976 }
1977 }
1978
1979 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
1980 for (uint8_t a = 0; a < render->color_att_count; a++) {
1981 const struct vk_color_blend_attachment_state *att =
1982 &dyn->cb.attachments[a];
1983 P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
1984 P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
1985 P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
1986 vk_to_nv9097_blend_op(att->color_blend_op));
1987 P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
1988 vk_to_nv9097_blend_factor(att->src_color_blend_factor));
1989 P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
1990 vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
1991 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
1992 vk_to_nv9097_blend_op(att->alpha_blend_op));
1993 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
1994 vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
1995 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
1996 vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
1997 }
1998 }
1999
2000 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
2001 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
2002 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS)) {
2003 uint32_t color_write_enables = 0x0;
2004 for (uint8_t a = 0; a < render->color_att_count; a++) {
2005 if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
2006 color_write_enables |= 0xf << (4 * a);
2007 }
2008
2009 uint32_t cb_att_write_mask = 0x0;
2010 for (uint8_t a = 0; a < render->color_att_count; a++)
2011 cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
2012
2013 uint32_t rp_att_write_mask = 0x0;
2014 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2015 if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
2016 rp_att_write_mask |= 0xf << (4 * a);
2017 }
2018
2019 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
2020 P_INLINE_DATA(p, render->color_att_count);
2021 P_INLINE_DATA(p, color_write_enables &
2022 cb_att_write_mask &
2023 rp_att_write_mask);
2024 }
2025
2026 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
2027 P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
2028 P_NV9097_SET_BLEND_CONST_RED(p, fui(dyn->cb.blend_constants[0]));
2029 P_NV9097_SET_BLEND_CONST_GREEN(p, fui(dyn->cb.blend_constants[1]));
2030 P_NV9097_SET_BLEND_CONST_BLUE(p, fui(dyn->cb.blend_constants[2]));
2031 P_NV9097_SET_BLEND_CONST_ALPHA(p, fui(dyn->cb.blend_constants[3]));
2032 }
2033 }
2034
2035 static void
2036 nvk_flush_dynamic_state(struct nvk_cmd_buffer *cmd)
2037 {
2038 struct vk_dynamic_graphics_state *dyn =
2039 &cmd->vk.dynamic_graphics_state;
2040
2041 if (!vk_dynamic_graphics_state_any_dirty(dyn))
2042 return;
2043
2044 nvk_flush_vi_state(cmd);
2045 nvk_flush_ia_state(cmd);
2046 nvk_flush_ts_state(cmd);
2047 nvk_flush_vp_state(cmd);
2048 nvk_flush_rs_state(cmd);
2049
2050 /* MESA_VK_DYNAMIC_FSR */
2051
2052 nvk_flush_ms_state(cmd);
2053 nvk_flush_ds_state(cmd);
2054 nvk_flush_cb_state(cmd);
2055
2056 vk_dynamic_graphics_state_clear_dirty(dyn);
2057 }
2058
2059 void
2060 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
2061 {
2062 /* The low 4 bits are the group, bits 8:4 are the slot */
2063 struct mme_value group_slot = mme_load(b);
2064
2065 if (b->devinfo->cls_eng3d >= TURING_A) {
2066 struct mme_value64 addr = mme_load_addr64(b);
2067 mme_tu104_read_fifoed(b, addr, mme_imm(3));
2068 }
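   /* On Turing+, the three descriptor dwords were just read from memory into
    * the MME FIFO above; on older GPUs the caller pushes them inline after
    * the macro parameters, so the mme_load() calls below work either way.
    */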
2069
2070 /* Load the descriptor */
2071 struct mme_value addr_lo = mme_load(b);
2072 struct mme_value addr_hi = mme_load(b);
2073 struct mme_value size = mme_load(b);
2074
2075 struct mme_value cb = mme_alloc_reg(b);
2076 mme_if(b, ieq, size, mme_zero()) {
2077 /* Bottom bit is the valid bit, bits 8:4 are the shader slot */
2078 mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
2079 }
2080
2081 mme_if(b, ine, size, mme_zero()) {
2082 uint32_t alignment = nvk_min_cbuf_alignment(b->devinfo);
2083 mme_add_to(b, size, size, mme_imm(alignment - 1));
2084 mme_and_to(b, size, size, mme_imm(~(alignment - 1)));
2085
2086 /* size = MIN2(size, NVK_MAX_CBUF_SIZE) */
2087 assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
2088 struct mme_value is_large =
2089 mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
2090 mme_if(b, ine, is_large, mme_zero()) {
2091 mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
2092 }
2093
2094 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
2095 mme_emit(b, size);
2096 mme_emit(b, addr_hi);
2097 mme_emit(b, addr_lo);
2098
2099 /* Bottom bit is the valid bit, bits 8:4 are the shader slot */
2100 mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
2101 }
2102
2103 mme_free_reg(b, addr_hi);
2104 mme_free_reg(b, addr_lo);
2105 mme_free_reg(b, size);
2106
2107 /* The group comes in the bottom 4 bits in group_slot and we need to
2108 * combine it with the method. However, unlike most array methods with a
2109 * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
2110 * dwords. This means we need to also shift by 3.
2111 */
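   /* e.g. group 3: the low bits 0b0011 of group_slot merge to 0b11000 = 24
    * dwords, which lands exactly on BIND_GROUP_CONSTANT_BUFFER(3) given its
    * 8-dword stride.
    */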
2112 struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
2113 mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
2114 mme_emit(b, cb);
2115 }
2116
2117 static void
2118 nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
2119 {
2120 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2121 struct nvk_physical_device *pdev = nvk_device_physical(dev);
2122 const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
2123 struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2124 VkResult result;
2125
2126 nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
2127
2128 /* Pre-Pascal, constant buffer sizes need to be 0x100-aligned. Since we
2129  * simply allocate a buffer and upload data to it, make sure its size is
2130  * 0x100-aligned.
2131 */
2132 STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
2133 assert(sizeof(desc->root) % min_cbuf_alignment == 0);
2134
2135 void *root_desc_map;
2136 uint64_t root_desc_addr;
2137 result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
2138 min_cbuf_alignment,
2139 &root_desc_addr, &root_desc_map);
2140 if (unlikely(result != VK_SUCCESS)) {
2141 vk_command_buffer_set_error(&cmd->vk, result);
2142 return;
2143 }
2144
2145 desc->root.root_desc_addr = root_desc_addr;
2146 memcpy(root_desc_map, &desc->root, sizeof(desc->root));
2147
2148 /* Find cbuf maps for the 5 cbuf groups */
2149 const struct nvk_shader *cbuf_shaders[5] = { NULL, };
2150 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
2151 const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
2152 if (shader == NULL)
2153 continue;
2154
2155 uint32_t group = nvk_cbuf_binding_for_stage(stage);
2156 assert(group < ARRAY_SIZE(cbuf_shaders));
2157 cbuf_shaders[group] = shader;
2158 }
2159
2160 uint32_t root_cbuf_count = 0;
2161 for (uint32_t group = 0; group < ARRAY_SIZE(cbuf_shaders); group++) {
2162 if (cbuf_shaders[group] == NULL)
2163 continue;
2164
2165 const struct nvk_shader *shader = cbuf_shaders[group];
2166 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
2167
2168 for (uint32_t c = 0; c < cbuf_map->cbuf_count; c++) {
2169 const struct nvk_cbuf *cbuf = &cbuf_map->cbufs[c];
2170
2171 /* We bind these at the very end */
2172 if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
2173 root_cbuf_count++;
2174 continue;
2175 }
2176
2177 struct nvk_buffer_address ba;
2178 if (nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, shader, cbuf, &ba)) {
2179 assert(ba.base_addr % min_cbuf_alignment == 0);
2180 ba.size = align(ba.size, min_cbuf_alignment);
2181 ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
2182
2183 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2184
2185 if (ba.size > 0) {
2186 P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
2187 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
2188 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
2189 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
2190 }
2191
2192 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
2193 .valid = ba.size > 0,
2194 .shader_slot = c,
2195 });
2196 } else {
2197 uint64_t desc_addr =
2198 nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
2199
2200 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2201 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2202
2203 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
2204 P_INLINE_DATA(p, group | (c << 4));
2205 P_INLINE_DATA(p, desc_addr >> 32);
2206 P_INLINE_DATA(p, desc_addr);
2207 } else {
2208 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2209
2210 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
2211 P_INLINE_DATA(p, group | (c << 4));
2212
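               /* Pre-Turing there is no MME FIFO read, so the three
                * descriptor dwords are appended to the pushbuf indirectly
                * from desc_addr and consumed by the macro's mme_load()s.
                */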
2213 nv_push_update_count(p, 3);
2214 nvk_cmd_buffer_push_indirect(cmd, desc_addr, 3);
2215 }
2216 }
2217 }
2218 }
2219
2220 /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
2221 * always left pointing at the root descriptor table. This way draw
2222 * parameters and similar MME root table updates always hit the root
2223 * descriptor table and not some random UBO.
2224 */
2225 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4 + 2 * root_cbuf_count);
2226 P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
2227 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, sizeof(desc->root));
2228 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, root_desc_addr >> 32);
2229 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, root_desc_addr);
2230
2231 for (uint32_t group = 0; group < ARRAY_SIZE(cbuf_shaders); group++) {
2232 if (cbuf_shaders[group] == NULL)
2233 continue;
2234
2235 const struct nvk_cbuf_map *cbuf_map = &cbuf_shaders[group]->cbuf_map;
2236
2237 for (uint32_t c = 0; c < cbuf_map->cbuf_count; c++) {
2238 const struct nvk_cbuf *cbuf = &cbuf_map->cbufs[c];
2239 if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
2240 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
2241 .valid = VALID_TRUE,
2242 .shader_slot = c,
2243 });
2244 }
2245 }
2246 }
2247 }
2248
2249 static void
2250 nvk_flush_gfx_state(struct nvk_cmd_buffer *cmd)
2251 {
2252 nvk_flush_shaders(cmd);
2253 nvk_flush_dynamic_state(cmd);
2254 nvk_flush_descriptors(cmd);
2255 }
2256
2257 static uint32_t
2258 vk_to_nv_index_format(VkIndexType type)
2259 {
2260 switch (type) {
2261 case VK_INDEX_TYPE_UINT16:
2262 return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES;
2263 case VK_INDEX_TYPE_UINT32:
2264 return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES;
2265 case VK_INDEX_TYPE_UINT8_KHR:
2266 return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE;
2267 default:
2268 unreachable("Invalid index type");
2269 }
2270 }
2271
2272 VKAPI_ATTR void VKAPI_CALL
2273 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
2274 VkBuffer _buffer,
2275 VkDeviceSize offset,
2276 VkDeviceSize size,
2277 VkIndexType indexType)
2278 {
2279 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2280 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2281
2282 struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
2283
2284 uint64_t addr, range;
2285 if (buffer != NULL && size > 0) {
2286 addr = nvk_buffer_address(buffer, offset);
2287 range = vk_buffer_range(&buffer->vk, offset, size);
2288 } else {
2289 range = addr = 0;
2290 }
2291
2292 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_INDEX,
2293 vk_index_to_restart(indexType));
2294
2295 P_MTHD(p, NV9097, SET_INDEX_BUFFER_A);
2296 P_NV9097_SET_INDEX_BUFFER_A(p, addr >> 32);
2297 P_NV9097_SET_INDEX_BUFFER_B(p, addr);
2298
2299 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2300 P_MTHD(p, NVC597, SET_INDEX_BUFFER_SIZE_A);
2301 P_NVC597_SET_INDEX_BUFFER_SIZE_A(p, range >> 32);
2302 P_NVC597_SET_INDEX_BUFFER_SIZE_B(p, range);
2303 } else {
2304 /* TODO: What about robust zero-size buffers? */
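      /* Older classes use SET_INDEX_BUFFER_C/D, which take an inclusive
       * limit (last byte) address rather than a size.
       */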
2305 const uint64_t limit = range > 0 ? addr + range - 1 : 0;
2306 P_MTHD(p, NV9097, SET_INDEX_BUFFER_C);
2307 P_NV9097_SET_INDEX_BUFFER_C(p, limit >> 32);
2308 P_NV9097_SET_INDEX_BUFFER_D(p, limit);
2309 }
2310
2311 P_IMMD(p, NV9097, SET_INDEX_BUFFER_E, vk_to_nv_index_format(indexType));
2312 }
2313
2314 void
2315 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
2316 struct nvk_addr_range addr_range)
2317 {
2318 /* Used for meta save/restore */
2319 if (vb_idx == 0)
2320 cmd->state.gfx.vb0 = addr_range;
2321
2322 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2323
2324 P_MTHD(p, NV9097, SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
2325 P_NV9097_SET_VERTEX_STREAM_A_LOCATION_A(p, vb_idx, addr_range.addr >> 32);
2326 P_NV9097_SET_VERTEX_STREAM_A_LOCATION_B(p, vb_idx, addr_range.addr);
2327
2328 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2329 P_MTHD(p, NVC597, SET_VERTEX_STREAM_SIZE_A(vb_idx));
2330 P_NVC597_SET_VERTEX_STREAM_SIZE_A(p, vb_idx, addr_range.range >> 32);
2331 P_NVC597_SET_VERTEX_STREAM_SIZE_B(p, vb_idx, addr_range.range);
2332 } else {
2333 /* TODO: What about robust zero-size buffers? */
2334 const uint64_t limit = addr_range.range > 0 ?
2335 addr_range.addr + addr_range.range - 1 : 0;
2336 P_MTHD(p, NV9097, SET_VERTEX_STREAM_LIMIT_A_A(vb_idx));
2337 P_NV9097_SET_VERTEX_STREAM_LIMIT_A_A(p, vb_idx, limit >> 32);
2338 P_NV9097_SET_VERTEX_STREAM_LIMIT_A_B(p, vb_idx, limit);
2339 }
2340 }
2341
2342 VKAPI_ATTR void VKAPI_CALL
2343 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
2344 uint32_t firstBinding,
2345 uint32_t bindingCount,
2346 const VkBuffer *pBuffers,
2347 const VkDeviceSize *pOffsets,
2348 const VkDeviceSize *pSizes,
2349 const VkDeviceSize *pStrides)
2350 {
2351 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2352
2353 if (pStrides) {
2354 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
2355 bindingCount, pStrides);
2356 }
2357
2358 for (uint32_t i = 0; i < bindingCount; i++) {
2359 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
2360 uint32_t idx = firstBinding + i;
2361
2362 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
2363 const struct nvk_addr_range addr_range =
2364 nvk_buffer_addr_range(buffer, pOffsets[i], size);
2365
2366 nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
2367 }
2368 }
2369
2370 static uint32_t
2371 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
2372 {
2373 switch (prim) {
2374 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
2375 return NV9097_BEGIN_OP_POINTS;
2376 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
2377 return NV9097_BEGIN_OP_LINES;
2378 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
2379 return NV9097_BEGIN_OP_LINE_STRIP;
2380 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
2381 #pragma GCC diagnostic push
2382 #pragma GCC diagnostic ignored "-Wswitch"
2383 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
2384 #pragma GCC diagnostic pop
2385 return NV9097_BEGIN_OP_TRIANGLES;
2386 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
2387 return NV9097_BEGIN_OP_TRIANGLE_STRIP;
2388 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
2389 return NV9097_BEGIN_OP_TRIANGLE_FAN;
2390 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
2391 return NV9097_BEGIN_OP_LINELIST_ADJCY;
2392 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
2393 return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
2394 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
2395 return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
2396 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
2397 return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
2398 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
2399 return NV9097_BEGIN_OP_PATCH;
2400 default:
2401 unreachable("Invalid primitive topology");
2402 }
2403 }
2404
2405 struct mme_draw_params {
2406 struct mme_value base_vertex;
2407 struct mme_value first_vertex;
2408 struct mme_value first_instance;
2409 struct mme_value draw_idx;
2410 };
2411
2412 static void
2413 nvk_mme_build_set_draw_params(struct mme_builder *b,
2414 const struct mme_draw_params *p)
2415 {
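   /* Write the draw parameters (first vertex/instance, draw index, view
    * index) into the root descriptor cbuf.  This relies on
    * SET_CONSTANT_BUFFER_SELECTOR still pointing at the root descriptor
    * table; nvk_flush_descriptors() binds root descriptors last for exactly
    * this reason.
    */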
2416 const uint32_t draw_params_offset = nvk_root_descriptor_offset(draw);
2417 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2418 mme_emit(b, mme_imm(draw_params_offset));
2419 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2420 mme_emit(b, p->first_vertex);
2421 mme_emit(b, p->first_instance);
2422 mme_emit(b, p->draw_idx);
2423 mme_emit(b, mme_zero() /* view_index */);
2424
2425 mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
2426 mme_emit(b, p->base_vertex);
2427 mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
2428 mme_emit(b, p->base_vertex);
2429
2430 mme_mthd(b, NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX);
2431 mme_emit(b, p->first_instance);
2432 }
2433
2434 static void
2435 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
2436 {
2437 /* Set the push constant */
2438 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2439 mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.view_index)));
2440 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2441 mme_emit(b, view_index);
2442
2443 /* Set the layer to the view index */
2444 STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
2445 STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
2446 mme_mthd(b, NV9097_SET_RT_LAYER);
2447 mme_emit(b, view_index);
2448 }
2449
2450 static void
2451 nvk_mme_build_draw_loop(struct mme_builder *b,
2452 struct mme_value instance_count,
2453 struct mme_value first_vertex,
2454 struct mme_value vertex_count)
2455 {
2456 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
2457
2458 mme_loop(b, instance_count) {
2459 mme_mthd(b, NV9097_BEGIN);
2460 mme_emit(b, begin);
2461
2462 mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
2463 mme_emit(b, first_vertex);
2464 mme_emit(b, vertex_count);
2465
2466 mme_mthd(b, NV9097_END);
2467 mme_emit(b, mme_zero());
2468
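      /* The first BEGIN uses the INSTANCE_ID mode passed in by the caller
       * (FIRST); flipping it to SUBSEQUENT makes the hardware advance the
       * instance index on every following iteration of this instancing loop.
       */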
2469 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
2470 }
2471
2472 mme_free_reg(b, begin);
2473 }
2474
2475 static void
2476 nvk_mme_build_draw(struct mme_builder *b,
2477 struct mme_value draw_idx)
2478 {
2479 /* These are in VkDrawIndirectCommand order */
2480 struct mme_value vertex_count = mme_load(b);
2481 struct mme_value instance_count = mme_load(b);
2482 struct mme_value first_vertex = mme_load(b);
2483 struct mme_value first_instance = mme_load(b);
2484
2485 struct mme_draw_params params = {
2486 .first_vertex = first_vertex,
2487 .first_instance = first_instance,
2488 .draw_idx = draw_idx,
2489 };
2490 nvk_mme_build_set_draw_params(b, &params);
2491
2492 mme_free_reg(b, first_instance);
2493
2494 if (b->devinfo->cls_eng3d < TURING_A)
2495 nvk_mme_spill(b, DRAW_IDX, draw_idx);
2496
2497 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2498 mme_if(b, ieq, view_mask, mme_zero()) {
2499 mme_free_reg(b, view_mask);
2500
2501 nvk_mme_build_draw_loop(b, instance_count,
2502 first_vertex, vertex_count);
2503 }
2504
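   /* Multiview path: replay the draw once per view set in the view mask,
    * emitting the view index (and RT layer) before each replay.
    */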
2505 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2506 mme_if(b, ine, view_mask, mme_zero()) {
2507 mme_free_reg(b, view_mask);
2508
2509 struct mme_value view = mme_mov(b, mme_zero());
2510 mme_while(b, ine, view, mme_imm(32)) {
2511 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2512 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
2513 mme_free_reg(b, view_mask);
2514 mme_if(b, ine, has_view, mme_zero()) {
2515 mme_free_reg(b, has_view);
2516 nvk_mme_emit_view_index(b, view);
2517 nvk_mme_build_draw_loop(b, instance_count,
2518 first_vertex, vertex_count);
2519 }
2520
2521 mme_add_to(b, view, view, mme_imm(1));
2522 }
2523 mme_free_reg(b, view);
2524 }
2525
2526 mme_free_reg(b, instance_count);
2527 mme_free_reg(b, first_vertex);
2528 mme_free_reg(b, vertex_count);
2529
2530 if (b->devinfo->cls_eng3d < TURING_A)
2531 nvk_mme_unspill(b, DRAW_IDX, draw_idx);
2532 }
2533
2534 void
2535 nvk_mme_draw(struct mme_builder *b)
2536 {
2537 nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2538 struct mme_value draw_idx = mme_load(b);
2539
2540 nvk_mme_build_draw(b, draw_idx);
2541 }
2542
2543 VKAPI_ATTR void VKAPI_CALL
2544 nvk_CmdDraw(VkCommandBuffer commandBuffer,
2545 uint32_t vertexCount,
2546 uint32_t instanceCount,
2547 uint32_t firstVertex,
2548 uint32_t firstInstance)
2549 {
2550 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2551 const struct vk_dynamic_graphics_state *dyn =
2552 &cmd->vk.dynamic_graphics_state;
2553
2554 nvk_flush_gfx_state(cmd);
2555
2556 uint32_t begin;
2557 V_NV9097_BEGIN(begin, {
2558 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2559 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2560 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2561 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2562 });
2563
2564 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
2565 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
2566 P_INLINE_DATA(p, begin);
2567 P_INLINE_DATA(p, 0 /* draw_idx */);
2568 P_INLINE_DATA(p, vertexCount);
2569 P_INLINE_DATA(p, instanceCount);
2570 P_INLINE_DATA(p, firstVertex);
2571 P_INLINE_DATA(p, firstInstance);
2572 }
2573
2574 VKAPI_ATTR void VKAPI_CALL
2575 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
2576 uint32_t drawCount,
2577 const VkMultiDrawInfoEXT *pVertexInfo,
2578 uint32_t instanceCount,
2579 uint32_t firstInstance,
2580 uint32_t stride)
2581 {
2582 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2583 const struct vk_dynamic_graphics_state *dyn =
2584 &cmd->vk.dynamic_graphics_state;
2585
2586 nvk_flush_gfx_state(cmd);
2587
2588 uint32_t begin;
2589 V_NV9097_BEGIN(begin, {
2590 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2591 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2592 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2593 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2594 });
2595
2596 for (uint32_t draw_idx = 0; draw_idx < drawCount; draw_idx++) {
2597 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
2598 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
2599 P_INLINE_DATA(p, begin);
2600 P_INLINE_DATA(p, draw_idx);
2601 P_INLINE_DATA(p, pVertexInfo->vertexCount);
2602 P_INLINE_DATA(p, instanceCount);
2603 P_INLINE_DATA(p, pVertexInfo->firstVertex);
2604 P_INLINE_DATA(p, firstInstance);
2605
2606 pVertexInfo = ((void *)pVertexInfo) + stride;
2607 }
2608 }
2609
2610 static void
2611 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
2612 struct mme_value instance_count,
2613 struct mme_value first_index,
2614 struct mme_value index_count)
2615 {
2616 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
2617
2618 mme_loop(b, instance_count) {
2619 mme_mthd(b, NV9097_BEGIN);
2620 mme_emit(b, begin);
2621
2622 mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
2623 mme_emit(b, first_index);
2624 mme_emit(b, index_count);
2625
2626 mme_mthd(b, NV9097_END);
2627 mme_emit(b, mme_zero());
2628
2629 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
2630 }
2631
2632 mme_free_reg(b, begin);
2633 }
2634
2635 static void
2636 nvk_mme_build_draw_indexed(struct mme_builder *b,
2637 struct mme_value draw_idx)
2638 {
2639 /* These are in VkDrawIndexedIndirectCommand order */
2640 struct mme_value index_count = mme_load(b);
2641 struct mme_value instance_count = mme_load(b);
2642 struct mme_value first_index = mme_load(b);
2643 struct mme_value vertex_offset = mme_load(b);
2644 struct mme_value first_instance = mme_load(b);
2645
2646 struct mme_draw_params params = {
2647 .base_vertex = vertex_offset,
2648 .first_vertex = vertex_offset,
2649 .first_instance = first_instance,
2650 .draw_idx = draw_idx,
2651 };
2652 nvk_mme_build_set_draw_params(b, &params);
2653
2654 mme_free_reg(b, vertex_offset);
2655 mme_free_reg(b, first_instance);
2656
2657 if (b->devinfo->cls_eng3d < TURING_A)
2658 nvk_mme_spill(b, DRAW_IDX, draw_idx);
2659
2660 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2661 mme_if(b, ieq, view_mask, mme_zero()) {
2662 mme_free_reg(b, view_mask);
2663
2664 nvk_mme_build_draw_indexed_loop(b, instance_count,
2665 first_index, index_count);
2666 }
2667
2668 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2669 mme_if(b, ine, view_mask, mme_zero()) {
2670 mme_free_reg(b, view_mask);
2671
2672 struct mme_value view = mme_mov(b, mme_zero());
2673 mme_while(b, ine, view, mme_imm(32)) {
2674 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2675 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
2676 mme_free_reg(b, view_mask);
2677 mme_if(b, ine, has_view, mme_zero()) {
2678 mme_free_reg(b, has_view);
2679 nvk_mme_emit_view_index(b, view);
2680 nvk_mme_build_draw_indexed_loop(b, instance_count,
2681 first_index, index_count);
2682 }
2683
2684 mme_add_to(b, view, view, mme_imm(1));
2685 }
2686 mme_free_reg(b, view);
2687 }
2688
2689 mme_free_reg(b, instance_count);
2690 mme_free_reg(b, first_index);
2691 mme_free_reg(b, index_count);
2692
2693 if (b->devinfo->cls_eng3d < TURING_A)
2694 nvk_mme_unspill(b, DRAW_IDX, draw_idx);
2695 }
2696
2697 void
2698 nvk_mme_draw_indexed(struct mme_builder *b)
2699 {
2700 nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2701 struct mme_value draw_idx = mme_load(b);
2702
2703 nvk_mme_build_draw_indexed(b, draw_idx);
2704 }
2705
2706 VKAPI_ATTR void VKAPI_CALL
2707 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
2708 uint32_t indexCount,
2709 uint32_t instanceCount,
2710 uint32_t firstIndex,
2711 int32_t vertexOffset,
2712 uint32_t firstInstance)
2713 {
2714 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2715 const struct vk_dynamic_graphics_state *dyn =
2716 &cmd->vk.dynamic_graphics_state;
2717
2718 nvk_flush_gfx_state(cmd);
2719
2720 uint32_t begin;
2721 V_NV9097_BEGIN(begin, {
2722 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2723 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2724 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2725 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2726 });
2727
2728 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
2729 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
2730 P_INLINE_DATA(p, begin);
2731 P_INLINE_DATA(p, 0 /* draw_idx */);
2732 P_INLINE_DATA(p, indexCount);
2733 P_INLINE_DATA(p, instanceCount);
2734 P_INLINE_DATA(p, firstIndex);
2735 P_INLINE_DATA(p, vertexOffset);
2736 P_INLINE_DATA(p, firstInstance);
2737 }
2738
2739 VKAPI_ATTR void VKAPI_CALL
2740 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
2741 uint32_t drawCount,
2742 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
2743 uint32_t instanceCount,
2744 uint32_t firstInstance,
2745 uint32_t stride,
2746 const int32_t *pVertexOffset)
2747 {
2748 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2749 const struct vk_dynamic_graphics_state *dyn =
2750 &cmd->vk.dynamic_graphics_state;
2751
2752 nvk_flush_gfx_state(cmd);
2753
2754 uint32_t begin;
2755 V_NV9097_BEGIN(begin, {
2756 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2757 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2758 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2759 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2760 });
2761
2762 for (uint32_t draw_idx = 0; draw_idx < drawCount; draw_idx++) {
2763 const uint32_t vertex_offset =
2764 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
2765
2766 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
2767 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
2768 P_INLINE_DATA(p, begin);
2769 P_INLINE_DATA(p, draw_idx);
2770 P_INLINE_DATA(p, pIndexInfo->indexCount);
2771 P_INLINE_DATA(p, instanceCount);
2772 P_INLINE_DATA(p, pIndexInfo->firstIndex);
2773 P_INLINE_DATA(p, vertex_offset);
2774 P_INLINE_DATA(p, firstInstance);
2775
2776 pIndexInfo = ((void *)pIndexInfo) + stride;
2777 }
2778 }
2779
2780 void
2781 nvk_mme_draw_indirect(struct mme_builder *b)
2782 {
2783 nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2784
2785 if (b->devinfo->cls_eng3d >= TURING_A) {
2786 struct mme_value64 draw_addr = mme_load_addr64(b);
2787 struct mme_value draw_count = mme_load(b);
2788 struct mme_value stride = mme_load(b);
2789
2790 struct mme_value draw = mme_mov(b, mme_zero());
2791 mme_while(b, ult, draw, draw_count) {
2792 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
2793
2794 nvk_mme_build_draw(b, draw);
2795
2796 mme_add_to(b, draw, draw, mme_imm(1));
2797 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
2798 }
2799 } else {
2800 struct mme_value draw_count = mme_load(b);
2801 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
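      /* Pre-Turing, the indirect draw records are pushed inline after the
       * macro parameters.  DRAW_PAD_DW is the number of padding dwords
       * between consecutive VkDrawIndirectCommand structs (derived from the
       * stride), which we consume and discard after each draw.
       */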
2802
2803 struct mme_value draw = mme_mov(b, mme_zero());
2804 mme_while(b, ine, draw, draw_count) {
2805 nvk_mme_spill(b, DRAW_COUNT, draw_count);
2806
2807 nvk_mme_build_draw(b, draw);
2808 mme_add_to(b, draw, draw, mme_imm(1));
2809
2810 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
2811 mme_loop(b, pad_dw) {
2812 mme_free_reg(b, mme_load(b));
2813 }
2814 mme_free_reg(b, pad_dw);
2815
2816 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
2817 }
2818 }
2819 }
2820
2821 VKAPI_ATTR void VKAPI_CALL
2822 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
2823 VkBuffer _buffer,
2824 VkDeviceSize offset,
2825 uint32_t drawCount,
2826 uint32_t stride)
2827 {
2828 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2829 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2830 const struct vk_dynamic_graphics_state *dyn =
2831 &cmd->vk.dynamic_graphics_state;
2832
2833 /* From the Vulkan 1.3.238 spec:
2834 *
2835 * VUID-vkCmdDrawIndirect-drawCount-00476
2836 *
2837 * "If drawCount is greater than 1, stride must be a multiple of 4 and
2838 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
2839 *
2840 * and
2841 *
2842 * "If drawCount is less than or equal to one, stride is ignored."
2843 */
2844 if (drawCount > 1) {
2845 assert(stride % 4 == 0);
2846 assert(stride >= sizeof(VkDrawIndirectCommand));
2847 } else {
2848 stride = sizeof(VkDrawIndirectCommand);
2849 }
2850
2851 nvk_flush_gfx_state(cmd);
2852
2853 uint32_t begin;
2854 V_NV9097_BEGIN(begin, {
2855 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2856 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2857 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2858 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2859 });
2860
2861 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2862 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2863 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
2864 P_INLINE_DATA(p, begin);
2865 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2866 P_INLINE_DATA(p, draw_addr >> 32);
2867 P_INLINE_DATA(p, draw_addr);
2868 P_INLINE_DATA(p, drawCount);
2869 P_INLINE_DATA(p, stride);
2870 } else {
2871 const uint32_t max_draws_per_push =
2872 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
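      /* A single CALL_MME_MACRO carries 3 dwords of parameters plus the raw
       * draw records, and a 1INC method header can carry at most
       * NV_PUSH_MAX_COUNT dwords, so large multi-draws are split across
       * several macro calls.
       */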
2873
2874 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2875 while (drawCount) {
2876 const uint32_t count = MIN2(drawCount, max_draws_per_push);
2877
2878 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2879 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
2880 P_INLINE_DATA(p, begin);
2881 P_INLINE_DATA(p, count);
2882 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
2883
2884 uint64_t range = count * (uint64_t)stride;
2885 nv_push_update_count(p, range / 4);
2886 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
2887
2888 draw_addr += range;
2889 drawCount -= count;
2890 }
2891 }
2892 }
2893
2894 void
2895 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
2896 {
2897 nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2898
2899 if (b->devinfo->cls_eng3d >= TURING_A) {
2900 struct mme_value64 draw_addr = mme_load_addr64(b);
2901 struct mme_value draw_count = mme_load(b);
2902 struct mme_value stride = mme_load(b);
2903
2904 struct mme_value draw = mme_mov(b, mme_zero());
2905 mme_while(b, ult, draw, draw_count) {
2906 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
2907
2908 nvk_mme_build_draw_indexed(b, draw);
2909
2910 mme_add_to(b, draw, draw, mme_imm(1));
2911 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
2912 }
2913 } else {
2914 struct mme_value draw_count = mme_load(b);
2915 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
2916
2917 struct mme_value draw = mme_mov(b, mme_zero());
2918 mme_while(b, ine, draw, draw_count) {
2919 nvk_mme_spill(b, DRAW_COUNT, draw_count);
2920
2921 nvk_mme_build_draw_indexed(b, draw);
2922 mme_add_to(b, draw, draw, mme_imm(1));
2923
2924 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
2925 mme_loop(b, pad_dw) {
2926 mme_free_reg(b, mme_load(b));
2927 }
2928 mme_free_reg(b, pad_dw);
2929
2930 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
2931 }
2932 }
2933 }
2934
2935 VKAPI_ATTR void VKAPI_CALL
2936 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
2937 VkBuffer _buffer,
2938 VkDeviceSize offset,
2939 uint32_t drawCount,
2940 uint32_t stride)
2941 {
2942 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2943 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2944 const struct vk_dynamic_graphics_state *dyn =
2945 &cmd->vk.dynamic_graphics_state;
2946
2947 /* From the Vulkan 1.3.238 spec:
2948 *
2949 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
2950 *
2951 * "If drawCount is greater than 1, stride must be a multiple of 4 and
2952 * must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
2953 *
2954 * and
2955 *
2956 * "If drawCount is less than or equal to one, stride is ignored."
2957 */
2958 if (drawCount > 1) {
2959 assert(stride % 4 == 0);
2960 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
2961 } else {
2962 stride = sizeof(VkDrawIndexedIndirectCommand);
2963 }
2964
2965 nvk_flush_gfx_state(cmd);
2966
2967 uint32_t begin;
2968 V_NV9097_BEGIN(begin, {
2969 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2970 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2971 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2972 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2973 });
2974
2975 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2976 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2977 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
2978 P_INLINE_DATA(p, begin);
2979 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2980 P_INLINE_DATA(p, draw_addr >> 32);
2981 P_INLINE_DATA(p, draw_addr);
2982 P_INLINE_DATA(p, drawCount);
2983 P_INLINE_DATA(p, stride);
2984 } else {
2985 const uint32_t max_draws_per_push =
2986 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
2987
2988 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2989 while (drawCount) {
2990 const uint32_t count = MIN2(drawCount, max_draws_per_push);
2991
2992 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2993 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
2994 P_INLINE_DATA(p, begin);
2995 P_INLINE_DATA(p, count);
2996 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
2997
2998 uint64_t range = count * (uint64_t)stride;
2999 nv_push_update_count(p, range / 4);
3000 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3001
3002 draw_addr += range;
3003 drawCount -= count;
3004 }
3005 }
3006 }
3007
3008 void
3009 nvk_mme_draw_indirect_count(struct mme_builder *b)
3010 {
3011 if (b->devinfo->cls_eng3d < TURING_A)
3012 return;
3013
3014 nvk_mme_load_to_scratch(b, DRAW_BEGIN);
3015
3016 struct mme_value64 draw_addr = mme_load_addr64(b);
3017 struct mme_value64 draw_count_addr = mme_load_addr64(b);
3018 struct mme_value draw_max = mme_load(b);
3019 struct mme_value stride = mme_load(b);
3020
3021 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
3022 mme_free_reg64(b, draw_count_addr);
3023 struct mme_value draw_count_buf = mme_load(b);
3024
3025 mme_if(b, ule, draw_count_buf, draw_max) {
3026 mme_mov_to(b, draw_max, draw_count_buf);
3027 }
3028 mme_free_reg(b, draw_count_buf);
3029
3030 struct mme_value draw = mme_mov(b, mme_zero());
3031 mme_while(b, ult, draw, draw_max) {
3032 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3033
3034 nvk_mme_build_draw(b, draw);
3035
3036 mme_add_to(b, draw, draw, mme_imm(1));
3037 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3038 }
3039 }
3040
3041 VKAPI_ATTR void VKAPI_CALL
3042 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
3043 VkBuffer _buffer,
3044 VkDeviceSize offset,
3045 VkBuffer countBuffer,
3046 VkDeviceSize countBufferOffset,
3047 uint32_t maxDrawCount,
3048 uint32_t stride)
3049 {
3050 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3051 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3052 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
3053
3054 const struct vk_dynamic_graphics_state *dyn =
3055 &cmd->vk.dynamic_graphics_state;
3056
3057 /* TODO: Indirect count draw pre-Turing */
3058 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
3059
3060 nvk_flush_gfx_state(cmd);
3061
3062 uint32_t begin;
3063 V_NV9097_BEGIN(begin, {
3064 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
3065 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
3066 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
3067 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
3068 });
3069
3070 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
3071 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
3072 P_INLINE_DATA(p, begin);
3073 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3074 P_INLINE_DATA(p, draw_addr >> 32);
3075 P_INLINE_DATA(p, draw_addr);
3076 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
3077 countBufferOffset);
3078 P_INLINE_DATA(p, draw_count_addr >> 32);
3079 P_INLINE_DATA(p, draw_count_addr);
3080 P_INLINE_DATA(p, maxDrawCount);
3081 P_INLINE_DATA(p, stride);
3082 }
3083
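/* Indexed variant of NVK_MME_DRAW_INDIRECT_COUNT.  Identical to the
 * non-indexed macro except that each record is a 5-dword
 * VkDrawIndexedIndirectCommand and the draw is emitted with
 * nvk_mme_build_draw_indexed().
 */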
void
nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   nvk_mme_load_to_scratch(b, DRAW_BEGIN);

   struct mme_value64 draw_addr = mme_load_addr64(b);
   struct mme_value64 draw_count_addr = mme_load_addr64(b);
   struct mme_value draw_max = mme_load(b);
   struct mme_value stride = mme_load(b);

   mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
   mme_free_reg64(b, draw_count_addr);
   struct mme_value draw_count_buf = mme_load(b);

   mme_if(b, ule, draw_count_buf, draw_max) {
      mme_mov_to(b, draw_max, draw_count_buf);
   }
   mme_free_reg(b, draw_count_buf);

   struct mme_value draw = mme_mov(b, mme_zero());
   mme_while(b, ult, draw, draw_max) {
      mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));

      nvk_mme_build_draw_indexed(b, draw);

      mme_add_to(b, draw, draw, mme_imm(1));
      mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
   }
}

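/* Takes the same parameter layout as nvk_CmdDrawIndirectCount but calls the
 * indexed count macro.
 */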
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
                                VkBuffer _buffer,
                                VkDeviceSize offset,
                                VkBuffer countBuffer,
                                VkDeviceSize countBufferOffset,
                                uint32_t maxDrawCount,
                                uint32_t stride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
   VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);

   const struct vk_dynamic_graphics_state *dyn =
      &cmd->vk.dynamic_graphics_state;

   /* TODO: Indexed indirect count draw pre-Turing */
   assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);

   nvk_flush_gfx_state(cmd);

   uint32_t begin;
   V_NV9097_BEGIN(begin, {
      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
   });

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
   P_INLINE_DATA(p, begin);
   uint64_t draw_addr = nvk_buffer_address(buffer, offset);
   P_INLINE_DATA(p, draw_addr >> 32);
   P_INLINE_DATA(p, draw_addr);
   uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
                                                 countBufferOffset);
   P_INLINE_DATA(p, draw_count_addr >> 32);
   P_INLINE_DATA(p, draw_count_addr);
   P_INLINE_DATA(p, maxDrawCount);
   P_INLINE_DATA(p, stride);
}

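/* Emits BEGIN/DRAW_AUTO/END once per instance.  DRAW_AUTO draws from the
 * transform feedback byte count in `counter`; the hardware derives the
 * vertex count from that byte count and the stride programmed via
 * SET_DRAW_AUTO_STRIDE.  After the first instance, BEGIN switches to
 * INSTANCE_ID_SUBSEQUENT so the instance index increments rather than
 * resetting.
 */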
static void
nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
                               struct mme_value instance_count,
                               struct mme_value counter)
{
   struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);

   mme_loop(b, instance_count) {
      mme_mthd(b, NV9097_BEGIN);
      mme_emit(b, begin);

      mme_mthd(b, NV9097_DRAW_AUTO);
      mme_emit(b, counter);

      mme_mthd(b, NV9097_END);
      mme_emit(b, mme_zero());

      mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
   }

   mme_free_reg(b, begin);
}

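/* MME macro for vkCmdDrawIndirectByteCountEXT.  On Turing and later the
 * transform feedback counter is read from memory through the MME FIFO; on
 * older hardware the CPU appends an indirect push so the counter arrives as
 * inline data instead.  The draw loop then runs once when multiview is
 * disabled, or once per bit set in the view mask.
 */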
void
nvk_mme_xfb_draw_indirect(struct mme_builder *b)
{
   nvk_mme_load_to_scratch(b, DRAW_BEGIN);

   struct mme_value instance_count = mme_load(b);
   struct mme_value first_instance = mme_load(b);

   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 counter_addr = mme_load_addr64(b);
      mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
      mme_free_reg(b, counter_addr.lo);
      mme_free_reg(b, counter_addr.hi);
   }
   struct mme_value counter = mme_load(b);

   struct mme_draw_params params = {
      .first_instance = first_instance,
   };
   nvk_mme_build_set_draw_params(b, &params);

   mme_free_reg(b, first_instance);

   struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ieq, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
   }

   view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ine, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      struct mme_value view = mme_mov(b, mme_zero());
      mme_while(b, ine, view, mme_imm(32)) {
         view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
         struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
         mme_free_reg(b, view_mask);
         mme_if(b, ine, has_view, mme_zero()) {
            mme_free_reg(b, has_view);
            nvk_mme_emit_view_index(b, view);
            nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
         }

         mme_add_to(b, view, view, mme_imm(1));
      }
   }

   mme_free_reg(b, instance_count);
   mme_free_reg(b, counter);
}

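/* Parameters for NVK_MME_XFB_DRAW_INDIRECT, in push order:
 *
 *    1. The BEGIN value
 *    2. instanceCount
 *    3. firstInstance
 *    4. The counter: a 64-bit address (high dword, then low dword) on
 *       Turing+, or a single dword pushed indirectly from the counter
 *       buffer on older hardware
 */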
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
                                uint32_t instanceCount,
                                uint32_t firstInstance,
                                VkBuffer counterBuffer,
                                VkDeviceSize counterBufferOffset,
                                uint32_t counterOffset,
                                uint32_t vertexStride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
   const struct vk_dynamic_graphics_state *dyn =
      &cmd->vk.dynamic_graphics_state;

   nvk_flush_gfx_state(cmd);

   uint32_t begin;
   V_NV9097_BEGIN(begin, {
      .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
      .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
      .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
      .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
   });

   uint64_t counter_addr = nvk_buffer_address(counter_buffer,
                                              counterBufferOffset);

   if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
      P_INLINE_DATA(p, begin);
      P_INLINE_DATA(p, instanceCount);
      P_INLINE_DATA(p, firstInstance);
      P_INLINE_DATA(p, counter_addr >> 32);
      P_INLINE_DATA(p, counter_addr);
   } else {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
      P_INLINE_DATA(p, begin);
      P_INLINE_DATA(p, instanceCount);
      P_INLINE_DATA(p, firstInstance);
      nv_push_update_count(p, 1);
      nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
   }
}

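/* Transform feedback buffers map one-to-one onto the hardware stream-out
 * buffer bindings: each binding gets an enable bit, a GPU address, and a
 * 32-bit size.
 */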
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                       uint32_t firstBinding,
                                       uint32_t bindingCount,
                                       const VkBuffer *pBuffers,
                                       const VkDeviceSize *pOffsets,
                                       const VkDeviceSize *pSizes)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   for (uint32_t i = 0; i < bindingCount; i++) {
      VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;
      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
      struct nvk_addr_range addr_range =
         nvk_buffer_addr_range(buffer, pOffsets[i], size);
      assert(addr_range.range <= UINT32_MAX);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);

      P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
      P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
      P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
   }

   // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
}

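/* Restores a stream-out buffer's write pointer from a saved byte count.  The
 * first parameter selects which buffer to load; the counter itself is read
 * through the MME FIFO on Turing+ or arrives as inline data on older
 * hardware.
 */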
void
nvk_mme_xfb_counter_load(struct mme_builder *b)
{
   struct mme_value buffer = mme_load(b);

   struct mme_value counter;
   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 counter_addr = mme_load_addr64(b);

      mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
      mme_free_reg(b, counter_addr.lo);
      mme_free_reg(b, counter_addr.hi);

      counter = mme_load(b);
   } else {
      counter = mme_load(b);
   }

   mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
   mme_emit(b, counter);

   mme_free_reg(b, counter);
   mme_free_reg(b, buffer);
}

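/* Begin transform feedback: enable stream-out, reset all four write pointers
 * to zero, then reload the saved byte count for each provided counter buffer
 * so writes resume where the previous transform feedback section left off.
 */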
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                 uint32_t firstCounterBuffer,
                                 uint32_t counterBufferCount,
                                 const VkBuffer *pCounterBuffers,
                                 const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   const uint32_t max_buffers = 4;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
   for (uint32_t i = 0; i < max_buffers; ++i) {
      P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
   }

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      // The counter buffer index corresponds to the transform feedback
      // buffer index.
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      if (nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A) {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an
          * 8-dword stride.
          */
         P_INLINE_DATA(p, cb_idx * 8);
         P_INLINE_DATA(p, cb_addr >> 32);
         P_INLINE_DATA(p, cb_addr);
      } else {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         P_INLINE_DATA(p, cb_idx);
         nv_push_update_count(p, 1);
         nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
      }
   }
}

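/* End transform feedback: disable stream-out and, for each counter buffer,
 * write back the current streaming byte count with a REPORT_SEMAPHORE so a
 * later vkCmdBeginTransformFeedbackEXT or vkCmdDrawIndirectByteCountEXT can
 * pick it up.
 */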
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                               uint32_t firstCounterBuffer,
                               uint32_t counterBufferCount,
                               const VkBuffer *pCounterBuffers,
                               const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      // The counter buffer index corresponds to the transform feedback
      // buffer index.
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_STREAMING_BYTE_COUNT,
         .sub_report = cb_idx,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}

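/* SET_RENDER_ENABLE points the hardware at a GPU address holding the
 * condition value.  If the application's address is not 64-byte aligned, or
 * the buffer lives in local (VRAM) memory, the dword is first copied with
 * the DMA engine into a scratch allocation owned by the command buffer and
 * the condition is taken from there.  The condition is programmed for both
 * the 3D (9097) and compute (90C0) classes so it also applies to dispatches.
 */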
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);

   uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
   bool inverted = pConditionalRenderingBegin->flags &
                   VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   if (addr & 0x3f || buffer->is_local) {
      uint64_t tmp_addr;
      VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
      if (result != VK_SUCCESS) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
      P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
      P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
      P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
      P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
      P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
      P_NV90B5_PITCH_IN(p, 4);
      P_NV90B5_PITCH_OUT(p, 4);
      P_NV90B5_LINE_LENGTH_IN(p, 4);
      P_NV90B5_LINE_COUNT(p, 1);

      P_IMMD(p, NV90B5, LAUNCH_DMA, {
         .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
         .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
         .flush_enable = FLUSH_ENABLE_TRUE,
         .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
         .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
      });
      addr = tmp_addr;
   }

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, addr >> 32);
   P_NV9097_SET_RENDER_ENABLE_B(p, addr & 0xfffffff0);
   P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, addr >> 32);
   P_NV90C0_SET_RENDER_ENABLE_B(p, addr & 0xfffffff0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
}

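/* Unconditionally re-enable rendering for both the 3D and compute classes. */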
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, 0);
   P_NV9097_SET_RENDER_ENABLE_B(p, 0);
   P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
}
