1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15
16 #include "util/bitpack_helpers.h"
17 #include "vk_format.h"
18 #include "vk_render_pass.h"
19 #include "vk_standard_sample_locations.h"
20
21 #include "nv_push_cl902d.h"
22 #include "nv_push_cl9097.h"
23 #include "nv_push_cl90b5.h"
24 #include "nv_push_cl90c0.h"
25 #include "nv_push_cla097.h"
26 #include "nv_push_clb097.h"
27 #include "nv_push_clb197.h"
28 #include "nv_push_clc397.h"
29 #include "nv_push_clc597.h"
30 #include "drf.h"
31
32 static inline uint16_t
33 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
34 {
35 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
36 struct nvk_physical_device *pdev = nvk_device_physical(dev);
37 return pdev->info.cls_eng3d;
38 }
39
40 static void
41 mme_set_priv_reg(struct mme_builder *b,
42 struct mme_value value,
43 struct mme_value mask,
44 struct mme_value reg)
45 {
46 mme_mthd(b, NV9097_WAIT_FOR_IDLE);
47 mme_emit(b, mme_zero());
48
49 mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
50 mme_emit(b, mme_zero());
51 mme_emit(b, value);
52 mme_emit(b, mask);
53
54 mme_mthd(b, NV9097_SET_FALCON04);
55 mme_emit(b, reg);
56
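/* A descriptive note on the wait loop below: the firmware signals completion
 * by setting the FALCON_0 scratch register to 1, so poll that scratch
 * register (emitting NOPs in between) until it flips.
 */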
57 struct mme_value loop_cond = mme_mov(b, mme_zero());
58 mme_while(b, ine, loop_cond, mme_imm(1)) {
59 mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
60 mme_mthd(b, NV9097_NO_OPERATION);
61 mme_emit(b, mme_zero());
62 };
63 }
64
65 void
66 nvk_mme_set_priv_reg(struct mme_builder *b)
67 {
68 struct mme_value value = mme_load(b);
69 struct mme_value mask = mme_load(b);
70 struct mme_value reg = mme_load(b);
71
72 mme_set_priv_reg(b, value, mask, reg);
73 }
74
75 void
76 nvk_mme_set_conservative_raster_state(struct mme_builder *b)
77 {
78 struct mme_value new_state = mme_load(b);
79 struct mme_value old_state =
80 nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);
81
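/* Priv reg writes require a WAIT_FOR_IDLE and a firmware round-trip, so
 * only touch the register when the requested state actually changes.
 */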
82 mme_if(b, ine, new_state, old_state) {
83 nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
84 mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
85 mme_imm(0x418800));
86 }
87 }
88
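/* cb0 is the internal constant buffer that holds the root descriptor table
 * for graphics.
 */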
89 #define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)
90
91 void
92 nvk_mme_select_cb0(struct mme_builder *b)
93 {
94 struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
95 struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);
96
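/* SET_CONSTANT_BUFFER_SELECTOR_A..C take the size and GPU address of the
 * constant buffer that subsequent LOAD_CONSTANT_BUFFER* methods write to.
 */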
97 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
98 mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
99 mme_emit(b, addr_hi);
100 mme_emit(b, addr_lo);
101 }
102
103 static uint32_t nvk_mme_anti_alias_init(void);
104
105 VkResult
106 nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
107 {
108 struct nvk_device *dev = nvk_queue_device(queue);
109 struct nvk_physical_device *pdev = nvk_device_physical(dev);
110
111 /* 3D state */
112 P_MTHD(p, NV9097, SET_OBJECT);
113 P_NV9097_SET_OBJECT(p, {
114 .class_id = pdev->info.cls_eng3d,
115 .engine_id = 0,
116 });
117
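/* Upload each MME macro to macro instruction RAM and record its start
 * offset so that CALL_MME_MACRO(i) dispatches to the right code.
 */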
118 for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
119 size_t size;
120 uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
121 if (dw == NULL)
122 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
123
124 assert(size % sizeof(uint32_t) == 0);
125 const uint32_t num_dw = size / sizeof(uint32_t);
126
127 P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
128 P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
129 P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
130
131 P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
132 P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
133 P_INLINE_ARRAY(p, dw, num_dw);
134
135 mme_pos += num_dw;
136
137 free(dw);
138 }
139
140 if (pdev->info.cls_eng3d >= TURING_A)
141 P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
142
143 /* Enable FP helper invocation memory loads
144 *
145 * For generations whose firmware supports our `SET_PRIV_REG` MME macro,
146 * we simply use that. On older generations we'll let the kernel do it.
147 * Starting with GSP we have to do it via the firmware anyway.
148 *
149 * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl
150 *
151 * Without it,
152 * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
153 * occasionally fail.
154 */
155 if (pdev->info.cls_eng3d >= MAXWELL_B) {
156 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
157 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
158 P_INLINE_DATA(p, 0);
159 P_INLINE_DATA(p, BITFIELD_BIT(3));
160 P_INLINE_DATA(p, reg);
161 }
162
163 /* Disable Out Of Range Address exceptions
164 *
165 * From the SPH documentation:
166 *
167 * "The SPH fields StoreReqStart and StoreReqEnd set a range of
168 * attributes whose corresponding Odmap values of ST or ST_LAST are
169 * treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
170 * and Odmap value is ST, when the shader writes data to this output, it
171 * can not count on being able to read it back, since the next
172 * downstream shader might have its Imap bit FALSE, thereby causing the
173 * Bmap bit to be FALSE. By including a ST type of attribute in the
174 * range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
175 * is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
176 * to be TRUE. This guarantees the shader program can output the value
177 * and then read it back later. This will save register space."
178 *
179 * It's unclear exactly what's going on but this seems to imply that the
180 * hardware actually ANDs the output mask of one shader stage together with
181 * the input mask of the subsequent shader stage to determine which values
182 * are actually used.
183 *
184 * When we have an empty fragment shader, it seems the hardware doesn't
185 * allocate any output memory for the final geometry stage at all, so any
186 * writes to outputs from the final shader stage generate an Out Of Range
187 * Address exception. We could fix this by eliminating unused
188 * outputs via cross-stage linking but that won't work in the case of
189 * VK_EXT_shader_object and VK_EXT_graphics_pipeline_library fast-link.
190 * Instead, the easiest solution is to just disable the exception.
191 *
192 * NOTE (Faith):
193 *
194 * The above analysis is 100% conjecture on my part based on a creative
195 * reading of the SPH docs and what I saw when trying to run certain
196 * OpenGL CTS tests on NVK + Zink. Without access to NVIDIA HW
197 * engineers, I have no way of verifying this analysis.
198 *
199 * The CTS test in question is:
200 *
201 * KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
202 *
203 * This should also prevent any issues with array overruns on I/O arrays.
204 * Before, they would get an exception and kill the context whereas now
205 * they should gently get ignored.
206 *
207 * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask
208 */
209 if (pdev->info.cls_eng3d >= MAXWELL_B) {
210 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
211 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
212 P_INLINE_DATA(p, 0);
213 P_INLINE_DATA(p, BITFIELD_BIT(14));
214 P_INLINE_DATA(p, reg);
215 }
216
217 /* Set CONSERVATIVE_RASTER_STATE to an invalid value, to ensure the
218 * hardware reg is always set the first time conservative rasterization
219 * is enabled */
220 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
221 ~0);
222
223 /* Initialize tessellation parameters */
224 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
225 P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});
226
227 P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
228
229 P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
230 P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
231 for (unsigned i = 0; i < 8; i++)
232 P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
233
234 P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
235
236 // P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
237 // P_INLINE_DATA(cmd->push, 0);
238
239 P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
240
241 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
242
243 P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
244 P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
245 P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
246 P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
247
248 P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
249
250 P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
251
252 P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
253
254 P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
255 DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
256
257 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
258 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
259 .all_covered_all_hit_once = 0xff,
260 });
261 P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
262 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
263 .all_covered_all_hit_once = 0xff,
264 });
265 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
266 .all_covered_all_hit_once = 0xff,
267 });
268 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
269 .all_covered_all_hit_once = 0x3f,
270 });
271 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
272 .all_covered_all_hit_once = 0xff,
273 });
274 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
275 .all_covered_all_hit_once = 0xff,
276 });
277
278 if (pdev->info.cls_eng3d < VOLTA_A)
279 P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
280
281 P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
282 .current = 3,
283 .oldest_supported = 3,
284 });
285 P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
286 .current = 2,
287 .oldest_supported = 2,
288 });
289
290 if (pdev->info.cls_eng3d < MAXWELL_A)
291 P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
292
293 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
294 POLICY_EVICT_NORMAL);
295 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
296 POLICY_EVICT_NORMAL);
297 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
298 POLICY_EVICT_NORMAL);
299 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
300 POLICY_EVICT_NORMAL);
301 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
302 POLICY_EVICT_NORMAL);
303
304 P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
305
306 P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
307 .color_front_diffuse = COLOR_FRONT_DIFFUSE_VECTOR_0001,
308 .color_front_specular = COLOR_FRONT_SPECULAR_VECTOR_0001,
309 .generic_vector = GENERIC_VECTOR_VECTOR_0001,
310 .fixed_fnc_texture = FIXED_FNC_TEXTURE_VECTOR_0001,
311 .dx9_color0 = DX9_COLOR0_VECTOR_0001,
312 .dx9_color1_to_color15 = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
313 });
314
315 P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
316
317 P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
318 CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
319
320 P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
321 .enable = ENABLE_TRUE,
322 .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
323 });
324
325 if (pdev->info.cls_eng3d < VOLTA_A)
326 P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
327
328 P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
329 P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
330 P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
331
332 if (pdev->info.cls_eng3d < MAXWELL_A)
333 P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
334
335 if (pdev->info.cls_eng3d >= KEPLER_A &&
336 pdev->info.cls_eng3d < MAXWELL_A) {
337 P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
338 ORDERING_KEPLER_ORDER);
339 }
340
341 P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
342 P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
343 P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
344 P_IMMD(p, NV9097, SET_PS_SATURATE, {
345 .output0 = OUTPUT0_FALSE,
346 .output1 = OUTPUT1_FALSE,
347 .output2 = OUTPUT2_FALSE,
348 .output3 = OUTPUT3_FALSE,
349 .output4 = OUTPUT4_FALSE,
350 .output5 = OUTPUT5_FALSE,
351 .output6 = OUTPUT6_FALSE,
352 .output7 = OUTPUT7_FALSE,
353 });
354
355 P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
356 P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });
357
358 /* From the Vulkan spec's description of point rasterization:
359 * "Point rasterization produces a fragment for each fragment area group of
360 * framebuffer pixels with one or more sample points that intersect a region
361 * centered at the point’s (xf,yf).
362 * This region is a square with side equal to the current point size.
363 * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
364 * for the point"
365 *
366 * So it seems we always need square points with PointCoords like OpenGL
367 * point sprites.
368 *
369 * From OpenGL compatibility spec:
370 * Basic point rasterization:
371 * "If point sprites are enabled, then point rasterization produces a
372 * fragment for each framebuffer pixel whose center lies inside a square
373 * centered at the point’s (xw, yw), with side length equal to the current
374 * point size.
375 * ... and xw and yw are the exact, unrounded window coordinates of the
376 * vertex for the point"
377 *
378 * And Point multisample rasterization:
379 * "This region is a circle having diameter equal to the current point width
380 * if POINT_SPRITE is disabled, or a square with side equal to the current
381 * point width if POINT_SPRITE is enabled."
382 */
383 P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
384 P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
385 .rmode = RMODE_ZERO,
386 .origin = ORIGIN_TOP,
387 .texture0 = TEXTURE0_PASSTHROUGH,
388 .texture1 = TEXTURE1_PASSTHROUGH,
389 .texture2 = TEXTURE2_PASSTHROUGH,
390 .texture3 = TEXTURE3_PASSTHROUGH,
391 .texture4 = TEXTURE4_PASSTHROUGH,
392 .texture5 = TEXTURE5_PASSTHROUGH,
393 .texture6 = TEXTURE6_PASSTHROUGH,
394 .texture7 = TEXTURE7_PASSTHROUGH,
395 .texture8 = TEXTURE8_PASSTHROUGH,
396 .texture9 = TEXTURE9_PASSTHROUGH,
397 });
398
399 /* OpenGL's GL_POINT_SMOOTH */
400 P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
401
402 if (pdev->info.cls_eng3d >= MAXWELL_B)
403 P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
404
405 P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
406
407 P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
408
409 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SHADING_RATE_CONTROL), 0);
410 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
411 nvk_mme_anti_alias_init());
412
413 /* Enable multisample rasterization even for single-sample rasterization;
414 * this way we get strict lines and rectangular line support.
415 * For more info, see the DirectX rasterization rules.
416 */
417 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
418
419 if (pdev->info.cls_eng3d >= MAXWELL_B) {
420 P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
421 P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
422 BY_VIEWPORT_INDEX_FALSE);
423 }
424
425 /* TODO: Vertex runout */
426
427 P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
428 .mode = MODE_UPPER_LEFT,
429 .flip_y = FLIP_Y_FALSE,
430 });
431
432 P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
433 P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
434 P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
435
436 P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
437 P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
438 P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
439
440 // P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
441 // .respect_stencil_mask = RESPECT_STENCIL_MASK_FALSE,
442 // .use_clear_rect = USE_CLEAR_RECT_FALSE,
443 // });
444
445 P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
446
447 P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
448 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
449 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
450 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
451 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
452 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
453 .geometry_clip = GEOMETRY_CLIP_WZERO_CLIP,
454 .geometry_guardband_z = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
455 });
456
457 for (unsigned i = 0; i < 16; i++)
458 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
459
460 P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
461
462 if (pdev->info.cls_eng3d >= TURING_A) {
463 /* I don't know what these values actually mean. I just copied them
464 * from the way the blob sets up the hardware.
465 */
466 P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(0));
467 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 0, 0xa23eb139);
468 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 1, 0xfb72ea61);
469 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 2, 0xd950c843);
470 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 3, 0x88fac4e5);
471 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 4, 0x1ab3e1b6);
472 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 5, 0xa98fedc2);
473 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 6, 0x2107654b);
474 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 7, 0xe0539773);
475 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 8, 0x698badcf);
476 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 9, 0x71032547);
477 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 10, 0xdef05397);
478 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 11, 0x56789abc);
479 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 12, 0x1234);
480 }
481
482 if (pdev->info.cls_eng3d < VOLTA_A) {
483 uint64_t shader_base_addr =
484 nvk_heap_contiguous_base_address(&dev->shader_heap);
485
486 P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
487 P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
488 P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
489 }
490
491 for (uint32_t group = 0; group < 5; group++) {
492 for (uint32_t slot = 0; slot < 16; slot++) {
493 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
494 .valid = VALID_FALSE,
495 .shader_slot = slot,
496 });
497 }
498 }
499
500 // P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
501 // P_INLINE_DATA(cmd->push, 0x40);
502 P_IMMD(p, NV9097, SET_RT_LAYER, {
503 .v = 0,
504 .control = CONTROL_V_SELECTS_LAYER,
505 });
506 // P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT;
507 // P_INLINE_DATA(cmd->push, 0x30);
508
509 P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
510 P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
511 P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
512
513 uint64_t zero_addr = dev->zero_page->va->addr;
514 P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
515 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
516 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
517
518 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
519 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
520 for (uint32_t b = 0; b < 32; b++) {
521 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
522 .enable = false,
523 });
524 }
525
526 if (pdev->info.cls_eng3d >= FERMI_A &&
527 pdev->info.cls_eng3d < MAXWELL_A) {
528 assert(dev->vab_memory);
529 uint64_t vab_addr = dev->vab_memory->va->addr;
530 P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
531 P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
532 P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
533 P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
534 }
535
536 if (pdev->info.cls_eng3d == MAXWELL_A)
537 P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
538
539 /* Store the address of CB0 in a pair of MME shadow scratch registers */
540 uint64_t cb0_addr = queue->draw_cb0->va->addr;
541 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
542 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
543 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);
544
545 /* Store the address of the zero page in a pair of MME shadow scratch registers */
546 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
547 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
548 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);
549
550 /* We leave CB0 selected by default */
551 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
552 P_INLINE_DATA(p, 0);
553
554 /* Bind CB0 to all shader groups */
555 for (uint32_t group = 0; group < 5; group++) {
556 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
557 .valid = VALID_TRUE,
558 .shader_slot = 0,
559 });
560 }
561
562 /* Zero out CB0 */
563 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
564 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
565 for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
566 P_INLINE_DATA(p, 0);
567
568 /* These are shadowed in cb0 so they need to be zeroed as well for
569 * consistency.
570 */
571 P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
572 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
573 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
574 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
575 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);
576
577 return VK_SUCCESS;
578 }
579
580 static void
581 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
582 {
583 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
584
585 /* These depend on color attachment count */
586 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
587 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
588 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
589 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
590
591 /* These depend on the depth/stencil format */
592 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
593 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
594 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
595 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
596
597 /* This may depend on render targets for ESO */
598 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
599
600 /* This may depend on render targets */
601 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
602
603 /* Might be required for depthClampZeroOne */
604 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE);
605 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE);
606 }
607
608 static void
609 nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
610 struct nvk_descriptor_state *desc,
611 size_t offset, size_t size)
612 {
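/* cb0 is updated with inline LOAD_CONSTANT_BUFFER data, which is
 * dword-granular, so round the dirty byte range out to whole dwords.
 */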
613 const uint32_t start_dw = offset / 4;
614 const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
615 const uint32_t len_dw = end_dw - start_dw;
616
617 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
618 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
619 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);
620
621 const uint32_t *root_dw = (uint32_t *)desc->root;
622 P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
623 }
624
625 void
626 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
627 const VkCommandBufferBeginInfo *pBeginInfo)
628 {
629 if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
630 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
631 P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
632 P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
633 .lines = LINES_ALL,
634 });
635 P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
636 .lines = LINES_ALL,
637 });
638
639 P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
640 .constant = CONSTANT_TRUE,
641 });
642 }
643
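/* Root descriptors for graphics are flushed to cb0 with inline
 * LOAD_CONSTANT_BUFFER data (see nvk_cmd_flush_gfx_root_desc).
 */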
644 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
645
646 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
647 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
648 char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
649 const VkRenderingInfo *resume_info =
650 vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
651 pBeginInfo,
652 gcbiar_data);
653 if (resume_info) {
654 nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
655 } else {
656 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
657 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
658 pBeginInfo);
659 assert(inheritance_info);
660
661 struct nvk_rendering_state *render = &cmd->state.gfx.render;
662 render->flags = inheritance_info->flags;
663 render->area = (VkRect2D) { };
664 render->layer_count = 0;
665 render->view_mask = inheritance_info->viewMask;
666 render->samples = inheritance_info->rasterizationSamples;
667
668 render->color_att_count = inheritance_info->colorAttachmentCount;
669 for (uint32_t i = 0; i < render->color_att_count; i++) {
670 render->color_att[i].vk_format =
671 inheritance_info->pColorAttachmentFormats[i];
672 }
673 render->depth_att.vk_format =
674 inheritance_info->depthAttachmentFormat;
675 render->stencil_att.vk_format =
676 inheritance_info->stencilAttachmentFormat;
677
678 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
679 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
680 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
681 };
682 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
683 vk_get_command_buffer_rendering_attachment_location_info(
684 cmd->vk.level, pBeginInfo);
685 if (att_loc_info == NULL)
686 att_loc_info = &att_loc_info_default;
687
688 vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
689
690 nvk_cmd_buffer_dirty_render_pass(cmd);
691 }
692 }
693
694 cmd->state.gfx.shaders_dirty = ~0;
695 }
696
697 void
698 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
699 {
700 vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
701
702 /* From the Vulkan 1.3.275 spec:
703 *
704 * "...There is one exception to this rule - if the primary command
705 * buffer is inside a render pass instance, then the render pass and
706 * subpass state is not disturbed by executing secondary command
707 * buffers."
708 *
709 * We need to reset everything EXCEPT the render pass state.
710 */
711 struct nvk_rendering_state render_save = cmd->state.gfx.render;
712 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
713 cmd->state.gfx.render = render_save;
714
715 /* We need to keep the flush_root callback */
716 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
717
718 cmd->state.gfx.shaders_dirty = ~0;
719 }
720
721 static void
722 nvk_attachment_init(struct nvk_attachment *att,
723 const VkRenderingAttachmentInfo *info)
724 {
725 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
726 *att = (struct nvk_attachment) { .iview = NULL, };
727 return;
728 }
729
730 VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
731 *att = (struct nvk_attachment) {
732 .vk_format = iview->vk.format,
733 .iview = iview,
734 };
735
736 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
737 VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
738 att->resolve_mode = info->resolveMode;
739 att->resolve_iview = res_iview;
740 }
741
742 att->store_op = info->storeOp;
743 }
744
745 static uint32_t
746 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
747 {
748 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
749 uint16_t nil_to_nv9097[] = {
750 MODE(1X1),
751 MODE(2X1),
752 MODE(2X1_D3D),
753 MODE(2X2),
754 MODE(4X2),
755 MODE(4X2_D3D),
756 MODE(4X4),
757 };
758 #undef MODE
759 assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
760 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1 ||
761 nil_to_nv9097[sample_layout] != 0);
762
763 return nil_to_nv9097[sample_layout];
764 }
765
766 static uint32_t nvk_mme_anti_alias_samples(uint32_t samples);
767
768 static void
769 nvk_cmd_set_sample_layout(struct nvk_cmd_buffer *cmd,
770 enum nil_sample_layout sample_layout)
771 {
772 const uint32_t samples = nil_sample_layout_samples(sample_layout);
773 struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);
774
775 P_IMMD(p, NV9097, SET_ANTI_ALIAS,
776 nil_to_nv9097_samples_mode(sample_layout));
777
778 switch (sample_layout) {
779 case NIL_SAMPLE_LAYOUT_1X1:
780 case NIL_SAMPLE_LAYOUT_2X1:
781 case NIL_SAMPLE_LAYOUT_2X1_D3D:
782 /* These only have two modes: Single-pass or per-sample */
783 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
784 P_INLINE_DATA(p, 0);
785 P_INLINE_DATA(p, 0);
786 P_INLINE_DATA(p, 0);
787 P_INLINE_DATA(p, 0);
788 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
789 P_INLINE_DATA(p, 0);
790 P_INLINE_DATA(p, 0);
791 P_INLINE_DATA(p, 0);
792 P_INLINE_DATA(p, 0);
793 break;
794
795 case NIL_SAMPLE_LAYOUT_2X2:
796 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
797 P_INLINE_DATA(p, 0x000a0005);
798 P_INLINE_DATA(p, 0x000a0005);
799 P_INLINE_DATA(p, 0);
800 P_INLINE_DATA(p, 0);
801 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
802 P_INLINE_DATA(p, 0);
803 P_INLINE_DATA(p, 0);
804 P_INLINE_DATA(p, 0);
805 P_INLINE_DATA(p, 0);
806 break;
807
808 case NIL_SAMPLE_LAYOUT_4X2:
809 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
810 P_INLINE_DATA(p, 0x000f000f);
811 P_INLINE_DATA(p, 0x000f000f);
812 P_INLINE_DATA(p, 0x00f000f0);
813 P_INLINE_DATA(p, 0x00f000f0);
814 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
815 P_INLINE_DATA(p, 0x00030003);
816 P_INLINE_DATA(p, 0x000c000c);
817 P_INLINE_DATA(p, 0x00300030);
818 P_INLINE_DATA(p, 0x00c000c0);
819 break;
820
821 case NIL_SAMPLE_LAYOUT_4X2_D3D:
822 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
823 P_INLINE_DATA(p, 0x003a00c5);
824 P_INLINE_DATA(p, 0x003a00c5);
825 P_INLINE_DATA(p, 0x003a003a);
826 P_INLINE_DATA(p, 0x00c500c5);
827 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
828 P_INLINE_DATA(p, 0x00120081);
829 P_INLINE_DATA(p, 0x00280044);
830 P_INLINE_DATA(p, 0x00280012);
831 P_INLINE_DATA(p, 0x00810044);
832 break;
833
834 default:
835 unreachable("Unknown sample layout");
836 }
837
838 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
839 P_INLINE_DATA(p, nvk_mme_anti_alias_samples(samples));
840 }
841
842 VKAPI_ATTR void VKAPI_CALL
843 nvk_GetRenderingAreaGranularityKHR(
844 VkDevice device,
845 const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
846 VkExtent2D *pGranularity)
847 {
848 *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
849 }
850
851 static bool
852 nvk_rendering_all_linear(const struct nvk_rendering_state *render)
853 {
854 /* Depth and stencil are never linear */
855 if (render->depth_att.iview || render->stencil_att.iview)
856 return false;
857
858 for (uint32_t i = 0; i < render->color_att_count; i++) {
859 const struct nvk_image_view *iview = render->color_att[i].iview;
860 if (iview == NULL)
861 continue;
862
863 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
864 const uint8_t ip = iview->planes[0].image_plane;
865 const struct nil_image_level *level =
866 &image->planes[ip].nil.levels[iview->vk.base_mip_level];
867
868 if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR)
869 return false;
870 }
871
872 return true;
873 }
874
875 VKAPI_ATTR void VKAPI_CALL
876 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
877 const VkRenderingInfo *pRenderingInfo)
878 {
879 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
880 struct nvk_rendering_state *render = &cmd->state.gfx.render;
881
882 memset(render, 0, sizeof(*render));
883
884 render->flags = pRenderingInfo->flags;
885 render->area = pRenderingInfo->renderArea;
886 render->view_mask = pRenderingInfo->viewMask;
887 render->layer_count = pRenderingInfo->layerCount;
888 render->samples = 0;
889
890 const uint32_t layer_count =
891 render->view_mask ? util_last_bit(render->view_mask) :
892 render->layer_count;
893
894 render->color_att_count = pRenderingInfo->colorAttachmentCount;
895 for (uint32_t i = 0; i < render->color_att_count; i++) {
896 nvk_attachment_init(&render->color_att[i],
897 &pRenderingInfo->pColorAttachments[i]);
898 }
899
900 nvk_attachment_init(&render->depth_att,
901 pRenderingInfo->pDepthAttachment);
902 nvk_attachment_init(&render->stencil_att,
903 pRenderingInfo->pStencilAttachment);
904
905 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att_info =
906 vk_find_struct_const(pRenderingInfo->pNext,
907 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
908 if (fsr_att_info != NULL && fsr_att_info->imageView != VK_NULL_HANDLE) {
909 VK_FROM_HANDLE(nvk_image_view, iview, fsr_att_info->imageView);
910 render->fsr_att = (struct nvk_attachment) {
911 .vk_format = iview->vk.format,
912 .iview = iview,
913 .store_op = VK_ATTACHMENT_STORE_OP_NONE,
914 };
915 }
916
917 render->all_linear = nvk_rendering_all_linear(render);
918
919 const VkRenderingAttachmentLocationInfoKHR ral_info = {
920 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
921 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
922 };
923 vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
924
925 nvk_cmd_buffer_dirty_render_pass(cmd);
926
927 struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 34);
928
929 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
930 render->view_mask);
931
932 P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
933 P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
934 .x = render->area.offset.x,
935 .width = render->area.extent.width,
936 });
937 P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
938 .y = render->area.offset.y,
939 .height = render->area.extent.height,
940 });
941
942 enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
943
944 /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
945 * of the number of targets in the render pass. This ensures that we have
946 * no leftover pointers from previous render passes in the hardware. This
947 * also allows us to point at any render target with SET_CT_SELECT and know
948 * that it's either a valid render target or NULL.
949 */
950 for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
951 if (render->color_att[i].iview) {
952 const struct nvk_image_view *iview = render->color_att[i].iview;
953 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
954 /* Rendering to multi-planar images is only valid for a single,
955 * specific plane, so assert that the view has a single plane, obtain
956 * its index, and begin rendering.
957 */
958 assert(iview->plane_count == 1);
959 const uint8_t ip = iview->planes[0].image_plane;
960 const struct nvk_image_plane *plane = &image->planes[ip];
961
962 if (!render->all_linear &&
963 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR)
964 plane = &image->linear_tiled_shadow;
965
966 const struct nil_image *nil_image = &plane->nil;
967 const struct nil_image_level *level =
968 &nil_image->levels[iview->vk.base_mip_level];
969 struct nil_Extent4D_Samples level_extent_sa =
970 nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);
971
972 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
973 sample_layout == nil_image->sample_layout);
974 sample_layout = nil_image->sample_layout;
975 render->samples = image->vk.samples;
976
977 uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;
978
979 if (nil_image->dim == NIL_IMAGE_DIM_3D) {
980 addr += nil_image_level_z_offset_B(nil_image,
981 iview->vk.base_mip_level,
982 iview->vk.base_array_layer);
983 assert(layer_count <= iview->vk.extent.depth);
984 } else {
985 addr += iview->vk.base_array_layer *
986 (uint64_t)nil_image->array_stride_B;
987 assert(layer_count <= iview->vk.layer_count);
988 }
989
990 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
991 P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
992 P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
993
994 if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR) {
995 const enum pipe_format p_format =
996 nvk_format_to_pipe_format(iview->vk.format);
997
998 /* We use the stride for depth/stencil targets because the Z/S
999 * hardware has no concept of a tile width. Instead, we just set
1000 * the width to the stride divided by bpp.
1001 */
1002 const uint32_t row_stride_el =
1003 level->row_stride_B / util_format_get_blocksize(p_format);
1004 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
1005 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1006 const uint8_t ct_format = nil_format_to_color_target(p_format);
1007 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1008
1009 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1010 .block_width = BLOCK_WIDTH_ONE_GOB,
1011 .block_height = level->tiling.y_log2,
1012 .block_depth = level->tiling.z_log2,
1013 .layout = LAYOUT_BLOCKLINEAR,
1014 .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
1015 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
1016 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1017 });
1018
1019 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1020 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
1021 nil_image->array_stride_B >> 2);
1022 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1023 } else {
1024 /* NVIDIA can only render to 2D linear images */
1025 assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1026 /* NVIDIA can only render to non-multisampled images */
1027 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1028 /* NVIDIA doesn't support linear array images */
1029 assert(iview->vk.base_array_layer == 0 && layer_count == 1);
1030
1031 uint32_t pitch = level->row_stride_B;
1032 const enum pipe_format p_format =
1033 nvk_format_to_pipe_format(iview->vk.format);
1034 /* When the memory layout is set to LAYOUT_PITCH, the WIDTH field
1035 * takes the row pitch in bytes.
1036 */
1037 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
1038 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1039
1040 const uint8_t ct_format = nil_format_to_color_target(p_format);
1041 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1042
1043 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1044 .layout = LAYOUT_PITCH,
1045 .third_dimension_control =
1046 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1047 });
1048
1049 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
1050 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1051 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1052 }
1053
1054 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
1055 } else {
1056 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
1057 P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
1058 P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
1059 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
1060 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
1061 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
1062 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1063 .layout = LAYOUT_BLOCKLINEAR,
1064 });
1065 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1066 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1067 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1068
1069 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
1070 }
1071 }
1072
1073 if (render->depth_att.iview || render->stencil_att.iview) {
1074 struct nvk_image_view *iview = render->depth_att.iview ?
1075 render->depth_att.iview :
1076 render->stencil_att.iview;
1077 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1078 /* Depth/stencil are always single-plane */
1079 assert(iview->plane_count == 1);
1080 const uint8_t ip = iview->planes[0].image_plane;
1081 struct nil_image nil_image = image->planes[ip].nil;
1082
1083 uint64_t addr = nvk_image_base_address(image, ip);
1084 uint32_t mip_level = iview->vk.base_mip_level;
1085 uint32_t base_array_layer = iview->vk.base_array_layer;
1086
1087 if (nil_image.dim == NIL_IMAGE_DIM_3D) {
1088 uint64_t level_offset_B;
1089 nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
1090 &level_offset_B);
1091 addr += level_offset_B;
1092 mip_level = 0;
1093 base_array_layer = 0;
1094 assert(layer_count <= iview->vk.extent.depth);
1095 } else {
1096 assert(layer_count <= iview->vk.layer_count);
1097 }
1098
1099 const struct nil_image_level *level = &nil_image.levels[mip_level];
1100 addr += level->offset_B;
1101
1102 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
1103 sample_layout == nil_image.sample_layout);
1104 sample_layout = nil_image.sample_layout;
1105 render->samples = image->vk.samples;
1106
1107 P_MTHD(p, NV9097, SET_ZT_A);
1108 P_NV9097_SET_ZT_A(p, addr >> 32);
1109 P_NV9097_SET_ZT_B(p, addr);
1110 const enum pipe_format p_format =
1111 nvk_format_to_pipe_format(iview->vk.format);
1112 const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
1113 P_NV9097_SET_ZT_FORMAT(p, zs_format);
1114 assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1115 assert(level->tiling.z_log2 == 0);
1116 P_NV9097_SET_ZT_BLOCK_SIZE(p, {
1117 .width = WIDTH_ONE_GOB,
1118 .height = level->tiling.y_log2,
1119 .depth = DEPTH_ONE_GOB,
1120 });
1121 P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
1122
1123 P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
1124
1125 struct nil_Extent4D_Samples level_extent_sa =
1126 nil_image_level_extent_sa(&nil_image, mip_level);
1127
1128 /* We use the stride for depth/stencil targets because the Z/S hardware
1129 * has no concept of a tile width. Instead, we just set the width to
1130 * the stride divided by bpp.
1131 */
1132 const uint32_t row_stride_el =
1133 level->row_stride_B / util_format_get_blocksize(p_format);
1134
1135 P_MTHD(p, NV9097, SET_ZT_SIZE_A);
1136 P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
1137 P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
1138 P_NV9097_SET_ZT_SIZE_C(p, {
1139 .third_dimension = base_array_layer + layer_count,
1140 .control = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1141 });
1142
1143 P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
1144
1145 P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);
1146
1147 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1148 P_IMMD(p, NVC597, SET_ZT_SPARSE, {
1149 .enable = ENABLE_FALSE,
1150 });
1151 }
1152 } else {
1153 P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
1154 }
1155
1156 if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
1157 assert(render->fsr_att.iview == NULL);
1158 } else if (render->fsr_att.iview != NULL) {
1159 const struct nvk_image_view *iview = render->fsr_att.iview;
1160 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1161
1162 /* Fragment shading rate images are always single-plane */
1163 assert(iview->plane_count == 1);
1164 const uint8_t ip = iview->planes[0].image_plane;
1165 const struct nil_image *nil_image = &image->planes[ip].nil;
1166
1167 /* Fragment shading rate images are always 2D */
1168 assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1169 assert(nil_image->sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1170
1171 uint64_t addr = nvk_image_base_address(image, ip);
1172 uint32_t mip_level = iview->vk.base_mip_level;
1173 struct nil_Extent4D_Samples level_extent_sa =
1174 nil_image_level_extent_sa(nil_image, mip_level);
1175
1176 const struct nil_image_level *level = &nil_image->levels[mip_level];
1177 addr += level->offset_B;
1178
1179 P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1180 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, addr >> 32);
1181 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, addr);
1182 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, {
1183 .width = level_extent_sa.width,
1184 .height = level_extent_sa.height,
1185 });
1186 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0,
1187 iview->vk.layer_count + iview->vk.base_array_layer);
1188 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0,
1189 iview->vk.base_array_layer);
1190 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0,
1191 nil_image->array_stride_B >> 2);
1192 assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1193 assert(level->tiling.z_log2 == 0);
1194 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, {
1195 .width = WIDTH_ONE_GOB,
1196 .height = level->tiling.y_log2,
1197 .depth = DEPTH_ONE_GOB,
1198 });
1199
1200 const enum pipe_format p_format =
1201 nvk_format_to_pipe_format(iview->vk.format);
1202 const uint32_t row_stride_el =
1203 level->row_stride_B / util_format_get_blocksize(p_format);
1204 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0,
1205 row_stride_el);
1206 } else {
1207 P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1208 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, 0);
1209 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, 0);
1210 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, { });
1211 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0, 0);
1212 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0, 0);
1213 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0, 0);
1214 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, { });
1215 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0, 0);
1216 }
1217
1218 /* From the Vulkan 1.3.275 spec:
1219 *
1220 * "It is legal for a subpass to use no color or depth/stencil
1221 * attachments, either because it has no attachment references or
1222 * because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
1223 * can use shader side effects such as image stores and atomics to
1224 * produce an output. In this case, the subpass continues to use the
1225 * width, height, and layers of the framebuffer to define the dimensions
1226 * of the rendering area, and the rasterizationSamples from each
1227 * pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
1228 * of samples used in rasterization;"
1229 *
1230 * In the case where we have attachments, we emit SET_ANTI_ALIAS here
1231 * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
1232 * specifying the sample layout and we want to ensure it matches. When
1233 * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
1234 * where we base it on dynamic rasterizationSamples.
1235 */
1236 if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID)
1237 nvk_cmd_set_sample_layout(cmd, sample_layout);
1238
1239 if (render->flags & VK_RENDERING_RESUMING_BIT)
1240 return;
1241
1242 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1243 const struct nvk_image_view *iview = render->color_att[i].iview;
1244 if (iview == NULL)
1245 continue;
1246
1247 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1248 assert(iview->plane_count == 1);
1249 const uint8_t ip = iview->planes[0].image_plane;
1250 const struct nvk_image_plane *plane = &image->planes[ip];
1251
1252 const VkAttachmentLoadOp load_op =
1253 pRenderingInfo->pColorAttachments[i].loadOp;
1254 if (!render->all_linear &&
1255 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1256 load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
1257 nvk_linear_render_copy(cmd, iview, render->area, true);
1258 }
1259
1260 uint32_t clear_count = 0;
1261 VkClearAttachment clear_att[NVK_MAX_RTS + 1];
1262 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1263 const VkRenderingAttachmentInfo *att_info =
1264 &pRenderingInfo->pColorAttachments[i];
1265 if (att_info->imageView == VK_NULL_HANDLE ||
1266 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1267 continue;
1268
1269 clear_att[clear_count++] = (VkClearAttachment) {
1270 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1271 .colorAttachment = i,
1272 .clearValue = att_info->clearValue,
1273 };
1274 }
1275
1276 clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
1277 if (pRenderingInfo->pDepthAttachment != NULL &&
1278 pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
1279 pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1280 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
1281 clear_att[clear_count].clearValue.depthStencil.depth =
1282 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
1283 }
1284 if (pRenderingInfo->pStencilAttachment != NULL &&
1285 pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
1286 pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1287 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
1288 clear_att[clear_count].clearValue.depthStencil.stencil =
1289 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
1290 }
1291 if (clear_att[clear_count].aspectMask != 0)
1292 clear_count++;
1293
1294 if (clear_count > 0) {
1295 const VkClearRect clear_rect = {
1296 .rect = render->area,
1297 .baseArrayLayer = 0,
1298 .layerCount = render->view_mask ? 1 : render->layer_count,
1299 };
1300
1301 p = nvk_cmd_buffer_push(cmd, 2);
1302 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1303 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
1304
1305 nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
1306 clear_count, clear_att, 1, &clear_rect);
1307 p = nvk_cmd_buffer_push(cmd, 2);
1308 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1309 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
1310 }
1311
1312 /* TODO: Attachment clears */
1313 }
1314
1315 VKAPI_ATTR void VKAPI_CALL
1316 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
1317 {
1318 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
1319 struct nvk_rendering_state *render = &cmd->state.gfx.render;
1320
1321 if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
1322 for (uint32_t i = 0; i < render->color_att_count; i++) {
1323 struct nvk_image_view *iview = render->color_att[i].iview;
1324 if (iview == NULL)
1325 continue;
1326
1327 struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1328 const uint8_t ip = iview->planes[0].image_plane;
1329 const struct nvk_image_plane *plane = &image->planes[ip];
1330 if (!render->all_linear &&
1331 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1332 render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
1333 nvk_linear_render_copy(cmd, iview, render->area, false);
1334 }
1335 }
1336
1337 bool need_resolve = false;
1338
1339 /* Translate render state back to VK for meta */
1340 VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
1341 for (uint32_t i = 0; i < render->color_att_count; i++) {
1342 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
1343 need_resolve = true;
1344
1345 vk_color_att[i] = (VkRenderingAttachmentInfo) {
1346 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1347 .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
1348 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1349 .resolveMode = render->color_att[i].resolve_mode,
1350 .resolveImageView =
1351 nvk_image_view_to_handle(render->color_att[i].resolve_iview),
1352 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1353 };
1354 }
1355
1356 const VkRenderingAttachmentInfo vk_depth_att = {
1357 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1358 .imageView = nvk_image_view_to_handle(render->depth_att.iview),
1359 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1360 .resolveMode = render->depth_att.resolve_mode,
1361 .resolveImageView =
1362 nvk_image_view_to_handle(render->depth_att.resolve_iview),
1363 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1364 };
1365 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1366 need_resolve = true;
1367
1368 const VkRenderingAttachmentInfo vk_stencil_att = {
1369 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1370 .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
1371 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1372 .resolveMode = render->stencil_att.resolve_mode,
1373 .resolveImageView =
1374 nvk_image_view_to_handle(render->stencil_att.resolve_iview),
1375 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1376 };
1377 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1378 need_resolve = true;
1379
1380 const VkRenderingInfo vk_render = {
1381 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1382 .renderArea = render->area,
1383 .layerCount = render->layer_count,
1384 .viewMask = render->view_mask,
1385 .colorAttachmentCount = render->color_att_count,
1386 .pColorAttachments = vk_color_att,
1387 .pDepthAttachment = &vk_depth_att,
1388 .pStencilAttachment = &vk_stencil_att,
1389 };
1390
1391 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1392 need_resolve = false;
1393
1394 memset(render, 0, sizeof(*render));
1395
1396 if (need_resolve) {
1397 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1398 P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
1399 .lines = LINES_ALL,
1400 });
1401
1402 nvk_meta_resolve_rendering(cmd, &vk_render);
1403 }
1404 }
1405
1406 void
1407 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
1408 const gl_shader_stage stage,
1409 struct nvk_shader *shader)
1410 {
1411 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1412 if (cmd->state.gfx.shaders[stage] == shader)
1413 return;
1414
1415 cmd->state.gfx.shaders[stage] = shader;
1416 cmd->state.gfx.shaders_dirty |= mesa_to_vk_shader_stage(stage);
1417 }
1418
1419 uint32_t
1420 nvk_mme_tess_params(enum nak_ts_domain domain,
1421 enum nak_ts_spacing spacing,
1422 enum nak_ts_prims prims)
1423 {
1424 /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
1425 * extra bit for lower_left
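 * (domain in bits [3:0], spacing in bits [7:4], prims in bits [11:8],
 * and lower_left at bit 12)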
1426 */
1427 uint16_t params = ((uint16_t)domain << 0) |
1428 ((uint16_t)spacing << 4) |
1429 ((uint16_t)prims << 8);
1430 return nvk_mme_val_mask(params, 0x0fff);
1431 }
1432
1433 static uint32_t
1434 nvk_mme_tess_lower_left(bool lower_left)
1435 {
1436 return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
1437 }
1438
1439 void
1440 nvk_mme_set_tess_params(struct mme_builder *b)
1441 {
1442 struct mme_value val_mask = mme_load(b);
1443 struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
1444 struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
1445 mme_free_reg(b, val_mask);
1446
1447 mme_if(b, ine, params, old_params) {
1448 nvk_mme_store_scratch(b, TESS_PARAMS, params);
1449
1450 /* lower_left lives at bit 12 */
1451 struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);
1452
1453 /* Only the bottom 12 bits are valid to put in HW */
1454 mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);
1455
1456 /* If we're using a lower-left orientation, we need to flip triangles
1457 * between CW and CCW.
1458 */
1459 mme_if(b, ine, lower_left, mme_zero()) {
1460 struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
1461 struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);
1462
1463 struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
1464 mme_if(b, ieq, prims, prims_cw) {
1465 mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
1466 }
1467 mme_if(b, ieq, prims, prims_ccw) {
1468 mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
1469 }
1470 mme_free_reg(b, prims);
1471 }
1472 mme_free_reg(b, lower_left);
1473
1474 mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
1475 mme_emit(b, params);
1476 }
1477 }
1478
1479 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1480 /* This case doesn't change the state so it should do nothing */
1481 .init = (struct nvk_mme_mthd_data[]) {
1482 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1483 { }
1484 },
1485 .params = (uint32_t[]) { 0xffff0000 },
1486 .expected = (struct nvk_mme_mthd_data[]) {
1487 { }
1488 },
1489 }, {
1490 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1491 .init = (struct nvk_mme_mthd_data[]) {
1492 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1493 { }
1494 },
1495 .params = (uint32_t[]) { 0xffff0201 },
1496 .expected = (struct nvk_mme_mthd_data[]) {
1497 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1498 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1499 { }
1500 },
1501 }, {
1502 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1503 .init = (struct nvk_mme_mthd_data[]) {
1504 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1505 { }
1506 },
1507 .params = (uint32_t[]) { 0x10001000 },
1508 .expected = (struct nvk_mme_mthd_data[]) {
1509 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1510 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1511 { }
1512 },
1513 }, {
1514 /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1515 .init = (struct nvk_mme_mthd_data[]) {
1516 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1517 { }
1518 },
1519 .params = (uint32_t[]) { 0x10001000 },
1520 .expected = (struct nvk_mme_mthd_data[]) {
1521 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1522 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1523 { }
1524 },
1525 }, {}};
1526
1527 void
1528 nvk_cmd_flush_gfx_shaders(struct nvk_cmd_buffer *cmd)
1529 {
1530 if (cmd->state.gfx.shaders_dirty == 0)
1531 return;
1532
1533 /* Map shader types to shaders */
1534 struct nvk_shader *type_shader[6] = { NULL, };
1535 uint32_t types_dirty = 0;
1536
1537 u_foreach_bit(s, cmd->state.gfx.shaders_dirty &
1538 NVK_SHADER_STAGE_GRAPHICS_BITS) {
1539 gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1540 uint32_t type = mesa_to_nv9097_shader_type(stage);
1541 types_dirty |= BITFIELD_BIT(type);
1542
1543 /* Only copy non-NULL shaders because mesh/task alias with vertex and
1544 * tessellation stages.
1545 */
1546 struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1547 if (shader != NULL) {
1548 assert(type < ARRAY_SIZE(type_shader));
1549 assert(type_shader[type] == NULL);
1550 type_shader[type] = shader;
1551
1552 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1553 struct nvk_cbuf_group *cbuf_group =
1554 &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1555 for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1556 if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1557 sizeof(cbuf_group->cbufs[i])) != 0) {
1558 cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1559 cbuf_group->dirty |= BITFIELD_BIT(i);
1560 }
1561 }
1562 }
1563 }
1564
1565 u_foreach_bit(type, types_dirty) {
1566 struct nvk_shader *shader = type_shader[type];
1567 if (shader == NULL) {
1568 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1569 P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
1570 .enable = ENABLE_FALSE,
1571 .type = type,
1572 });
1573 } else {
1574 struct nv_push *p = nvk_cmd_buffer_push(cmd, shader->push_dw_count);
1575 nv_push_raw(p, shader->push_dw, shader->push_dw_count);
1576 }
1577 }
1578
1579 if (cmd->state.gfx.shaders_dirty & NVK_SHADER_STAGE_VTGM_BITS) {
1580 struct nvk_shader *last_vtgm = NULL;
1581 u_foreach_bit(s, NVK_SHADER_STAGE_VTGM_BITS) {
1582 gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1583 if (cmd->state.gfx.shaders[stage] != NULL)
1584 last_vtgm = cmd->state.gfx.shaders[stage];
1585 }
1586
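      /* last_vtgm now points at the last enabled pre-rasterization stage.
       * Its push_dw array carries the methods that only apply to the last
       * such stage as a tail after the regular ones (vtgm_push_dw_count vs.
       * push_dw_count), so emit just that tail here.
       */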
1587 assert(last_vtgm->vtgm_push_dw_count > last_vtgm->push_dw_count);
1588 const uint16_t dw_start = last_vtgm->push_dw_count;
1589 const uint16_t dw_count = last_vtgm->vtgm_push_dw_count - dw_start;
1590 struct nv_push *p = nvk_cmd_buffer_push(cmd, dw_count);
1591 nv_push_raw(p, &last_vtgm->push_dw[dw_start], dw_count);
1592 }
1593
1594 cmd->state.gfx.shaders_dirty = 0;
1595 }
1596
1597 void
1598 nvk_mme_set_vb_enables(struct mme_builder *b)
1599 {
1600 struct mme_value enables = mme_load(b);
1601 struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1602 nvk_mme_store_scratch(b, VB_ENABLES, enables);
1603
1604 struct mme_value changed = mme_xor(b, enables, old_enables);
1605 mme_free_reg(b, old_enables);
1606
1607 struct mme_value vb_idx4 = mme_mov(b, mme_zero());
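   /* Walk the bindings whose enable bit changed.  vb_idx4 is the dword index
    * of the current stream's FORMAT method (the per-stream methods are 4
    * dwords apart), and bit 12 of SET_VERTEX_STREAM_A_FORMAT holds the
    * enable, spliced in from the low bit of enables as both masks are
    * shifted down each iteration.
    */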
1608 mme_while(b, ine, changed, mme_zero()) {
1609 mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1610 struct mme_value state =
1611 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1612 mme_merge_to(b, state, state, enables, 12, 1, 0);
1613 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1614 mme_emit(b, state);
1615 }
1616 mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1617 mme_srl_to(b, changed, changed, mme_imm(1));
1618 mme_srl_to(b, enables, enables, mme_imm(1));
1619 }
1620 }
1621
1622 static uint32_t
1623 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1624 {
1625 assert(stride < (1 << 12));
1626 assert(vb_idx < (1 << 5));
1627 return (vb_idx << 16) | stride;
1628 }
1629
1630 void
1631 nvk_mme_set_vb_stride(struct mme_builder *b)
1632 {
1633 /* Param is laid out as
1634 *
1635 * bits 0..11 : stride
1636    * bits 16..20 : VB index
1637 */
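   /* For example, nvk_mme_vb_stride(3, 16) encodes binding 3 with a 16-byte
    * stride as 0x00030010.
    */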
1638 struct mme_value param = mme_load(b);
1639
1640 struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1641
1642 struct mme_value state =
1643 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1644 struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1645 mme_if(b, ine, state, new_state) {
1646 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1647 mme_emit(b, new_state);
1648 }
1649 }
1650
1651 static void
1652 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1653 {
1654 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1655 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1656 const struct vk_dynamic_graphics_state *dyn =
1657 &cmd->vk.dynamic_graphics_state;
1658
1659 struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1660
1661 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1662 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1663 P_INLINE_DATA(p, dyn->vi->bindings_valid);
1664 }
1665
1666 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1667 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1668 u_foreach_bit(a, dyn->vi->attributes_valid) {
1669 const struct nvk_va_format *fmt =
1670 nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1671
1672 P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1673 .stream = dyn->vi->attributes[a].binding,
1674 .offset = dyn->vi->attributes[a].offset,
1675 .component_bit_widths = fmt->bit_widths,
1676 .numerical_type = fmt->type,
1677 .swap_r_and_b = fmt->swap_rb,
1678 });
1679 }
1680
1681 u_foreach_bit(b, dyn->vi->bindings_valid) {
1682 const bool instanced = dyn->vi->bindings[b].input_rate ==
1683 VK_VERTEX_INPUT_RATE_INSTANCE;
1684 P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1685 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1686 dyn->vi->bindings[b].divisor);
1687 }
1688 }
1689
1690 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1691 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1692 u_foreach_bit(b, dyn->vi->bindings_valid) {
1693 assert(dyn->vi_binding_strides[b] < (1 << 12));
1694 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1695 P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1696 }
1697 }
1698 }
1699
1700 static uint32_t
1701 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1702 {
1703 switch (prim) {
1704 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1705 return NV9097_BEGIN_OP_POINTS;
1706 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1707 return NV9097_BEGIN_OP_LINES;
1708 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1709 return NV9097_BEGIN_OP_LINE_STRIP;
1710 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1711 #pragma GCC diagnostic push
1712 #pragma GCC diagnostic ignored "-Wswitch"
1713 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1714 #pragma GCC diagnostic pop
1715 return NV9097_BEGIN_OP_TRIANGLES;
1716 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1717 return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1718 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1719 return NV9097_BEGIN_OP_TRIANGLE_FAN;
1720 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1721 return NV9097_BEGIN_OP_LINELIST_ADJCY;
1722 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1723 return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1724 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1725 return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1726 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1727 return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1728 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1729 return NV9097_BEGIN_OP_PATCH;
1730 default:
1731 unreachable("Invalid primitive topology");
1732 }
1733 }
1734
1735 static void
1736 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1737 {
1738 const struct vk_dynamic_graphics_state *dyn =
1739 &cmd->vk.dynamic_graphics_state;
1740
1741 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1742 uint32_t begin;
1743 V_NV9097_BEGIN(begin, {
1744 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1745 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1746 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1747 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1748 });
1749
1750 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1751 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1752 P_INLINE_DATA(p, begin);
1753 }
1754
1755 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1756 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1757 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1758 dyn->ia.primitive_restart_enable);
1759 }
1760 }
1761
1762 static void
1763 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1764 {
1765 const struct vk_dynamic_graphics_state *dyn =
1766 &cmd->vk.dynamic_graphics_state;
1767 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1768
1769 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1770 /* The hardware gets grumpy if we set this to 0 so make sure we set it
1771 * to at least 1 in case it's dirty but uninitialized.
1772 */
1773 P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1774 }
1775
1776 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1777 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1778 P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1779 dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1780 }
1781 }
1782
1783 static void
1784 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1785 {
1786 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1787 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1788
1789 const struct vk_dynamic_graphics_state *dyn =
1790 &cmd->vk.dynamic_graphics_state;
1791
1792 struct nv_push *p =
1793 nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1794
1795 /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1796
1797 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1798 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
1799 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE)) {
1800 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1801 const VkViewport *vp = &dyn->vp.viewports[i];
1802
1803 /* These exactly match the spec values. Nvidia hardware oddities
1804 * are accounted for later.
1805 */
1806 const float o_x = vp->x + 0.5f * vp->width;
1807 const float o_y = vp->y + 0.5f * vp->height;
1808 const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1809 vp->minDepth :
1810 (vp->maxDepth + vp->minDepth) * 0.5f;
1811
1812 const float p_x = vp->width;
1813 const float p_y = vp->height;
1814 const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1815 vp->maxDepth - vp->minDepth :
1816 (vp->maxDepth - vp->minDepth) * 0.5f;
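         /* For example, a viewport of x=0, y=0, width=1920, height=1080,
          * minDepth=0.0, maxDepth=1.0 (with zero-to-one depth clip) yields
          * o = (960, 540, 0) and p = (1920, 1080, 1), i.e. a programmed
          * viewport scale of (960, 540, 1) and offset of (960, 540, 0).
          */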
1817
1818 P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1819 P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1820 P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1821 P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1822
1823 P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1824 P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1825 P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1826
1827 const bool user_defined_range =
1828 dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT;
1829 float xmin = vp->x;
1830 float xmax = vp->x + vp->width;
1831 float ymin = MIN2(vp->y, vp->y + vp->height);
1832 float ymax = MAX2(vp->y, vp->y + vp->height);
1833 float zmin = user_defined_range ?
1834 dyn->vp.depth_clamp_range.minDepthClamp :
1835 MIN2(vp->minDepth, vp->maxDepth);
1836 float zmax = user_defined_range ?
1837 dyn->vp.depth_clamp_range.maxDepthClamp :
1838 MAX2(vp->minDepth, vp->maxDepth);
1839 assert(xmin <= xmax && ymin <= ymax && zmin <= zmax);
1840
1841 const float max_dim = (float)0xffff;
1842 xmin = CLAMP(xmin, 0, max_dim);
1843 xmax = CLAMP(xmax, 0, max_dim);
1844 ymin = CLAMP(ymin, 0, max_dim);
1845 ymax = CLAMP(ymax, 0, max_dim);
1846
1847 if (!dev->vk.enabled_extensions.EXT_depth_range_unrestricted) {
1848 assert(0.0 <= zmin && zmin <= 1.0);
1849 assert(0.0 <= zmax && zmax <= 1.0);
1850 }
1851
1852 P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1853 P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1854 .x0 = xmin,
1855 .width = xmax - xmin,
1856 });
1857 P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1858 .y0 = ymin,
1859 .height = ymax - ymin,
1860 });
1861
1862 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1863 P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1864 P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1865 } else {
1866 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1867 P_INLINE_DATA(p, i);
1868 P_INLINE_DATA(p, fui(zmin));
1869 P_INLINE_DATA(p, fui(zmax));
1870 }
1871
1872 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1873 P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1874 .x = X_POS_X,
1875 .y = Y_POS_Y,
1876 .z = Z_POS_Z,
1877 .w = W_POS_W,
1878 });
1879 }
1880 }
1881 }
1882
1883 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1884 P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1885 dyn->vp.depth_clip_negative_one_to_one ?
1886 RANGE_NEGATIVE_W_TO_POSITIVE_W :
1887 RANGE_ZERO_TO_POSITIVE_W);
1888 }
1889
1890 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1891 for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1892 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1893 }
1894
1895 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1896 const uint32_t sr_max =
1897 nvk_image_max_dimension(&pdev->info, VK_IMAGE_TYPE_2D);
1898
1899 for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1900 const VkRect2D *s = &dyn->vp.scissors[i];
1901
1902 const uint32_t xmin = MIN2(sr_max, s->offset.x);
1903 const uint32_t xmax = MIN2(sr_max, s->offset.x + s->extent.width);
1904 const uint32_t ymin = MIN2(sr_max, s->offset.y);
1905 const uint32_t ymax = MIN2(sr_max, s->offset.y + s->extent.height);
1906
1907 P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1908 P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1909 P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1910 .xmin = xmin,
1911 .xmax = xmax,
1912 });
1913 P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1914 .ymin = ymin,
1915 .ymax = ymax,
1916 });
1917 }
1918 }
1919 }
1920
1921 static uint32_t
1922 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1923 {
1924 ASSERTED uint16_t vk_to_nv9097[] = {
1925 [VK_POLYGON_MODE_FILL] = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1926 [VK_POLYGON_MODE_LINE] = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1927 [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1928 };
1929 assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1930
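   /* The NV9097 values are laid out such that the conversion is simple
    * arithmetic; the ASSERTED table above only exists so debug builds can
    * verify that this stays true.
    */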
1931 uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1932 assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1933 return nv9097_mode;
1934 }
1935
1936 static uint32_t
1937 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1938 {
1939 static const uint16_t vk_to_nv9097[] = {
1940 [VK_CULL_MODE_FRONT_BIT] = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1941 [VK_CULL_MODE_BACK_BIT] = NV9097_OGL_SET_CULL_FACE_V_BACK,
1942 [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1943 };
1944 assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1945 return vk_to_nv9097[vk_cull_mode];
1946 }
1947
1948 static uint32_t
1949 vk_to_nv9097_front_face(VkFrontFace vk_face)
1950 {
1951 /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1952 * convention in which framebuffer coordinates always start in the upper
1953 * left while OpenGL has framebuffer coordinates starting in the lower
1954 * left. Therefore, we want the reverse of the hardware enum name.
1955 */
1956 ASSERTED static const uint16_t vk_to_nv9097[] = {
1957 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1958 [VK_FRONT_FACE_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CW,
1959 };
1960 assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1961
1962 uint32_t nv9097_face = 0x900 | (1 - vk_face);
1963 assert(nv9097_face == vk_to_nv9097[vk_face]);
1964 return nv9097_face;
1965 }
1966
1967 static uint32_t
1968 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1969 {
1970 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1971 NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1972 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1973 NV9097_SET_PROVOKING_VERTEX_V_LAST);
1974 return vk_mode;
1975 }
1976
1977 void
1978 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1979 {
1980 struct mme_value vp_idx = mme_load(b);
1981 struct mme_value min_z = mme_load(b);
1982 struct mme_value max_z = mme_load(b);
1983
1984 /* Multiply by 2 because it's an array with stride 8 */
1985 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1986 mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1987 mme_emit(b, min_z);
1988 mme_emit(b, max_z);
1989
1990 struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1991 mme_if(b, ine, z_clamp, mme_zero()) {
1992 /* Multiply by 2 again because this array has stride 16 */
1993 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1994 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1995 mme_emit(b, min_z);
1996 mme_emit(b, max_z);
1997 }
1998 }
1999
2000 void
2001 nvk_mme_set_z_clamp(struct mme_builder *b)
2002 {
2003 struct mme_value z_clamp = mme_load(b);
2004 struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
2005 mme_if(b, ine, z_clamp, old_z_clamp) {
2006 nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
2007
2008 mme_if(b, ine, z_clamp, mme_zero()) {
2009 struct mme_value i_2 = mme_mov(b, mme_zero());
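         /* i_2 indexes the scratch VIEWPORT0_MIN/MAX_Z pairs (2 dwords per
          * viewport) while the VIEWPORT_CLIP_MIN/MAX_Z array below uses
          * 4 dwords per viewport, hence the extra shift into i_4.
          */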
2010 mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
2011 struct mme_value min_z =
2012 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
2013 struct mme_value max_z =
2014 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
2015
2016 struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
2017 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2018 mme_emit(b, min_z);
2019 mme_emit(b, max_z);
2020
2021 mme_free_reg(b, i_4);
2022 mme_free_reg(b, min_z);
2023 mme_free_reg(b, max_z);
2024
2025 mme_add_to(b, i_2, i_2, mme_imm(2));
2026 }
2027 mme_free_reg(b, i_2);
2028 }
2029 mme_if(b, ieq, z_clamp, mme_zero()) {
2030 struct mme_value i_4 = mme_mov(b, mme_zero());
2031 mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
2032 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2033 mme_emit(b, mme_imm(fui(-INFINITY)));
2034 mme_emit(b, mme_imm(fui(INFINITY)));
2035
2036 mme_add_to(b, i_4, i_4, mme_imm(4));
2037 }
2038 mme_free_reg(b, i_4);
2039 }
2040 }
2041 }
2042
2043 static void
2044 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
2045 {
2046 const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2047 const struct vk_dynamic_graphics_state *dyn =
2048 &cmd->vk.dynamic_graphics_state;
2049 const struct nvk_rendering_state *render =
2050 &cmd->state.gfx.render;
2051
2052 struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
2053
2054 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
2055 P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
2056
2057 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2058 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2059 const bool z_clamp = dyn->rs.depth_clamp_enable;
2060 const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2061 /* z_clamp_zero_one accounts for the interaction between
2062 * depthClampZeroOne and depthRangeUnrestricted as mentioned in the
2063 * Vulkan spec. depthClampZeroOne adds an additional clamp and doesn't
2064 * modify the clip/clamp threshold. We are expected to clamp to [0,1]
2065       * when any one of these conditions is fulfilled:
2066 * - depth_range_unrestricted is not enabled
2067 * - depthClampZeroOne is enabled but depth
2068 * format is not floating point or depthRangeUnrestricted
2069 * is not enabled
2070 * - fixed point depth format
2071 */
2072 const bool z_clamp_zero_one =
2073 !vk_format_has_float_depth(render->depth_att.vk_format) ||
2074 (dev->vk.enabled_features.depthClampZeroOne &&
2075 !dev->vk.enabled_extensions.EXT_depth_range_unrestricted);
2076
2077 P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2078 /* We only set Z clip range if clamp is requested. Otherwise, we
2079 * leave it set to -/+INF and clip using the guardband below.
2080 *
2081 * depthClampZeroOne is independent of normal depth clamping and
2082 * does not modify the clip/clamp threshold. The Vulkan spec
2083 * guarantees that, in the cases where depthClampZeroOne applies,
2084 * the [zmin, zmax] is inside [0, 1]. This means that, if z_clamp
2085 * is enabled, we can just do the regular clamp. If z_clamp is
2086 * disabled and z_clamp_zero_one is enabled then we need to
2087 * apply the [0, 1] clamp.
2088 */
2089 .min_z_zero_max_z_one = (!z_clamp && z_clamp_zero_one)
2090 ? MIN_Z_ZERO_MAX_Z_ONE_TRUE
2091 : MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2092 .z_clip_range = (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A &&
2093 (z_clamp || !z_clamp_zero_one))
2094 ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2095 : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2096 : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2097
2098 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2099 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2100
2101 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2102 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2103 .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2104 : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2105
2106 /* We clip depth with the geometry clipper to ensure that it gets
2107 * clipped before depth bias is applied. If we leave it up to the
2108          * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2109 * in the pipeline. This can be seen in two different ways:
2110 *
2111 * - When depth bias is enabled, the bias is applied post-clipping.
2112 * If we clip in the rasterizer, it will clip according to the
2113 * post-bias depth which is wrong.
2114 *
2115 * - If the fragment shader overrides the depth by writing to
2116 * gl_FragDepth, it should be clipped according to the original
2117          * geometry, not according to gl_FragDepth.
2118 *
2119 * In order to always get the geometry clipper, we need to set a
2120 * tight guardband (geometry_guardband_z = SCALE_1).
2121 */
2122 .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2123 : GEOMETRY_GUARDBAND_Z_SCALE_256,
2124 });
2125
2126 /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2127 * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2128 * based on whether or not z_clamp is set. This is done by a pair of
2129 * macros, one of which is called here and the other is called in
2130 * viewport setup.
2131 */
2132 if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2133 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2134 P_INLINE_DATA(p, z_clamp);
2135 }
2136 }
2137
2138 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2139 uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2140 P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2141 P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2142 P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2143 }
2144
2145 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2146 P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2147
2148 if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2149 uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2150 P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2151 }
2152 }
2153
2154 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2155 P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2156 vk_to_nv9097_front_face(dyn->rs.front_face));
2157 }
2158
2159 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2160 P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2161 vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2162 }
2163
2164 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2165 P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2166 P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2167 P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2168 P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2169 }
2170
2171 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2172 switch (dyn->rs.depth_bias.representation) {
2173 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2174 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2175 DEPTH_FORMAT_DEPENDENT_TRUE);
2176 break;
2177 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2178 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2179 DEPTH_FORMAT_DEPENDENT_FALSE);
2180 break;
2181 case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2182 default:
2183 unreachable("Unsupported depth bias representation");
2184 }
2185 /* TODO: The blob multiplies by 2 for some reason. We don't. */
2186 P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant_factor));
2187 P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope_factor));
2188 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2189 }
2190
2191 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2192 P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2193 P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2194 P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2195 }
2196
2197 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2198 switch (dyn->rs.line.mode) {
2199 case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2200 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2201 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2202 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2203 break;
2204
2205 case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2206 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2207 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2208 break;
2209
2210 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2211 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2212 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2213 break;
2214
2215 default:
2216 unreachable("Invalid line rasterization mode");
2217 }
2218 }
2219
2220 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2221 P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2222
2223 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2224 /* map factor from [1,256] to [0, 255] */
2225 uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2226 P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2227 .factor = stipple_factor,
2228 .pattern = dyn->rs.line.stipple.pattern,
2229 });
2230 }
2231
2232 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2233 P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2234
2235 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2236 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2237 if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2238 assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2239 } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2240 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2241 } else {
2242 uint32_t extra_overestimate =
2243 MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2244
2245 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2246 P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2247 .extra_prim_bloat = extra_overestimate,
2248 .copy_inner_to_outer =
2249 (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2250 .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2251 .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2252 .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2253 });
2254 } else {
2255 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2256 P_INLINE_DATA(p, extra_overestimate << 23);
2257 }
2258 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2259 }
2260 }
2261 }
2262
2263 uint32_t
2264 nvk_mme_shading_rate_control_sample_shading(bool sample_shading)
2265 {
2266 return nvk_mme_val_mask((!sample_shading) << 1, 1 << 1);
2267 }
2268
2269 static uint32_t
2270 nvk_mme_shading_rate_control_enable(bool enable)
2271 {
2272 return nvk_mme_val_mask(enable, 1 << 0);
2273 }
2274
2275 void
2276 nvk_mme_set_shading_rate_control(struct mme_builder *b)
2277 {
2278 if (b->devinfo->cls_eng3d < TURING_A)
2279 return;
2280
2281 struct mme_value val_mask = mme_load(b);
2282 struct mme_value old_src = nvk_mme_load_scratch(b, SHADING_RATE_CONTROL);
2283 struct mme_value src = nvk_mme_set_masked(b, old_src, val_mask);
2284 mme_free_reg(b, val_mask);
2285
2286 mme_if(b, ine, src, old_src) {
2287 mme_free_reg(b, old_src);
2288 nvk_mme_store_scratch(b, SHADING_RATE_CONTROL, src);
2289
2290 struct mme_value enable1 = mme_merge(b, mme_zero(), src, 0, 1, 0);
2291 struct mme_value enable2 = mme_merge(b, mme_zero(), src, 0, 1, 1);
2292 struct mme_value enable = mme_and(b, enable1, enable2);
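      /* Shading-rate control is enabled only when both bits are set: bit 0
       * is the API enable and bit 1 is cleared when sample shading is in use
       * (see nvk_mme_shading_rate_control_sample_shading()).
       */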
2293
2294 struct mme_value i = mme_mov(b, mme_zero());
2295 mme_while(b, ine, i, mme_imm(16 * 4)) {
2296 mme_mthd_arr(b, NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(0), i);
2297 mme_emit(b, enable);
2298 mme_add_to(b, i, i, mme_imm(4));
2299 }
2300 }
2301 }
2302
2303 static void
2304 nvk_mme_set_shading_rate_control_test_check(
2305 const struct nv_device_info *devinfo,
2306 const struct nvk_mme_test_case *test,
2307 const struct nvk_mme_mthd_data *results)
2308 {
2309 if (devinfo->cls_eng3d < TURING_A)
2310 return;
2311
2312 assert(results[0].mthd == NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL));
2313 bool enable = (results[0].data & 3) == 3;
2314
2315 for (uint32_t i = 0; i < 16; i++) {
2316 assert(results[i + 1].mthd ==
2317 NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(i));
2318 assert(results[i + 1].data == enable);
2319 }
2320 }
2321
2322 const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[] = {{
2323 .init = (struct nvk_mme_mthd_data[]) {
2324 { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2325 { }
2326 },
2327 .params = (uint32_t[]) { 0x00030003 },
2328 .check = nvk_mme_set_shading_rate_control_test_check,
2329 }, {
2330 .init = (struct nvk_mme_mthd_data[]) {
2331 { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2332 { }
2333 },
2334 .params = (uint32_t[]) { 0x00030001 },
2335 .check = nvk_mme_set_shading_rate_control_test_check,
2336 }, {}};
2337
2338 static VkExtent2D
2339 nvk_combine_fs_log2_rates(VkFragmentShadingRateCombinerOpKHR op,
2340 VkExtent2D a_log2, VkExtent2D b_log2)
2341 {
2342 switch (op) {
2343 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2344 return a_log2;
2345
2346 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2347 return b_log2;
2348
2349 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2350 return (VkExtent2D) {
2351 .width = MIN2(a_log2.width, b_log2.width),
2352 .height = MIN2(a_log2.height, b_log2.height),
2353 };
2354
2355 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2356 return (VkExtent2D) {
2357 .width = MAX2(a_log2.width, b_log2.width),
2358 .height = MAX2(a_log2.height, b_log2.height),
2359 };
2360
2361 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR:
2362 return (VkExtent2D) {
2363 .width = a_log2.width + b_log2.width,
2364 .height = a_log2.height + b_log2.height,
2365 };
2366
2367 default:
2368 unreachable("Invalid FSR combiner op");
2369 }
2370 }
2371
2372 static uint8_t
2373 vk_to_nvc597_shading_rate_log2(VkExtent2D rate_log2)
2374 {
2375 rate_log2.width = MIN2(rate_log2.width, 2);
2376 rate_log2.height = MIN2(rate_log2.height, 2);
2377 const uint8_t idx = (rate_log2.width << 2) | rate_log2.height;
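   /* idx packs the clamped rate as (width_log2 << 2) | height_log2, so e.g.
    * a 4x2 fragment size gives idx 9 and selects the 4X2 entry below.
    */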
2378
2379 /* From the Vulkan 1.3.297 spec:
2380 *
2381 * "A fragment shading rate Rxy representing any of Axy, Bxy or Cxy
2382 * is clamped as follows. [...] From this list of supported rates,
2383 * the following steps are applied in order, to select a single
2384 * value:
2385 *
2386 * 1. Keep only rates where Rx' ≤ Rx and Ry' ≤ Ry.
2387 *
2388 * - Implementations may also keep rates where Rx' ≤ Ry and
2389 * Ry' ≤ Rx.
2390 *
2391 * 2. Keep only rates with the highest area (Rx' × Ry').
2392 *
2393 * 3. Keep only rates with the lowest aspect ratio (Rx' + Ry').
2394 *
2395 * 4. In cases where a wide (e.g. 4x1) and tall (e.g. 1x4) rate
2396 * remain, the implementation may choose either rate. However, it
2397 * must choose this rate consistently for the same shading rates,
2398 * render pass transform, and combiner operations for the
2399 * lifetime of the VkDevice.
2400 *
2401 * We have the following rates: 1x1, 2x1, 1x2, 2x2, 4x2, 2x4, 4x4.
2402 */
2403 static const uint8_t vk_to_nvc597[] = {
2404 #define NVC597_FSR(X) NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A_RATE_INDEX0_PS_##X
2405 NVC597_FSR(X1_PER_RASTER_PIXEL),
2406 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS),
2407 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x4 */
2408 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x8 */
2409 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS),
2410 NVC597_FSR(X1_PER_2X2_RASTER_PIXELS),
2411 NVC597_FSR(X1_PER_2X4_RASTER_PIXELS),
2412 NVC597_FSR(X1_PER_2X4_RASTER_PIXELS), /* 2x8 */
2413 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 4x1 */
2414 NVC597_FSR(X1_PER_4X2_RASTER_PIXELS),
2415 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS),
2416 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 4x8 */
2417 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 8x1 */
2418 NVC597_FSR(X1_PER_4X2_RASTER_PIXELS), /* 8x2 */
2419 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x4 */
2420 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x8 */
2421 #undef NVC597_FSR
2422 };
2423
2424 assert(idx < ARRAY_SIZE(vk_to_nvc597));
2425 return vk_to_nvc597[idx];
2426 }
2427
2428 static void
2429 nvk_flush_fsr_state(struct nvk_cmd_buffer *cmd)
2430 {
2431 const struct vk_dynamic_graphics_state *dyn =
2432 &cmd->vk.dynamic_graphics_state;
2433
2434 if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
2435 assert(vk_fragment_shading_rate_is_disabled(&dyn->fsr));
2436 return;
2437 }
2438
2439 if (!BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
2440 return;
2441
2442 if (vk_fragment_shading_rate_is_disabled(&dyn->fsr)) {
2443 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2444 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2445 P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(false));
2446 } else {
2447 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 16 * 3);
2448
2449 assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.width));
2450 assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.height));
2451 const VkExtent2D state_fs_log2 = {
2452 .width = util_logbase2(dyn->fsr.fragment_size.width),
2453 .height = util_logbase2(dyn->fsr.fragment_size.height),
2454 };
2455
2456 for (uint32_t prim_idx = 0; prim_idx < 16; prim_idx++) {
2457 const VkExtent2D prim_fs_log2 = {
2458 .width = (prim_idx >> 2) & 3,
2459 .height = prim_idx & 3,
2460 };
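         /* prim_idx uses the same (width_log2 << 2) | height_log2 encoding
          * as vk_to_nvc597_shading_rate_log2(), as does att_idx below, so
          * each table entry covers one primitive-rate/attachment-rate
          * combination.
          */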
2461
2462 const VkExtent2D state_prim_fs_log2 =
2463 nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[0],
2464 state_fs_log2, prim_fs_log2);
2465
2466 uint8_t rates[16] = {};
2467 for (uint32_t att_idx = 0; att_idx < 16; att_idx++) {
2468 const VkExtent2D att_fs_log2 = {
2469 .width = (att_idx >> 2) & 3,
2470 .height = att_idx & 3,
2471 };
2472
2473 const VkExtent2D fs_log2 =
2474 nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[1],
2475 state_prim_fs_log2, att_fs_log2);
2476
2477 rates[att_idx] = vk_to_nvc597_shading_rate_log2(fs_log2);
2478 }
2479
2480 P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(prim_idx));
2481 P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(p, prim_idx, {
2482 .rate_index0 = rates[0],
2483 .rate_index1 = rates[1],
2484 .rate_index2 = rates[2],
2485 .rate_index3 = rates[3],
2486 .rate_index4 = rates[4],
2487 .rate_index5 = rates[5],
2488 .rate_index6 = rates[6],
2489 .rate_index7 = rates[7],
2490 });
2491 P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_B(p, prim_idx, {
2492 .rate_index8 = rates[8],
2493 .rate_index9 = rates[9],
2494 .rate_index10 = rates[10],
2495 .rate_index11 = rates[11],
2496 .rate_index12 = rates[12],
2497 .rate_index13 = rates[13],
2498 .rate_index14 = rates[14],
2499 .rate_index15 = rates[15],
2500 });
2501 }
2502
2503 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2504 P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(true));
2505 }
2506 }
2507
2508 static uint32_t
2509 nvk_mme_anti_alias_init(void)
2510 {
2511    /* This is a valid value, but one we never actually set, so it ensures
2512     * that the macro will really run the first time we set anything.
2513 */
2514 return 0xf;
2515 }
2516
2517 uint32_t
2518 nvk_mme_anti_alias_min_sample_shading(float mss)
2519 {
2520    /* The value we want to compute in the MME is
2521 *
2522 * passes = next_pow2(samples * minSampleShading)
2523 *
2524 * Since samples is already a power of two,
2525 *
2526 * passes_log2 = log2_ceil(samples * minSampleShading)
2527 * = log2_ceil(samples / (1.0 / minSampleShading))
2528 * = samples_log2 - log2_floor(1.0 / minSampleShading)
2529 *
2530 * if we assume (1.0 / min_sample_shading) >= 1.0. This last bit is
2531 * something we can compute in the MME as long as the float math on the
2532 * right-hand side happens on the CPU.
2533 */
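   /* For example, 8 samples with minSampleShading = 0.5 gives
    * rcp_mss_log2 = 1, so the macro computes passes_log2 = 3 - 1 = 2,
    * i.e. 4 shading passes (see the 8-sample test case below).
    */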
2534 float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2535 uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2536
2537 assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2538
2539 return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2540 }
2541
2542 static uint32_t
2543 nvk_mme_anti_alias_samples(uint32_t samples)
2544 {
2545 assert(util_is_power_of_two_or_zero(samples));
2546 const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2547
2548 return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2549 }
2550
2551 void
2552 nvk_mme_set_anti_alias(struct mme_builder *b)
2553 {
2554 struct mme_value val_mask = mme_load(b);
2555 struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2556 struct mme_value anti_alias =
2557 nvk_mme_set_masked(b, old_anti_alias, val_mask);
2558 mme_free_reg(b, val_mask);
2559
2560 mme_if(b, ine, anti_alias, old_anti_alias) {
2561 mme_free_reg(b, old_anti_alias);
2562 nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2563
2564 struct mme_value rcp_mss_log2 =
2565 mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2566 struct mme_value samples_log2 =
2567 mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2568 mme_free_reg(b, anti_alias);
2569
2570       /* We've already done all the hard work on the CPU in
2571        * nvk_mme_anti_alias_min_sample_shading().  All we have to do here is
2572        * subtract the two log2 values and clamp so we don't go negative.
2573 */
2574 struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2575 mme_free_reg(b, rcp_mss_log2);
2576
2577 /* passes = MAX(passes, 1) */
2578 struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
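      /* neg is the sign bit of passes_log2, so it is non-zero exactly when
       * the subtraction above went below zero.
       */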
2579 mme_if(b, ine, neg, mme_zero()) {
2580 mme_mov_to(b, passes_log2, mme_zero());
2581 }
2582 mme_free_reg(b, neg);
2583
2584 /*
2585 * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2586 * ...
2587 * .centroid = passes > 1 ? CENTROID_PER_PASS
2588 * : CENTROID_PER_FRAGMENT,
2589 * }
2590 */
2591 struct mme_value aac = mme_mov(b,
2592 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2593 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2594 mme_if(b, ine, passes_log2, mme_zero()) {
2595 mme_mov_to(b, aac,
2596 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2597 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2598 }
2599
2600 struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2601 mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2602 mme_free_reg(b, passes);
2603
2604 mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2605 mme_emit(b, aac);
2606 mme_free_reg(b, aac);
2607
2608 /* Now we need to emit sample masks per-sample. Annoyingly, we have to
2609 * pack these in pairs.
2610 */
2611 STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
2612
2613 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2614 mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2615 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2616
2617 /* Annoyingly, we have to pack these in pairs */
2618
2619 struct mme_value samples_per_pass_log2 =
2620 mme_sub(b, samples_log2, passes_log2);
2621 mme_free_reg(b, samples_log2);
2622
2623 mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2624 /* One sample per pass, we can just blast it out */
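         /* e.g. the first packed dword is 0x00020001: sample 0's mask in the
          * low half and sample 1's in the high half (cf. the test cases
          * below).
          */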
2625 for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2626 uint32_t mask0 = 1 << i;
2627 uint32_t mask1 = 1 << (i + 1);
2628 mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2629 }
2630 }
2631
2632 mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2633 mme_if(b, ieq, passes_log2, mme_zero()) {
2634 /* It's a single pass so we can use 0xffff */
2635 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++)
2636 mme_emit(b, mme_imm(~0));
2637 }
2638
2639 mme_if(b, ieq, passes_log2, mme_imm(1)) {
2640 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2641 struct mme_value mask =
2642 nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_2PASS_0, i);
2643 mme_emit(b, mask);
2644 mme_free_reg(b, mask);
2645 }
2646 }
2647
2648 mme_if(b, ieq, passes_log2, mme_imm(2)) {
2649 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2650 struct mme_value mask =
2651 nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_4PASS_0, i);
2652 mme_emit(b, mask);
2653 mme_free_reg(b, mask);
2654 }
2655 }
2656 }
2657 }
2658 }
2659
2660 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2661 /* This case doesn't change the state so it should do nothing */
2662 .init = (struct nvk_mme_mthd_data[]) {
2663 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2664 { }
2665 },
2666 .params = (uint32_t[]) { 0xffff0000 },
2667 .expected = (struct nvk_mme_mthd_data[]) {
2668 { }
2669 },
2670 }, {
2671 /* Single sample, minSampleShading = 1.0 */
2672 .init = (struct nvk_mme_mthd_data[]) {
2673 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2674 { }
2675 },
2676 .params = (uint32_t[]) { 0xffff0000 },
2677 .expected = (struct nvk_mme_mthd_data[]) {
2678 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2679 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2680 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2681 nvk_root_descriptor_offset(draw.sample_masks) },
2682 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2683 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2684 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2685 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2686 { }
2687 },
2688 }, {
2689 /* Single sample, minSampleShading = 0.25 */
2690 .init = (struct nvk_mme_mthd_data[]) {
2691 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2692 { }
2693 },
2694 .params = (uint32_t[]) { 0xffff0002 },
2695 .expected = (struct nvk_mme_mthd_data[]) {
2696 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2697 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2698 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2699 nvk_root_descriptor_offset(draw.sample_masks) },
2700 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2701 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2702 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2703 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2704 { }
2705 },
2706 }, {
2707 /* 8 samples, minSampleShading = 0.5 */
2708 .init = (struct nvk_mme_mthd_data[]) {
2709 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2710 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_0), 0x030003 },
2711 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_1), 0x0c000c },
2712 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_2), 0x300030 },
2713 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_3), 0xc000c0 },
2714 { }
2715 },
2716 .params = (uint32_t[]) { 0x00f00030 },
2717 .expected = (struct nvk_mme_mthd_data[]) {
2718 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2719 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2720 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2721 nvk_root_descriptor_offset(draw.sample_masks) },
2722 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2723 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2724 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2725 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2726 { }
2727 },
2728 }, {
2729 /* 8 samples, minSampleShading = 0.25 */
2730 .init = (struct nvk_mme_mthd_data[]) {
2731 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2732 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_0), 0x0f000f },
2733 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_1), 0x0f000f },
2734 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_2), 0xf000f0 },
2735 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_3), 0xf000f0 },
2736 { }
2737 },
2738 .params = (uint32_t[]) { 0x000f0002 },
2739 .expected = (struct nvk_mme_mthd_data[]) {
2740 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2741 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2742 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2743 nvk_root_descriptor_offset(draw.sample_masks) },
2744 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2745 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2746 { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2747 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2748 { }
2749 },
2750 }, {}};
2751
2752 static VkSampleLocationEXT
2753 vk_sample_location(const struct vk_sample_locations_state *sl,
2754 uint32_t x, uint32_t y, uint32_t s)
2755 {
2756 x = x % sl->grid_size.width;
2757 y = y % sl->grid_size.height;
2758
2759 return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2760 }
2761
2762 static struct nak_sample_location
2763 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2764 {
2765 return (struct nak_sample_location) {
2766 .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2767 .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2768 };
2769 }
2770
2771 static void
2772 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2773 {
2774 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2775 const struct vk_dynamic_graphics_state *dyn =
2776 &cmd->vk.dynamic_graphics_state;
2777
2778 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2779 /* When we don't have any attachments, we can't know the sample count
2780 * from the render pass so we need to emit SET_ANTI_ALIAS here. See the
2781 * comment in nvk_BeginRendering() for more details.
2782 */
2783 if (render->samples == 0) {
2784 /* Multisample information MAY be missing (rasterizationSamples == 0)
2785 * if rasterizer discard is enabled. However, this isn't valid in
2786 * the hardware so always use at least one sample.
2787 */
2788 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2789 nvk_cmd_set_sample_layout(cmd, nil_choose_sample_layout(samples));
2790 } else {
2791 /* Multisample information MAY be missing (rasterizationSamples == 0)
2792 * if rasterizer discard is enabled.
2793 */
2794 assert(dyn->ms.rasterization_samples == 0 ||
2795 dyn->ms.rasterization_samples == render->samples);
2796 }
2797 }
2798
2799 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2800 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2801 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2802 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2803 .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2804 .alpha_to_one = dyn->ms.alpha_to_one_enable,
2805 });
2806 }
2807
2808 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2809 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2810 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2811 const struct vk_sample_locations_state *sl;
2812 if (dyn->ms.sample_locations_enable) {
2813 sl = dyn->ms.sample_locations;
2814 } else {
2815 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2816 sl = vk_standard_sample_locations_state(samples);
2817 }
2818
2819 struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2820 for (uint32_t i = 0; i < sl->per_pixel; i++)
2821 push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2822
2823 nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2824 draw.sample_locations,
2825 0, NVK_MAX_SAMPLES, push_sl);
2826
2827 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2828 struct nak_sample_location loc[16];
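         /* The methods below take 16 sample positions; fill them by cycling
          * through the per-pixel samples and letting vk_sample_location()
          * wrap the user's sample grid.
          */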
2829 for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2830 const uint32_t s = n % sl->per_pixel;
2831 const uint32_t px = n / sl->per_pixel;
2832 const uint32_t x = px % 2;
2833 const uint32_t y = px / 2;
2834
2835 loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2836 }
2837
2838 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2839
2840 P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2841 for (uint32_t i = 0; i < 4; i++) {
2842 P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2843 .x0 = loc[i * 4 + 0].x_u4,
2844 .y0 = loc[i * 4 + 0].y_u4,
2845 .x1 = loc[i * 4 + 1].x_u4,
2846 .y1 = loc[i * 4 + 1].y_u4,
2847 .x2 = loc[i * 4 + 2].x_u4,
2848 .y2 = loc[i * 4 + 2].y_u4,
2849 .x3 = loc[i * 4 + 3].x_u4,
2850 .y3 = loc[i * 4 + 3].y_u4,
2851 });
2852 }
2853 }
2854 }
2855
2856 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2857 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2858 P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2859 P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2860 P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2861 P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2862 P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2863 }
2864 }
2865
2866 static uint32_t
2867 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2868 {
2869 ASSERTED static const uint16_t vk_to_nv9097[] = {
2870 [VK_COMPARE_OP_NEVER] = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2871 [VK_COMPARE_OP_LESS] = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2872 [VK_COMPARE_OP_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2873 [VK_COMPARE_OP_LESS_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2874 [VK_COMPARE_OP_GREATER] = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2875 [VK_COMPARE_OP_NOT_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2876 [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2877 [VK_COMPARE_OP_ALWAYS] = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2878 };
2879 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2880
2881 uint32_t nv9097_op = 0x200 | vk_op;
2882 assert(nv9097_op == vk_to_nv9097[vk_op]);
2883 return nv9097_op;
2884 }
2885
2886 static uint32_t
2887 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2888 {
2889 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2890 ASSERTED static const uint16_t vk_to_nv9097[] = {
2891 OP(KEEP, D3D_KEEP),
2892 OP(ZERO, D3D_ZERO),
2893 OP(REPLACE, D3D_REPLACE),
2894 OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2895 OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2896 OP(INVERT, D3D_INVERT),
2897 OP(INCREMENT_AND_WRAP, D3D_INCR),
2898 OP(DECREMENT_AND_WRAP, D3D_DECR),
2899 };
2900 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2901 #undef OP
2902
2903 uint32_t nv9097_op = vk_op + 1;
2904 assert(nv9097_op == vk_to_nv9097[vk_op]);
2905 return nv9097_op;
2906 }
2907
2908 static void
2909 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2910 {
2911 struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2912
2913 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2914 const struct vk_dynamic_graphics_state *dyn =
2915 &cmd->vk.dynamic_graphics_state;
2916
2917 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2918 bool enable = dyn->ds.depth.test_enable &&
2919 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2920 P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2921 }
2922
2923 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2924 bool enable = dyn->ds.depth.write_enable &&
2925 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2926 P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2927 }
2928
2929 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2930 const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2931 P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2932 }
2933
2934 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2935 bool enable = dyn->ds.depth.bounds_test.enable &&
2936 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2937 P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2938 }
2939
2940 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2941 P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2942 P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2943 P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2944 }
2945
2946 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2947 bool enable = dyn->ds.stencil.test_enable &&
2948 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2949 P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2950 }
2951
2952 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2953 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2954 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2955 P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2956 P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2957 P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2958 P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2959 P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2960
2961 P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2962 P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2963 P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2964 P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2965 P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2966 }
2967
2968 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2969 P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2970 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2971 }
2972
2973 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2974 P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2975 P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2976 }
2977
2978 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2979 P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2980 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2981 }
2982 }
2983
2984 static uint32_t
2985 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2986 {
2987 ASSERTED uint16_t vk_to_nv9097[] = {
2988 [VK_LOGIC_OP_CLEAR] = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2989 [VK_LOGIC_OP_AND] = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2990 [VK_LOGIC_OP_AND_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2991 [VK_LOGIC_OP_COPY] = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2992 [VK_LOGIC_OP_AND_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2993 [VK_LOGIC_OP_NO_OP] = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2994 [VK_LOGIC_OP_XOR] = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2995 [VK_LOGIC_OP_OR] = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2996 [VK_LOGIC_OP_NOR] = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2997 [VK_LOGIC_OP_EQUIVALENT] = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2998 [VK_LOGIC_OP_INVERT] = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2999 [VK_LOGIC_OP_OR_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
3000 [VK_LOGIC_OP_COPY_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
3001 [VK_LOGIC_OP_OR_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
3002 [VK_LOGIC_OP_NAND] = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
3003 [VK_LOGIC_OP_SET] = NV9097_SET_LOGIC_OP_FUNC_V_SET,
3004 };
3005 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3006
3007 uint32_t nv9097_op = 0x1500 | vk_op;
3008 assert(nv9097_op == vk_to_nv9097[vk_op]);
3009 return nv9097_op;
3010 }
3011
3012 static uint32_t
3013 vk_to_nv9097_blend_op(VkBlendOp vk_op)
3014 {
3015 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
3016 ASSERTED uint16_t vk_to_nv9097[] = {
3017 OP(ADD, FUNC_ADD),
3018 OP(SUBTRACT, FUNC_SUBTRACT),
3019 OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
3020 OP(MIN, MIN),
3021 OP(MAX, MAX),
3022 };
3023 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3024 #undef OP
3025
3026 return vk_to_nv9097[vk_op];
3027 }
3028
3029 static uint32_t
3030 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
3031 {
3032 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
3033 NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
3034 ASSERTED uint16_t vk_to_nv9097[] = {
3035 FACTOR(ZERO, OGL_ZERO),
3036 FACTOR(ONE, OGL_ONE),
3037 FACTOR(SRC_COLOR, OGL_SRC_COLOR),
3038 FACTOR(ONE_MINUS_SRC_COLOR, OGL_ONE_MINUS_SRC_COLOR),
3039 FACTOR(DST_COLOR, OGL_DST_COLOR),
3040 FACTOR(ONE_MINUS_DST_COLOR, OGL_ONE_MINUS_DST_COLOR),
3041 FACTOR(SRC_ALPHA, OGL_SRC_ALPHA),
3042 FACTOR(ONE_MINUS_SRC_ALPHA, OGL_ONE_MINUS_SRC_ALPHA),
3043 FACTOR(DST_ALPHA, OGL_DST_ALPHA),
3044 FACTOR(ONE_MINUS_DST_ALPHA, OGL_ONE_MINUS_DST_ALPHA),
3045 FACTOR(CONSTANT_COLOR, OGL_CONSTANT_COLOR),
3046 FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
3047 FACTOR(CONSTANT_ALPHA, OGL_CONSTANT_ALPHA),
3048 FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
3049 FACTOR(SRC_ALPHA_SATURATE, OGL_SRC_ALPHA_SATURATE),
3050 FACTOR(SRC1_COLOR, OGL_SRC1COLOR),
3051 FACTOR(ONE_MINUS_SRC1_COLOR, OGL_INVSRC1COLOR),
3052 FACTOR(SRC1_ALPHA, OGL_SRC1ALPHA),
3053 FACTOR(ONE_MINUS_SRC1_ALPHA, OGL_INVSRC1ALPHA),
3054 };
3055 assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
3056 #undef FACTOR
3057
3058 return vk_to_nv9097[vk_factor];
3059 }
3060
3061 void
3062 nvk_mme_set_write_mask(struct mme_builder *b)
3063 {
3064 struct mme_value count = mme_load(b);
3065 struct mme_value mask = mme_load(b);
3066
3067 /*
3068 * mask is a bit field
3069 *
3070 * attachment index 88887777666655554444333322221111
3071 * component abgrabgrabgrabgrabgrabgrabgrabgr
3072 */
3073
3074 struct mme_value common_mask = mme_mov(b, mme_imm(1));
3075 struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3076 struct mme_value i = mme_mov(b, mme_zero());
3077
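   /* common_mask starts out as 1 (all attachments assumed to share a mask)
    * and is cleared in the loop below whenever an attachment's 4-bit mask
    * differs from the first one; it feeds SET_SINGLE_CT_WRITE_CONTROL at
    * the end.
    */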
3078 mme_while(b, ine, i, count) {
3079 /*
3080 We call NV9097_SET_CT_WRITE per attachment. It needs a value of the form:
3081 0x0000 0000 0000 0000 000a 000b 000g 000r
3082
3083 So for i=0 a mask of
3084 0x0000 0000 0000 0000 0000 0000 0000 1111
3085 becomes
3086 0x0000 0000 0000 0000 0001 0001 0001 0001
3087 */
3088
3089 struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
3090 mme_merge_to(b, val, val, mask, 4, 1, 1);
3091 mme_merge_to(b, val, val, mask, 8, 1, 2);
3092 mme_merge_to(b, val, val, mask, 12, 1, 3);
3093
3094 mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
3095 mme_emit(b, val);
3096 mme_free_reg(b, val);
3097
3098 /* Check if all masks are common */
3099 struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3100 mme_if(b, ine, first, temp) {
3101 mme_mov_to(b, common_mask, mme_zero());
3102 }
3103 mme_free_reg(b, temp);
3104
3105 mme_srl_to(b, mask, mask, mme_imm(4));
3106
3107 mme_add_to(b, i, i, mme_imm(1));
3108 }
3109
3110 mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
3111 mme_emit(b, common_mask);
3112 }
3113
3114 static void
3115 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
3116 {
3117 struct nvk_rendering_state *render = &cmd->state.gfx.render;
3118 const struct vk_dynamic_graphics_state *dyn =
3119 &cmd->vk.dynamic_graphics_state;
3120
3121 struct nv_push *p =
3122 nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
3123
3124 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
3125 P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
3126
3127 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
3128 const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
3129 P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
3130 }
3131
3132 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
3133 for (uint8_t a = 0; a < render->color_att_count; a++) {
3134 P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
3135 }
3136 }
3137
3138 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
3139 for (uint8_t a = 0; a < render->color_att_count; a++) {
3140 const struct vk_color_blend_attachment_state *att =
3141 &dyn->cb.attachments[a];
3142 P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
3143 P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
3144 P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
3145 vk_to_nv9097_blend_op(att->color_blend_op));
3146 P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
3147 vk_to_nv9097_blend_factor(att->src_color_blend_factor));
3148 P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
3149 vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
3150 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
3151 vk_to_nv9097_blend_op(att->alpha_blend_op));
3152 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
3153 vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
3154 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
3155 vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
3156 }
3157 }
3158
3159 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
3160 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
3161 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
3162 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
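      /* The per-attachment nibble mask handed to the MME macro below is the
       * AND of four masks: the app's color write enables, the per-attachment
       * component write masks, the attachments actually written by this
       * render pass, and the attachments that are mapped to some location.
       */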
3163 uint32_t color_write_enables = 0x0;
3164 for (uint8_t a = 0; a < render->color_att_count; a++) {
3165 if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
3166 color_write_enables |= 0xf << (4 * a);
3167 }
3168
3169 uint32_t cb_att_write_mask = 0x0;
3170 for (uint8_t a = 0; a < render->color_att_count; a++)
3171 cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
3172
3173 uint32_t rp_att_write_mask = 0x0;
3174 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3175 if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
3176 rp_att_write_mask |= 0xf << (4 * a);
3177 }
3178
3179 uint32_t att_has_loc_mask = 0x0;
3180 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3181 if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
3182 att_has_loc_mask |= 0xf << (4 * a);
3183 }
3184
3185 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
3186 P_INLINE_DATA(p, render->color_att_count);
3187 P_INLINE_DATA(p, color_write_enables &
3188 cb_att_write_mask &
3189 rp_att_write_mask &
3190 att_has_loc_mask);
3191 }
3192
3193 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3194 int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
3195 uint8_t max_loc = 0;
3196 uint32_t att_used = 0;
3197 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3198 if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
3199 continue;
3200
3201 att_used |= BITFIELD_BIT(a);
3202
3203 assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
3204 loc_att[dyn->cal.color_map[a]] = a;
3205 max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
3206 }
3207
3208 for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
3209 if (loc_att[l] >= 0)
3210 continue;
3211
3212 /* Just grab any color attachment. The way we set up color targets
3213 * in BeginRenderPass ensures that every color target is either the
3214 * valid color target referenced by this render pass or a valid NULL
3215 * target. If we end up mapping to some other target in this render
3216 * pass, the handling of att_has_loc_mask above will ensure that no
3217 * color writes actually happen.
3218 */
3219 uint8_t a = ffs(~att_used) - 1;
3220 att_used |= BITFIELD_BIT(a);
3221 loc_att[l] = a;
3222 }
3223
3224 P_IMMD(p, NV9097, SET_CT_SELECT, {
3225 .target_count = max_loc + 1,
3226 .target0 = loc_att[0],
3227 .target1 = loc_att[1],
3228 .target2 = loc_att[2],
3229 .target3 = loc_att[3],
3230 .target4 = loc_att[4],
3231 .target5 = loc_att[5],
3232 .target6 = loc_att[6],
3233 .target7 = loc_att[7],
3234 });
3235 }
3236
3237 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3238 P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
3239 P_NV9097_SET_BLEND_CONST_RED(p, fui(dyn->cb.blend_constants[0]));
3240 P_NV9097_SET_BLEND_CONST_GREEN(p, fui(dyn->cb.blend_constants[1]));
3241 P_NV9097_SET_BLEND_CONST_BLUE(p, fui(dyn->cb.blend_constants[2]));
3242 P_NV9097_SET_BLEND_CONST_ALPHA(p, fui(dyn->cb.blend_constants[3]));
3243 }
3244 }
3245
3246 void
3247 nvk_cmd_flush_gfx_dynamic_state(struct nvk_cmd_buffer *cmd)
3248 {
3249 struct vk_dynamic_graphics_state *dyn =
3250 &cmd->vk.dynamic_graphics_state;
3251
3252 if (!vk_dynamic_graphics_state_any_dirty(dyn))
3253 return;
3254
3255 nvk_flush_vi_state(cmd);
3256 nvk_flush_ia_state(cmd);
3257 nvk_flush_ts_state(cmd);
3258 nvk_flush_vp_state(cmd);
3259 nvk_flush_rs_state(cmd);
3260 nvk_flush_fsr_state(cmd);
3261 nvk_flush_ms_state(cmd);
3262 nvk_flush_ds_state(cmd);
3263 nvk_flush_cb_state(cmd);
3264
3265 vk_dynamic_graphics_state_clear_dirty(dyn);
3266 }
3267
3268 void
3269 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
3270 {
3271 /* Bottom 4 bits are the group, the remaining bits are the slot */
3272 struct mme_value group_slot = mme_load(b);
3273
3274 struct mme_value addr_lo, addr_hi, size;
3275 if (nvk_use_bindless_cbuf(b->devinfo)) {
3276 if (b->devinfo->cls_eng3d >= TURING_A) {
3277 struct mme_value64 addr = mme_load_addr64(b);
3278 mme_tu104_read_fifoed(b, addr, mme_imm(2));
3279 }
3280
3281 /* Load the descriptor */
3282 struct mme_value desc_lo = mme_load(b);
3283 struct mme_value desc_hi = mme_load(b);
3284
3285 /* The bottom 45 bits are addr >> 4 */
3286 addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
3287 addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
3288 mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
3289
3290 /* The top 19 bits are size >> 4 */
3291 size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
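      /* In other words, addr = desc[44:0] << 4 and size = desc[63:45] << 4,
       * reassembled from the two 32-bit halves of the descriptor with
       * mme_merge.
       */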
3292
3293 mme_free_reg(b, desc_hi);
3294 mme_free_reg(b, desc_lo);
3295 } else {
3296 if (b->devinfo->cls_eng3d >= TURING_A) {
3297 struct mme_value64 addr = mme_load_addr64(b);
3298 mme_tu104_read_fifoed(b, addr, mme_imm(3));
3299 }
3300
3301 /* Load the descriptor */
3302 addr_lo = mme_load(b);
3303 addr_hi = mme_load(b);
3304 size = mme_load(b);
3305 }
3306
3307 struct mme_value cb = mme_alloc_reg(b);
3308 mme_if(b, ieq, size, mme_zero()) {
3309 /* Bottom bit is the valid bit, 8:4 are shader slot */
3310 mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3311 }
3312
3313 mme_if(b, ine, size, mme_zero()) {
3314 /* size = min(size, NVK_MAX_CBUF_SIZE) */
3315 assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3316 struct mme_value is_large =
3317 mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3318 mme_if(b, ine, is_large, mme_zero()) {
3319 mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3320 }
3321
3322 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3323 mme_emit(b, size);
3324 mme_emit(b, addr_hi);
3325 mme_emit(b, addr_lo);
3326
3327 /* Bottom bit is the valid bit, 8:4 are shader slot */
3328 mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3329 }
3330
3331 mme_free_reg(b, addr_hi);
3332 mme_free_reg(b, addr_lo);
3333 mme_free_reg(b, size);
3334
3335 /* The group comes in the bottom 4 bits in group_slot and we need to
3336 * combine it with the method. However, unlike most array methods with a
3337 * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3338 * dwords. This means we need to also shift by 3.
3339 */
3340 struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
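   /* For example, group 2 (group_slot & 0xf == 2) yields 2 << 3 == 16, which
    * selects BIND_GROUP_CONSTANT_BUFFER(2) given the 8-dword per-group
    * stride.
    */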
3341 mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3342 mme_emit(b, cb);
3343 }
3344
3345 void
3346 nvk_cmd_flush_gfx_cbufs(struct nvk_cmd_buffer *cmd)
3347 {
3348 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3349 struct nvk_physical_device *pdev = nvk_device_physical(dev);
3350 const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3351 struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3352
3353 /* Find cbuf maps for the 5 cbuf groups */
3354 const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3355 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3356 const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3357 if (shader == NULL)
3358 continue;
3359
3360 uint32_t group = nvk_cbuf_binding_for_stage(stage);
3361 assert(group < ARRAY_SIZE(cbuf_shaders));
3362 cbuf_shaders[group] = shader;
3363 }
3364
3365 bool bound_any_cbuf = false;
3366 for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3367 if (cbuf_shaders[g] == NULL)
3368 continue;
3369
3370 const struct nvk_shader *shader = cbuf_shaders[g];
3371 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3372 struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3373
3374 /* We only bother to re-bind cbufs that are in use */
3375 const uint32_t rebind =
3376 group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3377 if (!rebind)
3378 continue;
3379
3380 u_foreach_bit(c, rebind) {
3381 const struct nvk_cbuf *cbuf = &group->cbufs[c];
3382
3383 /* We bind these at the very end */
3384 if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3385 continue;
3386
3387 bound_any_cbuf = true;
3388
3389 struct nvk_buffer_address ba;
3390 if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3391 assert(ba.base_addr % min_cbuf_alignment == 0);
3392 ba.size = align(ba.size, min_cbuf_alignment);
3393 ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3394
3395 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3396
3397 if (ba.size > 0) {
3398 P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3399 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3400 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3401 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3402 }
3403
3404 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3405 .valid = ba.size > 0,
3406 .shader_slot = c,
3407 });
3408 } else {
3409 uint64_t desc_addr =
3410 nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3411
3412 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3413 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3414
3415 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3416 P_INLINE_DATA(p, g | (c << 4));
3417 P_INLINE_DATA(p, desc_addr >> 32);
3418 P_INLINE_DATA(p, desc_addr);
3419 } else {
3420 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3421
3422 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3423 P_INLINE_DATA(p, g | (c << 4));
3424
3425 nv_push_update_count(p, 3);
3426 nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3427 }
3428 }
3429 }
3430
3431 group->dirty &= ~rebind;
3432 }
3433
3434 /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3435 * always left pointing at the root descriptor table. This way draw
3436 * parameters and similar MME root table updates always hit the root
3437 * descriptor table and not some random UBO.
3438 */
3439 if (bound_any_cbuf) {
3440 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3441 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3442 P_INLINE_DATA(p, 0);
3443 }
3444 }
3445
3446 static void
3447 nvk_cmd_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3448 {
3449 nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
3450 nvk_cmd_flush_gfx_dynamic_state(cmd);
3451 nvk_cmd_flush_gfx_shaders(cmd);
3452 nvk_cmd_flush_gfx_cbufs(cmd);
3453 }
3454
3455 void
3456 nvk_mme_bind_ib(struct mme_builder *b)
3457 {
3458 struct mme_value64 addr = mme_load_addr64(b);
3459 struct mme_value size_B = mme_load(b);
3460
3461 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3462 mme_if(b, ieq, addr_or, mme_zero()) {
3463 mme_mov_to(b, size_B, mme_zero());
3464 }
3465 mme_free_reg(b, addr_or);
3466
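   /* Pre-Turing there is no explicit index buffer size method, only a limit
    * address, so an empty binding is redirected to the ZERO_ADDR scratch
    * address (presumably a dummy allocation saved off elsewhere by the
    * driver) so that the address/limit pair programmed below stays valid.
    */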
3467 if (b->devinfo->cls_eng3d < TURING_A) {
3468 mme_if(b, ieq, size_B, mme_zero()) {
3469 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3470 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3471 }
3472 }
3473
3474 mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3475 mme_emit(b, addr.hi);
3476 mme_emit(b, addr.lo);
3477
3478 if (b->devinfo->cls_eng3d >= TURING_A) {
3479 mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3480 mme_emit(b, mme_zero());
3481 mme_emit(b, size_B);
3482 } else {
3483 /* Convert to an end address */
3484 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3485 mme_add64_to(b, addr, addr, mme_imm64(-1));
3486
3487 /* mme_mthd(b, NV9097_SET_INDEX_BUFFER_C); */
3488 mme_emit(b, addr.hi);
3489 mme_emit(b, addr.lo);
3490 }
3491 mme_free_reg64(b, addr);
3492 mme_free_reg(b, size_B);
3493
3494 struct mme_value fmt = mme_load(b);
3495 struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3496 struct mme_value index_type = mme_mov(b,
3497 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3498
3499 /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3500 * time with one MME macro.
3501 */
3502 UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3503 static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3504 static const uint32_t DXGI_FORMAT_R8_UINT = 62;
3505
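   /* The defaults above select 32-bit indices (covering VK_INDEX_TYPE_UINT32
    * and DXGI_FORMAT_R32_UINT); the checks below narrow that to 16- or 8-bit
    * indices for the remaining Vulkan and D3D values and pick the matching
    * primitive restart index.
    */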
3506 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3507 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3508 mme_mov_to(b, index_type,
3509 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3510 }
3511
3512 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3513 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3514 mme_mov_to(b, index_type,
3515 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3516 }
3517
3518 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3519 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3520 mme_mov_to(b, index_type,
3521 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3522 }
3523
3524 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3525 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3526 mme_mov_to(b, index_type,
3527 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3528 }
3529
3530 mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3531 mme_emit(b, restart);
3532
3533 mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3534 mme_emit(b, index_type);
3535 }
3536
3537 VKAPI_ATTR void VKAPI_CALL
3538 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3539 VkBuffer _buffer,
3540 VkDeviceSize offset,
3541 VkDeviceSize size,
3542 VkIndexType indexType)
3543 {
3544 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3545 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3546 struct nvk_addr_range addr_range =
3547 nvk_buffer_addr_range(buffer, offset, size);
3548
3549 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3550 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3551 P_INLINE_DATA(p, addr_range.addr >> 32);
3552 P_INLINE_DATA(p, addr_range.addr);
3553 assert(addr_range.range <= UINT32_MAX);
3554 P_INLINE_DATA(p, addr_range.range);
3555 P_INLINE_DATA(p, indexType);
3556 }
3557
3558 void
3559 nvk_mme_bind_vb(struct mme_builder *b)
3560 {
3561 struct mme_value vb_idx = mme_load(b);
3562 struct mme_value64 addr = mme_load_addr64(b);
3563 struct mme_value size_B = mme_load(b);
3564
3565 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3566 mme_if(b, ieq, addr_or, mme_zero()) {
3567 mme_mov_to(b, size_B, mme_zero());
3568 }
3569 mme_free_reg(b, addr_or);
3570
3571 if (b->devinfo->cls_eng3d < TURING_A) {
3572 mme_if(b, ieq, size_B, mme_zero()) {
3573 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3574 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3575 }
3576 }
3577
3578 struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3579 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3580 mme_free_reg(b, vb_idx4);
3581 mme_emit(b, addr.hi);
3582 mme_emit(b, addr.lo);
3583
3584 if (b->devinfo->cls_eng3d >= TURING_A) {
3585 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3586 mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3587 mme_emit(b, mme_zero());
3588 mme_emit(b, size_B);
3589 } else {
3590 /* Convert to an end address */
3591 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3592 mme_add64_to(b, addr, addr, mme_imm64(-1));
3593
3594 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3595 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3596 mme_emit(b, addr.hi);
3597 mme_emit(b, addr.lo);
3598 }
3599 }
3600
3601 static void
3602 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3603 const struct nvk_mme_test_case *test,
3604 const struct nvk_mme_mthd_data *results)
3605 {
3606 const uint32_t vb_idx = test->params[0];
3607 const uint32_t addr_hi = test->params[1];
3608 const uint32_t addr_lo = test->params[2];
3609
3610 uint32_t size_B = test->params[3];
3611 if (addr_hi == 0 && addr_lo == 0)
3612 size_B = 0;
3613
3614 assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3615 assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3616
3617 if (devinfo->cls_eng3d >= TURING_A) {
3618 assert(results[0].data == addr_hi);
3619 assert(results[1].data == addr_lo);
3620
3621 assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(3));
3622 assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(3));
3623 assert(results[2].data == 0);
3624 assert(results[3].data == size_B);
3625 } else {
3626 uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3627 if (size_B == 0)
3628 addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3629
3630 assert(results[0].data == addr >> 32);
3631 assert(results[1].data == (uint32_t)addr);
3632
3633 const uint64_t limit = (addr + size_B) - 1;
3634 assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(3));
3635 assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(3));
3636 assert(results[2].data == limit >> 32);
3637 assert(results[3].data == (uint32_t)limit);
3638 }
3639 }
3640
3641 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3642 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3643 .check = nvk_mme_bind_vb_test_check,
3644 }, {
3645 .init = (struct nvk_mme_mthd_data[]) {
3646 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3647 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3648 { }
3649 },
3650 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3651 .check = nvk_mme_bind_vb_test_check,
3652 }, {
3653 .init = (struct nvk_mme_mthd_data[]) {
3654 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3655 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3656 { }
3657 },
3658 .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3659 .check = nvk_mme_bind_vb_test_check,
3660 }, {}};
3661
3662 void
3663 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3664 struct nvk_addr_range addr_range)
3665 {
3666 /* Used for meta save/restore */
3667 if (vb_idx == 0)
3668 cmd->state.gfx.vb0 = addr_range;
3669
3670 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3671 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3672 P_INLINE_DATA(p, vb_idx);
3673 P_INLINE_DATA(p, addr_range.addr >> 32);
3674 P_INLINE_DATA(p, addr_range.addr);
3675 assert(addr_range.range <= UINT32_MAX);
3676 P_INLINE_DATA(p, addr_range.range);
3677 }
3678
3679 VKAPI_ATTR void VKAPI_CALL
3680 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3681 uint32_t firstBinding,
3682 uint32_t bindingCount,
3683 const VkBuffer *pBuffers,
3684 const VkDeviceSize *pOffsets,
3685 const VkDeviceSize *pSizes,
3686 const VkDeviceSize *pStrides)
3687 {
3688 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3689
3690 if (pStrides) {
3691 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3692 bindingCount, pStrides);
3693 }
3694
3695 for (uint32_t i = 0; i < bindingCount; i++) {
3696 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3697 uint32_t idx = firstBinding + i;
3698
3699 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3700 const struct nvk_addr_range addr_range =
3701 nvk_buffer_addr_range(buffer, pOffsets[i], size);
3702
3703 nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3704 }
3705 }
3706
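/* Writes val to the given 3D method and mirrors it into the matching dword
 * of the root descriptor table via LOAD_CONSTANT_BUFFER (the constant buffer
 * selector is kept pointing at cb0, see nvk_cmd_flush_gfx_cbufs).  On Turing+
 * the current method value can be read back, so redundant updates are
 * skipped.
 */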
3707 static void
3708 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3709 uint16_t cb0_offset,
3710 uint16_t mthd,
3711 struct mme_value val)
3712 {
3713 if (b->devinfo->cls_eng3d >= TURING_A) {
3714 struct mme_value old = mme_state(b, mthd);
3715 mme_if(b, ine, old, val) {
3716 mme_mthd(b, mthd);
3717 mme_emit(b, val);
3718
3719 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3720 mme_emit(b, mme_imm(cb0_offset));
3721 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3722 mme_emit(b, val);
3723 }
3724 mme_free_reg(b, old);
3725 } else {
3726 /* Fermi is really tight on registers. Don't bother with the if and set
3727 * both unconditionally for now.
3728 */
3729 mme_mthd(b, mthd);
3730 mme_emit(b, val);
3731
3732 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3733 mme_emit(b, mme_imm(cb0_offset));
3734 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3735 mme_emit(b, val);
3736 }
3737 }
3738
3739 static void
3740 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3741 uint16_t cb0_offset,
3742 enum nvk_mme_scratch scratch,
3743 struct mme_value val)
3744 {
3745 const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3746 nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3747 }
3748
3749 struct mme_draw_params {
3750 struct mme_value base_vertex;
3751 struct mme_value first_vertex;
3752 struct mme_value first_instance;
3753 struct mme_value draw_index;
3754 };
3755
3756 static void
3757 nvk_mme_build_set_draw_params(struct mme_builder *b,
3758 const struct mme_draw_params *p)
3759 {
3760 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3761 NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3762 p->first_vertex);
3763 nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3764 NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3765 p->first_instance);
3766 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3767 NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3768 p->draw_index);
3769 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3770 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3771 mme_zero());
3772
3773 mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3774 mme_emit(b, p->base_vertex);
3775 mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3776 mme_emit(b, p->base_vertex);
3777 }
3778
3779 static void
3780 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3781 {
3782 /* Set the push constant */
3783 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3784 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3785 view_index);
3786
3787 /* Set the layer to the view index */
3788 STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3789 STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3790 mme_mthd(b, NV9097_SET_RT_LAYER);
3791 mme_emit(b, view_index);
3792 }
3793
3794 static void
3795 nvk_mme_build_draw_loop(struct mme_builder *b,
3796 struct mme_value instance_count,
3797 struct mme_value first_vertex,
3798 struct mme_value vertex_count)
3799 {
3800 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3801
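   /* Instances are emitted one at a time: each iteration issues its own
    * BEGIN / SET_VERTEX_ARRAY_START / END, and after the first iteration
    * BEGIN's INSTANCE_ID field is flipped to SUBSEQUENT so the instance
    * index advances instead of resetting.
    */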
3802 mme_loop(b, instance_count) {
3803 mme_mthd(b, NV9097_BEGIN);
3804 mme_emit(b, begin);
3805
3806 mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3807 mme_emit(b, first_vertex);
3808 mme_emit(b, vertex_count);
3809
3810 mme_mthd(b, NV9097_END);
3811 mme_emit(b, mme_zero());
3812
3813 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3814 }
3815
3816 mme_free_reg(b, begin);
3817 }
3818
3819 static void
3820 nvk_mme_build_draw(struct mme_builder *b,
3821 struct mme_value draw_index)
3822 {
3823 /* These are in VkDrawIndirectCommand order */
3824 struct mme_value vertex_count = mme_load(b);
3825 struct mme_value instance_count = mme_load(b);
3826 struct mme_value first_vertex = mme_load(b);
3827 struct mme_value first_instance = mme_load(b);
3828
3829 struct mme_draw_params params = {
3830 .first_vertex = first_vertex,
3831 .first_instance = first_instance,
3832 .draw_index = draw_index,
3833 };
3834 nvk_mme_build_set_draw_params(b, &params);
3835
3836 mme_free_reg(b, first_instance);
3837
3838 if (b->devinfo->cls_eng3d < TURING_A)
3839 nvk_mme_spill(b, DRAW_IDX, draw_index);
3840
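   /* Multiview handling: if the view mask is zero, run the draw loop once as
    * usual.  Otherwise, walk all 32 possible view bits and, for each bit set
    * in VIEW_MASK, program the view index / RT layer and replay the draw
    * loop for that view.
    */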
3841 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3842 mme_if(b, ieq, view_mask, mme_zero()) {
3843 mme_free_reg(b, view_mask);
3844
3845 nvk_mme_build_draw_loop(b, instance_count,
3846 first_vertex, vertex_count);
3847 }
3848
3849 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3850 mme_if(b, ine, view_mask, mme_zero()) {
3851 mme_free_reg(b, view_mask);
3852
3853 struct mme_value view = mme_mov(b, mme_zero());
3854 mme_while(b, ine, view, mme_imm(32)) {
3855 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3856 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3857 mme_free_reg(b, view_mask);
3858 mme_if(b, ine, has_view, mme_zero()) {
3859 mme_free_reg(b, has_view);
3860 nvk_mme_emit_view_index(b, view);
3861 nvk_mme_build_draw_loop(b, instance_count,
3862 first_vertex, vertex_count);
3863 }
3864
3865 mme_add_to(b, view, view, mme_imm(1));
3866 }
3867 mme_free_reg(b, view);
3868 }
3869
3870 mme_free_reg(b, instance_count);
3871 mme_free_reg(b, first_vertex);
3872 mme_free_reg(b, vertex_count);
3873
3874 if (b->devinfo->cls_eng3d < TURING_A)
3875 nvk_mme_unspill(b, DRAW_IDX, draw_index);
3876 }
3877
3878 void
3879 nvk_mme_draw(struct mme_builder *b)
3880 {
3881 struct mme_value draw_index = mme_load(b);
3882 nvk_mme_build_draw(b, draw_index);
3883 }
3884
3885 VKAPI_ATTR void VKAPI_CALL
3886 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3887 uint32_t vertexCount,
3888 uint32_t instanceCount,
3889 uint32_t firstVertex,
3890 uint32_t firstInstance)
3891 {
3892 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3893
3894 nvk_cmd_flush_gfx_state(cmd);
3895
3896 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3897 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3898 P_INLINE_DATA(p, 0 /* draw_index */);
3899 P_INLINE_DATA(p, vertexCount);
3900 P_INLINE_DATA(p, instanceCount);
3901 P_INLINE_DATA(p, firstVertex);
3902 P_INLINE_DATA(p, firstInstance);
3903 }
3904
3905 VKAPI_ATTR void VKAPI_CALL
3906 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3907 uint32_t drawCount,
3908 const VkMultiDrawInfoEXT *pVertexInfo,
3909 uint32_t instanceCount,
3910 uint32_t firstInstance,
3911 uint32_t stride)
3912 {
3913 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3914
3915 nvk_cmd_flush_gfx_state(cmd);
3916
3917 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3918 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3919 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3920 P_INLINE_DATA(p, draw_index);
3921 P_INLINE_DATA(p, pVertexInfo->vertexCount);
3922 P_INLINE_DATA(p, instanceCount);
3923 P_INLINE_DATA(p, pVertexInfo->firstVertex);
3924 P_INLINE_DATA(p, firstInstance);
3925
3926 pVertexInfo = ((void *)pVertexInfo) + stride;
3927 }
3928 }
3929
3930 static void
3931 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
3932 struct mme_value instance_count,
3933 struct mme_value first_index,
3934 struct mme_value index_count)
3935 {
3936 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3937
3938 mme_loop(b, instance_count) {
3939 mme_mthd(b, NV9097_BEGIN);
3940 mme_emit(b, begin);
3941
3942 mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
3943 mme_emit(b, first_index);
3944 mme_emit(b, index_count);
3945
3946 mme_mthd(b, NV9097_END);
3947 mme_emit(b, mme_zero());
3948
3949 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3950 }
3951
3952 mme_free_reg(b, begin);
3953 }
3954
3955 static void
3956 nvk_mme_build_draw_indexed(struct mme_builder *b,
3957 struct mme_value draw_index)
3958 {
3959 /* These are in VkDrawIndexedIndirectCommand order */
3960 struct mme_value index_count = mme_load(b);
3961 struct mme_value instance_count = mme_load(b);
3962 struct mme_value first_index = mme_load(b);
3963 struct mme_value vertex_offset = mme_load(b);
3964 struct mme_value first_instance = mme_load(b);
3965
3966 struct mme_draw_params params = {
3967 .base_vertex = vertex_offset,
3968 .first_vertex = vertex_offset,
3969 .first_instance = first_instance,
3970 .draw_index = draw_index,
3971 };
3972 nvk_mme_build_set_draw_params(b, &params);
3973
3974 mme_free_reg(b, vertex_offset);
3975 mme_free_reg(b, first_instance);
3976
3977 if (b->devinfo->cls_eng3d < TURING_A)
3978 nvk_mme_spill(b, DRAW_IDX, draw_index);
3979
3980 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3981 mme_if(b, ieq, view_mask, mme_zero()) {
3982 mme_free_reg(b, view_mask);
3983
3984 nvk_mme_build_draw_indexed_loop(b, instance_count,
3985 first_index, index_count);
3986 }
3987
3988 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3989 mme_if(b, ine, view_mask, mme_zero()) {
3990 mme_free_reg(b, view_mask);
3991
3992 struct mme_value view = mme_mov(b, mme_zero());
3993 mme_while(b, ine, view, mme_imm(32)) {
3994 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3995 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3996 mme_free_reg(b, view_mask);
3997 mme_if(b, ine, has_view, mme_zero()) {
3998 mme_free_reg(b, has_view);
3999 nvk_mme_emit_view_index(b, view);
4000 nvk_mme_build_draw_indexed_loop(b, instance_count,
4001 first_index, index_count);
4002 }
4003
4004 mme_add_to(b, view, view, mme_imm(1));
4005 }
4006 mme_free_reg(b, view);
4007 }
4008
4009 mme_free_reg(b, instance_count);
4010 mme_free_reg(b, first_index);
4011 mme_free_reg(b, index_count);
4012
4013 if (b->devinfo->cls_eng3d < TURING_A)
4014 nvk_mme_unspill(b, DRAW_IDX, draw_index);
4015 }
4016
4017 void
4018 nvk_mme_draw_indexed(struct mme_builder *b)
4019 {
4020 struct mme_value draw_index = mme_load(b);
4021 nvk_mme_build_draw_indexed(b, draw_index);
4022 }
4023
4024 VKAPI_ATTR void VKAPI_CALL
4025 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4026 uint32_t indexCount,
4027 uint32_t instanceCount,
4028 uint32_t firstIndex,
4029 int32_t vertexOffset,
4030 uint32_t firstInstance)
4031 {
4032 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4033
4034 nvk_cmd_flush_gfx_state(cmd);
4035
4036 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4037 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4038 P_INLINE_DATA(p, 0 /* draw_index */);
4039 P_INLINE_DATA(p, indexCount);
4040 P_INLINE_DATA(p, instanceCount);
4041 P_INLINE_DATA(p, firstIndex);
4042 P_INLINE_DATA(p, vertexOffset);
4043 P_INLINE_DATA(p, firstInstance);
4044 }
4045
4046 VKAPI_ATTR void VKAPI_CALL
4047 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
4048 uint32_t drawCount,
4049 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
4050 uint32_t instanceCount,
4051 uint32_t firstInstance,
4052 uint32_t stride,
4053 const int32_t *pVertexOffset)
4054 {
4055 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4056
4057 nvk_cmd_flush_gfx_state(cmd);
4058
4059 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
4060 const uint32_t vertex_offset =
4061 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
4062
4063 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4064 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4065 P_INLINE_DATA(p, draw_index);
4066 P_INLINE_DATA(p, pIndexInfo->indexCount);
4067 P_INLINE_DATA(p, instanceCount);
4068 P_INLINE_DATA(p, pIndexInfo->firstIndex);
4069 P_INLINE_DATA(p, vertex_offset);
4070 P_INLINE_DATA(p, firstInstance);
4071
4072 pIndexInfo = ((void *)pIndexInfo) + stride;
4073 }
4074 }
4075
4076 void
4077 nvk_mme_draw_indirect(struct mme_builder *b)
4078 {
4079 if (b->devinfo->cls_eng3d >= TURING_A) {
4080 struct mme_value64 draw_addr = mme_load_addr64(b);
4081 struct mme_value draw_count = mme_load(b);
4082 struct mme_value stride = mme_load(b);
4083
4084 struct mme_value draw = mme_mov(b, mme_zero());
4085 mme_while(b, ult, draw, draw_count) {
4086 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4087
4088 nvk_mme_build_draw(b, draw);
4089
4090 mme_add_to(b, draw, draw, mme_imm(1));
4091 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4092 }
4093 } else {
4094 struct mme_value draw_count = mme_load(b);
4095 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
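      /* Pre-Turing has no MME FIFO read, so the indirect data arrives inline
       * after the macro parameters.  DRAW_PAD_DW holds the number of padding
       * dwords between consecutive VkDrawIndirectCommand records (see
       * nvk_CmdDrawIndirect), which the inner loop below discards after each
       * draw.
       */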
4096
4097 struct mme_value draw = mme_mov(b, mme_zero());
4098 mme_while(b, ine, draw, draw_count) {
4099 nvk_mme_spill(b, DRAW_COUNT, draw_count);
4100
4101 nvk_mme_build_draw(b, draw);
4102 mme_add_to(b, draw, draw, mme_imm(1));
4103
4104 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4105 mme_loop(b, pad_dw) {
4106 mme_free_reg(b, mme_load(b));
4107 }
4108 mme_free_reg(b, pad_dw);
4109
4110 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4111 }
4112 }
4113 }
4114
4115 VKAPI_ATTR void VKAPI_CALL
4116 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4117 VkBuffer _buffer,
4118 VkDeviceSize offset,
4119 uint32_t drawCount,
4120 uint32_t stride)
4121 {
4122 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4123 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4124
4125 /* From the Vulkan 1.3.238 spec:
4126 *
4127 * VUID-vkCmdDrawIndirect-drawCount-00476
4128 *
4129 * "If drawCount is greater than 1, stride must be a multiple of 4 and
4130 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
4131 *
4132 * and
4133 *
4134 * "If drawCount is less than or equal to one, stride is ignored."
4135 */
4136 if (drawCount > 1) {
4137 assert(stride % 4 == 0);
4138 assert(stride >= sizeof(VkDrawIndirectCommand));
4139 } else {
4140 stride = sizeof(VkDrawIndirectCommand);
4141 }
4142
4143 nvk_cmd_flush_gfx_state(cmd);
4144
4145 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4146 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4147 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4148 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4149 P_INLINE_DATA(p, draw_addr >> 32);
4150 P_INLINE_DATA(p, draw_addr);
4151 P_INLINE_DATA(p, drawCount);
4152 P_INLINE_DATA(p, stride);
4153 } else {
4154 const uint32_t max_draws_per_push =
4155 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
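      /* A single push can hold at most NV_PUSH_MAX_COUNT dwords, and 3 of
       * them are taken by the macro call header and its two parameters, so
       * the indirect buffer is consumed in chunks of max_draws_per_push
       * draws.
       */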
4156
4157 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4158 while (drawCount) {
4159 const uint32_t count = MIN2(drawCount, max_draws_per_push);
4160
4161 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4162 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4163 P_INLINE_DATA(p, count);
4164 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
4165
4166 uint64_t range = count * (uint64_t)stride;
4167 nv_push_update_count(p, range / 4);
4168 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4169
4170 draw_addr += range;
4171 drawCount -= count;
4172 }
4173 }
4174 }
4175
4176 void
4177 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
4178 {
4179 if (b->devinfo->cls_eng3d >= TURING_A) {
4180 struct mme_value64 draw_addr = mme_load_addr64(b);
4181 struct mme_value draw_count = mme_load(b);
4182 struct mme_value stride = mme_load(b);
4183
4184 struct mme_value draw = mme_mov(b, mme_zero());
4185 mme_while(b, ult, draw, draw_count) {
4186 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4187
4188 nvk_mme_build_draw_indexed(b, draw);
4189
4190 mme_add_to(b, draw, draw, mme_imm(1));
4191 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4192 }
4193 } else {
4194 struct mme_value draw_count = mme_load(b);
4195 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
4196
4197 struct mme_value draw = mme_mov(b, mme_zero());
4198 mme_while(b, ine, draw, draw_count) {
4199 nvk_mme_spill(b, DRAW_COUNT, draw_count);
4200
4201 nvk_mme_build_draw_indexed(b, draw);
4202 mme_add_to(b, draw, draw, mme_imm(1));
4203
4204 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4205 mme_loop(b, pad_dw) {
4206 mme_free_reg(b, mme_load(b));
4207 }
4208 mme_free_reg(b, pad_dw);
4209
4210 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4211 }
4212 }
4213 }
4214
4215 VKAPI_ATTR void VKAPI_CALL
4216 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
4217 VkBuffer _buffer,
4218 VkDeviceSize offset,
4219 uint32_t drawCount,
4220 uint32_t stride)
4221 {
4222 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4223 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4224
4225 /* From the Vulkan 1.3.238 spec:
4226 *
4227 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
4228 *
4229 * "If drawCount is greater than 1, stride must be a multiple of 4 and
4230 * must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
4231 *
4232 * and
4233 *
4234 * "If drawCount is less than or equal to one, stride is ignored."
4235 */
4236 if (drawCount > 1) {
4237 assert(stride % 4 == 0);
4238 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
4239 } else {
4240 stride = sizeof(VkDrawIndexedIndirectCommand);
4241 }
4242
4243 nvk_cmd_flush_gfx_state(cmd);
4244
4245 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4246 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4247 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4248 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4249 P_INLINE_DATA(p, draw_addr >> 32);
4250 P_INLINE_DATA(p, draw_addr);
4251 P_INLINE_DATA(p, drawCount);
4252 P_INLINE_DATA(p, stride);
4253 } else {
4254 const uint32_t max_draws_per_push =
4255 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
4256
4257 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4258 while (drawCount) {
4259 const uint32_t count = MIN2(drawCount, max_draws_per_push);
4260
4261 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4262 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4263 P_INLINE_DATA(p, count);
4264 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
4265
4266 uint64_t range = count * (uint64_t)stride;
4267 nv_push_update_count(p, range / 4);
4268 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4269
4270 draw_addr += range;
4271 drawCount -= count;
4272 }
4273 }
4274 }
4275
4276 void
4277 nvk_mme_draw_indirect_count(struct mme_builder *b)
4278 {
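   /* Indirect count draws need the MME FIFO read path, which only exists on
    * Turing and later; pre-Turing this macro is intentionally empty (see the
    * assert in nvk_CmdDrawIndirectCount).
    */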
4279 if (b->devinfo->cls_eng3d < TURING_A)
4280 return;
4281
4282 struct mme_value64 draw_addr = mme_load_addr64(b);
4283 struct mme_value64 draw_count_addr = mme_load_addr64(b);
4284 struct mme_value draw_max = mme_load(b);
4285 struct mme_value stride = mme_load(b);
4286
4287 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4288 mme_free_reg64(b, draw_count_addr);
4289 struct mme_value draw_count_buf = mme_load(b);
4290
4291 mme_if(b, ule, draw_count_buf, draw_max) {
4292 mme_mov_to(b, draw_max, draw_count_buf);
4293 }
4294 mme_free_reg(b, draw_count_buf);
4295
4296 struct mme_value draw = mme_mov(b, mme_zero());
4297 mme_while(b, ult, draw, draw_max) {
4298 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4299
4300 nvk_mme_build_draw(b, draw);
4301
4302 mme_add_to(b, draw, draw, mme_imm(1));
4303 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4304 }
4305 }
4306
4307 VKAPI_ATTR void VKAPI_CALL
4308 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4309 VkBuffer _buffer,
4310 VkDeviceSize offset,
4311 VkBuffer countBuffer,
4312 VkDeviceSize countBufferOffset,
4313 uint32_t maxDrawCount,
4314 uint32_t stride)
4315 {
4316 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4317 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4318 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4319
4320 /* TODO: Indirect count draw pre-Turing */
4321 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4322
4323 nvk_cmd_flush_gfx_state(cmd);
4324
4325 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4326 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
4327 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4328 P_INLINE_DATA(p, draw_addr >> 32);
4329 P_INLINE_DATA(p, draw_addr);
4330 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4331 countBufferOffset);
4332 P_INLINE_DATA(p, draw_count_addr >> 32);
4333 P_INLINE_DATA(p, draw_count_addr);
4334 P_INLINE_DATA(p, maxDrawCount);
4335 P_INLINE_DATA(p, stride);
4336 }
4337
4338 void
4339 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
4340 {
4341 if (b->devinfo->cls_eng3d < TURING_A)
4342 return;
4343
4344 struct mme_value64 draw_addr = mme_load_addr64(b);
4345 struct mme_value64 draw_count_addr = mme_load_addr64(b);
4346 struct mme_value draw_max = mme_load(b);
4347 struct mme_value stride = mme_load(b);
4348
4349 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4350 mme_free_reg64(b, draw_count_addr);
4351 struct mme_value draw_count_buf = mme_load(b);
4352
4353 mme_if(b, ule, draw_count_buf, draw_max) {
4354 mme_mov_to(b, draw_max, draw_count_buf);
4355 }
4356 mme_free_reg(b, draw_count_buf);
4357
4358 struct mme_value draw = mme_mov(b, mme_zero());
4359 mme_while(b, ult, draw, draw_max) {
4360 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4361
4362 nvk_mme_build_draw_indexed(b, draw);
4363
4364 mme_add_to(b, draw, draw, mme_imm(1));
4365 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4366 }
4367 }
4368
4369 VKAPI_ATTR void VKAPI_CALL
4370 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4371 VkBuffer _buffer,
4372 VkDeviceSize offset,
4373 VkBuffer countBuffer,
4374 VkDeviceSize countBufferOffset,
4375 uint32_t maxDrawCount,
4376 uint32_t stride)
4377 {
4378 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4379 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4380 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4381
4382 /* TODO: Indexed indirect count draw pre-Turing */
4383 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4384
4385 nvk_cmd_flush_gfx_state(cmd);
4386
4387 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4388 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
4389 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4390 P_INLINE_DATA(p, draw_addr >> 32);
4391 P_INLINE_DATA(p, draw_addr);
4392 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4393 countBufferOffset);
4394 P_INLINE_DATA(p, draw_count_addr >> 32);
4395 P_INLINE_DATA(p, draw_count_addr);
4396 P_INLINE_DATA(p, maxDrawCount);
4397 P_INLINE_DATA(p, stride);
4398 }
4399
4400 static void
4401 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
4402 struct mme_value instance_count,
4403 struct mme_value counter)
4404 {
4405 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
4406
4407 mme_loop(b, instance_count) {
4408 mme_mthd(b, NV9097_BEGIN);
4409 mme_emit(b, begin);
4410
4411 mme_mthd(b, NV9097_DRAW_AUTO);
4412 mme_emit(b, counter);
4413
4414 mme_mthd(b, NV9097_END);
4415 mme_emit(b, mme_zero());
4416
4417 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
4418 }
4419
4420 mme_free_reg(b, begin);
4421 }
4422
4423 void
4424 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
4425 {
4426 struct mme_value instance_count = mme_load(b);
4427 struct mme_value first_instance = mme_load(b);
4428
4429 if (b->devinfo->cls_eng3d >= TURING_A) {
4430 struct mme_value64 counter_addr = mme_load_addr64(b);
4431 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4432 mme_free_reg(b, counter_addr.lo);
4433 mme_free_reg(b, counter_addr.hi);
4434 }
4435 struct mme_value counter = mme_load(b);
4436
4437 struct mme_draw_params params = {
4438 .first_instance = first_instance,
4439 };
4440 nvk_mme_build_set_draw_params(b, &params);
4441
4442 mme_free_reg(b, first_instance);
4443
4444 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4445 mme_if(b, ieq, view_mask, mme_zero()) {
4446 mme_free_reg(b, view_mask);
4447
4448 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4449 }
4450
4451 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4452 mme_if(b, ine, view_mask, mme_zero()) {
4453 mme_free_reg(b, view_mask);
4454
4455 struct mme_value view = mme_mov(b, mme_zero());
4456 mme_while(b, ine, view, mme_imm(32)) {
4457 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4458 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
4459 mme_free_reg(b, view_mask);
4460 mme_if(b, ine, has_view, mme_zero()) {
4461 mme_free_reg(b, has_view);
4462 nvk_mme_emit_view_index(b, view);
4463 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4464 }
4465
4466 mme_add_to(b, view, view, mme_imm(1));
4467 }
4468 }
4469
4470 mme_free_reg(b, instance_count);
4471 mme_free_reg(b, counter);
4472 }
4473
4474 VKAPI_ATTR void VKAPI_CALL
4475 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4476 uint32_t instanceCount,
4477 uint32_t firstInstance,
4478 VkBuffer counterBuffer,
4479 VkDeviceSize counterBufferOffset,
4480 uint32_t counterOffset,
4481 uint32_t vertexStride)
4482 {
4483 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4484 VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
4485
4486 nvk_cmd_flush_gfx_state(cmd);
4487
4488 uint64_t counter_addr = nvk_buffer_address(counter_buffer,
4489 counterBufferOffset);
4490
4491 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4492 struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
4493 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4494 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4495
4496 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4497 P_INLINE_DATA(p, instanceCount);
4498 P_INLINE_DATA(p, firstInstance);
4499 P_INLINE_DATA(p, counter_addr >> 32);
4500 P_INLINE_DATA(p, counter_addr);
4501 } else {
4502 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
4503 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4504 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4505
4506 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4507 P_INLINE_DATA(p, instanceCount);
4508 P_INLINE_DATA(p, firstInstance);
4509 nv_push_update_count(p, 1);
4510 nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
4511 }
4512 }
4513
4514 VKAPI_ATTR void VKAPI_CALL
4515 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
4516 uint32_t firstBinding,
4517 uint32_t bindingCount,
4518 const VkBuffer *pBuffers,
4519 const VkDeviceSize *pOffsets,
4520 const VkDeviceSize *pSizes)
4521 {
4522 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4523
4524 for (uint32_t i = 0; i < bindingCount; i++) {
4525 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
4526 uint32_t idx = firstBinding + i;
4527 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
4528 struct nvk_addr_range addr_range =
4529 nvk_buffer_addr_range(buffer, pOffsets[i], size);
4530 assert(addr_range.range <= UINT32_MAX);
4531
4532 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4533
4534 P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
4535 P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
4536 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
4537 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
4538 P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
4539 }
4540
4541 // TODO: Do we need to set SET_STREAM_OUT_BUFFER_ENABLE to V_FALSE anywhere?
4542 }
4543
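/* Loads a transform feedback counter into a stream-out buffer's
 * LOAD_WRITE_POINTER register.  The first macro parameter is added to the
 * SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER method to select the buffer.  On
 * Turing+, the next two parameters are the counter's GPU address and the
 * value is read back through the MME FIFO; on older hardware the counter
 * value itself follows as the second parameter, pushed indirectly by the
 * caller.
 */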
4544 void
4545 nvk_mme_xfb_counter_load(struct mme_builder *b)
4546 {
4547 struct mme_value buffer = mme_load(b);
4548
4549 struct mme_value counter;
4550 if (b->devinfo->cls_eng3d >= TURING_A) {
4551 struct mme_value64 counter_addr = mme_load_addr64(b);
4552
4553 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4554 mme_free_reg(b, counter_addr.lo);
4555 mme_free_reg(b, counter_addr.hi);
4556
4557 counter = mme_load(b);
4558 } else {
4559 counter = mme_load(b);
4560 }
4561
4562 mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
4563 mme_emit(b, counter);
4564
4565 mme_free_reg(b, counter);
4566 mme_free_reg(b, buffer);
4567 }
4568
4569 VKAPI_ATTR void VKAPI_CALL
4570 nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4571 uint32_t firstCounterBuffer,
4572 uint32_t counterBufferCount,
4573 const VkBuffer *pCounterBuffers,
4574 const VkDeviceSize *pCounterBufferOffsets)
4575 {
4576 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4577 const uint32_t max_buffers = 4;
4578
4579 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);
4580
4581 P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
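/* Reset every stream-out buffer's load/write pointer; pointers backed by
 * app-provided counter buffers are reloaded below.
 */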
4582 for (uint32_t i = 0; i < max_buffers; ++i) {
4583 P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
4584 }
4585
4586 for (uint32_t i = 0; i < counterBufferCount; ++i) {
4587 if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
4588 continue;
4589
4590 VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4591 // The counter buffer index corresponds to the transform feedback buffer index
4592 uint32_t cb_idx = firstCounterBuffer + i;
4593 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4594 uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4595
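/* On Turing+, the NVK_MME_XFB_COUNTER_LOAD macro reads the counter from
 * memory through the MME FIFO, so we pass its address.  On older hardware,
 * the counter value is pushed indirectly from the counter buffer.
 */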
4596 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4597 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
4598 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4599 /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an 8-dword stride */
4600 P_INLINE_DATA(p, cb_idx * 8);
4601 P_INLINE_DATA(p, cb_addr >> 32);
4602 P_INLINE_DATA(p, cb_addr);
4603 } else {
4604 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
4605 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4606 P_INLINE_DATA(p, cb_idx);
4607 nv_push_update_count(p, 1);
4608 nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
4609 }
4610 }
4611 }
4612
4613 VKAPI_ATTR void VKAPI_CALL
4614 nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4615 uint32_t firstCounterBuffer,
4616 uint32_t counterBufferCount,
4617 const VkBuffer *pCounterBuffers,
4618 const VkDeviceSize *pCounterBufferOffsets)
4619 {
4620 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4621
4622 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);
4623
4624 P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
4625
4626 for (uint32_t i = 0; i < counterBufferCount; ++i) {
4627 if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
4628 continue;
4629
4630 VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4631 // The counter buffer index corresponds to the transform feedback buffer index
4632 uint32_t cb_idx = firstCounterBuffer + i;
4633 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4634 uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4635
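/* Save the streamed byte count for this stream back to the counter buffer
 * with a semaphore report so that a later vkCmdBeginTransformFeedbackEXT or
 * vkCmdDrawIndirectByteCountEXT can pick it up.
 */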
4636 P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
4637 P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
4638 P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
4639 P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
4640 P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
4641 .operation = OPERATION_REPORT_ONLY,
4642 .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
4643 .report = REPORT_STREAMING_BYTE_COUNT,
4644 .sub_report = cb_idx,
4645 .structure_size = STRUCTURE_SIZE_ONE_WORD,
4646 });
4647 }
4648 }
4649
4650 VKAPI_ATTR void VKAPI_CALL
4651 nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
4652 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
4653 {
4654 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4655 VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);
4656
4657 uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
4658 bool inverted = pConditionalRenderingBegin->flags &
4659 VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
4660
4661 /* From the Vulkan 1.3.280 spec:
4662 *
4663 * "If the 32-bit value at offset in buffer memory is zero,
4664 * then the rendering commands are discarded,
4665 * otherwise they are executed as normal."
4666 *
4667 * The hardware compares a 64-bit value, so we have to copy it into a 64-bit temporary.
4668 */
4669 uint64_t tmp_addr;
4670 VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
4671 if (result != VK_SUCCESS) {
4672 vk_command_buffer_set_error(&cmd->vk, result);
4673 return;
4674 }
4675
4676 struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);
4677
4678 P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
4679 P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
4680 P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
4681 P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
4682 P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
4683 P_NV90B5_PITCH_IN(p, 4);
4684 P_NV90B5_PITCH_OUT(p, 4);
4685 P_NV90B5_LINE_LENGTH_IN(p, 4);
4686 P_NV90B5_LINE_COUNT(p, 1);
4687
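/* Replicate the single source component into two destination components so
 * the 4-byte predicate expands to 8 bytes; the result is zero if and only if
 * the application's 32-bit value is zero.
 */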
4688 P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
4689 .dst_x = DST_X_SRC_X,
4690 .dst_y = DST_Y_SRC_X,
4691 .dst_z = DST_Z_NO_WRITE,
4692 .dst_w = DST_W_NO_WRITE,
4693 .component_size = COMPONENT_SIZE_ONE,
4694 .num_src_components = NUM_SRC_COMPONENTS_ONE,
4695 .num_dst_components = NUM_DST_COMPONENTS_TWO,
4696 });
4697
4698 P_IMMD(p, NV90B5, LAUNCH_DMA, {
4699 .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
4700 .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
4701 .flush_enable = FLUSH_ENABLE_TRUE,
4702 .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
4703 .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
4704 .remap_enable = REMAP_ENABLE_TRUE,
4705 });
4706
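/* Condition both the 3D (NV9097) and compute (NV90C0) engines on the copied
 * predicate; the INVERTED flag selects the opposite comparison mode.
 */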
4707 P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4708 P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4709 P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4710 P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4711
4712 P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4713 P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4714 P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4715 P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4716 }
4717
4718 VKAPI_ATTR void VKAPI_CALL
4719 nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4720 {
4721 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4722
4723 struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
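/* Unconditionally re-enable rendering on both the 3D and compute classes. */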
4724 P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4725 P_NV9097_SET_RENDER_ENABLE_A(p, 0);
4726 P_NV9097_SET_RENDER_ENABLE_B(p, 0);
4727 P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4728
4729 P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4730 P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
4731 P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
4732 P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4733 }
4734