1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15
16 #include "util/bitpack_helpers.h"
17 #include "vk_format.h"
18 #include "vk_render_pass.h"
19 #include "vk_standard_sample_locations.h"
20
21 #include "nv_push_cl902d.h"
22 #include "nv_push_cl9097.h"
23 #include "nv_push_cl90b5.h"
24 #include "nv_push_cl90c0.h"
25 #include "nv_push_cla097.h"
26 #include "nv_push_clb097.h"
27 #include "nv_push_clb197.h"
28 #include "nv_push_clc397.h"
29 #include "nv_push_clc597.h"
30 #include "drf.h"
31
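/* Returns the 3D engine class (cls_eng3d) of the physical device that owns
 * this command buffer; used for generation checks below.
 */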
32 static inline uint16_t
33 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
34 {
35 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
36 struct nvk_physical_device *pdev = nvk_device_physical(dev);
37 return pdev->info.cls_eng3d;
38 }
39
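/* MME helper that pokes a privileged register via the FALCON interface:
 * scratch FALCON_0 is cleared and the value and mask are written to the
 * scratch registers that follow it, then SET_FALCON04 is sent the register
 * offset.  We then spin reading FALCON_0 until it reads back as 1, which
 * presumably signals that the firmware has completed the write.
 */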
40 static void
41 mme_set_priv_reg(struct mme_builder *b,
42 struct mme_value value,
43 struct mme_value mask,
44 struct mme_value reg)
45 {
46 mme_mthd(b, NV9097_WAIT_FOR_IDLE);
47 mme_emit(b, mme_zero());
48
49 mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
50 mme_emit(b, mme_zero());
51 mme_emit(b, value);
52 mme_emit(b, mask);
53
54 mme_mthd(b, NV9097_SET_FALCON04);
55 mme_emit(b, reg);
56
57 struct mme_value loop_cond = mme_mov(b, mme_zero());
58 mme_while(b, ine, loop_cond, mme_imm(1)) {
59 mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
60 mme_mthd(b, NV9097_NO_OPERATION);
61 mme_emit(b, mme_zero());
62 };
63 }
64
65 void
66 nvk_mme_set_priv_reg(struct mme_builder *b)
67 {
68 struct mme_value value = mme_load(b);
69 struct mme_value mask = mme_load(b);
70 struct mme_value reg = mme_load(b);
71
72 mme_set_priv_reg(b, value, mask, reg);
73 }
74
75 void
76 nvk_mme_set_conservative_raster_state(struct mme_builder *b)
77 {
78 struct mme_value new_state = mme_load(b);
79 struct mme_value old_state =
80 nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);
81
82 mme_if(b, ine, new_state, old_state) {
83 nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
84 mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
85 mme_imm(0x418800));
86 }
87 }
88
89 #define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)
90
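/* Points the constant buffer selector at the per-queue cb0 (the root
 * descriptor table) whose GPU address is stashed in the CB0_ADDR_HI/LO
 * scratch registers.
 */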
91 void
92 nvk_mme_select_cb0(struct mme_builder *b)
93 {
94 struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
95 struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);
96
97 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
98 mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
99 mme_emit(b, addr_hi);
100 mme_emit(b, addr_lo);
101 }
102
103 static uint32_t nvk_mme_anti_alias_init(void);
104
105 VkResult
106 nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
107 {
108 struct nvk_device *dev = nvk_queue_device(queue);
109 struct nvk_physical_device *pdev = nvk_device_physical(dev);
110
111 /* 3D state */
112 P_MTHD(p, NV9097, SET_OBJECT);
113 P_NV9097_SET_OBJECT(p, {
114 .class_id = pdev->info.cls_eng3d,
115 .engine_id = 0,
116 });
117
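/* Upload every NVK MME macro into the macro instruction RAM, recording each
 * macro's start offset as we go.
 */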
118 for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
119 size_t size;
120 uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
121 if (dw == NULL)
122 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
123
124 assert(size % sizeof(uint32_t) == 0);
125 const uint32_t num_dw = size / sizeof(uint32_t);
126
127 P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
128 P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
129 P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
130
131 P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
132 P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
133 P_INLINE_ARRAY(p, dw, num_dw);
134
135 mme_pos += num_dw;
136
137 free(dw);
138 }
139
140 if (pdev->info.cls_eng3d >= TURING_A)
141 P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
142
143 /* Enable FP helper invocation memory loads
144 *
145 * For generations with firmware support for our `SET_PRIV_REG` MME method,
146 * we simply use that. On older generations we'll let the kernel do it.
147 * Starting with GSP we have to do it via the firmware anyway.
148 *
149 * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl
150 *
151 * Without it,
152 * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
153 * occasionally fail.
154 */
155 if (pdev->info.cls_eng3d >= MAXWELL_B) {
156 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
157 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
158 P_INLINE_DATA(p, 0);
159 P_INLINE_DATA(p, BITFIELD_BIT(3));
160 P_INLINE_DATA(p, reg);
161 }
162
163 /* Disable Out Of Range Address exceptions
164 *
165 * From the SPH documentation:
166 *
167 * "The SPH fields StoreReqStart and StoreReqEnd set a range of
168 * attributes whose corresponding Odmap values of ST or ST_LAST are
169 * treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
170 * and Odmap value is ST, when the shader writes data to this output, it
171 * can not count on being able to read it back, since the next
172 * downstream shader might have its Imap bit FALSE, thereby causing the
173 * Bmap bit to be FALSE. By including a ST type of attribute in the
174 * range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
175 * is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
176 * to be TRUE. This guarantees the shader program can output the value
177 * and then read it back later. This will save register space."
178 *
179 * It's unclear exactly what's going on but this seems to imply that the
180 * hardware actually ANDs the output mask of one shader stage together with
181 * the input mask of the subsequent shader stage to determine which values
182 * are actually used.
183 *
184 * In the case where we have an empty fragment shader, it seems the hardware
185 * doesn't allocate any output memory for the final geometry stage at all,
186 * so any writes to outputs from the final shader stage generate an Out Of
187 * Range Address exception. We could fix this by eliminating unused
188 * outputs via cross-stage linking but that won't work in the case of
189 * VK_EXT_shader_object and VK_EXT_graphics_pipeline_library fast-link.
190 * Instead, the easiest solution is to just disable the exception.
191 *
192 * NOTE (Faith):
193 *
194 * The above analysis is 100% conjecture on my part based on a creative
195 * reading of the SPH docs and what I saw when trying to run certain
196 * OpenGL CTS tests on NVK + Zink. Without access to NVIDIA HW
197 * engineers, I have no way of verifying this analysis.
198 *
199 * The CTS test in question is:
200 *
201 * KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
202 *
203 * This should also prevent any issues with array overruns on I/O arrays.
204 * Before, they would get an exception and kill the context whereas now
205 * they should gently get ignored.
206 *
207 * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask
208 */
209 if (pdev->info.cls_eng3d >= MAXWELL_B) {
210 unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
211 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
212 P_INLINE_DATA(p, 0);
213 P_INLINE_DATA(p, BITFIELD_BIT(14));
214 P_INLINE_DATA(p, reg);
215 }
216
217 /* Set CONSERVATIVE_RASTER_STATE to an invalid value, to ensure the
218 * hardware reg is always set the first time conservative rasterization
219 * is enabled */
220 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
221 ~0);
222
223 /* Initialize tessellation parameters */
224 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
225 P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});
226
227 P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
228
229 P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
230 P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
231 for (unsigned i = 0; i < 8; i++)
232 P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
233
234 P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
235
236 // P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
237 // P_INLINE_DATA(cmd->push, 0);
238
239 P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
240
241 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
242
243 P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
244 P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
245 P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
246 P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
247
248 P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
249
250 P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
251
252 P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
253
254 P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
255 DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
256
257 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
258 P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
259 .all_covered_all_hit_once = 0xff,
260 });
261 P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
262 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
263 .all_covered_all_hit_once = 0xff,
264 });
265 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
266 .all_covered_all_hit_once = 0xff,
267 });
268 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
269 .all_covered_all_hit_once = 0x3f,
270 });
271 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
272 .all_covered_all_hit_once = 0xff,
273 });
274 P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
275 .all_covered_all_hit_once = 0xff,
276 });
277
278 if (pdev->info.cls_eng3d < VOLTA_A)
279 P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
280
281 P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
282 .current = 3,
283 .oldest_supported = 3,
284 });
285 P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
286 .current = 2,
287 .oldest_supported = 2,
288 });
289
290 if (pdev->info.cls_eng3d < MAXWELL_A)
291 P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
292
293 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
294 POLICY_EVICT_NORMAL);
295 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
296 POLICY_EVICT_NORMAL);
297 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
298 POLICY_EVICT_NORMAL);
299 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
300 POLICY_EVICT_NORMAL);
301 P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
302 POLICY_EVICT_NORMAL);
303
304 P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
305
306 P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
307 .color_front_diffuse = COLOR_FRONT_DIFFUSE_VECTOR_0001,
308 .color_front_specular = COLOR_FRONT_SPECULAR_VECTOR_0001,
309 .generic_vector = GENERIC_VECTOR_VECTOR_0001,
310 .fixed_fnc_texture = FIXED_FNC_TEXTURE_VECTOR_0001,
311 .dx9_color0 = DX9_COLOR0_VECTOR_0001,
312 .dx9_color1_to_color15 = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
313 });
314
315 P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
316
317 P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
318 CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
319
320 P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
321 .enable = ENABLE_TRUE,
322 .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
323 });
324
325 if (pdev->info.cls_eng3d < VOLTA_A)
326 P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
327
328 P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
329 P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
330 P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
331
332 if (pdev->info.cls_eng3d < MAXWELL_A)
333 P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
334
335 if (pdev->info.cls_eng3d >= KEPLER_A &&
336 pdev->info.cls_eng3d < MAXWELL_A) {
337 P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
338 ORDERING_KEPLER_ORDER);
339 }
340
341 P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
342 P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
343 P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
344 P_IMMD(p, NV9097, SET_PS_SATURATE, {
345 .output0 = OUTPUT0_FALSE,
346 .output1 = OUTPUT1_FALSE,
347 .output2 = OUTPUT2_FALSE,
348 .output3 = OUTPUT3_FALSE,
349 .output4 = OUTPUT4_FALSE,
350 .output5 = OUTPUT5_FALSE,
351 .output6 = OUTPUT6_FALSE,
352 .output7 = OUTPUT7_FALSE,
353 });
354
355 P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
356 P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });
357
358 /* From the Vulkan spec's section on point rasterization:
359 * "Point rasterization produces a fragment for each fragment area group of
360 * framebuffer pixels with one or more sample points that intersect a region
361 * centered at the point’s (xf,yf).
362 * This region is a square with side equal to the current point size.
363 * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
364 * for the point"
365 *
366 * So it seems we always need square points with PointCoords like OpenGL
367 * point sprites.
368 *
369 * From OpenGL compatibility spec:
370 * Basic point rasterization:
371 * "If point sprites are enabled, then point rasterization produces a
372 * fragment for each framebuffer pixel whose center lies inside a square
373 * centered at the point’s (xw, yw), with side length equal to the current
374 * point size.
375 * ... and xw and yw are the exact, unrounded window coordinates of the
376 * vertex for the point"
377 *
378 * And Point multisample rasterization:
379 * "This region is a circle having diameter equal to the current point width
380 * if POINT_SPRITE is disabled, or a square with side equal to the current
381 * point width if POINT_SPRITE is enabled."
382 */
383 P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
384 P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
385 .rmode = RMODE_ZERO,
386 .origin = ORIGIN_TOP,
387 .texture0 = TEXTURE0_PASSTHROUGH,
388 .texture1 = TEXTURE1_PASSTHROUGH,
389 .texture2 = TEXTURE2_PASSTHROUGH,
390 .texture3 = TEXTURE3_PASSTHROUGH,
391 .texture4 = TEXTURE4_PASSTHROUGH,
392 .texture5 = TEXTURE5_PASSTHROUGH,
393 .texture6 = TEXTURE6_PASSTHROUGH,
394 .texture7 = TEXTURE7_PASSTHROUGH,
395 .texture8 = TEXTURE8_PASSTHROUGH,
396 .texture9 = TEXTURE9_PASSTHROUGH,
397 });
398
399 /* OpenGL's GL_POINT_SMOOTH */
400 P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
401
402 if (pdev->info.cls_eng3d >= MAXWELL_B)
403 P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
404
405 P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
406
407 P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
408
409 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SHADING_RATE_CONTROL), 0);
410 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
411 nvk_mme_anti_alias_init());
412
413 /* Enable multisample rasterization even for single-sample rasterization;
414 * this way we get strict lines and rectangular line support.
415 * For more info, see the DirectX rasterization rules.
416 */
417 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
418
419 if (pdev->info.cls_eng3d >= MAXWELL_B) {
420 P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
421 P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
422 BY_VIEWPORT_INDEX_FALSE);
423 }
424
425 /* TODO: Vertex runout */
426
427 P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
428 .mode = MODE_UPPER_LEFT,
429 .flip_y = FLIP_Y_FALSE,
430 });
431
432 P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
433 P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
434 P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
435
436 P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
437 P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
438 P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
439
440 // P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
441 // .respect_stencil_mask = RESPECT_STENCIL_MASK_FALSE,
442 // .use_clear_rect = USE_CLEAR_RECT_FALSE,
443 // });
444
445 P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
446
447 P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
448 .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
449 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
450 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
451 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
452 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
453 .geometry_clip = GEOMETRY_CLIP_WZERO_CLIP,
454 .geometry_guardband_z = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
455 });
456
457 for (unsigned i = 0; i < 16; i++)
458 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
459
460 P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
461
462 if (pdev->info.cls_eng3d >= TURING_A) {
463 /* I don't know what these values actually mean. I just copied them
464 * from the way the blob sets up the hardware.
465 */
466 P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(0));
467 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 0, 0xa23eb139);
468 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 1, 0xfb72ea61);
469 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 2, 0xd950c843);
470 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 3, 0x88fac4e5);
471 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 4, 0x1ab3e1b6);
472 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 5, 0xa98fedc2);
473 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 6, 0x2107654b);
474 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 7, 0xe0539773);
475 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 8, 0x698badcf);
476 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 9, 0x71032547);
477 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 10, 0xdef05397);
478 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 11, 0x56789abc);
479 P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 12, 0x1234);
480 }
481
482 if (pdev->info.cls_eng3d < VOLTA_A) {
483 uint64_t shader_base_addr =
484 nvk_heap_contiguous_base_address(&dev->shader_heap);
485
486 P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
487 P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
488 P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
489 }
490
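/* Start with every constant buffer slot unbound in all five bind groups
 * (one group per graphics shader stage, as far as I can tell).
 */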
491 for (uint32_t group = 0; group < 5; group++) {
492 for (uint32_t slot = 0; slot < 16; slot++) {
493 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
494 .valid = VALID_FALSE,
495 .shader_slot = slot,
496 });
497 }
498 }
499
500 // P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
501 // P_INLINE_DATA(cmd->push, 0x40);
502 P_IMMD(p, NV9097, SET_RT_LAYER, {
503 .v = 0,
504 .control = CONTROL_V_SELECTS_LAYER,
505 });
506 // P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT);
507 // P_INLINE_DATA(cmd->push, 0x30);
508
509 P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
510 P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
511 P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
512
513 uint64_t zero_addr = dev->zero_page->va->addr;
514 P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
515 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
516 P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
517
518 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
519 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
520 for (uint32_t b = 0; b < 32; b++) {
521 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
522 .enable = false,
523 });
524 }
525
526 if (pdev->info.cls_eng3d >= FERMI_A &&
527 pdev->info.cls_eng3d < MAXWELL_A) {
528 assert(dev->vab_memory);
529 uint64_t vab_addr = dev->vab_memory->va->addr;
530 P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
531 P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
532 P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
533 P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
534 }
535
536 if (pdev->info.cls_eng3d == MAXWELL_A)
537 P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
538
539 /* Store the address of CB0 in a pair of state registers */
540 uint64_t cb0_addr = queue->draw_cb0->va->addr;
541 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
542 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
543 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);
544
545 /* Store the address of the zero page in a pair of state registers */
546 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
547 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
548 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);
549
550 /* We leave CB0 selected by default */
551 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
552 P_INLINE_DATA(p, 0);
553
554 /* Bind CB0 to all shader groups */
555 for (uint32_t group = 0; group < 5; group++) {
556 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
557 .valid = VALID_TRUE,
558 .shader_slot = 0,
559 });
560 }
561
562 /* Zero out CB0 */
563 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
564 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
565 for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
566 P_INLINE_DATA(p, 0);
567
568 /* These are shadowed in cb0 so they need to be zeroed as well for
569 * consistency.
570 */
571 P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
572 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
573 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
574 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
575 P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);
576
577 return VK_SUCCESS;
578 }
579
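/* Dirties all dynamic state that depends on the render pass (attachment
 * count and formats) so it gets re-emitted against the new attachments.
 */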
580 static void
581 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
582 {
583 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
584
585 /* These depend on color attachment count */
586 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
587 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
588 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
589 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
590
591 /* These depend on the depth/stencil format */
592 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
593 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
594 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
595 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
596
597 /* This may depend on render targets for ESO */
598 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
599
600 /* This may depend on render targets */
601 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
602
603 /* Might be required for depthClampZeroOne */
604 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE);
605 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE);
606 }
607
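/* Flushes a dirty byte range of the graphics root descriptor table into cb0,
 * rounded out to whole dwords, using LOAD_CONSTANT_BUFFER.
 */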
608 static void
609 nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
610 struct nvk_descriptor_state *desc,
611 size_t offset, size_t size)
612 {
613 const uint32_t start_dw = offset / 4;
614 const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
615 const uint32_t len_dw = end_dw - start_dw;
616
617 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
618 P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
619 P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);
620
621 const uint32_t *root_dw = (uint32_t *)desc->root;
622 P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
623 }
624
625 void
626 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
627 const VkCommandBufferBeginInfo *pBeginInfo)
628 {
629 if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
630 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
631 P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
632 P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
633 .lines = LINES_ALL,
634 });
635 P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
636 .lines = LINES_ALL,
637 });
638
639 P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
640 .constant = CONSTANT_TRUE,
641 });
642 }
643
644 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
645
646 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
647 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
648 char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
649 const VkRenderingInfo *resume_info =
650 vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
651 pBeginInfo,
652 gcbiar_data);
653 if (resume_info) {
654 nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
655 } else {
656 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
657 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
658 pBeginInfo);
659 assert(inheritance_info);
660
661 struct nvk_rendering_state *render = &cmd->state.gfx.render;
662 render->flags = inheritance_info->flags;
663 render->area = (VkRect2D) { };
664 render->layer_count = 0;
665 render->view_mask = inheritance_info->viewMask;
666 render->samples = inheritance_info->rasterizationSamples;
667
668 render->color_att_count = inheritance_info->colorAttachmentCount;
669 for (uint32_t i = 0; i < render->color_att_count; i++) {
670 render->color_att[i].vk_format =
671 inheritance_info->pColorAttachmentFormats[i];
672 }
673 render->depth_att.vk_format =
674 inheritance_info->depthAttachmentFormat;
675 render->stencil_att.vk_format =
676 inheritance_info->stencilAttachmentFormat;
677
678 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
679 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
680 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
681 };
682 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
683 vk_get_command_buffer_rendering_attachment_location_info(
684 cmd->vk.level, pBeginInfo);
685 if (att_loc_info == NULL)
686 att_loc_info = &att_loc_info_default;
687
688 vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
689
690 nvk_cmd_buffer_dirty_render_pass(cmd);
691 }
692 }
693
694 cmd->state.gfx.shaders_dirty = ~0;
695 }
696
697 void
698 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
699 {
700 vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
701
702 /* From the Vulkan 1.3.275 spec:
703 *
704 * "...There is one exception to this rule - if the primary command
705 * buffer is inside a render pass instance, then the render pass and
706 * subpass state is not disturbed by executing secondary command
707 * buffers."
708 *
709 * We need to reset everything EXCEPT the render pass state.
710 */
711 struct nvk_rendering_state render_save = cmd->state.gfx.render;
712 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
713 cmd->state.gfx.render = render_save;
714
715 /* We need to keep the flush_root callback */
716 cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
717
718 cmd->state.gfx.shaders_dirty = ~0;
719 }
720
721 static void
722 nvk_attachment_init(struct nvk_attachment *att,
723 const VkRenderingAttachmentInfo *info)
724 {
725 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
726 *att = (struct nvk_attachment) { .iview = NULL, };
727 return;
728 }
729
730 VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
731 *att = (struct nvk_attachment) {
732 .vk_format = iview->vk.format,
733 .iview = iview,
734 };
735
736 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
737 VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
738 att->resolve_mode = info->resolveMode;
739 att->resolve_iview = res_iview;
740 }
741
742 att->store_op = info->storeOp;
743 }
744
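/* Translates a NIL sample layout into the corresponding
 * NV9097_SET_ANTI_ALIAS SAMPLES_MODE value.
 */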
745 static uint32_t
746 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
747 {
748 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
749 uint16_t nil_to_nv9097[] = {
750 MODE(1X1),
751 MODE(2X1),
752 MODE(2X1_D3D),
753 MODE(2X2),
754 MODE(4X2),
755 MODE(4X2_D3D),
756 MODE(4X4),
757 };
758 #undef MODE
759 assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
760 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1 ||
761 nil_to_nv9097[sample_layout] != 0);
762
763 return nil_to_nv9097[sample_layout];
764 }
765
766 static uint32_t nvk_mme_anti_alias_samples(uint32_t samples);
767
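/* Programs SET_ANTI_ALIAS for the given sample layout along with the
 * 2-pass/4-pass sample mask scratch values (presumably consumed by the
 * anti-alias MMEs), then calls NVK_MME_SET_ANTI_ALIAS with the new sample
 * count.
 */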
768 static void
769 nvk_cmd_set_sample_layout(struct nvk_cmd_buffer *cmd,
770 enum nil_sample_layout sample_layout)
771 {
772 const uint32_t samples = nil_sample_layout_samples(sample_layout);
773 struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);
774
775 P_IMMD(p, NV9097, SET_ANTI_ALIAS,
776 nil_to_nv9097_samples_mode(sample_layout));
777
778 switch (sample_layout) {
779 case NIL_SAMPLE_LAYOUT_1X1:
780 case NIL_SAMPLE_LAYOUT_2X1:
781 case NIL_SAMPLE_LAYOUT_2X1_D3D:
782 /* These only have two modes: Single-pass or per-sample */
783 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
784 P_INLINE_DATA(p, 0);
785 P_INLINE_DATA(p, 0);
786 P_INLINE_DATA(p, 0);
787 P_INLINE_DATA(p, 0);
788 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
789 P_INLINE_DATA(p, 0);
790 P_INLINE_DATA(p, 0);
791 P_INLINE_DATA(p, 0);
792 P_INLINE_DATA(p, 0);
793 break;
794
795 case NIL_SAMPLE_LAYOUT_2X2:
796 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
797 P_INLINE_DATA(p, 0x000a0005);
798 P_INLINE_DATA(p, 0x000a0005);
799 P_INLINE_DATA(p, 0);
800 P_INLINE_DATA(p, 0);
801 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
802 P_INLINE_DATA(p, 0);
803 P_INLINE_DATA(p, 0);
804 P_INLINE_DATA(p, 0);
805 P_INLINE_DATA(p, 0);
806 break;
807
808 case NIL_SAMPLE_LAYOUT_4X2:
809 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
810 P_INLINE_DATA(p, 0x000f000f);
811 P_INLINE_DATA(p, 0x000f000f);
812 P_INLINE_DATA(p, 0x00f000f0);
813 P_INLINE_DATA(p, 0x00f000f0);
814 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
815 P_INLINE_DATA(p, 0x00030003);
816 P_INLINE_DATA(p, 0x000c000c);
817 P_INLINE_DATA(p, 0x00300030);
818 P_INLINE_DATA(p, 0x00c000c0);
819 break;
820
821 case NIL_SAMPLE_LAYOUT_4X2_D3D:
822 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
823 P_INLINE_DATA(p, 0x003a00c5);
824 P_INLINE_DATA(p, 0x003a00c5);
825 P_INLINE_DATA(p, 0x003a003a);
826 P_INLINE_DATA(p, 0x00c500c5);
827 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
828 P_INLINE_DATA(p, 0x00120081);
829 P_INLINE_DATA(p, 0x00280044);
830 P_INLINE_DATA(p, 0x00280012);
831 P_INLINE_DATA(p, 0x00810044);
832 break;
833
834 default:
835 unreachable("Unknown sample layout");
836 }
837
838 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
839 P_INLINE_DATA(p, nvk_mme_anti_alias_samples(samples));
840 }
841
842 VKAPI_ATTR void VKAPI_CALL
843 nvk_GetRenderingAreaGranularityKHR(
844 VkDevice device,
845 const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
846 VkExtent2D *pGranularity)
847 {
848 *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
849 }
850
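/* Returns true if every bound color attachment is linear and there is no
 * depth/stencil attachment.  When this is false, any linear color targets
 * get rendered via their tiled shadow copy instead (see below).
 */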
851 static bool
852 nvk_rendering_all_linear(const struct nvk_rendering_state *render)
853 {
854 /* Depth and stencil are never linear */
855 if (render->depth_att.iview || render->stencil_att.iview)
856 return false;
857
858 for (uint32_t i = 0; i < render->color_att_count; i++) {
859 const struct nvk_image_view *iview = render->color_att[i].iview;
860 if (iview == NULL)
861 continue;
862
863 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
864 const uint8_t ip = iview->planes[0].image_plane;
865 const struct nil_image_level *level =
866 &image->planes[ip].nil.levels[iview->vk.base_mip_level];
867
868 if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR)
869 return false;
870 }
871
872 return true;
873 }
874
875 VKAPI_ATTR void VKAPI_CALL
876 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
877 const VkRenderingInfo *pRenderingInfo)
878 {
879 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
880 struct nvk_rendering_state *render = &cmd->state.gfx.render;
881
882 memset(render, 0, sizeof(*render));
883
884 render->flags = pRenderingInfo->flags;
885 render->area = pRenderingInfo->renderArea;
886 render->view_mask = pRenderingInfo->viewMask;
887 render->layer_count = pRenderingInfo->layerCount;
888 render->samples = 0;
889
890 const uint32_t layer_count =
891 render->view_mask ? util_last_bit(render->view_mask) :
892 render->layer_count;
893
894 render->color_att_count = pRenderingInfo->colorAttachmentCount;
895 for (uint32_t i = 0; i < render->color_att_count; i++) {
896 nvk_attachment_init(&render->color_att[i],
897 &pRenderingInfo->pColorAttachments[i]);
898 }
899
900 nvk_attachment_init(&render->depth_att,
901 pRenderingInfo->pDepthAttachment);
902 nvk_attachment_init(&render->stencil_att,
903 pRenderingInfo->pStencilAttachment);
904
905 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att_info =
906 vk_find_struct_const(pRenderingInfo->pNext,
907 RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
908 if (fsr_att_info != NULL && fsr_att_info->imageView != VK_NULL_HANDLE) {
909 VK_FROM_HANDLE(nvk_image_view, iview, fsr_att_info->imageView);
910 render->fsr_att = (struct nvk_attachment) {
911 .vk_format = iview->vk.format,
912 .iview = iview,
913 .store_op = VK_ATTACHMENT_STORE_OP_NONE,
914 };
915 }
916
917 render->all_linear = nvk_rendering_all_linear(render);
918
919 const VkRenderingAttachmentLocationInfoKHR ral_info = {
920 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
921 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
922 };
923 vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
924
925 nvk_cmd_buffer_dirty_render_pass(cmd);
926
927 struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 34);
928
929 P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
930 render->view_mask);
931
932 P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
933 P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
934 .x = render->area.offset.x,
935 .width = render->area.extent.width,
936 });
937 P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
938 .y = render->area.offset.y,
939 .height = render->area.extent.height,
940 });
941
942 enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
943
944 /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
945 * of the number of targets in the render pass. This ensures that we have
946 * no left over pointers from previous render passes in the hardware. This
947 * also allows us to point at any render target with SET_CT_SELECT and know
948 * that it's either a valid render target or NULL.
949 */
950 for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
951 if (render->color_att[i].iview) {
952 const struct nvk_image_view *iview = render->color_att[i].iview;
953 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
954 /* Rendering to multi-planar images is valid for a specific single
955 * plane only, so assert that what we have is a single plane, obtain
956 * its index, and begin rendering.
957 */
958 assert(iview->plane_count == 1);
959 const uint8_t ip = iview->planes[0].image_plane;
960 const struct nvk_image_plane *plane = &image->planes[ip];
961
962 if (!render->all_linear &&
963 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR)
964 plane = &image->linear_tiled_shadow;
965
966 const struct nil_image *nil_image = &plane->nil;
967 const struct nil_image_level *level =
968 &nil_image->levels[iview->vk.base_mip_level];
969 struct nil_Extent4D_Samples level_extent_sa =
970 nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);
971
972 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
973 sample_layout == nil_image->sample_layout);
974 sample_layout = nil_image->sample_layout;
975 render->samples = image->vk.samples;
976
977 uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;
978
979 if (nil_image->dim == NIL_IMAGE_DIM_3D) {
980 addr += nil_image_level_z_offset_B(nil_image,
981 iview->vk.base_mip_level,
982 iview->vk.base_array_layer);
983 assert(layer_count <= iview->vk.extent.depth);
984 } else {
985 addr += iview->vk.base_array_layer *
986 (uint64_t)nil_image->array_stride_B;
987 assert(layer_count <= iview->vk.layer_count);
988 }
989
990 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
991 P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
992 P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
993
994 if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR) {
995 const enum pipe_format p_format =
996 nvk_format_to_pipe_format(iview->vk.format);
997
998 /* We use the stride for depth/stencil targets because the Z/S
999 * hardware has no concept of a tile width. Instead, we just set
1000 * the width to the stride divided by bpp.
1001 */
1002 const uint32_t row_stride_el =
1003 level->row_stride_B / util_format_get_blocksize(p_format);
1004 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
1005 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1006 const uint8_t ct_format = nil_format_to_color_target(p_format);
1007 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1008
1009 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1010 .block_width = BLOCK_WIDTH_ONE_GOB,
1011 .block_height = level->tiling.y_log2,
1012 .block_depth = level->tiling.z_log2,
1013 .layout = LAYOUT_BLOCKLINEAR,
1014 .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
1015 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
1016 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1017 });
1018
1019 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1020 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
1021 nil_image->array_stride_B >> 2);
1022 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1023 } else {
1024 /* NVIDIA can only render to 2D linear images */
1025 assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1026 /* NVIDIA can only render to non-multisampled images */
1027 assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1028 /* NVIDIA doesn't support linear array images */
1029 assert(iview->vk.base_array_layer == 0 && layer_count == 1);
1030
1031 uint32_t pitch = level->row_stride_B;
1032 const enum pipe_format p_format =
1033 nvk_format_to_pipe_format(iview->vk.format);
1034 /* When the memory layout is set to LAYOUT_PITCH, the WIDTH field
1035 * takes the row pitch in bytes
1036 */
1037 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
1038 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1039
1040 const uint8_t ct_format = nil_format_to_color_target(p_format);
1041 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1042
1043 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1044 .layout = LAYOUT_PITCH,
1045 .third_dimension_control =
1046 THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1047 });
1048
1049 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
1050 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1051 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1052 }
1053
1054 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
1055 } else {
1056 P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
1057 P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
1058 P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
1059 P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
1060 P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
1061 P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
1062 P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1063 .layout = LAYOUT_BLOCKLINEAR,
1064 });
1065 P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1066 P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1067 P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1068
1069 P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
1070 }
1071 }
1072
1073 if (render->depth_att.iview || render->stencil_att.iview) {
1074 struct nvk_image_view *iview = render->depth_att.iview ?
1075 render->depth_att.iview :
1076 render->stencil_att.iview;
1077 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1078 /* Depth/stencil are always single-plane */
1079 assert(iview->plane_count == 1);
1080 const uint8_t ip = iview->planes[0].image_plane;
1081 struct nil_image nil_image = image->planes[ip].nil;
1082
1083 uint64_t addr = nvk_image_base_address(image, ip);
1084 uint32_t mip_level = iview->vk.base_mip_level;
1085 uint32_t base_array_layer = iview->vk.base_array_layer;
1086
1087 if (nil_image.dim == NIL_IMAGE_DIM_3D) {
1088 uint64_t level_offset_B;
1089 nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
1090 &level_offset_B);
1091 addr += level_offset_B;
1092 mip_level = 0;
1093 base_array_layer = 0;
1094 assert(layer_count <= iview->vk.extent.depth);
1095 } else {
1096 assert(layer_count <= iview->vk.layer_count);
1097 }
1098
1099 const struct nil_image_level *level = &nil_image.levels[mip_level];
1100 addr += level->offset_B;
1101
1102 assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
1103 sample_layout == nil_image.sample_layout);
1104 sample_layout = nil_image.sample_layout;
1105 render->samples = image->vk.samples;
1106
1107 P_MTHD(p, NV9097, SET_ZT_A);
1108 P_NV9097_SET_ZT_A(p, addr >> 32);
1109 P_NV9097_SET_ZT_B(p, addr);
1110 const enum pipe_format p_format =
1111 nvk_format_to_pipe_format(iview->vk.format);
1112 const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
1113 P_NV9097_SET_ZT_FORMAT(p, zs_format);
1114 assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1115 assert(level->tiling.z_log2 == 0);
1116 P_NV9097_SET_ZT_BLOCK_SIZE(p, {
1117 .width = WIDTH_ONE_GOB,
1118 .height = level->tiling.y_log2,
1119 .depth = DEPTH_ONE_GOB,
1120 });
1121 P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
1122
1123 P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
1124
1125 struct nil_Extent4D_Samples level_extent_sa =
1126 nil_image_level_extent_sa(&nil_image, mip_level);
1127
1128 /* We use the stride for depth/stencil targets because the Z/S hardware
1129 * has no concept of a tile width. Instead, we just set the width to
1130 * the stride divided by bpp.
1131 */
1132 const uint32_t row_stride_el =
1133 level->row_stride_B / util_format_get_blocksize(p_format);
1134
1135 P_MTHD(p, NV9097, SET_ZT_SIZE_A);
1136 P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
1137 P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
1138 P_NV9097_SET_ZT_SIZE_C(p, {
1139 .third_dimension = base_array_layer + layer_count,
1140 .control = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1141 });
1142
1143 P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
1144
1145 P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);
1146
1147 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1148 P_IMMD(p, NVC597, SET_ZT_SPARSE, {
1149 .enable = ENABLE_FALSE,
1150 });
1151 }
1152 } else {
1153 P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
1154 }
1155
1156 if (render->fsr_att.iview) {
1157 const struct nvk_image_view *iview = render->fsr_att.iview;
1158 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1159
1160 /* Fragment shading rate images are always single-plane */
1161 assert(iview->plane_count == 1);
1162 const uint8_t ip = iview->planes[0].image_plane;
1163 const struct nil_image *nil_image = &image->planes[ip].nil;
1164
1165 /* Fragment shading rate images are always 2D */
1166 assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1167 assert(nil_image->sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1168
1169 uint64_t addr = nvk_image_base_address(image, ip);
1170 uint32_t mip_level = iview->vk.base_mip_level;
1171 struct nil_Extent4D_Samples level_extent_sa =
1172 nil_image_level_extent_sa(nil_image, mip_level);
1173
1174 const struct nil_image_level *level = &nil_image->levels[mip_level];
1175 addr += level->offset_B;
1176
1177 P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1178 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, addr >> 32);
1179 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, addr);
1180 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, {
1181 .width = level_extent_sa.width,
1182 .height = level_extent_sa.height,
1183 });
1184 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0,
1185 iview->vk.layer_count + iview->vk.base_array_layer);
1186 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0,
1187 iview->vk.base_array_layer);
1188 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0,
1189 nil_image->array_stride_B >> 2);
1190 assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1191 assert(level->tiling.z_log2 == 0);
1192 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, {
1193 .width = WIDTH_ONE_GOB,
1194 .height = level->tiling.y_log2,
1195 .depth = DEPTH_ONE_GOB,
1196 });
1197
1198 const enum pipe_format p_format =
1199 nvk_format_to_pipe_format(iview->vk.format);
1200 const uint32_t row_stride_el =
1201 level->row_stride_B / util_format_get_blocksize(p_format);
1202 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0,
1203 row_stride_el);
1204 } else {
1205 P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1206 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, 0);
1207 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, 0);
1208 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, { });
1209 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0, 0);
1210 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0, 0);
1211 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0, 0);
1212 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, { });
1213 P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0, 0);
1214 }
1215
1216 /* From the Vulkan 1.3.275 spec:
1217 *
1218 * "It is legal for a subpass to use no color or depth/stencil
1219 * attachments, either because it has no attachment references or
1220 * because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
1221 * can use shader side effects such as image stores and atomics to
1222 * produce an output. In this case, the subpass continues to use the
1223 * width, height, and layers of the framebuffer to define the dimensions
1224 * of the rendering area, and the rasterizationSamples from each
1225 * pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
1226 * of samples used in rasterization;"
1227 *
1228 * In the case where we have attachments, we emit SET_ANTI_ALIAS here
1229 * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
1230 * specifying the sample layout and we want to ensure it matches. When
1231 * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
1232 * where we base it on dynamic rasterizationSamples.
1233 */
1234 if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID)
1235 nvk_cmd_set_sample_layout(cmd, sample_layout);
1236
1237 if (render->flags & VK_RENDERING_RESUMING_BIT)
1238 return;
1239
1240 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1241 const struct nvk_image_view *iview = render->color_att[i].iview;
1242 if (iview == NULL)
1243 continue;
1244
1245 const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1246 assert(iview->plane_count == 1);
1247 const uint8_t ip = iview->planes[0].image_plane;
1248 const struct nvk_image_plane *plane = &image->planes[ip];
1249
1250 const VkAttachmentLoadOp load_op =
1251 pRenderingInfo->pColorAttachments[i].loadOp;
1252 if (!render->all_linear &&
1253 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1254 load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
1255 nvk_linear_render_copy(cmd, iview, render->area, true);
1256 }
1257
1258 uint32_t clear_count = 0;
1259 VkClearAttachment clear_att[NVK_MAX_RTS + 1];
1260 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1261 const VkRenderingAttachmentInfo *att_info =
1262 &pRenderingInfo->pColorAttachments[i];
1263 if (att_info->imageView == VK_NULL_HANDLE ||
1264 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1265 continue;
1266
1267 clear_att[clear_count++] = (VkClearAttachment) {
1268 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1269 .colorAttachment = i,
1270 .clearValue = att_info->clearValue,
1271 };
1272 }
1273
1274 clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
1275 if (pRenderingInfo->pDepthAttachment != NULL &&
1276 pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
1277 pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1278 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
1279 clear_att[clear_count].clearValue.depthStencil.depth =
1280 pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
1281 }
1282 if (pRenderingInfo->pStencilAttachment != NULL &&
1283 pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
1284 pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1285 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
1286 clear_att[clear_count].clearValue.depthStencil.stencil =
1287 pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
1288 }
1289 if (clear_att[clear_count].aspectMask != 0)
1290 clear_count++;
1291
1292 if (clear_count > 0) {
1293 const VkClearRect clear_rect = {
1294 .rect = render->area,
1295 .baseArrayLayer = 0,
1296 .layerCount = render->view_mask ? 1 : render->layer_count,
1297 };
1298
1299 p = nvk_cmd_buffer_push(cmd, 2);
1300 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1301 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
1302
1303 nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
1304 clear_count, clear_att, 1, &clear_rect);
1305 p = nvk_cmd_buffer_push(cmd, 2);
1306 P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1307 P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
1308 }
1309
1310 /* TODO: Attachment clears */
1311 }
1312
1313 VKAPI_ATTR void VKAPI_CALL
1314 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
1315 {
1316 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
1317 struct nvk_rendering_state *render = &cmd->state.gfx.render;
1318
1319 if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
1320 for (uint32_t i = 0; i < render->color_att_count; i++) {
1321 struct nvk_image_view *iview = render->color_att[i].iview;
1322 if (iview == NULL)
1323 continue;
1324
1325 struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1326 const uint8_t ip = iview->planes[0].image_plane;
1327 const struct nvk_image_plane *plane = &image->planes[ip];
1328 if (!render->all_linear &&
1329 plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1330 render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
1331 nvk_linear_render_copy(cmd, iview, render->area, false);
1332 }
1333 }
1334
1335 bool need_resolve = false;
1336
1337 /* Translate render state back to VK for meta */
1338 VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
1339 for (uint32_t i = 0; i < render->color_att_count; i++) {
1340 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
1341 need_resolve = true;
1342
1343 vk_color_att[i] = (VkRenderingAttachmentInfo) {
1344 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1345 .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
1346 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1347 .resolveMode = render->color_att[i].resolve_mode,
1348 .resolveImageView =
1349 nvk_image_view_to_handle(render->color_att[i].resolve_iview),
1350 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1351 };
1352 }
1353
1354 const VkRenderingAttachmentInfo vk_depth_att = {
1355 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1356 .imageView = nvk_image_view_to_handle(render->depth_att.iview),
1357 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1358 .resolveMode = render->depth_att.resolve_mode,
1359 .resolveImageView =
1360 nvk_image_view_to_handle(render->depth_att.resolve_iview),
1361 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1362 };
1363 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1364 need_resolve = true;
1365
1366 const VkRenderingAttachmentInfo vk_stencil_att = {
1367 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1368 .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
1369 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1370 .resolveMode = render->stencil_att.resolve_mode,
1371 .resolveImageView =
1372 nvk_image_view_to_handle(render->stencil_att.resolve_iview),
1373 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1374 };
1375 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1376 need_resolve = true;
1377
1378 const VkRenderingInfo vk_render = {
1379 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1380 .renderArea = render->area,
1381 .layerCount = render->layer_count,
1382 .viewMask = render->view_mask,
1383 .colorAttachmentCount = render->color_att_count,
1384 .pColorAttachments = vk_color_att,
1385 .pDepthAttachment = &vk_depth_att,
1386 .pStencilAttachment = &vk_stencil_att,
1387 };
1388
1389 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1390 need_resolve = false;
1391
1392 memset(render, 0, sizeof(*render));
1393
1394 if (need_resolve) {
1395 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1396 P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
1397 .lines = LINES_ALL,
1398 });
1399
1400 nvk_meta_resolve_rendering(cmd, &vk_render);
1401 }
1402 }
1403
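/* Binds a shader to a graphics stage, marking the stage dirty only if the
 * shader actually changed.
 */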
1404 void
1405 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
1406 const gl_shader_stage stage,
1407 struct nvk_shader *shader)
1408 {
1409 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1410 if (cmd->state.gfx.shaders[stage] == shader)
1411 return;
1412
1413 cmd->state.gfx.shaders[stage] = shader;
1414 cmd->state.gfx.shaders_dirty |= mesa_to_vk_shader_stage(stage);
1415 }
1416
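/* Packs tessellation domain/spacing/prims into the low 12 bits (matching the
 * SET_TESSELLATION_PARAMETERS layout) and returns a value/mask pair for the
 * SET_TESS_PARAMS MME.
 */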
1417 uint32_t
1418 nvk_mme_tess_params(enum nak_ts_domain domain,
1419 enum nak_ts_spacing spacing,
1420 enum nak_ts_prims prims)
1421 {
1422 /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
1423 * extra bit for lower_left
1424 */
1425 uint16_t params = ((uint16_t)domain << 0) |
1426 ((uint16_t)spacing << 4) |
1427 ((uint16_t)prims << 8);
1428 return nvk_mme_val_mask(params, 0x0fff);
1429 }
1430
1431 static uint32_t
1432 nvk_mme_tess_lower_left(bool lower_left)
1433 {
1434 return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
1435 }
1436
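/* MME that merges a value/mask pair into the shadowed tessellation
 * parameters and, if anything changed, strips the lower_left bit and emits
 * SET_TESSELLATION_PARAMETERS, flipping CW/CCW winding when lower_left is
 * set.
 */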
1437 void
1438 nvk_mme_set_tess_params(struct mme_builder *b)
1439 {
1440 struct mme_value val_mask = mme_load(b);
1441 struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
1442 struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
1443 mme_free_reg(b, val_mask);
1444
1445 mme_if(b, ine, params, old_params) {
1446 nvk_mme_store_scratch(b, TESS_PARAMS, params);
1447
1448 /* lower_left lives at bit 12 */
1449 struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);
1450
1451 /* Only the bottom 12 bits are valid to put in HW */
1452 mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);
1453
1454 /* If we're using a lower-left orientation, we need to flip triangles
1455 * between CW and CCW.
1456 */
1457 mme_if(b, ine, lower_left, mme_zero()) {
1458 struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
1459 struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);
1460
1461 struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
1462 mme_if(b, ieq, prims, prims_cw) {
1463 mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
1464 }
1465 mme_if(b, ieq, prims, prims_ccw) {
1466 mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
1467 }
1468 mme_free_reg(b, prims);
1469 }
1470 mme_free_reg(b, lower_left);
1471
1472 mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
1473 mme_emit(b, params);
1474 }
1475 }
1476
1477 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1478 /* This case doesn't change the state so it should do nothing */
1479 .init = (struct nvk_mme_mthd_data[]) {
1480 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1481 { }
1482 },
1483 .params = (uint32_t[]) { 0xffff0000 },
1484 .expected = (struct nvk_mme_mthd_data[]) {
1485 { }
1486 },
1487 }, {
1488 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1489 .init = (struct nvk_mme_mthd_data[]) {
1490 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1491 { }
1492 },
1493 .params = (uint32_t[]) { 0xffff0201 },
1494 .expected = (struct nvk_mme_mthd_data[]) {
1495 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1496 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1497 { }
1498 },
1499 }, {
1500 /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1501 .init = (struct nvk_mme_mthd_data[]) {
1502 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1503 { }
1504 },
1505 .params = (uint32_t[]) { 0x10001000 },
1506 .expected = (struct nvk_mme_mthd_data[]) {
1507 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1508 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1509 { }
1510 },
1511 }, {
1512 /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1513 .init = (struct nvk_mme_mthd_data[]) {
1514 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1515 { }
1516 },
1517 .params = (uint32_t[]) { 0x10001000 },
1518 .expected = (struct nvk_mme_mthd_data[]) {
1519 { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1520 { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1521 { }
1522 },
1523 }, {}};
1524
1525 void
1526 nvk_cmd_flush_gfx_shaders(struct nvk_cmd_buffer *cmd)
1527 {
1528 if (cmd->state.gfx.shaders_dirty == 0)
1529 return;
1530
1531 /* Map shader types to shaders */
1532 struct nvk_shader *type_shader[6] = { NULL, };
1533 uint32_t types_dirty = 0;
1534
1535 u_foreach_bit(s, cmd->state.gfx.shaders_dirty &
1536 NVK_SHADER_STAGE_GRAPHICS_BITS) {
1537 gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1538 uint32_t type = mesa_to_nv9097_shader_type(stage);
1539 types_dirty |= BITFIELD_BIT(type);
1540
1541 /* Only copy non-NULL shaders because mesh/task alias with vertex and
1542 * tessellation stages.
1543 */
1544 struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1545 if (shader != NULL) {
1546 assert(type < ARRAY_SIZE(type_shader));
1547 assert(type_shader[type] == NULL);
1548 type_shader[type] = shader;
1549
1550 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1551 struct nvk_cbuf_group *cbuf_group =
1552 &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1553 for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1554 if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1555 sizeof(cbuf_group->cbufs[i])) != 0) {
1556 cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1557 cbuf_group->dirty |= BITFIELD_BIT(i);
1558 }
1559 }
1560 }
1561 }
1562
1563 u_foreach_bit(type, types_dirty) {
1564 struct nvk_shader *shader = type_shader[type];
1565 if (shader == NULL) {
1566 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1567 P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
1568 .enable = ENABLE_FALSE,
1569 .type = type,
1570 });
1571 } else {
1572 struct nv_push *p = nvk_cmd_buffer_push(cmd, shader->push_dw_count);
1573 nv_push_raw(p, shader->push_dw, shader->push_dw_count);
1574 }
1575 }
1576
1577 if (cmd->state.gfx.shaders_dirty & NVK_SHADER_STAGE_VTGM_BITS) {
1578 struct nvk_shader *last_vtgm = NULL;
1579 u_foreach_bit(s, NVK_SHADER_STAGE_VTGM_BITS) {
1580 gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1581 if (cmd->state.gfx.shaders[stage] != NULL)
1582 last_vtgm = cmd->state.gfx.shaders[stage];
1583 }
1584
1585 assert(last_vtgm->vtgm_push_dw_count > last_vtgm->push_dw_count);
1586 const uint16_t dw_start = last_vtgm->push_dw_count;
1587 const uint16_t dw_count = last_vtgm->vtgm_push_dw_count - dw_start;
1588 struct nv_push *p = nvk_cmd_buffer_push(cmd, dw_count);
1589 nv_push_raw(p, &last_vtgm->push_dw[dw_start], dw_count);
1590 }
1591
1592 cmd->state.gfx.shaders_dirty = 0;
1593 }
1594
1595 void
1596 nvk_mme_set_vb_enables(struct mme_builder *b)
1597 {
1598 struct mme_value enables = mme_load(b);
1599 struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1600 nvk_mme_store_scratch(b, VB_ENABLES, enables);
1601
1602 struct mme_value changed = mme_xor(b, enables, old_enables);
1603 mme_free_reg(b, old_enables);
1604
1605 struct mme_value vb_idx4 = mme_mov(b, mme_zero());
1606 mme_while(b, ine, changed, mme_zero()) {
1607 mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1608 struct mme_value state =
1609 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1610 mme_merge_to(b, state, state, enables, 12, 1, 0);
1611 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1612 mme_emit(b, state);
1613 }
1614 mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1615 mme_srl_to(b, changed, changed, mme_imm(1));
1616 mme_srl_to(b, enables, enables, mme_imm(1));
1617 }
1618 }
1619
1620 static uint32_t
1621 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1622 {
1623 assert(stride < (1 << 12));
1624 assert(vb_idx < (1 << 5));
1625 return (vb_idx << 16) | stride;
1626 }
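/* For example, nvk_mme_vb_stride(3, 16) packs to 0x00030010: the stride in
 * bits 0..11 and the VB index in bits 16..20, which is the layout consumed
 * by nvk_mme_set_vb_stride() below.
 */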
1627
1628 void
1629 nvk_mme_set_vb_stride(struct mme_builder *b)
1630 {
1631 /* Param is laid out as
1632 *
1633 * bits 0..11 : stride
1634 * bits 16..20 : VB index
1635 */
1636 struct mme_value param = mme_load(b);
1637
1638 struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1639
1640 struct mme_value state =
1641 mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1642 struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1643 mme_if(b, ine, state, new_state) {
1644 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1645 mme_emit(b, new_state);
1646 }
1647 }
1648
1649 static void
1650 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1651 {
1652 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1653 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1654 const struct vk_dynamic_graphics_state *dyn =
1655 &cmd->vk.dynamic_graphics_state;
1656
1657 struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1658
1659 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1660 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1661 P_INLINE_DATA(p, dyn->vi->bindings_valid);
1662 }
1663
1664 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1665 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1666 u_foreach_bit(a, dyn->vi->attributes_valid) {
1667 const struct nvk_va_format *fmt =
1668 nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1669
1670 P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1671 .stream = dyn->vi->attributes[a].binding,
1672 .offset = dyn->vi->attributes[a].offset,
1673 .component_bit_widths = fmt->bit_widths,
1674 .numerical_type = fmt->type,
1675 .swap_r_and_b = fmt->swap_rb,
1676 });
1677 }
1678
1679 u_foreach_bit(b, dyn->vi->bindings_valid) {
1680 const bool instanced = dyn->vi->bindings[b].input_rate ==
1681 VK_VERTEX_INPUT_RATE_INSTANCE;
1682 P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1683 P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1684 dyn->vi->bindings[b].divisor);
1685 }
1686 }
1687
1688 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1689 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1690 u_foreach_bit(b, dyn->vi->bindings_valid) {
1691 assert(dyn->vi_binding_strides[b] < (1 << 12));
1692 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1693 P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1694 }
1695 }
1696 }
1697
1698 static uint32_t
1699 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1700 {
1701 switch (prim) {
1702 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1703 return NV9097_BEGIN_OP_POINTS;
1704 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1705 return NV9097_BEGIN_OP_LINES;
1706 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1707 return NV9097_BEGIN_OP_LINE_STRIP;
1708 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1709 #pragma GCC diagnostic push
1710 #pragma GCC diagnostic ignored "-Wswitch"
1711 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1712 #pragma GCC diagnostic pop
1713 return NV9097_BEGIN_OP_TRIANGLES;
1714 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1715 return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1716 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1717 return NV9097_BEGIN_OP_TRIANGLE_FAN;
1718 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1719 return NV9097_BEGIN_OP_LINELIST_ADJCY;
1720 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1721 return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1722 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1723 return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1724 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1725 return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1726 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1727 return NV9097_BEGIN_OP_PATCH;
1728 default:
1729 unreachable("Invalid primitive topology");
1730 }
1731 }
1732
1733 static void
1734 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1735 {
1736 const struct vk_dynamic_graphics_state *dyn =
1737 &cmd->vk.dynamic_graphics_state;
1738
1739 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1740 uint32_t begin;
1741 V_NV9097_BEGIN(begin, {
1742 .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1743 .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1744 .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1745 .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1746 });
1747
1748 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1749 P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1750 P_INLINE_DATA(p, begin);
1751 }
1752
1753 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1754 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1755 P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1756 dyn->ia.primitive_restart_enable);
1757 }
1758 }
1759
1760 static void
1761 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1762 {
1763 const struct vk_dynamic_graphics_state *dyn =
1764 &cmd->vk.dynamic_graphics_state;
1765 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1766
1767 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1768 /* The hardware gets grumpy if we set this to 0 so make sure we set it
1769 * to at least 1 in case it's dirty but uninitialized.
1770 */
1771 P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1772 }
1773
1774 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1775 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1776 P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1777 dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1778 }
1779 }
1780
1781 static void
1782 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1783 {
1784 const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1785
1786 const struct vk_dynamic_graphics_state *dyn =
1787 &cmd->vk.dynamic_graphics_state;
1788
1789 struct nv_push *p =
1790 nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1791
1792 /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1793
1794 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1795 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
1796 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE)) {
1797 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1798 const VkViewport *vp = &dyn->vp.viewports[i];
1799
1800 /* These exactly match the spec values. Nvidia hardware oddities
1801 * are accounted for later.
1802 */
1803 const float o_x = vp->x + 0.5f * vp->width;
1804 const float o_y = vp->y + 0.5f * vp->height;
1805 const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1806 vp->minDepth :
1807 (vp->maxDepth + vp->minDepth) * 0.5f;
1808
1809 const float p_x = vp->width;
1810 const float p_y = vp->height;
1811 const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1812 vp->maxDepth - vp->minDepth :
1813 (vp->maxDepth - vp->minDepth) * 0.5f;
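 /* Worked example with illustrative values: a viewport of x = 0, y = 0,
 * width = 800, height = 600, minDepth = 0.0, maxDepth = 1.0 and
 * zero-to-one depth gives o = (400, 300, 0) and p = (800, 600, 1), so
 * the hardware is programmed with scale = (400, 300, 1) and
 * offset = (400, 300, 0) below.
 */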
1814
1815 P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1816 P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1817 P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1818 P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1819
1820 P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1821 P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1822 P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1823
1824 const bool user_defined_range =
1825 dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT;
1826 float xmin = vp->x;
1827 float xmax = vp->x + vp->width;
1828 float ymin = MIN2(vp->y, vp->y + vp->height);
1829 float ymax = MAX2(vp->y, vp->y + vp->height);
1830 float zmin = user_defined_range ?
1831 dyn->vp.depth_clamp_range.minDepthClamp :
1832 MIN2(vp->minDepth, vp->maxDepth);
1833 float zmax = user_defined_range ?
1834 dyn->vp.depth_clamp_range.maxDepthClamp :
1835 MAX2(vp->minDepth, vp->maxDepth);
1836 assert(xmin <= xmax && ymin <= ymax && zmin <= zmax);
1837
1838 const float max_dim = (float)0xffff;
1839 xmin = CLAMP(xmin, 0, max_dim);
1840 xmax = CLAMP(xmax, 0, max_dim);
1841 ymin = CLAMP(ymin, 0, max_dim);
1842 ymax = CLAMP(ymax, 0, max_dim);
1843
1844 if (!dev->vk.enabled_extensions.EXT_depth_range_unrestricted) {
1845 assert(0.0 <= zmin && zmin <= 1.0);
1846 assert(0.0 <= zmax && zmax <= 1.0);
1847 }
1848
1849 P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1850 P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1851 .x0 = xmin,
1852 .width = xmax - xmin,
1853 });
1854 P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1855 .y0 = ymin,
1856 .height = ymax - ymin,
1857 });
1858
1859 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1860 P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1861 P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1862 } else {
1863 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1864 P_INLINE_DATA(p, i);
1865 P_INLINE_DATA(p, fui(zmin));
1866 P_INLINE_DATA(p, fui(zmax));
1867 }
1868
1869 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1870 P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1871 .x = X_POS_X,
1872 .y = Y_POS_Y,
1873 .z = Z_POS_Z,
1874 .w = W_POS_W,
1875 });
1876 }
1877 }
1878 }
1879
1880 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1881 P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1882 dyn->vp.depth_clip_negative_one_to_one ?
1883 RANGE_NEGATIVE_W_TO_POSITIVE_W :
1884 RANGE_ZERO_TO_POSITIVE_W);
1885 }
1886
1887 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1888 for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1889 P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1890 }
1891
1892 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1893 for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1894 const VkRect2D *s = &dyn->vp.scissors[i];
1895
1896 const uint32_t xmin = MIN2(16384, s->offset.x);
1897 const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1898 const uint32_t ymin = MIN2(16384, s->offset.y);
1899 const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1900
1901 P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1902 P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1903 P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1904 .xmin = xmin,
1905 .xmax = xmax,
1906 });
1907 P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1908 .ymin = ymin,
1909 .ymax = ymax,
1910 });
1911 }
1912 }
1913 }
1914
1915 static uint32_t
1916 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1917 {
1918 ASSERTED uint16_t vk_to_nv9097[] = {
1919 [VK_POLYGON_MODE_FILL] = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1920 [VK_POLYGON_MODE_LINE] = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1921 [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1922 };
1923 assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1924
1925 uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
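 /* The arithmetic maps FILL (0) -> 0x1b02, LINE (1) -> 0x1b01 and
 * POINT (2) -> 0x1b00; the ASSERTED table above only exists to check this
 * shortcut against the class header values in debug builds.
 */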
1926 assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1927 return nv9097_mode;
1928 }
1929
1930 static uint32_t
1931 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1932 {
1933 static const uint16_t vk_to_nv9097[] = {
1934 [VK_CULL_MODE_FRONT_BIT] = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1935 [VK_CULL_MODE_BACK_BIT] = NV9097_OGL_SET_CULL_FACE_V_BACK,
1936 [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1937 };
1938 assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1939 return vk_to_nv9097[vk_cull_mode];
1940 }
1941
1942 static uint32_t
1943 vk_to_nv9097_front_face(VkFrontFace vk_face)
1944 {
1945 /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1946 * convention in which framebuffer coordinates always start in the upper
1947 * left while OpenGL has framebuffer coordinates starting in the lower
1948 * left. Therefore, we want the reverse of the hardware enum name.
1949 */
1950 ASSERTED static const uint16_t vk_to_nv9097[] = {
1951 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1952 [VK_FRONT_FACE_CLOCKWISE] = NV9097_OGL_SET_FRONT_FACE_V_CW,
1953 };
1954 assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1955
1956 uint32_t nv9097_face = 0x900 | (1 - vk_face);
1957 assert(nv9097_face == vk_to_nv9097[vk_face]);
1958 return nv9097_face;
1959 }
1960
1961 static uint32_t
1962 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1963 {
1964 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1965 NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1966 STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1967 NV9097_SET_PROVOKING_VERTEX_V_LAST);
1968 return vk_mode;
1969 }
1970
1971 void
1972 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1973 {
1974 struct mme_value vp_idx = mme_load(b);
1975 struct mme_value min_z = mme_load(b);
1976 struct mme_value max_z = mme_load(b);
1977
1978 /* Multiply by 2 because it's an array with stride 8 */
1979 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1980 mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1981 mme_emit(b, min_z);
1982 mme_emit(b, max_z);
1983
1984 struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1985 mme_if(b, ine, z_clamp, mme_zero()) {
1986 /* Multiply by 2 again because this array has stride 16 */
1987 mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1988 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1989 mme_emit(b, min_z);
1990 mme_emit(b, max_z);
1991 }
1992 }
1993
1994 void
1995 nvk_mme_set_z_clamp(struct mme_builder *b)
1996 {
1997 struct mme_value z_clamp = mme_load(b);
1998 struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1999 mme_if(b, ine, z_clamp, old_z_clamp) {
2000 nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
2001
2002 mme_if(b, ine, z_clamp, mme_zero()) {
2003 struct mme_value i_2 = mme_mov(b, mme_zero());
2004 mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
2005 struct mme_value min_z =
2006 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
2007 struct mme_value max_z =
2008 mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
2009
2010 struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
2011 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2012 mme_emit(b, min_z);
2013 mme_emit(b, max_z);
2014
2015 mme_free_reg(b, i_4);
2016 mme_free_reg(b, min_z);
2017 mme_free_reg(b, max_z);
2018
2019 mme_add_to(b, i_2, i_2, mme_imm(2));
2020 }
2021 mme_free_reg(b, i_2);
2022 }
2023 mme_if(b, ieq, z_clamp, mme_zero()) {
2024 struct mme_value i_4 = mme_mov(b, mme_zero());
2025 mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
2026 mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2027 mme_emit(b, mme_imm(fui(-INFINITY)));
2028 mme_emit(b, mme_imm(fui(INFINITY)));
2029
2030 mme_add_to(b, i_4, i_4, mme_imm(4));
2031 }
2032 mme_free_reg(b, i_4);
2033 }
2034 }
2035 }
2036
2037 static void
2038 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
2039 {
2040 const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2041 const struct vk_dynamic_graphics_state *dyn =
2042 &cmd->vk.dynamic_graphics_state;
2043 const struct nvk_rendering_state *render =
2044 &cmd->state.gfx.render;
2045
2046 struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
2047
2048 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
2049 P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
2050
2051 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2052 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2053 const bool z_clamp = dyn->rs.depth_clamp_enable;
2054 const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2055 /* z_clamp_zero_one accounts for the interaction between
2056 * depthClampZeroOne and depthRangeUnrestricted as mentioned in the
2057 * Vulkan spec. depthClampZeroOne adds an additional clamp and doesn't
2058 * modify the clip/clamp threshold. We are expected to clamp to [0,1]
2059 * when any one of these conditions is fulfilled:
2060 * - depth_range_unrestricted is not enabled
2061 * - depthClampZeroOne is enabled but the depth format is
2062 * not floating point or depthRangeUnrestricted is not
2063 * enabled
2064 * - the depth format is fixed point
2065 */
2066 const bool z_clamp_zero_one =
2067 !vk_format_has_float_depth(render->depth_att.vk_format) ||
2068 (dev->vk.enabled_features.depthClampZeroOne &&
2069 !dev->vk.enabled_extensions.EXT_depth_range_unrestricted);
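 /* For example, a fixed-point attachment such as D16_UNORM always takes
 * the [0,1] clamp, while D32_SFLOAT only does so when depthClampZeroOne
 * is enabled without VK_EXT_depth_range_unrestricted.
 */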
2070
2071 P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2072 /* We only set Z clip range if clamp is requested. Otherwise, we
2073 * leave it set to -/+INF and clip using the guardband below.
2074 *
2075 * depthClampZeroOne is independent of normal depth clamping and
2076 * does not modify the clip/clamp threshold. The Vulkan spec
2077 * guarantees that, in the cases where depthClampZeroOne applies,
2078 * the [zmin, zmax] is inside [0, 1]. This means that, if z_clamp
2079 * is enabled, we can just do the regular clamp. If z_clamp is
2080 * disabled and z_clamp_zero_one is enabled then we need to
2081 * apply the [0, 1] clamp.
2082 */
2083 .min_z_zero_max_z_one = (!z_clamp && z_clamp_zero_one)
2084 ? MIN_Z_ZERO_MAX_Z_ONE_TRUE
2085 : MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2086 .z_clip_range = (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A &&
2087 (z_clamp || !z_clamp_zero_one))
2088 ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2089 : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2090 : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2091
2092 .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2093 .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2094
2095 .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2096 .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2097 .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2098 : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2099
2100 /* We clip depth with the geometry clipper to ensure that it gets
2101 * clipped before depth bias is applied. If we leave it up to the
2102 * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2103 * in the pipeline. This can be seen in two different ways:
2104 *
2105 * - When depth bias is enabled, the bias is applied post-clipping.
2106 * If we clip in the rasterizer, it will clip according to the
2107 * post-bias depth which is wrong.
2108 *
2109 * - If the fragment shader overrides the depth by writing to
2110 * gl_FragDepth, it should be clipped according to the original
2111 * geometry, not according to gl_FragDepth.
2112 *
2113 * In order to always get the geometry clipper, we need to set a
2114 * tight guardband (geometry_guardband_z = SCALE_1).
2115 */
2116 .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2117 : GEOMETRY_GUARDBAND_Z_SCALE_256,
2118 });
2119
2120 /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2121 * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2122 * based on whether or not z_clamp is set. This is done by a pair of
2123 * macros, one of which is called here and the other is called in
2124 * viewport setup.
2125 */
2126 if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2127 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2128 P_INLINE_DATA(p, z_clamp);
2129 }
2130 }
2131
2132 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2133 uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2134 P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2135 P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2136 P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2137 }
2138
2139 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2140 P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2141
2142 if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2143 uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2144 P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2145 }
2146 }
2147
2148 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2149 P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2150 vk_to_nv9097_front_face(dyn->rs.front_face));
2151 }
2152
2153 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2154 P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2155 vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2156 }
2157
2158 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2159 P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2160 P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2161 P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2162 P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2163 }
2164
2165 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2166 switch (dyn->rs.depth_bias.representation) {
2167 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2168 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2169 DEPTH_FORMAT_DEPENDENT_TRUE);
2170 break;
2171 case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2172 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2173 DEPTH_FORMAT_DEPENDENT_FALSE);
2174 break;
2175 case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2176 default:
2177 unreachable("Unsupported depth bias representation");
2178 }
2179 /* TODO: The blob multiplies by 2 for some reason. We don't. */
2180 P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant_factor));
2181 P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope_factor));
2182 P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2183 }
2184
2185 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2186 P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2187 P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2188 P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2189 }
2190
2191 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2192 switch (dyn->rs.line.mode) {
2193 case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2194 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2195 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2196 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2197 break;
2198
2199 case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2200 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2201 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2202 break;
2203
2204 case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2205 P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2206 P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2207 break;
2208
2209 default:
2210 unreachable("Invalid line rasterization mode");
2211 }
2212 }
2213
2214 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2215 P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2216
2217 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2218 /* map factor from [1,256] to [0, 255] */
2219 uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2220 P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2221 .factor = stipple_factor,
2222 .pattern = dyn->rs.line.stipple.pattern,
2223 });
2224 }
2225
2226 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2227 P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2228
2229 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2230 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2231 if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2232 assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2233 } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2234 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2235 } else {
2236 uint32_t extra_overestimate =
2237 MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2238
2239 if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2240 P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2241 .extra_prim_bloat = extra_overestimate,
2242 .copy_inner_to_outer =
2243 (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2244 .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2245 .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2246 .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2247 });
2248 } else {
2249 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2250 P_INLINE_DATA(p, extra_overestimate << 23);
2251 }
2252 P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2253 }
2254 }
2255 }
2256
2257 uint32_t
2258 nvk_mme_shading_rate_control_sample_shading(bool sample_shading)
2259 {
2260 return nvk_mme_val_mask((!sample_shading) << 1, 1 << 1);
2261 }
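/* Bit 0 of the shading rate control value means "shading rate enabled" and
 * bit 1 means "sample shading disabled". nvk_mme_set_shading_rate_control()
 * only turns VRS on when both bits are set, which is why the second test
 * case below (bit 1 left clear) ends up disabling it.
 */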
2262
2263 static uint32_t
2264 nvk_mme_shading_rate_control_enable(bool enable)
2265 {
2266 return nvk_mme_val_mask(enable, 1 << 0);
2267 }
2268
2269 void
2270 nvk_mme_set_shading_rate_control(struct mme_builder *b)
2271 {
2272 if (b->devinfo->cls_eng3d < TURING_A)
2273 return;
2274
2275 struct mme_value val_mask = mme_load(b);
2276 struct mme_value old_src = nvk_mme_load_scratch(b, SHADING_RATE_CONTROL);
2277 struct mme_value src = nvk_mme_set_masked(b, old_src, val_mask);
2278 mme_free_reg(b, val_mask);
2279
2280 mme_if(b, ine, src, old_src) {
2281 mme_free_reg(b, old_src);
2282 nvk_mme_store_scratch(b, SHADING_RATE_CONTROL, src);
2283
2284 struct mme_value enable1 = mme_merge(b, mme_zero(), src, 0, 1, 0);
2285 struct mme_value enable2 = mme_merge(b, mme_zero(), src, 0, 1, 1);
2286 struct mme_value enable = mme_and(b, enable1, enable2);
2287
2288 struct mme_value i = mme_mov(b, mme_zero());
2289 mme_while(b, ine, i, mme_imm(16 * 4)) {
2290 mme_mthd_arr(b, NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(0), i);
2291 mme_emit(b, enable);
2292 mme_add_to(b, i, i, mme_imm(4));
2293 }
2294 }
2295 }
2296
2297 static void
2298 nvk_mme_set_shading_rate_control_test_check(
2299 const struct nv_device_info *devinfo,
2300 const struct nvk_mme_test_case *test,
2301 const struct nvk_mme_mthd_data *results)
2302 {
2303 if (devinfo->cls_eng3d < TURING_A)
2304 return;
2305
2306 assert(results[0].mthd == NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL));
2307 bool enable = (results[0].data & 3) == 3;
2308
2309 for (uint32_t i = 0; i < 16; i++) {
2310 assert(results[i + 1].mthd ==
2311 NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(i));
2312 assert(results[i + 1].data == enable);
2313 }
2314 }
2315
2316 const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[] = {{
2317 .init = (struct nvk_mme_mthd_data[]) {
2318 { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2319 { }
2320 },
2321 .params = (uint32_t[]) { 0x00030003 },
2322 .check = nvk_mme_set_shading_rate_control_test_check,
2323 }, {
2324 .init = (struct nvk_mme_mthd_data[]) {
2325 { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2326 { }
2327 },
2328 .params = (uint32_t[]) { 0x00030001 },
2329 .check = nvk_mme_set_shading_rate_control_test_check,
2330 }, {}};
2331
2332 static VkExtent2D
2333 nvk_combine_fs_log2_rates(VkFragmentShadingRateCombinerOpKHR op,
2334 VkExtent2D a_log2, VkExtent2D b_log2)
2335 {
2336 switch (op) {
2337 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2338 return a_log2;
2339
2340 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2341 return b_log2;
2342
2343 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2344 return (VkExtent2D) {
2345 .width = MIN2(a_log2.width, b_log2.width),
2346 .height = MIN2(a_log2.height, b_log2.height),
2347 };
2348
2349 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2350 return (VkExtent2D) {
2351 .width = MAX2(a_log2.width, b_log2.width),
2352 .height = MAX2(a_log2.height, b_log2.height),
2353 };
2354
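 /* Rates are stored as log2 extents, so multiplying two rates is an
 * addition of their log2 values.
 */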
2355 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR:
2356 return (VkExtent2D) {
2357 .width = a_log2.width + b_log2.width,
2358 .height = a_log2.height + b_log2.height,
2359 };
2360
2361 default:
2362 unreachable("Invalid FSR combiner op");
2363 }
2364 }
2365
2366 static uint8_t
2367 vk_to_nvc597_shading_rate_log2(VkExtent2D rate_log2)
2368 {
2369 rate_log2.width = MIN2(rate_log2.width, 2);
2370 rate_log2.height = MIN2(rate_log2.height, 2);
2371 const uint8_t idx = (rate_log2.width << 2) | rate_log2.height;
2372
2373 /* From the Vulkan 1.3.297 spec:
2374 *
2375 * "A fragment shading rate Rxy representing any of Axy, Bxy or Cxy
2376 * is clamped as follows. [...] From this list of supported rates,
2377 * the following steps are applied in order, to select a single
2378 * value:
2379 *
2380 * 1. Keep only rates where Rx' ≤ Rx and Ry' ≤ Ry.
2381 *
2382 * - Implementations may also keep rates where Rx' ≤ Ry and
2383 * Ry' ≤ Rx.
2384 *
2385 * 2. Keep only rates with the highest area (Rx' × Ry').
2386 *
2387 * 3. Keep only rates with the lowest aspect ratio (Rx' + Ry').
2388 *
2389 * 4. In cases where a wide (e.g. 4x1) and tall (e.g. 1x4) rate
2390 * remain, the implementation may choose either rate. However, it
2391 * must choose this rate consistently for the same shading rates,
2392 * render pass transform, and combiner operations for the
2393 * lifetime of the VkDevice.
2394 *
2395 * We have the following rates: 1x1, 2x1, 1x2, 2x2, 4x2, 2x4, 4x4.
2396 */
2397 static const uint8_t vk_to_nvc597[] = {
2398 #define NVC597_FSR(X) NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A_RATE_INDEX0_PS_##X
2399 NVC597_FSR(X1_PER_RASTER_PIXEL),
2400 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS),
2401 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x4 */
2402 NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x8 */
2403 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS),
2404 NVC597_FSR(X1_PER_2X2_RASTER_PIXELS),
2405 NVC597_FSR(X1_PER_2X4_RASTER_PIXELS),
2406 NVC597_FSR(X1_PER_2X4_RASTER_PIXELS), /* 2x8 */
2407 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 4x1 */
2408 NVC597_FSR(X1_PER_4X2_RASTER_PIXELS),
2409 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS),
2410 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 4x8 */
2411 NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 8x1 */
2412 NVC597_FSR(X1_PER_4X2_RASTER_PIXELS), /* 8x2 */
2413 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x4 */
2414 NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x8 */
2415 #undef NVC597_FSR
2416 };
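 /* For example, a combined 4x4-pixel rate (log2 extent 2x2) yields
 * idx = 10 and selects X1_PER_4X4_RASTER_PIXELS; larger extents are first
 * clamped to a log2 of 2 per dimension, so 8x8 resolves to the same entry.
 */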
2417
2418 assert(idx < ARRAY_SIZE(vk_to_nvc597));
2419 return vk_to_nvc597[idx];
2420 }
2421
2422 static void
2423 nvk_flush_fsr_state(struct nvk_cmd_buffer *cmd)
2424 {
2425 const struct vk_dynamic_graphics_state *dyn =
2426 &cmd->vk.dynamic_graphics_state;
2427
2428 if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
2429 assert(vk_fragment_shading_rate_is_disabled(&dyn->fsr));
2430 return;
2431 }
2432
2433 if (!BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
2434 return;
2435
2436 if (vk_fragment_shading_rate_is_disabled(&dyn->fsr)) {
2437 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2438 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2439 P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(false));
2440 } else {
2441 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 16 * 3);
2442
2443 assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.width));
2444 assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.height));
2445 const VkExtent2D state_fs_log2 = {
2446 .width = util_logbase2(dyn->fsr.fragment_size.width),
2447 .height = util_logbase2(dyn->fsr.fragment_size.height),
2448 };
2449
2450 for (uint32_t prim_idx = 0; prim_idx < 16; prim_idx++) {
2451 const VkExtent2D prim_fs_log2 = {
2452 .width = (prim_idx >> 2) & 3,
2453 .height = prim_idx & 3,
2454 };
2455
2456 const VkExtent2D state_prim_fs_log2 =
2457 nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[0],
2458 state_fs_log2, prim_fs_log2);
2459
2460 uint8_t rates[16] = {};
2461 for (uint32_t att_idx = 0; att_idx < 16; att_idx++) {
2462 const VkExtent2D att_fs_log2 = {
2463 .width = (att_idx >> 2) & 3,
2464 .height = att_idx & 3,
2465 };
2466
2467 const VkExtent2D fs_log2 =
2468 nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[1],
2469 state_prim_fs_log2, att_fs_log2);
2470
2471 rates[att_idx] = vk_to_nvc597_shading_rate_log2(fs_log2);
2472 }
2473
2474 P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(prim_idx));
2475 P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(p, prim_idx, {
2476 .rate_index0 = rates[0],
2477 .rate_index1 = rates[1],
2478 .rate_index2 = rates[2],
2479 .rate_index3 = rates[3],
2480 .rate_index4 = rates[4],
2481 .rate_index5 = rates[5],
2482 .rate_index6 = rates[6],
2483 .rate_index7 = rates[7],
2484 });
2485 P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_B(p, prim_idx, {
2486 .rate_index8 = rates[8],
2487 .rate_index9 = rates[9],
2488 .rate_index10 = rates[10],
2489 .rate_index11 = rates[11],
2490 .rate_index12 = rates[12],
2491 .rate_index13 = rates[13],
2492 .rate_index14 = rates[14],
2493 .rate_index15 = rates[15],
2494 });
2495 }
2496
2497 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2498 P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(true));
2499 }
2500 }
2501
2502 static uint32_t
2503 nvk_mme_anti_alias_init(void)
2504 {
2505 /* This is a valid value that we never otherwise set, which ensures the
2506 * macro will actually run the first time we set anything.
2507 */
2508 return 0xf;
2509 }
2510
2511 uint32_t
2512 nvk_mme_anti_alias_min_sample_shading(float mss)
2513 {
2514 /* The value we want to compute in the MME is
2515 *
2516 * passes = next_pow2(samples * minSampleShading)
2517 *
2518 * Since samples is already a power of two,
2519 *
2520 * passes_log2 = log2_ceil(samples * minSampleShading)
2521 * = log2_ceil(samples / (1.0 / minSampleShading))
2522 * = samples_log2 - log2_floor(1.0 / minSampleShading)
2523 *
2524 * if we assume (1.0 / min_sample_shading) >= 1.0. This last bit is
2525 * something we can compute in the MME as long as the float math on the
2526 * right-hand side happens on the CPU.
2527 */
2528 float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2529 uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2530
2531 assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2532
2533 return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2534 }
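/* For example, minSampleShading = 0.5 gives rcp_mss = 2.0 and
 * rcp_mss_log2 = 1; combined with 8 samples (samples_log2 = 3) in
 * nvk_mme_set_anti_alias() below, that yields passes_log2 = 2, i.e. 4
 * shading passes, matching the "8 samples, minSampleShading = 0.5" entry
 * in nvk_mme_set_anti_alias_tests.
 */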
2535
2536 static uint32_t
2537 nvk_mme_anti_alias_samples(uint32_t samples)
2538 {
2539 assert(util_is_power_of_two_or_zero(samples));
2540 const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2541
2542 return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2543 }
2544
2545 void
2546 nvk_mme_set_anti_alias(struct mme_builder *b)
2547 {
2548 struct mme_value val_mask = mme_load(b);
2549 struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2550 struct mme_value anti_alias =
2551 nvk_mme_set_masked(b, old_anti_alias, val_mask);
2552 mme_free_reg(b, val_mask);
2553
2554 mme_if(b, ine, anti_alias, old_anti_alias) {
2555 mme_free_reg(b, old_anti_alias);
2556 nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2557
2558 struct mme_value rcp_mss_log2 =
2559 mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2560 struct mme_value samples_log2 =
2561 mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2562 mme_free_reg(b, anti_alias);
2563
2564 /* We've already done all the hard work on the CPU in
2565 * nvk_mme_anti_alias_min_sample_shading(). All we have to do here is
2566 * subtract the two log2 values and clamp so we don't go negative.
2567 */
2568 struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2569 mme_free_reg(b, rcp_mss_log2);
2570
2571 /* passes = MAX(passes, 1) */
2572 struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
2573 mme_if(b, ine, neg, mme_zero()) {
2574 mme_mov_to(b, passes_log2, mme_zero());
2575 }
2576 mme_free_reg(b, neg);
2577
2578 /*
2579 * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2580 * ...
2581 * .centroid = passes > 1 ? CENTROID_PER_PASS
2582 * : CENTROID_PER_FRAGMENT,
2583 * }
2584 */
2585 struct mme_value aac = mme_mov(b,
2586 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2587 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2588 mme_if(b, ine, passes_log2, mme_zero()) {
2589 mme_mov_to(b, aac,
2590 mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2591 << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2592 }
2593
2594 struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2595 mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2596 mme_free_reg(b, passes);
2597
2598 mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2599 mme_emit(b, aac);
2600 mme_free_reg(b, aac);
2601
2602 /* Now we need to emit sample masks per-sample. Annoyingly, we have to
2603 * pack these in pairs.
2604 */
2605 STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
2606
2607 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2608 mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2609 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2610
2613 struct mme_value samples_per_pass_log2 =
2614 mme_sub(b, samples_log2, passes_log2);
2615 mme_free_reg(b, samples_log2);
2616
2617 mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2618 /* One sample per pass, we can just blast it out */
2619 for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2620 uint32_t mask0 = 1 << i;
2621 uint32_t mask1 = 1 << (i + 1);
2622 mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2623 }
2624 }
2625
2626 mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2627 mme_if(b, ieq, passes_log2, mme_zero()) {
2628 /* It's a single pass so we can use 0xffff */
2629 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++)
2630 mme_emit(b, mme_imm(~0));
2631 }
2632
2633 mme_if(b, ieq, passes_log2, mme_imm(1)) {
2634 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2635 struct mme_value mask =
2636 nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_2PASS_0, i);
2637 mme_emit(b, mask);
2638 mme_free_reg(b, mask);
2639 }
2640 }
2641
2642 mme_if(b, ieq, passes_log2, mme_imm(2)) {
2643 for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2644 struct mme_value mask =
2645 nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_4PASS_0, i);
2646 mme_emit(b, mask);
2647 mme_free_reg(b, mask);
2648 }
2649 }
2650 }
2651 }
2652 }
2653
2654 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2655 /* This case doesn't change the state so it should do nothing */
2656 .init = (struct nvk_mme_mthd_data[]) {
2657 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2658 { }
2659 },
2660 .params = (uint32_t[]) { 0xffff0000 },
2661 .expected = (struct nvk_mme_mthd_data[]) {
2662 { }
2663 },
2664 }, {
2665 /* Single sample, minSampleShading = 1.0 */
2666 .init = (struct nvk_mme_mthd_data[]) {
2667 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2668 { }
2669 },
2670 .params = (uint32_t[]) { 0xffff0000 },
2671 .expected = (struct nvk_mme_mthd_data[]) {
2672 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2673 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2674 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2675 nvk_root_descriptor_offset(draw.sample_masks) },
2676 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2677 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2678 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2679 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2680 { }
2681 },
2682 }, {
2683 /* Single sample, minSampleShading = 0.25 */
2684 .init = (struct nvk_mme_mthd_data[]) {
2685 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2686 { }
2687 },
2688 .params = (uint32_t[]) { 0xffff0002 },
2689 .expected = (struct nvk_mme_mthd_data[]) {
2690 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2691 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2692 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2693 nvk_root_descriptor_offset(draw.sample_masks) },
2694 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2695 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2696 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2697 { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2698 { }
2699 },
2700 }, {
2701 /* 8 samples, minSampleShading = 0.5 */
2702 .init = (struct nvk_mme_mthd_data[]) {
2703 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2704 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_0), 0x030003 },
2705 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_1), 0x0c000c },
2706 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_2), 0x300030 },
2707 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_3), 0xc000c0 },
2708 { }
2709 },
2710 .params = (uint32_t[]) { 0x00f00030 },
2711 .expected = (struct nvk_mme_mthd_data[]) {
2712 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2713 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2714 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2715 nvk_root_descriptor_offset(draw.sample_masks) },
2716 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2717 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2718 { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2719 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2720 { }
2721 },
2722 }, {
2723 /* 8 samples, minSampleShading = 0.25 */
2724 .init = (struct nvk_mme_mthd_data[]) {
2725 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2726 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_0), 0x0f000f },
2727 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_1), 0x0f000f },
2728 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_2), 0xf000f0 },
2729 { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_3), 0xf000f0 },
2730 { }
2731 },
2732 .params = (uint32_t[]) { 0x000f0002 },
2733 .expected = (struct nvk_mme_mthd_data[]) {
2734 { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2735 { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2736 { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2737 nvk_root_descriptor_offset(draw.sample_masks) },
2738 { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2739 { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2740 { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2741 { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2742 { }
2743 },
2744 }, {}};
2745
2746 static VkSampleLocationEXT
2747 vk_sample_location(const struct vk_sample_locations_state *sl,
2748 uint32_t x, uint32_t y, uint32_t s)
2749 {
2750 x = x % sl->grid_size.width;
2751 y = y % sl->grid_size.height;
2752
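 /* For example, with a 2x2 grid and 2 samples per pixel, sample 1 of
 * pixel (1, 0) is locations[(1 + 0 * 2) * 2 + 1], i.e. locations[3].
 */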
2753 return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2754 }
2755
2756 static struct nak_sample_location
2757 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2758 {
2759 return (struct nak_sample_location) {
2760 .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2761 .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2762 };
2763 }
2764
2765 static void
2766 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2767 {
2768 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2769 const struct vk_dynamic_graphics_state *dyn =
2770 &cmd->vk.dynamic_graphics_state;
2771
2772 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2773 /* When we don't have any attachments, we can't know the sample count
2774 * from the render pass so we need to emit SET_ANTI_ALIAS here. See the
2775 * comment in nvk_BeginRendering() for more details.
2776 */
2777 if (render->samples == 0) {
2778 /* Multisample information MAY be missing (rasterizationSamples == 0)
2779 * if rasterizer discard is enabled. However, this isn't valid in
2780 * the hardware so always use at least one sample.
2781 */
2782 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2783 nvk_cmd_set_sample_layout(cmd, nil_choose_sample_layout(samples));
2784 } else {
2785 /* Multisample information MAY be missing (rasterizationSamples == 0)
2786 * if rasterizer discard is enabled.
2787 */
2788 assert(dyn->ms.rasterization_samples == 0 ||
2789 dyn->ms.rasterization_samples == render->samples);
2790 }
2791 }
2792
2793 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2794 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2795 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2796 P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2797 .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2798 .alpha_to_one = dyn->ms.alpha_to_one_enable,
2799 });
2800 }
2801
2802 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2803 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2804 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2805 const struct vk_sample_locations_state *sl;
2806 if (dyn->ms.sample_locations_enable) {
2807 sl = dyn->ms.sample_locations;
2808 } else {
2809 const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2810 sl = vk_standard_sample_locations_state(samples);
2811 }
2812
2813 struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2814 for (uint32_t i = 0; i < sl->per_pixel; i++)
2815 push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2816
2817 nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2818 draw.sample_locations,
2819 0, NVK_MAX_SAMPLES, push_sl);
2820
2821 if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2822 struct nak_sample_location loc[16];
2823 for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2824 const uint32_t s = n % sl->per_pixel;
2825 const uint32_t px = n / sl->per_pixel;
2826 const uint32_t x = px % 2;
2827 const uint32_t y = px / 2;
2828
2829 loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2830 }
2831
2832 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2833
2834 P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2835 for (uint32_t i = 0; i < 4; i++) {
2836 P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2837 .x0 = loc[i * 4 + 0].x_u4,
2838 .y0 = loc[i * 4 + 0].y_u4,
2839 .x1 = loc[i * 4 + 1].x_u4,
2840 .y1 = loc[i * 4 + 1].y_u4,
2841 .x2 = loc[i * 4 + 2].x_u4,
2842 .y2 = loc[i * 4 + 2].y_u4,
2843 .x3 = loc[i * 4 + 3].x_u4,
2844 .y3 = loc[i * 4 + 3].y_u4,
2845 });
2846 }
2847 }
2848 }
2849
2850 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2851 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2852 P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2853 P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2854 P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2855 P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2856 P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2857 }
2858 }
2859
2860 static uint32_t
2861 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2862 {
2863 ASSERTED static const uint16_t vk_to_nv9097[] = {
2864 [VK_COMPARE_OP_NEVER] = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2865 [VK_COMPARE_OP_LESS] = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2866 [VK_COMPARE_OP_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2867 [VK_COMPARE_OP_LESS_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2868 [VK_COMPARE_OP_GREATER] = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2869 [VK_COMPARE_OP_NOT_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2870 [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2871 [VK_COMPARE_OP_ALWAYS] = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2872 };
2873 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2874
2875 uint32_t nv9097_op = 0x200 | vk_op;
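 /* This relies on VkCompareOp and the OGL_* compare funcs sharing the
 * same ordering, e.g. VK_COMPARE_OP_LESS (1) -> 0x201; the ASSERTED table
 * above verifies the shortcut in debug builds.
 */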
2876 assert(nv9097_op == vk_to_nv9097[vk_op]);
2877 return nv9097_op;
2878 }
2879
2880 static uint32_t
2881 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2882 {
2883 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2884 ASSERTED static const uint16_t vk_to_nv9097[] = {
2885 OP(KEEP, D3D_KEEP),
2886 OP(ZERO, D3D_ZERO),
2887 OP(REPLACE, D3D_REPLACE),
2888 OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2889 OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2890 OP(INVERT, D3D_INVERT),
2891 OP(INCREMENT_AND_WRAP, D3D_INCR),
2892 OP(DECREMENT_AND_WRAP, D3D_DECR),
2893 };
2894 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2895 #undef OP
2896
2897 uint32_t nv9097_op = vk_op + 1;
2898 assert(nv9097_op == vk_to_nv9097[vk_op]);
2899 return nv9097_op;
2900 }
2901
2902 static void
2903 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2904 {
2905 struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2906
2907 const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2908 const struct vk_dynamic_graphics_state *dyn =
2909 &cmd->vk.dynamic_graphics_state;
2910
2911 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2912 bool enable = dyn->ds.depth.test_enable &&
2913 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2914 P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2915 }
2916
2917 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2918 bool enable = dyn->ds.depth.write_enable &&
2919 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2920 P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2921 }
2922
2923 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2924 const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2925 P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2926 }
2927
2928 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2929 bool enable = dyn->ds.depth.bounds_test.enable &&
2930 render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2931 P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2932 }
2933
2934 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2935 P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2936 P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2937 P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2938 }
2939
2940 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2941 bool enable = dyn->ds.stencil.test_enable &&
2942 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2943 P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2944 }
2945
2946 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2947 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2948 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2949 P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2950 P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2951 P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2952 P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2953 P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2954
2955 P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2956 P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2957 P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2958 P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2959 P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2960 }
2961
2962 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2963 P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2964 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2965 }
2966
2967 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2968 P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2969 P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2970 }
2971
2972 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2973 P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2974 P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2975 }
2976 }
2977
2978 static uint32_t
2979 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2980 {
2981 ASSERTED uint16_t vk_to_nv9097[] = {
2982 [VK_LOGIC_OP_CLEAR] = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2983 [VK_LOGIC_OP_AND] = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2984 [VK_LOGIC_OP_AND_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2985 [VK_LOGIC_OP_COPY] = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2986 [VK_LOGIC_OP_AND_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2987 [VK_LOGIC_OP_NO_OP] = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2988 [VK_LOGIC_OP_XOR] = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2989 [VK_LOGIC_OP_OR] = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2990 [VK_LOGIC_OP_NOR] = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2991 [VK_LOGIC_OP_EQUIVALENT] = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2992 [VK_LOGIC_OP_INVERT] = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2993 [VK_LOGIC_OP_OR_REVERSE] = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
2994 [VK_LOGIC_OP_COPY_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
2995 [VK_LOGIC_OP_OR_INVERTED] = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
2996 [VK_LOGIC_OP_NAND] = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
2997 [VK_LOGIC_OP_SET] = NV9097_SET_LOGIC_OP_FUNC_V_SET,
2998 };
2999 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3000
3001 uint32_t nv9097_op = 0x1500 | vk_op;
3002 assert(nv9097_op == vk_to_nv9097[vk_op]);
3003 return nv9097_op;
3004 }
3005
3006 static uint32_t
3007 vk_to_nv9097_blend_op(VkBlendOp vk_op)
3008 {
3009 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
3010 ASSERTED uint16_t vk_to_nv9097[] = {
3011 OP(ADD, FUNC_ADD),
3012 OP(SUBTRACT, FUNC_SUBTRACT),
3013 OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
3014 OP(MIN, MIN),
3015 OP(MAX, MAX),
3016 };
3017 assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3018 #undef OP
3019
3020 return vk_to_nv9097[vk_op];
3021 }
3022
3023 static uint32_t
3024 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
3025 {
3026 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
3027 NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
3028 ASSERTED uint16_t vk_to_nv9097[] = {
3029 FACTOR(ZERO, OGL_ZERO),
3030 FACTOR(ONE, OGL_ONE),
3031 FACTOR(SRC_COLOR, OGL_SRC_COLOR),
3032 FACTOR(ONE_MINUS_SRC_COLOR, OGL_ONE_MINUS_SRC_COLOR),
3033 FACTOR(DST_COLOR, OGL_DST_COLOR),
3034 FACTOR(ONE_MINUS_DST_COLOR, OGL_ONE_MINUS_DST_COLOR),
3035 FACTOR(SRC_ALPHA, OGL_SRC_ALPHA),
3036 FACTOR(ONE_MINUS_SRC_ALPHA, OGL_ONE_MINUS_SRC_ALPHA),
3037 FACTOR(DST_ALPHA, OGL_DST_ALPHA),
3038 FACTOR(ONE_MINUS_DST_ALPHA, OGL_ONE_MINUS_DST_ALPHA),
3039 FACTOR(CONSTANT_COLOR, OGL_CONSTANT_COLOR),
3040 FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
3041 FACTOR(CONSTANT_ALPHA, OGL_CONSTANT_ALPHA),
3042 FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
3043 FACTOR(SRC_ALPHA_SATURATE, OGL_SRC_ALPHA_SATURATE),
3044 FACTOR(SRC1_COLOR, OGL_SRC1COLOR),
3045 FACTOR(ONE_MINUS_SRC1_COLOR, OGL_INVSRC1COLOR),
3046 FACTOR(SRC1_ALPHA, OGL_SRC1ALPHA),
3047 FACTOR(ONE_MINUS_SRC1_ALPHA, OGL_INVSRC1ALPHA),
3048 };
3049 assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
3050 #undef FACTOR
3051
3052 return vk_to_nv9097[vk_factor];
3053 }
3054
3055 void
3056 nvk_mme_set_write_mask(struct mme_builder *b)
3057 {
3058 struct mme_value count = mme_load(b);
3059 struct mme_value mask = mme_load(b);
3060
3061 /*
3062 * mask is a bit field
3063 *
3064 * attachment index 88887777666655554444333322221111
3065 * component abgrabgrabgrabgrabgrabgrabgrabgr
3066 */
3067
3068 struct mme_value common_mask = mme_mov(b, mme_imm(1));
3069 struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3070 struct mme_value i = mme_mov(b, mme_zero());
3071
3072 mme_while(b, ine, i, count) {
3073 /*
3074  * We call NV9097_SET_CT_WRITE per attachment. It needs a value as:
3075  * 0x0000 0000 0000 0000 000a 000b 000g 000r
3076  *
3077  * So for i=0 a mask of
3078  * 0x0000 0000 0000 0000 0000 0000 0000 1111
3079  * becomes
3080  * 0x0000 0000 0000 0000 0001 0001 0001 0001
3081  */
3082
3083 struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
3084 mme_merge_to(b, val, val, mask, 4, 1, 1);
3085 mme_merge_to(b, val, val, mask, 8, 1, 2);
3086 mme_merge_to(b, val, val, mask, 12, 1, 3);
3087
3088 mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
3089 mme_emit(b, val);
3090 mme_free_reg(b, val);
3091
3092 /* Check if all masks are common */
3093 struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3094 mme_if(b, ine, first, temp) {
3095 mme_mov_to(b, common_mask, mme_zero());
3096 }
3097 mme_free_reg(b, temp);
3098
3099 mme_srl_to(b, mask, mask, mme_imm(4));
3100
3101 mme_add_to(b, i, i, mme_imm(1));
3102 }
3103
3104 mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
3105 mme_emit(b, common_mask);
3106 }
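
/* A host-side sketch (not driver code) of the per-attachment expansion the
 * macro above performs: each of the four mask bits lands in bit 0 of its own
 * nibble, matching the R/G/B/A enable fields of NV9097_SET_CT_WRITE.  The
 * helper name is illustrative and assumes util/macros.h's UNUSED is visible
 * here.
 */
UNUSED static inline uint32_t
example_expand_ct_write_nibble(uint32_t rgba_mask)
{
   uint32_t val = 0;
   for (unsigned c = 0; c < 4; c++)
      val |= ((rgba_mask >> c) & 1) << (4 * c);
   return val; /* e.g. 0xf -> 0x1111, 0x5 -> 0x0101 */
}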
3107
3108 static void
3109 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
3110 {
3111 struct nvk_rendering_state *render = &cmd->state.gfx.render;
3112 const struct vk_dynamic_graphics_state *dyn =
3113 &cmd->vk.dynamic_graphics_state;
3114
3115 struct nv_push *p =
3116 nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
3117
3118 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
3119 P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
3120
3121 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
3122 const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
3123 P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
3124 }
3125
3126 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
3127 for (uint8_t a = 0; a < render->color_att_count; a++) {
3128 P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
3129 }
3130 }
3131
3132 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
3133 for (uint8_t a = 0; a < render->color_att_count; a++) {
3134 const struct vk_color_blend_attachment_state *att =
3135 &dyn->cb.attachments[a];
3136 P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
3137 P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
3138 P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
3139 vk_to_nv9097_blend_op(att->color_blend_op));
3140 P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
3141 vk_to_nv9097_blend_factor(att->src_color_blend_factor));
3142 P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
3143 vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
3144 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
3145 vk_to_nv9097_blend_op(att->alpha_blend_op));
3146 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
3147 vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
3148 P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
3149 vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
3150 }
3151 }
3152
3153 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
3154 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
3155 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
3156 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3157 uint32_t color_write_enables = 0x0;
3158 for (uint8_t a = 0; a < render->color_att_count; a++) {
3159 if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
3160 color_write_enables |= 0xf << (4 * a);
3161 }
3162
3163 uint32_t cb_att_write_mask = 0x0;
3164 for (uint8_t a = 0; a < render->color_att_count; a++)
3165 cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
3166
3167 uint32_t rp_att_write_mask = 0x0;
3168 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3169 if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
3170 rp_att_write_mask |= 0xf << (4 * a);
3171 }
3172
3173 uint32_t att_has_loc_mask = 0x0;
3174 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3175 if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
3176 att_has_loc_mask |= 0xf << (4 * a);
3177 }
3178
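/* Each of the four masks above is one nibble per attachment, so ANDing
 * them below means a component only gets written when the attachment's
 * write mask allows it, color writes are enabled for that attachment, the
 * attachment is part of this render pass, and it is mapped to an output
 * location.
 */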
3179 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
3180 P_INLINE_DATA(p, render->color_att_count);
3181 P_INLINE_DATA(p, color_write_enables &
3182 cb_att_write_mask &
3183 rp_att_write_mask &
3184 att_has_loc_mask);
3185 }
3186
3187 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3188 int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
3189 uint8_t max_loc = 0;
3190 uint32_t att_used = 0;
3191 for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3192 if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
3193 continue;
3194
3195 att_used |= BITFIELD_BIT(a);
3196
3197 assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
3198 loc_att[dyn->cal.color_map[a]] = a;
3199 max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
3200 }
3201
3202 for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
3203 if (loc_att[l] >= 0)
3204 continue;
3205
3206 /* Just grab any color attachment. The way we set up color targets
3207 * in BeginRendering ensures that every color target is either the
3208 * valid color target referenced by this render pass or a valid NULL
3209 * target. If we end up mapping to some other target in this render
3210 * pass, the handling of att_has_loc_mask above will ensure that no
3211 * color writes actually happen.
3212 */
3213 uint8_t a = ffs(~att_used) - 1;
3214 att_used |= BITFIELD_BIT(a);
3215 loc_att[l] = a;
3216 }
3217
3218 P_IMMD(p, NV9097, SET_CT_SELECT, {
3219 .target_count = max_loc + 1,
3220 .target0 = loc_att[0],
3221 .target1 = loc_att[1],
3222 .target2 = loc_att[2],
3223 .target3 = loc_att[3],
3224 .target4 = loc_att[4],
3225 .target5 = loc_att[5],
3226 .target6 = loc_att[6],
3227 .target7 = loc_att[7],
3228 });
3229 }
3230
3231 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3232 P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
3233 P_NV9097_SET_BLEND_CONST_RED(p, fui(dyn->cb.blend_constants[0]));
3234 P_NV9097_SET_BLEND_CONST_GREEN(p, fui(dyn->cb.blend_constants[1]));
3235 P_NV9097_SET_BLEND_CONST_BLUE(p, fui(dyn->cb.blend_constants[2]));
3236 P_NV9097_SET_BLEND_CONST_ALPHA(p, fui(dyn->cb.blend_constants[3]));
3237 }
3238 }
3239
3240 void
3241 nvk_cmd_flush_gfx_dynamic_state(struct nvk_cmd_buffer *cmd)
3242 {
3243 struct vk_dynamic_graphics_state *dyn =
3244 &cmd->vk.dynamic_graphics_state;
3245
3246 if (!vk_dynamic_graphics_state_any_dirty(dyn))
3247 return;
3248
3249 nvk_flush_vi_state(cmd);
3250 nvk_flush_ia_state(cmd);
3251 nvk_flush_ts_state(cmd);
3252 nvk_flush_vp_state(cmd);
3253 nvk_flush_rs_state(cmd);
3254 nvk_flush_fsr_state(cmd);
3255 nvk_flush_ms_state(cmd);
3256 nvk_flush_ds_state(cmd);
3257 nvk_flush_cb_state(cmd);
3258
3259 vk_dynamic_graphics_state_clear_dirty(dyn);
3260 }
3261
3262 void
3263 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
3264 {
3265 /* First 4 bits are group, later bits are slot */
3266 struct mme_value group_slot = mme_load(b);
3267
3268 struct mme_value addr_lo, addr_hi, size;
3269 if (nvk_use_bindless_cbuf(b->devinfo)) {
3270 if (b->devinfo->cls_eng3d >= TURING_A) {
3271 struct mme_value64 addr = mme_load_addr64(b);
3272 mme_tu104_read_fifoed(b, addr, mme_imm(2));
3273 }
3274
3275 /* Load the descriptor */
3276 struct mme_value desc_lo = mme_load(b);
3277 struct mme_value desc_hi = mme_load(b);
3278
3279 /* The bottom 45 bits are addr >> 4 */
3280 addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
3281 addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
3282 mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
3283
3284 /* The top 19 bits are size >> 4 */
3285 size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
3286
3287 mme_free_reg(b, desc_hi);
3288 mme_free_reg(b, desc_lo);
3289 } else {
3290 if (b->devinfo->cls_eng3d >= TURING_A) {
3291 struct mme_value64 addr = mme_load_addr64(b);
3292 mme_tu104_read_fifoed(b, addr, mme_imm(3));
3293 }
3294
3295 /* Load the descriptor */
3296 addr_lo = mme_load(b);
3297 addr_hi = mme_load(b);
3298 size = mme_load(b);
3299 }
3300
3301 struct mme_value cb = mme_alloc_reg(b);
3302 mme_if(b, ieq, size, mme_zero()) {
3303 /* Bottom bit is the valid bit, 8:4 are shader slot */
3304 mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3305 }
3306
3307 mme_if(b, ine, size, mme_zero()) {
3308 /* size = MIN2(size, NVK_MAX_CBUF_SIZE) */
3309 assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3310 struct mme_value is_large =
3311 mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3312 mme_if(b, ine, is_large, mme_zero()) {
3313 mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3314 }
3315
3316 mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3317 mme_emit(b, size);
3318 mme_emit(b, addr_hi);
3319 mme_emit(b, addr_lo);
3320
3321 /* Bottom bit is the valid bit, 8:4 are shader slot */
3322 mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3323 }
3324
3325 mme_free_reg(b, addr_hi);
3326 mme_free_reg(b, addr_lo);
3327 mme_free_reg(b, size);
3328
3329 /* The group comes in the bottom 4 bits in group_slot and we need to
3330 * combine it with the method. However, unlike most array methods with a
3331 * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3332 * dwords. This means we need to also shift by 3.
3333 */
3334 struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
3335 mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3336 mme_emit(b, cb);
3337 }
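
/* For reference, a host-side sketch (illustrative only, not used by the
 * driver) of the unpacking the mme_merge() calls in nvk_mme_bind_cbuf_desc
 * perform on the packed bindless cbuf descriptor: bits [0, 44] hold
 * addr >> 4 and bits [45, 63] hold size >> 4, both in 16 B units.  Assumes
 * util/macros.h's UNUSED is visible here.
 */
UNUSED static inline void
example_unpack_bindless_cbuf(uint64_t desc, uint64_t *addr_out,
                             uint32_t *size_out)
{
   *addr_out = (desc & ((UINT64_C(1) << 45) - 1)) << 4;
   *size_out = (uint32_t)((desc >> 45) << 4);
}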
3338
3339 void
3340 nvk_cmd_flush_gfx_cbufs(struct nvk_cmd_buffer *cmd)
3341 {
3342 struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3343 struct nvk_physical_device *pdev = nvk_device_physical(dev);
3344 const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3345 struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3346
3347 /* Find cbuf maps for the 5 cbuf groups */
3348 const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3349 for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3350 const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3351 if (shader == NULL)
3352 continue;
3353
3354 uint32_t group = nvk_cbuf_binding_for_stage(stage);
3355 assert(group < ARRAY_SIZE(cbuf_shaders));
3356 cbuf_shaders[group] = shader;
3357 }
3358
3359 bool bound_any_cbuf = false;
3360 for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3361 if (cbuf_shaders[g] == NULL)
3362 continue;
3363
3364 const struct nvk_shader *shader = cbuf_shaders[g];
3365 const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3366 struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3367
3368 /* We only bother to re-bind cbufs that are in use */
3369 const uint32_t rebind =
3370 group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3371 if (!rebind)
3372 continue;
3373
3374 u_foreach_bit(c, rebind) {
3375 const struct nvk_cbuf *cbuf = &group->cbufs[c];
3376
3377 /* We bind these at the very end */
3378 if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3379 continue;
3380
3381 bound_any_cbuf = true;
3382
3383 struct nvk_buffer_address ba;
3384 if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3385 assert(ba.base_addr % min_cbuf_alignment == 0);
3386 ba.size = align(ba.size, min_cbuf_alignment);
3387 ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3388
3389 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3390
3391 if (ba.size > 0) {
3392 P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3393 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3394 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3395 P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3396 }
3397
3398 P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3399 .valid = ba.size > 0,
3400 .shader_slot = c,
3401 });
3402 } else {
3403 uint64_t desc_addr =
3404 nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3405
3406 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3407 struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3408
3409 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3410 P_INLINE_DATA(p, g | (c << 4));
3411 P_INLINE_DATA(p, desc_addr >> 32);
3412 P_INLINE_DATA(p, desc_addr);
3413 } else {
3414 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3415
3416 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3417 P_INLINE_DATA(p, g | (c << 4));
3418
3419 nv_push_update_count(p, 3);
3420 nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3421 }
3422 }
3423 }
3424
3425 group->dirty &= ~rebind;
3426 }
3427
3428 /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3429 * always left pointing at the root descriptor table. This way draw
3430 * parameters and similar MME root table updates always hit the root
3431 * descriptor table and not some random UBO.
3432 */
3433 if (bound_any_cbuf) {
3434 struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3435 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3436 P_INLINE_DATA(p, 0);
3437 }
3438 }
3439
3440 static void
3441 nvk_cmd_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3442 {
3443 nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
3444 nvk_cmd_flush_gfx_dynamic_state(cmd);
3445 nvk_cmd_flush_gfx_shaders(cmd);
3446 nvk_cmd_flush_gfx_cbufs(cmd);
3447 }
3448
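/* MME macro for binding an index buffer.
 *
 * Parameters (pushed by nvk_CmdBindIndexBuffer2KHR below): index buffer
 * address (hi, lo), size in bytes, and the index format, which may be
 * either a VkIndexType or one of the DXGI_FORMAT_R*_UINT values listed
 * below since the two enums do not overlap.  A zero address is treated as
 * a zero-sized buffer and, pre-Turing, redirected to the zero address
 * stored in MME scratch.
 */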
3449 void
3450 nvk_mme_bind_ib(struct mme_builder *b)
3451 {
3452 struct mme_value64 addr = mme_load_addr64(b);
3453 struct mme_value size_B = mme_load(b);
3454
3455 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3456 mme_if(b, ieq, addr_or, mme_zero()) {
3457 mme_mov_to(b, size_B, mme_zero());
3458 }
3459 mme_free_reg(b, addr_or);
3460
3461 if (b->devinfo->cls_eng3d < TURING_A) {
3462 mme_if(b, ieq, size_B, mme_zero()) {
3463 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3464 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3465 }
3466 }
3467
3468 mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3469 mme_emit(b, addr.hi);
3470 mme_emit(b, addr.lo);
3471
3472 if (b->devinfo->cls_eng3d >= TURING_A) {
3473 mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3474 mme_emit(b, mme_zero());
3475 mme_emit(b, size_B);
3476 } else {
3477 /* Convert to an end address */
3478 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3479 mme_add64_to(b, addr, addr, mme_imm64(-1));
3480
3481 mme_mthd(b, NV9097_SET_INDEX_BUFFER_C);
3482 mme_emit(b, addr.hi);
3483 mme_emit(b, addr.lo);
3484 }
3485 mme_free_reg64(b, addr);
3486 mme_free_reg(b, size_B);
3487
3488 struct mme_value fmt = mme_load(b);
3489 struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3490 struct mme_value index_type = mme_mov(b,
3491 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3492
3493 /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3494 * time with one MME macro.
3495 */
3496 UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3497 static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3498 static const uint32_t DXGI_FORMAT_R8_UINT = 62;
3499
3500 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3501 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3502 mme_mov_to(b, index_type,
3503 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3504 }
3505
3506 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3507 mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3508 mme_mov_to(b, index_type,
3509 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3510 }
3511
3512 mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3513 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3514 mme_mov_to(b, index_type,
3515 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3516 }
3517
3518 mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3519 mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3520 mme_mov_to(b, index_type,
3521 mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3522 }
3523
3524 mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3525 mme_emit(b, restart);
3526
3527 mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3528 mme_emit(b, index_type);
3529 }
3530
3531 VKAPI_ATTR void VKAPI_CALL
3532 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3533 VkBuffer _buffer,
3534 VkDeviceSize offset,
3535 VkDeviceSize size,
3536 VkIndexType indexType)
3537 {
3538 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3539 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3540 struct nvk_addr_range addr_range =
3541 nvk_buffer_addr_range(buffer, offset, size);
3542
3543 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3544 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3545 P_INLINE_DATA(p, addr_range.addr >> 32);
3546 P_INLINE_DATA(p, addr_range.addr);
3547 assert(addr_range.range <= UINT32_MAX);
3548 P_INLINE_DATA(p, addr_range.range);
3549 P_INLINE_DATA(p, indexType);
3550 }
3551
3552 void
3553 nvk_mme_bind_vb(struct mme_builder *b)
3554 {
3555 struct mme_value vb_idx = mme_load(b);
3556 struct mme_value64 addr = mme_load_addr64(b);
3557 struct mme_value size_B = mme_load(b);
3558
3559 struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3560 mme_if(b, ieq, addr_or, mme_zero()) {
3561 mme_mov_to(b, size_B, mme_zero());
3562 }
3563 mme_free_reg(b, addr_or);
3564
3565 if (b->devinfo->cls_eng3d < TURING_A) {
3566 mme_if(b, ieq, size_B, mme_zero()) {
3567 nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3568 nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3569 }
3570 }
3571
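/* The NV9097_SET_VERTEX_STREAM_A_* group is 4 dwords per stream, so the
 * method array index is vb_idx * 4.  The Turing SET_VERTEX_STREAM_SIZE_*
 * and pre-Turing SET_VERTEX_STREAM_LIMIT_* arrays are 2 dwords per
 * stream, hence the vb_idx * 2 shifts below.
 */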
3572 struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3573 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3574 mme_free_reg(b, vb_idx4);
3575 mme_emit(b, addr.hi);
3576 mme_emit(b, addr.lo);
3577
3578 if (b->devinfo->cls_eng3d >= TURING_A) {
3579 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3580 mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3581 mme_emit(b, mme_zero());
3582 mme_emit(b, size_B);
3583 } else {
3584 /* Convert to an end address */
3585 mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3586 mme_add64_to(b, addr, addr, mme_imm64(-1));
3587
3588 struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3589 mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3590 mme_emit(b, addr.hi);
3591 mme_emit(b, addr.lo);
3592 }
3593 }
3594
3595 static void
3596 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3597 const struct nvk_mme_test_case *test,
3598 const struct nvk_mme_mthd_data *results)
3599 {
3600 const uint32_t vb_idx = test->params[0];
3601 const uint32_t addr_hi = test->params[1];
3602 const uint32_t addr_lo = test->params[2];
3603
3604 uint32_t size_B = test->params[3];
3605 if (addr_hi == 0 && addr_lo == 0)
3606 size_B = 0;
3607
3608 assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3609 assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3610
3611 if (devinfo->cls_eng3d >= TURING_A) {
3612 assert(results[0].data == addr_hi);
3613 assert(results[1].data == addr_lo);
3614
3615 assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(3));
3616 assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(3));
3617 assert(results[2].data == 0);
3618 assert(results[3].data == size_B);
3619 } else {
3620 uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3621 if (size_B == 0)
3622 addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3623
3624 assert(results[0].data == addr >> 32);
3625 assert(results[1].data == (uint32_t)addr);
3626
3627 const uint64_t limit = (addr + size_B) - 1;
3628 assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(3));
3629 assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(3));
3630 assert(results[2].data == limit >> 32);
3631 assert(results[3].data == (uint32_t)limit);
3632 }
3633 }
3634
3635 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3636 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3637 .check = nvk_mme_bind_vb_test_check,
3638 }, {
3639 .init = (struct nvk_mme_mthd_data[]) {
3640 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3641 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3642 { }
3643 },
3644 .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3645 .check = nvk_mme_bind_vb_test_check,
3646 }, {
3647 .init = (struct nvk_mme_mthd_data[]) {
3648 { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3649 { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3650 { }
3651 },
3652 .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3653 .check = nvk_mme_bind_vb_test_check,
3654 }, {}};
3655
3656 void
3657 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3658 struct nvk_addr_range addr_range)
3659 {
3660 /* Used for meta save/restore */
3661 if (vb_idx == 0)
3662 cmd->state.gfx.vb0 = addr_range;
3663
3664 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3665 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3666 P_INLINE_DATA(p, vb_idx);
3667 P_INLINE_DATA(p, addr_range.addr >> 32);
3668 P_INLINE_DATA(p, addr_range.addr);
3669 assert(addr_range.range <= UINT32_MAX);
3670 P_INLINE_DATA(p, addr_range.range);
3671 }
3672
3673 VKAPI_ATTR void VKAPI_CALL
3674 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3675 uint32_t firstBinding,
3676 uint32_t bindingCount,
3677 const VkBuffer *pBuffers,
3678 const VkDeviceSize *pOffsets,
3679 const VkDeviceSize *pSizes,
3680 const VkDeviceSize *pStrides)
3681 {
3682 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3683
3684 if (pStrides) {
3685 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3686 bindingCount, pStrides);
3687 }
3688
3689 for (uint32_t i = 0; i < bindingCount; i++) {
3690 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3691 uint32_t idx = firstBinding + i;
3692
3693 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3694 const struct nvk_addr_range addr_range =
3695 nvk_buffer_addr_range(buffer, pOffsets[i], size);
3696
3697 nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3698 }
3699 }
3700
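/* Writes val to the given shadowed method and mirrors it into the root
 * descriptor table (cb0) at cb0_offset via LOAD_CONSTANT_BUFFER.  This
 * relies on CONSTANT_BUFFER_SELECTOR still pointing at cb0, which is why
 * cbuf binding always finishes with NVK_MME_SELECT_CB0 (see
 * nvk_cmd_flush_gfx_cbufs above).  On Turing+ we read the shadow state
 * first and skip both writes when the value is unchanged.
 */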
3701 static void
3702 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3703 uint16_t cb0_offset,
3704 uint16_t mthd,
3705 struct mme_value val)
3706 {
3707 if (b->devinfo->cls_eng3d >= TURING_A) {
3708 struct mme_value old = mme_state(b, mthd);
3709 mme_if(b, ine, old, val) {
3710 mme_mthd(b, mthd);
3711 mme_emit(b, val);
3712
3713 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3714 mme_emit(b, mme_imm(cb0_offset));
3715 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3716 mme_emit(b, val);
3717 }
3718 mme_free_reg(b, old);
3719 } else {
3720 /* Fermi is really tight on registers. Don't bother with the if and set
3721 * both unconditionally for now.
3722 */
3723 mme_mthd(b, mthd);
3724 mme_emit(b, val);
3725
3726 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3727 mme_emit(b, mme_imm(cb0_offset));
3728 mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3729 mme_emit(b, val);
3730 }
3731 }
3732
3733 static void
3734 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3735 uint16_t cb0_offset,
3736 enum nvk_mme_scratch scratch,
3737 struct mme_value val)
3738 {
3739 const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3740 nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3741 }
3742
3743 struct mme_draw_params {
3744 struct mme_value base_vertex;
3745 struct mme_value first_vertex;
3746 struct mme_value first_instance;
3747 struct mme_value draw_index;
3748 };
3749
3750 static void
3751 nvk_mme_build_set_draw_params(struct mme_builder *b,
3752 const struct mme_draw_params *p)
3753 {
3754 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3755 NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3756 p->first_vertex);
3757 nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3758 NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3759 p->first_instance);
3760 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3761 NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3762 p->draw_index);
3763 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3764 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3765 mme_zero());
3766
3767 mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3768 mme_emit(b, p->base_vertex);
3769 mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3770 mme_emit(b, p->base_vertex);
3771 }
3772
3773 static void
3774 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3775 {
3776 /* Set the push constant */
3777 nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3778 NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3779 view_index);
3780
3781 /* Set the layer to the view index */
3782 STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3783 STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3784 mme_mthd(b, NV9097_SET_RT_LAYER);
3785 mme_emit(b, view_index);
3786 }
3787
3788 static void
3789 nvk_mme_build_draw_loop(struct mme_builder *b,
3790 struct mme_value instance_count,
3791 struct mme_value first_vertex,
3792 struct mme_value vertex_count)
3793 {
3794 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3795
3796 mme_loop(b, instance_count) {
3797 mme_mthd(b, NV9097_BEGIN);
3798 mme_emit(b, begin);
3799
3800 mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3801 mme_emit(b, first_vertex);
3802 mme_emit(b, vertex_count);
3803
3804 mme_mthd(b, NV9097_END);
3805 mme_emit(b, mme_zero());
3806
3807 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3808 }
3809
3810 mme_free_reg(b, begin);
3811 }
3812
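/* Emits the draws for one VkDrawIndirectCommand worth of parameters.
 *
 * With a zero VIEW_MASK this is a single instanced draw loop.  Otherwise
 * we walk all 32 possible views and, for each bit set in the mask, program
 * the view index (push constant and RT layer) and replay the instanced
 * draw loop, roughly the MME equivalent of:
 *
 *    u_foreach_bit(view, view_mask) {
 *       emit_view_index(view);
 *       draw_instances(...);
 *    }
 *
 * VIEW_MASK is re-read from scratch each iteration, presumably to keep MME
 * register pressure down.
 */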
3813 static void
3814 nvk_mme_build_draw(struct mme_builder *b,
3815 struct mme_value draw_index)
3816 {
3817 /* These are in VkDrawIndirectCommand order */
3818 struct mme_value vertex_count = mme_load(b);
3819 struct mme_value instance_count = mme_load(b);
3820 struct mme_value first_vertex = mme_load(b);
3821 struct mme_value first_instance = mme_load(b);
3822
3823 struct mme_draw_params params = {
3824 .first_vertex = first_vertex,
3825 .first_instance = first_instance,
3826 .draw_index = draw_index,
3827 };
3828 nvk_mme_build_set_draw_params(b, &params);
3829
3830 mme_free_reg(b, first_instance);
3831
3832 if (b->devinfo->cls_eng3d < TURING_A)
3833 nvk_mme_spill(b, DRAW_IDX, draw_index);
3834
3835 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3836 mme_if(b, ieq, view_mask, mme_zero()) {
3837 mme_free_reg(b, view_mask);
3838
3839 nvk_mme_build_draw_loop(b, instance_count,
3840 first_vertex, vertex_count);
3841 }
3842
3843 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3844 mme_if(b, ine, view_mask, mme_zero()) {
3845 mme_free_reg(b, view_mask);
3846
3847 struct mme_value view = mme_mov(b, mme_zero());
3848 mme_while(b, ine, view, mme_imm(32)) {
3849 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3850 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3851 mme_free_reg(b, view_mask);
3852 mme_if(b, ine, has_view, mme_zero()) {
3853 mme_free_reg(b, has_view);
3854 nvk_mme_emit_view_index(b, view);
3855 nvk_mme_build_draw_loop(b, instance_count,
3856 first_vertex, vertex_count);
3857 }
3858
3859 mme_add_to(b, view, view, mme_imm(1));
3860 }
3861 mme_free_reg(b, view);
3862 }
3863
3864 mme_free_reg(b, instance_count);
3865 mme_free_reg(b, first_vertex);
3866 mme_free_reg(b, vertex_count);
3867
3868 if (b->devinfo->cls_eng3d < TURING_A)
3869 nvk_mme_unspill(b, DRAW_IDX, draw_index);
3870 }
3871
3872 void
3873 nvk_mme_draw(struct mme_builder *b)
3874 {
3875 struct mme_value draw_index = mme_load(b);
3876 nvk_mme_build_draw(b, draw_index);
3877 }
3878
3879 VKAPI_ATTR void VKAPI_CALL
3880 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3881 uint32_t vertexCount,
3882 uint32_t instanceCount,
3883 uint32_t firstVertex,
3884 uint32_t firstInstance)
3885 {
3886 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3887
3888 nvk_cmd_flush_gfx_state(cmd);
3889
3890 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3891 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3892 P_INLINE_DATA(p, 0 /* draw_index */);
3893 P_INLINE_DATA(p, vertexCount);
3894 P_INLINE_DATA(p, instanceCount);
3895 P_INLINE_DATA(p, firstVertex);
3896 P_INLINE_DATA(p, firstInstance);
3897 }
3898
3899 VKAPI_ATTR void VKAPI_CALL
3900 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3901 uint32_t drawCount,
3902 const VkMultiDrawInfoEXT *pVertexInfo,
3903 uint32_t instanceCount,
3904 uint32_t firstInstance,
3905 uint32_t stride)
3906 {
3907 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3908
3909 nvk_cmd_flush_gfx_state(cmd);
3910
3911 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3912 struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3913 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3914 P_INLINE_DATA(p, draw_index);
3915 P_INLINE_DATA(p, pVertexInfo->vertexCount);
3916 P_INLINE_DATA(p, instanceCount);
3917 P_INLINE_DATA(p, pVertexInfo->firstVertex);
3918 P_INLINE_DATA(p, firstInstance);
3919
3920 pVertexInfo = ((void *)pVertexInfo) + stride;
3921 }
3922 }
3923
3924 static void
3925 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
3926 struct mme_value instance_count,
3927 struct mme_value first_index,
3928 struct mme_value index_count)
3929 {
3930 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3931
3932 mme_loop(b, instance_count) {
3933 mme_mthd(b, NV9097_BEGIN);
3934 mme_emit(b, begin);
3935
3936 mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
3937 mme_emit(b, first_index);
3938 mme_emit(b, index_count);
3939
3940 mme_mthd(b, NV9097_END);
3941 mme_emit(b, mme_zero());
3942
3943 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3944 }
3945
3946 mme_free_reg(b, begin);
3947 }
3948
3949 static void
3950 nvk_mme_build_draw_indexed(struct mme_builder *b,
3951 struct mme_value draw_index)
3952 {
3953 /* These are in VkDrawIndexedIndirectCommand order */
3954 struct mme_value index_count = mme_load(b);
3955 struct mme_value instance_count = mme_load(b);
3956 struct mme_value first_index = mme_load(b);
3957 struct mme_value vertex_offset = mme_load(b);
3958 struct mme_value first_instance = mme_load(b);
3959
3960 struct mme_draw_params params = {
3961 .base_vertex = vertex_offset,
3962 .first_vertex = vertex_offset,
3963 .first_instance = first_instance,
3964 .draw_index = draw_index,
3965 };
3966 nvk_mme_build_set_draw_params(b, &params);
3967
3968 mme_free_reg(b, vertex_offset);
3969 mme_free_reg(b, first_instance);
3970
3971 if (b->devinfo->cls_eng3d < TURING_A)
3972 nvk_mme_spill(b, DRAW_IDX, draw_index);
3973
3974 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3975 mme_if(b, ieq, view_mask, mme_zero()) {
3976 mme_free_reg(b, view_mask);
3977
3978 nvk_mme_build_draw_indexed_loop(b, instance_count,
3979 first_index, index_count);
3980 }
3981
3982 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3983 mme_if(b, ine, view_mask, mme_zero()) {
3984 mme_free_reg(b, view_mask);
3985
3986 struct mme_value view = mme_mov(b, mme_zero());
3987 mme_while(b, ine, view, mme_imm(32)) {
3988 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3989 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3990 mme_free_reg(b, view_mask);
3991 mme_if(b, ine, has_view, mme_zero()) {
3992 mme_free_reg(b, has_view);
3993 nvk_mme_emit_view_index(b, view);
3994 nvk_mme_build_draw_indexed_loop(b, instance_count,
3995 first_index, index_count);
3996 }
3997
3998 mme_add_to(b, view, view, mme_imm(1));
3999 }
4000 mme_free_reg(b, view);
4001 }
4002
4003 mme_free_reg(b, instance_count);
4004 mme_free_reg(b, first_index);
4005 mme_free_reg(b, index_count);
4006
4007 if (b->devinfo->cls_eng3d < TURING_A)
4008 nvk_mme_unspill(b, DRAW_IDX, draw_index);
4009 }
4010
4011 void
4012 nvk_mme_draw_indexed(struct mme_builder *b)
4013 {
4014 struct mme_value draw_index = mme_load(b);
4015 nvk_mme_build_draw_indexed(b, draw_index);
4016 }
4017
4018 VKAPI_ATTR void VKAPI_CALL
4019 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4020 uint32_t indexCount,
4021 uint32_t instanceCount,
4022 uint32_t firstIndex,
4023 int32_t vertexOffset,
4024 uint32_t firstInstance)
4025 {
4026 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4027
4028 nvk_cmd_flush_gfx_state(cmd);
4029
4030 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4031 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4032 P_INLINE_DATA(p, 0 /* draw_index */);
4033 P_INLINE_DATA(p, indexCount);
4034 P_INLINE_DATA(p, instanceCount);
4035 P_INLINE_DATA(p, firstIndex);
4036 P_INLINE_DATA(p, vertexOffset);
4037 P_INLINE_DATA(p, firstInstance);
4038 }
4039
4040 VKAPI_ATTR void VKAPI_CALL
4041 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
4042 uint32_t drawCount,
4043 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
4044 uint32_t instanceCount,
4045 uint32_t firstInstance,
4046 uint32_t stride,
4047 const int32_t *pVertexOffset)
4048 {
4049 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4050
4051 nvk_cmd_flush_gfx_state(cmd);
4052
4053 for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
4054 const uint32_t vertex_offset =
4055 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
4056
4057 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4058 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4059 P_INLINE_DATA(p, draw_index);
4060 P_INLINE_DATA(p, pIndexInfo->indexCount);
4061 P_INLINE_DATA(p, instanceCount);
4062 P_INLINE_DATA(p, pIndexInfo->firstIndex);
4063 P_INLINE_DATA(p, vertex_offset);
4064 P_INLINE_DATA(p, firstInstance);
4065
4066 pIndexInfo = ((void *)pIndexInfo) + stride;
4067 }
4068 }
4069
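/* MME macro for vkCmdDrawIndirect.
 *
 * On Turing+ the parameters are the indirect buffer address (hi, lo), the
 * draw count, and the stride; each VkDrawIndirectCommand is pulled in
 * through the MME FIFO with mme_tu104_read_fifoed().  Pre-Turing there is
 * no FIFO read, so the caller instead passes the draw count and the number
 * of padding dwords per record (DRAW_PAD_DW) and pushes the indirect data
 * inline after the macro call; the inner loop below consumes the padding
 * between records.  nvk_mme_draw_indexed_indirect follows the same scheme.
 */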
4070 void
4071 nvk_mme_draw_indirect(struct mme_builder *b)
4072 {
4073 if (b->devinfo->cls_eng3d >= TURING_A) {
4074 struct mme_value64 draw_addr = mme_load_addr64(b);
4075 struct mme_value draw_count = mme_load(b);
4076 struct mme_value stride = mme_load(b);
4077
4078 struct mme_value draw = mme_mov(b, mme_zero());
4079 mme_while(b, ult, draw, draw_count) {
4080 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4081
4082 nvk_mme_build_draw(b, draw);
4083
4084 mme_add_to(b, draw, draw, mme_imm(1));
4085 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4086 }
4087 } else {
4088 struct mme_value draw_count = mme_load(b);
4089 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
4090
4091 struct mme_value draw = mme_mov(b, mme_zero());
4092 mme_while(b, ine, draw, draw_count) {
4093 nvk_mme_spill(b, DRAW_COUNT, draw_count);
4094
4095 nvk_mme_build_draw(b, draw);
4096 mme_add_to(b, draw, draw, mme_imm(1));
4097
4098 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4099 mme_loop(b, pad_dw) {
4100 mme_free_reg(b, mme_load(b));
4101 }
4102 mme_free_reg(b, pad_dw);
4103
4104 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4105 }
4106 }
4107 }
4108
4109 VKAPI_ATTR void VKAPI_CALL
4110 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4111 VkBuffer _buffer,
4112 VkDeviceSize offset,
4113 uint32_t drawCount,
4114 uint32_t stride)
4115 {
4116 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4117 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4118
4119 /* From the Vulkan 1.3.238 spec:
4120 *
4121 * VUID-vkCmdDrawIndirect-drawCount-00476
4122 *
4123 * "If drawCount is greater than 1, stride must be a multiple of 4 and
4124 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
4125 *
4126 * and
4127 *
4128 * "If drawCount is less than or equal to one, stride is ignored."
4129 */
4130 if (drawCount > 1) {
4131 assert(stride % 4 == 0);
4132 assert(stride >= sizeof(VkDrawIndirectCommand));
4133 } else {
4134 stride = sizeof(VkDrawIndirectCommand);
4135 }
4136
4137 nvk_cmd_flush_gfx_state(cmd);
4138
4139 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4140 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4141 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4142 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4143 P_INLINE_DATA(p, draw_addr >> 32);
4144 P_INLINE_DATA(p, draw_addr);
4145 P_INLINE_DATA(p, drawCount);
4146 P_INLINE_DATA(p, stride);
4147 } else {
4148 const uint32_t max_draws_per_push =
4149 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
4150
4151 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4152 while (drawCount) {
4153 const uint32_t count = MIN2(drawCount, max_draws_per_push);
4154
4155 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4156 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4157 P_INLINE_DATA(p, count);
4158 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
4159
4160 uint64_t range = count * (uint64_t)stride;
4161 nv_push_update_count(p, range / 4);
4162 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4163
4164 draw_addr += range;
4165 drawCount -= count;
4166 }
4167 }
4168 }
4169
4170 void
4171 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
4172 {
4173 if (b->devinfo->cls_eng3d >= TURING_A) {
4174 struct mme_value64 draw_addr = mme_load_addr64(b);
4175 struct mme_value draw_count = mme_load(b);
4176 struct mme_value stride = mme_load(b);
4177
4178 struct mme_value draw = mme_mov(b, mme_zero());
4179 mme_while(b, ult, draw, draw_count) {
4180 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4181
4182 nvk_mme_build_draw_indexed(b, draw);
4183
4184 mme_add_to(b, draw, draw, mme_imm(1));
4185 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4186 }
4187 } else {
4188 struct mme_value draw_count = mme_load(b);
4189 nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
4190
4191 struct mme_value draw = mme_mov(b, mme_zero());
4192 mme_while(b, ine, draw, draw_count) {
4193 nvk_mme_spill(b, DRAW_COUNT, draw_count);
4194
4195 nvk_mme_build_draw_indexed(b, draw);
4196 mme_add_to(b, draw, draw, mme_imm(1));
4197
4198 struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4199 mme_loop(b, pad_dw) {
4200 mme_free_reg(b, mme_load(b));
4201 }
4202 mme_free_reg(b, pad_dw);
4203
4204 nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4205 }
4206 }
4207 }
4208
4209 VKAPI_ATTR void VKAPI_CALL
4210 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
4211 VkBuffer _buffer,
4212 VkDeviceSize offset,
4213 uint32_t drawCount,
4214 uint32_t stride)
4215 {
4216 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4217 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4218
4219 /* From the Vulkan 1.3.238 spec:
4220 *
4221 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
4222 *
4223 * "If drawCount is greater than 1, stride must be a multiple of 4 and
4224 * must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
4225 *
4226 * and
4227 *
4228 * "If drawCount is less than or equal to one, stride is ignored."
4229 */
4230 if (drawCount > 1) {
4231 assert(stride % 4 == 0);
4232 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
4233 } else {
4234 stride = sizeof(VkDrawIndexedIndirectCommand);
4235 }
4236
4237 nvk_cmd_flush_gfx_state(cmd);
4238
4239 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4240 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4241 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4242 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4243 P_INLINE_DATA(p, draw_addr >> 32);
4244 P_INLINE_DATA(p, draw_addr);
4245 P_INLINE_DATA(p, drawCount);
4246 P_INLINE_DATA(p, stride);
4247 } else {
4248 const uint32_t max_draws_per_push =
4249 ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
4250
4251 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4252 while (drawCount) {
4253 const uint32_t count = MIN2(drawCount, max_draws_per_push);
4254
4255 struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4256 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4257 P_INLINE_DATA(p, count);
4258 P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
4259
4260 uint64_t range = count * (uint64_t)stride;
4261 nv_push_update_count(p, range / 4);
4262 nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4263
4264 draw_addr += range;
4265 drawCount -= count;
4266 }
4267 }
4268 }
4269
4270 void
4271 nvk_mme_draw_indirect_count(struct mme_builder *b)
4272 {
4273 if (b->devinfo->cls_eng3d < TURING_A)
4274 return;
4275
4276 struct mme_value64 draw_addr = mme_load_addr64(b);
4277 struct mme_value64 draw_count_addr = mme_load_addr64(b);
4278 struct mme_value draw_max = mme_load(b);
4279 struct mme_value stride = mme_load(b);
4280
4281 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4282 mme_free_reg64(b, draw_count_addr);
4283 struct mme_value draw_count_buf = mme_load(b);
4284
4285 mme_if(b, ule, draw_count_buf, draw_max) {
4286 mme_mov_to(b, draw_max, draw_count_buf);
4287 }
4288 mme_free_reg(b, draw_count_buf);
4289
4290 struct mme_value draw = mme_mov(b, mme_zero());
4291 mme_while(b, ult, draw, draw_max) {
4292 mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4293
4294 nvk_mme_build_draw(b, draw);
4295
4296 mme_add_to(b, draw, draw, mme_imm(1));
4297 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4298 }
4299 }
4300
4301 VKAPI_ATTR void VKAPI_CALL
4302 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4303 VkBuffer _buffer,
4304 VkDeviceSize offset,
4305 VkBuffer countBuffer,
4306 VkDeviceSize countBufferOffset,
4307 uint32_t maxDrawCount,
4308 uint32_t stride)
4309 {
4310 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4311 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4312 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4313
4314 /* TODO: Indirect count draw pre-Turing */
4315 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4316
4317 nvk_cmd_flush_gfx_state(cmd);
4318
4319 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4320 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
4321 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4322 P_INLINE_DATA(p, draw_addr >> 32);
4323 P_INLINE_DATA(p, draw_addr);
4324 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4325 countBufferOffset);
4326 P_INLINE_DATA(p, draw_count_addr >> 32);
4327 P_INLINE_DATA(p, draw_count_addr);
4328 P_INLINE_DATA(p, maxDrawCount);
4329 P_INLINE_DATA(p, stride);
4330 }
4331
4332 void
4333 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
4334 {
4335 if (b->devinfo->cls_eng3d < TURING_A)
4336 return;
4337
4338 struct mme_value64 draw_addr = mme_load_addr64(b);
4339 struct mme_value64 draw_count_addr = mme_load_addr64(b);
4340 struct mme_value draw_max = mme_load(b);
4341 struct mme_value stride = mme_load(b);
4342
4343 mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4344 mme_free_reg64(b, draw_count_addr);
4345 struct mme_value draw_count_buf = mme_load(b);
4346
4347 mme_if(b, ule, draw_count_buf, draw_max) {
4348 mme_mov_to(b, draw_max, draw_count_buf);
4349 }
4350 mme_free_reg(b, draw_count_buf);
4351
4352 struct mme_value draw = mme_mov(b, mme_zero());
4353 mme_while(b, ult, draw, draw_max) {
4354 mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4355
4356 nvk_mme_build_draw_indexed(b, draw);
4357
4358 mme_add_to(b, draw, draw, mme_imm(1));
4359 mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4360 }
4361 }
4362
4363 VKAPI_ATTR void VKAPI_CALL
4364 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4365 VkBuffer _buffer,
4366 VkDeviceSize offset,
4367 VkBuffer countBuffer,
4368 VkDeviceSize countBufferOffset,
4369 uint32_t maxDrawCount,
4370 uint32_t stride)
4371 {
4372 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4373 VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4374 VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4375
4376 /* TODO: Indexed indirect count draw pre-Turing */
4377 assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4378
4379 nvk_cmd_flush_gfx_state(cmd);
4380
4381 struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4382 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
4383 uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4384 P_INLINE_DATA(p, draw_addr >> 32);
4385 P_INLINE_DATA(p, draw_addr);
4386 uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4387 countBufferOffset);
4388 P_INLINE_DATA(p, draw_count_addr >> 32);
4389 P_INLINE_DATA(p, draw_count_addr);
4390 P_INLINE_DATA(p, maxDrawCount);
4391 P_INLINE_DATA(p, stride);
4392 }
4393
4394 static void
4395 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
4396 struct mme_value instance_count,
4397 struct mme_value counter)
4398 {
4399 struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
4400
4401 mme_loop(b, instance_count) {
4402 mme_mthd(b, NV9097_BEGIN);
4403 mme_emit(b, begin);
4404
4405 mme_mthd(b, NV9097_DRAW_AUTO);
4406 mme_emit(b, counter);
4407
4408 mme_mthd(b, NV9097_END);
4409 mme_emit(b, mme_zero());
4410
4411 mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
4412 }
4413
4414 mme_free_reg(b, begin);
4415 }
4416
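/* MME macro for vkCmdDrawIndirectByteCountEXT.
 *
 * Parameters: instance count, first instance, then the transform feedback
 * counter.  On Turing+ the counter's address follows and its value is read
 * through the MME FIFO; pre-Turing the caller pushes the counter value
 * itself inline.  DRAW_AUTO_START and DRAW_AUTO_STRIDE are programmed by
 * the caller before invoking the macro (see
 * nvk_CmdDrawIndirectByteCountEXT below).
 */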
4417 void
4418 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
4419 {
4420 struct mme_value instance_count = mme_load(b);
4421 struct mme_value first_instance = mme_load(b);
4422
4423 if (b->devinfo->cls_eng3d >= TURING_A) {
4424 struct mme_value64 counter_addr = mme_load_addr64(b);
4425 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4426 mme_free_reg(b, counter_addr.lo);
4427 mme_free_reg(b, counter_addr.hi);
4428 }
4429 struct mme_value counter = mme_load(b);
4430
4431 struct mme_draw_params params = {
4432 .first_instance = first_instance,
4433 };
4434 nvk_mme_build_set_draw_params(b, &params);
4435
4436 mme_free_reg(b, first_instance);
4437
4438 struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4439 mme_if(b, ieq, view_mask, mme_zero()) {
4440 mme_free_reg(b, view_mask);
4441
4442 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4443 }
4444
4445 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4446 mme_if(b, ine, view_mask, mme_zero()) {
4447 mme_free_reg(b, view_mask);
4448
4449 struct mme_value view = mme_mov(b, mme_zero());
4450 mme_while(b, ine, view, mme_imm(32)) {
4451 view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4452 struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
4453 mme_free_reg(b, view_mask);
4454 mme_if(b, ine, has_view, mme_zero()) {
4455 mme_free_reg(b, has_view);
4456 nvk_mme_emit_view_index(b, view);
4457 nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4458 }
4459
4460 mme_add_to(b, view, view, mme_imm(1));
4461 }
4462 }
4463
4464 mme_free_reg(b, instance_count);
4465 mme_free_reg(b, counter);
4466 }
4467
4468 VKAPI_ATTR void VKAPI_CALL
4469 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4470 uint32_t instanceCount,
4471 uint32_t firstInstance,
4472 VkBuffer counterBuffer,
4473 VkDeviceSize counterBufferOffset,
4474 uint32_t counterOffset,
4475 uint32_t vertexStride)
4476 {
4477 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4478 VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
4479
4480 nvk_cmd_flush_gfx_state(cmd);
4481
4482 uint64_t counter_addr = nvk_buffer_address(counter_buffer,
4483 counterBufferOffset);
4484
4485 if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4486 struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
4487 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4488 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4489
4490 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4491 P_INLINE_DATA(p, instanceCount);
4492 P_INLINE_DATA(p, firstInstance);
4493 P_INLINE_DATA(p, counter_addr >> 32);
4494 P_INLINE_DATA(p, counter_addr);
4495 } else {
4496 struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
4497 P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4498 P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4499
4500 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4501 P_INLINE_DATA(p, instanceCount);
4502 P_INLINE_DATA(p, firstInstance);
4503 nv_push_update_count(p, 1);
4504 nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
4505 }
4506 }
4507
4508 VKAPI_ATTR void VKAPI_CALL
4509 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
4510 uint32_t firstBinding,
4511 uint32_t bindingCount,
4512 const VkBuffer *pBuffers,
4513 const VkDeviceSize *pOffsets,
4514 const VkDeviceSize *pSizes)
4515 {
4516 VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4517
4518 for (uint32_t i = 0; i < bindingCount; i++) {
4519 VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
4520 uint32_t idx = firstBinding + i;
4521 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
4522 struct nvk_addr_range addr_range =
4523 nvk_buffer_addr_range(buffer, pOffsets[i], size);
4524 assert(addr_range.range <= UINT32_MAX);
4525
4526 struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4527
4528 P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
4529 P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
4530 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
4531 P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
4532 P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
4533 }
4534
4535 // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
4536 }
4537
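/* Restores a stream-out buffer's write pointer from a counter buffer.
 *
 * The first parameter is the method array offset of the stream-out buffer,
 * i.e. cb_idx * 8, since the SET_STREAM_OUT_BUFFER_* group has an 8 dword
 * stride.  On Turing+ the counter address follows and the value is read
 * through the MME FIFO; pre-Turing the counter value itself is pushed
 * inline.
 */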
4538 void
4539 nvk_mme_xfb_counter_load(struct mme_builder *b)
4540 {
4541 struct mme_value buffer = mme_load(b);
4542
4543 struct mme_value counter;
4544 if (b->devinfo->cls_eng3d >= TURING_A) {
4545 struct mme_value64 counter_addr = mme_load_addr64(b);
4546
4547 mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4548 mme_free_reg(b, counter_addr.lo);
4549 mme_free_reg(b, counter_addr.hi);
4550
4551 counter = mme_load(b);
4552 } else {
4553 counter = mme_load(b);
4554 }
4555
4556 mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
4557 mme_emit(b, counter);
4558
4559 mme_free_reg(b, counter);
4560 mme_free_reg(b, buffer);
4561 }
4562
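/* vkCmdBeginTransformFeedbackEXT: enables stream output and primes the
 * stream-out write pointers.  All four write pointers are first reset to
 * zero; then, for each counter buffer provided, the previously saved byte
 * count is loaded back through NVK_MME_XFB_COUNTER_LOAD so that output
 * resumes where the last transform feedback section left off.
 */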
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                 uint32_t firstCounterBuffer,
                                 uint32_t counterBufferCount,
                                 const VkBuffer *pCounterBuffers,
                                 const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   const uint32_t max_buffers = 4;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
   for (uint32_t i = 0; i < max_buffers; ++i) {
      P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
   }

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      /* The index of the counter buffer corresponds to the index of the
       * transform feedback buffer.
       */
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an
          * 8-dword stride.
          */
         P_INLINE_DATA(p, cb_idx * 8);
         P_INLINE_DATA(p, cb_addr >> 32);
         P_INLINE_DATA(p, cb_addr);
      } else {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         P_INLINE_DATA(p, cb_idx);
         nv_push_update_count(p, 1);
         nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
      }
   }
}

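/* vkCmdEndTransformFeedbackEXT: disables stream output and saves the current
 * byte count for each counter buffer via a REPORT_SEMAPHORE with the
 * STREAMING_BYTE_COUNT report, with the transform feedback buffer index in
 * sub_report.  A later vkCmdBeginTransformFeedbackEXT or
 * vkCmdDrawIndirectByteCountEXT can then consume the saved value.
 */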
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                               uint32_t firstCounterBuffer,
                               uint32_t counterBufferCount,
                               const VkBuffer *pCounterBuffers,
                               const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      /* The index of the counter buffer corresponds to the index of the
       * transform feedback buffer.
       */
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_STREAMING_BYTE_COUNT,
         .sub_report = cb_idx,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}

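/* vkCmdBeginConditionalRenderingEXT: the application supplies a 32-bit
 * predicate in a buffer but the render-enable hardware evaluates a 64-bit
 * value, so the predicate is first expanded into a 64-bit scratch allocation
 * with a remapped copy-engine transfer and the render-enable condition is
 * then pointed at that copy.
 */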
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);

   uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
   bool inverted = pConditionalRenderingBegin->flags &
                   VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   /* From the Vulkan 1.3.280 spec:
    *
    *    "If the 32-bit value at offset in buffer memory is zero,
    *     then the rendering commands are discarded,
    *     otherwise they are executed as normal."
    *
    * The hardware compares a 64-bit value, so we have to copy the 32-bit
    * predicate into a 64-bit scratch allocation.
    */
   uint64_t tmp_addr;
   VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);

   P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
   P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
   P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
   P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
   P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
   P_NV90B5_PITCH_IN(p, 4);
   P_NV90B5_PITCH_OUT(p, 4);
   P_NV90B5_LINE_LENGTH_IN(p, 4);
   P_NV90B5_LINE_COUNT(p, 1);

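   /* Expand the 4-byte predicate to 8 bytes.  With a one-byte component size
    * and both destination components sourced from X, each source byte is
    * written twice, so the resulting 64-bit value is zero exactly when the
    * original 32-bit value is zero, which is all the render-enable compare
    * below needs.
    */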
   P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
      .dst_x = DST_X_SRC_X,
      .dst_y = DST_Y_SRC_X,
      .dst_z = DST_Z_NO_WRITE,
      .dst_w = DST_W_NO_WRITE,
      .component_size = COMPONENT_SIZE_ONE,
      .num_src_components = NUM_SRC_COMPONENTS_ONE,
      .num_dst_components = NUM_DST_COMPONENTS_TWO,
   });

   P_IMMD(p, NV90B5, LAUNCH_DMA, {
      .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
      .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
      .flush_enable = FLUSH_ENABLE_TRUE,
      .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
      .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
      .remap_enable = REMAP_ENABLE_TRUE,
   });

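   /* Point the render-enable condition at the 64-bit copy.  Both the 3D
    * (9097) and compute (90C0) classes are programmed so draws and dispatches
    * are predicated consistently; the inverted flag flips the comparison from
    * RENDER_IF_NOT_EQUAL to RENDER_IF_EQUAL.
    */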
   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
   P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
   P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
   P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
}

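/* vkCmdEndConditionalRenderingEXT: unconditionally re-enable rendering on
 * both the 3D and compute classes by switching the render-enable mode back
 * to TRUE.
 */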
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, 0);
   P_NV9097_SET_RENDER_ENABLE_B(p, 0);
   P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
}
