1 /*
2  * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15 
16 #include "nil_format.h"
17 #include "util/bitpack_helpers.h"
18 #include "vulkan/runtime/vk_render_pass.h"
19 #include "vulkan/runtime/vk_standard_sample_locations.h"
20 #include "vulkan/util/vk_format.h"
21 
22 #include "nouveau_context.h"
23 
24 #include "nvk_cl902d.h"
25 #include "nvk_cl9097.h"
26 #include "nvk_cl90b5.h"
27 #include "nvk_cl90c0.h"
28 #include "nvk_cla097.h"
29 #include "nvk_clb097.h"
30 #include "nvk_clb197.h"
31 #include "nvk_clc397.h"
32 #include "nvk_clc597.h"
33 #include "drf.h"
34 
35 static inline uint16_t
36 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
37 {
38    return nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d;
39 }
40 
41 void
42 nvk_mme_set_priv_reg(struct mme_builder *b)
43 {
44    mme_mthd(b, NV9097_WAIT_FOR_IDLE);
45    mme_emit(b, mme_zero());
46 
47    mme_mthd(b, NV9097_SET_MME_SHADOW_SCRATCH(0));
48    mme_emit(b, mme_zero());
49    mme_emit(b, mme_load(b));
50    mme_emit(b, mme_load(b));
51 
52    /* Not sure if this strictly has to go before SET_FALCON04, but it might.
53     * We also don't really know what that value indicates, or when and how
54     * it gets set.
55     */
56    struct mme_value s26 = mme_state(b, NV9097_SET_MME_SHADOW_SCRATCH(26));
57    s26 = mme_merge(b, mme_zero(), s26, 0, 8, 0);
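   /* This keeps only the low byte of scratch 26 (merge 8 bits starting at
    * bit 0 into an all-zero value).
    */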
58 
59    mme_mthd(b, NV9097_SET_FALCON04);
60    mme_emit(b, mme_load(b));
61 
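   /* If the low byte of scratch 26 reads back as 2, the firmware appears to
    * signal completion by writing 1 to scratch 0, so spin on that; otherwise
    * just pad with a few NOPs to give the write time to land.  This is an
    * assumption based on the two branches below.
    */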
62    mme_if(b, ieq, s26, mme_imm(2)) {
63       struct mme_value loop_cond = mme_mov(b, mme_zero());
64       mme_while(b, ine, loop_cond, mme_imm(1)) {
65          mme_state_to(b, loop_cond, NV9097_SET_MME_SHADOW_SCRATCH(0));
66          mme_mthd(b, NV9097_NO_OPERATION);
67          mme_emit(b, mme_zero());
68       };
69    }
70 
71    mme_if(b, ine, s26, mme_imm(2)) {
72       mme_loop(b, mme_imm(10)) {
73          mme_mthd(b, NV9097_NO_OPERATION);
74          mme_emit(b, mme_zero());
75       }
76    }
77 }
78 
79 VkResult
80 nvk_push_draw_state_init(struct nvk_device *dev, struct nv_push *p)
81 {
82    struct nvk_physical_device *pdev = nvk_device_physical(dev);
83 
84    /* 3D state */
85    P_MTHD(p, NV9097, SET_OBJECT);
86    P_NV9097_SET_OBJECT(p, {
87       .class_id = pdev->info.cls_eng3d,
88       .engine_id = 0,
89    });
90 
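   /* Upload all of our MME macros: each macro is assembled for this chip,
    * its start offset in the macro instruction RAM is recorded against its
    * macro index, and the instructions are streamed back-to-back into the
    * instruction RAM.
    */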
91    for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
92       size_t size;
93       uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
94       if (dw == NULL)
95          return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
96 
97       assert(size % sizeof(uint32_t) == 0);
98       const uint32_t num_dw = size / sizeof(uint32_t);
99 
100       P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
101       P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
102       P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
103 
104       P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
105       P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
106       P_INLINE_ARRAY(p, dw, num_dw);
107 
108       mme_pos += num_dw;
109 
110       free(dw);
111    }
112 
113    if (dev->pdev->info.cls_eng3d >= TURING_A)
114       P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
115 
116    /* Enable FP helper invocation memory loads.
117     *
118     * For generations with firmware support for our SET_PRIV_REG MME method,
119     * we simply use that. On older generations we let the kernel do it.
120     * Starting with GSP we have to do it via the firmware anyway.
121     */
122    if (dev->pdev->info.cls_eng3d >= MAXWELL_B) {
123       unsigned reg = dev->pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
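      /* Three parameters follow: two data words that the macro stores in MME
       * shadow scratch 1 and 2, and the priv register offset, which
       * nvk_mme_set_priv_reg() writes to SET_FALCON04.
       */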
124       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
125       P_INLINE_DATA(p, 0);
126       P_INLINE_DATA(p, BITFIELD_BIT(3));
127       P_INLINE_DATA(p, reg);
128    }
129 
130    P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
131 
132    P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
133    P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
134    for (unsigned i = 0; i < 8; i++)
135       P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
136 
137    P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
138 
139 //   P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
140 //   P_INLINE_DATA(cmd->push, 0);
141 
142    P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
143 
144    P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
145 
146    P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
147    P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
148    P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
149    P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
150 
151    P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
152 
153    P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
154 
155    P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
156 
157    P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
158                      DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
159 
160    P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
161    P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
162       .all_covered_all_hit_once = 0xff,
163    });
164    P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
165    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
166       .all_covered_all_hit_once = 0xff,
167    });
168    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
169       .all_covered_all_hit_once = 0xff,
170    });
171    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
172       .all_covered_all_hit_once = 0x3f,
173    });
174    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
175       .all_covered_all_hit_once = 0xff,
176    });
177    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
178       .all_covered_all_hit_once = 0xff,
179    });
180 
181    if (dev->pdev->info.cls_eng3d < VOLTA_A)
182       P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
183 
184    P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
185       .current = 3,
186       .oldest_supported = 3,
187    });
188    P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
189       .current = 2,
190       .oldest_supported = 2,
191    });
192 
193    if (dev->pdev->info.cls_eng3d < MAXWELL_A)
194       P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
195 
196    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
197                      POLICY_EVICT_NORMAL);
198    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
199                      POLICY_EVICT_NORMAL);
200    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
201                      POLICY_EVICT_NORMAL);
202    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
203                      POLICY_EVICT_NORMAL);
204    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
205                      POLICY_EVICT_NORMAL);
206 
207    P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
208 
209    P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
210       .color_front_diffuse    = COLOR_FRONT_DIFFUSE_VECTOR_0001,
211       .color_front_specular   = COLOR_FRONT_SPECULAR_VECTOR_0001,
212       .generic_vector         = GENERIC_VECTOR_VECTOR_0001,
213       .fixed_fnc_texture      = FIXED_FNC_TEXTURE_VECTOR_0001,
214       .dx9_color0             = DX9_COLOR0_VECTOR_0001,
215       .dx9_color1_to_color15  = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
216    });
217 
218    P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
219 
220    P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
221                      CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
222 
223    P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
224       .enable                       = ENABLE_TRUE,
225       .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
226    });
227 
228    if (dev->pdev->info.cls_eng3d < VOLTA_A)
229       P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
230 
231    P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
232    P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
233    P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
234 
235    if (dev->pdev->info.cls_eng3d < MAXWELL_A)
236       P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
237 
238    if (dev->pdev->info.cls_eng3d >= KEPLER_A &&
239        dev->pdev->info.cls_eng3d < MAXWELL_A) {
240       P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
241                         ORDERING_KEPLER_ORDER);
242    }
243 
244    P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
245    P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
246    P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
247    P_IMMD(p, NV9097, SET_PS_SATURATE, {
248       .output0 = OUTPUT0_FALSE,
249       .output1 = OUTPUT1_FALSE,
250       .output2 = OUTPUT2_FALSE,
251       .output3 = OUTPUT3_FALSE,
252       .output4 = OUTPUT4_FALSE,
253       .output5 = OUTPUT5_FALSE,
254       .output6 = OUTPUT6_FALSE,
255       .output7 = OUTPUT7_FALSE,
256    });
257 
258    P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
259 
260    /* From the Vulkan spec's description of point rasterization:
261     * "Point rasterization produces a fragment for each fragment area group of
262     * framebuffer pixels with one or more sample points that intersect a region
263     * centered at the point’s (xf,yf).
264     * This region is a square with side equal to the current point size.
265     * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
266     * for the point"
267     *
268     * So it seems we always need square points with PointCoords like OpenGL
269     * point sprites.
270     *
271     * From the OpenGL compatibility spec:
272     * Basic point rasterization:
273     * "If point sprites are enabled, then point rasterization produces a
274     * fragment for each framebuffer pixel whose center lies inside a square
275     * centered at the point’s (xw, yw), with side length equal to the current
276     * point size.
277     * ... and xw and yw are the exact, unrounded window coordinates of the
278     * vertex for the point"
279     *
280     * And Point multisample rasterization:
281     * "This region is a circle having diameter equal to the current point width
282     * if POINT_SPRITE is disabled, or a square with side equal to the current
283     * point width if POINT_SPRITE is enabled."
284     */
285    P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
286    P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
287       .rmode      = RMODE_ZERO,
288       .origin     = ORIGIN_TOP,
289       .texture0   = TEXTURE0_PASSTHROUGH,
290       .texture1   = TEXTURE1_PASSTHROUGH,
291       .texture2   = TEXTURE2_PASSTHROUGH,
292       .texture3   = TEXTURE3_PASSTHROUGH,
293       .texture4   = TEXTURE4_PASSTHROUGH,
294       .texture5   = TEXTURE5_PASSTHROUGH,
295       .texture6   = TEXTURE6_PASSTHROUGH,
296       .texture7   = TEXTURE7_PASSTHROUGH,
297       .texture8   = TEXTURE8_PASSTHROUGH,
298       .texture9   = TEXTURE9_PASSTHROUGH,
299    });
300 
301    /* OpenGL's GL_POINT_SMOOTH */
302    P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
303 
304    if (dev->pdev->info.cls_eng3d >= MAXWELL_B)
305       P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
306 
307    P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
308 
309    P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
310 
311    P_IMMD(p, NV9097, SET_HYBRID_ANTI_ALIAS_CONTROL, {
312       .passes     = 1,
313       .centroid   = CENTROID_PER_FRAGMENT,
314    });
315 
316    /* Enable multisample rasterization even for single-sample rasterization;
317     * this way we get strict lines and rectangular line support.
318     * For more details, see the DirectX rasterization rules.
319     */
320    P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
321 
322    if (dev->pdev->info.cls_eng3d >= MAXWELL_B) {
323       P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
324                         BY_VIEWPORT_INDEX_FALSE);
325    }
326 
327    /* TODO: Vertex runout */
328 
329    P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
330       .mode    = MODE_UPPER_LEFT,
331       .flip_y  = FLIP_Y_FALSE,
332    });
333 
334    P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
335    P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
336    P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
337 
338    P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
339    P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
340    P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
341 
342 //   P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
343 //      .respect_stencil_mask   = RESPECT_STENCIL_MASK_FALSE,
344 //      .use_clear_rect         = USE_CLEAR_RECT_FALSE,
345 //   });
346 
347    P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
348 
349    P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
350       .min_z_zero_max_z_one      = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
351       .pixel_min_z               = PIXEL_MIN_Z_CLAMP,
352       .pixel_max_z               = PIXEL_MAX_Z_CLAMP,
353       .geometry_guardband        = GEOMETRY_GUARDBAND_SCALE_256,
354       .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
355       .geometry_clip             = GEOMETRY_CLIP_WZERO_CLIP,
356       .geometry_guardband_z      = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
357    });
358 
359    for (unsigned i = 0; i < 16; i++)
360       P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
361 
362    P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
363 
364    if (pdev->info.cls_eng3d < VOLTA_A) {
365       uint64_t shader_base_addr =
366          nvk_heap_contiguous_base_address(&dev->shader_heap);
367 
368       P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
369       P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
370       P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
371    }
372 
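   /* Start with every constant buffer unbound: 5 bind groups (one per
    * graphics shader stage, see nvk_pipeline_bind_group()) with 16 slots
    * each.
    */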
373    for (uint32_t group = 0; group < 5; group++) {
374       for (uint32_t slot = 0; slot < 16; slot++) {
375          P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
376             .valid = VALID_FALSE,
377             .shader_slot = slot,
378          });
379       }
380    }
381 
382 //   P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
383 //   P_INLINE_DATA(cmd->push, 0x40);
384    P_IMMD(p, NV9097, SET_RT_LAYER, {
385       .v = 0,
386       .control = CONTROL_V_SELECTS_LAYER,
387    });
388 //   P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT);
389 //   P_INLINE_DATA(cmd->push, 0x30);
390 
391    P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
392    P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
393    P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
394 
395    uint64_t zero_addr = dev->zero_page->offset;
396    P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
397    P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
398    P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
399 
400    if (dev->pdev->info.cls_eng3d >= FERMI_A &&
401        dev->pdev->info.cls_eng3d < MAXWELL_A) {
402       assert(dev->vab_memory);
403       uint64_t vab_addr = dev->vab_memory->offset;
404       P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
405       P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
406       P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
407       P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
408    }
409 
410    if (dev->pdev->info.cls_eng3d == MAXWELL_A)
411       P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
412 
413    return VK_SUCCESS;
414 }
415 
416 static void
417 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
418 {
419    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
420 
421    /* These depend on color attachment count */
422    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
423    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
424    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
425    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
426 
427    /* These depend on the depth/stencil format */
428    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
429    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
430    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
431    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
432 
433    /* This may depend on render targets for ESO */
434    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
435 }
436 
437 void
438 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
439                               const VkCommandBufferBeginInfo *pBeginInfo)
440 {
441    if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
442       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
443       P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
444       P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
445          .lines = LINES_ALL,
446       });
447       P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
448          .lines = LINES_ALL,
449       });
450 
451       P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
452          .constant = CONSTANT_TRUE,
453       });
454    }
455 
456    if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
457        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
458       char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
459       const VkRenderingInfo *resume_info =
460          vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
461                                                                pBeginInfo,
462                                                                gcbiar_data);
463       if (resume_info) {
464          nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
465       } else {
466          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
467             vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
468                                                              pBeginInfo);
469          assert(inheritance_info);
470 
471          struct nvk_rendering_state *render = &cmd->state.gfx.render;
472          render->flags = inheritance_info->flags;
473          render->area = (VkRect2D) { };
474          render->layer_count = 0;
475          render->view_mask = inheritance_info->viewMask;
476          render->samples = inheritance_info->rasterizationSamples;
477 
478          render->color_att_count = inheritance_info->colorAttachmentCount;
479          for (uint32_t i = 0; i < render->color_att_count; i++) {
480             render->color_att[i].vk_format =
481                inheritance_info->pColorAttachmentFormats[i];
482          }
483          render->depth_att.vk_format =
484             inheritance_info->depthAttachmentFormat;
485          render->stencil_att.vk_format =
486             inheritance_info->stencilAttachmentFormat;
487 
488          nvk_cmd_buffer_dirty_render_pass(cmd);
489       }
490    }
491 
492    cmd->state.gfx.shaders_dirty = ~0;
493 }
494 
495 void
496 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
497 {
498    vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
499 
500    /* From the Vulkan 1.3.275 spec:
501     *
502     *    "...There is one exception to this rule - if the primary command
503     *    buffer is inside a render pass instance, then the render pass and
504     *    subpass state is not disturbed by executing secondary command
505     *    buffers."
506     *
507     * We need to reset everything EXCEPT the render pass state.
508     */
509    struct nvk_rendering_state render_save = cmd->state.gfx.render;
510    memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
511    cmd->state.gfx.render = render_save;
512 
513    cmd->state.gfx.shaders_dirty = ~0;
514 }
515 
516 static void
517 nvk_attachment_init(struct nvk_attachment *att,
518                     const VkRenderingAttachmentInfo *info)
519 {
520    if (info == NULL || info->imageView == VK_NULL_HANDLE) {
521       *att = (struct nvk_attachment) { .iview = NULL, };
522       return;
523    }
524 
525    VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
526    *att = (struct nvk_attachment) {
527       .vk_format = iview->vk.format,
528       .iview = iview,
529    };
530 
531    if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
532       VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
533       att->resolve_mode = info->resolveMode;
534       att->resolve_iview = res_iview;
535    }
536 }
537 
538 static uint32_t
539 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
540 {
541 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
542    uint16_t nil_to_nv9097[] = {
543       MODE(1X1),
544       MODE(2X1),
545       MODE(2X2),
546       MODE(4X2),
547       MODE(4X4),
548    };
549 #undef MODE
550    assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
551 
552    return nil_to_nv9097[sample_layout];
553 }
554 
555 VKAPI_ATTR void VKAPI_CALL
556 nvk_GetRenderingAreaGranularityKHR(
557     VkDevice device,
558     const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
559     VkExtent2D *pGranularity)
560 {
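   /* We have no render-area alignment requirements; a 1x1 granularity tells
    * the application that any render area is already optimally aligned.
    */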
561    *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
562 }
563 
564 VKAPI_ATTR void VKAPI_CALL
565 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
566                       const VkRenderingInfo *pRenderingInfo)
567 {
568    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
569    struct nvk_rendering_state *render = &cmd->state.gfx.render;
570 
571    memset(render, 0, sizeof(*render));
572 
573    render->flags = pRenderingInfo->flags;
574    render->area = pRenderingInfo->renderArea;
575    render->view_mask = pRenderingInfo->viewMask;
576    render->layer_count = pRenderingInfo->layerCount;
577    render->samples = 0;
578 
579    const uint32_t layer_count =
580       render->view_mask ? util_last_bit(render->view_mask) :
581                           render->layer_count;
582 
583    render->color_att_count = pRenderingInfo->colorAttachmentCount;
584    for (uint32_t i = 0; i < render->color_att_count; i++) {
585       nvk_attachment_init(&render->color_att[i],
586                           &pRenderingInfo->pColorAttachments[i]);
587    }
588 
589    nvk_attachment_init(&render->depth_att,
590                        pRenderingInfo->pDepthAttachment);
591    nvk_attachment_init(&render->stencil_att,
592                        pRenderingInfo->pStencilAttachment);
593 
594    nvk_cmd_buffer_dirty_render_pass(cmd);
595 
596    /* Always emit at least one color attachment, even if it's just a dummy. */
597    uint32_t color_att_count = MAX2(1, render->color_att_count);
598    struct nv_push *p = nvk_cmd_buffer_push(cmd, color_att_count * 10 + 27);
599 
600    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
601           render->view_mask);
602 
603    P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
604    P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
605       .x       = render->area.offset.x,
606       .width   = render->area.extent.width,
607    });
608    P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
609       .y       = render->area.offset.y,
610       .height  = render->area.extent.height,
611    });
612 
613    enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
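   /* Vulkan requires all attachments to have the same sample count, so track
    * a single sample layout and assert below that every target agrees.
    */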
614    for (uint32_t i = 0; i < color_att_count; i++) {
615       if (render->color_att[i].iview) {
616          const struct nvk_image_view *iview = render->color_att[i].iview;
617          const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
618          /* Rendering to a multi-planar image is only valid for one specific
619           * plane at a time, so assert that the view covers a single plane,
620           * obtain its index, and render to that plane.
621           */
622          assert(iview->plane_count == 1);
623          const uint8_t ip = iview->planes[0].image_plane;
624 
625          const struct nil_image_level *level =
626             &image->planes[ip].nil.levels[iview->vk.base_mip_level];
627          struct nil_extent4d level_extent_sa =
628             nil_image_level_extent_sa(&image->planes[ip].nil, iview->vk.base_mip_level);
629 
630          assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
631                 sample_layout == image->planes[ip].nil.sample_layout);
632          sample_layout = image->planes[ip].nil.sample_layout;
633          render->samples = image->vk.samples;
634 
635          uint64_t addr = nvk_image_base_address(image, ip) + level->offset_B;
636 
637          P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
638          P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
639          P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
640 
641          if (level->tiling.is_tiled) {
642             P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, level_extent_sa.w);
643             P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.h);
644             const enum pipe_format p_format =
645                vk_format_to_pipe_format(iview->vk.format);
646             const uint8_t ct_format = nil_format_to_color_target(p_format);
647             P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
648 
649             P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
650                .block_width   = BLOCK_WIDTH_ONE_GOB,
651                .block_height  = level->tiling.y_log2,
652                .block_depth   = level->tiling.z_log2,
653                .layout        = LAYOUT_BLOCKLINEAR,
654                .third_dimension_control =
655                   (image->planes[ip].nil.dim == NIL_IMAGE_DIM_3D) ?
656                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
657                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
658             });
659 
660             P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i,
661                iview->vk.base_array_layer + layer_count);
662             P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
663                image->planes[ip].nil.array_stride_B >> 2);
664             P_NV9097_SET_COLOR_TARGET_LAYER(p, i, iview->vk.base_array_layer);
665          } else {
666             /* NVIDIA can only render to 2D linear images */
667             assert(image->planes[ip].nil.dim == NIL_IMAGE_DIM_2D);
668             /* NVIDIA can only render to non-multisampled images */
669             assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
670             /* NVIDIA doesn't support linear array images */
671             assert(iview->vk.base_array_layer == 0 && layer_count == 1);
672 
673             uint32_t pitch = level->row_stride_B;
674             const enum pipe_format p_format =
675                vk_format_to_pipe_format(iview->vk.format);
676             /* When the memory layout is set to LAYOUT_PITCH, the WIDTH field
677              * holds the row pitch in bytes rather than a width in pixels.
678              */
679             P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
680             P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.h);
681 
682             const uint8_t ct_format = nil_format_to_color_target(p_format);
683             P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
684 
685             P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
686                .layout = LAYOUT_PITCH,
687                .third_dimension_control =
688                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
689             });
690 
691             P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
692             P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
693             P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
694          }
695       } else {
696          P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
697          P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
698          P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
699          P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
700          P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
701          P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
702          P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
703             .layout        = LAYOUT_BLOCKLINEAR,
704          });
705          P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
706          P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
707          P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
708       }
709    }
710 
711    P_IMMD(p, NV9097, SET_CT_SELECT, {
712       .target_count = color_att_count,
713       .target0 = 0,
714       .target1 = 1,
715       .target2 = 2,
716       .target3 = 3,
717       .target4 = 4,
718       .target5 = 5,
719       .target6 = 6,
720       .target7 = 7,
721    });
722 
723    if (render->depth_att.iview || render->stencil_att.iview) {
724       struct nvk_image_view *iview = render->depth_att.iview ?
725                                      render->depth_att.iview :
726                                      render->stencil_att.iview;
727       const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
728       /* Depth/stencil are always single-plane */
729       assert(iview->plane_count == 1);
730       const uint8_t ip = iview->planes[0].image_plane;
731       struct nil_image nil_image = image->planes[ip].nil;
732 
733       uint64_t addr = nvk_image_base_address(image, ip);
734       uint32_t mip_level = iview->vk.base_mip_level;
735       uint32_t base_array_layer = iview->vk.base_array_layer;
736       uint32_t layer_count = iview->vk.layer_count;
737 
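      /* The hardware addresses depth/stencil targets as 2D arrays.  For a 3D
       * image, rebind the selected miplevel as a 2D array image and treat its
       * depth as the layer count.
       */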
738       if (nil_image.dim == NIL_IMAGE_DIM_3D) {
739          uint64_t level_offset_B;
740          nil_image_3d_level_as_2d_array(&nil_image, mip_level,
741                                         &nil_image, &level_offset_B);
742          addr += level_offset_B;
743          mip_level = 0;
744          base_array_layer = 0;
745          layer_count = iview->vk.extent.depth;
746       }
747 
748       const struct nil_image_level *level = &nil_image.levels[mip_level];
749       addr += level->offset_B;
750 
751       assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
752              sample_layout == nil_image.sample_layout);
753       sample_layout = nil_image.sample_layout;
754       render->samples = image->vk.samples;
755 
756       P_MTHD(p, NV9097, SET_ZT_A);
757       P_NV9097_SET_ZT_A(p, addr >> 32);
758       P_NV9097_SET_ZT_B(p, addr);
759       const enum pipe_format p_format =
760          vk_format_to_pipe_format(iview->vk.format);
761       const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
762       P_NV9097_SET_ZT_FORMAT(p, zs_format);
763       assert(level->tiling.z_log2 == 0);
764       P_NV9097_SET_ZT_BLOCK_SIZE(p, {
765          .width = WIDTH_ONE_GOB,
766          .height = level->tiling.y_log2,
767          .depth = DEPTH_ONE_GOB,
768       });
769       P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
770 
771       P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
772 
773       struct nil_extent4d level_extent_sa =
774          nil_image_level_extent_sa(&nil_image, mip_level);
775 
776       P_MTHD(p, NV9097, SET_ZT_SIZE_A);
777       P_NV9097_SET_ZT_SIZE_A(p, level_extent_sa.w);
778       P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.h);
779       P_NV9097_SET_ZT_SIZE_C(p, {
780          .third_dimension  = base_array_layer + layer_count,
781          .control          = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
782       });
783 
784       P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
785 
786       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
787          P_IMMD(p, NVC597, SET_ZT_SPARSE, {
788             .enable = ENABLE_FALSE,
789          });
790       }
791    } else {
792       P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
793    }
794 
795    /* From the Vulkan 1.3.275 spec:
796     *
797     *    "It is legal for a subpass to use no color or depth/stencil
798     *    attachments, either because it has no attachment references or
799     *    because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
800     *    can use shader side effects such as image stores and atomics to
801     *    produce an output. In this case, the subpass continues to use the
802     *    width, height, and layers of the framebuffer to define the dimensions
803     *    of the rendering area, and the rasterizationSamples from each
804     *    pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
805     *    of samples used in rasterization;"
806     *
807     * In the case where we have attachments, we emit SET_ANTI_ALIAS here
808     * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
809     * specifying the sample layout and we want to ensure it matches.  When
810     * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
811     * where we base it on dynamic rasterizationSamples.
812     */
813    if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID) {
814       P_IMMD(p, NV9097, SET_ANTI_ALIAS,
815              nil_to_nv9097_samples_mode(sample_layout));
816    }
817 
818    if (render->flags & VK_RENDERING_RESUMING_BIT)
819       return;
820 
821    uint32_t clear_count = 0;
822    VkClearAttachment clear_att[NVK_MAX_RTS + 1];
823    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
824       const VkRenderingAttachmentInfo *att_info =
825          &pRenderingInfo->pColorAttachments[i];
826       if (att_info->imageView == VK_NULL_HANDLE ||
827           att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
828          continue;
829 
830       clear_att[clear_count++] = (VkClearAttachment) {
831          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
832          .colorAttachment = i,
833          .clearValue = att_info->clearValue,
834       };
835    }
836 
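   /* The extra array slot (hence NVK_MAX_RTS + 1 above) is reserved for a
    * combined depth/stencil clear; it only counts if at least one aspect
    * ends up set.
    */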
837    clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
838    if (pRenderingInfo->pDepthAttachment != NULL &&
839        pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
840        pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
841       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
842       clear_att[clear_count].clearValue.depthStencil.depth =
843          pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
844    }
845    if (pRenderingInfo->pStencilAttachment != NULL &&
846        pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
847        pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
848       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
849       clear_att[clear_count].clearValue.depthStencil.stencil =
850          pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
851    }
852    if (clear_att[clear_count].aspectMask != 0)
853       clear_count++;
854 
855    if (clear_count > 0) {
856       const VkClearRect clear_rect = {
857          .rect = render->area,
858          .baseArrayLayer = 0,
859          .layerCount = render->view_mask ? 1 : render->layer_count,
860       };
861 
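      /* Force rendering on while the load-op clears are recorded, presumably
       * so they still happen when SET_RENDER_ENABLE_C has disabled rendering,
       * and restore the normal render-enable state afterwards.
       */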
862       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
863       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
864 
865       nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
866                               clear_count, clear_att, 1, &clear_rect);
867       p = nvk_cmd_buffer_push(cmd, 2);
868       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
869       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
870    }
871 
872    /* TODO: Attachment clears */
873 }
874 
875 VKAPI_ATTR void VKAPI_CALL
876 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
877 {
878    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
879    struct nvk_rendering_state *render = &cmd->state.gfx.render;
880 
881    bool need_resolve = false;
882 
883    /* Translate render state back to VK for meta */
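   /* nvk_meta_resolve_rendering() below consumes a VkRenderingInfo, so
    * rebuild one from the saved rendering state before we reset it.
    */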
884    VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
885    for (uint32_t i = 0; i < render->color_att_count; i++) {
886       if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
887          need_resolve = true;
888 
889       vk_color_att[i] = (VkRenderingAttachmentInfo) {
890          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
891          .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
892          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
893          .resolveMode = render->color_att[i].resolve_mode,
894          .resolveImageView =
895             nvk_image_view_to_handle(render->color_att[i].resolve_iview),
896          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
897       };
898    }
899 
900    const VkRenderingAttachmentInfo vk_depth_att = {
901       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
902       .imageView = nvk_image_view_to_handle(render->depth_att.iview),
903       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
904       .resolveMode = render->depth_att.resolve_mode,
905       .resolveImageView =
906          nvk_image_view_to_handle(render->depth_att.resolve_iview),
907       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
908    };
909    if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
910       need_resolve = true;
911 
912    const VkRenderingAttachmentInfo vk_stencil_att = {
913       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
914       .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
915       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
916       .resolveMode = render->stencil_att.resolve_mode,
917       .resolveImageView =
918          nvk_image_view_to_handle(render->stencil_att.resolve_iview),
919       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
920    };
921    if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
922       need_resolve = true;
923 
924    const VkRenderingInfo vk_render = {
925       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
926       .renderArea = render->area,
927       .layerCount = render->layer_count,
928       .viewMask = render->view_mask,
929       .colorAttachmentCount = render->color_att_count,
930       .pColorAttachments = vk_color_att,
931       .pDepthAttachment = &vk_depth_att,
932       .pStencilAttachment = &vk_stencil_att,
933    };
934 
935    if (render->flags & VK_RENDERING_SUSPENDING_BIT)
936       need_resolve = false;
937 
938    memset(render, 0, sizeof(*render));
939 
940    if (need_resolve) {
941       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
942       P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
943          .lines = LINES_ALL,
944       });
945 
946       nvk_meta_resolve_rendering(cmd, &vk_render);
947    }
948 }
949 
950 void
951 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
952                              const gl_shader_stage stage,
953                              struct nvk_shader *shader)
954 {
955    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
956 
957    assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
958    if (cmd->state.gfx.shaders[stage] == shader)
959       return;
960 
961    cmd->state.gfx.shaders[stage] = shader;
962    cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
963 
964    /* When a pipeline with tessellation shaders is bound, we need to re-emit
965     * the tessellation parameters in nvk_flush_ts_state(), as the domain
966     * origin can be dynamic.
967     */
968    if (stage == MESA_SHADER_TESS_EVAL)
969       BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN);
970 
971    /* Emitting SET_HYBRID_ANTI_ALIAS_CONTROL requires the fragment shader */
972    if (stage == MESA_SHADER_FRAGMENT)
973       BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
974 }
975 
976 static uint32_t
977 mesa_to_nv9097_shader_type(gl_shader_stage stage)
978 {
979    static const uint32_t mesa_to_nv9097[] = {
980       [MESA_SHADER_VERTEX]    = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
981       [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
982       [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
983       [MESA_SHADER_GEOMETRY]  = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
984       [MESA_SHADER_FRAGMENT]  = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
985    };
986    assert(stage < ARRAY_SIZE(mesa_to_nv9097));
987    return mesa_to_nv9097[stage];
988 }
989 
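/* Constant buffers are bound one group per graphics shader stage (see the
 * BIND_GROUP_CONSTANT_BUFFER loop in nvk_push_draw_state_init()), so the
 * bind group index is simply the Mesa stage index.
 */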
990 static uint32_t
991 nvk_pipeline_bind_group(gl_shader_stage stage)
992 {
993    return stage;
994 }
995 
996 static void
997 nvk_flush_shaders(struct nvk_cmd_buffer *cmd)
998 {
999    if (cmd->state.gfx.shaders_dirty == 0)
1000       return;
1001 
1002    /* Map shader types to shaders */
1003    struct nvk_shader *type_shader[6] = { NULL, };
1004    uint32_t types_dirty = 0;
1005 
1006    const uint32_t gfx_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1007                                BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1008                                BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1009                                BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1010                                BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1011 
1012    u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1013       uint32_t type = mesa_to_nv9097_shader_type(stage);
1014       types_dirty |= BITFIELD_BIT(type);
1015 
1016       /* Only copy non-NULL shaders because mesh/task alias with vertex and
1017        * tessellation stages.
1018        */
1019       if (cmd->state.gfx.shaders[stage] != NULL) {
1020          assert(type < ARRAY_SIZE(type_shader));
1021          assert(type_shader[type] == NULL);
1022          type_shader[type] = cmd->state.gfx.shaders[stage];
1023       }
1024    }
1025 
1026    u_foreach_bit(type, types_dirty) {
1027       struct nvk_shader *shader = type_shader[type];
1028 
1029       /* We always map index == type */
1030       const uint32_t idx = type;
1031 
1032       struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1033       P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
1034          .enable  = shader != NULL,
1035          .type    = type,
1036       });
1037 
1038       if (shader == NULL)
1039          continue;
1040 
1041       uint64_t addr = shader->hdr_addr;
1042       if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1043          P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
1044          P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
1045          P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
1046       } else {
1047          assert(addr < 0xffffffff);
1048          P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
1049       }
1050 
1051       P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
1052       P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
1053       P_NVC397_SET_PIPELINE_BINDING(p, idx,
1054          nvk_pipeline_bind_group(shader->info.stage));
1055 
1056       if (shader->info.stage == MESA_SHADER_FRAGMENT) {
1057          p = nvk_cmd_buffer_push(cmd, 9);
1058 
1059          P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
1060          P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
1061             .fraction_of_spm_register_file_per_subtile         = 0x10,
1062             .fraction_of_spm_pixel_output_buffer_per_subtile   = 0x40,
1063             .fraction_of_spm_triangle_ram_per_subtile          = 0x16,
1064             .fraction_of_max_quads_per_subtile                 = 0x20,
1065          });
1066          P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);
1067 
1068          P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
1069                 shader->info.fs.early_fragment_tests);
1070 
1071          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1072             P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
1073                    shader->info.fs.post_depth_coverage);
1074          } else {
1075             assert(!shader->info.fs.post_depth_coverage);
1076          }
1077 
1078          P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
1079             .z_min_unbounded_enable = shader->info.fs.writes_depth,
1080             .z_max_unbounded_enable = shader->info.fs.writes_depth,
1081          });
1082       }
1083    }
1084 
1085    const uint32_t vtg_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) |
1086                                BITFIELD_BIT(MESA_SHADER_TESS_EVAL) |
1087                                BITFIELD_BIT(MESA_SHADER_GEOMETRY);
1088    const uint32_t vtgm_stages = vtg_stages | BITFIELD_BIT(MESA_SHADER_MESH);
1089 
1090    if (cmd->state.gfx.shaders_dirty & vtg_stages) {
1091       struct nak_xfb_info *xfb = NULL;
1092       u_foreach_bit(stage, vtg_stages) {
1093          if (cmd->state.gfx.shaders[stage] != NULL)
1094             xfb = &cmd->state.gfx.shaders[stage]->info.vtg.xfb;
1095       }
1096 
1097       if (xfb == NULL) {
1098          struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1099          for (uint8_t b = 0; b < 4; b++)
1100             P_IMMD(p, NV9097, SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(b), 0);
1101       } else {
1102          for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
1103             const uint8_t attr_count = xfb->attr_count[b];
1104             /* upload packed varying indices in multiples of 4 bytes */
1105             const uint32_t n = DIV_ROUND_UP(attr_count, 4);
1106 
1107             struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 + n);
1108 
1109             P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
1110             P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
1111             P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
1112             P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
1113 
1114             if (n > 0) {
1115                P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
1116                P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
1117             }
1118          }
1119       }
1120    }
1121 
1122    if (cmd->state.gfx.shaders_dirty & vtgm_stages) {
1123       struct nvk_shader *last_vtgm = NULL;
1124       u_foreach_bit(stage, vtgm_stages) {
1125          if (cmd->state.gfx.shaders[stage] != NULL)
1126             last_vtgm = cmd->state.gfx.shaders[stage];
1127       }
1128 
1129       struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
1130 
1131       P_IMMD(p, NV9097, SET_RT_LAYER, {
1132          .v       = 0,
1133          .control = last_vtgm->info.vtg.writes_layer ?
1134                     CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
1135                     CONTROL_V_SELECTS_LAYER,
1136       });
1137 
1138       P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, {
1139          .enable  = last_vtgm->info.vtg.writes_point_size,
1140          .slot    = 0,
1141       });
1142 
1143       const uint8_t clip_enable = last_vtgm->info.vtg.clip_enable;
1144       const uint8_t cull_enable = last_vtgm->info.vtg.cull_enable;
1145       P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
1146          .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
1147          .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
1148          .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
1149          .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
1150          .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
1151          .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
1152          .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
1153          .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
1154       });
1155       P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
1156          .plane0 = (cull_enable >> 0) & 1,
1157          .plane1 = (cull_enable >> 1) & 1,
1158          .plane2 = (cull_enable >> 2) & 1,
1159          .plane3 = (cull_enable >> 3) & 1,
1160          .plane4 = (cull_enable >> 4) & 1,
1161          .plane5 = (cull_enable >> 5) & 1,
1162          .plane6 = (cull_enable >> 6) & 1,
1163          .plane7 = (cull_enable >> 7) & 1,
1164       });
1165    }
1166 
1167    cmd->state.gfx.shaders_dirty = 0;
1168 }
1169 
1170 static void
1171 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1172 {
1173    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1174    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1175    const struct vk_dynamic_graphics_state *dyn =
1176       &cmd->vk.dynamic_graphics_state;
1177 
1178    struct nv_push *p = nvk_cmd_buffer_push(cmd, 256);
1179 
1180    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1181        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1182       u_foreach_bit(a, dyn->vi->attributes_valid) {
1183          const struct nvk_va_format *fmt =
1184             nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1185 
1186          P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1187             .stream                 = dyn->vi->attributes[a].binding,
1188             .offset                 = dyn->vi->attributes[a].offset,
1189             .component_bit_widths   = fmt->bit_widths,
1190             .numerical_type         = fmt->type,
1191             .swap_r_and_b           = fmt->swap_rb,
1192          });
1193       }
1194 
1195       u_foreach_bit(b, dyn->vi->bindings_valid) {
1196          const bool instanced = dyn->vi->bindings[b].input_rate ==
1197                                 VK_VERTEX_INPUT_RATE_INSTANCE;
1198          P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1199          P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1200             dyn->vi->bindings[b].divisor);
1201       }
1202    }
1203 
1204    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1205        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1206       for (uint32_t b = 0; b < 32; b++) {
1207          P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
1208             .stride = dyn->vi_binding_strides[b],
1209             .enable = (dyn->vi->bindings_valid & BITFIELD_BIT(b)) != 0,
1210          });
1211       }
1212    }
1213 }
1214 
1215 static void
1216 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1217 {
1218    const struct vk_dynamic_graphics_state *dyn =
1219       &cmd->vk.dynamic_graphics_state;
1220 
1221    /** Nothing to do for MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY */
1222 
1223    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1224       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1225       P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1226              dyn->ia.primitive_restart_enable);
1227    }
1228 }
1229 
1230 static void
1231 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1232 {
1233    const struct vk_dynamic_graphics_state *dyn =
1234       &cmd->vk.dynamic_graphics_state;
1235    struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1236 
1237    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1238       /* The hardware gets grumpy if we set this to 0 so make sure we set it
1239        * to at least 1 in case it's dirty but uninitialized.
1240        */
1241       P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1242    }
1243 
1244    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1245       const struct nvk_shader *shader =
1246          cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
1247 
1248       if (shader != NULL) {
1249          enum nak_ts_prims prims = shader->info.ts.prims;
1250          /* When the origin is lower-left, we have to flip the winding order */
1251          if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
1252             if (prims == NAK_TS_PRIMS_TRIANGLES_CW)
1253                prims = NAK_TS_PRIMS_TRIANGLES_CCW;
1254             else if (prims == NAK_TS_PRIMS_TRIANGLES_CCW)
1255                prims = NAK_TS_PRIMS_TRIANGLES_CW;
1256          }
1257          P_MTHD(p, NV9097, SET_TESSELLATION_PARAMETERS);
1258          P_NV9097_SET_TESSELLATION_PARAMETERS(p, {
1259             shader->info.ts.domain,
1260             shader->info.ts.spacing,
1261             prims
1262          });
1263       }
1264    }
1265 }
1266 
1267 static void
1268 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1269 {
1270    const struct vk_dynamic_graphics_state *dyn =
1271       &cmd->vk.dynamic_graphics_state;
1272 
1273    struct nv_push *p =
1274       nvk_cmd_buffer_push(cmd, 16 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1275 
1276    /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1277 
1278    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1279        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1280       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1281          const VkViewport *vp = &dyn->vp.viewports[i];
1282 
1283          /* These exactly match the spec values.  Nvidia hardware oddities
1284           * are accounted for later.
1285           */
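         /* Per the Vulkan spec, x_f = (p_x / 2) * x_d + o_x and
          * y_f = (p_y / 2) * y_d + o_y, while z_f = p_z * z_d + o_z, so the
          * hardware SCALE_X/Y registers below take half of p_x and p_y and
          * SCALE_Z takes p_z directly.
          */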
1286          const float o_x = vp->x + 0.5f * vp->width;
1287          const float o_y = vp->y + 0.5f * vp->height;
1288          const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1289                            vp->minDepth :
1290                            (vp->maxDepth + vp->minDepth) * 0.5f;
1291 
1292          const float p_x = vp->width;
1293          const float p_y = vp->height;
1294          const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1295                            vp->maxDepth - vp->minDepth :
1296                            (vp->maxDepth - vp->minDepth) * 0.5f;
1297 
1298          P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1299          P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1300          P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1301          P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1302 
1303          P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1304          P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1305          P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1306 
1307          float xmin = vp->x;
1308          float xmax = vp->x + vp->width;
1309          float ymin = MIN2(vp->y, vp->y + vp->height);
1310          float ymax = MAX2(vp->y, vp->y + vp->height);
1311          float zmin = MIN2(vp->minDepth, vp->maxDepth);
1312          float zmax = MAX2(vp->minDepth, vp->maxDepth);
1313          assert(xmin <= xmax && ymin <= ymax);
1314 
1315          const float max_dim = (float)0xffff;
1316          xmin = CLAMP(xmin, 0, max_dim);
1317          xmax = CLAMP(xmax, 0, max_dim);
1318          ymin = CLAMP(ymin, 0, max_dim);
1319          ymax = CLAMP(ymax, 0, max_dim);
1320 
1321          P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1322          P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1323             .x0      = xmin,
1324             .width   = xmax - xmin,
1325          });
1326          P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1327             .y0      = ymin,
1328             .height  = ymax - ymin,
1329          });
1330          P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1331          P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1332 
1333          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1334             P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1335                .x = X_POS_X,
1336                .y = Y_POS_Y,
1337                .z = Z_POS_Z,
1338                .w = W_POS_W,
1339             });
1340          }
1341       }
1342    }
1343 
1344    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1345       P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1346              dyn->vp.depth_clip_negative_one_to_one ?
1347              RANGE_NEGATIVE_W_TO_POSITIVE_W :
1348              RANGE_ZERO_TO_POSITIVE_W);
1349    }
1350 
1351    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1352       for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1353          P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1354    }
1355 
1356    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1357       for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1358          const VkRect2D *s = &dyn->vp.scissors[i];
1359 
1360          const uint32_t xmin = MIN2(16384, s->offset.x);
1361          const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1362          const uint32_t ymin = MIN2(16384, s->offset.y);
1363          const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1364 
1365          P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1366          P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1367          P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1368             .xmin = xmin,
1369             .xmax = xmax,
1370          });
1371          P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1372             .ymin = ymin,
1373             .ymax = ymax,
1374          });
1375       }
1376    }
1377 }
1378 
1379 static uint32_t
1380 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1381 {
1382    ASSERTED uint16_t vk_to_nv9097[] = {
1383       [VK_POLYGON_MODE_FILL]  = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1384       [VK_POLYGON_MODE_LINE]  = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1385       [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1386    };
1387    assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1388 
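   /* VK_POLYGON_MODE_FILL/LINE/POINT are 0/1/2 and the corresponding class
    * enums are 0x1b02/0x1b01/0x1b00, so the translation is pure arithmetic;
    * the table above only exists to assert the mapping in debug builds.
    */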
1389    uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1390    assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1391    return nv9097_mode;
1392 }
1393 
1394 static uint32_t
1395 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1396 {
1397    static const uint16_t vk_to_nv9097[] = {
1398       [VK_CULL_MODE_FRONT_BIT]      = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1399       [VK_CULL_MODE_BACK_BIT]       = NV9097_OGL_SET_CULL_FACE_V_BACK,
1400       [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1401    };
1402    assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1403    return vk_to_nv9097[vk_cull_mode];
1404 }
1405 
1406 static uint32_t
1407 vk_to_nv9097_front_face(VkFrontFace vk_face)
1408 {
1409    /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1410     * convention in which framebuffer coordinates always start in the upper
1411     * left while OpenGL has framebuffer coordinates starting in the lower
1412     * left.  Therefore, we want the reverse of the hardware enum name.
1413     */
1414    ASSERTED static const uint16_t vk_to_nv9097[] = {
1415       [VK_FRONT_FACE_COUNTER_CLOCKWISE]   = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1416       [VK_FRONT_FACE_CLOCKWISE]           = NV9097_OGL_SET_FRONT_FACE_V_CW,
1417    };
1418    assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1419 
1420    uint32_t nv9097_face = 0x900 | (1 - vk_face);
1421    assert(nv9097_face == vk_to_nv9097[vk_face]);
1422    return nv9097_face;
1423 }
1424 
1425 static uint32_t
1426 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1427 {
1428    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1429                  NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1430    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1431                  NV9097_SET_PROVOKING_VERTEX_V_LAST);
1432    return vk_mode;
1433 }
1434 
1435 static void
1436 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
1437 {
1438    struct nv_push *p = nvk_cmd_buffer_push(cmd, 40);
1439 
1440    const struct vk_dynamic_graphics_state *dyn =
1441       &cmd->vk.dynamic_graphics_state;
1442 
1443    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
1444       P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
1445 
1446    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
1447        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
1448       const bool z_clamp = dyn->rs.depth_clamp_enable;
1449       const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
1450       P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
1451          /* TODO: Fix pre-Volta
1452           *
1453           * This probably involves a few macros, one which stashes viewport
1454           * min/maxDepth in scratch states and one which goes here and
1455           * emits either min/maxDepth or -/+INF as needed.
1456           */
1457          .min_z_zero_max_z_one = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
1458          .z_clip_range = nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A
1459                          ? ((z_clamp || z_clip)
1460                             ? Z_CLIP_RANGE_MIN_Z_MAX_Z
1461                             : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
1462                          : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
1463 
1464          .pixel_min_z = PIXEL_MIN_Z_CLAMP,
1465          .pixel_max_z = PIXEL_MAX_Z_CLAMP,
1466 
1467          .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
1468          .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
1469          .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
1470                                  : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
1471 
1472          /* We clip depth with the geometry clipper to ensure that it gets
1473           * clipped before depth bias is applied.  If we leave it up to the
1474           * rasterizer clipper (pixel_min/max_z = CLIP), it will clip according
1475           * to the post-bias Z value which is wrong.  In order to always get
1476           * the geometry clipper, we need to set a tight guardband
1477           * (geometry_guardband_z = SCALE_1).
1478           */
1479          .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
1480                                         : GEOMETRY_GUARDBAND_Z_SCALE_256,
1481       });
1482    }
1483 
1484    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
1485       uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
1486       P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
1487       P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
1488       P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
1489    }
1490 
1491    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
1492       P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
1493 
1494       if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
1495          uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
1496          P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
1497       }
1498    }
1499 
1500    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
1501       P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
1502          vk_to_nv9097_front_face(dyn->rs.front_face));
1503    }
1504 
1505    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
1506       P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
1507              vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
1508    }
1509 
1510    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
1511       P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
1512       P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
1513       P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
1514       P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
1515    }
1516 
1517    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
1518       switch (dyn->rs.depth_bias.representation) {
1519       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
1520          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
1521                 DEPTH_FORMAT_DEPENDENT_TRUE);
1522          break;
1523       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
1524          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
1525                 DEPTH_FORMAT_DEPENDENT_FALSE);
1526          break;
1527       case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
1528       default:
1529          unreachable("Unsupported depth bias representation");
1530       }
1531       /* TODO: The blob multiplies by 2 for some reason. We don't. */
1532       P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant));
1533       P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope));
1534       P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
1535    }
1536 
1537    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
1538       P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
1539       P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
1540       P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
1541    }
1542 
1543    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
1544       switch (dyn->rs.line.mode) {
1545       case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
1546       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
1547          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
1548          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
1549          break;
1550 
1551       case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
1552          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
1553          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
1554          break;
1555 
1556       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
1557          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
1558          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
1559          break;
1560 
1561       default:
1562          unreachable("Invalid line rasterization mode");
1563       }
1564    }
1565 
1566    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
1567       P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
1568 
1569    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
1570       /* map factor from [1,256] to [0, 255] */
1571       uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
1572       P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
1573          .factor  = stipple_factor,
1574          .pattern = dyn->rs.line.stipple.pattern,
1575       });
1576    }
1577 
1578    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
1579       P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
1580 }
1581 
1582 static VkSampleLocationEXT
1583 vk_sample_location(const struct vk_sample_locations_state *sl,
1584                    uint32_t x, uint32_t y, uint32_t s)
1585 {
1586    x = x % sl->grid_size.width;
1587    y = y % sl->grid_size.height;
1588 
1589    return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
1590 }
1591 
1592 static struct nvk_sample_location
1593 vk_to_nvk_sample_location(VkSampleLocationEXT loc)
1594 {
1595    return (struct nvk_sample_location) {
1596       .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
1597       .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
1598    };
1599 }
1600 
1601 static void
1602 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
1603 {
1604    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1605    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
1606    const struct vk_dynamic_graphics_state *dyn =
1607       &cmd->vk.dynamic_graphics_state;
1608 
1609    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
1610       struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1611 
1612       /* When we don't have any attachments, we can't know the sample count
1613        * from the render pass so we need to emit SET_ANTI_ALIAS here.  See the
1614        * comment in nvk_BeginRendering() for more details.
1615        */
1616       if (render->samples == 0) {
1617          /* Multisample information MAY be missing (rasterizationSamples == 0)
1618           * if rasterizer discard is enabled.  However, this isn't valid in
1619           * the hardware so always use at least one sample.
1620           */
1621          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
1622          enum nil_sample_layout layout = nil_choose_sample_layout(samples);
1623          P_IMMD(p, NV9097, SET_ANTI_ALIAS, nil_to_nv9097_samples_mode(layout));
1624       } else {
1625          /* Multisample information MAY be missing (rasterizationSamples == 0)
1626           * if rasterizer discard is enabled.
1627           */
1628          assert(dyn->ms.rasterization_samples == 0 ||
1629                 dyn->ms.rasterization_samples == render->samples);
1630       }
1631 
1632       struct nvk_shader *fs = cmd->state.gfx.shaders[MESA_SHADER_FRAGMENT];
1633       const float min_sample_shading = fs != NULL ? fs->min_sample_shading : 0;
1634       uint32_t min_samples = ceilf(dyn->ms.rasterization_samples *
1635                                    min_sample_shading);
1636       min_samples = util_next_power_of_two(MAX2(1, min_samples));
1637 
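      /* Illustrative example: 4x MSAA with minSampleShading = 0.5 gives
       * min_samples = 2, i.e. the fragment shader runs in two passes with
       * per-pass centroids.
       */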
1638       P_IMMD(p, NV9097, SET_HYBRID_ANTI_ALIAS_CONTROL, {
1639          .passes = min_samples,
1640          .centroid = min_samples > 1 ? CENTROID_PER_PASS
1641                                      : CENTROID_PER_FRAGMENT,
1642       });
1643    }
1644 
1645    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
1646        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
1647       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1648       P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
1649          .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
1650          .alpha_to_one      = dyn->ms.alpha_to_one_enable,
1651       });
1652    }
1653 
1654    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
1655        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
1656       const struct vk_sample_locations_state *sl;
1657       if (dyn->ms.sample_locations_enable) {
1658          sl = dyn->ms.sample_locations;
1659       } else {
1660          sl = vk_standard_sample_locations_state(dyn->ms.rasterization_samples);
1661       }
1662 
1663       for (uint32_t i = 0; i < sl->per_pixel; i++) {
1664          desc->root.draw.sample_locations[i] =
1665             vk_to_nvk_sample_location(sl->locations[i]);
1666       }
1667 
1668       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1669          struct nvk_sample_location loc[16];
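         /* The hardware takes 16 sample positions, packed four (x, y) pairs
          * per register.  When there are fewer than 16 samples per pixel,
          * the extra entries are filled with the locations of neighboring
          * pixels: the index math below walks a grid two pixels wide and
          * vk_sample_location() wraps it to the app-provided grid size.
          */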
1670          for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
1671             const uint32_t s = n % sl->per_pixel;
1672             const uint32_t px = n / sl->per_pixel;
1673             const uint32_t x = px % 2;
1674             const uint32_t y = px / 2;
1675 
1676             loc[n] = vk_to_nvk_sample_location(vk_sample_location(sl, x, y, s));
1677          }
1678 
1679          struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
1680 
1681          P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
1682          for (uint32_t i = 0; i < 4; i++) {
1683             P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
1684                .x0 = loc[i * 4 + 0].x_u4,
1685                .y0 = loc[i * 4 + 0].y_u4,
1686                .x1 = loc[i * 4 + 1].x_u4,
1687                .y1 = loc[i * 4 + 1].y_u4,
1688                .x2 = loc[i * 4 + 2].x_u4,
1689                .y2 = loc[i * 4 + 2].y_u4,
1690                .x3 = loc[i * 4 + 3].x_u4,
1691                .y3 = loc[i * 4 + 3].y_u4,
1692             });
1693          }
1694       }
1695    }
1696 
1697    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
1698       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
1699       P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
1700       P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
1701       P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
1702       P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
1703       P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
1704    }
1705 }
1706 
1707 static uint32_t
1708 vk_to_nv9097_compare_op(VkCompareOp vk_op)
1709 {
1710    ASSERTED static const uint16_t vk_to_nv9097[] = {
1711       [VK_COMPARE_OP_NEVER]            = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
1712       [VK_COMPARE_OP_LESS]             = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
1713       [VK_COMPARE_OP_EQUAL]            = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
1714       [VK_COMPARE_OP_LESS_OR_EQUAL]    = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
1715       [VK_COMPARE_OP_GREATER]          = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
1716       [VK_COMPARE_OP_NOT_EQUAL]        = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
1717       [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
1718       [VK_COMPARE_OP_ALWAYS]           = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
1719    };
1720    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1721 
1722    uint32_t nv9097_op = 0x200 | vk_op;
1723    assert(nv9097_op == vk_to_nv9097[vk_op]);
1724    return nv9097_op;
1725 }
1726 
1727 static uint32_t
1728 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
1729 {
1730 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
1731    ASSERTED static const uint16_t vk_to_nv9097[] = {
1732       OP(KEEP,                D3D_KEEP),
1733       OP(ZERO,                D3D_ZERO),
1734       OP(REPLACE,             D3D_REPLACE),
1735       OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
1736       OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
1737       OP(INVERT,              D3D_INVERT),
1738       OP(INCREMENT_AND_WRAP,  D3D_INCR),
1739       OP(DECREMENT_AND_WRAP,  D3D_DECR),
1740    };
1741    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1742 #undef OP
1743 
1744    uint32_t nv9097_op = vk_op + 1;
1745    assert(nv9097_op == vk_to_nv9097[vk_op]);
1746    return nv9097_op;
1747 }
1748 
1749 static void
1750 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
1751 {
1752    struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
1753 
1754    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
1755    const struct vk_dynamic_graphics_state *dyn =
1756       &cmd->vk.dynamic_graphics_state;
1757 
1758    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
1759       bool enable = dyn->ds.depth.test_enable &&
1760                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1761       P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
1762    }
1763 
1764    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
1765       bool enable = dyn->ds.depth.write_enable &&
1766                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1767       P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
1768    }
1769 
1770    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
1771       const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
1772       P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
1773    }
1774 
1775    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
1776       bool enable = dyn->ds.depth.bounds_test.enable &&
1777                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
1778       P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
1779    }
1780 
1781    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
1782       P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
1783       P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
1784       P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
1785    }
1786 
1787    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
1788       bool enable = dyn->ds.stencil.test_enable &&
1789                     render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
1790       P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
1791    }
1792 
1793    const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
1794    const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
1795    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
1796       P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
1797       P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
1798       P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
1799       P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
1800       P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
1801 
1802       P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
1803       P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
1804       P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
1805       P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
1806       P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
1807    }
1808 
1809    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
1810       P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
1811       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
1812    }
1813 
1814    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
1815       P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
1816       P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
1817    }
1818 
1819    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
1820       P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
1821       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
1822    }
1823 }
1824 
1825 static uint32_t
1826 vk_to_nv9097_logic_op(VkLogicOp vk_op)
1827 {
1828    ASSERTED uint16_t vk_to_nv9097[] = {
1829       [VK_LOGIC_OP_CLEAR]           = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
1830       [VK_LOGIC_OP_AND]             = NV9097_SET_LOGIC_OP_FUNC_V_AND,
1831       [VK_LOGIC_OP_AND_REVERSE]     = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
1832       [VK_LOGIC_OP_COPY]            = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
1833       [VK_LOGIC_OP_AND_INVERTED]    = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
1834       [VK_LOGIC_OP_NO_OP]           = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
1835       [VK_LOGIC_OP_XOR]             = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
1836       [VK_LOGIC_OP_OR]              = NV9097_SET_LOGIC_OP_FUNC_V_OR,
1837       [VK_LOGIC_OP_NOR]             = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
1838       [VK_LOGIC_OP_EQUIVALENT]      = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
1839       [VK_LOGIC_OP_INVERT]          = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
1840       [VK_LOGIC_OP_OR_REVERSE]      = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
1841       [VK_LOGIC_OP_COPY_INVERTED]   = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
1842       [VK_LOGIC_OP_OR_INVERTED]     = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
1843       [VK_LOGIC_OP_NAND]            = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
1844       [VK_LOGIC_OP_SET]             = NV9097_SET_LOGIC_OP_FUNC_V_SET,
1845    };
1846    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1847 
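   /* The sixteen VkLogicOp values match the class enums, which are simply
    * 0x1500..0x150f in the same order, so the translation is a single OR;
    * the table is kept purely for the assert.
    */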
1848    uint32_t nv9097_op = 0x1500 | vk_op;
1849    assert(nv9097_op == vk_to_nv9097[vk_op]);
1850    return nv9097_op;
1851 }
1852 
1853 static uint32_t
1854 vk_to_nv9097_blend_op(VkBlendOp vk_op)
1855 {
1856 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
1857    ASSERTED uint16_t vk_to_nv9097[] = {
1858       OP(ADD,              FUNC_ADD),
1859       OP(SUBTRACT,         FUNC_SUBTRACT),
1860       OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
1861       OP(MIN,              MIN),
1862       OP(MAX,              MAX),
1863    };
1864    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
1865 #undef OP
1866 
1867    return vk_to_nv9097[vk_op];
1868 }
1869 
1870 static uint32_t
1871 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
1872 {
1873 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
1874    NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
1875    ASSERTED uint16_t vk_to_nv9097[] = {
1876       FACTOR(ZERO,                     OGL_ZERO),
1877       FACTOR(ONE,                      OGL_ONE),
1878       FACTOR(SRC_COLOR,                OGL_SRC_COLOR),
1879       FACTOR(ONE_MINUS_SRC_COLOR,      OGL_ONE_MINUS_SRC_COLOR),
1880       FACTOR(DST_COLOR,                OGL_DST_COLOR),
1881       FACTOR(ONE_MINUS_DST_COLOR,      OGL_ONE_MINUS_DST_COLOR),
1882       FACTOR(SRC_ALPHA,                OGL_SRC_ALPHA),
1883       FACTOR(ONE_MINUS_SRC_ALPHA,      OGL_ONE_MINUS_SRC_ALPHA),
1884       FACTOR(DST_ALPHA,                OGL_DST_ALPHA),
1885       FACTOR(ONE_MINUS_DST_ALPHA,      OGL_ONE_MINUS_DST_ALPHA),
1886       FACTOR(CONSTANT_COLOR,           OGL_CONSTANT_COLOR),
1887       FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
1888       FACTOR(CONSTANT_ALPHA,           OGL_CONSTANT_ALPHA),
1889       FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
1890       FACTOR(SRC_ALPHA_SATURATE,       OGL_SRC_ALPHA_SATURATE),
1891       FACTOR(SRC1_COLOR,               OGL_SRC1COLOR),
1892       FACTOR(ONE_MINUS_SRC1_COLOR,     OGL_INVSRC1COLOR),
1893       FACTOR(SRC1_ALPHA,               OGL_SRC1ALPHA),
1894       FACTOR(ONE_MINUS_SRC1_ALPHA,     OGL_INVSRC1ALPHA),
1895    };
1896    assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
1897 #undef FACTOR
1898 
1899    return vk_to_nv9097[vk_factor];
1900 }
1901 
1902 void
1903 nvk_mme_set_write_mask(struct mme_builder *b)
1904 {
1905    struct mme_value count = mme_load(b);
1906    struct mme_value mask = mme_load(b);
1907 
1908    /*
1909     * mask is a bit field
1910     *
1911     * attachment index 88887777666655554444333322221111
1912     * component        abgrabgrabgrabgrabgrabgrabgrabgr
1913    */
1914 
1915    struct mme_value common_mask = mme_mov(b, mme_imm(1));
1916    struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
1917    struct mme_value i = mme_mov(b, mme_zero());
1918 
1919    mme_while(b, ine, i, count) {
1920       /*
1921          We call NV9097_SET_CT_WRITE per attachment. It expects a value of the form:
1922          0x0000 0000 0000 0000 000a 000b 000g 000r
1923 
1924          So for i=0 a mask of
1925          0x0000 0000 0000 0000 0000 0000 0000 1111
1926          becomes
1927          0x0000 0000 0000 0000 0001 0001 0001 0001
1928       */
1929 
1930       struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
1931       mme_merge_to(b, val, val, mask, 4, 1, 1);
1932       mme_merge_to(b, val, val, mask, 8, 1, 2);
1933       mme_merge_to(b, val, val, mask, 12, 1, 3);
1934 
1935       mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
1936       mme_emit(b, val);
1937       mme_free_reg(b, val);
1938 
1939       /* Check if all masks are common */
1940       struct mme_value temp = mme_add(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
1941       mme_if(b, ine, first, temp) {
1942          mme_mov_to(b, common_mask, mme_zero());
1943       }
1944       mme_free_reg(b, temp);
1945 
1946       mme_srl_to(b, mask, mask, mme_imm(4));
1947 
1948       mme_add_to(b, i, i, mme_imm(1));
1949    }
1950 
1951    mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
1952    mme_emit(b, common_mask);
1953 }
1954 
1955 static void
1956 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
1957 {
1958    struct nvk_rendering_state *render = &cmd->state.gfx.render;
1959    const struct vk_dynamic_graphics_state *dyn =
1960       &cmd->vk.dynamic_graphics_state;
1961 
1962    struct nv_push *p =
1963       nvk_cmd_buffer_push(cmd, 13 + 10 * render->color_att_count);
1964 
1965    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
1966       P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
1967 
1968    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
1969       const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
1970       P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
1971    }
1972 
1973    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
1974       for (uint8_t a = 0; a < render->color_att_count; a++) {
1975          P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
1976       }
1977    }
1978 
1979    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
1980       for (uint8_t a = 0; a < render->color_att_count; a++) {
1981          const struct vk_color_blend_attachment_state *att =
1982             &dyn->cb.attachments[a];
1983          P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
1984          P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
1985          P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
1986                vk_to_nv9097_blend_op(att->color_blend_op));
1987          P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
1988                vk_to_nv9097_blend_factor(att->src_color_blend_factor));
1989          P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
1990                vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
1991          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
1992                vk_to_nv9097_blend_op(att->alpha_blend_op));
1993          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
1994                vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
1995          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
1996                vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
1997       }
1998    }
1999 
2000    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
2001        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
2002        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS)) {
2003       uint32_t color_write_enables = 0x0;
2004       for (uint8_t a = 0; a < render->color_att_count; a++) {
2005          if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
2006             color_write_enables |= 0xf << (4 * a);
2007       }
2008 
2009       uint32_t cb_att_write_mask = 0x0;
2010       for (uint8_t a = 0; a < render->color_att_count; a++)
2011          cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
2012 
2013       uint32_t rp_att_write_mask = 0x0;
2014       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
2015          if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
2016             rp_att_write_mask |= 0xf << (4 * a);
2017       }
2018 
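      /* A channel is written only if it is enabled by all three sources:
       * the dynamic color write enables, the per-attachment write mask, and
       * the render-pass attachment mask.  The ANDed value is decoded by the
       * NVK_MME_SET_WRITE_MASK macro above.
       */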
2019       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
2020       P_INLINE_DATA(p, render->color_att_count);
2021       P_INLINE_DATA(p, color_write_enables &
2022                        cb_att_write_mask &
2023                        rp_att_write_mask);
2024    }
2025 
2026    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
2027       P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
2028       P_NV9097_SET_BLEND_CONST_RED(p,     fui(dyn->cb.blend_constants[0]));
2029       P_NV9097_SET_BLEND_CONST_GREEN(p,   fui(dyn->cb.blend_constants[1]));
2030       P_NV9097_SET_BLEND_CONST_BLUE(p,    fui(dyn->cb.blend_constants[2]));
2031       P_NV9097_SET_BLEND_CONST_ALPHA(p,   fui(dyn->cb.blend_constants[3]));
2032    }
2033 }
2034 
2035 static void
2036 nvk_flush_dynamic_state(struct nvk_cmd_buffer *cmd)
2037 {
2038    struct vk_dynamic_graphics_state *dyn =
2039       &cmd->vk.dynamic_graphics_state;
2040 
2041    if (!vk_dynamic_graphics_state_any_dirty(dyn))
2042       return;
2043 
2044    nvk_flush_vi_state(cmd);
2045    nvk_flush_ia_state(cmd);
2046    nvk_flush_ts_state(cmd);
2047    nvk_flush_vp_state(cmd);
2048    nvk_flush_rs_state(cmd);
2049 
2050    /* MESA_VK_DYNAMIC_FSR */
2051 
2052    nvk_flush_ms_state(cmd);
2053    nvk_flush_ds_state(cmd);
2054    nvk_flush_cb_state(cmd);
2055 
2056    vk_dynamic_graphics_state_clear_dirty(dyn);
2057 }
2058 
2059 void
2060 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
2061 {
2062    /* The first 4 bits are the group; the remaining bits are the slot */
2063    struct mme_value group_slot = mme_load(b);
2064 
2065    if (b->devinfo->cls_eng3d >= TURING_A) {
2066       struct mme_value64 addr = mme_load_addr64(b);
2067       mme_tu104_read_fifoed(b, addr, mme_imm(3));
2068    }
2069 
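   /* On Turing+, the caller passes the GPU address of the cbuf descriptor
    * and the macro pulls its three dwords through the MME read FIFO.  On
    * older hardware the caller splices those dwords into the pushbuf
    * instead (see nvk_flush_descriptors), so either way the three loads
    * below see addr_lo, addr_hi and size.
    */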
2070    /* Load the descriptor */
2071    struct mme_value addr_lo = mme_load(b);
2072    struct mme_value addr_hi = mme_load(b);
2073    struct mme_value size = mme_load(b);
2074 
2075    struct mme_value cb = mme_alloc_reg(b);
2076    mme_if(b, ieq, size, mme_zero()) {
2077       /* Bottom bit is the valid bit, bits 8:4 are the shader slot */
2078       mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
2079    }
2080 
2081    mme_if(b, ine, size, mme_zero()) {
2082       uint32_t alignment = nvk_min_cbuf_alignment(b->devinfo);
2083       mme_add_to(b, size, size, mme_imm(alignment - 1));
2084       mme_and_to(b, size, size, mme_imm(~(alignment - 1)));
2085 
2086       /* size = min(size, NVK_MAX_CBUF_SIZE) */
2087       assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
2088       struct mme_value is_large =
2089          mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
2090       mme_if(b, ine, is_large, mme_zero()) {
2091          mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
2092       }
2093 
2094       mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
2095       mme_emit(b, size);
2096       mme_emit(b, addr_hi);
2097       mme_emit(b, addr_lo);
2098 
2099       /* Bottom bit is the valid bit, bits 8:4 are the shader slot */
2100       mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
2101    }
2102 
2103    mme_free_reg(b, addr_hi);
2104    mme_free_reg(b, addr_lo);
2105    mme_free_reg(b, size);
2106 
2107    /* The group comes in the bottom 4 bits in group_slot and we need to
2108     * combine it with the method.  However, unlike most array methods with a
2109     * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
2110     * dwords.  This means we need to also shift by 3.
2111     */
2112    struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
2113    mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
2114    mme_emit(b, cb);
2115 }
2116 
2117 static void
2118 nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
2119 {
2120    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2121    struct nvk_physical_device *pdev = nvk_device_physical(dev);
2122    const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
2123    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2124    VkResult result;
2125 
2126    nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
2127 
2128    /* Pre-Pascal, constant buffer sizes need to be 0x100-aligned.  Since we
2129     * simply allocate a buffer and upload data to it, make sure its size is
2130     * 0x100-aligned.
2131     */
2132    STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
2133    assert(sizeof(desc->root) % min_cbuf_alignment == 0);
2134 
2135    void *root_desc_map;
2136    uint64_t root_desc_addr;
2137    result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
2138                                         min_cbuf_alignment,
2139                                         &root_desc_addr, &root_desc_map);
2140    if (unlikely(result != VK_SUCCESS)) {
2141       vk_command_buffer_set_error(&cmd->vk, result);
2142       return;
2143    }
2144 
2145    desc->root.root_desc_addr = root_desc_addr;
2146    memcpy(root_desc_map, &desc->root, sizeof(desc->root));
2147 
2148    /* Find cbuf maps for the 5 cbuf groups */
2149    const struct nvk_shader *cbuf_shaders[5] = { NULL, };
2150    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
2151       const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
2152       if (shader == NULL)
2153          continue;
2154 
2155       uint32_t group = nvk_cbuf_binding_for_stage(stage);
2156       assert(group < ARRAY_SIZE(cbuf_shaders));
2157       cbuf_shaders[group] = shader;
2158    }
2159 
2160    uint32_t root_cbuf_count = 0;
2161    for (uint32_t group = 0; group < ARRAY_SIZE(cbuf_shaders); group++) {
2162       if (cbuf_shaders[group] == NULL)
2163          continue;
2164 
2165       const struct nvk_shader *shader = cbuf_shaders[group];
2166       const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
2167 
2168       for (uint32_t c = 0; c < cbuf_map->cbuf_count; c++) {
2169          const struct nvk_cbuf *cbuf = &cbuf_map->cbufs[c];
2170 
2171          /* We bind these at the very end */
2172          if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
2173             root_cbuf_count++;
2174             continue;
2175          }
2176 
2177          struct nvk_buffer_address ba;
2178          if (nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, shader, cbuf, &ba)) {
2179             assert(ba.base_addr % min_cbuf_alignment == 0);
2180             ba.size = align(ba.size, min_cbuf_alignment);
2181             ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
2182 
2183             struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2184 
2185             if (ba.size > 0) {
2186                P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
2187                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
2188                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
2189                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
2190             }
2191 
2192             P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
2193                .valid = ba.size > 0,
2194                .shader_slot = c,
2195             });
2196          } else {
2197             uint64_t desc_addr =
2198                nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
2199 
2200             if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2201                struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2202 
2203                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
2204                P_INLINE_DATA(p, group | (c << 4));
2205                P_INLINE_DATA(p, desc_addr >> 32);
2206                P_INLINE_DATA(p, desc_addr);
2207             } else {
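               /* Pre-Turing, the macro can't fetch the descriptor from
                * memory itself, so splice the three descriptor dwords from
                * desc_addr straight into the pushbuf; the macro consumes
                * them as if they had been pushed inline.
                */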
2208                struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2209 
2210                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
2211                P_INLINE_DATA(p, group | (c << 4));
2212 
2213                nv_push_update_count(p, 3);
2214                nvk_cmd_buffer_push_indirect(cmd, desc_addr, 3);
2215             }
2216          }
2217       }
2218    }
2219 
2220    /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
2221     * always left pointing at the root descriptor table.  This way draw
2222     * parameters and similar MME root table updates always hit the root
2223     * descriptor table and not some random UBO.
2224     */
2225    struct nv_push *p = nvk_cmd_buffer_push(cmd, 4 + 2 * root_cbuf_count);
2226    P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
2227    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, sizeof(desc->root));
2228    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, root_desc_addr >> 32);
2229    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, root_desc_addr);
2230 
2231    for (uint32_t group = 0; group < ARRAY_SIZE(cbuf_shaders); group++) {
2232       if (cbuf_shaders[group] == NULL)
2233          continue;
2234 
2235       const struct nvk_cbuf_map *cbuf_map = &cbuf_shaders[group]->cbuf_map;
2236 
2237       for (uint32_t c = 0; c < cbuf_map->cbuf_count; c++) {
2238          const struct nvk_cbuf *cbuf = &cbuf_map->cbufs[c];
2239          if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
2240             P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
2241                .valid = VALID_TRUE,
2242                .shader_slot = c,
2243             });
2244          }
2245       }
2246    }
2247 }
2248 
2249 static void
2250 nvk_flush_gfx_state(struct nvk_cmd_buffer *cmd)
2251 {
2252    nvk_flush_shaders(cmd);
2253    nvk_flush_dynamic_state(cmd);
2254    nvk_flush_descriptors(cmd);
2255 }
2256 
2257 static uint32_t
2258 vk_to_nv_index_format(VkIndexType type)
2259 {
2260    switch (type) {
2261    case VK_INDEX_TYPE_UINT16:
2262       return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES;
2263    case VK_INDEX_TYPE_UINT32:
2264       return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES;
2265    case VK_INDEX_TYPE_UINT8_KHR:
2266       return NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE;
2267    default:
2268       unreachable("Invalid index type");
2269    }
2270 }
2271 
2272 VKAPI_ATTR void VKAPI_CALL
2273 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
2274                            VkBuffer _buffer,
2275                            VkDeviceSize offset,
2276                            VkDeviceSize size,
2277                            VkIndexType indexType)
2278 {
2279    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2280    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2281 
2282    struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
2283 
2284    uint64_t addr, range;
2285    if (buffer != NULL && size > 0) {
2286       addr = nvk_buffer_address(buffer, offset);
2287       range = vk_buffer_range(&buffer->vk, offset, size);
2288    } else {
2289       range = addr = 0;
2290    }
2291 
2292    P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_INDEX,
2293           vk_index_to_restart(indexType));
2294 
2295    P_MTHD(p, NV9097, SET_INDEX_BUFFER_A);
2296    P_NV9097_SET_INDEX_BUFFER_A(p, addr >> 32);
2297    P_NV9097_SET_INDEX_BUFFER_B(p, addr);
2298 
2299    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2300       P_MTHD(p, NVC597, SET_INDEX_BUFFER_SIZE_A);
2301       P_NVC597_SET_INDEX_BUFFER_SIZE_A(p, range >> 32);
2302       P_NVC597_SET_INDEX_BUFFER_SIZE_B(p, range);
2303    } else {
2304       /* TODO: What about robust zero-size buffers? */
2305       const uint64_t limit = range > 0 ? addr + range - 1 : 0;
2306       P_MTHD(p, NV9097, SET_INDEX_BUFFER_C);
2307       P_NV9097_SET_INDEX_BUFFER_C(p, limit >> 32);
2308       P_NV9097_SET_INDEX_BUFFER_D(p, limit);
2309    }
2310 
2311    P_IMMD(p, NV9097, SET_INDEX_BUFFER_E, vk_to_nv_index_format(indexType));
2312 }
2313 
2314 void
2315 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
2316                            struct nvk_addr_range addr_range)
2317 {
2318    /* Used for meta save/restore */
2319    if (vb_idx == 0)
2320       cmd->state.gfx.vb0 = addr_range;
2321 
2322    struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2323 
2324    P_MTHD(p, NV9097, SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
2325    P_NV9097_SET_VERTEX_STREAM_A_LOCATION_A(p, vb_idx, addr_range.addr >> 32);
2326    P_NV9097_SET_VERTEX_STREAM_A_LOCATION_B(p, vb_idx, addr_range.addr);
2327 
2328    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2329       P_MTHD(p, NVC597, SET_VERTEX_STREAM_SIZE_A(vb_idx));
2330       P_NVC597_SET_VERTEX_STREAM_SIZE_A(p, vb_idx, addr_range.range >> 32);
2331       P_NVC597_SET_VERTEX_STREAM_SIZE_B(p, vb_idx, addr_range.range);
2332    } else {
2333       /* TODO: What about robust zero-size buffers? */
2334       const uint64_t limit = addr_range.range > 0 ?
2335          addr_range.addr + addr_range.range - 1 : 0;
2336       P_MTHD(p, NV9097, SET_VERTEX_STREAM_LIMIT_A_A(vb_idx));
2337       P_NV9097_SET_VERTEX_STREAM_LIMIT_A_A(p, vb_idx, limit >> 32);
2338       P_NV9097_SET_VERTEX_STREAM_LIMIT_A_B(p, vb_idx, limit);
2339    }
2340 }
2341 
2342 VKAPI_ATTR void VKAPI_CALL
2343 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
2344                           uint32_t firstBinding,
2345                           uint32_t bindingCount,
2346                           const VkBuffer *pBuffers,
2347                           const VkDeviceSize *pOffsets,
2348                           const VkDeviceSize *pSizes,
2349                           const VkDeviceSize *pStrides)
2350 {
2351    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2352 
2353    if (pStrides) {
2354       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
2355                                         bindingCount, pStrides);
2356    }
2357 
2358    for (uint32_t i = 0; i < bindingCount; i++) {
2359       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
2360       uint32_t idx = firstBinding + i;
2361 
2362       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
2363       const struct nvk_addr_range addr_range =
2364          nvk_buffer_addr_range(buffer, pOffsets[i], size);
2365 
2366       nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
2367    }
2368 }
2369 
2370 static uint32_t
2371 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
2372 {
2373    switch (prim) {
2374    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
2375       return NV9097_BEGIN_OP_POINTS;
2376    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
2377       return NV9097_BEGIN_OP_LINES;
2378    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
2379       return NV9097_BEGIN_OP_LINE_STRIP;
2380    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
2381 #pragma GCC diagnostic push
2382 #pragma GCC diagnostic ignored "-Wswitch"
2383    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
2384 #pragma GCC diagnostic pop
2385       return NV9097_BEGIN_OP_TRIANGLES;
2386    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
2387       return NV9097_BEGIN_OP_TRIANGLE_STRIP;
2388    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
2389       return NV9097_BEGIN_OP_TRIANGLE_FAN;
2390    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
2391       return NV9097_BEGIN_OP_LINELIST_ADJCY;
2392    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
2393       return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
2394    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
2395       return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
2396    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
2397       return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
2398    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
2399       return NV9097_BEGIN_OP_PATCH;
2400    default:
2401       unreachable("Invalid primitive topology");
2402    }
2403 }
2404 
2405 struct mme_draw_params {
2406    struct mme_value base_vertex;
2407    struct mme_value first_vertex;
2408    struct mme_value first_instance;
2409    struct mme_value draw_idx;
2410 };
2411 
2412 static void
2413 nvk_mme_build_set_draw_params(struct mme_builder *b,
2414                               const struct mme_draw_params *p)
2415 {
2416    const uint32_t draw_params_offset = nvk_root_descriptor_offset(draw);
2417    mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2418    mme_emit(b, mme_imm(draw_params_offset));
2419    mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2420    mme_emit(b, p->first_vertex);
2421    mme_emit(b, p->first_instance);
2422    mme_emit(b, p->draw_idx);
2423    mme_emit(b, mme_zero() /* view_index */);
2424 
2425    mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
2426    mme_emit(b, p->base_vertex);
2427    mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
2428    mme_emit(b, p->base_vertex);
2429 
2430    mme_mthd(b, NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX);
2431    mme_emit(b, p->first_instance);
2432 }
2433 
2434 static void
2435 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
2436 {
2437    /* Set the push constant */
2438    mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2439    mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.view_index)));
2440    mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2441    mme_emit(b, view_index);
2442 
2443    /* Set the layer to the view index */
2444    STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
2445    STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
2446    mme_mthd(b, NV9097_SET_RT_LAYER);
2447    mme_emit(b, view_index);
2448 }
2449 
2450 static void
2451 nvk_mme_build_draw_loop(struct mme_builder *b,
2452                         struct mme_value instance_count,
2453                         struct mme_value first_vertex,
2454                         struct mme_value vertex_count)
2455 {
2456    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
2457 
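   /* Instancing is implemented by replaying BEGIN/END once per instance.
    * The first iteration uses the BEGIN value handed in by the CPU
    * (INSTANCE_ID FIRST); every later iteration flips the field to
    * SUBSEQUENT so the hardware advances the instance ID itself.
    */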
2458    mme_loop(b, instance_count) {
2459       mme_mthd(b, NV9097_BEGIN);
2460       mme_emit(b, begin);
2461 
2462       mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
2463       mme_emit(b, first_vertex);
2464       mme_emit(b, vertex_count);
2465 
2466       mme_mthd(b, NV9097_END);
2467       mme_emit(b, mme_zero());
2468 
2469       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
2470    }
2471 
2472    mme_free_reg(b, begin);
2473 }
2474 
2475 static void
2476 nvk_mme_build_draw(struct mme_builder *b,
2477                    struct mme_value draw_idx)
2478 {
2479    /* These are in VkDrawIndirectCommand order */
2480    struct mme_value vertex_count = mme_load(b);
2481    struct mme_value instance_count = mme_load(b);
2482    struct mme_value first_vertex = mme_load(b);
2483    struct mme_value first_instance = mme_load(b);
2484 
2485    struct mme_draw_params params = {
2486       .first_vertex = first_vertex,
2487       .first_instance = first_instance,
2488       .draw_idx = draw_idx,
2489    };
2490    nvk_mme_build_set_draw_params(b, &params);
2491 
2492    mme_free_reg(b, first_instance);
2493 
2494    if (b->devinfo->cls_eng3d < TURING_A)
2495       nvk_mme_spill(b, DRAW_IDX, draw_idx);
2496 
2497    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2498    mme_if(b, ieq, view_mask, mme_zero()) {
2499       mme_free_reg(b, view_mask);
2500 
2501       nvk_mme_build_draw_loop(b, instance_count,
2502                               first_vertex, vertex_count);
2503    }
2504 
2505    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2506    mme_if(b, ine, view_mask, mme_zero()) {
2507       mme_free_reg(b, view_mask);
2508 
2509       struct mme_value view = mme_mov(b, mme_zero());
2510       mme_while(b, ine, view, mme_imm(32)) {
2511          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2512          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
2513          mme_free_reg(b, view_mask);
2514          mme_if(b, ine, has_view, mme_zero()) {
2515             mme_free_reg(b, has_view);
2516             nvk_mme_emit_view_index(b, view);
2517             nvk_mme_build_draw_loop(b, instance_count,
2518                                     first_vertex, vertex_count);
2519          }
2520 
2521          mme_add_to(b, view, view, mme_imm(1));
2522       }
2523       mme_free_reg(b, view);
2524    }
2525 
2526    mme_free_reg(b, instance_count);
2527    mme_free_reg(b, first_vertex);
2528    mme_free_reg(b, vertex_count);
2529 
2530    if (b->devinfo->cls_eng3d < TURING_A)
2531       nvk_mme_unspill(b, DRAW_IDX, draw_idx);
2532 }
2533 
2534 void
2535 nvk_mme_draw(struct mme_builder *b)
2536 {
2537    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2538    struct mme_value draw_idx = mme_load(b);
2539 
2540    nvk_mme_build_draw(b, draw_idx);
2541 }
2542 
2543 VKAPI_ATTR void VKAPI_CALL
2544 nvk_CmdDraw(VkCommandBuffer commandBuffer,
2545             uint32_t vertexCount,
2546             uint32_t instanceCount,
2547             uint32_t firstVertex,
2548             uint32_t firstInstance)
2549 {
2550    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2551    const struct vk_dynamic_graphics_state *dyn =
2552       &cmd->vk.dynamic_graphics_state;
2553 
2554    nvk_flush_gfx_state(cmd);
2555 
2556    uint32_t begin;
2557    V_NV9097_BEGIN(begin, {
2558       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2559       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2560       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2561       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2562    });
2563 
2564    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
2565    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
2566    P_INLINE_DATA(p, begin);
2567    P_INLINE_DATA(p, 0 /* draw_idx */);
2568    P_INLINE_DATA(p, vertexCount);
2569    P_INLINE_DATA(p, instanceCount);
2570    P_INLINE_DATA(p, firstVertex);
2571    P_INLINE_DATA(p, firstInstance);
2572 }
2573 
2574 VKAPI_ATTR void VKAPI_CALL
2575 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
2576                     uint32_t drawCount,
2577                     const VkMultiDrawInfoEXT *pVertexInfo,
2578                     uint32_t instanceCount,
2579                     uint32_t firstInstance,
2580                     uint32_t stride)
2581 {
2582    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2583    const struct vk_dynamic_graphics_state *dyn =
2584       &cmd->vk.dynamic_graphics_state;
2585 
2586    nvk_flush_gfx_state(cmd);
2587 
2588    uint32_t begin;
2589    V_NV9097_BEGIN(begin, {
2590       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2591       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2592       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2593       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2594    });
2595 
2596    for (uint32_t draw_idx = 0; draw_idx < drawCount; draw_idx++) {
2597       struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
2598       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
2599       P_INLINE_DATA(p, begin);
2600       P_INLINE_DATA(p, draw_idx);
2601       P_INLINE_DATA(p, pVertexInfo->vertexCount);
2602       P_INLINE_DATA(p, instanceCount);
2603       P_INLINE_DATA(p, pVertexInfo->firstVertex);
2604       P_INLINE_DATA(p, firstInstance);
2605 
2606       pVertexInfo = ((void *)pVertexInfo) + stride;
2607    }
2608 }
2609 
2610 static void
2611 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
2612                                 struct mme_value instance_count,
2613                                 struct mme_value first_index,
2614                                 struct mme_value index_count)
2615 {
2616    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
2617 
2618    mme_loop(b, instance_count) {
2619       mme_mthd(b, NV9097_BEGIN);
2620       mme_emit(b, begin);
2621 
2622       mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
2623       mme_emit(b, first_index);
2624       mme_emit(b, index_count);
2625 
2626       mme_mthd(b, NV9097_END);
2627       mme_emit(b, mme_zero());
2628 
2629       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
2630    }
2631 
2632    mme_free_reg(b, begin);
2633 }
2634 
2635 static void
2636 nvk_mme_build_draw_indexed(struct mme_builder *b,
2637                            struct mme_value draw_idx)
2638 {
2639    /* These are in VkDrawIndexedIndirectCommand order */
2640    struct mme_value index_count = mme_load(b);
2641    struct mme_value instance_count = mme_load(b);
2642    struct mme_value first_index = mme_load(b);
2643    struct mme_value vertex_offset = mme_load(b);
2644    struct mme_value first_instance = mme_load(b);
2645 
2646    struct mme_draw_params params = {
2647       .base_vertex = vertex_offset,
2648       .first_vertex = vertex_offset,
2649       .first_instance = first_instance,
2650       .draw_idx = draw_idx,
2651    };
2652    nvk_mme_build_set_draw_params(b, &params);
2653 
2654    mme_free_reg(b, vertex_offset);
2655    mme_free_reg(b, first_instance);
2656 
2657    if (b->devinfo->cls_eng3d < TURING_A)
2658       nvk_mme_spill(b, DRAW_IDX, draw_idx);
2659 
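   /* Multiview: with no view mask we emit a single draw loop.  Otherwise
    * walk all 32 possible view bits and, for each bit that is set, program
    * the view index and replay the whole draw loop for that view.
    */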
2660    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2661    mme_if(b, ieq, view_mask, mme_zero()) {
2662       mme_free_reg(b, view_mask);
2663 
2664       nvk_mme_build_draw_indexed_loop(b, instance_count,
2665                                       first_index, index_count);
2666    }
2667 
2668    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2669    mme_if(b, ine, view_mask, mme_zero()) {
2670       mme_free_reg(b, view_mask);
2671 
2672       struct mme_value view = mme_mov(b, mme_zero());
2673       mme_while(b, ine, view, mme_imm(32)) {
2674          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
2675          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
2676          mme_free_reg(b, view_mask);
2677          mme_if(b, ine, has_view, mme_zero()) {
2678             mme_free_reg(b, has_view);
2679             nvk_mme_emit_view_index(b, view);
2680             nvk_mme_build_draw_indexed_loop(b, instance_count,
2681                                             first_index, index_count);
2682          }
2683 
2684          mme_add_to(b, view, view, mme_imm(1));
2685       }
2686       mme_free_reg(b, view);
2687    }
2688 
2689    mme_free_reg(b, instance_count);
2690    mme_free_reg(b, first_index);
2691    mme_free_reg(b, index_count);
2692 
2693    if (b->devinfo->cls_eng3d < TURING_A)
2694       nvk_mme_unspill(b, DRAW_IDX, draw_idx);
2695 }
2696 
2697 void
2698 nvk_mme_draw_indexed(struct mme_builder *b)
2699 {
2700    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2701    struct mme_value draw_idx = mme_load(b);
2702 
2703    nvk_mme_build_draw_indexed(b, draw_idx);
2704 }
2705 
2706 VKAPI_ATTR void VKAPI_CALL
2707 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
2708                    uint32_t indexCount,
2709                    uint32_t instanceCount,
2710                    uint32_t firstIndex,
2711                    int32_t vertexOffset,
2712                    uint32_t firstInstance)
2713 {
2714    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2715    const struct vk_dynamic_graphics_state *dyn =
2716       &cmd->vk.dynamic_graphics_state;
2717 
2718    nvk_flush_gfx_state(cmd);
2719 
2720    uint32_t begin;
2721    V_NV9097_BEGIN(begin, {
2722       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2723       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2724       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2725       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2726    });
2727 
2728    struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
2729    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
2730    P_INLINE_DATA(p, begin);
2731    P_INLINE_DATA(p, 0 /* draw_idx */);
2732    P_INLINE_DATA(p, indexCount);
2733    P_INLINE_DATA(p, instanceCount);
2734    P_INLINE_DATA(p, firstIndex);
2735    P_INLINE_DATA(p, vertexOffset);
2736    P_INLINE_DATA(p, firstInstance);
2737 }
2738 
2739 VKAPI_ATTR void VKAPI_CALL
2740 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
2741                            uint32_t drawCount,
2742                            const VkMultiDrawIndexedInfoEXT *pIndexInfo,
2743                            uint32_t instanceCount,
2744                            uint32_t firstInstance,
2745                            uint32_t stride,
2746                            const int32_t *pVertexOffset)
2747 {
2748    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2749    const struct vk_dynamic_graphics_state *dyn =
2750       &cmd->vk.dynamic_graphics_state;
2751 
2752    nvk_flush_gfx_state(cmd);
2753 
2754    uint32_t begin;
2755    V_NV9097_BEGIN(begin, {
2756       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2757       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2758       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2759       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2760    });
2761 
2762    for (uint32_t draw_idx = 0; draw_idx < drawCount; draw_idx++) {
2763       const uint32_t vertex_offset =
2764          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
2765 
2766       struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
2767       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
2768       P_INLINE_DATA(p, begin);
2769       P_INLINE_DATA(p, draw_idx);
2770       P_INLINE_DATA(p, pIndexInfo->indexCount);
2771       P_INLINE_DATA(p, instanceCount);
2772       P_INLINE_DATA(p, pIndexInfo->firstIndex);
2773       P_INLINE_DATA(p, vertex_offset);
2774       P_INLINE_DATA(p, firstInstance);
2775 
2776       pIndexInfo = ((void *)pIndexInfo) + stride;
2777    }
2778 }
2779 
2780 void
2781 nvk_mme_draw_indirect(struct mme_builder *b)
2782 {
2783    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2784 
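   /* On Turing and later the MME can read the indirect buffer itself:
    * mme_tu104_read_fifoed() pulls dwords from memory into the macro's
    * input FIFO where the mme_load() calls in nvk_mme_build_draw() pick
    * them up.  Pre-Turing, the CPU streams the indirect commands through
    * the pushbuf after the macro call (see nvk_CmdDrawIndirect) and
    * DRAW_PAD_DW tells us how many padding dwords to discard between
    * consecutive draws.
    */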
2785    if (b->devinfo->cls_eng3d >= TURING_A) {
2786       struct mme_value64 draw_addr = mme_load_addr64(b);
2787       struct mme_value draw_count = mme_load(b);
2788       struct mme_value stride = mme_load(b);
2789 
2790       struct mme_value draw = mme_mov(b, mme_zero());
2791       mme_while(b, ult, draw, draw_count) {
2792          mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
2793 
2794          nvk_mme_build_draw(b, draw);
2795 
2796          mme_add_to(b, draw, draw, mme_imm(1));
2797          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
2798       }
2799    } else {
2800       struct mme_value draw_count = mme_load(b);
2801       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
2802 
2803       struct mme_value draw = mme_mov(b, mme_zero());
2804       mme_while(b, ine, draw, draw_count) {
2805          nvk_mme_spill(b, DRAW_COUNT, draw_count);
2806 
2807          nvk_mme_build_draw(b, draw);
2808          mme_add_to(b, draw, draw, mme_imm(1));
2809 
2810          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
2811          mme_loop(b, pad_dw) {
2812             mme_free_reg(b, mme_load(b));
2813          }
2814          mme_free_reg(b, pad_dw);
2815 
2816          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
2817       }
2818    }
2819 }
2820 
2821 VKAPI_ATTR void VKAPI_CALL
2822 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
2823                     VkBuffer _buffer,
2824                     VkDeviceSize offset,
2825                     uint32_t drawCount,
2826                     uint32_t stride)
2827 {
2828    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2829    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2830    const struct vk_dynamic_graphics_state *dyn =
2831       &cmd->vk.dynamic_graphics_state;
2832 
2833    /* From the Vulkan 1.3.238 spec:
2834     *
2835     *    VUID-vkCmdDrawIndirect-drawCount-00476
2836     *
2837     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
2838     *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
2839     *
2840     * and
2841     *
2842     *    "If drawCount is less than or equal to one, stride is ignored."
2843     */
2844    if (drawCount > 1) {
2845       assert(stride % 4 == 0);
2846       assert(stride >= sizeof(VkDrawIndirectCommand));
2847    } else {
2848       stride = sizeof(VkDrawIndirectCommand);
2849    }
2850 
2851    nvk_flush_gfx_state(cmd);
2852 
2853    uint32_t begin;
2854    V_NV9097_BEGIN(begin, {
2855       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2856       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2857       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2858       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2859    });
2860 
2861    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2862       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2863       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
2864       P_INLINE_DATA(p, begin);
2865       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2866       P_INLINE_DATA(p, draw_addr >> 32);
2867       P_INLINE_DATA(p, draw_addr);
2868       P_INLINE_DATA(p, drawCount);
2869       P_INLINE_DATA(p, stride);
2870    } else {
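      /* Pre-Turing, the MME cannot read memory, so the indirect draw
       * structs are streamed through the pushbuf right behind the macro
       * parameters.  One method header covers at most NV_PUSH_MAX_COUNT
       * dwords; the 3 subtracted dwords presumably account for begin,
       * count, and pad, and the rest (times 4 for bytes) is divided by
       * the per-draw stride.
       */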
2871       const uint32_t max_draws_per_push =
2872          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
2873 
2874       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2875       while (drawCount) {
2876          const uint32_t count = MIN2(drawCount, max_draws_per_push);
2877 
2878          struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2879          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
2880          P_INLINE_DATA(p, begin);
2881          P_INLINE_DATA(p, count);
2882          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
2883 
2884          uint64_t range = count * (uint64_t)stride;
2885          nv_push_update_count(p, range / 4);
2886          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
2887 
2888          draw_addr += range;
2889          drawCount -= count;
2890       }
2891    }
2892 }
2893 
2894 void
2895 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
2896 {
2897    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
2898 
2899    if (b->devinfo->cls_eng3d >= TURING_A) {
2900       struct mme_value64 draw_addr = mme_load_addr64(b);
2901       struct mme_value draw_count = mme_load(b);
2902       struct mme_value stride = mme_load(b);
2903 
2904       struct mme_value draw = mme_mov(b, mme_zero());
2905       mme_while(b, ult, draw, draw_count) {
2906          mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
2907 
2908          nvk_mme_build_draw_indexed(b, draw);
2909 
2910          mme_add_to(b, draw, draw, mme_imm(1));
2911          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
2912       }
2913    } else {
2914       struct mme_value draw_count = mme_load(b);
2915       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
2916 
2917       struct mme_value draw = mme_mov(b, mme_zero());
2918       mme_while(b, ine, draw, draw_count) {
2919          nvk_mme_spill(b, DRAW_COUNT, draw_count);
2920 
2921          nvk_mme_build_draw_indexed(b, draw);
2922          mme_add_to(b, draw, draw, mme_imm(1));
2923 
2924          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
2925          mme_loop(b, pad_dw) {
2926             mme_free_reg(b, mme_load(b));
2927          }
2928          mme_free_reg(b, pad_dw);
2929 
2930          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
2931       }
2932    }
2933 }
2934 
2935 VKAPI_ATTR void VKAPI_CALL
2936 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
2937                            VkBuffer _buffer,
2938                            VkDeviceSize offset,
2939                            uint32_t drawCount,
2940                            uint32_t stride)
2941 {
2942    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
2943    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
2944    const struct vk_dynamic_graphics_state *dyn =
2945       &cmd->vk.dynamic_graphics_state;
2946 
2947    /* From the Vulkan 1.3.238 spec:
2948     *
2949     *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
2950     *
2951     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
2952     *    must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
2953     *
2954     * and
2955     *
2956     *    "If drawCount is less than or equal to one, stride is ignored."
2957     */
2958    if (drawCount > 1) {
2959       assert(stride % 4 == 0);
2960       assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
2961    } else {
2962       stride = sizeof(VkDrawIndexedIndirectCommand);
2963    }
2964 
2965    nvk_flush_gfx_state(cmd);
2966 
2967    uint32_t begin;
2968    V_NV9097_BEGIN(begin, {
2969       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
2970       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
2971       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
2972       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
2973    });
2974 
2975    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
2976       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
2977       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
2978       P_INLINE_DATA(p, begin);
2979       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2980       P_INLINE_DATA(p, draw_addr >> 32);
2981       P_INLINE_DATA(p, draw_addr);
2982       P_INLINE_DATA(p, drawCount);
2983       P_INLINE_DATA(p, stride);
2984    } else {
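      /* Same pushbuf-streaming scheme as the pre-Turing path of
       * nvk_CmdDrawIndirect above, just with VkDrawIndexedIndirectCommand.
       */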
2985       const uint32_t max_draws_per_push =
2986          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
2987 
2988       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
2989       while (drawCount) {
2990          const uint32_t count = MIN2(drawCount, max_draws_per_push);
2991 
2992          struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
2993          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
2994          P_INLINE_DATA(p, begin);
2995          P_INLINE_DATA(p, count);
2996          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
2997 
2998          uint64_t range = count * (uint64_t)stride;
2999          nv_push_update_count(p, range / 4);
3000          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
3001 
3002          draw_addr += range;
3003          drawCount -= count;
3004       }
3005    }
3006 }
3007 
3008 void
3009 nvk_mme_draw_indirect_count(struct mme_builder *b)
3010 {
3011    if (b->devinfo->cls_eng3d < TURING_A)
3012       return;
3013 
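   /* Read the real draw count from the count buffer, clamp maxDrawCount
    * down to it, then loop over the indirect buffer exactly like
    * nvk_mme_draw_indirect.  This relies on the MME being able to read
    * memory, hence the Turing+ check above (nvk_CmdDrawIndirectCount
    * asserts the same).
    */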
3014    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
3015 
3016    struct mme_value64 draw_addr = mme_load_addr64(b);
3017    struct mme_value64 draw_count_addr = mme_load_addr64(b);
3018    struct mme_value draw_max = mme_load(b);
3019    struct mme_value stride = mme_load(b);
3020 
3021    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
3022    mme_free_reg64(b, draw_count_addr);
3023    struct mme_value draw_count_buf = mme_load(b);
3024 
3025    mme_if(b, ule, draw_count_buf, draw_max) {
3026       mme_mov_to(b, draw_max, draw_count_buf);
3027    }
3028    mme_free_reg(b, draw_count_buf);
3029 
3030    struct mme_value draw = mme_mov(b, mme_zero());
3031    mme_while(b, ult, draw, draw_max) {
3032       mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
3033 
3034       nvk_mme_build_draw(b, draw);
3035 
3036       mme_add_to(b, draw, draw, mme_imm(1));
3037       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3038    }
3039 }
3040 
3041 VKAPI_ATTR void VKAPI_CALL
3042 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
3043                          VkBuffer _buffer,
3044                          VkDeviceSize offset,
3045                          VkBuffer countBuffer,
3046                          VkDeviceSize countBufferOffset,
3047                          uint32_t maxDrawCount,
3048                          uint32_t stride)
3049 {
3050    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3051    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3052    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
3053 
3054    const struct vk_dynamic_graphics_state *dyn =
3055       &cmd->vk.dynamic_graphics_state;
3056 
3057    /* TODO: Indirect count draw pre-Turing */
3058    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
3059 
3060    nvk_flush_gfx_state(cmd);
3061 
3062    uint32_t begin;
3063    V_NV9097_BEGIN(begin, {
3064       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
3065       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
3066       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
3067       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
3068    });
3069 
3070    struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
3071    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
3072    P_INLINE_DATA(p, begin);
3073    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3074    P_INLINE_DATA(p, draw_addr >> 32);
3075    P_INLINE_DATA(p, draw_addr);
3076    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
3077                                                  countBufferOffset);
3078    P_INLINE_DATA(p, draw_count_addr >> 32);
3079    P_INLINE_DATA(p, draw_count_addr);
3080    P_INLINE_DATA(p, maxDrawCount);
3081    P_INLINE_DATA(p, stride);
3082 }
3083 
3084 void
3085 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
3086 {
3087    if (b->devinfo->cls_eng3d < TURING_A)
3088       return;
3089 
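   /* Indexed twin of nvk_mme_draw_indirect_count above: clamp the draw
    * count read from memory, then replay nvk_mme_build_draw_indexed for
    * each 5-dword draw record.
    */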
3090    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
3091 
3092    struct mme_value64 draw_addr = mme_load_addr64(b);
3093    struct mme_value64 draw_count_addr = mme_load_addr64(b);
3094    struct mme_value draw_max = mme_load(b);
3095    struct mme_value stride = mme_load(b);
3096 
3097    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
3098    mme_free_reg64(b, draw_count_addr);
3099    struct mme_value draw_count_buf = mme_load(b);
3100 
3101    mme_if(b, ule, draw_count_buf, draw_max) {
3102       mme_mov_to(b, draw_max, draw_count_buf);
3103    }
3104    mme_free_reg(b, draw_count_buf);
3105 
3106    struct mme_value draw = mme_mov(b, mme_zero());
3107    mme_while(b, ult, draw, draw_max) {
3108       mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
3109 
3110       nvk_mme_build_draw_indexed(b, draw);
3111 
3112       mme_add_to(b, draw, draw, mme_imm(1));
3113       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
3114    }
3115 }
3116 
3117 VKAPI_ATTR void VKAPI_CALL
3118 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
3119                                 VkBuffer _buffer,
3120                                 VkDeviceSize offset,
3121                                 VkBuffer countBuffer,
3122                                 VkDeviceSize countBufferOffset,
3123                                 uint32_t maxDrawCount,
3124                                 uint32_t stride)
3125 {
3126    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3127    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3128    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
3129 
3130    const struct vk_dynamic_graphics_state *dyn =
3131       &cmd->vk.dynamic_graphics_state;
3132 
3133    /* TODO: Indexed indirect count draw pre-Turing */
3134    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
3135 
3136    nvk_flush_gfx_state(cmd);
3137 
3138    uint32_t begin;
3139    V_NV9097_BEGIN(begin, {
3140       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
3141       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
3142       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
3143       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
3144    });
3145 
3146    struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
3147    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
3148    P_INLINE_DATA(p, begin);
3149    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
3150    P_INLINE_DATA(p, draw_addr >> 32);
3151    P_INLINE_DATA(p, draw_addr);
3152    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
3153                                                  countBufferOffset);
3154    P_INLINE_DATA(p, draw_count_addr >> 32);
3155    P_INLINE_DATA(p, draw_count_addr);
3156    P_INLINE_DATA(p, maxDrawCount);
3157    P_INLINE_DATA(p, stride);
3158 }
3159 
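/* Draw-auto loop for transform-feedback draws.  NV9097_DRAW_AUTO takes the
 * byte count produced by the streaming-output unit and, together with
 * SET_DRAW_AUTO_START/STRIDE programmed by the caller, appears to let the
 * hardware derive the vertex count itself.  Instancing works like the
 * indexed loop above: replay the draw and flip INSTANCE_ID to SUBSEQUENT.
 */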
3160 static void
3161 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
3162                                struct mme_value instance_count,
3163                                struct mme_value counter)
3164 {
3165    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3166 
3167    mme_loop(b, instance_count) {
3168       mme_mthd(b, NV9097_BEGIN);
3169       mme_emit(b, begin);
3170 
3171       mme_mthd(b, NV9097_DRAW_AUTO);
3172       mme_emit(b, counter);
3173 
3174       mme_mthd(b, NV9097_END);
3175       mme_emit(b, mme_zero());
3176 
3177       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3178    }
3179 
3180    mme_free_reg(b, begin);
3181 }
3182 
3183 void
3184 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
3185 {
3186    nvk_mme_load_to_scratch(b, DRAW_BEGIN);
3187 
3188    struct mme_value instance_count = mme_load(b);
3189    struct mme_value first_instance = mme_load(b);
3190 
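   /* On Turing+ the transform-feedback counter is read straight from
    * memory; pre-Turing it arrives as an extra inline dword pushed by
    * nvk_CmdDrawIndirectByteCountEXT.  The multiview handling below
    * mirrors nvk_mme_build_draw_indexed.
    */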
3191    if (b->devinfo->cls_eng3d >= TURING_A) {
3192       struct mme_value64 counter_addr = mme_load_addr64(b);
3193       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
3194       mme_free_reg(b, counter_addr.lo);
3195       mme_free_reg(b, counter_addr.hi);
3196    }
3197    struct mme_value counter = mme_load(b);
3198 
3199    struct mme_draw_params params = {
3200       .first_instance = first_instance,
3201    };
3202    nvk_mme_build_set_draw_params(b, &params);
3203 
3204    mme_free_reg(b, first_instance);
3205 
3206    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3207    mme_if(b, ieq, view_mask, mme_zero()) {
3208       mme_free_reg(b, view_mask);
3209 
3210       nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
3211    }
3212 
3213    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3214    mme_if(b, ine, view_mask, mme_zero()) {
3215       mme_free_reg(b, view_mask);
3216 
3217       struct mme_value view = mme_mov(b, mme_zero());
3218       mme_while(b, ine, view, mme_imm(32)) {
3219          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3220          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3221          mme_free_reg(b, view_mask);
3222          mme_if(b, ine, has_view, mme_zero()) {
3223             mme_free_reg(b, has_view);
3224             nvk_mme_emit_view_index(b, view);
3225             nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
3226          }
3227 
3228          mme_add_to(b, view, view, mme_imm(1));
3229       }
3230    }
3231 
3232    mme_free_reg(b, instance_count);
3233    mme_free_reg(b, counter);
3234 }
3235 
3236 VKAPI_ATTR void VKAPI_CALL
3237 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3238                                 uint32_t instanceCount,
3239                                 uint32_t firstInstance,
3240                                 VkBuffer counterBuffer,
3241                                 VkDeviceSize counterBufferOffset,
3242                                 uint32_t counterOffset,
3243                                 uint32_t vertexStride)
3244 {
3245    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3246    VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
3247    const struct vk_dynamic_graphics_state *dyn =
3248       &cmd->vk.dynamic_graphics_state;
3249 
3250    nvk_flush_gfx_state(cmd);
3251 
3252    uint32_t begin;
3253    V_NV9097_BEGIN(begin, {
3254       .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
3255       .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
3256       .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
3257       .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
3258    });
3259 
3260    uint64_t counter_addr = nvk_buffer_address(counter_buffer,
3261                                               counterBufferOffset);
3262 
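   /* Turing+ lets the macro read the 32-bit counter from memory itself;
    * pre-Turing we point the pushbuf at the counter buffer for one dword
    * so the value shows up as the macro's last inline parameter when the
    * pushbuf executes.
    */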
3263    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3264       struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);
3265       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
3266       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
3267 
3268       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
3269       P_INLINE_DATA(p, begin);
3270       P_INLINE_DATA(p, instanceCount);
3271       P_INLINE_DATA(p, firstInstance);
3272       P_INLINE_DATA(p, counter_addr >> 32);
3273       P_INLINE_DATA(p, counter_addr);
3274    } else {
3275       struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
3276       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
3277       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
3278 
3279       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
3280       P_INLINE_DATA(p, begin);
3281       P_INLINE_DATA(p, instanceCount);
3282       P_INLINE_DATA(p, firstInstance);
3283       nv_push_update_count(p, 1);
3284       nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
3285    }
3286 }
3287 
3288 VKAPI_ATTR void VKAPI_CALL
3289 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3290                                        uint32_t firstBinding,
3291                                        uint32_t bindingCount,
3292                                        const VkBuffer *pBuffers,
3293                                        const VkDeviceSize *pOffsets,
3294                                        const VkDeviceSize *pSizes)
3295 {
3296    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3297 
3298    for (uint32_t i = 0; i < bindingCount; i++) {
3299       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3300       uint32_t idx = firstBinding + i;
3301       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3302       struct nvk_addr_range addr_range =
3303          nvk_buffer_addr_range(buffer, pOffsets[i], size);
3304       assert(addr_range.range <= UINT32_MAX);
3305 
3306       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3307 
3308       P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
3309       P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
3310       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
3311       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
3312       P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
3313    }
3314 
3315    // TODO: do we need to set SET_STREAM_OUT_BUFFER_ENABLE to V_FALSE here?
3316 }
3317 
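/* Loads a transform-feedback counter into a stream-out buffer's
 * LOAD_WRITE_POINTER register.  The first inline parameter selects the
 * register; on Turing+ the counter itself is then read from memory via the
 * MME FIFO, while pre-Turing it arrives as one extra dword pushed
 * indirectly from the counter buffer (see nvk_CmdBeginTransformFeedbackEXT).
 */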
3318 void
3319 nvk_mme_xfb_counter_load(struct mme_builder *b)
3320 {
3321    struct mme_value buffer = mme_load(b);
3322 
3323    struct mme_value counter;
3324    if (b->devinfo->cls_eng3d >= TURING_A) {
3325       struct mme_value64 counter_addr = mme_load_addr64(b);
3326 
3327       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
3328       mme_free_reg(b, counter_addr.lo);
3329       mme_free_reg(b, counter_addr.hi);
3330 
3331       counter = mme_load(b);
3332    } else {
3333       counter = mme_load(b);
3334    }
3335 
3336    mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
3337    mme_emit(b, counter);
3338 
3339    mme_free_reg(b, counter);
3340    mme_free_reg(b, buffer);
3341 }
3342 
3343 VKAPI_ATTR void VKAPI_CALL
3344 nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3345                                  uint32_t firstCounterBuffer,
3346                                  uint32_t counterBufferCount,
3347                                  const VkBuffer *pCounterBuffers,
3348                                  const VkDeviceSize *pCounterBufferOffsets)
3349 {
3350    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3351    const uint32_t max_buffers = 4;
3352 
3353    struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);
3354 
3355    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
3356    for (uint32_t i = 0; i < max_buffers; ++i) {
3357       P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
3358    }
3359 
3360    for (uint32_t i = 0; i < counterBufferCount; ++i) {
3361       if (pCounterBuffers[i] == VK_NULL_HANDLE)
3362          continue;
3363 
3364       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
3365       // index of counter buffer corresponds to index of transform feedback buffer
3366       uint32_t cb_idx = firstCounterBuffer + i;
3367       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3368       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
3369 
3370       if (nvk_cmd_buffer_device(cmd)->pdev->info.cls_eng3d >= TURING_A) {
3371          struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3372          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
3373          /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an 8-dword stride */
3374          P_INLINE_DATA(p, cb_idx * 8);
3375          P_INLINE_DATA(p, cb_addr >> 32);
3376          P_INLINE_DATA(p, cb_addr);
3377       } else {
3378          struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3379          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
3380          P_INLINE_DATA(p, cb_idx);
3381          nv_push_update_count(p, 1);
3382          nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
3383       }
3384    }
3385 }
3386 
3387 VKAPI_ATTR void VKAPI_CALL
3388 nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3389                                uint32_t firstCounterBuffer,
3390                                uint32_t counterBufferCount,
3391                                const VkBuffer *pCounterBuffers,
3392                                const VkDeviceSize *pCounterBufferOffsets)
3393 {
3394    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3395 
3396    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);
3397 
3398    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
3399 
3400    for (uint32_t i = 0; i < counterBufferCount; ++i) {
3401       if (pCounterBuffers[i] == VK_NULL_HANDLE)
3402          continue;
3403 
3404       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
3405       // index of counter buffer corresponds to index of transform feedback buffer
3406       uint32_t cb_idx = firstCounterBuffer + i;
3407       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3408       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
3409 
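      /* Write the streamed byte count for this stream back into the
       * counter buffer via a report semaphore so a later
       * vkCmdBeginTransformFeedbackEXT can resume from it.
       */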
3410       P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
3411       P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
3412       P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
3413       P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
3414       P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
3415          .operation = OPERATION_REPORT_ONLY,
3416          .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
3417          .report = REPORT_STREAMING_BYTE_COUNT,
3418          .sub_report = cb_idx,
3419          .structure_size = STRUCTURE_SIZE_ONE_WORD,
3420       });
3421    }
3422 }
3423 
3424 VKAPI_ATTR void VKAPI_CALL
3425 nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
3426                                     const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
3427 {
3428    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3429    VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);
3430 
3431    uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
3432    bool inverted = pConditionalRenderingBegin->flags &
3433       VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
3434 
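   /* SET_RENDER_ENABLE_B below drops the low address bits (note the
    * 0xfffffff0 mask), so if the condition value is not sufficiently
    * aligned, or the buffer lives in VRAM (is_local), first copy the
    * 32-bit value to a scratch allocation with the copy engine and point
    * the render-enable condition at that copy instead.
    */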
3435    if (addr & 0x3f || buffer->is_local) {
3436       uint64_t tmp_addr;
3437       VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
3438       if (result != VK_SUCCESS) {
3439          vk_command_buffer_set_error(&cmd->vk, result);
3440          return;
3441       }
3442 
3443       struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
3444       P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
3445       P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
3446       P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
3447       P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
3448       P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
3449       P_NV90B5_PITCH_IN(p, 4);
3450       P_NV90B5_PITCH_OUT(p, 4);
3451       P_NV90B5_LINE_LENGTH_IN(p, 4);
3452       P_NV90B5_LINE_COUNT(p, 1);
3453 
3454       P_IMMD(p, NV90B5, LAUNCH_DMA, {
3455             .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
3456             .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
3457             .flush_enable = FLUSH_ENABLE_TRUE,
3458             .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
3459             .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
3460          });
3461       addr = tmp_addr;
3462    }
3463 
3464    struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
3465    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
3466    P_NV9097_SET_RENDER_ENABLE_A(p, addr >> 32);
3467    P_NV9097_SET_RENDER_ENABLE_B(p, addr & 0xfffffff0);
3468    P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
3469 
3470    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
3471    P_NV90C0_SET_RENDER_ENABLE_A(p, addr >> 32);
3472    P_NV90C0_SET_RENDER_ENABLE_B(p, addr & 0xfffffff0);
3473    P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
3474 }
3475 
3476 VKAPI_ATTR void VKAPI_CALL
3477 nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
3478 {
3479    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3480 
3481    struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
3482    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
3483    P_NV9097_SET_RENDER_ENABLE_A(p, 0);
3484    P_NV9097_SET_RENDER_ENABLE_B(p, 0);
3485    P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);
3486 
3487    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
3488    P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
3489    P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
3490    P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
3491 }
3492