/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_entrypoints.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_format.h"
#include "nvk_image.h"
#include "nvk_image_view.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "util/bitpack_helpers.h"
#include "vk_format.h"
#include "vk_render_pass.h"
#include "vk_standard_sample_locations.h"

#include "nv_push_cl902d.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl90b5.h"
#include "nv_push_cl90c0.h"
#include "nv_push_cla097.h"
#include "nv_push_clb097.h"
#include "nv_push_clb197.h"
#include "nv_push_clc397.h"
#include "nv_push_clc597.h"
#include "drf.h"

static inline uint16_t
nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   return pdev->info.cls_eng3d;
}

static void
mme_set_priv_reg(struct mme_builder *b,
                 struct mme_value value,
                 struct mme_value mask,
                 struct mme_value reg)
{
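   /* A rough sketch of the handshake, as best we understand it: scratch
    * FALCON_0 is cleared and the value and mask are placed in the scratch
    * registers that follow it.  Writing the privileged register address to
    * SET_FALCON04 then kicks the firmware, and we spin reading FALCON_0
    * until the firmware writes a 1 back to signal completion.  The firmware
    * side of this contract is not publicly documented.
    */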
   mme_mthd(b, NV9097_WAIT_FOR_IDLE);
   mme_emit(b, mme_zero());

   mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
   mme_emit(b, mme_zero());
   mme_emit(b, value);
   mme_emit(b, mask);

   mme_mthd(b, NV9097_SET_FALCON04);
   mme_emit(b, reg);

   struct mme_value loop_cond = mme_mov(b, mme_zero());
   mme_while(b, ine, loop_cond, mme_imm(1)) {
      mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
      mme_mthd(b, NV9097_NO_OPERATION);
      mme_emit(b, mme_zero());
   };
}

void
nvk_mme_set_priv_reg(struct mme_builder *b)
{
   struct mme_value value = mme_load(b);
   struct mme_value mask = mme_load(b);
   struct mme_value reg = mme_load(b);

   mme_set_priv_reg(b, value, mask, reg);
}

void
nvk_mme_set_conservative_raster_state(struct mme_builder *b)
{
   struct mme_value new_state = mme_load(b);
   struct mme_value old_state =
      nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);

   mme_if(b, ine, new_state, old_state) {
      nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
      mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
                       mme_imm(0x418800));
   }
}

#define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)

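/* Points the constant buffer selector at the draw CB0 whose GPU address was
 * stashed in the CB0_ADDR_HI/LO scratch registers by
 * nvk_push_draw_state_init().  Subsequent LOAD_CONSTANT_BUFFER* methods then
 * write into CB0.
 */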
void
nvk_mme_select_cb0(struct mme_builder *b)
{
   struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
   struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);

   mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
   mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
   mme_emit(b, addr_hi);
   mme_emit(b, addr_lo);
}

static uint32_t nvk_mme_anti_alias_init(void);

VkResult
nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   /* 3D state */
   P_MTHD(p, NV9097, SET_OBJECT);
   P_NV9097_SET_OBJECT(p, {
      .class_id = pdev->info.cls_eng3d,
      .engine_id = 0,
   });

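   /* Upload all of our MME macros into macro instruction RAM.  The macros
    * are packed back-to-back, with mme_pos tracking the running offset in
    * dwords, and each macro's start offset is recorded in start-address RAM
    * so it can later be invoked with CALL_MME_MACRO(mme).
    */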
   for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
      size_t size;
      uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
      if (dw == NULL)
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

      assert(size % sizeof(uint32_t) == 0);
      const uint32_t num_dw = size / sizeof(uint32_t);

      P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
      P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
      P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);

      P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
      P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
      P_INLINE_ARRAY(p, dw, num_dw);

      mme_pos += num_dw;

      free(dw);
   }

   if (pdev->info.cls_eng3d >= TURING_A)
      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Enable FP helper invocation memory loads
    *
    * For generations with firmware support for our `SET_PRIV_REG` MME
    * method, we simply use that.  On older generations we let the kernel do
    * it.  Starting with GSP, we have to do it via the firmware anyway.
    *
    * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl.
    *
    * Without it,
    * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
    * occasionally fail.
    */
   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, BITFIELD_BIT(3));
      P_INLINE_DATA(p, reg);
   }

   /* Disable Out Of Range Address exceptions
    *
    * From the SPH documentation:
    *
    *    "The SPH fields StoreReqStart and StoreReqEnd set a range of
    *    attributes whose corresponding Odmap values of ST or ST_LAST are
    *    treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
    *    and Odmap value is ST, when the shader writes data to this output, it
    *    can not count on being able to read it back, since the next
    *    downstream shader might have its Imap bit FALSE, thereby causing the
    *    Bmap bit to be FALSE. By including a ST type of attribute in the
    *    range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
    *    is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
    *    to be TRUE. This guarantees the shader program can output the value
    *    and then read it back later. This will save register space."
    *
    * It's unclear exactly what's going on but this seems to imply that the
    * hardware actually ANDs the output mask of one shader stage together with
    * the input mask of the subsequent shader stage to determine which values
    * are actually used.
    *
    * In the case where we have an empty fragment shader, it seems the
    * hardware doesn't allocate any output memory for the final geometry
    * stage at all, so any writes to outputs from the final shader stage
    * generate an Out Of Range Address exception.  We could fix this by
    * eliminating unused outputs via cross-stage linking but that won't work
    * in the case of VK_EXT_shader_object and
    * VK_EXT_graphics_pipeline_library fast-link.  Instead, the easiest
    * solution is to just disable the exception.
    *
    * NOTE (Faith):
    *
    *    The above analysis is 100% conjecture on my part based on a creative
    *    reading of the SPH docs and what I saw when trying to run certain
    *    OpenGL CTS tests on NVK + Zink.  Without access to NVIDIA HW
    *    engineers, I have no way of verifying this analysis.
    *
    *    The CTS test in question is:
    *
    *    KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
    *
    * This should also prevent any issues with array overruns on I/O arrays.
    * Before, they would get an exception and kill the context whereas now
    * they should be gently ignored.
    *
    * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask.
    */
   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, BITFIELD_BIT(14));
      P_INLINE_DATA(p, reg);
   }

   /* Set CONSERVATIVE_RASTER_STATE to an invalid value to ensure the
    * hardware reg is always set the first time conservative rasterization
    * is enabled.
    */
   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
                     ~0);

   /* Initialize tessellation parameters */
   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
   P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});

   P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);

   P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
   P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
   for (unsigned i = 0; i < 8; i++)
      P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });

//   P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
//   P_INLINE_DATA(cmd->push, 0);

   P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);

   P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);

   P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);

   P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);

   P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
                     DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);

   P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
   P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
      .all_covered_all_hit_once = 0xff,
   });
   P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
      .all_covered_all_hit_once = 0x3f,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
      .all_covered_all_hit_once = 0xff,
   });
   P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
      .all_covered_all_hit_once = 0xff,
   });

   if (pdev->info.cls_eng3d < VOLTA_A)
      P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);

   P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
      .current = 3,
      .oldest_supported = 3,
   });
   P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
      .current = 2,
      .oldest_supported = 2,
   });

   if (pdev->info.cls_eng3d < MAXWELL_A)
      P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);

   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
                     POLICY_EVICT_NORMAL);
   P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
                     POLICY_EVICT_NORMAL);

   P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);

   P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
      .color_front_diffuse    = COLOR_FRONT_DIFFUSE_VECTOR_0001,
      .color_front_specular   = COLOR_FRONT_SPECULAR_VECTOR_0001,
      .generic_vector         = GENERIC_VECTOR_VECTOR_0001,
      .fixed_fnc_texture      = FIXED_FNC_TEXTURE_VECTOR_0001,
      .dx9_color0             = DX9_COLOR0_VECTOR_0001,
      .dx9_color1_to_color15  = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
   });

   P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);

   P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
                     CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);

   P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
      .enable                       = ENABLE_TRUE,
      .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
   });

   if (pdev->info.cls_eng3d < VOLTA_A)
      P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);

   P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
   P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
   P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);

   if (pdev->info.cls_eng3d < MAXWELL_A)
      P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);

   if (pdev->info.cls_eng3d >= KEPLER_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
                        ORDERING_KEPLER_ORDER);
   }

   P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
   P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_PS_SATURATE, {
      .output0 = OUTPUT0_FALSE,
      .output1 = OUTPUT1_FALSE,
      .output2 = OUTPUT2_FALSE,
      .output3 = OUTPUT3_FALSE,
      .output4 = OUTPUT4_FALSE,
      .output5 = OUTPUT5_FALSE,
      .output6 = OUTPUT6_FALSE,
      .output7 = OUTPUT7_FALSE,
   });

   P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
   P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });

   /* From the Vulkan spec's description of point rasterization:
    *
    *    "Point rasterization produces a fragment for each fragment area
    *    group of framebuffer pixels with one or more sample points that
    *    intersect a region centered at the point’s (xf,yf).  This region is
    *    a square with side equal to the current point size. ... (xf,yf) is
    *    the exact, unrounded framebuffer coordinate of the vertex for the
    *    point"
    *
    * So it seems we always need square points with PointCoords like OpenGL
    * point sprites.
    *
    * From the OpenGL compatibility spec, basic point rasterization:
    *
    *    "If point sprites are enabled, then point rasterization produces a
    *    fragment for each framebuffer pixel whose center lies inside a
    *    square centered at the point’s (xw, yw), with side length equal to
    *    the current point size. ... and xw and yw are the exact, unrounded
    *    window coordinates of the vertex for the point"
    *
    * And point multisample rasterization:
    *
    *    "This region is a circle having diameter equal to the current point
    *    width if POINT_SPRITE is disabled, or a square with side equal to
    *    the current point width if POINT_SPRITE is enabled."
    */
   P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
   P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
      .rmode      = RMODE_ZERO,
      .origin     = ORIGIN_TOP,
      .texture0   = TEXTURE0_PASSTHROUGH,
      .texture1   = TEXTURE1_PASSTHROUGH,
      .texture2   = TEXTURE2_PASSTHROUGH,
      .texture3   = TEXTURE3_PASSTHROUGH,
      .texture4   = TEXTURE4_PASSTHROUGH,
      .texture5   = TEXTURE5_PASSTHROUGH,
      .texture6   = TEXTURE6_PASSTHROUGH,
      .texture7   = TEXTURE7_PASSTHROUGH,
      .texture8   = TEXTURE8_PASSTHROUGH,
      .texture9   = TEXTURE9_PASSTHROUGH,
   });

   /* OpenGL's GL_POINT_SMOOTH */
   P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);

   if (pdev->info.cls_eng3d >= MAXWELL_B)
      P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);

   P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);

   P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);

   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SHADING_RATE_CONTROL), 0);
   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
          nvk_mme_anti_alias_init());

   /* Enable multisample rasterization even for single-sample rasterization;
    * this way we get strict lines and rectangular line support.  For more
    * info, see the DirectX rasterization rules.
    */
   P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);

   if (pdev->info.cls_eng3d >= MAXWELL_B) {
      P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
      P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
                        BY_VIEWPORT_INDEX_FALSE);
   }

   /* TODO: Vertex runout */

   P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
      .mode    = MODE_UPPER_LEFT,
      .flip_y  = FLIP_Y_FALSE,
   });

   P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
   P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
   P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);

   P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
   P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
   P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);

//   P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
//      .respect_stencil_mask   = RESPECT_STENCIL_MASK_FALSE,
//      .use_clear_rect         = USE_CLEAR_RECT_FALSE,
//   });

   P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);

   P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
      .min_z_zero_max_z_one      = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
      .pixel_min_z               = PIXEL_MIN_Z_CLAMP,
      .pixel_max_z               = PIXEL_MAX_Z_CLAMP,
      .geometry_guardband        = GEOMETRY_GUARDBAND_SCALE_256,
      .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
      .geometry_clip             = GEOMETRY_CLIP_WZERO_CLIP,
      .geometry_guardband_z      = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
   });

   for (unsigned i = 0; i < 16; i++)
      P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);

   P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);

   if (pdev->info.cls_eng3d >= TURING_A) {
      /* I don't know what these values actually mean.  I just copied them
       * from the way the blob sets up the hardware.
       */
      P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(0));
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 0, 0xa23eb139);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 1, 0xfb72ea61);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 2, 0xd950c843);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 3, 0x88fac4e5);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 4, 0x1ab3e1b6);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 5, 0xa98fedc2);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 6, 0x2107654b);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 7, 0xe0539773);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 8, 0x698badcf);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 9, 0x71032547);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 10, 0xdef05397);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 11, 0x56789abc);
      P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 12, 0x1234);
   }

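   /* Prior to Volta, shader entrypoints are programmed as 32-bit offsets
    * relative to a base program region, so point that region at the
    * contiguous base address of the shader heap.
    */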
   if (pdev->info.cls_eng3d < VOLTA_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
      P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

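   /* Unbind every constant buffer slot in every bind group.  The five bind
    * groups appear to correspond to the five graphics shader stages.
    */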
   for (uint32_t group = 0; group < 5; group++) {
      for (uint32_t slot = 0; slot < 16; slot++) {
         P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
            .valid = VALID_FALSE,
            .shader_slot = slot,
         });
      }
   }

//   P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
//   P_INLINE_DATA(cmd->push, 0x40);
   P_IMMD(p, NV9097, SET_RT_LAYER, {
      .v = 0,
      .control = CONTROL_V_SELECTS_LAYER,
   });
//   P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT);
//   P_INLINE_DATA(cmd->push, 0x30);

   P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
   P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
   P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);

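   /* Point the vertex stream substitute at the zero page.  This appears to
    * be the address vertex fetches fall back to when a stream is disabled,
    * so stale or out-of-bounds fetches read zeros instead of faulting.
    */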
   uint64_t zero_addr = dev->zero_page->va->addr;
   P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
   P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
   P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);

   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
   for (uint32_t b = 0; b < 32; b++) {
      P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
         .enable = false,
      });
   }

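   /* Fermi through Kepler apparently require a driver-provided vertex
    * attribute buffer (VAB) scratch area; later generations manage this
    * internally.
    */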
   if (pdev->info.cls_eng3d >= FERMI_A &&
       pdev->info.cls_eng3d < MAXWELL_A) {
      assert(dev->vab_memory);
      uint64_t vab_addr = dev->vab_memory->va->addr;
      P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
      P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
      P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
      P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
   }

   if (pdev->info.cls_eng3d == MAXWELL_A)
      P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   /* Store the address of CB0 in a pair of state registers */
   uint64_t cb0_addr = queue->draw_cb0->va->addr;
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);

   /* Store the address of the zero page in a pair of state registers */
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);

   /* We leave CB0 selected by default */
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
   P_INLINE_DATA(p, 0);

   /* Bind CB0 to all shader groups */
   for (uint32_t group = 0; group < 5; group++) {
      P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
         .valid = VALID_TRUE,
         .shader_slot = 0,
      });
   }

   /* Zero out CB0 */
   P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
   P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
   for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
      P_INLINE_DATA(p, 0);

   /* These are shadowed in cb0 so they need to be zeroed as well for
    * consistency.
    */
   P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
   P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
   P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);

   return VK_SUCCESS;
}

static void
nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
{
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;

   /* These depend on color attachment count */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);

   /* These depend on the depth/stencil format */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);

   /* This may depend on render targets for ESO */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);

   /* This may depend on render targets */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);

   /* Might be required for depthClampZeroOne */
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE);
   BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE);
}

static void
nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
                            struct nvk_descriptor_state *desc,
                            size_t offset, size_t size)
{
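   /* Copy the dirty dword-aligned range of the root descriptor table into
    * CB0.  This relies on CB0 still being bound to the constant buffer
    * selector (see nvk_mme_select_cb0).
    */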
   const uint32_t start_dw = offset / 4;
   const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
   const uint32_t len_dw = end_dw - start_dw;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
   P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
   P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);

   const uint32_t *root_dw = (uint32_t *)desc->root;
   P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
}
624 
625 void
nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer * cmd,const VkCommandBufferBeginInfo * pBeginInfo)626 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
627                               const VkCommandBufferBeginInfo *pBeginInfo)
628 {
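   /* Sampler, texture header, and constant caches may hold stale data from
    * previous submissions, so invalidate them at the start of every primary.
    * Secondaries presumably inherit this from the primary that executes
    * them.
    */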
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
      P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
         .lines = LINES_ALL,
      });
      P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
         .lines = LINES_ALL,
      });

      P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
         .constant = CONSTANT_TRUE,
      });
   }

   cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;

   if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
      char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
      const VkRenderingInfo *resume_info =
         vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
                                                               pBeginInfo,
                                                               gcbiar_data);
      if (resume_info) {
         nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
      } else {
         const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
            vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
                                                             pBeginInfo);
         assert(inheritance_info);

         struct nvk_rendering_state *render = &cmd->state.gfx.render;
         render->flags = inheritance_info->flags;
         render->area = (VkRect2D) { };
         render->layer_count = 0;
         render->view_mask = inheritance_info->viewMask;
         render->samples = inheritance_info->rasterizationSamples;

         render->color_att_count = inheritance_info->colorAttachmentCount;
         for (uint32_t i = 0; i < render->color_att_count; i++) {
            render->color_att[i].vk_format =
               inheritance_info->pColorAttachmentFormats[i];
         }
         render->depth_att.vk_format =
            inheritance_info->depthAttachmentFormat;
         render->stencil_att.vk_format =
            inheritance_info->stencilAttachmentFormat;

         const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
            .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
            .colorAttachmentCount = inheritance_info->colorAttachmentCount,
         };
         const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
            vk_get_command_buffer_rendering_attachment_location_info(
               cmd->vk.level, pBeginInfo);
         if (att_loc_info == NULL)
            att_loc_info = &att_loc_info_default;

         vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);

         nvk_cmd_buffer_dirty_render_pass(cmd);
      }
   }

   cmd->state.gfx.shaders_dirty = ~0;
}

void
nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
{
   vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct nvk_rendering_state render_save = cmd->state.gfx.render;
   memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
   cmd->state.gfx.render = render_save;

   /* We need to keep the flush_root callback */
   cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;

   cmd->state.gfx.shaders_dirty = ~0;
}

static void
nvk_attachment_init(struct nvk_attachment *att,
                    const VkRenderingAttachmentInfo *info)
{
   if (info == NULL || info->imageView == VK_NULL_HANDLE) {
      *att = (struct nvk_attachment) { .iview = NULL, };
      return;
   }

   VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
   *att = (struct nvk_attachment) {
      .vk_format = iview->vk.format,
      .iview = iview,
   };

   if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
      VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
      att->resolve_mode = info->resolveMode;
      att->resolve_iview = res_iview;
   }

   att->store_op = info->storeOp;
}

static uint32_t
nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
{
#define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
   uint16_t nil_to_nv9097[] = {
      MODE(1X1),
      MODE(2X1),
      MODE(2X1_D3D),
      MODE(2X2),
      MODE(4X2),
      MODE(4X2_D3D),
      MODE(4X4),
   };
#undef MODE
   assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
   assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1 ||
          nil_to_nv9097[sample_layout] != 0);

   return nil_to_nv9097[sample_layout];
}

static uint32_t nvk_mme_anti_alias_samples(uint32_t samples);

static void
nvk_cmd_set_sample_layout(struct nvk_cmd_buffer *cmd,
                          enum nil_sample_layout sample_layout)
{
   const uint32_t samples = nil_sample_layout_samples(sample_layout);
   struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);

   P_IMMD(p, NV9097, SET_ANTI_ALIAS,
          nil_to_nv9097_samples_mode(sample_layout));

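   /* The scratch values below look like per-pass sample coverage masks,
    * presumably consumed by the NVK_MME_SET_ANTI_ALIAS macro when rendering
    * is split into 2-pass or 4-pass sample groups.  We haven't found these
    * encodings documented anywhere.
    */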
   switch (sample_layout) {
   case NIL_SAMPLE_LAYOUT_1X1:
   case NIL_SAMPLE_LAYOUT_2X1:
   case NIL_SAMPLE_LAYOUT_2X1_D3D:
      /* These only have two modes: Single-pass or per-sample */
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      break;

   case NIL_SAMPLE_LAYOUT_2X2:
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
      P_INLINE_DATA(p, 0x000a0005);
      P_INLINE_DATA(p, 0x000a0005);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      P_INLINE_DATA(p, 0);
      break;

   case NIL_SAMPLE_LAYOUT_4X2:
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
      P_INLINE_DATA(p, 0x000f000f);
      P_INLINE_DATA(p, 0x000f000f);
      P_INLINE_DATA(p, 0x00f000f0);
      P_INLINE_DATA(p, 0x00f000f0);
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
      P_INLINE_DATA(p, 0x00030003);
      P_INLINE_DATA(p, 0x000c000c);
      P_INLINE_DATA(p, 0x00300030);
      P_INLINE_DATA(p, 0x00c000c0);
      break;

   case NIL_SAMPLE_LAYOUT_4X2_D3D:
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
      P_INLINE_DATA(p, 0x003a00c5);
      P_INLINE_DATA(p, 0x003a00c5);
      P_INLINE_DATA(p, 0x003a003a);
      P_INLINE_DATA(p, 0x00c500c5);
      P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
      P_INLINE_DATA(p, 0x00120081);
      P_INLINE_DATA(p, 0x00280044);
      P_INLINE_DATA(p, 0x00280012);
      P_INLINE_DATA(p, 0x00810044);
      break;

   default:
      unreachable("Unknown sample layout");
   }

   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
   P_INLINE_DATA(p, nvk_mme_anti_alias_samples(samples));
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetRenderingAreaGranularityKHR(
    VkDevice device,
    const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
    VkExtent2D *pGranularity)
{
   *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
}

static bool
nvk_rendering_all_linear(const struct nvk_rendering_state *render)
{
   /* Depth and stencil are never linear */
   if (render->depth_att.iview || render->stencil_att.iview)
      return false;

   for (uint32_t i = 0; i < render->color_att_count; i++) {
      const struct nvk_image_view *iview = render->color_att[i].iview;
      if (iview == NULL)
         continue;

      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      const uint8_t ip = iview->planes[0].image_plane;
      const struct nil_image_level *level =
         &image->planes[ip].nil.levels[iview->vk.base_mip_level];

      if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR)
         return false;
   }

   return true;
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
                      const VkRenderingInfo *pRenderingInfo)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   struct nvk_rendering_state *render = &cmd->state.gfx.render;

   memset(render, 0, sizeof(*render));

   render->flags = pRenderingInfo->flags;
   render->area = pRenderingInfo->renderArea;
   render->view_mask = pRenderingInfo->viewMask;
   render->layer_count = pRenderingInfo->layerCount;
   render->samples = 0;

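   /* Per the Vulkan spec, a non-zero viewMask takes precedence over
    * layerCount, so derive the effective layer count from the highest set
    * view index.
    */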
   const uint32_t layer_count =
      render->view_mask ? util_last_bit(render->view_mask) :
                          render->layer_count;

   render->color_att_count = pRenderingInfo->colorAttachmentCount;
   for (uint32_t i = 0; i < render->color_att_count; i++) {
      nvk_attachment_init(&render->color_att[i],
                          &pRenderingInfo->pColorAttachments[i]);
   }

   nvk_attachment_init(&render->depth_att,
                       pRenderingInfo->pDepthAttachment);
   nvk_attachment_init(&render->stencil_att,
                       pRenderingInfo->pStencilAttachment);

   const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att_info =
      vk_find_struct_const(pRenderingInfo->pNext,
                           RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
   if (fsr_att_info != NULL && fsr_att_info->imageView != VK_NULL_HANDLE) {
      VK_FROM_HANDLE(nvk_image_view, iview, fsr_att_info->imageView);
      render->fsr_att = (struct nvk_attachment) {
         .vk_format = iview->vk.format,
         .iview = iview,
         .store_op = VK_ATTACHMENT_STORE_OP_NONE,
      };
   }

   render->all_linear = nvk_rendering_all_linear(render);

   const VkRenderingAttachmentLocationInfoKHR ral_info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
      .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
   };
   vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);

   nvk_cmd_buffer_dirty_render_pass(cmd);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 34);

   P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
          render->view_mask);

   P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
   P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
      .x       = render->area.offset.x,
      .width   = render->area.extent.width,
   });
   P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
      .y       = render->area.offset.y,
      .height  = render->area.extent.height,
   });

   enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;

   /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
    * of the number of targets in the render pass.  This ensures that we have
    * no leftover pointers from previous render passes in the hardware.  This
    * also allows us to point at any render target with SET_CT_SELECT and know
    * that it's either a valid render target or NULL.
    */
   for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
      if (render->color_att[i].iview) {
         const struct nvk_image_view *iview = render->color_att[i].iview;
         const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
         /* Rendering to multi-planar images is valid for a specific single
          * plane only, so assert that what we have is a single-plane view,
          * obtain its plane index, and begin rendering.
          */
         assert(iview->plane_count == 1);
         const uint8_t ip = iview->planes[0].image_plane;
         const struct nvk_image_plane *plane = &image->planes[ip];

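         /* When the render pass mixes linear and tiled attachments, we
          * can't bind the linear image directly, so render to its tiled
          * shadow copy instead.  nvk_linear_render_copy() shuffles data
          * between the linear image and the shadow at load/store time (see
          * the LOAD_OP_LOAD copy below and nvk_CmdEndRendering()).
          */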
         if (!render->all_linear &&
             plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR)
            plane = &image->linear_tiled_shadow;

         const struct nil_image *nil_image = &plane->nil;
         const struct nil_image_level *level =
            &nil_image->levels[iview->vk.base_mip_level];
         struct nil_Extent4D_Samples level_extent_sa =
            nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);

         assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
                sample_layout == nil_image->sample_layout);
         sample_layout = nil_image->sample_layout;
         render->samples = image->vk.samples;

         uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;

         if (nil_image->dim == NIL_IMAGE_DIM_3D) {
            addr += nil_image_level_z_offset_B(nil_image,
                                               iview->vk.base_mip_level,
                                               iview->vk.base_array_layer);
            assert(layer_count <= iview->vk.extent.depth);
         } else {
            addr += iview->vk.base_array_layer *
                    (uint64_t)nil_image->array_stride_B;
            assert(layer_count <= iview->vk.layer_count);
         }

         P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
         P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
         P_NV9097_SET_COLOR_TARGET_B(p, i, addr);

         if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR) {
            const enum pipe_format p_format =
               nvk_format_to_pipe_format(iview->vk.format);

            /* We use the stride for depth/stencil targets because the Z/S
             * hardware has no concept of a tile width.  Instead, we just set
             * the width to the stride divided by bpp.
             */
            const uint32_t row_stride_el =
               level->row_stride_B / util_format_get_blocksize(p_format);
            P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
            P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
            const uint8_t ct_format = nil_format_to_color_target(p_format);
            P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);

            P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
               .block_width   = BLOCK_WIDTH_ONE_GOB,
               .block_height  = level->tiling.y_log2,
               .block_depth   = level->tiling.z_log2,
               .layout        = LAYOUT_BLOCKLINEAR,
               .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
            });

            P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
            P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
               nil_image->array_stride_B >> 2);
            P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
         } else {
            /* NVIDIA can only render to 2D linear images */
            assert(nil_image->dim == NIL_IMAGE_DIM_2D);
            /* NVIDIA can only render to non-multisampled images */
            assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
            /* NVIDIA doesn't support linear array images */
            assert(iview->vk.base_array_layer == 0 && layer_count == 1);

            uint32_t pitch = level->row_stride_B;
            const enum pipe_format p_format =
               nvk_format_to_pipe_format(iview->vk.format);
            /* When memory layout is set to LAYOUT_PITCH, the WIDTH field
             * takes the row pitch.
             */
            P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
            P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);

            const uint8_t ct_format = nil_format_to_color_target(p_format);
            P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);

            P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
               .layout = LAYOUT_PITCH,
               .third_dimension_control =
                  THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
            });

            P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
            P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
            P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
         }

         P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
      } else {
         P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
         P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
         P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
         P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
            .layout        = LAYOUT_BLOCKLINEAR,
         });
         P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
         P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
         P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);

         P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
      }
   }

   if (render->depth_att.iview || render->stencil_att.iview) {
      struct nvk_image_view *iview = render->depth_att.iview ?
                                     render->depth_att.iview :
                                     render->stencil_att.iview;
      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      /* Depth/stencil are always single-plane */
      assert(iview->plane_count == 1);
      const uint8_t ip = iview->planes[0].image_plane;
      struct nil_image nil_image = image->planes[ip].nil;

      uint64_t addr = nvk_image_base_address(image, ip);
      uint32_t mip_level = iview->vk.base_mip_level;
      uint32_t base_array_layer = iview->vk.base_array_layer;

      if (nil_image.dim == NIL_IMAGE_DIM_3D) {
         uint64_t level_offset_B;
         nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
                                                    &level_offset_B);
         addr += level_offset_B;
         mip_level = 0;
         base_array_layer = 0;
         assert(layer_count <= iview->vk.extent.depth);
      } else {
         assert(layer_count <= iview->vk.layer_count);
      }

      const struct nil_image_level *level = &nil_image.levels[mip_level];
      addr += level->offset_B;

      assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
             sample_layout == nil_image.sample_layout);
      sample_layout = nil_image.sample_layout;
      render->samples = image->vk.samples;

      P_MTHD(p, NV9097, SET_ZT_A);
      P_NV9097_SET_ZT_A(p, addr >> 32);
      P_NV9097_SET_ZT_B(p, addr);
      const enum pipe_format p_format =
         nvk_format_to_pipe_format(iview->vk.format);
      const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
      P_NV9097_SET_ZT_FORMAT(p, zs_format);
      assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
      assert(level->tiling.z_log2 == 0);
      P_NV9097_SET_ZT_BLOCK_SIZE(p, {
         .width = WIDTH_ONE_GOB,
         .height = level->tiling.y_log2,
         .depth = DEPTH_ONE_GOB,
      });
      P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);

      P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);

      struct nil_Extent4D_Samples level_extent_sa =
         nil_image_level_extent_sa(&nil_image, mip_level);

      /* We use the stride for depth/stencil targets because the Z/S hardware
       * has no concept of a tile width.  Instead, we just set the width to
       * the stride divided by bpp.
       */
      const uint32_t row_stride_el =
         level->row_stride_B / util_format_get_blocksize(p_format);

      P_MTHD(p, NV9097, SET_ZT_SIZE_A);
      P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
      P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
      P_NV9097_SET_ZT_SIZE_C(p, {
         .third_dimension  = base_array_layer + layer_count,
         .control          = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
      });

      P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);

      P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);

      if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
         P_IMMD(p, NVC597, SET_ZT_SPARSE, {
            .enable = ENABLE_FALSE,
         });
      }
   } else {
      P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
   }

   if (render->fsr_att.iview) {
      const struct nvk_image_view *iview = render->fsr_att.iview;
      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;

      /* Fragment shading rate images are always single-plane */
      assert(iview->plane_count == 1);
      const uint8_t ip = iview->planes[0].image_plane;
      const struct nil_image *nil_image = &image->planes[ip].nil;

      /* Fragment shading rate images are always 2D */
      assert(nil_image->dim == NIL_IMAGE_DIM_2D);
      assert(nil_image->sample_layout == NIL_SAMPLE_LAYOUT_1X1);

      uint64_t addr = nvk_image_base_address(image, ip);
      uint32_t mip_level = iview->vk.base_mip_level;
      struct nil_Extent4D_Samples level_extent_sa =
         nil_image_level_extent_sa(nil_image, mip_level);

      const struct nil_image_level *level = &nil_image->levels[mip_level];
      addr += level->offset_B;

      P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, addr >> 32);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, addr);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, {
         .width = level_extent_sa.width,
         .height = level_extent_sa.height,
      });
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0,
         iview->vk.layer_count + iview->vk.base_array_layer);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0,
         iview->vk.base_array_layer);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0,
         nil_image->array_stride_B >> 2);
      assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
      assert(level->tiling.z_log2 == 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, {
         .width = WIDTH_ONE_GOB,
         .height = level->tiling.y_log2,
         .depth = DEPTH_ONE_GOB,
      });

      const enum pipe_format p_format =
         nvk_format_to_pipe_format(iview->vk.format);
      const uint32_t row_stride_el =
         level->row_stride_B / util_format_get_blocksize(p_format);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0,
         row_stride_el);
   } else {
      P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, { });
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0, 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0, 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0, 0);
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, { });
      P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0, 0);
   }

   /* From the Vulkan 1.3.275 spec:
    *
    *    "It is legal for a subpass to use no color or depth/stencil
    *    attachments, either because it has no attachment references or
    *    because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
    *    can use shader side effects such as image stores and atomics to
    *    produce an output. In this case, the subpass continues to use the
    *    width, height, and layers of the framebuffer to define the dimensions
    *    of the rendering area, and the rasterizationSamples from each
    *    pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
    *    of samples used in rasterization;"
    *
    * In the case where we have attachments, we emit SET_ANTI_ALIAS here
    * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
    * specifying the sample layout and we want to ensure it matches.  When
    * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
    * where we base it on dynamic rasterizationSamples.
    */
   if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID)
      nvk_cmd_set_sample_layout(cmd, sample_layout);

   if (render->flags & VK_RENDERING_RESUMING_BIT)
      return;

   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const struct nvk_image_view *iview = render->color_att[i].iview;
      if (iview == NULL)
         continue;

      const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
      assert(iview->plane_count == 1);
      const uint8_t ip = iview->planes[0].image_plane;
      const struct nvk_image_plane *plane = &image->planes[ip];

      const VkAttachmentLoadOp load_op =
         pRenderingInfo->pColorAttachments[i].loadOp;
      if (!render->all_linear &&
          plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
          load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
         nvk_linear_render_copy(cmd, iview, render->area, true);
   }

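   /* Gather every LOAD_OP_CLEAR attachment into a single list so all the
    * load-op clears can be flushed with one CmdClearAttachments call below.
    */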
   uint32_t clear_count = 0;
   VkClearAttachment clear_att[NVK_MAX_RTS + 1];
   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const VkRenderingAttachmentInfo *att_info =
         &pRenderingInfo->pColorAttachments[i];
      if (att_info->imageView == VK_NULL_HANDLE ||
          att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      clear_att[clear_count++] = (VkClearAttachment) {
         .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
         .colorAttachment = i,
         .clearValue = att_info->clearValue,
      };
   }

   clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
   if (pRenderingInfo->pDepthAttachment != NULL &&
       pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
       pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
      clear_att[clear_count].clearValue.depthStencil.depth =
         pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
   }
   if (pRenderingInfo->pStencilAttachment != NULL &&
       pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
       pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
      clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
      clear_att[clear_count].clearValue.depthStencil.stencil =
         pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
   }
   if (clear_att[clear_count].aspectMask != 0)
      clear_count++;

   if (clear_count > 0) {
      const VkClearRect clear_rect = {
         .rect = render->area,
         .baseArrayLayer = 0,
         .layerCount = render->view_mask ? 1 : render->layer_count,
      };

1299       p = nvk_cmd_buffer_push(cmd, 2);
1300       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1301       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
1302 
1303       nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
1304                               clear_count, clear_att, 1, &clear_rect);
1305       p = nvk_cmd_buffer_push(cmd, 2);
1306       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1307       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
1308    }
1309 
1310    /* TODO: Attachment clears */
1311 }
1312 
1313 VKAPI_ATTR void VKAPI_CALL
1314 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
1315 {
1316    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
1317    struct nvk_rendering_state *render = &cmd->state.gfx.render;
1318 
1319    if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
1320       for (uint32_t i = 0; i < render->color_att_count; i++) {
1321          struct nvk_image_view *iview = render->color_att[i].iview;
1322          if (iview == NULL)
1323             continue;
1324 
1325          struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1326          const uint8_t ip = iview->planes[0].image_plane;
1327          const struct nvk_image_plane *plane = &image->planes[ip];
1328          if (!render->all_linear &&
1329              plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1330              render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
1331             nvk_linear_render_copy(cmd, iview, render->area, false);
1332       }
1333    }
1334 
1335    bool need_resolve = false;
1336 
1337    /* Translate render state back to VK for meta */
1338    VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
1339    for (uint32_t i = 0; i < render->color_att_count; i++) {
1340       if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
1341          need_resolve = true;
1342 
1343       vk_color_att[i] = (VkRenderingAttachmentInfo) {
1344          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1345          .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
1346          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1347          .resolveMode = render->color_att[i].resolve_mode,
1348          .resolveImageView =
1349             nvk_image_view_to_handle(render->color_att[i].resolve_iview),
1350          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1351       };
1352    }
1353 
1354    const VkRenderingAttachmentInfo vk_depth_att = {
1355       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1356       .imageView = nvk_image_view_to_handle(render->depth_att.iview),
1357       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1358       .resolveMode = render->depth_att.resolve_mode,
1359       .resolveImageView =
1360          nvk_image_view_to_handle(render->depth_att.resolve_iview),
1361       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1362    };
1363    if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1364       need_resolve = true;
1365 
1366    const VkRenderingAttachmentInfo vk_stencil_att = {
1367       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1368       .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
1369       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1370       .resolveMode = render->stencil_att.resolve_mode,
1371       .resolveImageView =
1372          nvk_image_view_to_handle(render->stencil_att.resolve_iview),
1373       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1374    };
1375    if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1376       need_resolve = true;
1377 
1378    const VkRenderingInfo vk_render = {
1379       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1380       .renderArea = render->area,
1381       .layerCount = render->layer_count,
1382       .viewMask = render->view_mask,
1383       .colorAttachmentCount = render->color_att_count,
1384       .pColorAttachments = vk_color_att,
1385       .pDepthAttachment = &vk_depth_att,
1386       .pStencilAttachment = &vk_stencil_att,
1387    };
1388 
1389    if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1390       need_resolve = false;
1391 
1392    memset(render, 0, sizeof(*render));
1393 
1394    if (need_resolve) {
1395       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1396       P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
1397          .lines = LINES_ALL,
1398       });
1399 
1400       nvk_meta_resolve_rendering(cmd, &vk_render);
1401    }
1402 }
1403 
1404 void
1405 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
1406                              const gl_shader_stage stage,
1407                              struct nvk_shader *shader)
1408 {
1409    assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1410    if (cmd->state.gfx.shaders[stage] == shader)
1411       return;
1412 
1413    cmd->state.gfx.shaders[stage] = shader;
1414    cmd->state.gfx.shaders_dirty |= mesa_to_vk_shader_stage(stage);
1415 }
1416 
1417 uint32_t
1418 nvk_mme_tess_params(enum nak_ts_domain domain,
1419                     enum nak_ts_spacing spacing,
1420                     enum nak_ts_prims prims)
1421 {
1422    /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
1423     * extra bit for lower_left
1424     */
1425    uint16_t params = ((uint16_t)domain << 0) |
1426                      ((uint16_t)spacing << 4) |
1427                      ((uint16_t)prims << 8);
1428    return nvk_mme_val_mask(params, 0x0fff);
1429 }
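/* Worked example for nvk_mme_tess_params(), with enum values inferred from
 * the test cases below and assuming nvk_mme_val_mask() packs the mask in
 * the high 16 bits and the value in the low 16 bits: NAK_TS_DOMAIN_TRIANGLE
 * = 1, NAK_TS_SPACING_INTEGER = 0, and NAK_TS_PRIMS_TRIANGLES_CW = 2 pack
 * to params == 0x0201 and the returned word is 0x0fff0201.
 */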
1430 
1431 static uint32_t
1432 nvk_mme_tess_lower_left(bool lower_left)
1433 {
1434    return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
1435 }
1436 
1437 void
1438 nvk_mme_set_tess_params(struct mme_builder *b)
1439 {
1440    struct mme_value val_mask = mme_load(b);
1441    struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
1442    struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
1443    mme_free_reg(b, val_mask);
1444 
1445    mme_if(b, ine, params, old_params) {
1446       nvk_mme_store_scratch(b, TESS_PARAMS, params);
1447 
1448       /* lower_left lives at bit 12 */
1449       struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);
1450 
1451       /* Only the bottom 12 bits are valid to put in HW */
1452       mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);
1453 
1454       /* If we're using a lower-left orientation, we need to flip triangles
1455        * between CW and CCW.
1456        */
1457       mme_if(b, ine, lower_left, mme_zero()) {
1458          struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
1459          struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);
1460 
1461          struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
1462          mme_if(b, ieq, prims, prims_cw) {
1463             mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
1464          }
1465          mme_if(b, ieq, prims, prims_ccw) {
1466             mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
1467          }
1468          mme_free_reg(b, prims);
1469       }
1470       mme_free_reg(b, lower_left);
1471 
1472       mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
1473       mme_emit(b, params);
1474    }
1475 }
1476 
1477 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1478    /* This case doesn't change the state so it should do nothing */
1479    .init = (struct nvk_mme_mthd_data[]) {
1480       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1481       { }
1482    },
1483    .params = (uint32_t[]) { 0xffff0000 },
1484    .expected = (struct nvk_mme_mthd_data[]) {
1485       { }
1486    },
1487 }, {
1488    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1489    .init = (struct nvk_mme_mthd_data[]) {
1490       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1491       { }
1492    },
1493    .params = (uint32_t[]) { 0xffff0201 },
1494    .expected = (struct nvk_mme_mthd_data[]) {
1495       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1496       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1497       { }
1498    },
1499 }, {
1500    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1501    .init = (struct nvk_mme_mthd_data[]) {
1502       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1503       { }
1504    },
1505    .params = (uint32_t[]) { 0x10001000 },
1506    .expected = (struct nvk_mme_mthd_data[]) {
1507       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1508       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1509       { }
1510    },
1511 }, {
1512    /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1513    .init = (struct nvk_mme_mthd_data[]) {
1514       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1515       { }
1516    },
1517    .params = (uint32_t[]) { 0x10001000 },
1518    .expected = (struct nvk_mme_mthd_data[]) {
1519       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1520       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1521       { }
1522    },
1523 }, {}};
1524 
1525 void
1526 nvk_cmd_flush_gfx_shaders(struct nvk_cmd_buffer *cmd)
1527 {
1528    if (cmd->state.gfx.shaders_dirty == 0)
1529       return;
1530 
1531    /* Map shader types to shaders */
1532    struct nvk_shader *type_shader[6] = { NULL, };
1533    uint32_t types_dirty = 0;
1534 
1535    u_foreach_bit(s, cmd->state.gfx.shaders_dirty &
1536                     NVK_SHADER_STAGE_GRAPHICS_BITS) {
1537       gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1538       uint32_t type = mesa_to_nv9097_shader_type(stage);
1539       types_dirty |= BITFIELD_BIT(type);
1540 
1541       /* Only copy non-NULL shaders because mesh/task alias with vertex and
1542        * tessellation stages.
1543        */
1544       struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1545       if (shader != NULL) {
1546          assert(type < ARRAY_SIZE(type_shader));
1547          assert(type_shader[type] == NULL);
1548          type_shader[type] = shader;
1549 
1550          const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1551          struct nvk_cbuf_group *cbuf_group =
1552             &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1553          for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1554             if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1555                        sizeof(cbuf_group->cbufs[i])) != 0) {
1556                cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1557                cbuf_group->dirty |= BITFIELD_BIT(i);
1558             }
1559          }
1560       }
1561    }
1562 
1563    u_foreach_bit(type, types_dirty) {
1564       struct nvk_shader *shader = type_shader[type];
1565       if (shader == NULL) {
1566          struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1567          P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
1568             .enable  = ENABLE_FALSE,
1569             .type    = type,
1570          });
1571       } else {
1572          struct nv_push *p = nvk_cmd_buffer_push(cmd, shader->push_dw_count);
1573          nv_push_raw(p, shader->push_dw, shader->push_dw_count);
1574       }
1575    }
1576 
1577    if (cmd->state.gfx.shaders_dirty & NVK_SHADER_STAGE_VTGM_BITS) {
1578       struct nvk_shader *last_vtgm = NULL;
1579       u_foreach_bit(s, NVK_SHADER_STAGE_VTGM_BITS) {
1580          gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1581          if (cmd->state.gfx.shaders[stage] != NULL)
1582             last_vtgm = cmd->state.gfx.shaders[stage];
1583       }
1584 
1585       assert(last_vtgm->vtgm_push_dw_count > last_vtgm->push_dw_count);
1586       const uint16_t dw_start = last_vtgm->push_dw_count;
1587       const uint16_t dw_count = last_vtgm->vtgm_push_dw_count - dw_start;
1588       struct nv_push *p = nvk_cmd_buffer_push(cmd, dw_count);
1589       nv_push_raw(p, &last_vtgm->push_dw[dw_start], dw_count);
1590    }
1591 
1592    cmd->state.gfx.shaders_dirty = 0;
1593 }
1594 
1595 void
1596 nvk_mme_set_vb_enables(struct mme_builder *b)
1597 {
1598    struct mme_value enables = mme_load(b);
1599    struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1600    nvk_mme_store_scratch(b, VB_ENABLES, enables);
1601 
1602    struct mme_value changed = mme_xor(b, enables, old_enables);
1603    mme_free_reg(b, old_enables);
1604 
1605    struct mme_value vb_idx4 = mme_mov(b, mme_zero());
1606    mme_while(b, ine, changed, mme_zero()) {
1607       mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1608          struct mme_value state =
1609             mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1610          mme_merge_to(b, state, state, enables, 12, 1, 0);
1611          mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1612          mme_emit(b, state);
1613       }
1614       mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1615       mme_srl_to(b, changed, changed, mme_imm(1));
1616       mme_srl_to(b, enables, enables, mme_imm(1));
1617    }
1618 }
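/* A sketch of the nvk_mme_set_vb_enables() loop above: if only VB binding 2
 * toggles, changed == 0x4, so the loop shifts past two clear bits and lands
 * on vb_idx4 == 8 (the SET_VERTEX_STREAM_A_FORMAT methods form an array
 * with a stride of 4 dwords, hence the index scaled by 4).  Only bit 12 of
 * stream 2's format word, the enable bit, is rewritten; the rest of the
 * format state is read back and preserved.
 */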
1619 
1620 static uint32_t
1621 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1622 {
1623    assert(stride < (1 << 12));
1624    assert(vb_idx < (1 << 5));
1625    return (vb_idx << 16) | stride;
1626 }
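/* For example, nvk_mme_vb_stride(3, 32) == 0x00030020: the binding index
 * lands in bits 16..20 and the stride in bits 0..11, matching the param
 * layout consumed by nvk_mme_set_vb_stride() below.
 */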
1627 
1628 void
1629 nvk_mme_set_vb_stride(struct mme_builder *b)
1630 {
1631    /* Param is laid out as
1632     *
1633     *    bits 0..11  : stride
1634     *    bits 16..20 : VB index
1635     */
1636    struct mme_value param = mme_load(b);
1637 
1638    struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1639 
1640    struct mme_value state =
1641       mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1642    struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1643    mme_if(b, ine, state, new_state) {
1644       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1645       mme_emit(b, new_state);
1646    }
1647 }
1648 
1649 static void
1650 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1651 {
1652    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1653    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1654    const struct vk_dynamic_graphics_state *dyn =
1655       &cmd->vk.dynamic_graphics_state;
1656 
1657    struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1658 
1659    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1660       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1661       P_INLINE_DATA(p, dyn->vi->bindings_valid);
1662    }
1663 
1664    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1665        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1666       u_foreach_bit(a, dyn->vi->attributes_valid) {
1667          const struct nvk_va_format *fmt =
1668             nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1669 
1670          P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1671             .stream                 = dyn->vi->attributes[a].binding,
1672             .offset                 = dyn->vi->attributes[a].offset,
1673             .component_bit_widths   = fmt->bit_widths,
1674             .numerical_type         = fmt->type,
1675             .swap_r_and_b           = fmt->swap_rb,
1676          });
1677       }
1678 
1679       u_foreach_bit(b, dyn->vi->bindings_valid) {
1680          const bool instanced = dyn->vi->bindings[b].input_rate ==
1681                                 VK_VERTEX_INPUT_RATE_INSTANCE;
1682          P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1683          P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1684             dyn->vi->bindings[b].divisor);
1685       }
1686    }
1687 
1688    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1689        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1690       u_foreach_bit(b, dyn->vi->bindings_valid) {
1691          assert(dyn->vi_binding_strides[b] < (1 << 12));
1692          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1693          P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1694       }
1695    }
1696 }
1697 
1698 static uint32_t
1699 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1700 {
1701    switch (prim) {
1702    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1703       return NV9097_BEGIN_OP_POINTS;
1704    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1705       return NV9097_BEGIN_OP_LINES;
1706    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1707       return NV9097_BEGIN_OP_LINE_STRIP;
1708    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1709 #pragma GCC diagnostic push
1710 #pragma GCC diagnostic ignored "-Wswitch"
1711    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1712 #pragma GCC diagnostic pop
1713       return NV9097_BEGIN_OP_TRIANGLES;
1714    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1715       return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1716    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1717       return NV9097_BEGIN_OP_TRIANGLE_FAN;
1718    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1719       return NV9097_BEGIN_OP_LINELIST_ADJCY;
1720    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1721       return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1722    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1723       return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1724    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1725       return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1726    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1727       return NV9097_BEGIN_OP_PATCH;
1728    default:
1729       unreachable("Invalid primitive topology");
1730    }
1731 }
1732 
1733 static void
1734 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1735 {
1736    const struct vk_dynamic_graphics_state *dyn =
1737       &cmd->vk.dynamic_graphics_state;
1738 
1739    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1740       uint32_t begin;
1741       V_NV9097_BEGIN(begin, {
1742          .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1743          .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1744          .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1745          .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1746       });
1747 
1748       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1749       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1750       P_INLINE_DATA(p, begin);
1751    }
1752 
1753    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1754       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1755       P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1756              dyn->ia.primitive_restart_enable);
1757    }
1758 }
1759 
1760 static void
1761 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1762 {
1763    const struct vk_dynamic_graphics_state *dyn =
1764       &cmd->vk.dynamic_graphics_state;
1765    struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1766 
1767    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1768       /* The hardware gets grumpy if we set this to 0 so make sure we set it
1769        * to at least 1 in case it's dirty but uninitialized.
1770        */
1771       P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1772    }
1773 
1774    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1775       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1776       P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1777          dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1778    }
1779 }
1780 
1781 static void
1782 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1783 {
1784    const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1785 
1786    const struct vk_dynamic_graphics_state *dyn =
1787       &cmd->vk.dynamic_graphics_state;
1788 
1789    struct nv_push *p =
1790       nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1791 
1792    /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1793 
1794    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1795        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
1796        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE)) {
1797       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1798          const VkViewport *vp = &dyn->vp.viewports[i];
1799 
1800          /* These exactly match the spec values.  Nvidia hardware oddities
1801           * are accounted for later.
1802           */
1803          const float o_x = vp->x + 0.5f * vp->width;
1804          const float o_y = vp->y + 0.5f * vp->height;
1805          const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1806                            vp->minDepth :
1807                            (vp->maxDepth + vp->minDepth) * 0.5f;
1808 
1809          const float p_x = vp->width;
1810          const float p_y = vp->height;
1811          const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1812                            vp->maxDepth - vp->minDepth :
1813                            (vp->maxDepth - vp->minDepth) * 0.5f;
1814 
1815          P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1816          P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1817          P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1818          P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1819 
1820          P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1821          P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1822          P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
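         /* For the transform above: e.g. a viewport of x = 0, y = 0,
          * width = 1920, height = 1080, minDepth = 0, maxDepth = 1 with
          * negativeOneToOne disabled yields scale = (960, 540, 1) and
          * offset = (960, 540, 0).
          */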
1823 
1824          const bool user_defined_range =
1825             dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT;
1826          float xmin = vp->x;
1827          float xmax = vp->x + vp->width;
1828          float ymin = MIN2(vp->y, vp->y + vp->height);
1829          float ymax = MAX2(vp->y, vp->y + vp->height);
1830          float zmin = user_defined_range ?
1831                       dyn->vp.depth_clamp_range.minDepthClamp :
1832                       MIN2(vp->minDepth, vp->maxDepth);
1833          float zmax = user_defined_range ?
1834                       dyn->vp.depth_clamp_range.maxDepthClamp :
1835                       MAX2(vp->minDepth, vp->maxDepth);
1836          assert(xmin <= xmax && ymin <= ymax && zmin <= zmax);
1837 
1838          const float max_dim = (float)0xffff;
1839          xmin = CLAMP(xmin, 0, max_dim);
1840          xmax = CLAMP(xmax, 0, max_dim);
1841          ymin = CLAMP(ymin, 0, max_dim);
1842          ymax = CLAMP(ymax, 0, max_dim);
1843 
1844          if (!dev->vk.enabled_extensions.EXT_depth_range_unrestricted) {
1845             assert(0.0 <= zmin && zmin <= 1.0);
1846             assert(0.0 <= zmax && zmax <= 1.0);
1847          }
1848 
1849          P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1850          P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1851             .x0      = xmin,
1852             .width   = xmax - xmin,
1853          });
1854          P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1855             .y0      = ymin,
1856             .height  = ymax - ymin,
1857          });
1858 
1859          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1860             P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1861             P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1862          } else {
1863             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1864             P_INLINE_DATA(p, i);
1865             P_INLINE_DATA(p, fui(zmin));
1866             P_INLINE_DATA(p, fui(zmax));
1867          }
1868 
1869          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1870             P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1871                .x = X_POS_X,
1872                .y = Y_POS_Y,
1873                .z = Z_POS_Z,
1874                .w = W_POS_W,
1875             });
1876          }
1877       }
1878    }
1879 
1880    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1881       P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1882              dyn->vp.depth_clip_negative_one_to_one ?
1883              RANGE_NEGATIVE_W_TO_POSITIVE_W :
1884              RANGE_ZERO_TO_POSITIVE_W);
1885    }
1886 
1887    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1888       for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1889          P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1890    }
1891 
1892    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1893       for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1894          const VkRect2D *s = &dyn->vp.scissors[i];
1895 
1896          const uint32_t xmin = MIN2(16384, s->offset.x);
1897          const uint32_t xmax = MIN2(16384, s->offset.x + s->extent.width);
1898          const uint32_t ymin = MIN2(16384, s->offset.y);
1899          const uint32_t ymax = MIN2(16384, s->offset.y + s->extent.height);
1900 
1901          P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1902          P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1903          P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1904             .xmin = xmin,
1905             .xmax = xmax,
1906          });
1907          P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1908             .ymin = ymin,
1909             .ymax = ymax,
1910          });
1911       }
1912    }
1913 }
1914 
1915 static uint32_t
1916 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1917 {
1918    ASSERTED uint16_t vk_to_nv9097[] = {
1919       [VK_POLYGON_MODE_FILL]  = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1920       [VK_POLYGON_MODE_LINE]  = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1921       [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1922    };
1923    assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1924 
1925    uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1926    assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1927    return nv9097_mode;
1928 }
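/* The 0x1b00 | (2 - vk_mode) shortcut works because the NV9097 values are
 * the classic GL tokens (0x1B00 = POINT, 0x1B01 = LINE, 0x1B02 = FILL)
 * while Vulkan numbers them the other way (FILL = 0, LINE = 1, POINT = 2);
 * e.g. VK_POLYGON_MODE_FILL maps to 0x1b02.  The assert against the table
 * keeps the shortcut honest.
 */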
1929 
1930 static uint32_t
1931 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1932 {
1933    static const uint16_t vk_to_nv9097[] = {
1934       [VK_CULL_MODE_FRONT_BIT]      = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1935       [VK_CULL_MODE_BACK_BIT]       = NV9097_OGL_SET_CULL_FACE_V_BACK,
1936       [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1937    };
1938    assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1939    return vk_to_nv9097[vk_cull_mode];
1940 }
1941 
1942 static uint32_t
1943 vk_to_nv9097_front_face(VkFrontFace vk_face)
1944 {
1945    /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1946     * convention in which framebuffer coordinates always start in the upper
1947     * left while OpenGL has framebuffer coordinates starting in the lower
1948     * left.  Therefore, we want the reverse of the hardware enum name.
1949     */
1950    ASSERTED static const uint16_t vk_to_nv9097[] = {
1951       [VK_FRONT_FACE_COUNTER_CLOCKWISE]   = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1952       [VK_FRONT_FACE_CLOCKWISE]           = NV9097_OGL_SET_FRONT_FACE_V_CW,
1953    };
1954    assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1955 
1956    uint32_t nv9097_face = 0x900 | (1 - vk_face);
1957    assert(nv9097_face == vk_to_nv9097[vk_face]);
1958    return nv9097_face;
1959 }
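/* Same shortcut as vk_to_nv9097_polygon_mode(): per the ASSERTED table the
 * hardware values are the GL winding tokens (V_CW = 0x900, V_CCW = 0x901),
 * so 0x900 | (1 - vk_face) maps VK_FRONT_FACE_COUNTER_CLOCKWISE (0) to
 * 0x901 and VK_FRONT_FACE_CLOCKWISE (1) to 0x900.
 */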
1960 
1961 static uint32_t
1962 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1963 {
1964    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1965                  NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1966    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1967                  NV9097_SET_PROVOKING_VERTEX_V_LAST);
1968    return vk_mode;
1969 }
1970 
1971 void
1972 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1973 {
1974    struct mme_value vp_idx = mme_load(b);
1975    struct mme_value min_z = mme_load(b);
1976    struct mme_value max_z = mme_load(b);
1977 
1978    /* Multiply by 2 because it's an array with stride 8 */
1979    mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1980    mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1981    mme_emit(b, min_z);
1982    mme_emit(b, max_z);
1983 
1984    struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1985    mme_if(b, ine, z_clamp, mme_zero()) {
1986       /* Multiply by 2 again because this array has stride 16 */
1987       mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1988       mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1989       mme_emit(b, min_z);
1990       mme_emit(b, max_z);
1991    }
1992 }
1993 
1994 void
1995 nvk_mme_set_z_clamp(struct mme_builder *b)
1996 {
1997    struct mme_value z_clamp = mme_load(b);
1998    struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1999    mme_if(b, ine, z_clamp, old_z_clamp) {
2000       nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
2001 
2002       mme_if(b, ine, z_clamp, mme_zero()) {
2003          struct mme_value i_2 = mme_mov(b, mme_zero());
2004          mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
2005             struct mme_value min_z =
2006                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
2007             struct mme_value max_z =
2008                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
2009 
2010             struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
2011             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2012             mme_emit(b, min_z);
2013             mme_emit(b, max_z);
2014 
2015             mme_free_reg(b, i_4);
2016             mme_free_reg(b, min_z);
2017             mme_free_reg(b, max_z);
2018 
2019             mme_add_to(b, i_2, i_2, mme_imm(2));
2020          }
2021          mme_free_reg(b, i_2);
2022       }
2023       mme_if(b, ieq, z_clamp, mme_zero()) {
2024          struct mme_value i_4 = mme_mov(b, mme_zero());
2025          mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
2026             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2027             mme_emit(b, mme_imm(fui(-INFINITY)));
2028             mme_emit(b, mme_imm(fui(INFINITY)));
2029 
2030             mme_add_to(b, i_4, i_4, mme_imm(4));
2031          }
2032          mme_free_reg(b, i_4);
2033       }
2034    }
2035 }
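/* Behavior sketch for nvk_mme_set_z_clamp(): when Z_CLAMP flips to 0, the
 * second loop above rewrites every viewport's CLIP_MIN_Z/MAX_Z pair to
 * -INF/+INF, deferring depth clipping to the guardband configured in
 * nvk_flush_rs_state(); when it flips back to 1, the first loop replays the
 * per-viewport values saved in the VIEWPORT0_MIN_Z/MAX_Z scratch array by
 * nvk_mme_set_viewport_min_max_z().
 */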
2036 
2037 static void
2038 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
2039 {
2040    const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2041    const struct vk_dynamic_graphics_state *dyn =
2042       &cmd->vk.dynamic_graphics_state;
2043    const struct nvk_rendering_state *render =
2044       &cmd->state.gfx.render;
2045 
2046    struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
2047 
2048    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
2049       P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
2050 
2051    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2052        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2053       const bool z_clamp = dyn->rs.depth_clamp_enable;
2054       const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2055       /* z_clamp_zero_one accounts for the interaction between
2056        * depthClampZeroOne and depthRangeUnrestricted as mentioned in the
2057        * Vulkan spec. depthClampZeroOne adds an additional clamp and doesn't
2058        * modify the clip/clamp threshold.  We are expected to clamp to [0,1]
2059        * when any one of these conditions is fulfilled:
2060        * - depth_range_unrestricted is not enabled
2061        * - depthClampZeroOne is enabled but depth
2062        *    format is not floating point or depthRangeUnrestricted
2063        *    is not enabled
2064        * - fixed point depth format
2065        */
2066       const bool z_clamp_zero_one =
2067          !vk_format_has_float_depth(render->depth_att.vk_format) ||
2068          (dev->vk.enabled_features.depthClampZeroOne &&
2069          !dev->vk.enabled_extensions.EXT_depth_range_unrestricted);
2070 
2071       P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2072          /* We only set Z clip range if clamp is requested.  Otherwise, we
2073           * leave it set to -/+INF and clip using the guardband below.
2074           *
2075           * depthClampZeroOne is independent of normal depth clamping and
2076           * does not modify the clip/clamp threshold.  The Vulkan spec
2077           * guarantees that, in the cases where depthClampZeroOne applies,
2078           * the [zmin, zmax] is inside [0, 1].  This means that, if z_clamp
2079           * is enabled, we can just do the regular clamp.  If z_clamp is
2080           * disabled and z_clamp_zero_one is enabled then we need to
2081           * apply the [0, 1] clamp.
2082           */
2083          .min_z_zero_max_z_one = (!z_clamp && z_clamp_zero_one)
2084                                  ? MIN_Z_ZERO_MAX_Z_ONE_TRUE
2085                                  : MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2086          .z_clip_range = (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A &&
2087                           (z_clamp || !z_clamp_zero_one))
2088                          ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2089                                     : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2090                          : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2091 
2092          .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2093          .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2094 
2095          .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2096          .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2097          .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2098                                  : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2099 
2100          /* We clip depth with the geometry clipper to ensure that it gets
2101           * clipped before depth bias is applied.  If we leave it up to the
2102           * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2103           * in the pipeline.  This can be seen in two different ways:
2104           *
2105           *  - When depth bias is enabled, the bias is applied post-clipping.
2106           *    If we clip in the rasterizer, it will clip according to the
2107           *    post-bias depth which is wrong.
2108           *
2109           *  - If the fragment shader overrides the depth by writing to
2110           *    gl_FragDepth, it should be clipped according to the original
2111           *    geometry, not according to gl_FragDepth.
2112           *
2113           * In order to always get the geometry clipper, we need to set a
2114           * tight guardband (geometry_guardband_z = SCALE_1).
2115           */
2116          .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2117                                         : GEOMETRY_GUARDBAND_Z_SCALE_256,
2118       });
2119 
2120       /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2121        * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2122        * based on whether or not z_clamp is set. This is done by a pair of
2123        * macros, one of which is called here and the other is called in
2124        * viewport setup.
2125        */
2126       if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2127          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2128          P_INLINE_DATA(p, z_clamp);
2129       }
2130    }
2131 
2132    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2133       uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2134       P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2135       P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2136       P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2137    }
2138 
2139    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2140       P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2141 
2142       if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2143          uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2144          P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2145       }
2146    }
2147 
2148    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2149       P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2150          vk_to_nv9097_front_face(dyn->rs.front_face));
2151    }
2152 
2153    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2154       P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2155              vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2156    }
2157 
2158    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2159       P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2160       P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2161       P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2162       P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2163    }
2164 
2165    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2166       switch (dyn->rs.depth_bias.representation) {
2167       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2168          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2169                 DEPTH_FORMAT_DEPENDENT_TRUE);
2170          break;
2171       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2172          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2173                 DEPTH_FORMAT_DEPENDENT_FALSE);
2174          break;
2175       case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2176       default:
2177          unreachable("Unsupported depth bias representation");
2178       }
2179       /* TODO: The blob multiplies by 2 for some reason. We don't. */
2180       P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant_factor));
2181       P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope_factor));
2182       P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2183    }
2184 
2185    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2186       P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2187       P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2188       P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2189    }
2190 
2191    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2192       switch (dyn->rs.line.mode) {
2193       case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2194       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2195          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2196          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2197          break;
2198 
2199       case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2200          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2201          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2202          break;
2203 
2204       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2205          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2206          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2207          break;
2208 
2209       default:
2210          unreachable("Invalid line rasterization mode");
2211       }
2212    }
2213 
2214    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2215       P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2216 
2217    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2218       /* map factor from [1, 256] to [0, 255] */
2219       uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2220       P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2221          .factor  = stipple_factor,
2222          .pattern = dyn->rs.line.stipple.pattern,
2223       });
2224    }
2225 
2226    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2227       P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2228 
2229    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2230        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2231       if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2232          assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2233       } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2234          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2235       } else {
2236          uint32_t extra_overestimate =
2237             MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2238 
2239          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2240             P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2241                .extra_prim_bloat = extra_overestimate,
2242                .copy_inner_to_outer =
2243                   (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2244                .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2245                .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2246                .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2247             });
2248          } else {
2249             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2250             P_INLINE_DATA(p, extra_overestimate << 23);
2251          }
2252          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2253       }
2254    }
2255 }
2256 
2257 uint32_t
2258 nvk_mme_shading_rate_control_sample_shading(bool sample_shading)
2259 {
2260    return nvk_mme_val_mask((!sample_shading) << 1, 1 << 1);
2261 }
2262 
2263 static uint32_t
2264 nvk_mme_shading_rate_control_enable(bool enable)
2265 {
2266    return nvk_mme_val_mask(enable, 1 << 0);
2267 }
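/* The SHADING_RATE_CONTROL scratch word packs two conditions: bit 0 is the
 * FSR enable programmed by nvk_flush_fsr_state() and bit 1 is set when
 * sample shading is off.  E.g. nvk_mme_shading_rate_control_enable(true)
 * returns 0x00010001 and ..._sample_shading(false) returns 0x00020002.
 * The macro below ANDs the two bits, so variable-rate shading is only
 * enabled when FSR is on and sample shading is disabled.
 */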
2268 
2269 void
2270 nvk_mme_set_shading_rate_control(struct mme_builder *b)
2271 {
2272    if (b->devinfo->cls_eng3d < TURING_A)
2273       return;
2274 
2275    struct mme_value val_mask = mme_load(b);
2276    struct mme_value old_src = nvk_mme_load_scratch(b, SHADING_RATE_CONTROL);
2277    struct mme_value src = nvk_mme_set_masked(b, old_src, val_mask);
2278    mme_free_reg(b, val_mask);
2279 
2280    mme_if(b, ine, src, old_src) {
2281       mme_free_reg(b, old_src);
2282       nvk_mme_store_scratch(b, SHADING_RATE_CONTROL, src);
2283 
2284       struct mme_value enable1 = mme_merge(b, mme_zero(), src, 0, 1, 0);
2285       struct mme_value enable2 = mme_merge(b, mme_zero(), src, 0, 1, 1);
2286       struct mme_value enable = mme_and(b, enable1, enable2);
2287 
2288       struct mme_value i = mme_mov(b, mme_zero());
2289       mme_while(b, ine, i, mme_imm(16 * 4)) {
2290          mme_mthd_arr(b, NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(0), i);
2291          mme_emit(b, enable);
2292          mme_add_to(b, i, i, mme_imm(4));
2293       }
2294    }
2295 }
2296 
2297 static void
2298 nvk_mme_set_shading_rate_control_test_check(
2299    const struct nv_device_info *devinfo,
2300    const struct nvk_mme_test_case *test,
2301    const struct nvk_mme_mthd_data *results)
2302 {
2303    if (devinfo->cls_eng3d < TURING_A)
2304       return;
2305 
2306    assert(results[0].mthd == NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL));
2307    bool enable = (results[0].data & 3) == 3;
2308 
2309    for (uint32_t i = 0; i < 16; i++) {
2310       assert(results[i + 1].mthd ==
2311              NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(i));
2312       assert(results[i + 1].data == enable);
2313    }
2314 }
2315 
2316 const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[] = {{
2317    .init = (struct nvk_mme_mthd_data[]) {
2318       { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2319       { }
2320    },
2321    .params = (uint32_t[]) { 0x00030003 },
2322    .check = nvk_mme_set_shading_rate_control_test_check,
2323 }, {
2324    .init = (struct nvk_mme_mthd_data[]) {
2325       { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2326       { }
2327    },
2328    .params = (uint32_t[]) { 0x00030001 },
2329    .check = nvk_mme_set_shading_rate_control_test_check,
2330 }, {}};
2331 
2332 static VkExtent2D
2333 nvk_combine_fs_log2_rates(VkFragmentShadingRateCombinerOpKHR op,
2334                           VkExtent2D a_log2, VkExtent2D b_log2)
2335 {
2336    switch (op) {
2337    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2338       return a_log2;
2339 
2340    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2341       return b_log2;
2342 
2343    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2344       return (VkExtent2D) {
2345          .width = MIN2(a_log2.width, b_log2.width),
2346          .height = MIN2(a_log2.height, b_log2.height),
2347       };
2348 
2349    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2350       return (VkExtent2D) {
2351          .width = MAX2(a_log2.width, b_log2.width),
2352          .height = MAX2(a_log2.height, b_log2.height),
2353       };
2354 
2355    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR:
2356       return (VkExtent2D) {
2357          .width = a_log2.width + b_log2.width,
2358          .height = a_log2.height + b_log2.height,
2359       };
2360 
2361    default:
2362       unreachable("Invalid FSR combiner op");
2363    }
2364 }
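/* Because the extents handled by nvk_combine_fs_log2_rates() are log2
 * values, MUL is just addition: e.g. combining 2x2 (log2 {1, 1}) with 4x1
 * (log2 {2, 0}) under VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR yields
 * log2 {3, 1}, i.e. 8x2, which vk_to_nvc597_shading_rate_log2() below then
 * clamps to a supported rate.
 */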
2365 
2366 static uint8_t
2367 vk_to_nvc597_shading_rate_log2(VkExtent2D rate_log2)
2368 {
2369    rate_log2.width = MIN2(rate_log2.width, 2);
2370    rate_log2.height = MIN2(rate_log2.height, 2);
2371    const uint8_t idx = (rate_log2.width << 2) | rate_log2.height;
2372 
2373    /* From the Vulkan 1.3.297 spec:
2374     *
2375     *    "A fragment shading rate Rxy representing any of Axy, Bxy or Cxy
2376     *    is clamped as follows. [...] From this list of supported rates,
2377     *    the following steps are applied in order, to select a single
2378     *    value:
2379     *
2380     *     1. Keep only rates where Rx' ≤ Rx and Ry' ≤ Ry.
2381     *
2382     *        - Implementations may also keep rates where Rx' ≤ Ry and
2383     *          Ry' ≤ Rx.
2384     *
2385     *     2. Keep only rates with the highest area (Rx' × Ry').
2386     *
2387     *     3. Keep only rates with the lowest aspect ratio (Rx' + Ry').
2388     *
2389     *     4. In cases where a wide (e.g. 4x1) and tall (e.g. 1x4) rate
2390     *        remain, the implementation may choose either rate. However, it
2391     *        must choose this rate consistently for the same shading rates,
2392     *        render pass transform, and combiner operations for the
2393     *        lifetime of the VkDevice.
2394     *
2395     * We have the following rates: 1x1, 2x1, 1x2, 2x2, 4x2, 2x4, 4x4.
2396     */
2397    static const uint8_t vk_to_nvc597[] = {
2398 #define NVC597_FSR(X) NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A_RATE_INDEX0_PS_##X
2399       NVC597_FSR(X1_PER_RASTER_PIXEL),
2400       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS),
2401       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x4 */
2402       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x8 */
2403       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS),
2404       NVC597_FSR(X1_PER_2X2_RASTER_PIXELS),
2405       NVC597_FSR(X1_PER_2X4_RASTER_PIXELS),
2406       NVC597_FSR(X1_PER_2X4_RASTER_PIXELS), /* 2x8 */
2407       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 4x1 */
2408       NVC597_FSR(X1_PER_4X2_RASTER_PIXELS),
2409       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS),
2410       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 4x8 */
2411       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 8x1 */
2412       NVC597_FSR(X1_PER_4X2_RASTER_PIXELS), /* 8x2 */
2413       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x4 */
2414       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x8 */
2415 #undef NVC597_FSR
2416    };
2417 
2418    assert(idx < ARRAY_SIZE(vk_to_nvc597));
2419    return vk_to_nvc597[idx];
2420 }
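/* For example, a 4x2 rate has rate_log2 = {2, 1}, so idx = (2 << 2) | 1 = 9
 * and we program X1_PER_4X2_RASTER_PIXELS.  Rates the hardware can't do
 * fall back to the annotated entries above, e.g. 8x4 clamps to 4x4.
 */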
2421 
2422 static void
2423 nvk_flush_fsr_state(struct nvk_cmd_buffer *cmd)
2424 {
2425    const struct vk_dynamic_graphics_state *dyn =
2426       &cmd->vk.dynamic_graphics_state;
2427 
2428    if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
2429       assert(vk_fragment_shading_rate_is_disabled(&dyn->fsr));
2430       return;
2431    }
2432 
2433    if (!BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
2434       return;
2435 
2436    if (vk_fragment_shading_rate_is_disabled(&dyn->fsr)) {
2437       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2438       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2439       P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(false));
2440    } else {
2441       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 16 * 3);
2442 
2443       assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.width));
2444       assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.height));
2445       const VkExtent2D state_fs_log2 = {
2446          .width = util_logbase2(dyn->fsr.fragment_size.width),
2447          .height = util_logbase2(dyn->fsr.fragment_size.height),
2448       };
2449 
2450       for (uint32_t prim_idx = 0; prim_idx < 16; prim_idx++) {
2451          const VkExtent2D prim_fs_log2 = {
2452             .width = (prim_idx >> 2) & 3,
2453             .height = prim_idx & 3,
2454          };
2455 
2456          const VkExtent2D state_prim_fs_log2 =
2457             nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[0],
2458                                       state_fs_log2, prim_fs_log2);
2459 
2460          uint8_t rates[16] = {};
2461          for (uint32_t att_idx = 0; att_idx < 16; att_idx++) {
2462             const VkExtent2D att_fs_log2 = {
2463                .width = (att_idx >> 2) & 3,
2464                .height = att_idx & 3,
2465             };
2466 
2467             const VkExtent2D fs_log2 =
2468                nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[1],
2469                                          state_prim_fs_log2, att_fs_log2);
2470 
2471             rates[att_idx] = vk_to_nvc597_shading_rate_log2(fs_log2);
2472          }
2473 
2474          P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(prim_idx));
2475          P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(p, prim_idx, {
2476             .rate_index0 = rates[0],
2477             .rate_index1 = rates[1],
2478             .rate_index2 = rates[2],
2479             .rate_index3 = rates[3],
2480             .rate_index4 = rates[4],
2481             .rate_index5 = rates[5],
2482             .rate_index6 = rates[6],
2483             .rate_index7 = rates[7],
2484          });
2485          P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_B(p, prim_idx, {
2486             .rate_index8 = rates[8],
2487             .rate_index9 = rates[9],
2488             .rate_index10 = rates[10],
2489             .rate_index11 = rates[11],
2490             .rate_index12 = rates[12],
2491             .rate_index13 = rates[13],
2492             .rate_index14 = rates[14],
2493             .rate_index15 = rates[15],
2494          });
2495       }
2496 
2497       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2498       P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(true));
2499    }
2500 }
2501 
2502 static uint32_t
2503 nvk_mme_anti_alias_init(void)
2504 {
2505    /* This is a representable value that we never actually program, so it
2506     * ensures that the macro will actually run the first time we set anything.
2507     */
2508    return 0xf;
2509 }
2510 
2511 uint32_t
2512 nvk_mme_anti_alias_min_sample_shading(float mss)
2513 {
2514    /* The value we want to compute in the MME is
2515     *
2516     *    passes = next_pow2(samples * minSampleShading)
2517     *
2518     * Since samples is already a power of two,
2519     *
2520     *    passes_log2 = log2_ceil(samples * minSampleShading)
2521     *                = log2_ceil(samples / (1.0 / minSampleShading))
2522     *                = samples_log2 - log2_floor(1.0 / minSampleShading)
2523     *
2524     * if we assume (1.0 / min_sample_shading) >= 1.0.  This last bit is
2525     * something we can compute in the MME as long as the float math on the
2526     * right-hand side happens on the CPU.
2527     */
2528    float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2529    uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2530 
2531    assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2532 
2533    return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2534 }
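/* Worked example for nvk_mme_anti_alias_min_sample_shading():
 * minSampleShading = 0.25 gives rcp_mss = 4.0 and rcp_mss_log2 = 2, so we
 * return 0x000f0002.  With 8 samples (samples_log2 = 3 from
 * nvk_mme_anti_alias_samples()), the macro then computes
 * passes_log2 = 3 - 2 = 1, i.e. 2 shading passes.
 */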
2535 
2536 static uint32_t
2537 nvk_mme_anti_alias_samples(uint32_t samples)
2538 {
2539    assert(util_is_power_of_two_or_zero(samples));
2540    const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2541 
2542    return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2543 }
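/* E.g. nvk_mme_anti_alias_samples(4) gives samples_log2 = 2 and returns
 * 0x00f00020, updating only bits 4..7 of the ANTI_ALIAS scratch word while
 * leaving the min-sample-shading bits alone.
 */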
2544 
2545 void
2546 nvk_mme_set_anti_alias(struct mme_builder *b)
2547 {
2548    struct mme_value val_mask = mme_load(b);
2549    struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2550    struct mme_value anti_alias =
2551       nvk_mme_set_masked(b, old_anti_alias, val_mask);
2552    mme_free_reg(b, val_mask);
2553 
2554    mme_if(b, ine, anti_alias, old_anti_alias) {
2555       mme_free_reg(b, old_anti_alias);
2556       nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2557 
2558       struct mme_value rcp_mss_log2 =
2559          mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2560       struct mme_value samples_log2 =
2561          mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2562       mme_free_reg(b, anti_alias);
2563 
2564       /* We've already done all the hard work on the CPU in
2565        * nvk_mme_anti_alias_min_sample_shading().  All we have to do here is
2566        * subtract the two log2 values and clamp so we don't go negative.
2567        */
2568       struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2569       mme_free_reg(b, rcp_mss_log2);
2570 
2571       /* passes = MAX(passes, 1) */
2572       struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
2573       mme_if(b, ine, neg, mme_zero()) {
2574          mme_mov_to(b, passes_log2, mme_zero());
2575       }
2576       mme_free_reg(b, neg);
2577 
2578       /*
2579        * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2580        *    ...
2581        *    .centroid = passes > 1 ? CENTROID_PER_PASS
2582        *                           : CENTROID_PER_FRAGMENT,
2583        * }
2584        */
2585       struct mme_value aac = mme_mov(b,
2586          mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2587                  << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2588       mme_if(b, ine, passes_log2, mme_zero()) {
2589          mme_mov_to(b, aac,
2590             mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2591                     << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2592       }
2593 
2594       struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2595       mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2596       mme_free_reg(b, passes);
2597 
2598       mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2599       mme_emit(b, aac);
2600       mme_free_reg(b, aac);
2601 
2602       /* Now we need to emit sample masks per-sample. Annoyingly, we have to
2603        * pack these in pairs.
2604        */
2605       STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
2606 
2607       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2608       mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2609       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2610 
2613       struct mme_value samples_per_pass_log2 =
2614          mme_sub(b, samples_log2, passes_log2);
2615       mme_free_reg(b, samples_log2);
2616 
2617       mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2618          /* One sample per pass, we can just blast it out */
2619          for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2620             uint32_t mask0 = 1 << i;
2621             uint32_t mask1 = 1 << (i + 1);
2622             mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2623          }
2624       }
2625 
2626       mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2627          mme_if(b, ieq, passes_log2, mme_zero()) {
2628             /* It's a single pass so we can use 0xffff */
2629             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++)
2630                mme_emit(b, mme_imm(~0));
2631          }
2632 
2633          mme_if(b, ieq, passes_log2, mme_imm(1)) {
2634             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2635                struct mme_value mask =
2636                   nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_2PASS_0, i);
2637                mme_emit(b, mask);
2638                mme_free_reg(b, mask);
2639             }
2640          }
2641 
2642          mme_if(b, ieq, passes_log2, mme_imm(2)) {
2643             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2644                struct mme_value mask =
2645                   nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_4PASS_0, i);
2646                mme_emit(b, mask);
2647                mme_free_reg(b, mask);
2648             }
2649          }
2650       }
2651    }
2652 }
2653 
2654 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2655    /* This case doesn't change the state so it should do nothing */
2656    .init = (struct nvk_mme_mthd_data[]) {
2657       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2658       { }
2659    },
2660    .params = (uint32_t[]) { 0xffff0000 },
2661    .expected = (struct nvk_mme_mthd_data[]) {
2662       { }
2663    },
2664 }, {
2665    /* Single sample, minSampleShading = 1.0 */
2666    .init = (struct nvk_mme_mthd_data[]) {
2667       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2668       { }
2669    },
2670    .params = (uint32_t[]) { 0xffff0000 },
2671    .expected = (struct nvk_mme_mthd_data[]) {
2672       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2673       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2674       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2675         nvk_root_descriptor_offset(draw.sample_masks) },
2676       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2677       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2678       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2679       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2680       { }
2681    },
2682 }, {
2683    /* Single sample, minSampleShading = 0.25 */
2684    .init = (struct nvk_mme_mthd_data[]) {
2685       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2686       { }
2687    },
2688    .params = (uint32_t[]) { 0xffff0002 },
2689    .expected = (struct nvk_mme_mthd_data[]) {
2690       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2691       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2692       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2693         nvk_root_descriptor_offset(draw.sample_masks) },
2694       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2695       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2696       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2697       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2698       { }
2699    },
2700 }, {
2701    /* 8 samples, minSampleShading = 0.5 */
2702    .init = (struct nvk_mme_mthd_data[]) {
2703       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2704       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_0), 0x030003 },
2705       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_1), 0x0c000c },
2706       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_2), 0x300030 },
2707       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_3), 0xc000c0 },
2708       { }
2709    },
2710    .params = (uint32_t[]) { 0x00f00030 },
2711    .expected = (struct nvk_mme_mthd_data[]) {
2712       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2713       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2714       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2715         nvk_root_descriptor_offset(draw.sample_masks) },
2716       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2717       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2718       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2719       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2720       { }
2721    },
2722 }, {
2723    /* 8 samples, minSampleShading = 0.25 */
2724    .init = (struct nvk_mme_mthd_data[]) {
2725       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2726       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_0), 0x0f000f },
2727       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_1), 0x0f000f },
2728       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_2), 0xf000f0 },
2729       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_3), 0xf000f0 },
2730       { }
2731    },
2732    .params = (uint32_t[]) { 0x000f0002 },
2733    .expected = (struct nvk_mme_mthd_data[]) {
2734       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2735       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2736       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2737         nvk_root_descriptor_offset(draw.sample_masks) },
2738       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2739       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2740       { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2741       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2742       { }
2743    },
2744 }, {}};
2745 
2746 static VkSampleLocationEXT
2747 vk_sample_location(const struct vk_sample_locations_state *sl,
2748                    uint32_t x, uint32_t y, uint32_t s)
2749 {
2750    x = x % sl->grid_size.width;
2751    y = y % sl->grid_size.height;
2752 
2753    return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2754 }
2755 
2756 static struct nak_sample_location
2757 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2758 {
2759    return (struct nak_sample_location) {
2760       .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2761       .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2762    };
2763 }
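
/* Editor's note: nak sample locations are 0.4 unsigned fixed-point, i.e.
 * sixteenths of a pixel, so a standard centered sample at (0.5, 0.5) packs
 * as x_u4 = y_u4 = 8.
 */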
2764 
2765 static void
2766 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2767 {
2768    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2769    const struct vk_dynamic_graphics_state *dyn =
2770       &cmd->vk.dynamic_graphics_state;
2771 
2772    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2773       /* When we don't have any attachments, we can't know the sample count
2774        * from the render pass so we need to emit SET_ANTI_ALIAS here.  See the
2775        * comment in nvk_BeginRendering() for more details.
2776        */
2777       if (render->samples == 0) {
2778          /* Multisample information MAY be missing (rasterizationSamples == 0)
2779           * if rasterizer discard is enabled.  However, this isn't valid in
2780           * the hardware so always use at least one sample.
2781           */
2782          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2783          nvk_cmd_set_sample_layout(cmd, nil_choose_sample_layout(samples));
2784       } else {
2785          /* Multisample information MAY be missing (rasterizationSamples == 0)
2786           * if rasterizer discard is enabled.
2787           */
2788          assert(dyn->ms.rasterization_samples == 0 ||
2789                 dyn->ms.rasterization_samples == render->samples);
2790       }
2791    }
2792 
2793    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2794        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2795       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2796       P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2797          .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2798          .alpha_to_one      = dyn->ms.alpha_to_one_enable,
2799       });
2800    }
2801 
2802    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2803        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2804        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2805       const struct vk_sample_locations_state *sl;
2806       if (dyn->ms.sample_locations_enable) {
2807          sl = dyn->ms.sample_locations;
2808       } else {
2809          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2810          sl = vk_standard_sample_locations_state(samples);
2811       }
2812 
2813       struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2814       for (uint32_t i = 0; i < sl->per_pixel; i++)
2815          push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2816 
2817       nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2818                                           draw.sample_locations,
2819                                           0, NVK_MAX_SAMPLES, push_sl);
2820 
2821       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2822          struct nak_sample_location loc[16];
2823          for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2824             const uint32_t s = n % sl->per_pixel;
2825             const uint32_t px = n / sl->per_pixel;
2826             const uint32_t x = px % 2;
2827             const uint32_t y = px / 2;
2828 
2829             loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2830          }
2831 
2832          struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2833 
2834          P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2835          for (uint32_t i = 0; i < 4; i++) {
2836             P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2837                .x0 = loc[i * 4 + 0].x_u4,
2838                .y0 = loc[i * 4 + 0].y_u4,
2839                .x1 = loc[i * 4 + 1].x_u4,
2840                .y1 = loc[i * 4 + 1].y_u4,
2841                .x2 = loc[i * 4 + 2].x_u4,
2842                .y2 = loc[i * 4 + 2].y_u4,
2843                .x3 = loc[i * 4 + 3].x_u4,
2844                .y3 = loc[i * 4 + 3].y_u4,
2845             });
2846          }
2847       }
2848    }
2849 
2850    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2851       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2852       P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2853       P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2854       P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2855       P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2856       P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2857    }
2858 }
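
/* Editor's note: the hardware takes a separate sample mask for each pixel
 * of a 2x2 footprint; Vulkan's sample mask is uniform across pixels, so
 * nvk_flush_ms_state() writes the same low 16 bits to all four X/Y slots.
 */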
2859 
2860 static uint32_t
2861 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2862 {
2863    ASSERTED static const uint16_t vk_to_nv9097[] = {
2864       [VK_COMPARE_OP_NEVER]            = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2865       [VK_COMPARE_OP_LESS]             = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2866       [VK_COMPARE_OP_EQUAL]            = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2867       [VK_COMPARE_OP_LESS_OR_EQUAL]    = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2868       [VK_COMPARE_OP_GREATER]          = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2869       [VK_COMPARE_OP_NOT_EQUAL]        = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2870       [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2871       [VK_COMPARE_OP_ALWAYS]           = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2872    };
2873    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2874 
2875    uint32_t nv9097_op = 0x200 | vk_op;
2876    assert(nv9097_op == vk_to_nv9097[vk_op]);
2877    return nv9097_op;
2878 }
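
/* Editor's note: the NV9097 OGL comparison funcs are 0x200 plus an index
 * that happens to match VkCompareOp ordering, so e.g. VK_COMPARE_OP_NEVER
 * (0) maps to 0x200 and VK_COMPARE_OP_ALWAYS (7) to 0x207.  The table above
 * only feeds the assert that verifies this correspondence; the same trick
 * yields vk_op + 1 for stencil ops and 0x1500 | vk_op for logic ops below.
 */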
2879 
2880 static uint32_t
2881 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2882 {
2883 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2884    ASSERTED static const uint16_t vk_to_nv9097[] = {
2885       OP(KEEP,                D3D_KEEP),
2886       OP(ZERO,                D3D_ZERO),
2887       OP(REPLACE,             D3D_REPLACE),
2888       OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2889       OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2890       OP(INVERT,              D3D_INVERT),
2891       OP(INCREMENT_AND_WRAP,  D3D_INCR),
2892       OP(DECREMENT_AND_WRAP,  D3D_DECR),
2893    };
2894    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2895 #undef OP
2896 
2897    uint32_t nv9097_op = vk_op + 1;
2898    assert(nv9097_op == vk_to_nv9097[vk_op]);
2899    return nv9097_op;
2900 }
2901 
2902 static void
2903 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2904 {
2905    struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2906 
2907    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2908    const struct vk_dynamic_graphics_state *dyn =
2909       &cmd->vk.dynamic_graphics_state;
2910 
2911    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2912       bool enable = dyn->ds.depth.test_enable &&
2913                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2914       P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2915    }
2916 
2917    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2918       bool enable = dyn->ds.depth.write_enable &&
2919                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2920       P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2921    }
2922 
2923    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2924       const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2925       P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2926    }
2927 
2928    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2929       bool enable = dyn->ds.depth.bounds_test.enable &&
2930                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2931       P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2932    }
2933 
2934    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2935       P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2936       P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2937       P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2938    }
2939 
2940    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2941       bool enable = dyn->ds.stencil.test_enable &&
2942                     render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2943       P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2944    }
2945 
2946    const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2947    const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2948    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2949       P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2950       P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2951       P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2952       P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2953       P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2954 
2955       P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2956       P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2957       P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2958       P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2959       P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2960    }
2961 
2962    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2963       P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2964       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2965    }
2966 
2967    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2968       P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2969       P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2970    }
2971 
2972    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2973       P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2974       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2975    }
2976 }
2977 
2978 static uint32_t
2979 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2980 {
2981    ASSERTED uint16_t vk_to_nv9097[] = {
2982       [VK_LOGIC_OP_CLEAR]           = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2983       [VK_LOGIC_OP_AND]             = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2984       [VK_LOGIC_OP_AND_REVERSE]     = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2985       [VK_LOGIC_OP_COPY]            = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2986       [VK_LOGIC_OP_AND_INVERTED]    = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2987       [VK_LOGIC_OP_NO_OP]           = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2988       [VK_LOGIC_OP_XOR]             = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2989       [VK_LOGIC_OP_OR]              = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2990       [VK_LOGIC_OP_NOR]             = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2991       [VK_LOGIC_OP_EQUIVALENT]      = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2992       [VK_LOGIC_OP_INVERT]          = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2993       [VK_LOGIC_OP_OR_REVERSE]      = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
2994       [VK_LOGIC_OP_COPY_INVERTED]   = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
2995       [VK_LOGIC_OP_OR_INVERTED]     = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
2996       [VK_LOGIC_OP_NAND]            = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
2997       [VK_LOGIC_OP_SET]             = NV9097_SET_LOGIC_OP_FUNC_V_SET,
2998    };
2999    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3000 
3001    uint32_t nv9097_op = 0x1500 | vk_op;
3002    assert(nv9097_op == vk_to_nv9097[vk_op]);
3003    return nv9097_op;
3004 }
3005 
3006 static uint32_t
3007 vk_to_nv9097_blend_op(VkBlendOp vk_op)
3008 {
3009 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
3010    ASSERTED uint16_t vk_to_nv9097[] = {
3011       OP(ADD,              FUNC_ADD),
3012       OP(SUBTRACT,         FUNC_SUBTRACT),
3013       OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
3014       OP(MIN,              MIN),
3015       OP(MAX,              MAX),
3016    };
3017    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3018 #undef OP
3019 
3020    return vk_to_nv9097[vk_op];
3021 }
3022 
3023 static uint32_t
3024 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
3025 {
3026 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
3027    NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
3028    ASSERTED uint16_t vk_to_nv9097[] = {
3029       FACTOR(ZERO,                     OGL_ZERO),
3030       FACTOR(ONE,                      OGL_ONE),
3031       FACTOR(SRC_COLOR,                OGL_SRC_COLOR),
3032       FACTOR(ONE_MINUS_SRC_COLOR,      OGL_ONE_MINUS_SRC_COLOR),
3033       FACTOR(DST_COLOR,                OGL_DST_COLOR),
3034       FACTOR(ONE_MINUS_DST_COLOR,      OGL_ONE_MINUS_DST_COLOR),
3035       FACTOR(SRC_ALPHA,                OGL_SRC_ALPHA),
3036       FACTOR(ONE_MINUS_SRC_ALPHA,      OGL_ONE_MINUS_SRC_ALPHA),
3037       FACTOR(DST_ALPHA,                OGL_DST_ALPHA),
3038       FACTOR(ONE_MINUS_DST_ALPHA,      OGL_ONE_MINUS_DST_ALPHA),
3039       FACTOR(CONSTANT_COLOR,           OGL_CONSTANT_COLOR),
3040       FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
3041       FACTOR(CONSTANT_ALPHA,           OGL_CONSTANT_ALPHA),
3042       FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
3043       FACTOR(SRC_ALPHA_SATURATE,       OGL_SRC_ALPHA_SATURATE),
3044       FACTOR(SRC1_COLOR,               OGL_SRC1COLOR),
3045       FACTOR(ONE_MINUS_SRC1_COLOR,     OGL_INVSRC1COLOR),
3046       FACTOR(SRC1_ALPHA,               OGL_SRC1ALPHA),
3047       FACTOR(ONE_MINUS_SRC1_ALPHA,     OGL_INVSRC1ALPHA),
3048    };
3049    assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
3050 #undef FACTOR
3051 
3052    return vk_to_nv9097[vk_factor];
3053 }
3054 
3055 void
3056 nvk_mme_set_write_mask(struct mme_builder *b)
3057 {
3058    struct mme_value count = mme_load(b);
3059    struct mme_value mask = mme_load(b);
3060 
3061    /*
3062     * mask is a bit field
3063     *
3064     * attachment index 88887777666655554444333322221111
3065     * component        abgrabgrabgrabgrabgrabgrabgrabgr
3066     */
3067 
3068    struct mme_value common_mask = mme_mov(b, mme_imm(1));
3069    struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3070    struct mme_value i = mme_mov(b, mme_zero());
3071 
3072    mme_while(b, ine, i, count) {
3073       /*
3074          We call NV9097_SET_CT_WRITE per attachment. It needs a value as:
3075          0x0000 0000 0000 0000 000a 000b 000g 000r
3076 
3077          So for i=0 a mask of
3078          0x0000 0000 0000 0000 0000 0000 0000 1111
3079          becomes
3080          0x0000 0000 0000 0000 0001 0001 0001 0001
3081       */
3082 
3083       struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
3084       mme_merge_to(b, val, val, mask, 4, 1, 1);
3085       mme_merge_to(b, val, val, mask, 8, 1, 2);
3086       mme_merge_to(b, val, val, mask, 12, 1, 3);
3087 
3088       mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
3089       mme_emit(b, val);
3090       mme_free_reg(b, val);
3091 
3092       /* Check if all masks are common */
3093       struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3094       mme_if(b, ine, first, temp) {
3095          mme_mov_to(b, common_mask, mme_zero());
3096       }
3097       mme_free_reg(b, temp);
3098 
3099       mme_srl_to(b, mask, mask, mme_imm(4));
3100 
3101       mme_add_to(b, i, i, mme_imm(1));
3102    }
3103 
3104    mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
3105    mme_emit(b, common_mask);
3106 }
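
/* Editor's example: with count = 2 and mask = 0xff, both attachments write
 * all of RGBA: each iteration expands the low nibble 0xf into 0x1111 for
 * SET_CT_WRITE, the nibble always equals the first one, and common_mask
 * stays 1 so SINGLE_CT_WRITE_CONTROL is enabled.  With mask = 0x7f the
 * second nibble (0x7) differs from the first (0xf) and common_mask drops
 * to 0.
 */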
3107 
3108 static void
3109 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
3110 {
3111    struct nvk_rendering_state *render = &cmd->state.gfx.render;
3112    const struct vk_dynamic_graphics_state *dyn =
3113       &cmd->vk.dynamic_graphics_state;
3114 
3115    struct nv_push *p =
3116       nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
3117 
3118    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
3119       P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
3120 
3121    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
3122       const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
3123       P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
3124    }
3125 
3126    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
3127       for (uint8_t a = 0; a < render->color_att_count; a++) {
3128          P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
3129       }
3130    }
3131 
3132    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
3133       for (uint8_t a = 0; a < render->color_att_count; a++) {
3134          const struct vk_color_blend_attachment_state *att =
3135             &dyn->cb.attachments[a];
3136          P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
3137          P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
3138          P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
3139                vk_to_nv9097_blend_op(att->color_blend_op));
3140          P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
3141                vk_to_nv9097_blend_factor(att->src_color_blend_factor));
3142          P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
3143                vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
3144          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
3145                vk_to_nv9097_blend_op(att->alpha_blend_op));
3146          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
3147                vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
3148          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
3149                vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
3150       }
3151    }
3152 
3153    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
3154        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
3155        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
3156        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3157       uint32_t color_write_enables = 0x0;
3158       for (uint8_t a = 0; a < render->color_att_count; a++) {
3159          if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
3160             color_write_enables |= 0xf << (4 * a);
3161       }
3162 
3163       uint32_t cb_att_write_mask = 0x0;
3164       for (uint8_t a = 0; a < render->color_att_count; a++)
3165          cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
3166 
3167       uint32_t rp_att_write_mask = 0x0;
3168       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3169          if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
3170             rp_att_write_mask |= 0xf << (4 * a);
3171       }
3172 
3173       uint32_t att_has_loc_mask = 0x0;
3174       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3175          if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
3176             att_has_loc_mask |= 0xf << (4 * a);
3177       }
3178 
3179       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
3180       P_INLINE_DATA(p, render->color_att_count);
3181       P_INLINE_DATA(p, color_write_enables &
3182                        cb_att_write_mask &
3183                        rp_att_write_mask &
3184                        att_has_loc_mask);
3185    }
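
   /* Editor's note: the value handed to NVK_MME_SET_WRITE_MASK above is the
    * AND of four nibble-per-attachment masks, so a component is written only
    * when the blend write mask, colorWriteEnable, the render pass attachment
    * set, and the attachment location map all allow it.
    */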
3186 
3187    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3188       int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
3189       uint8_t max_loc = 0;
3190       uint32_t att_used = 0;
3191       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3192          if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
3193             continue;
3194 
3195          att_used |= BITFIELD_BIT(a);
3196 
3197          assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
3198          loc_att[dyn->cal.color_map[a]] = a;
3199          max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
3200       }
3201 
3202       for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
3203          if (loc_att[l] >= 0)
3204             continue;
3205 
3206          /* Just grab any color attachment.  The way we set up color targets
3207           * in nvk_BeginRendering() ensures that every color target is either the
3208           * valid color target referenced by this render pass or a valid NULL
3209           * target.  If we end up mapping to some other target in this render
3210           * pass, the handling of att_has_loc_mask above will ensure that no
3211           * color writes actually happen.
3212           */
3213          uint8_t a = ffs(~att_used) - 1;
3214          att_used |= BITFIELD_BIT(a);
3215          loc_att[l] = a;
3216       }
3217 
3218       P_IMMD(p, NV9097, SET_CT_SELECT, {
3219          .target_count = max_loc + 1,
3220          .target0 = loc_att[0],
3221          .target1 = loc_att[1],
3222          .target2 = loc_att[2],
3223          .target3 = loc_att[3],
3224          .target4 = loc_att[4],
3225          .target5 = loc_att[5],
3226          .target6 = loc_att[6],
3227          .target7 = loc_att[7],
3228       });
3229    }
3230 
3231    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3232       P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
3233       P_NV9097_SET_BLEND_CONST_RED(p,     fui(dyn->cb.blend_constants[0]));
3234       P_NV9097_SET_BLEND_CONST_GREEN(p,   fui(dyn->cb.blend_constants[1]));
3235       P_NV9097_SET_BLEND_CONST_BLUE(p,    fui(dyn->cb.blend_constants[2]));
3236       P_NV9097_SET_BLEND_CONST_ALPHA(p,   fui(dyn->cb.blend_constants[3]));
3237    }
3238 }
3239 
3240 void
3241 nvk_cmd_flush_gfx_dynamic_state(struct nvk_cmd_buffer *cmd)
3242 {
3243    struct vk_dynamic_graphics_state *dyn =
3244       &cmd->vk.dynamic_graphics_state;
3245 
3246    if (!vk_dynamic_graphics_state_any_dirty(dyn))
3247       return;
3248 
3249    nvk_flush_vi_state(cmd);
3250    nvk_flush_ia_state(cmd);
3251    nvk_flush_ts_state(cmd);
3252    nvk_flush_vp_state(cmd);
3253    nvk_flush_rs_state(cmd);
3254    nvk_flush_fsr_state(cmd);
3255    nvk_flush_ms_state(cmd);
3256    nvk_flush_ds_state(cmd);
3257    nvk_flush_cb_state(cmd);
3258 
3259    vk_dynamic_graphics_state_clear_dirty(dyn);
3260 }
3261 
3262 void
3263 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
3264 {
3265    /* First 4 bits are group, later bits are slot */
3266    struct mme_value group_slot = mme_load(b);
3267 
3268    struct mme_value addr_lo, addr_hi, size;
3269    if (nvk_use_bindless_cbuf(b->devinfo)) {
3270       if (b->devinfo->cls_eng3d >= TURING_A) {
3271          struct mme_value64 addr = mme_load_addr64(b);
3272          mme_tu104_read_fifoed(b, addr, mme_imm(2));
3273       }
3274 
3275       /* Load the descriptor */
3276       struct mme_value desc_lo = mme_load(b);
3277       struct mme_value desc_hi = mme_load(b);
3278 
3279       /* The bottom 45 bits are addr >> 4 */
3280       addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
3281       addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
3282       mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
3283 
3284       /* The top 19 bits are size >> 4 */
3285       size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
3286 
3287       mme_free_reg(b, desc_hi);
3288       mme_free_reg(b, desc_lo);
3289    } else {
3290       if (b->devinfo->cls_eng3d >= TURING_A) {
3291          struct mme_value64 addr = mme_load_addr64(b);
3292          mme_tu104_read_fifoed(b, addr, mme_imm(3));
3293       }
3294 
3295       /* Load the descriptor */
3296       addr_lo = mme_load(b);
3297       addr_hi = mme_load(b);
3298       size = mme_load(b);
3299    }
3300 
3301    struct mme_value cb = mme_alloc_reg(b);
3302    mme_if(b, ieq, size, mme_zero()) {
3303       /* Bottom bit is the valid bit, 8:4 are shader slot */
3304       mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3305    }
3306 
3307    mme_if(b, ine, size, mme_zero()) {
3308       /* size = MIN2(size, NVK_MAX_CBUF_SIZE) */
3309       assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3310       struct mme_value is_large =
3311          mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3312       mme_if(b, ine, is_large, mme_zero()) {
3313          mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3314       }
3315 
3316       mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3317       mme_emit(b, size);
3318       mme_emit(b, addr_hi);
3319       mme_emit(b, addr_lo);
3320 
3321       /* Bottom bit is the valid bit, 8:4 are shader slot */
3322       mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3323    }
3324 
3325    mme_free_reg(b, addr_hi);
3326    mme_free_reg(b, addr_lo);
3327    mme_free_reg(b, size);
3328 
3329    /* The group comes in the bottom 4 bits in group_slot and we need to
3330     * combine it with the method.  However, unlike most array methods with a
3331     * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3332     * dwords.  This means we need to also shift by 3.
3333     */
3334    struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
3335    mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3336    mme_emit(b, cb);
3337 }
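
/* Editor's sketch: given the unpacking above, a bindless cbuf descriptor
 * appears to pack a 16-byte-aligned base address and size into 64 bits as
 *
 *    desc[44:0]  = addr >> 4
 *    desc[63:45] = size >> 4
 *
 * A hypothetical CPU-side encoder under that assumption (not the driver's
 * actual helper):
 */
static inline uint64_t
nvk_cbuf_desc_encode(uint64_t addr, uint64_t size_B)
{
   assert(addr % 16 == 0 && size_B % 16 == 0);
   return (addr >> 4) | ((size_B >> 4) << 45);
}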
3338 
3339 void
3340 nvk_cmd_flush_gfx_cbufs(struct nvk_cmd_buffer *cmd)
3341 {
3342    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3343    struct nvk_physical_device *pdev = nvk_device_physical(dev);
3344    const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3345    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3346 
3347    /* Find cbuf maps for the 5 cbuf groups */
3348    const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3349    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3350       const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3351       if (shader == NULL)
3352          continue;
3353 
3354       uint32_t group = nvk_cbuf_binding_for_stage(stage);
3355       assert(group < ARRAY_SIZE(cbuf_shaders));
3356       cbuf_shaders[group] = shader;
3357    }
3358 
3359    bool bound_any_cbuf = false;
3360    for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3361       if (cbuf_shaders[g] == NULL)
3362          continue;
3363 
3364       const struct nvk_shader *shader = cbuf_shaders[g];
3365       const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3366       struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3367 
3368       /* We only bother to re-bind cbufs that are in use */
3369       const uint32_t rebind =
3370          group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3371       if (!rebind)
3372          continue;
3373 
3374       u_foreach_bit(c, rebind) {
3375          const struct nvk_cbuf *cbuf = &group->cbufs[c];
3376 
3377          /* We bind these at the very end */
3378          if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3379             continue;
3380 
3381          bound_any_cbuf = true;
3382 
3383          struct nvk_buffer_address ba;
3384          if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3385             assert(ba.base_addr % min_cbuf_alignment == 0);
3386             ba.size = align(ba.size, min_cbuf_alignment);
3387             ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3388 
3389             struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3390 
3391             if (ba.size > 0) {
3392                P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3393                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3394                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3395                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3396             }
3397 
3398             P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3399                .valid = ba.size > 0,
3400                .shader_slot = c,
3401             });
3402          } else {
3403             uint64_t desc_addr =
3404                nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3405 
3406             if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3407                struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3408 
3409                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3410                P_INLINE_DATA(p, g | (c << 4));
3411                P_INLINE_DATA(p, desc_addr >> 32);
3412                P_INLINE_DATA(p, desc_addr);
3413             } else {
3414                struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3415 
3416                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3417                P_INLINE_DATA(p, g | (c << 4));
3418 
3419                nv_push_update_count(p, 3);
3420                nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3421             }
3422          }
3423       }
3424 
3425       group->dirty &= ~rebind;
3426    }
3427 
3428    /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3429     * always left pointing at the root descriptor table.  This way draw
3430     * parameters and similar MME root table updates always hit the root
3431     * descriptor table and not some random UBO.
3432     */
3433    if (bound_any_cbuf) {
3434       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3435       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3436       P_INLINE_DATA(p, 0);
3437    }
3438 }
3439 
3440 static void
3441 nvk_cmd_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3442 {
3443    nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
3444    nvk_cmd_flush_gfx_dynamic_state(cmd);
3445    nvk_cmd_flush_gfx_shaders(cmd);
3446    nvk_cmd_flush_gfx_cbufs(cmd);
3447 }
3448 
3449 void
3450 nvk_mme_bind_ib(struct mme_builder *b)
3451 {
3452    struct mme_value64 addr = mme_load_addr64(b);
3453    struct mme_value size_B = mme_load(b);
3454 
3455    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3456    mme_if(b, ieq, addr_or, mme_zero()) {
3457       mme_mov_to(b, size_B, mme_zero());
3458    }
3459    mme_free_reg(b, addr_or);
3460 
3461    if (b->devinfo->cls_eng3d < TURING_A) {
3462       mme_if(b, ieq, size_B, mme_zero()) {
3463          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3464          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3465       }
3466    }
3467 
3468    mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3469    mme_emit(b, addr.hi);
3470    mme_emit(b, addr.lo);
3471 
3472    if (b->devinfo->cls_eng3d >= TURING_A) {
3473       mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3474       mme_emit(b, mme_zero());
3475       mme_emit(b, size_B);
3476    } else {
3477       /* Convert to an end address */
3478       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3479       mme_add64_to(b, addr, addr, mme_imm64(-1));
3480 
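      /* Editor's note: no new mme_mthd() is needed here.  The method
       * pointer auto-increments past SET_INDEX_BUFFER_B, so the two emits
       * below land on SET_INDEX_BUFFER_C/D, which is what the commented-out
       * line documents.
       */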
3481       /* mme_mthd(b, NV9097_SET_INDEX_BUFFER_C); */
3482       mme_emit(b, addr.hi);
3483       mme_emit(b, addr.lo);
3484    }
3485    mme_free_reg64(b, addr);
3486    mme_free_reg(b, size_B);
3487 
3488    struct mme_value fmt = mme_load(b);
3489    struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3490    struct mme_value index_type = mme_mov(b,
3491       mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3492 
3493    /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3494     * time with one MME macro.
3495     */
3496    UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3497    static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3498    static const uint32_t DXGI_FORMAT_R8_UINT = 62;
3499 
3500    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3501       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3502       mme_mov_to(b, index_type,
3503                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3504    }
3505 
3506    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3507       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3508       mme_mov_to(b, index_type,
3509                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3510    }
3511 
3512    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3513       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3514       mme_mov_to(b, index_type,
3515                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3516    }
3517 
3518    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3519       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3520       mme_mov_to(b, index_type,
3521                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3522    }
3523 
3524    mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3525    mme_emit(b, restart);
3526 
3527    mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3528    mme_emit(b, index_type);
3529 }
3530 
3531 VKAPI_ATTR void VKAPI_CALL
3532 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3533                            VkBuffer _buffer,
3534                            VkDeviceSize offset,
3535                            VkDeviceSize size,
3536                            VkIndexType indexType)
3537 {
3538    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3539    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3540    struct nvk_addr_range addr_range =
3541       nvk_buffer_addr_range(buffer, offset, size);
3542 
3543    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3544    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3545    P_INLINE_DATA(p, addr_range.addr >> 32);
3546    P_INLINE_DATA(p, addr_range.addr);
3547    assert(addr_range.range <= UINT32_MAX);
3548    P_INLINE_DATA(p, addr_range.range);
3549    P_INLINE_DATA(p, indexType);
3550 }
3551 
3552 void
3553 nvk_mme_bind_vb(struct mme_builder *b)
3554 {
3555    struct mme_value vb_idx = mme_load(b);
3556    struct mme_value64 addr = mme_load_addr64(b);
3557    struct mme_value size_B = mme_load(b);
3558 
3559    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3560    mme_if(b, ieq, addr_or, mme_zero()) {
3561       mme_mov_to(b, size_B, mme_zero());
3562    }
3563    mme_free_reg(b, addr_or);
3564 
3565    if (b->devinfo->cls_eng3d < TURING_A) {
3566       mme_if(b, ieq, size_B, mme_zero()) {
3567          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3568          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3569       }
3570    }
3571 
3572    struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3573    mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3574    mme_free_reg(b, vb_idx4);
3575    mme_emit(b, addr.hi);
3576    mme_emit(b, addr.lo);
3577 
3578    if (b->devinfo->cls_eng3d >= TURING_A) {
3579       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3580       mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3581       mme_emit(b, mme_zero());
3582       mme_emit(b, size_B);
3583    } else {
3584       /* Convert to an end address */
3585       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3586       mme_add64_to(b, addr, addr, mme_imm64(-1));
3587 
3588       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3589       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3590       mme_emit(b, addr.hi);
3591       mme_emit(b, addr.lo);
3592    }
3593 }
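
/* Editor's note: on pre-Turing hardware the stream limit is an inclusive
 * end address, hence addr + size_B - 1 above.  When size_B == 0, addr is
 * redirected to the ZERO_ADDR scratch, so the limit becomes ZERO_ADDR - 1,
 * below the start address, which disables all fetches for that stream; the
 * nvk_mme_bind_vb_test_check() cases below exercise exactly this.
 */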
3594 
3595 static void
3596 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3597                            const struct nvk_mme_test_case *test,
3598                            const struct nvk_mme_mthd_data *results)
3599 {
3600    const uint32_t vb_idx = test->params[0];
3601    const uint32_t addr_hi = test->params[1];
3602    const uint32_t addr_lo = test->params[2];
3603 
3604    uint32_t size_B = test->params[3];
3605    if (addr_hi == 0 && addr_lo == 0)
3606       size_B = 0;
3607 
3608    assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3609    assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3610 
3611    if (devinfo->cls_eng3d >= TURING_A) {
3612       assert(results[0].data == addr_hi);
3613       assert(results[1].data == addr_lo);
3614 
3615       assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(vb_idx));
3616       assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(vb_idx));
3617       assert(results[2].data == 0);
3618       assert(results[3].data == size_B);
3619    } else {
3620       uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3621       if (size_B == 0)
3622          addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3623 
3624       assert(results[0].data == addr >> 32);
3625       assert(results[1].data == (uint32_t)addr);
3626 
3627       const uint64_t limit = (addr + size_B) - 1;
3628       assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(vb_idx));
3629       assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(vb_idx));
3630       assert(results[2].data == limit >> 32);
3631       assert(results[3].data == (uint32_t)limit);
3632    }
3633 }
3634 
3635 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3636    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3637    .check = nvk_mme_bind_vb_test_check,
3638 }, {
3639    .init = (struct nvk_mme_mthd_data[]) {
3640       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3641       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3642       { }
3643    },
3644    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3645    .check = nvk_mme_bind_vb_test_check,
3646 }, {
3647    .init = (struct nvk_mme_mthd_data[]) {
3648       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3649       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3650       { }
3651    },
3652    .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3653    .check = nvk_mme_bind_vb_test_check,
3654 }, {}};
3655 
3656 void
3657 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3658                            struct nvk_addr_range addr_range)
3659 {
3660    /* Used for meta save/restore */
3661    if (vb_idx == 0)
3662       cmd->state.gfx.vb0 = addr_range;
3663 
3664    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3665    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3666    P_INLINE_DATA(p, vb_idx);
3667    P_INLINE_DATA(p, addr_range.addr >> 32);
3668    P_INLINE_DATA(p, addr_range.addr);
3669    assert(addr_range.range <= UINT32_MAX);
3670    P_INLINE_DATA(p, addr_range.range);
3671 }
3672 
3673 VKAPI_ATTR void VKAPI_CALL
3674 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3675                           uint32_t firstBinding,
3676                           uint32_t bindingCount,
3677                           const VkBuffer *pBuffers,
3678                           const VkDeviceSize *pOffsets,
3679                           const VkDeviceSize *pSizes,
3680                           const VkDeviceSize *pStrides)
3681 {
3682    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3683 
3684    if (pStrides) {
3685       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3686                                         bindingCount, pStrides);
3687    }
3688 
3689    for (uint32_t i = 0; i < bindingCount; i++) {
3690       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3691       uint32_t idx = firstBinding + i;
3692 
3693       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3694       const struct nvk_addr_range addr_range =
3695          nvk_buffer_addr_range(buffer, pOffsets[i], size);
3696 
3697       nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3698    }
3699 }
3700 
3701 static void
3702 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3703                      uint16_t cb0_offset,
3704                      uint16_t mthd,
3705                      struct mme_value val)
3706 {
3707    if (b->devinfo->cls_eng3d >= TURING_A) {
3708       struct mme_value old = mme_state(b, mthd);
3709       mme_if(b, ine, old, val) {
3710          mme_mthd(b, mthd);
3711          mme_emit(b, val);
3712 
3713          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3714          mme_emit(b, mme_imm(cb0_offset));
3715          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3716          mme_emit(b, val);
3717       }
3718       mme_free_reg(b, old);
3719    } else {
3720       /* Fermi is really tight on registers. Don't bother with the if and set
3721        * both unconditionally for now.
3722        */
3723       mme_mthd(b, mthd);
3724       mme_emit(b, val);
3725 
3726       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3727       mme_emit(b, mme_imm(cb0_offset));
3728       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3729       mme_emit(b, val);
3730    }
3731 }
3732 
3733 static void
3734 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3735                         uint16_t cb0_offset,
3736                         enum nvk_mme_scratch scratch,
3737                         struct mme_value val)
3738 {
3739    const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3740    nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3741 }
3742 
3743 struct mme_draw_params {
3744    struct mme_value base_vertex;
3745    struct mme_value first_vertex;
3746    struct mme_value first_instance;
3747    struct mme_value draw_index;
3748 };
3749 
3750 static void
3751 nvk_mme_build_set_draw_params(struct mme_builder *b,
3752                               const struct mme_draw_params *p)
3753 {
3754    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3755                            NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3756                            p->first_vertex);
3757    nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3758                         NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3759                         p->first_instance);
3760    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3761                            NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3762                            p->draw_index);
3763    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3764                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3765                            mme_zero());
3766 
3767    mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3768    mme_emit(b, p->base_vertex);
3769    mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3770    mme_emit(b, p->base_vertex);
3771 }
3772 
3773 static void
3774 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3775 {
3776    /* Set the push constant */
3777    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3778                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3779                            view_index);
3780 
3781    /* Set the layer to the view index */
3782    STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3783    STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3784    mme_mthd(b, NV9097_SET_RT_LAYER);
3785    mme_emit(b, view_index);
3786 }
3787 
3788 static void
3789 nvk_mme_build_draw_loop(struct mme_builder *b,
3790                         struct mme_value instance_count,
3791                         struct mme_value first_vertex,
3792                         struct mme_value vertex_count)
3793 {
3794    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3795 
3796    mme_loop(b, instance_count) {
3797       mme_mthd(b, NV9097_BEGIN);
3798       mme_emit(b, begin);
3799 
3800       mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3801       mme_emit(b, first_vertex);
3802       mme_emit(b, vertex_count);
3803 
3804       mme_mthd(b, NV9097_END);
3805       mme_emit(b, mme_zero());
3806 
3807       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3808    }
3809 
3810    mme_free_reg(b, begin);
3811 }
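
/* Editor's note: each loop iteration draws one instance with a full
 * BEGIN/END pair.  The first BEGIN uses the INSTANCE_ID mode baked into the
 * DRAW_BEGIN scratch (which resets the instance ID) and every later
 * iteration flips it to SUBSEQUENT so the hardware increments the instance
 * ID instead.
 */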
3812 
3813 static void
3814 nvk_mme_build_draw(struct mme_builder *b,
3815                    struct mme_value draw_index)
3816 {
3817    /* These are in VkDrawIndirectCommand order */
3818    struct mme_value vertex_count = mme_load(b);
3819    struct mme_value instance_count = mme_load(b);
3820    struct mme_value first_vertex = mme_load(b);
3821    struct mme_value first_instance = mme_load(b);
3822 
3823    struct mme_draw_params params = {
3824       .first_vertex = first_vertex,
3825       .first_instance = first_instance,
3826       .draw_index = draw_index,
3827    };
3828    nvk_mme_build_set_draw_params(b, &params);
3829 
3830    mme_free_reg(b, first_instance);
3831 
3832    if (b->devinfo->cls_eng3d < TURING_A)
3833       nvk_mme_spill(b, DRAW_IDX, draw_index);
3834 
3835    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3836    mme_if(b, ieq, view_mask, mme_zero()) {
3837       mme_free_reg(b, view_mask);
3838 
3839       nvk_mme_build_draw_loop(b, instance_count,
3840                               first_vertex, vertex_count);
3841    }
3842 
3843    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3844    mme_if(b, ine, view_mask, mme_zero()) {
3845       mme_free_reg(b, view_mask);
3846 
3847       struct mme_value view = mme_mov(b, mme_zero());
3848       mme_while(b, ine, view, mme_imm(32)) {
3849          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3850          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3851          mme_free_reg(b, view_mask);
3852          mme_if(b, ine, has_view, mme_zero()) {
3853             mme_free_reg(b, has_view);
3854             nvk_mme_emit_view_index(b, view);
3855             nvk_mme_build_draw_loop(b, instance_count,
3856                                     first_vertex, vertex_count);
3857          }
3858 
3859          mme_add_to(b, view, view, mme_imm(1));
3860       }
3861       mme_free_reg(b, view);
3862    }
3863 
3864    mme_free_reg(b, instance_count);
3865    mme_free_reg(b, first_vertex);
3866    mme_free_reg(b, vertex_count);
3867 
3868    if (b->devinfo->cls_eng3d < TURING_A)
3869       nvk_mme_unspill(b, DRAW_IDX, draw_index);
3870 }
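
/* Editor's note: with a nonzero view mask the macro replays the entire
 * instance loop once per enabled view, pointing SET_RT_LAYER at the view
 * index each time.  VIEW_MASK is re-read from scratch inside the loop, and
 * draw_index is spilled on pre-Turing, because MME registers are scarce.
 */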
3871 
3872 void
3873 nvk_mme_draw(struct mme_builder *b)
3874 {
3875    struct mme_value draw_index = mme_load(b);
3876    nvk_mme_build_draw(b, draw_index);
3877 }
3878 
3879 VKAPI_ATTR void VKAPI_CALL
3880 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3881             uint32_t vertexCount,
3882             uint32_t instanceCount,
3883             uint32_t firstVertex,
3884             uint32_t firstInstance)
3885 {
3886    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3887 
3888    nvk_cmd_flush_gfx_state(cmd);
3889 
3890    struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3891    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3892    P_INLINE_DATA(p, 0 /* draw_index */);
3893    P_INLINE_DATA(p, vertexCount);
3894    P_INLINE_DATA(p, instanceCount);
3895    P_INLINE_DATA(p, firstVertex);
3896    P_INLINE_DATA(p, firstInstance);
3897 }
3898 
3899 VKAPI_ATTR void VKAPI_CALL
3900 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3901                     uint32_t drawCount,
3902                     const VkMultiDrawInfoEXT *pVertexInfo,
3903                     uint32_t instanceCount,
3904                     uint32_t firstInstance,
3905                     uint32_t stride)
3906 {
3907    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3908 
3909    nvk_cmd_flush_gfx_state(cmd);
3910 
3911    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3912       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3913       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3914       P_INLINE_DATA(p, draw_index);
3915       P_INLINE_DATA(p, pVertexInfo->vertexCount);
3916       P_INLINE_DATA(p, instanceCount);
3917       P_INLINE_DATA(p, pVertexInfo->firstVertex);
3918       P_INLINE_DATA(p, firstInstance);
3919 
3920       pVertexInfo = ((void *)pVertexInfo) + stride;
3921    }
3922 }
3923 
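/* Instancing is implemented by replaying BEGIN/END once per instance.  The
 * first BEGIN uses the saved DRAW_BEGIN state; every later iteration sets
 * INSTANCE_ID to SUBSEQUENT so the hardware advances the instance index
 * instead of resetting it.
 */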
static void
nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
                                struct mme_value instance_count,
                                struct mme_value first_index,
                                struct mme_value index_count)
{
   struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);

   mme_loop(b, instance_count) {
      mme_mthd(b, NV9097_BEGIN);
      mme_emit(b, begin);

      mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
      mme_emit(b, first_index);
      mme_emit(b, index_count);

      mme_mthd(b, NV9097_END);
      mme_emit(b, mme_zero());

      mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
   }

   mme_free_reg(b, begin);
}

static void
nvk_mme_build_draw_indexed(struct mme_builder *b,
                           struct mme_value draw_index)
{
   /* These are in VkDrawIndexedIndirectCommand order */
   struct mme_value index_count = mme_load(b);
   struct mme_value instance_count = mme_load(b);
   struct mme_value first_index = mme_load(b);
   struct mme_value vertex_offset = mme_load(b);
   struct mme_value first_instance = mme_load(b);

   struct mme_draw_params params = {
      .base_vertex = vertex_offset,
      .first_vertex = vertex_offset,
      .first_instance = first_instance,
      .draw_index = draw_index,
   };
   nvk_mme_build_set_draw_params(b, &params);

   mme_free_reg(b, vertex_offset);
   mme_free_reg(b, first_instance);

   if (b->devinfo->cls_eng3d < TURING_A)
      nvk_mme_spill(b, DRAW_IDX, draw_index);

   struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ieq, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      nvk_mme_build_draw_indexed_loop(b, instance_count,
                                      first_index, index_count);
   }

   view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ine, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      struct mme_value view = mme_mov(b, mme_zero());
      mme_while(b, ine, view, mme_imm(32)) {
         view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
         struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
         mme_free_reg(b, view_mask);
         mme_if(b, ine, has_view, mme_zero()) {
            mme_free_reg(b, has_view);
            nvk_mme_emit_view_index(b, view);
            nvk_mme_build_draw_indexed_loop(b, instance_count,
                                            first_index, index_count);
         }

         mme_add_to(b, view, view, mme_imm(1));
      }
      mme_free_reg(b, view);
   }

   mme_free_reg(b, instance_count);
   mme_free_reg(b, first_index);
   mme_free_reg(b, index_count);

   if (b->devinfo->cls_eng3d < TURING_A)
      nvk_mme_unspill(b, DRAW_IDX, draw_index);
}

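/* NVK_MME_DRAW_INDEXED consumes draw_index followed by the five dwords of
 * a VkDrawIndexedIndirectCommand (indexCount, instanceCount, firstIndex,
 * vertexOffset, firstInstance), in the order noted in
 * nvk_mme_build_draw_indexed above.
 */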
void
nvk_mme_draw_indexed(struct mme_builder *b)
{
   struct mme_value draw_index = mme_load(b);
   nvk_mme_build_draw_indexed(b, draw_index);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                   uint32_t indexCount,
                   uint32_t instanceCount,
                   uint32_t firstIndex,
                   int32_t vertexOffset,
                   uint32_t firstInstance)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   nvk_cmd_flush_gfx_state(cmd);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
   P_INLINE_DATA(p, 0 /* draw_index */);
   P_INLINE_DATA(p, indexCount);
   P_INLINE_DATA(p, instanceCount);
   P_INLINE_DATA(p, firstIndex);
   P_INLINE_DATA(p, vertexOffset);
   P_INLINE_DATA(p, firstInstance);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
                           uint32_t drawCount,
                           const VkMultiDrawIndexedInfoEXT *pIndexInfo,
                           uint32_t instanceCount,
                           uint32_t firstInstance,
                           uint32_t stride,
                           const int32_t *pVertexOffset)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   nvk_cmd_flush_gfx_state(cmd);

   for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
      const uint32_t vertex_offset =
         pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
      P_INLINE_DATA(p, draw_index);
      P_INLINE_DATA(p, pIndexInfo->indexCount);
      P_INLINE_DATA(p, instanceCount);
      P_INLINE_DATA(p, pIndexInfo->firstIndex);
      P_INLINE_DATA(p, vertex_offset);
      P_INLINE_DATA(p, firstInstance);

      pIndexInfo = ((void *)pIndexInfo) + stride;
   }
}

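/* On Turing and later, the MME can read the indirect parameters straight
 * from memory (mme_tu104_read_fifoed), so only the buffer address, draw
 * count, and stride are pushed.  Pre-Turing, the CPU copies the indirect
 * buffer contents inline after the macro call; DRAW_PAD_DW is the number
 * of padding dwords between consecutive commands, which the macro loads
 * and discards to skip over the application's stride.
 */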
void
nvk_mme_draw_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 draw_addr = mme_load_addr64(b);
      struct mme_value draw_count = mme_load(b);
      struct mme_value stride = mme_load(b);

      struct mme_value draw = mme_mov(b, mme_zero());
      mme_while(b, ult, draw, draw_count) {
         mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));

         nvk_mme_build_draw(b, draw);

         mme_add_to(b, draw, draw, mme_imm(1));
         mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
      }
   } else {
      struct mme_value draw_count = mme_load(b);
      nvk_mme_load_to_scratch(b, DRAW_PAD_DW);

      struct mme_value draw = mme_mov(b, mme_zero());
      mme_while(b, ine, draw, draw_count) {
         nvk_mme_spill(b, DRAW_COUNT, draw_count);

         nvk_mme_build_draw(b, draw);
         mme_add_to(b, draw, draw, mme_imm(1));

         struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
         mme_loop(b, pad_dw) {
            mme_free_reg(b, mme_load(b));
         }
         mme_free_reg(b, pad_dw);

         nvk_mme_unspill(b, DRAW_COUNT, draw_count);
      }
   }
}

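/* Pre-Turing, each NVK_MME_DRAW_INDIRECT call is followed by the raw
 * indirect data, so a batch of draws looks roughly like this in the push
 * buffer (a sketch, not literal method headers):
 *
 *    CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT)
 *    count
 *    (stride - sizeof(VkDrawIndirectCommand)) / 4
 *    count * stride bytes copied from the indirect buffer
 *
 * The "- 3" in max_draws_per_push accounts for the method header and the
 * two parameter dwords; draws are batched so the inline data always fits
 * in a single push.
 */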
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                    VkBuffer _buffer,
                    VkDeviceSize offset,
                    uint32_t drawCount,
                    uint32_t stride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);

   /* From the Vulkan 1.3.238 spec:
    *
    *    VUID-vkCmdDrawIndirect-drawCount-00476
    *
    *    "If drawCount is greater than 1, stride must be a multiple of 4 and
    *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
    *
    * and
    *
    *    "If drawCount is less than or equal to one, stride is ignored."
    */
   if (drawCount > 1) {
      assert(stride % 4 == 0);
      assert(stride >= sizeof(VkDrawIndirectCommand));
   } else {
      stride = sizeof(VkDrawIndirectCommand);
   }

   nvk_cmd_flush_gfx_state(cmd);

   if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
      uint64_t draw_addr = nvk_buffer_address(buffer, offset);
      P_INLINE_DATA(p, draw_addr >> 32);
      P_INLINE_DATA(p, draw_addr);
      P_INLINE_DATA(p, drawCount);
      P_INLINE_DATA(p, stride);
   } else {
      const uint32_t max_draws_per_push =
         ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;

      uint64_t draw_addr = nvk_buffer_address(buffer, offset);
      while (drawCount) {
         const uint32_t count = MIN2(drawCount, max_draws_per_push);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
         P_INLINE_DATA(p, count);
         P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);

         uint64_t range = count * (uint64_t)stride;
         nv_push_update_count(p, range / 4);
         nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);

         draw_addr += range;
         drawCount -= count;
      }
   }
}

void
nvk_mme_draw_indexed_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 draw_addr = mme_load_addr64(b);
      struct mme_value draw_count = mme_load(b);
      struct mme_value stride = mme_load(b);

      struct mme_value draw = mme_mov(b, mme_zero());
      mme_while(b, ult, draw, draw_count) {
         mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));

         nvk_mme_build_draw_indexed(b, draw);

         mme_add_to(b, draw, draw, mme_imm(1));
         mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
      }
   } else {
      struct mme_value draw_count = mme_load(b);
      nvk_mme_load_to_scratch(b, DRAW_PAD_DW);

      struct mme_value draw = mme_mov(b, mme_zero());
      mme_while(b, ine, draw, draw_count) {
         nvk_mme_spill(b, DRAW_COUNT, draw_count);

         nvk_mme_build_draw_indexed(b, draw);
         mme_add_to(b, draw, draw, mme_imm(1));

         struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
         mme_loop(b, pad_dw) {
            mme_free_reg(b, mme_load(b));
         }
         mme_free_reg(b, pad_dw);

         nvk_mme_unspill(b, DRAW_COUNT, draw_count);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                           VkBuffer _buffer,
                           VkDeviceSize offset,
                           uint32_t drawCount,
                           uint32_t stride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);

   /* From the Vulkan 1.3.238 spec:
    *
    *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
    *
    *    "If drawCount is greater than 1, stride must be a multiple of 4 and
    *    must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
    *
    * and
    *
    *    "If drawCount is less than or equal to one, stride is ignored."
    */
   if (drawCount > 1) {
      assert(stride % 4 == 0);
      assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
   } else {
      stride = sizeof(VkDrawIndexedIndirectCommand);
   }

   nvk_cmd_flush_gfx_state(cmd);

   if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
      uint64_t draw_addr = nvk_buffer_address(buffer, offset);
      P_INLINE_DATA(p, draw_addr >> 32);
      P_INLINE_DATA(p, draw_addr);
      P_INLINE_DATA(p, drawCount);
      P_INLINE_DATA(p, stride);
   } else {
      const uint32_t max_draws_per_push =
         ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;

      uint64_t draw_addr = nvk_buffer_address(buffer, offset);
      while (drawCount) {
         const uint32_t count = MIN2(drawCount, max_draws_per_push);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
         P_INLINE_DATA(p, count);
         P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);

         uint64_t range = count * (uint64_t)stride;
         nv_push_update_count(p, range / 4);
         nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);

         draw_addr += range;
         drawCount -= count;
      }
   }
}

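/* Count draws first fetch the 32-bit draw count from memory, clamp it to
 * maxDrawCount, and then loop exactly like the non-count indirect path.
 * This relies on the MME being able to read memory, so the macro body is
 * empty pre-Turing (see the asserts in nvk_CmdDrawIndirectCount and
 * nvk_CmdDrawIndexedIndirectCount).
 */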
void
nvk_mme_draw_indirect_count(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value64 draw_addr = mme_load_addr64(b);
   struct mme_value64 draw_count_addr = mme_load_addr64(b);
   struct mme_value draw_max = mme_load(b);
   struct mme_value stride = mme_load(b);

   mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
   mme_free_reg64(b, draw_count_addr);
   struct mme_value draw_count_buf = mme_load(b);

   mme_if(b, ule, draw_count_buf, draw_max) {
      mme_mov_to(b, draw_max, draw_count_buf);
   }
   mme_free_reg(b, draw_count_buf);

   struct mme_value draw = mme_mov(b, mme_zero());
   mme_while(b, ult, draw, draw_max) {
      mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));

      nvk_mme_build_draw(b, draw);

      mme_add_to(b, draw, draw, mme_imm(1));
      mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
                         VkBuffer _buffer,
                         VkDeviceSize offset,
                         VkBuffer countBuffer,
                         VkDeviceSize countBufferOffset,
                         uint32_t maxDrawCount,
                         uint32_t stride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
   VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);

   /* TODO: Indirect count draw pre-Turing */
   assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);

   nvk_cmd_flush_gfx_state(cmd);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
   uint64_t draw_addr = nvk_buffer_address(buffer, offset);
   P_INLINE_DATA(p, draw_addr >> 32);
   P_INLINE_DATA(p, draw_addr);
   uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
                                                 countBufferOffset);
   P_INLINE_DATA(p, draw_count_addr >> 32);
   P_INLINE_DATA(p, draw_count_addr);
   P_INLINE_DATA(p, maxDrawCount);
   P_INLINE_DATA(p, stride);
}

void
nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d < TURING_A)
      return;

   struct mme_value64 draw_addr = mme_load_addr64(b);
   struct mme_value64 draw_count_addr = mme_load_addr64(b);
   struct mme_value draw_max = mme_load(b);
   struct mme_value stride = mme_load(b);

   mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
   mme_free_reg64(b, draw_count_addr);
   struct mme_value draw_count_buf = mme_load(b);

   mme_if(b, ule, draw_count_buf, draw_max) {
      mme_mov_to(b, draw_max, draw_count_buf);
   }
   mme_free_reg(b, draw_count_buf);

   struct mme_value draw = mme_mov(b, mme_zero());
   mme_while(b, ult, draw, draw_max) {
      mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));

      nvk_mme_build_draw_indexed(b, draw);

      mme_add_to(b, draw, draw, mme_imm(1));
      mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
                                VkBuffer _buffer,
                                VkDeviceSize offset,
                                VkBuffer countBuffer,
                                VkDeviceSize countBufferOffset,
                                uint32_t maxDrawCount,
                                uint32_t stride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
   VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);

   /* TODO: Indexed indirect count draw pre-Turing */
   assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);

   nvk_cmd_flush_gfx_state(cmd);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
   uint64_t draw_addr = nvk_buffer_address(buffer, offset);
   P_INLINE_DATA(p, draw_addr >> 32);
   P_INLINE_DATA(p, draw_addr);
   uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
                                                 countBufferOffset);
   P_INLINE_DATA(p, draw_count_addr >> 32);
   P_INLINE_DATA(p, draw_count_addr);
   P_INLINE_DATA(p, maxDrawCount);
   P_INLINE_DATA(p, stride);
}

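/* Transform feedback draws use NV9097_DRAW_AUTO: rather than an explicit
 * vertex count, the hardware derives one from the streamed-out byte count
 * emitted here and the start/stride programmed via SET_DRAW_AUTO_START and
 * SET_DRAW_AUTO_STRIDE in nvk_CmdDrawIndirectByteCountEXT.  Instancing
 * replays BEGIN/END per instance, just like the other draw loops.
 */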
static void
nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
                               struct mme_value instance_count,
                               struct mme_value counter)
{
   struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);

   mme_loop(b, instance_count) {
      mme_mthd(b, NV9097_BEGIN);
      mme_emit(b, begin);

      mme_mthd(b, NV9097_DRAW_AUTO);
      mme_emit(b, counter);

      mme_mthd(b, NV9097_END);
      mme_emit(b, mme_zero());

      mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
   }

   mme_free_reg(b, begin);
}

void
nvk_mme_xfb_draw_indirect(struct mme_builder *b)
{
   struct mme_value instance_count = mme_load(b);
   struct mme_value first_instance = mme_load(b);

   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 counter_addr = mme_load_addr64(b);
      mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
      mme_free_reg(b, counter_addr.lo);
      mme_free_reg(b, counter_addr.hi);
   }
   struct mme_value counter = mme_load(b);

   struct mme_draw_params params = {
      .first_instance = first_instance,
   };
   nvk_mme_build_set_draw_params(b, &params);

   mme_free_reg(b, first_instance);

   struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ieq, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
   }

   view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
   mme_if(b, ine, view_mask, mme_zero()) {
      mme_free_reg(b, view_mask);

      struct mme_value view = mme_mov(b, mme_zero());
      mme_while(b, ine, view, mme_imm(32)) {
         view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
         struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
         mme_free_reg(b, view_mask);
         mme_if(b, ine, has_view, mme_zero()) {
            mme_free_reg(b, has_view);
            nvk_mme_emit_view_index(b, view);
            nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
         }

         mme_add_to(b, view, view, mme_imm(1));
      }
   }

   mme_free_reg(b, instance_count);
   mme_free_reg(b, counter);
}

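/* On Turing+, NVK_MME_XFB_DRAW_INDIRECT fetches the byte counter from
 * memory itself; pre-Turing, nvk_CmdDrawIndirectByteCountEXT appends the
 * counter dword to the push with nvk_cmd_buffer_push_indirect so the
 * macro's final mme_load() picks it up.
 */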
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
                                uint32_t instanceCount,
                                uint32_t firstInstance,
                                VkBuffer counterBuffer,
                                VkDeviceSize counterBufferOffset,
                                uint32_t counterOffset,
                                uint32_t vertexStride)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);

   nvk_cmd_flush_gfx_state(cmd);

   uint64_t counter_addr = nvk_buffer_address(counter_buffer,
                                              counterBufferOffset);

   if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
      P_INLINE_DATA(p, instanceCount);
      P_INLINE_DATA(p, firstInstance);
      P_INLINE_DATA(p, counter_addr >> 32);
      P_INLINE_DATA(p, counter_addr);
   } else {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
      P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
      P_INLINE_DATA(p, instanceCount);
      P_INLINE_DATA(p, firstInstance);
      nv_push_update_count(p, 1);
      nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                       uint32_t firstBinding,
                                       uint32_t bindingCount,
                                       const VkBuffer *pBuffers,
                                       const VkDeviceSize *pOffsets,
                                       const VkDeviceSize *pSizes)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   for (uint32_t i = 0; i < bindingCount; i++) {
      VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
      uint32_t idx = firstBinding + i;
      uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
      struct nvk_addr_range addr_range =
         nvk_buffer_addr_range(buffer, pOffsets[i], size);
      assert(addr_range.range <= UINT32_MAX);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);

      P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
      P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
      P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
      P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
   }

   // TODO: do we need to SET_STREAM_OUT_BUFFER_ENABLE V_FALSE ?
}

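/* Restores a stream-out buffer's write pointer from a counter buffer.  The
 * first push dword selects the buffer; the counter value either comes from
 * an MME memory read (Turing+) or is pushed inline after the macro call
 * (pre-Turing).
 */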
void
nvk_mme_xfb_counter_load(struct mme_builder *b)
{
   struct mme_value buffer = mme_load(b);

   struct mme_value counter;
   if (b->devinfo->cls_eng3d >= TURING_A) {
      struct mme_value64 counter_addr = mme_load_addr64(b);

      mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
      mme_free_reg(b, counter_addr.lo);
      mme_free_reg(b, counter_addr.hi);

      counter = mme_load(b);
   } else {
      counter = mme_load(b);
   }

   mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
   mme_emit(b, counter);

   mme_free_reg(b, counter);
   mme_free_reg(b, buffer);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                 uint32_t firstCounterBuffer,
                                 uint32_t counterBufferCount,
                                 const VkBuffer *pCounterBuffers,
                                 const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   const uint32_t max_buffers = 4;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
   for (uint32_t i = 0; i < max_buffers; ++i) {
      P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
   }

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      // The index of the counter buffer corresponds to the index of the
      // transform feedback buffer.
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an
          * 8-dword stride.
          */
         P_INLINE_DATA(p, cb_idx * 8);
         P_INLINE_DATA(p, cb_addr >> 32);
         P_INLINE_DATA(p, cb_addr);
      } else {
         struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
         P_INLINE_DATA(p, cb_idx);
         nv_push_update_count(p, 1);
         nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
      }
   }
}

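/* Ending transform feedback writes each buffer's streamed byte count back
 * to its counter buffer via a REPORT_STREAMING_BYTE_COUNT semaphore
 * report, so a later vkCmdBeginTransformFeedbackEXT or
 * vkCmdDrawIndirectByteCountEXT can resume from it.
 */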
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                               uint32_t firstCounterBuffer,
                               uint32_t counterBufferCount,
                               const VkBuffer *pCounterBuffers,
                               const VkDeviceSize *pCounterBufferOffsets)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);

   P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);

   for (uint32_t i = 0; i < counterBufferCount; ++i) {
      if (pCounterBuffers[i] == VK_NULL_HANDLE)
         continue;

      VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
      // The index of the counter buffer corresponds to the index of the
      // transform feedback buffer.
      uint32_t cb_idx = firstCounterBuffer + i;
      uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
      uint64_t cb_addr = nvk_buffer_address(buffer, offset);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_STREAMING_BYTE_COUNT,
         .sub_report = cb_idx,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}

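/* The DMA copy below expands the application's 32-bit predicate to eight
 * bytes (each source component is written to both dst_x and dst_y), so the
 * resulting 64-bit word is zero exactly when the 32-bit value is zero and
 * the render-enable compare never reads uninitialized memory past the
 * application's dword.
 */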
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);

   uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
   bool inverted = pConditionalRenderingBegin->flags &
      VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;

   /* From the Vulkan 1.3.280 spec:
    *
    *    "If the 32-bit value at offset in buffer memory is zero,
    *     then the rendering commands are discarded,
    *     otherwise they are executed as normal."
    *
    * The hardware compares a 64-bit value, so we are required to copy the
    * 32-bit value into a 64-bit temporary allocation.
    */
   uint64_t tmp_addr;
   VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);

   P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
   P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
   P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
   P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
   P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
   P_NV90B5_PITCH_IN(p, 4);
   P_NV90B5_PITCH_OUT(p, 4);
   P_NV90B5_LINE_LENGTH_IN(p, 4);
   P_NV90B5_LINE_COUNT(p, 1);

   P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
      .dst_x = DST_X_SRC_X,
      .dst_y = DST_Y_SRC_X,
      .dst_z = DST_Z_NO_WRITE,
      .dst_w = DST_W_NO_WRITE,
      .component_size = COMPONENT_SIZE_ONE,
      .num_src_components = NUM_SRC_COMPONENTS_ONE,
      .num_dst_components = NUM_DST_COMPONENTS_TWO,
   });

   P_IMMD(p, NV90B5, LAUNCH_DMA, {
      .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
      .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
      .flush_enable = FLUSH_ENABLE_TRUE,
      .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
      .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
      .remap_enable = REMAP_ENABLE_TRUE,
   });

   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
   P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
   P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
   P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
}

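/* Re-enable rendering unconditionally (MODE_TRUE) for both the 3D (NV9097)
 * and compute (NV90C0) classes, clearing the render-enable address.
 */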
VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
   P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
   P_NV9097_SET_RENDER_ENABLE_A(p, 0);
   P_NV9097_SET_RENDER_ENABLE_B(p, 0);
   P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);

   P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
   P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
   P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
}