1 /*
2  * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "nvk_buffer.h"
6 #include "nvk_entrypoints.h"
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_device.h"
9 #include "nvk_format.h"
10 #include "nvk_image.h"
11 #include "nvk_image_view.h"
12 #include "nvk_mme.h"
13 #include "nvk_physical_device.h"
14 #include "nvk_shader.h"
15 
16 #include "util/bitpack_helpers.h"
17 #include "vk_format.h"
18 #include "vk_render_pass.h"
19 #include "vk_standard_sample_locations.h"
20 
21 #include "nv_push_cl902d.h"
22 #include "nv_push_cl9097.h"
23 #include "nv_push_cl90b5.h"
24 #include "nv_push_cl90c0.h"
25 #include "nv_push_cla097.h"
26 #include "nv_push_clb097.h"
27 #include "nv_push_clb197.h"
28 #include "nv_push_clc397.h"
29 #include "nv_push_clc597.h"
30 #include "drf.h"
31 
32 static inline uint16_t
33 nvk_cmd_buffer_3d_cls(struct nvk_cmd_buffer *cmd)
34 {
35    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
36    struct nvk_physical_device *pdev = nvk_device_physical(dev);
37    return pdev->info.cls_eng3d;
38 }
39 
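/* Helper shared by the NVK_MME_SET_PRIV_REG and conservative raster macros:
 * asks the firmware to read-modify-write a privileged register.  The value
 * and mask are handed over through the FALCON MME scratch registers, the
 * target register offset goes through SET_FALCON04, and we then spin on the
 * first scratch register, which the firmware appears to set once the write
 * has landed.
 */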
40 static void
41 mme_set_priv_reg(struct mme_builder *b,
42                  struct mme_value value,
43                  struct mme_value mask,
44                  struct mme_value reg)
45 {
46    mme_mthd(b, NV9097_WAIT_FOR_IDLE);
47    mme_emit(b, mme_zero());
48 
49    mme_mthd(b, NVK_SET_MME_SCRATCH(FALCON_0));
50    mme_emit(b, mme_zero());
51    mme_emit(b, value);
52    mme_emit(b, mask);
53 
54    mme_mthd(b, NV9097_SET_FALCON04);
55    mme_emit(b, reg);
56 
57    struct mme_value loop_cond = mme_mov(b, mme_zero());
58    mme_while(b, ine, loop_cond, mme_imm(1)) {
59       mme_state_to(b, loop_cond, NVK_SET_MME_SCRATCH(FALCON_0));
60       mme_mthd(b, NV9097_NO_OPERATION);
61       mme_emit(b, mme_zero());
62    };
63 }
64 
65 void
66 nvk_mme_set_priv_reg(struct mme_builder *b)
67 {
68    struct mme_value value = mme_load(b);
69    struct mme_value mask = mme_load(b);
70    struct mme_value reg = mme_load(b);
71 
72    mme_set_priv_reg(b, value, mask, reg);
73 }
74 
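/* Shadows the requested conservative raster state in an MME scratch register
 * and only does the priv register write (which involves a wait-for-idle)
 * when the state actually changes.
 */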
75 void
76 nvk_mme_set_conservative_raster_state(struct mme_builder *b)
77 {
78    struct mme_value new_state = mme_load(b);
79    struct mme_value old_state =
80       nvk_mme_load_scratch(b, CONSERVATIVE_RASTER_STATE);
81 
82    mme_if(b, ine, new_state, old_state) {
83       nvk_mme_store_scratch(b, CONSERVATIVE_RASTER_STATE, new_state);
84       mme_set_priv_reg(b, new_state, mme_imm(BITFIELD_RANGE(23, 2)),
85                        mme_imm(0x418800));
86    }
87 }
88 
89 #define NVK_DRAW_CB0_SIZE sizeof(struct nvk_root_descriptor_table)
90 
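/* Points the constant buffer selector back at the draw cb0 (the root
 * descriptor table), whose GPU address is stashed in the CB0_ADDR_HI/LO MME
 * scratch registers by nvk_push_draw_state_init().
 */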
91 void
92 nvk_mme_select_cb0(struct mme_builder *b)
93 {
94    struct mme_value addr_hi = nvk_mme_load_scratch(b, CB0_ADDR_HI);
95    struct mme_value addr_lo = nvk_mme_load_scratch(b, CB0_ADDR_LO);
96 
97    mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
98    mme_emit(b, mme_imm(NVK_DRAW_CB0_SIZE));
99    mme_emit(b, addr_hi);
100    mme_emit(b, addr_lo);
101 }
102 
103 static uint32_t nvk_mme_anti_alias_init(void);
104 
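/* Emitted once per queue at initialization time: binds the 3D engine class,
 * uploads all of the MME macros into the macro RAM, and programs the pile of
 * static 3D state that NVK never touches again at draw time.
 */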
105 VkResult
106 nvk_push_draw_state_init(struct nvk_queue *queue, struct nv_push *p)
107 {
108    struct nvk_device *dev = nvk_queue_device(queue);
109    struct nvk_physical_device *pdev = nvk_device_physical(dev);
110 
111    /* 3D state */
112    P_MTHD(p, NV9097, SET_OBJECT);
113    P_NV9097_SET_OBJECT(p, {
114       .class_id = pdev->info.cls_eng3d,
115       .engine_id = 0,
116    });
117 
118    for (uint32_t mme = 0, mme_pos = 0; mme < NVK_MME_COUNT; mme++) {
119       size_t size;
120       uint32_t *dw = nvk_build_mme(&pdev->info, mme, &size);
121       if (dw == NULL)
122          return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
123 
124       assert(size % sizeof(uint32_t) == 0);
125       const uint32_t num_dw = size / sizeof(uint32_t);
126 
127       P_MTHD(p, NV9097, LOAD_MME_START_ADDRESS_RAM_POINTER);
128       P_NV9097_LOAD_MME_START_ADDRESS_RAM_POINTER(p, mme);
129       P_NV9097_LOAD_MME_START_ADDRESS_RAM(p, mme_pos);
130 
131       P_1INC(p, NV9097, LOAD_MME_INSTRUCTION_RAM_POINTER);
132       P_NV9097_LOAD_MME_INSTRUCTION_RAM_POINTER(p, mme_pos);
133       P_INLINE_ARRAY(p, dw, num_dw);
134 
135       mme_pos += num_dw;
136 
137       free(dw);
138    }
139 
140    if (pdev->info.cls_eng3d >= TURING_A)
141       P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
142 
143    /* Enable FP helper invocation memory loads
144     *
145     * For generations with firmware support for our `SET_PRIV_REG` mme method
146     * we simply use that. On older generations we'll let the kernel do it.
147     * Starting with GSP we have to do it via the firmware anyway.
148     *
149     * This clears bit 3 of gr_gpcs_tpcs_sm_disp_ctrl
150     *
151     * Without it,
152     * dEQP-VK.subgroups.vote.frag_helper.subgroupallequal_bvec2_fragment will
153     * occasionally fail.
154     */
155    if (pdev->info.cls_eng3d >= MAXWELL_B) {
156       unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ba4 : 0x419f78;
157       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
158       P_INLINE_DATA(p, 0);
159       P_INLINE_DATA(p, BITFIELD_BIT(3));
160       P_INLINE_DATA(p, reg);
161    }
162 
163    /* Disable Out Of Range Address exceptions
164     *
165     * From the SPH documentation:
166     *
167     *    "The SPH fields StoreReqStart and StoreReqEnd set a range of
168     *    attributes whose corresponding Odmap values of ST or ST_LAST are
169     *    treated as ST_REQ. Normally, for an attribute whose Omap bit is TRUE
170     *    and Odmap value is ST, when the shader writes data to this output, it
171     *    can not count on being able to read it back, since the next
172     *    downstream shader might have its Imap bit FALSE, thereby causing the
173     *    Bmap bit to be FALSE. By including a ST type of attribute in the
174     *    range of StoreReqStart and StoreReqEnd, the attribute’s Odmap value
175     *    is treated as ST_REQ, so an Omap bit being TRUE causes the Bmap bit
176     *    to be TRUE. This guarantees the shader program can output the value
177     *    and then read it back later. This will save register space."
178     *
179     * It's unclear exactly what's going on but this seems to imply that the
180     * hardware actually ANDs the output mask of one shader stage together with
181     * the input mask of the subsequent shader stage to determine which values
182     * are actually used.
183     *
184     * In the case where we have an empty fragment shader, it seems the hardware
185     * doesn't allocate any output memory for the final geometry stage at all and
186     * so any writes to outputs from the final shader stage generate an Out Of
187     * Range Address exception.  We could fix this by eliminating unused
188     * outputs via cross-stage linking but that won't work in the case of
189     * VK_EXT_shader_object and VK_EXT_graphics_pipeline_library fast-link.
190     * Instead, the easiest solution is to just disable the exception.
191     *
192     * NOTE (Faith):
193     *
194     *    The above analysis is 100% conjecture on my part based on a creative
195     *    reading of the SPH docs and what I saw when trying to run certain
196     *    OpenGL CTS tests on NVK + Zink.  Without access to NVIDIA HW
197     *    engineers, I have no way of verifying this analysis.
198     *
199     *    The CTS test in question is:
200     *
201     *    KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_tessLevel
202     *
203     * This should also prevent any issues with array overruns on I/O arrays.
204     * Before, they would get an exception and kill the context, whereas now
205     * they should just get ignored.
206     *
207     * This clears bit 14 of gr_gpcs_tpcs_sms_hww_warp_esr_report_mask
208     */
209    if (pdev->info.cls_eng3d >= MAXWELL_B) {
210       unsigned reg = pdev->info.cls_eng3d >= VOLTA_A ? 0x419ea8 : 0x419e44;
211       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_PRIV_REG));
212       P_INLINE_DATA(p, 0);
213       P_INLINE_DATA(p, BITFIELD_BIT(14));
214       P_INLINE_DATA(p, reg);
215    }
216 
217    /* Set CONSERVATIVE_RASTER_STATE to an invalid value, to ensure the
218     * hardware reg is always set the first time conservative rasterization
219     * is enabled */
220    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CONSERVATIVE_RASTER_STATE),
221                      ~0);
222 
223    /* Initialize tessellation parameters */
224    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_TESS_PARAMS), 0);
225    P_IMMD(p, NV9097, SET_TESSELLATION_PARAMETERS, {});
226 
227    P_IMMD(p, NV9097, SET_RENDER_ENABLE_C, MODE_TRUE);
228 
229    P_IMMD(p, NV9097, SET_Z_COMPRESSION, ENABLE_TRUE);
230    P_MTHD(p, NV9097, SET_COLOR_COMPRESSION(0));
231    for (unsigned i = 0; i < 8; i++)
232       P_NV9097_SET_COLOR_COMPRESSION(p, i, ENABLE_TRUE);
233 
234    P_IMMD(p, NV9097, SET_CT_SELECT, { .target_count = 1 });
235 
236 //   P_MTHD(cmd->push, NVC0_3D, CSAA_ENABLE);
237 //   P_INLINE_DATA(cmd->push, 0);
238 
239    P_IMMD(p, NV9097, SET_ALIASED_LINE_WIDTH_ENABLE, V_TRUE);
240 
241    P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART_VERTEX_ARRAY, ENABLE_FALSE);
242 
243    P_IMMD(p, NV9097, SET_BLEND_SEPARATE_FOR_ALPHA, ENABLE_TRUE);
244    P_IMMD(p, NV9097, SET_SINGLE_CT_WRITE_CONTROL, ENABLE_TRUE);
245    P_IMMD(p, NV9097, SET_SINGLE_ROP_CONTROL, ENABLE_FALSE);
246    P_IMMD(p, NV9097, SET_TWO_SIDED_STENCIL_TEST, ENABLE_TRUE);
247 
248    P_IMMD(p, NV9097, SET_SHADE_MODE, V_OGL_SMOOTH);
249 
250    P_IMMD(p, NV9097, SET_API_VISIBLE_CALL_LIMIT, V__128);
251 
252    P_IMMD(p, NV9097, SET_ZCULL_STATS, ENABLE_TRUE);
253 
254    P_IMMD(p, NV9097, SET_L1_CONFIGURATION,
255                      DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
256 
257    P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_ENABLE, V_FALSE);
258    P_IMMD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM8, {
259       .all_covered_all_hit_once = 0xff,
260    });
261    P_MTHD(p, NV9097, SET_REDUCE_COLOR_THRESHOLDS_UNORM10);
262    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM10(p, {
263       .all_covered_all_hit_once = 0xff,
264    });
265    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_UNORM16(p, {
266       .all_covered_all_hit_once = 0xff,
267    });
268    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP11(p, {
269       .all_covered_all_hit_once = 0x3f,
270    });
271    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_FP16(p, {
272       .all_covered_all_hit_once = 0xff,
273    });
274    P_NV9097_SET_REDUCE_COLOR_THRESHOLDS_SRGB8(p, {
275       .all_covered_all_hit_once = 0xff,
276    });
277 
278    if (pdev->info.cls_eng3d < VOLTA_A)
279       P_IMMD(p, NV9097, SET_ALPHA_FRACTION, 0x3f);
280 
281    P_IMMD(p, NV9097, CHECK_SPH_VERSION, {
282       .current = 3,
283       .oldest_supported = 3,
284    });
285    P_IMMD(p, NV9097, CHECK_AAM_VERSION, {
286       .current = 2,
287       .oldest_supported = 2,
288    });
289 
290    if (pdev->info.cls_eng3d < MAXWELL_A)
291       P_IMMD(p, NV9097, SET_SHADER_SCHEDULING, MODE_OLDEST_THREAD_FIRST);
292 
293    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_PREFETCH_READ_REQUESTS,
294                      POLICY_EVICT_NORMAL);
295    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_READ_REQUESTS,
296                      POLICY_EVICT_NORMAL);
297    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_READ_REQUESTS,
298                      POLICY_EVICT_NORMAL);
299    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_NONINTERLOCKED_WRITE_REQUESTS,
300                      POLICY_EVICT_NORMAL);
301    P_IMMD(p, NV9097, SET_L2_CACHE_CONTROL_FOR_ROP_INTERLOCKED_WRITE_REQUESTS,
302                      POLICY_EVICT_NORMAL);
303 
304    P_IMMD(p, NV9097, SET_BLEND_PER_FORMAT_ENABLE, SNORM8_UNORM16_SNORM16_TRUE);
305 
306    P_IMMD(p, NV9097, SET_ATTRIBUTE_DEFAULT, {
307       .color_front_diffuse    = COLOR_FRONT_DIFFUSE_VECTOR_0001,
308       .color_front_specular   = COLOR_FRONT_SPECULAR_VECTOR_0001,
309       .generic_vector         = GENERIC_VECTOR_VECTOR_0001,
310       .fixed_fnc_texture      = FIXED_FNC_TEXTURE_VECTOR_0001,
311       .dx9_color0             = DX9_COLOR0_VECTOR_0001,
312       .dx9_color1_to_color15  = DX9_COLOR1_TO_COLOR15_VECTOR_0000,
313    });
314 
315    P_IMMD(p, NV9097, SET_DA_OUTPUT, VERTEX_ID_USES_ARRAY_START_TRUE);
316 
317    P_IMMD(p, NV9097, SET_RENDER_ENABLE_CONTROL,
318                      CONDITIONAL_LOAD_CONSTANT_BUFFER_FALSE);
319 
320    P_IMMD(p, NV9097, SET_PS_OUTPUT_SAMPLE_MASK_USAGE, {
321       .enable                       = ENABLE_TRUE,
322       .qualify_by_anti_alias_enable = QUALIFY_BY_ANTI_ALIAS_ENABLE_ENABLE,
323    });
324 
325    if (pdev->info.cls_eng3d < VOLTA_A)
326       P_IMMD(p, NV9097, SET_PRIM_CIRCULAR_BUFFER_THROTTLE, 0x3fffff);
327 
328    P_IMMD(p, NV9097, SET_BLEND_OPT_CONTROL, ALLOW_FLOAT_PIXEL_KILLS_TRUE);
329    P_IMMD(p, NV9097, SET_BLEND_FLOAT_OPTION, ZERO_TIMES_ANYTHING_IS_ZERO_TRUE);
330    P_IMMD(p, NV9097, SET_BLEND_STATE_PER_TARGET, ENABLE_TRUE);
331 
332    if (pdev->info.cls_eng3d < MAXWELL_A)
333       P_IMMD(p, NV9097, SET_MAX_TI_WARPS_PER_BATCH, 3);
334 
335    if (pdev->info.cls_eng3d >= KEPLER_A &&
336        pdev->info.cls_eng3d < MAXWELL_A) {
337       P_IMMD(p, NVA097, SET_TEXTURE_INSTRUCTION_OPERAND,
338                         ORDERING_KEPLER_ORDER);
339    }
340 
341    P_IMMD(p, NV9097, SET_ALPHA_TEST, ENABLE_FALSE);
342    P_IMMD(p, NV9097, SET_TWO_SIDED_LIGHT, ENABLE_FALSE);
343    P_IMMD(p, NV9097, SET_COLOR_CLAMP, ENABLE_TRUE);
344    P_IMMD(p, NV9097, SET_PS_SATURATE, {
345       .output0 = OUTPUT0_FALSE,
346       .output1 = OUTPUT1_FALSE,
347       .output2 = OUTPUT2_FALSE,
348       .output3 = OUTPUT3_FALSE,
349       .output4 = OUTPUT4_FALSE,
350       .output5 = OUTPUT5_FALSE,
351       .output6 = OUTPUT6_FALSE,
352       .output7 = OUTPUT7_FALSE,
353    });
354 
355    P_IMMD(p, NV9097, SET_POINT_SIZE, fui(1.0));
356    P_IMMD(p, NV9097, SET_ATTRIBUTE_POINT_SIZE, { .enable = ENABLE_TRUE });
357 
358    /* From the Vulkan spec's point rasterization:
359     * "Point rasterization produces a fragment for each fragment area group of
360     * framebuffer pixels with one or more sample points that intersect a region
361     * centered at the point’s (xf,yf).
362     * This region is a square with side equal to the current point size.
363     * ... (xf,yf) is the exact, unrounded framebuffer coordinate of the vertex
364     * for the point"
365     *
366     * So it seems we always need square points with PointCoords like OpenGL
367     * point sprites.
368     *
369     * From OpenGL compatibility spec:
370     * Basic point rasterization:
371     * "If point sprites are enabled, then point rasterization produces a
372     * fragment for each framebuffer pixel whose center lies inside a square
373     * centered at the point’s (xw, yw), with side length equal to the current
374     * point size.
375     * ... and xw and yw are the exact, unrounded window coordinates of the
376     * vertex for the point"
377     *
378     * And Point multisample rasterization:
379     * "This region is a circle having diameter equal to the current point width
380     * if POINT_SPRITE is disabled, or a square with side equal to the current
381     * point width if POINT_SPRITE is enabled."
382     */
383    P_IMMD(p, NV9097, SET_POINT_SPRITE, ENABLE_TRUE);
384    P_IMMD(p, NV9097, SET_POINT_SPRITE_SELECT, {
385       .rmode      = RMODE_ZERO,
386       .origin     = ORIGIN_TOP,
387       .texture0   = TEXTURE0_PASSTHROUGH,
388       .texture1   = TEXTURE1_PASSTHROUGH,
389       .texture2   = TEXTURE2_PASSTHROUGH,
390       .texture3   = TEXTURE3_PASSTHROUGH,
391       .texture4   = TEXTURE4_PASSTHROUGH,
392       .texture5   = TEXTURE5_PASSTHROUGH,
393       .texture6   = TEXTURE6_PASSTHROUGH,
394       .texture7   = TEXTURE7_PASSTHROUGH,
395       .texture8   = TEXTURE8_PASSTHROUGH,
396       .texture9   = TEXTURE9_PASSTHROUGH,
397    });
398 
399    /* OpenGL's GL_POINT_SMOOTH */
400    P_IMMD(p, NV9097, SET_ANTI_ALIASED_POINT, ENABLE_FALSE);
401 
402    if (pdev->info.cls_eng3d >= MAXWELL_B)
403       P_IMMD(p, NVB197, SET_FILL_VIA_TRIANGLE, MODE_DISABLED);
404 
405    P_IMMD(p, NV9097, SET_POLY_SMOOTH, ENABLE_FALSE);
406 
407    P_IMMD(p, NV9097, SET_VIEWPORT_PIXEL, CENTER_AT_HALF_INTEGERS);
408 
409    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SHADING_RATE_CONTROL), 0);
410    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ANTI_ALIAS),
411           nvk_mme_anti_alias_init());
412 
413    /* Enable multisample rasterization even for single-sample rasterization;
414     * this way we get strict lines and rectangular line support.
415     * See the DirectX rasterization rules for more info.
416     */
417    P_IMMD(p, NV9097, SET_ANTI_ALIAS_ENABLE, V_TRUE);
418 
419    if (pdev->info.cls_eng3d >= MAXWELL_B) {
420       P_IMMD(p, NVB197, SET_POST_PS_INITIAL_COVERAGE, true);
421       P_IMMD(p, NVB197, SET_OFFSET_RENDER_TARGET_INDEX,
422                         BY_VIEWPORT_INDEX_FALSE);
423    }
424 
425    /* TODO: Vertex runout */
426 
427    P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
428       .mode    = MODE_UPPER_LEFT,
429       .flip_y  = FLIP_Y_FALSE,
430    });
431 
432    P_MTHD(p, NV9097, SET_WINDOW_OFFSET_X);
433    P_NV9097_SET_WINDOW_OFFSET_X(p, 0);
434    P_NV9097_SET_WINDOW_OFFSET_Y(p, 0);
435 
436    P_IMMD(p, NV9097, SET_ACTIVE_ZCULL_REGION, 0x3f);
437    P_IMMD(p, NV9097, SET_WINDOW_CLIP_ENABLE, V_FALSE);
438    P_IMMD(p, NV9097, SET_CLIP_ID_TEST, ENABLE_FALSE);
439 
440 //   P_IMMD(p, NV9097, X_X_X_SET_CLEAR_CONTROL, {
441 //      .respect_stencil_mask   = RESPECT_STENCIL_MASK_FALSE,
442 //      .use_clear_rect         = USE_CLEAR_RECT_FALSE,
443 //   });
444 
445    P_IMMD(p, NV9097, SET_VIEWPORT_SCALE_OFFSET, ENABLE_TRUE);
446 
447    P_IMMD(p, NV9097, SET_VIEWPORT_CLIP_CONTROL, {
448       .min_z_zero_max_z_one      = MIN_Z_ZERO_MAX_Z_ONE_FALSE,
449       .pixel_min_z               = PIXEL_MIN_Z_CLAMP,
450       .pixel_max_z               = PIXEL_MAX_Z_CLAMP,
451       .geometry_guardband        = GEOMETRY_GUARDBAND_SCALE_256,
452       .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
453       .geometry_clip             = GEOMETRY_CLIP_WZERO_CLIP,
454       .geometry_guardband_z      = GEOMETRY_GUARDBAND_Z_SAME_AS_XY_GUARDBAND,
455    });
456 
457    for (unsigned i = 0; i < 16; i++)
458       P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
459 
460    P_IMMD(p, NV9097, SET_CT_MRT_ENABLE, V_TRUE);
461 
462    if (pdev->info.cls_eng3d >= TURING_A) {
463       /* I don't know what these values actually mean.  I just copied them
464        * from the way the blob sets up the hardware.
465        */
466       P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(0));
467       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 0, 0xa23eb139);
468       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 1, 0xfb72ea61);
469       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 2, 0xd950c843);
470       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 3, 0x88fac4e5);
471       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 4, 0x1ab3e1b6);
472       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 5, 0xa98fedc2);
473       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 6, 0x2107654b);
474       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 7, 0xe0539773);
475       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 8, 0x698badcf);
476       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 9, 0x71032547);
477       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 10, 0xdef05397);
478       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 11, 0x56789abc);
479       P_NVC597_SET_VARIABLE_PIXEL_RATE_SAMPLE_ORDER(p, 12, 0x1234);
480    }
481 
482    if (pdev->info.cls_eng3d < VOLTA_A) {
483       uint64_t shader_base_addr =
484          nvk_heap_contiguous_base_address(&dev->shader_heap);
485 
486       P_MTHD(p, NV9097, SET_PROGRAM_REGION_A);
487       P_NV9097_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
488       P_NV9097_SET_PROGRAM_REGION_B(p, shader_base_addr);
489    }
490 
491    for (uint32_t group = 0; group < 5; group++) {
492       for (uint32_t slot = 0; slot < 16; slot++) {
493          P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
494             .valid = VALID_FALSE,
495             .shader_slot = slot,
496          });
497       }
498    }
499 
500 //   P_MTHD(cmd->push, NVC0_3D, MACRO_GP_SELECT);
501 //   P_INLINE_DATA(cmd->push, 0x40);
502    P_IMMD(p, NV9097, SET_RT_LAYER, {
503       .v = 0,
504       .control = CONTROL_V_SELECTS_LAYER,
505    });
506 //   P_MTHD(cmd->push, NVC0_3D, MACRO_TEP_SELECT;
507 //   P_INLINE_DATA(cmd->push, 0x30);
508 
509    P_IMMD(p, NV9097, SET_POINT_CENTER_MODE, V_OGL);
510    P_IMMD(p, NV9097, SET_EDGE_FLAG, V_TRUE);
511    P_IMMD(p, NV9097, SET_SAMPLER_BINDING, V_INDEPENDENTLY);
512 
513    uint64_t zero_addr = dev->zero_page->va->addr;
514    P_MTHD(p, NV9097, SET_VERTEX_STREAM_SUBSTITUTE_A);
515    P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_A(p, zero_addr >> 32);
516    P_NV9097_SET_VERTEX_STREAM_SUBSTITUTE_B(p, zero_addr);
517 
518    P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VB_ENABLES));
519    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_VB_ENABLES, 0);
520    for (uint32_t b = 0; b < 32; b++) {
521       P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FORMAT(b), {
522          .enable = false,
523       });
524    }
525 
526    if (pdev->info.cls_eng3d >= FERMI_A &&
527        pdev->info.cls_eng3d < MAXWELL_A) {
528       assert(dev->vab_memory);
529       uint64_t vab_addr = dev->vab_memory->va->addr;
530       P_MTHD(p, NV9097, SET_VAB_MEMORY_AREA_A);
531       P_NV9097_SET_VAB_MEMORY_AREA_A(p, vab_addr >> 32);
532       P_NV9097_SET_VAB_MEMORY_AREA_B(p, vab_addr);
533       P_NV9097_SET_VAB_MEMORY_AREA_C(p, SIZE_BYTES_256K);
534    }
535 
536    if (pdev->info.cls_eng3d == MAXWELL_A)
537       P_IMMD(p, NVB097, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);
538 
539    /* Store the address to CB0 in a pair of state registers */
540    uint64_t cb0_addr = queue->draw_cb0->va->addr;
541    P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_ADDR_HI));
542    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_HI, cb0_addr >> 32);
543    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_ADDR_LO, cb0_addr);
544 
545    /* Store the address to the zero page in a pair of state registers */
546    P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_ZERO_ADDR_HI));
547    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_HI, zero_addr >> 32);
548    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_ZERO_ADDR_LO, zero_addr);
549 
550    /* We leave CB0 selected by default */
551    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
552    P_INLINE_DATA(p, 0);
553 
554    /* Bind CB0 to all shader groups */
555    for (uint32_t group = 0; group < 5; group++) {
556       P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
557          .valid = VALID_TRUE,
558          .shader_slot = 0,
559       });
560    }
561 
562    /* Zero out CB0 */
563    P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
564    P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, 0);
565    for (uint32_t dw = 0; dw < NVK_DRAW_CB0_SIZE / 4; dw++)
566       P_INLINE_DATA(p, 0);
567 
568    /* These are shadowed in cb0 so they need to be zeroed as well for
569     * consistency.
570     */
571    P_IMMD(p, NV9097, SET_GLOBAL_BASE_INSTANCE_INDEX, 0);
572    P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CB0_FIRST_VERTEX));
573    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_FIRST_VERTEX, 0);
574    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_DRAW_INDEX, 0);
575    P_NV9097_SET_MME_SHADOW_SCRATCH(p, NVK_MME_SCRATCH_CB0_VIEW_INDEX, 0);
576 
577    return VK_SUCCESS;
578 }
579 
580 static void
581 nvk_cmd_buffer_dirty_render_pass(struct nvk_cmd_buffer *cmd)
582 {
583    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
584 
585    /* These depend on color attachment count */
586    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
587    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
588    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
589    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
590 
591    /* These depend on the depth/stencil format */
592    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
593    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
594    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
595    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
596 
597    /* This may depend on render targets for ESO */
598    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
599 
600    /* This may depend on render targets */
601    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
602 
603    /* Might be required for depthClampZeroOne */
604    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE);
605    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE);
606 }
607 
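/* Flush callback for the graphics root descriptor table: uploads the dirtied
 * byte range of desc->root into the currently selected cb0 with
 * LOAD_CONSTANT_BUFFER, rounded out to dword granularity.
 */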
608 static void
609 nvk_cmd_flush_gfx_root_desc(struct nvk_cmd_buffer *cmd,
610                             struct nvk_descriptor_state *desc,
611                             size_t offset, size_t size)
612 {
613    const uint32_t start_dw = offset / 4;
614    const uint32_t end_dw = DIV_ROUND_UP(offset + size, 4);
615    const uint32_t len_dw = end_dw - start_dw;
616 
617    struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + len_dw);
618    P_1INC(p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET);
619    P_NV9097_LOAD_CONSTANT_BUFFER_OFFSET(p, start_dw * 4);
620 
621    const uint32_t *root_dw = (uint32_t *)desc->root;
622    P_INLINE_ARRAY(p, &root_dw[start_dw], len_dw);
623 }
624 
625 void
626 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
627                               const VkCommandBufferBeginInfo *pBeginInfo)
628 {
629    if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
630       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
631       P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
632       P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
633          .lines = LINES_ALL,
634       });
635       P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
636          .lines = LINES_ALL,
637       });
638 
639       P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
640          .constant = CONSTANT_TRUE,
641       });
642    }
643 
644    cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
645 
646    if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
647        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
648       char gcbiar_data[VK_GCBIARR_DATA_SIZE(NVK_MAX_RTS)];
649       const VkRenderingInfo *resume_info =
650          vk_get_command_buffer_inheritance_as_rendering_resume(cmd->vk.level,
651                                                                pBeginInfo,
652                                                                gcbiar_data);
653       if (resume_info) {
654          nvk_CmdBeginRendering(nvk_cmd_buffer_to_handle(cmd), resume_info);
655       } else {
656          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
657             vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
658                                                              pBeginInfo);
659          assert(inheritance_info);
660 
661          struct nvk_rendering_state *render = &cmd->state.gfx.render;
662          render->flags = inheritance_info->flags;
663          render->area = (VkRect2D) { };
664          render->layer_count = 0;
665          render->view_mask = inheritance_info->viewMask;
666          render->samples = inheritance_info->rasterizationSamples;
667 
668          render->color_att_count = inheritance_info->colorAttachmentCount;
669          for (uint32_t i = 0; i < render->color_att_count; i++) {
670             render->color_att[i].vk_format =
671                inheritance_info->pColorAttachmentFormats[i];
672          }
673          render->depth_att.vk_format =
674             inheritance_info->depthAttachmentFormat;
675          render->stencil_att.vk_format =
676             inheritance_info->stencilAttachmentFormat;
677 
678          const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
679             .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
680             .colorAttachmentCount = inheritance_info->colorAttachmentCount,
681          };
682          const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
683             vk_get_command_buffer_rendering_attachment_location_info(
684                cmd->vk.level, pBeginInfo);
685          if (att_loc_info == NULL)
686             att_loc_info = &att_loc_info_default;
687 
688          vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
689 
690          nvk_cmd_buffer_dirty_render_pass(cmd);
691       }
692    }
693 
694    cmd->state.gfx.shaders_dirty = ~0;
695 }
696 
697 void
698 nvk_cmd_invalidate_graphics_state(struct nvk_cmd_buffer *cmd)
699 {
700    vk_dynamic_graphics_state_dirty_all(&cmd->vk.dynamic_graphics_state);
701 
702    /* From the Vulkan 1.3.275 spec:
703     *
704     *    "...There is one exception to this rule - if the primary command
705     *    buffer is inside a render pass instance, then the render pass and
706     *    subpass state is not disturbed by executing secondary command
707     *    buffers."
708     *
709     * We need to reset everything EXCEPT the render pass state.
710     */
711    struct nvk_rendering_state render_save = cmd->state.gfx.render;
712    memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
713    cmd->state.gfx.render = render_save;
714 
715    /* We need to keep the flush_root callback */
716    cmd->state.gfx.descriptors.flush_root = nvk_cmd_flush_gfx_root_desc;
717 
718    cmd->state.gfx.shaders_dirty = ~0;
719 }
720 
721 static void
722 nvk_attachment_init(struct nvk_attachment *att,
723                     const VkRenderingAttachmentInfo *info)
724 {
725    if (info == NULL || info->imageView == VK_NULL_HANDLE) {
726       *att = (struct nvk_attachment) { .iview = NULL, };
727       return;
728    }
729 
730    VK_FROM_HANDLE(nvk_image_view, iview, info->imageView);
731    *att = (struct nvk_attachment) {
732       .vk_format = iview->vk.format,
733       .iview = iview,
734    };
735 
736    if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
737       VK_FROM_HANDLE(nvk_image_view, res_iview, info->resolveImageView);
738       att->resolve_mode = info->resolveMode;
739       att->resolve_iview = res_iview;
740    }
741 
742    att->store_op = info->storeOp;
743 }
744 
745 static uint32_t
746 nil_to_nv9097_samples_mode(enum nil_sample_layout sample_layout)
747 {
748 #define MODE(S) [NIL_SAMPLE_LAYOUT_##S] = NV9097_SET_ANTI_ALIAS_SAMPLES_MODE_##S
749    uint16_t nil_to_nv9097[] = {
750       MODE(1X1),
751       MODE(2X1),
752       MODE(2X1_D3D),
753       MODE(2X2),
754       MODE(4X2),
755       MODE(4X2_D3D),
756       MODE(4X4),
757    };
758 #undef MODE
759    assert(sample_layout < ARRAY_SIZE(nil_to_nv9097));
760    assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1 ||
761           nil_to_nv9097[sample_layout] != 0);
762 
763    return nil_to_nv9097[sample_layout];
764 }
765 
766 static uint32_t nvk_mme_anti_alias_samples(uint32_t samples);
767 
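/* Programs the fixed-function sample layout: SET_ANTI_ALIAS for the layout
 * itself, the SAMPLE_MASKS_2PASS/4PASS MME scratch registers with the
 * per-pass sample masks, and finally the NVK_MME_SET_ANTI_ALIAS macro with
 * the encoded sample count.
 */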
768 static void
769 nvk_cmd_set_sample_layout(struct nvk_cmd_buffer *cmd,
770                           enum nil_sample_layout sample_layout)
771 {
772    const uint32_t samples = nil_sample_layout_samples(sample_layout);
773    struct nv_push *p = nvk_cmd_buffer_push(cmd, 14);
774 
775    P_IMMD(p, NV9097, SET_ANTI_ALIAS,
776           nil_to_nv9097_samples_mode(sample_layout));
777 
778    switch (sample_layout) {
779    case NIL_SAMPLE_LAYOUT_1X1:
780    case NIL_SAMPLE_LAYOUT_2X1:
781    case NIL_SAMPLE_LAYOUT_2X1_D3D:
782       /* These only have two modes: Single-pass or per-sample */
783       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
784       P_INLINE_DATA(p, 0);
785       P_INLINE_DATA(p, 0);
786       P_INLINE_DATA(p, 0);
787       P_INLINE_DATA(p, 0);
788       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
789       P_INLINE_DATA(p, 0);
790       P_INLINE_DATA(p, 0);
791       P_INLINE_DATA(p, 0);
792       P_INLINE_DATA(p, 0);
793       break;
794 
795    case NIL_SAMPLE_LAYOUT_2X2:
796       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
797       P_INLINE_DATA(p, 0x000a0005);
798       P_INLINE_DATA(p, 0x000a0005);
799       P_INLINE_DATA(p, 0);
800       P_INLINE_DATA(p, 0);
801       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
802       P_INLINE_DATA(p, 0);
803       P_INLINE_DATA(p, 0);
804       P_INLINE_DATA(p, 0);
805       P_INLINE_DATA(p, 0);
806       break;
807 
808    case NIL_SAMPLE_LAYOUT_4X2:
809       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
810       P_INLINE_DATA(p, 0x000f000f);
811       P_INLINE_DATA(p, 0x000f000f);
812       P_INLINE_DATA(p, 0x00f000f0);
813       P_INLINE_DATA(p, 0x00f000f0);
814       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
815       P_INLINE_DATA(p, 0x00030003);
816       P_INLINE_DATA(p, 0x000c000c);
817       P_INLINE_DATA(p, 0x00300030);
818       P_INLINE_DATA(p, 0x00c000c0);
819       break;
820 
821    case NIL_SAMPLE_LAYOUT_4X2_D3D:
822       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_2PASS_0));
823       P_INLINE_DATA(p, 0x003a00c5);
824       P_INLINE_DATA(p, 0x003a00c5);
825       P_INLINE_DATA(p, 0x003a003a);
826       P_INLINE_DATA(p, 0x00c500c5);
827       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_SAMPLE_MASKS_4PASS_0));
828       P_INLINE_DATA(p, 0x00120081);
829       P_INLINE_DATA(p, 0x00280044);
830       P_INLINE_DATA(p, 0x00280012);
831       P_INLINE_DATA(p, 0x00810044);
832       break;
833 
834    default:
835       unreachable("Unknown sample layout");
836    }
837 
838    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
839    P_INLINE_DATA(p, nvk_mme_anti_alias_samples(samples));
840 }
841 
842 VKAPI_ATTR void VKAPI_CALL
843 nvk_GetRenderingAreaGranularityKHR(
844     VkDevice device,
845     const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
846     VkExtent2D *pGranularity)
847 {
848    *pGranularity = (VkExtent2D) { .width = 1, .height = 1 };
849 }
850 
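/* Returns true only when every color attachment is linear; depth/stencil
 * attachments are never linear, so their presence forces false.  When a
 * render pass mixes linear and tiled targets, nvk_CmdBeginRendering instead
 * redirects the linear ones to their tiled shadow images.
 */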
851 static bool
852 nvk_rendering_all_linear(const struct nvk_rendering_state *render)
853 {
854    /* Depth and stencil are never linear */
855    if (render->depth_att.iview || render->stencil_att.iview)
856       return false;
857 
858    for (uint32_t i = 0; i < render->color_att_count; i++) {
859       const struct nvk_image_view *iview = render->color_att[i].iview;
860       if (iview == NULL)
861          continue;
862 
863       const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
864       const uint8_t ip = iview->planes[0].image_plane;
865       const struct nil_image_level *level =
866          &image->planes[ip].nil.levels[iview->vk.base_mip_level];
867 
868       if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR)
869          return false;
870    }
871 
872    return true;
873 }
874 
875 VKAPI_ATTR void VKAPI_CALL
876 nvk_CmdBeginRendering(VkCommandBuffer commandBuffer,
877                       const VkRenderingInfo *pRenderingInfo)
878 {
879    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
880    struct nvk_rendering_state *render = &cmd->state.gfx.render;
881 
882    memset(render, 0, sizeof(*render));
883 
884    render->flags = pRenderingInfo->flags;
885    render->area = pRenderingInfo->renderArea;
886    render->view_mask = pRenderingInfo->viewMask;
887    render->layer_count = pRenderingInfo->layerCount;
888    render->samples = 0;
889 
890    const uint32_t layer_count =
891       render->view_mask ? util_last_bit(render->view_mask) :
892                           render->layer_count;
893 
894    render->color_att_count = pRenderingInfo->colorAttachmentCount;
895    for (uint32_t i = 0; i < render->color_att_count; i++) {
896       nvk_attachment_init(&render->color_att[i],
897                           &pRenderingInfo->pColorAttachments[i]);
898    }
899 
900    nvk_attachment_init(&render->depth_att,
901                        pRenderingInfo->pDepthAttachment);
902    nvk_attachment_init(&render->stencil_att,
903                        pRenderingInfo->pStencilAttachment);
904 
905    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att_info =
906       vk_find_struct_const(pRenderingInfo->pNext,
907                            RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
908    if (fsr_att_info != NULL && fsr_att_info->imageView != VK_NULL_HANDLE) {
909       VK_FROM_HANDLE(nvk_image_view, iview, fsr_att_info->imageView);
910       render->fsr_att = (struct nvk_attachment) {
911          .vk_format = iview->vk.format,
912          .iview = iview,
913          .store_op = VK_ATTACHMENT_STORE_OP_NONE,
914       };
915    }
916 
917    render->all_linear = nvk_rendering_all_linear(render);
918 
919    const VkRenderingAttachmentLocationInfoKHR ral_info = {
920       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
921       .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
922    };
923    vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
924 
925    nvk_cmd_buffer_dirty_render_pass(cmd);
926 
927    struct nv_push *p = nvk_cmd_buffer_push(cmd, NVK_MAX_RTS * 12 + 34);
928 
929    P_IMMD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_VIEW_MASK),
930           render->view_mask);
931 
932    P_MTHD(p, NV9097, SET_SURFACE_CLIP_HORIZONTAL);
933    P_NV9097_SET_SURFACE_CLIP_HORIZONTAL(p, {
934       .x       = render->area.offset.x,
935       .width   = render->area.extent.width,
936    });
937    P_NV9097_SET_SURFACE_CLIP_VERTICAL(p, {
938       .y       = render->area.offset.y,
939       .height  = render->area.extent.height,
940    });
941 
942    enum nil_sample_layout sample_layout = NIL_SAMPLE_LAYOUT_INVALID;
943 
944    /* We always emit SET_COLOR_TARGET_A(i) for every color target, regardless
945     * of the number of targets in the render pass.  This ensures that we have
946     * no left over pointers from previous render passes in the hardware.  This
947     * also allows us to point at any render target with SET_CT_SELECT and know
948     * that it's either a valid render target or NULL.
949     */
950    for (uint32_t i = 0; i < NVK_MAX_RTS; i++) {
951       if (render->color_att[i].iview) {
952          const struct nvk_image_view *iview = render->color_att[i].iview;
953          const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
954          /* Rendering to multi-planar images is valid for a specific single
955           * plane only, so assert that what we have is a single-plane, obtain
956           * its index, and begin rendering
957           */
958          assert(iview->plane_count == 1);
959          const uint8_t ip = iview->planes[0].image_plane;
960          const struct nvk_image_plane *plane = &image->planes[ip];
961 
962          if (!render->all_linear &&
963              plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR)
964             plane = &image->linear_tiled_shadow;
965 
966          const struct nil_image *nil_image = &plane->nil;
967          const struct nil_image_level *level =
968             &nil_image->levels[iview->vk.base_mip_level];
969          struct nil_Extent4D_Samples level_extent_sa =
970             nil_image_level_extent_sa(nil_image, iview->vk.base_mip_level);
971 
972          assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
973                 sample_layout == nil_image->sample_layout);
974          sample_layout = nil_image->sample_layout;
975          render->samples = image->vk.samples;
976 
977          uint64_t addr = nvk_image_plane_base_address(plane) + level->offset_B;
978 
979          if (nil_image->dim == NIL_IMAGE_DIM_3D) {
980             addr += nil_image_level_z_offset_B(nil_image,
981                                                iview->vk.base_mip_level,
982                                                iview->vk.base_array_layer);
983             assert(layer_count <= iview->vk.extent.depth);
984          } else {
985             addr += iview->vk.base_array_layer *
986                     (uint64_t)nil_image->array_stride_B;
987             assert(layer_count <= iview->vk.layer_count);
988          }
989 
990          P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
991          P_NV9097_SET_COLOR_TARGET_A(p, i, addr >> 32);
992          P_NV9097_SET_COLOR_TARGET_B(p, i, addr);
993 
994          if (level->tiling.gob_type != NIL_GOB_TYPE_LINEAR) {
995             const enum pipe_format p_format =
996                nvk_format_to_pipe_format(iview->vk.format);
997 
998             /* We use the stride for depth/stencil targets because the Z/S
999              * hardware has no concept of a tile width.  Instead, we just set
1000              * the width to the stride divided by bpp.
1001              */
1002             const uint32_t row_stride_el =
1003                level->row_stride_B / util_format_get_blocksize(p_format);
1004             P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, row_stride_el);
1005             P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1006             const uint8_t ct_format = nil_format_to_color_target(p_format);
1007             P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1008 
1009             P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1010                .block_width   = BLOCK_WIDTH_ONE_GOB,
1011                .block_height  = level->tiling.y_log2,
1012                .block_depth   = level->tiling.z_log2,
1013                .layout        = LAYOUT_BLOCKLINEAR,
1014                .third_dimension_control = (nil_image->dim == NIL_IMAGE_DIM_3D) ?
1015                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_DEPTH_SIZE :
1016                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1017             });
1018 
1019             P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1020             P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i,
1021                nil_image->array_stride_B >> 2);
1022             P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1023          } else {
1024             /* NVIDIA can only render to 2D linear images */
1025             assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1026             /* NVIDIA can only render to non-multisampled images */
1027             assert(sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1028             /* NVIDIA doesn't support linear array images */
1029             assert(iview->vk.base_array_layer == 0 && layer_count == 1);
1030 
1031             uint32_t pitch = level->row_stride_B;
1032             const enum pipe_format p_format =
1033                nvk_format_to_pipe_format(iview->vk.format);
1034             /* When memory layout is set to LAYOUT_PITCH, the WIDTH field
1035              * takes row pitch
1036              */
1037             P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, pitch);
1038             P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, level_extent_sa.height);
1039 
1040             const uint8_t ct_format = nil_format_to_color_target(p_format);
1041             P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, ct_format);
1042 
1043             P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1044                .layout = LAYOUT_PITCH,
1045                .third_dimension_control =
1046                   THIRD_DIMENSION_CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1047             });
1048 
1049             P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, 1);
1050             P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1051             P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1052          }
1053 
1054          P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), nil_image->compressed);
1055       } else {
1056          P_MTHD(p, NV9097, SET_COLOR_TARGET_A(i));
1057          P_NV9097_SET_COLOR_TARGET_A(p, i, 0);
1058          P_NV9097_SET_COLOR_TARGET_B(p, i, 0);
1059          P_NV9097_SET_COLOR_TARGET_WIDTH(p, i, 64);
1060          P_NV9097_SET_COLOR_TARGET_HEIGHT(p, i, 0);
1061          P_NV9097_SET_COLOR_TARGET_FORMAT(p, i, V_DISABLED);
1062          P_NV9097_SET_COLOR_TARGET_MEMORY(p, i, {
1063             .layout        = LAYOUT_BLOCKLINEAR,
1064          });
1065          P_NV9097_SET_COLOR_TARGET_THIRD_DIMENSION(p, i, layer_count);
1066          P_NV9097_SET_COLOR_TARGET_ARRAY_PITCH(p, i, 0);
1067          P_NV9097_SET_COLOR_TARGET_LAYER(p, i, 0);
1068 
1069          P_IMMD(p, NV9097, SET_COLOR_COMPRESSION(i), ENABLE_TRUE);
1070       }
1071    }
1072 
1073    if (render->depth_att.iview || render->stencil_att.iview) {
1074       struct nvk_image_view *iview = render->depth_att.iview ?
1075                                      render->depth_att.iview :
1076                                      render->stencil_att.iview;
1077       const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1078       /* Depth/stencil are always single-plane */
1079       assert(iview->plane_count == 1);
1080       const uint8_t ip = iview->planes[0].image_plane;
1081       struct nil_image nil_image = image->planes[ip].nil;
1082 
1083       uint64_t addr = nvk_image_base_address(image, ip);
1084       uint32_t mip_level = iview->vk.base_mip_level;
1085       uint32_t base_array_layer = iview->vk.base_array_layer;
1086 
1087       if (nil_image.dim == NIL_IMAGE_DIM_3D) {
1088          uint64_t level_offset_B;
1089          nil_image = nil_image_3d_level_as_2d_array(&nil_image, mip_level,
1090                                                     &level_offset_B);
1091          addr += level_offset_B;
1092          mip_level = 0;
1093          base_array_layer = 0;
1094          assert(layer_count <= iview->vk.extent.depth);
1095       } else {
1096          assert(layer_count <= iview->vk.layer_count);
1097       }
1098 
1099       const struct nil_image_level *level = &nil_image.levels[mip_level];
1100       addr += level->offset_B;
1101 
1102       assert(sample_layout == NIL_SAMPLE_LAYOUT_INVALID ||
1103              sample_layout == nil_image.sample_layout);
1104       sample_layout = nil_image.sample_layout;
1105       render->samples = image->vk.samples;
1106 
1107       P_MTHD(p, NV9097, SET_ZT_A);
1108       P_NV9097_SET_ZT_A(p, addr >> 32);
1109       P_NV9097_SET_ZT_B(p, addr);
1110       const enum pipe_format p_format =
1111          nvk_format_to_pipe_format(iview->vk.format);
1112       const uint8_t zs_format = nil_format_to_depth_stencil(p_format);
1113       P_NV9097_SET_ZT_FORMAT(p, zs_format);
1114       assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1115       assert(level->tiling.z_log2 == 0);
1116       P_NV9097_SET_ZT_BLOCK_SIZE(p, {
1117          .width = WIDTH_ONE_GOB,
1118          .height = level->tiling.y_log2,
1119          .depth = DEPTH_ONE_GOB,
1120       });
1121       P_NV9097_SET_ZT_ARRAY_PITCH(p, nil_image.array_stride_B >> 2);
1122 
1123       P_IMMD(p, NV9097, SET_ZT_SELECT, 1 /* target_count */);
1124 
1125       struct nil_Extent4D_Samples level_extent_sa =
1126          nil_image_level_extent_sa(&nil_image, mip_level);
1127 
1128       /* We use the stride for depth/stencil targets because the Z/S hardware
1129        * has no concept of a tile width.  Instead, we just set the width to
1130        * the stride divided by bpp.
1131        */
1132       const uint32_t row_stride_el =
1133          level->row_stride_B / util_format_get_blocksize(p_format);
1134 
1135       P_MTHD(p, NV9097, SET_ZT_SIZE_A);
1136       P_NV9097_SET_ZT_SIZE_A(p, row_stride_el);
1137       P_NV9097_SET_ZT_SIZE_B(p, level_extent_sa.height);
1138       P_NV9097_SET_ZT_SIZE_C(p, {
1139          .third_dimension  = base_array_layer + layer_count,
1140          .control          = CONTROL_THIRD_DIMENSION_DEFINES_ARRAY_SIZE,
1141       });
1142 
1143       P_IMMD(p, NV9097, SET_ZT_LAYER, base_array_layer);
1144 
1145       P_IMMD(p, NV9097, SET_Z_COMPRESSION, nil_image.compressed);
1146 
1147       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1148          P_IMMD(p, NVC597, SET_ZT_SPARSE, {
1149             .enable = ENABLE_FALSE,
1150          });
1151       }
1152    } else {
1153       P_IMMD(p, NV9097, SET_ZT_SELECT, 0 /* target_count */);
1154    }
1155 
1156    if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
1157       assert(render->fsr_att.iview == NULL);
1158    } else if (render->fsr_att.iview != NULL) {
1159       const struct nvk_image_view *iview = render->fsr_att.iview;
1160       const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1161 
1162       /* Fragment shading rate images are always single-plane */
1163       assert(iview->plane_count == 1);
1164       const uint8_t ip = iview->planes[0].image_plane;
1165       const struct nil_image *nil_image = &image->planes[ip].nil;
1166 
1167       /* Fragment shading rate images are always 2D */
1168       assert(nil_image->dim == NIL_IMAGE_DIM_2D);
1169       assert(nil_image->sample_layout == NIL_SAMPLE_LAYOUT_1X1);
1170 
1171       uint64_t addr = nvk_image_base_address(image, ip);
1172       uint32_t mip_level = iview->vk.base_mip_level;
1173       struct nil_Extent4D_Samples level_extent_sa =
1174          nil_image_level_extent_sa(nil_image, mip_level);
1175 
1176       const struct nil_image_level *level = &nil_image->levels[mip_level];
1177       addr += level->offset_B;
1178 
1179       P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1180       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, addr >> 32);
1181       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, addr);
1182       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, {
1183          .width = level_extent_sa.width,
1184          .height = level_extent_sa.height,
1185       });
1186       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0,
1187          iview->vk.layer_count + iview->vk.base_array_layer);
1188       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0,
1189          iview->vk.base_array_layer);
1190       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0,
1191          nil_image->array_stride_B >> 2);
1192       assert(level->tiling.gob_type != NIL_GOB_TYPE_LINEAR);
1193       assert(level->tiling.z_log2 == 0);
1194       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, {
1195          .width = WIDTH_ONE_GOB,
1196          .height = level->tiling.y_log2,
1197          .depth = DEPTH_ONE_GOB,
1198       });
1199 
1200       const enum pipe_format p_format =
1201          nvk_format_to_pipe_format(iview->vk.format);
1202       const uint32_t row_stride_el =
1203          level->row_stride_B / util_format_get_blocksize(p_format);
1204       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0,
1205          row_stride_el);
1206    } else {
1207       P_MTHD(p, NVC597, SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(0));
1208       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_A(p, 0, 0);
1209       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ADDRESS_B(p, 0, 0);
1210       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_A(p, 0, { });
1211       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_SIZE_B(p, 0, 0);
1212       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_LAYER(p, 0, 0);
1213       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ARRAY_PITCH(p, 0, 0);
1214       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_BLOCK_SIZE(p, 0, { });
1215       P_NVC597_SET_SHADING_RATE_INDEX_SURFACE_ALLOCATED_SIZE(p, 0, 0);
1216    }
1217 
1218    /* From the Vulkan 1.3.275 spec:
1219     *
1220     *    "It is legal for a subpass to use no color or depth/stencil
1221     *    attachments, either because it has no attachment references or
1222     *    because all of them are VK_ATTACHMENT_UNUSED. This kind of subpass
1223     *    can use shader side effects such as image stores and atomics to
1224     *    produce an output. In this case, the subpass continues to use the
1225     *    width, height, and layers of the framebuffer to define the dimensions
1226     *    of the rendering area, and the rasterizationSamples from each
1227     *    pipeline’s VkPipelineMultisampleStateCreateInfo to define the number
1228     *    of samples used in rasterization;"
1229     *
1230     * In the case where we have attachments, we emit SET_ANTI_ALIAS here
1231     * because SET_COLOR_TARGET_* and SET_ZT_* don't have any other way of
1232     * specifying the sample layout and we want to ensure it matches.  When
1233     * we don't have any attachments, we defer SET_ANTI_ALIAS to draw time
1234     * where we base it on dynamic rasterizationSamples.
1235     */
1236    if (sample_layout != NIL_SAMPLE_LAYOUT_INVALID)
1237       nvk_cmd_set_sample_layout(cmd, sample_layout);
1238 
1239    if (render->flags & VK_RENDERING_RESUMING_BIT)
1240       return;
1241 
1242    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1243       const struct nvk_image_view *iview = render->color_att[i].iview;
1244       if (iview == NULL)
1245          continue;
1246 
1247       const struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1248       assert(iview->plane_count == 1);
1249       const uint8_t ip = iview->planes[0].image_plane;
1250       const struct nvk_image_plane *plane = &image->planes[ip];
1251 
1252       const VkAttachmentLoadOp load_op =
1253          pRenderingInfo->pColorAttachments[i].loadOp;
1254       if (!render->all_linear &&
1255           plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1256           load_op == VK_ATTACHMENT_LOAD_OP_LOAD)
1257          nvk_linear_render_copy(cmd, iview, render->area, true);
1258    }
1259 
1260    uint32_t clear_count = 0;
1261    VkClearAttachment clear_att[NVK_MAX_RTS + 1];
1262    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
1263       const VkRenderingAttachmentInfo *att_info =
1264          &pRenderingInfo->pColorAttachments[i];
1265       if (att_info->imageView == VK_NULL_HANDLE ||
1266           att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1267          continue;
1268 
1269       clear_att[clear_count++] = (VkClearAttachment) {
1270          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1271          .colorAttachment = i,
1272          .clearValue = att_info->clearValue,
1273       };
1274    }
1275 
1276    clear_att[clear_count] = (VkClearAttachment) { .aspectMask = 0, };
1277    if (pRenderingInfo->pDepthAttachment != NULL &&
1278        pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE &&
1279        pRenderingInfo->pDepthAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1280       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
1281       clear_att[clear_count].clearValue.depthStencil.depth =
1282          pRenderingInfo->pDepthAttachment->clearValue.depthStencil.depth;
1283    }
1284    if (pRenderingInfo->pStencilAttachment != NULL &&
1285        pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE &&
1286        pRenderingInfo->pStencilAttachment->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1287       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
1288       clear_att[clear_count].clearValue.depthStencil.stencil =
1289          pRenderingInfo->pStencilAttachment->clearValue.depthStencil.stencil;
1290    }
1291    if (clear_att[clear_count].aspectMask != 0)
1292       clear_count++;
1293 
1294    if (clear_count > 0) {
1295       const VkClearRect clear_rect = {
1296          .rect = render->area,
1297          .baseArrayLayer = 0,
1298          .layerCount = render->view_mask ? 1 : render->layer_count,
1299       };
1300 
1301       p = nvk_cmd_buffer_push(cmd, 2);
1302       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1303       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_ALWAYS_RENDER);
1304 
1305       nvk_CmdClearAttachments(nvk_cmd_buffer_to_handle(cmd),
1306                               clear_count, clear_att, 1, &clear_rect);
1307       p = nvk_cmd_buffer_push(cmd, 2);
1308       P_MTHD(p, NV9097, SET_RENDER_ENABLE_OVERRIDE);
1309       P_NV9097_SET_RENDER_ENABLE_OVERRIDE(p, MODE_USE_RENDER_ENABLE);
1310    }
1311 
1312    /* TODO: Attachment clears */
1313 }
1314 
1315 VKAPI_ATTR void VKAPI_CALL
1316 nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
1317 {
1318    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
1319    struct nvk_rendering_state *render = &cmd->state.gfx.render;
1320 
1321    if (!(render->flags & VK_RENDERING_SUSPENDING_BIT)) {
1322       for (uint32_t i = 0; i < render->color_att_count; i++) {
1323          struct nvk_image_view *iview = render->color_att[i].iview;
1324          if (iview == NULL)
1325             continue;
1326 
1327          struct nvk_image *image = (struct nvk_image *)iview->vk.image;
1328          const uint8_t ip = iview->planes[0].image_plane;
1329          const struct nvk_image_plane *plane = &image->planes[ip];
1330          if (!render->all_linear &&
1331              plane->nil.levels[0].tiling.gob_type == NIL_GOB_TYPE_LINEAR &&
1332              render->color_att[i].store_op == VK_ATTACHMENT_STORE_OP_STORE)
1333             nvk_linear_render_copy(cmd, iview, render->area, false);
1334       }
1335    }
1336 
1337    bool need_resolve = false;
1338 
1339    /* Translate render state back to VK for meta */
1340    VkRenderingAttachmentInfo vk_color_att[NVK_MAX_RTS];
1341    for (uint32_t i = 0; i < render->color_att_count; i++) {
1342       if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
1343          need_resolve = true;
1344 
1345       vk_color_att[i] = (VkRenderingAttachmentInfo) {
1346          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1347          .imageView = nvk_image_view_to_handle(render->color_att[i].iview),
1348          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1349          .resolveMode = render->color_att[i].resolve_mode,
1350          .resolveImageView =
1351             nvk_image_view_to_handle(render->color_att[i].resolve_iview),
1352          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1353       };
1354    }
1355 
1356    const VkRenderingAttachmentInfo vk_depth_att = {
1357       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1358       .imageView = nvk_image_view_to_handle(render->depth_att.iview),
1359       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1360       .resolveMode = render->depth_att.resolve_mode,
1361       .resolveImageView =
1362          nvk_image_view_to_handle(render->depth_att.resolve_iview),
1363       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1364    };
1365    if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1366       need_resolve = true;
1367 
1368    const VkRenderingAttachmentInfo vk_stencil_att = {
1369       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1370       .imageView = nvk_image_view_to_handle(render->stencil_att.iview),
1371       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1372       .resolveMode = render->stencil_att.resolve_mode,
1373       .resolveImageView =
1374          nvk_image_view_to_handle(render->stencil_att.resolve_iview),
1375       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1376    };
1377    if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
1378       need_resolve = true;
1379 
1380    const VkRenderingInfo vk_render = {
1381       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1382       .renderArea = render->area,
1383       .layerCount = render->layer_count,
1384       .viewMask = render->view_mask,
1385       .colorAttachmentCount = render->color_att_count,
1386       .pColorAttachments = vk_color_att,
1387       .pDepthAttachment = &vk_depth_att,
1388       .pStencilAttachment = &vk_stencil_att,
1389    };
1390 
1391    if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1392       need_resolve = false;
1393 
1394    memset(render, 0, sizeof(*render));
1395 
1396    if (need_resolve) {
1397       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1398       P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
1399          .lines = LINES_ALL,
1400       });
1401 
1402       nvk_meta_resolve_rendering(cmd, &vk_render);
1403    }
1404 }
1405 
1406 void
1407 nvk_cmd_bind_graphics_shader(struct nvk_cmd_buffer *cmd,
1408                              const gl_shader_stage stage,
1409                              struct nvk_shader *shader)
1410 {
1411    assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1412    if (cmd->state.gfx.shaders[stage] == shader)
1413       return;
1414 
1415    cmd->state.gfx.shaders[stage] = shader;
1416    cmd->state.gfx.shaders_dirty |= mesa_to_vk_shader_stage(stage);
1417 }
1418 
1419 uint32_t
1420 nvk_mme_tess_params(enum nak_ts_domain domain,
1421                     enum nak_ts_spacing spacing,
1422                     enum nak_ts_prims prims)
1423 {
1424    /* This is laid out the same as SET_TESSELLATION_PARAMETERS, only with an
1425     * extra bit for lower_left
1426     */
1427    uint16_t params = ((uint16_t)domain << 0) |
1428                      ((uint16_t)spacing << 4) |
1429                      ((uint16_t)prims << 8);
1430    return nvk_mme_val_mask(params, 0x0fff);
1431 }
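
/* Editor's note -- illustrative example, not from the original source.
 * Assuming nvk_mme_val_mask(val, mask) packs the value in the low 16 bits
 * and the mask in the high 16 bits, and using the enum values implied by
 * the test vectors below (domain TRIANGLE = 1, spacing INTEGER = 0, prims
 * TRIANGLES_CW = 2), this helper packs params = 0x201 and returns
 * 0x0fff0201: only the 12 parameter bits of the scratch state are updated,
 * leaving the lower_left bit (bit 12) managed by nvk_mme_tess_lower_left()
 * untouched.
 */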
1432 
1433 static uint32_t
1434 nvk_mme_tess_lower_left(bool lower_left)
1435 {
1436    return nvk_mme_val_mask((uint16_t)lower_left << 12, 1u << 12);
1437 }
1438 
1439 void
1440 nvk_mme_set_tess_params(struct mme_builder *b)
1441 {
1442    struct mme_value val_mask = mme_load(b);
1443    struct mme_value old_params = nvk_mme_load_scratch(b, TESS_PARAMS);
1444    struct mme_value params = nvk_mme_set_masked(b, old_params, val_mask);
1445    mme_free_reg(b, val_mask);
1446 
1447    mme_if(b, ine, params, old_params) {
1448       nvk_mme_store_scratch(b, TESS_PARAMS, params);
1449 
1450       /* lower_left lives at bit 12 */
1451       struct mme_value lower_left = mme_merge(b, mme_zero(), params, 0, 1, 12);
1452 
1453       /* Only the bottom 12 bits are valid to put in HW */
1454       mme_merge_to(b, params, mme_zero(), params, 0, 12, 0);
1455 
1456       /* If we're using a lower-left orientation, we need to flip triangles
1457        * between CW and CCW.
1458        */
1459       mme_if(b, ine, lower_left, mme_zero()) {
1460          struct mme_value prims_cw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CW);
1461          struct mme_value prims_ccw = mme_imm(NAK_TS_PRIMS_TRIANGLES_CCW);
1462 
1463          struct mme_value prims = mme_merge(b, mme_zero(), params, 0, 4, 8);
1464          mme_if(b, ieq, prims, prims_cw) {
1465             mme_merge_to(b, params, params, prims_ccw, 8, 4, 0);
1466          }
1467          mme_if(b, ieq, prims, prims_ccw) {
1468             mme_merge_to(b, params, params, prims_cw, 8, 4, 0);
1469          }
1470          mme_free_reg(b, prims);
1471       }
1472       mme_free_reg(b, lower_left);
1473 
1474       mme_mthd(b, NV9097_SET_TESSELLATION_PARAMETERS);
1475       mme_emit(b, params);
1476    }
1477 }
1478 
1479 const struct nvk_mme_test_case nvk_mme_set_tess_params_tests[] = {{
1480    /* This case doesn't change the state so it should do nothing */
1481    .init = (struct nvk_mme_mthd_data[]) {
1482       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1483       { }
1484    },
1485    .params = (uint32_t[]) { 0xffff0000 },
1486    .expected = (struct nvk_mme_mthd_data[]) {
1487       { }
1488    },
1489 }, {
1490    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = false */
1491    .init = (struct nvk_mme_mthd_data[]) {
1492       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0 },
1493       { }
1494    },
1495    .params = (uint32_t[]) { 0xffff0201 },
1496    .expected = (struct nvk_mme_mthd_data[]) {
1497       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1498       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1499       { }
1500    },
1501 }, {
1502    /* TRIANGLE, INTEGER, TRIANGLES_CW, lower_left = true */
1503    .init = (struct nvk_mme_mthd_data[]) {
1504       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0201 },
1505       { }
1506    },
1507    .params = (uint32_t[]) { 0x10001000 },
1508    .expected = (struct nvk_mme_mthd_data[]) {
1509       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1201 },
1510       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0301 },
1511       { }
1512    },
1513 }, {
1514    /* TRIANGLE, INTEGER, TRIANGLES_CCW, lower_left = true */
1515    .init = (struct nvk_mme_mthd_data[]) {
1516       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x0301 },
1517       { }
1518    },
1519    .params = (uint32_t[]) { 0x10001000 },
1520    .expected = (struct nvk_mme_mthd_data[]) {
1521       { NVK_SET_MME_SCRATCH(TESS_PARAMS), 0x1301 },
1522       { NV9097_SET_TESSELLATION_PARAMETERS, 0x0201 },
1523       { }
1524    },
1525 }, {}};
1526 
1527 void
1528 nvk_cmd_flush_gfx_shaders(struct nvk_cmd_buffer *cmd)
1529 {
1530    if (cmd->state.gfx.shaders_dirty == 0)
1531       return;
1532 
1533    /* Map shader types to shaders */
1534    struct nvk_shader *type_shader[6] = { NULL, };
1535    uint32_t types_dirty = 0;
1536 
1537    u_foreach_bit(s, cmd->state.gfx.shaders_dirty &
1538                     NVK_SHADER_STAGE_GRAPHICS_BITS) {
1539       gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1540       uint32_t type = mesa_to_nv9097_shader_type(stage);
1541       types_dirty |= BITFIELD_BIT(type);
1542 
1543       /* Only copy non-NULL shaders because mesh/task alias with vertex and
1544        * tessellation stages.
1545        */
1546       struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
1547       if (shader != NULL) {
1548          assert(type < ARRAY_SIZE(type_shader));
1549          assert(type_shader[type] == NULL);
1550          type_shader[type] = shader;
1551 
1552          const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
1553          struct nvk_cbuf_group *cbuf_group =
1554             &cmd->state.gfx.cbuf_groups[nvk_cbuf_binding_for_stage(stage)];
1555          for (uint32_t i = 0; i < cbuf_map->cbuf_count; i++) {
1556             if (memcmp(&cbuf_group->cbufs[i], &cbuf_map->cbufs[i],
1557                        sizeof(cbuf_group->cbufs[i])) != 0) {
1558                cbuf_group->cbufs[i] = cbuf_map->cbufs[i];
1559                cbuf_group->dirty |= BITFIELD_BIT(i);
1560             }
1561          }
1562       }
1563    }
1564 
1565    u_foreach_bit(type, types_dirty) {
1566       struct nvk_shader *shader = type_shader[type];
1567       if (shader == NULL) {
1568          struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1569          P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
1570             .enable  = ENABLE_FALSE,
1571             .type    = type,
1572          });
1573       } else {
1574          struct nv_push *p = nvk_cmd_buffer_push(cmd, shader->push_dw_count);
1575          nv_push_raw(p, shader->push_dw, shader->push_dw_count);
1576       }
1577    }
1578 
1579    if (cmd->state.gfx.shaders_dirty & NVK_SHADER_STAGE_VTGM_BITS) {
1580       struct nvk_shader *last_vtgm = NULL;
1581       u_foreach_bit(s, NVK_SHADER_STAGE_VTGM_BITS) {
1582          gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
1583          if (cmd->state.gfx.shaders[stage] != NULL)
1584             last_vtgm = cmd->state.gfx.shaders[stage];
1585       }
1586 
1587       assert(last_vtgm->vtgm_push_dw_count > last_vtgm->push_dw_count);
1588       const uint16_t dw_start = last_vtgm->push_dw_count;
1589       const uint16_t dw_count = last_vtgm->vtgm_push_dw_count - dw_start;
1590       struct nv_push *p = nvk_cmd_buffer_push(cmd, dw_count);
1591       nv_push_raw(p, &last_vtgm->push_dw[dw_start], dw_count);
1592    }
1593 
1594    cmd->state.gfx.shaders_dirty = 0;
1595 }
1596 
1597 void
1598 nvk_mme_set_vb_enables(struct mme_builder *b)
1599 {
1600    struct mme_value enables = mme_load(b);
1601    struct mme_value old_enables = nvk_mme_load_scratch(b, VB_ENABLES);
1602    nvk_mme_store_scratch(b, VB_ENABLES, enables);
1603 
1604    struct mme_value changed = mme_xor(b, enables, old_enables);
1605    mme_free_reg(b, old_enables);
1606 
1607    struct mme_value vb_idx4 = mme_mov(b, mme_zero());
1608    mme_while(b, ine, changed, mme_zero()) {
1609       mme_if(b, ine, mme_and(b, changed, mme_imm(1)), mme_zero()) {
1610          struct mme_value state =
1611             mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1612          mme_merge_to(b, state, state, enables, 12, 1, 0);
1613          mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1614          mme_emit(b, state);
1615       }
1616       mme_add_to(b, vb_idx4, vb_idx4, mme_imm(4));
1617       mme_srl_to(b, changed, changed, mme_imm(1));
1618       mme_srl_to(b, enables, enables, mme_imm(1));
1619    }
1620 }
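
/* Editor's note -- illustrative, not from the original source: the macro
 * above walks the XOR of the old and new enable masks, and for every vertex
 * buffer whose enable changed it read-modify-writes bit 12 of the
 * corresponding SET_VERTEX_STREAM_A_FORMAT register.  The method-array
 * index advances by 4 per buffer, presumably because each vertex-stream
 * register group spans four dwords.
 */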
1621 
1622 static uint32_t
1623 nvk_mme_vb_stride(uint32_t vb_idx, uint32_t stride)
1624 {
1625    assert(stride < (1 << 12));
1626    assert(vb_idx < (1 << 5));
1627    return (vb_idx << 16) | stride;
1628 }
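
/* Editor's note -- illustrative example, not from the original source:
 * nvk_mme_vb_stride(3, 16) packs to (3 << 16) | 16 == 0x00030010, i.e. the
 * stride in the low bits and the vertex-buffer index in the upper half,
 * which is exactly the layout nvk_mme_set_vb_stride() below unpacks.
 */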
1629 
1630 void
1631 nvk_mme_set_vb_stride(struct mme_builder *b)
1632 {
1633    /* Param is laid out as
1634     *
1635     *    bits 0..11  : stride
1636     *    bits 16..20 : VB index
1637     */
1638    struct mme_value param = mme_load(b);
1639 
1640    struct mme_value vb_idx4 = mme_merge(b, mme_zero(), param, 2, 5, 16);
1641 
1642    struct mme_value state =
1643       mme_state_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1644    struct mme_value new_state = mme_merge(b, state, param, 0, 12, 0);
1645    mme_if(b, ine, state, new_state) {
1646       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_FORMAT(0), vb_idx4);
1647       mme_emit(b, new_state);
1648    }
1649 }
1650 
1651 static void
1652 nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)
1653 {
1654    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1655    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1656    const struct vk_dynamic_graphics_state *dyn =
1657       &cmd->vk.dynamic_graphics_state;
1658 
1659    struct nv_push *p = nvk_cmd_buffer_push(cmd, 258);
1660 
1661    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1662       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_ENABLES));
1663       P_INLINE_DATA(p, dyn->vi->bindings_valid);
1664    }
1665 
1666    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
1667        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
1668       u_foreach_bit(a, dyn->vi->attributes_valid) {
1669          const struct nvk_va_format *fmt =
1670             nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
1671 
1672          P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
1673             .stream                 = dyn->vi->attributes[a].binding,
1674             .offset                 = dyn->vi->attributes[a].offset,
1675             .component_bit_widths   = fmt->bit_widths,
1676             .numerical_type         = fmt->type,
1677             .swap_r_and_b           = fmt->swap_rb,
1678          });
1679       }
1680 
1681       u_foreach_bit(b, dyn->vi->bindings_valid) {
1682          const bool instanced = dyn->vi->bindings[b].input_rate ==
1683                                 VK_VERTEX_INPUT_RATE_INSTANCE;
1684          P_IMMD(p, NV9097, SET_VERTEX_STREAM_INSTANCE_A(b), instanced);
1685          P_IMMD(p, NV9097, SET_VERTEX_STREAM_A_FREQUENCY(b),
1686             dyn->vi->bindings[b].divisor);
1687       }
1688    }
1689 
1690    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
1691        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES)) {
1692       u_foreach_bit(b, dyn->vi->bindings_valid) {
1693          assert(dyn->vi_binding_strides[b] < (1 << 12));
1694          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE));
1695          P_INLINE_DATA(p, nvk_mme_vb_stride(b, dyn->vi_binding_strides[b]));
1696       }
1697    }
1698 }
1699 
1700 static uint32_t
1701 vk_to_nv9097_primitive_topology(VkPrimitiveTopology prim)
1702 {
1703    switch (prim) {
1704    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
1705       return NV9097_BEGIN_OP_POINTS;
1706    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
1707       return NV9097_BEGIN_OP_LINES;
1708    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
1709       return NV9097_BEGIN_OP_LINE_STRIP;
1710    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
1711 #pragma GCC diagnostic push
1712 #pragma GCC diagnostic ignored "-Wswitch"
1713    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
1714 #pragma GCC diagnostic pop
1715       return NV9097_BEGIN_OP_TRIANGLES;
1716    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
1717       return NV9097_BEGIN_OP_TRIANGLE_STRIP;
1718    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
1719       return NV9097_BEGIN_OP_TRIANGLE_FAN;
1720    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1721       return NV9097_BEGIN_OP_LINELIST_ADJCY;
1722    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1723       return NV9097_BEGIN_OP_LINESTRIP_ADJCY;
1724    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1725       return NV9097_BEGIN_OP_TRIANGLELIST_ADJCY;
1726    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1727       return NV9097_BEGIN_OP_TRIANGLESTRIP_ADJCY;
1728    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
1729       return NV9097_BEGIN_OP_PATCH;
1730    default:
1731       unreachable("Invalid primitive topology");
1732    }
1733 }
1734 
1735 static void
1736 nvk_flush_ia_state(struct nvk_cmd_buffer *cmd)
1737 {
1738    const struct vk_dynamic_graphics_state *dyn =
1739       &cmd->vk.dynamic_graphics_state;
1740 
1741    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
1742       uint32_t begin;
1743       V_NV9097_BEGIN(begin, {
1744          .op = vk_to_nv9097_primitive_topology(dyn->ia.primitive_topology),
1745          .primitive_id = NV9097_BEGIN_PRIMITIVE_ID_FIRST,
1746          .instance_id = NV9097_BEGIN_INSTANCE_ID_FIRST,
1747          .split_mode = SPLIT_MODE_NORMAL_BEGIN_NORMAL_END,
1748       });
1749 
1750       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1751       P_MTHD(p, NV9097, SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_DRAW_BEGIN));
1752       P_INLINE_DATA(p, begin);
1753    }
1754 
1755    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
1756       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
1757       P_IMMD(p, NV9097, SET_DA_PRIMITIVE_RESTART,
1758              dyn->ia.primitive_restart_enable);
1759    }
1760 }
1761 
1762 static void
1763 nvk_flush_ts_state(struct nvk_cmd_buffer *cmd)
1764 {
1765    const struct vk_dynamic_graphics_state *dyn =
1766       &cmd->vk.dynamic_graphics_state;
1767    struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
1768 
1769    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) {
1770       /* The hardware gets grumpy if we set this to 0 so make sure we set it
1771        * to at least 1 in case it's dirty but uninitialized.
1772        */
1773       P_IMMD(p, NV9097, SET_PATCH, MAX2(1, dyn->ts.patch_control_points));
1774    }
1775 
1776    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
1777       P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
1778       P_INLINE_DATA(p, nvk_mme_tess_lower_left(
1779          dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT));
1780    }
1781 }
1782 
1783 static void
1784 nvk_flush_vp_state(struct nvk_cmd_buffer *cmd)
1785 {
1786    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
1787    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1788 
1789    const struct vk_dynamic_graphics_state *dyn =
1790       &cmd->vk.dynamic_graphics_state;
1791 
1792    struct nv_push *p =
1793       nvk_cmd_buffer_push(cmd, 18 * dyn->vp.viewport_count + 4 * NVK_MAX_VIEWPORTS);
1794 
1795    /* Nothing to do for MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT */
1796 
1797    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
1798        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
1799        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLAMP_RANGE)) {
1800       for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1801          const VkViewport *vp = &dyn->vp.viewports[i];
1802 
1803          /* These exactly match the spec values.  Nvidia hardware oddities
1804           * are accounted for later.
1805           */
1806          const float o_x = vp->x + 0.5f * vp->width;
1807          const float o_y = vp->y + 0.5f * vp->height;
1808          const float o_z = !dyn->vp.depth_clip_negative_one_to_one ?
1809                            vp->minDepth :
1810                            (vp->maxDepth + vp->minDepth) * 0.5f;
1811 
1812          const float p_x = vp->width;
1813          const float p_y = vp->height;
1814          const float p_z = !dyn->vp.depth_clip_negative_one_to_one ?
1815                            vp->maxDepth - vp->minDepth :
1816                            (vp->maxDepth - vp->minDepth) * 0.5f;
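
         /* Editor's note -- worked example, not from the original source:
          * for vp = { x = 0, y = 0, width = 1920, height = 1080,
          * minDepth = 0.0, maxDepth = 1.0 } with the default [0,1] depth
          * clip, this yields o = (960, 540, 0) and p = (1920, 1080, 1), so
          * the hardware viewport scale below is (960, 540, 1) and the
          * offset is (960, 540, 0).
          */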
1817 
1818          P_MTHD(p, NV9097, SET_VIEWPORT_SCALE_X(i));
1819          P_NV9097_SET_VIEWPORT_SCALE_X(p, i, fui(0.5f * p_x));
1820          P_NV9097_SET_VIEWPORT_SCALE_Y(p, i, fui(0.5f * p_y));
1821          P_NV9097_SET_VIEWPORT_SCALE_Z(p, i, fui(p_z));
1822 
1823          P_NV9097_SET_VIEWPORT_OFFSET_X(p, i, fui(o_x));
1824          P_NV9097_SET_VIEWPORT_OFFSET_Y(p, i, fui(o_y));
1825          P_NV9097_SET_VIEWPORT_OFFSET_Z(p, i, fui(o_z));
1826 
1827          const bool user_defined_range =
1828             dyn->vp.depth_clamp_mode == VK_DEPTH_CLAMP_MODE_USER_DEFINED_RANGE_EXT;
1829          float xmin = vp->x;
1830          float xmax = vp->x + vp->width;
1831          float ymin = MIN2(vp->y, vp->y + vp->height);
1832          float ymax = MAX2(vp->y, vp->y + vp->height);
1833          float zmin = user_defined_range ?
1834                       dyn->vp.depth_clamp_range.minDepthClamp :
1835                       MIN2(vp->minDepth, vp->maxDepth);
1836          float zmax = user_defined_range ?
1837                       dyn->vp.depth_clamp_range.maxDepthClamp :
1838                       MAX2(vp->minDepth, vp->maxDepth);
1839          assert(xmin <= xmax && ymin <= ymax && zmin <= zmax);
1840 
1841          const float max_dim = (float)0xffff;
1842          xmin = CLAMP(xmin, 0, max_dim);
1843          xmax = CLAMP(xmax, 0, max_dim);
1844          ymin = CLAMP(ymin, 0, max_dim);
1845          ymax = CLAMP(ymax, 0, max_dim);
1846 
1847          if (!dev->vk.enabled_extensions.EXT_depth_range_unrestricted) {
1848             assert(0.0 <= zmin && zmin <= 1.0);
1849             assert(0.0 <= zmax && zmax <= 1.0);
1850          }
1851 
1852          P_MTHD(p, NV9097, SET_VIEWPORT_CLIP_HORIZONTAL(i));
1853          P_NV9097_SET_VIEWPORT_CLIP_HORIZONTAL(p, i, {
1854             .x0      = xmin,
1855             .width   = xmax - xmin,
1856          });
1857          P_NV9097_SET_VIEWPORT_CLIP_VERTICAL(p, i, {
1858             .y0      = ymin,
1859             .height  = ymax - ymin,
1860          });
1861 
1862          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
1863             P_NV9097_SET_VIEWPORT_CLIP_MIN_Z(p, i, fui(zmin));
1864             P_NV9097_SET_VIEWPORT_CLIP_MAX_Z(p, i, fui(zmax));
1865          } else {
1866             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_VIEWPORT_MIN_MAX_Z));
1867             P_INLINE_DATA(p, i);
1868             P_INLINE_DATA(p, fui(zmin));
1869             P_INLINE_DATA(p, fui(zmax));
1870          }
1871 
1872          if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
1873             P_IMMD(p, NVB197, SET_VIEWPORT_COORDINATE_SWIZZLE(i), {
1874                .x = X_POS_X,
1875                .y = Y_POS_Y,
1876                .z = Z_POS_Z,
1877                .w = W_POS_W,
1878             });
1879          }
1880       }
1881    }
1882 
1883    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
1884       P_IMMD(p, NV9097, SET_VIEWPORT_Z_CLIP,
1885              dyn->vp.depth_clip_negative_one_to_one ?
1886              RANGE_NEGATIVE_W_TO_POSITIVE_W :
1887              RANGE_ZERO_TO_POSITIVE_W);
1888    }
1889 
1890    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT)) {
1891       for (unsigned i = dyn->vp.scissor_count; i < NVK_MAX_VIEWPORTS; i++)
1892          P_IMMD(p, NV9097, SET_SCISSOR_ENABLE(i), V_FALSE);
1893    }
1894 
1895    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
1896       const uint32_t sr_max =
1897          nvk_image_max_dimension(&pdev->info, VK_IMAGE_TYPE_2D);
1898 
1899       for (unsigned i = 0; i < dyn->vp.scissor_count; i++) {
1900          const VkRect2D *s = &dyn->vp.scissors[i];
1901 
1902          const uint32_t xmin = MIN2(sr_max, s->offset.x);
1903          const uint32_t xmax = MIN2(sr_max, s->offset.x + s->extent.width);
1904          const uint32_t ymin = MIN2(sr_max, s->offset.y);
1905          const uint32_t ymax = MIN2(sr_max, s->offset.y + s->extent.height);
1906 
1907          P_MTHD(p, NV9097, SET_SCISSOR_ENABLE(i));
1908          P_NV9097_SET_SCISSOR_ENABLE(p, i, V_TRUE);
1909          P_NV9097_SET_SCISSOR_HORIZONTAL(p, i, {
1910             .xmin = xmin,
1911             .xmax = xmax,
1912          });
1913          P_NV9097_SET_SCISSOR_VERTICAL(p, i, {
1914             .ymin = ymin,
1915             .ymax = ymax,
1916          });
1917       }
1918    }
1919 }
1920 
1921 static uint32_t
1922 vk_to_nv9097_polygon_mode(VkPolygonMode vk_mode)
1923 {
1924    ASSERTED uint16_t vk_to_nv9097[] = {
1925       [VK_POLYGON_MODE_FILL]  = NV9097_SET_FRONT_POLYGON_MODE_V_FILL,
1926       [VK_POLYGON_MODE_LINE]  = NV9097_SET_FRONT_POLYGON_MODE_V_LINE,
1927       [VK_POLYGON_MODE_POINT] = NV9097_SET_FRONT_POLYGON_MODE_V_POINT,
1928    };
1929    assert(vk_mode < ARRAY_SIZE(vk_to_nv9097));
1930 
1931    uint32_t nv9097_mode = 0x1b00 | (2 - vk_mode);
1932    assert(nv9097_mode == vk_to_nv9097[vk_mode]);
1933    return nv9097_mode;
1934 }
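
/* Editor's note -- illustrative, not from the original source: the bit
 * trick works because the NV9097 values are consecutive in the reverse
 * order of the Vulkan enum: FILL (0) -> 0x1b02, LINE (1) -> 0x1b01 and
 * POINT (2) -> 0x1b00, which is exactly what the assert against the lookup
 * table verifies.
 */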
1935 
1936 static uint32_t
1937 vk_to_nv9097_cull_mode(VkCullModeFlags vk_cull_mode)
1938 {
1939    static const uint16_t vk_to_nv9097[] = {
1940       [VK_CULL_MODE_FRONT_BIT]      = NV9097_OGL_SET_CULL_FACE_V_FRONT,
1941       [VK_CULL_MODE_BACK_BIT]       = NV9097_OGL_SET_CULL_FACE_V_BACK,
1942       [VK_CULL_MODE_FRONT_AND_BACK] = NV9097_OGL_SET_CULL_FACE_V_FRONT_AND_BACK,
1943    };
1944    assert(vk_cull_mode < ARRAY_SIZE(vk_to_nv9097));
1945    return vk_to_nv9097[vk_cull_mode];
1946 }
1947 
1948 static uint32_t
1949 vk_to_nv9097_front_face(VkFrontFace vk_face)
1950 {
1951    /* Vulkan and OpenGL are backwards here because Vulkan assumes the D3D
1952     * convention in which framebuffer coordinates always start in the upper
1953     * left while OpenGL has framebuffer coordinates starting in the lower
1954     * left.  Therefore, we want the reverse of the hardware enum name.
1955     */
1956    ASSERTED static const uint16_t vk_to_nv9097[] = {
1957       [VK_FRONT_FACE_COUNTER_CLOCKWISE]   = NV9097_OGL_SET_FRONT_FACE_V_CCW,
1958       [VK_FRONT_FACE_CLOCKWISE]           = NV9097_OGL_SET_FRONT_FACE_V_CW,
1959    };
1960    assert(vk_face < ARRAY_SIZE(vk_to_nv9097));
1961 
1962    uint32_t nv9097_face = 0x900 | (1 - vk_face);
1963    assert(nv9097_face == vk_to_nv9097[vk_face]);
1964    return nv9097_face;
1965 }
1966 
1967 static uint32_t
1968 vk_to_nv9097_provoking_vertex(VkProvokingVertexModeEXT vk_mode)
1969 {
1970    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT ==
1971                  NV9097_SET_PROVOKING_VERTEX_V_FIRST);
1972    STATIC_ASSERT(VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT ==
1973                  NV9097_SET_PROVOKING_VERTEX_V_LAST);
1974    return vk_mode;
1975 }
1976 
1977 void
1978 nvk_mme_set_viewport_min_max_z(struct mme_builder *b)
1979 {
1980    struct mme_value vp_idx = mme_load(b);
1981    struct mme_value min_z = mme_load(b);
1982    struct mme_value max_z = mme_load(b);
1983 
1984    /* Multiply by 2 because it's an array with stride 8 */
1985    mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1986    mme_mthd_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), vp_idx);
1987    mme_emit(b, min_z);
1988    mme_emit(b, max_z);
1989 
1990    struct mme_value z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
1991    mme_if(b, ine, z_clamp, mme_zero()) {
1992       /* Multiply by 2 again because this array has stride 16 */
1993       mme_sll_to(b, vp_idx, vp_idx, mme_imm(1));
1994       mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), vp_idx);
1995       mme_emit(b, min_z);
1996       mme_emit(b, max_z);
1997    }
1998 }
1999 
2000 void
2001 nvk_mme_set_z_clamp(struct mme_builder *b)
2002 {
2003    struct mme_value z_clamp = mme_load(b);
2004    struct mme_value old_z_clamp = nvk_mme_load_scratch(b, Z_CLAMP);
2005    mme_if(b, ine, z_clamp, old_z_clamp) {
2006       nvk_mme_store_scratch(b, Z_CLAMP, z_clamp);
2007 
2008       mme_if(b, ine, z_clamp, mme_zero()) {
2009          struct mme_value i_2 = mme_mov(b, mme_zero());
2010          mme_while(b, ine, i_2, mme_imm(NVK_MAX_VIEWPORTS * 2)) {
2011             struct mme_value min_z =
2012                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MIN_Z), i_2);
2013             struct mme_value max_z =
2014                mme_state_arr(b, NVK_SET_MME_SCRATCH(VIEWPORT0_MAX_Z), i_2);
2015 
2016             struct mme_value i_4 = mme_sll(b, i_2, mme_imm(1));
2017             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2018             mme_emit(b, min_z);
2019             mme_emit(b, max_z);
2020 
2021             mme_free_reg(b, i_4);
2022             mme_free_reg(b, min_z);
2023             mme_free_reg(b, max_z);
2024 
2025             mme_add_to(b, i_2, i_2, mme_imm(2));
2026          }
2027          mme_free_reg(b, i_2);
2028       }
2029       mme_if(b, ieq, z_clamp, mme_zero()) {
2030          struct mme_value i_4 = mme_mov(b, mme_zero());
2031          mme_while(b, ine, i_4, mme_imm(NVK_MAX_VIEWPORTS * 4)) {
2032             mme_mthd_arr(b, NV9097_SET_VIEWPORT_CLIP_MIN_Z(0), i_4);
2033             mme_emit(b, mme_imm(fui(-INFINITY)));
2034             mme_emit(b, mme_imm(fui(INFINITY)));
2035 
2036             mme_add_to(b, i_4, i_4, mme_imm(4));
2037          }
2038          mme_free_reg(b, i_4);
2039       }
2040    }
2041 }
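
/* Editor's note -- summary, not from the original source: when Z clamping
 * is switched on, the macro above replays the per-viewport min/max Z values
 * saved in the VIEWPORT0_MIN_Z/MAX_Z scratch pairs (stride 2 dwords) into
 * the hardware VIEWPORT_CLIP_MIN/MAX_Z array (stride 4 dwords); when it is
 * switched off, it opens every viewport's clip range to [-INF, +INF] so the
 * guardband path programmed in nvk_flush_rs_state() handles clipping
 * instead.
 */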
2042 
2043 static void
2044 nvk_flush_rs_state(struct nvk_cmd_buffer *cmd)
2045 {
2046    const struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
2047    const struct vk_dynamic_graphics_state *dyn =
2048       &cmd->vk.dynamic_graphics_state;
2049    const struct nvk_rendering_state *render =
2050       &cmd->state.gfx.render;
2051 
2052    struct nv_push *p = nvk_cmd_buffer_push(cmd, 46);
2053 
2054    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
2055       P_IMMD(p, NV9097, SET_RASTER_ENABLE, !dyn->rs.rasterizer_discard_enable);
2056 
2057    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
2058        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE)) {
2059       const bool z_clamp = dyn->rs.depth_clamp_enable;
2060       const bool z_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2061       /* z_clamp_zero_one accounts for the interaction between
2062        * depthClampZeroOne and depthRangeUnrestricted as mentioned in the
2063        * Vulkan spec. depthClampZeroOne adds an additional clamp and doesn't
2064        * modify the clip/clamp threshold.  We are expected to clamp to [0,1]
2065        * when any one of these conditions is fulfilled:
2066        * - depth_range_unrestricted is not enabled
2067        * - depthClampZeroOne is enabled but depth
2068        *    format is not floating point or depthRangeUnrestricted
2069        *    is not enabled
2070        * - fixed point depth format
2071       */
2072       const bool z_clamp_zero_one =
2073          !vk_format_has_float_depth(render->depth_att.vk_format) ||
2074          (dev->vk.enabled_features.depthClampZeroOne &&
2075          !dev->vk.enabled_extensions.EXT_depth_range_unrestricted);
2076 
2077       P_IMMD(p, NVC397, SET_VIEWPORT_CLIP_CONTROL, {
2078          /* We only set Z clip range if clamp is requested.  Otherwise, we
2079           * leave it set to -/+INF and clip using the guardband below.
2080           *
2081           * depthClampZeroOne is independent of normal depth clamping and
2082           * does not modify the clip/clamp threshold.  The Vulkan spec
2083           * guarantees that, in the cases where depthClampZeroOne applies,
2084           * the [zmin, zmax] is inside [0, 1].  This means that, if z_clamp
2085           * is enabled, we can just do the regular clamp.  If z_clamp is
2086           * disabled and z_clamp_zero_one is enabled then we need to
2087           * apply the [0, 1] clamp.
2088           */
2089          .min_z_zero_max_z_one = (!z_clamp && z_clamp_zero_one)
2090                                  ? MIN_Z_ZERO_MAX_Z_ONE_TRUE
2091                                  : MIN_Z_ZERO_MAX_Z_ONE_FALSE,
2092          .z_clip_range = (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A &&
2093                           (z_clamp || !z_clamp_zero_one))
2094                          ? (z_clamp ? Z_CLIP_RANGE_MIN_Z_MAX_Z
2095                                     : Z_CLIP_RANGE_MINUS_INF_PLUS_INF)
2096                          : Z_CLIP_RANGE_USE_FIELD_MIN_Z_ZERO_MAX_Z_ONE,
2097 
2098          .pixel_min_z = PIXEL_MIN_Z_CLAMP,
2099          .pixel_max_z = PIXEL_MAX_Z_CLAMP,
2100 
2101          .geometry_guardband = GEOMETRY_GUARDBAND_SCALE_256,
2102          .line_point_cull_guardband = LINE_POINT_CULL_GUARDBAND_SCALE_256,
2103          .geometry_clip = z_clip ? GEOMETRY_CLIP_FRUSTUM_XYZ_CLIP
2104                                  : GEOMETRY_CLIP_FRUSTUM_XY_CLIP,
2105 
2106          /* We clip depth with the geometry clipper to ensure that it gets
2107           * clipped before depth bias is applied.  If we leave it up to the
2108           * rasterizer clipper (pixel_min/max_z = CLIP), it will clip too late
2109           * in the pipeline.  This can be seen in two different ways:
2110           *
2111           *  - When depth bias is enabled, the bias is applied post-clipping.
2112           *    If we clip in the rasterizer, it will clip according to the
2113           *    post-bias depth which is wrong.
2114           *
2115           *  - If the fragment shader overrides the depth by writing to
2116           *    gl_FragDepth, it should be clipped according to the original
2117            *    geometry, not according to gl_FragDepth.
2118           *
2119           * In order to always get the geometry clipper, we need to set a
2120           * tight guardband (geometry_guardband_z = SCALE_1).
2121           */
2122          .geometry_guardband_z = z_clip ? GEOMETRY_GUARDBAND_Z_SCALE_1
2123                                         : GEOMETRY_GUARDBAND_Z_SCALE_256,
2124       });
2125 
2126       /* Pre-Volta, we don't have SET_VIEWPORT_CLIP_CONTROL::z_clip_range.
2127        * Instead, we have to emulate it by smashing VIEWPORT_CLIP_MIN/MAX_Z
2128        * based on whether or not z_clamp is set. This is done by a pair of
2129        * macros, one of which is called here and the other is called in
2130        * viewport setup.
2131        */
2132       if (nvk_cmd_buffer_3d_cls(cmd) < VOLTA_A) {
2133          P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_Z_CLAMP));
2134          P_INLINE_DATA(p, z_clamp);
2135       }
2136    }
2137 
2138    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE)) {
2139       uint32_t polygon_mode = vk_to_nv9097_polygon_mode(dyn->rs.polygon_mode);
2140       P_MTHD(p, NV9097, SET_FRONT_POLYGON_MODE);
2141       P_NV9097_SET_FRONT_POLYGON_MODE(p, polygon_mode);
2142       P_NV9097_SET_BACK_POLYGON_MODE(p, polygon_mode);
2143    }
2144 
2145    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE)) {
2146       P_IMMD(p, NV9097, OGL_SET_CULL, dyn->rs.cull_mode != VK_CULL_MODE_NONE);
2147 
2148       if (dyn->rs.cull_mode != VK_CULL_MODE_NONE) {
2149          uint32_t face = vk_to_nv9097_cull_mode(dyn->rs.cull_mode);
2150          P_IMMD(p, NV9097, OGL_SET_CULL_FACE, face);
2151       }
2152    }
2153 
2154    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE)) {
2155       P_IMMD(p, NV9097, OGL_SET_FRONT_FACE,
2156          vk_to_nv9097_front_face(dyn->rs.front_face));
2157    }
2158 
2159    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
2160       P_IMMD(p, NV9097, SET_PROVOKING_VERTEX,
2161              vk_to_nv9097_provoking_vertex(dyn->rs.provoking_vertex));
2162    }
2163 
2164    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE)) {
2165       P_MTHD(p, NV9097, SET_POLY_OFFSET_POINT);
2166       P_NV9097_SET_POLY_OFFSET_POINT(p, dyn->rs.depth_bias.enable);
2167       P_NV9097_SET_POLY_OFFSET_LINE(p, dyn->rs.depth_bias.enable);
2168       P_NV9097_SET_POLY_OFFSET_FILL(p, dyn->rs.depth_bias.enable);
2169    }
2170 
2171    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
2172       switch (dyn->rs.depth_bias.representation) {
2173       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT:
2174          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2175                 DEPTH_FORMAT_DEPENDENT_TRUE);
2176          break;
2177       case VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT:
2178          P_IMMD(p, NV9097, SET_DEPTH_BIAS_CONTROL,
2179                 DEPTH_FORMAT_DEPENDENT_FALSE);
2180          break;
2181       case VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT:
2182       default:
2183          unreachable("Unsupported depth bias representation");
2184       }
2185       /* TODO: The blob multiplies by 2 for some reason. We don't. */
2186       P_IMMD(p, NV9097, SET_DEPTH_BIAS, fui(dyn->rs.depth_bias.constant_factor));
2187       P_IMMD(p, NV9097, SET_SLOPE_SCALE_DEPTH_BIAS, fui(dyn->rs.depth_bias.slope_factor));
2188       P_IMMD(p, NV9097, SET_DEPTH_BIAS_CLAMP, fui(dyn->rs.depth_bias.clamp));
2189    }
2190 
2191    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH)) {
2192       P_MTHD(p, NV9097, SET_LINE_WIDTH_FLOAT);
2193       P_NV9097_SET_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2194       P_NV9097_SET_ALIASED_LINE_WIDTH_FLOAT(p, fui(dyn->rs.line.width));
2195    }
2196 
2197    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE)) {
2198       switch (dyn->rs.line.mode) {
2199       case VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR:
2200       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR:
2201          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_FALSE);
2202          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2203          break;
2204 
2205       case VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR:
2206          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2207          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_FALSE);
2208          break;
2209 
2210       case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR:
2211          P_IMMD(p, NV9097, SET_LINE_MULTISAMPLE_OVERRIDE, ENABLE_TRUE);
2212          P_IMMD(p, NV9097, SET_ANTI_ALIASED_LINE, ENABLE_TRUE);
2213          break;
2214 
2215       default:
2216          unreachable("Invalid line rasterization mode");
2217       }
2218    }
2219 
2220    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE))
2221       P_IMMD(p, NV9097, SET_LINE_STIPPLE, dyn->rs.line.stipple.enable);
2222 
2223    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE)) {
2224       /* map factor from [1,256] to [0, 255] */
2225       uint32_t stipple_factor = CLAMP(dyn->rs.line.stipple.factor, 1, 256) - 1;
2226       P_IMMD(p, NV9097, SET_LINE_STIPPLE_PARAMETERS, {
2227          .factor  = stipple_factor,
2228          .pattern = dyn->rs.line.stipple.pattern,
2229       });
2230    }
2231 
2232    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM))
2233       P_IMMD(p, NV9097, SET_RASTER_INPUT, dyn->rs.rasterization_stream);
2234 
2235    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE) ||
2236        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE)) {
2237       if (nvk_cmd_buffer_3d_cls(cmd) < MAXWELL_B) {
2238          assert(dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
2239       } else if (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
2240          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_FALSE);
2241       } else {
2242          uint32_t extra_overestimate =
2243             MIN2(3, dyn->rs.extra_primitive_overestimation_size * 4);
2244 
2245          if (nvk_cmd_buffer_3d_cls(cmd) >= VOLTA_A) {
2246             P_IMMD(p, NVC397, SET_CONSERVATIVE_RASTER_CONTROL, {
2247                .extra_prim_bloat = extra_overestimate,
2248                .copy_inner_to_outer =
2249                   (dyn->rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT),
2250                .triangle_snap_mode = TRIANGLE_SNAP_MODE_MODE_PRE_SNAP,
2251                .line_and_point_snap_mode = LINE_AND_POINT_SNAP_MODE_MODE_PRE_SNAP,
2252                .uncertainty_region_size = UNCERTAINTY_REGION_SIZE_SIZE_512,
2253             });
2254          } else {
2255             P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_CONSERVATIVE_RASTER_STATE));
2256             P_INLINE_DATA(p, extra_overestimate << 23);
2257          }
2258          P_IMMD(p, NVB197, SET_CONSERVATIVE_RASTER, ENABLE_TRUE);
2259       }
2260    }
2261 }
2262 
2263 uint32_t
2264 nvk_mme_shading_rate_control_sample_shading(bool sample_shading)
2265 {
2266    return nvk_mme_val_mask((!sample_shading) << 1, 1 << 1);
2267 }
2268 
2269 static uint32_t
2270 nvk_mme_shading_rate_control_enable(bool enable)
2271 {
2272    return nvk_mme_val_mask(enable, 1 << 0);
2273 }
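
/* Editor's note -- illustrative, not from the original source: the
 * SHADING_RATE_CONTROL scratch word uses bit 0 for "shading rate enabled"
 * (set by the helper above) and bit 1 for "sample shading disabled" (set by
 * nvk_mme_shading_rate_control_sample_shading()).  The macro below only
 * turns the VARIABLE_PIXEL_RATE_SHADING_CONTROL entries on when both bits
 * are set, so enabling sample shading effectively disables variable-rate
 * shading.
 */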
2274 
2275 void
2276 nvk_mme_set_shading_rate_control(struct mme_builder *b)
2277 {
2278    if (b->devinfo->cls_eng3d < TURING_A)
2279       return;
2280 
2281    struct mme_value val_mask = mme_load(b);
2282    struct mme_value old_src = nvk_mme_load_scratch(b, SHADING_RATE_CONTROL);
2283    struct mme_value src = nvk_mme_set_masked(b, old_src, val_mask);
2284    mme_free_reg(b, val_mask);
2285 
2286    mme_if(b, ine, src, old_src) {
2287       mme_free_reg(b, old_src);
2288       nvk_mme_store_scratch(b, SHADING_RATE_CONTROL, src);
2289 
2290       struct mme_value enable1 = mme_merge(b, mme_zero(), src, 0, 1, 0);
2291       struct mme_value enable2 = mme_merge(b, mme_zero(), src, 0, 1, 1);
2292       struct mme_value enable = mme_and(b, enable1, enable2);
2293 
2294       struct mme_value i = mme_mov(b, mme_zero());
2295       mme_while(b, ine, i, mme_imm(16 * 4)) {
2296          mme_mthd_arr(b, NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(0), i);
2297          mme_emit(b, enable);
2298          mme_add_to(b, i, i, mme_imm(4));
2299       }
2300    }
2301 }
2302 
2303 static void
2304 nvk_mme_set_shading_rate_control_test_check(
2305    const struct nv_device_info *devinfo,
2306    const struct nvk_mme_test_case *test,
2307    const struct nvk_mme_mthd_data *results)
2308 {
2309    if (devinfo->cls_eng3d < TURING_A)
2310       return;
2311 
2312    assert(results[0].mthd == NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL));
2313    bool enable = (results[0].data & 3) == 3;
2314 
2315    for (uint32_t i = 0; i < 16; i++) {
2316       assert(results[i + 1].mthd ==
2317              NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_CONTROL(i));
2318       assert(results[i + 1].data == enable);
2319    }
2320 }
2321 
2322 const struct nvk_mme_test_case nvk_mme_set_shading_rate_control_tests[] = {{
2323    .init = (struct nvk_mme_mthd_data[]) {
2324       { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2325       { }
2326    },
2327    .params = (uint32_t[]) { 0x00030003 },
2328    .check = nvk_mme_set_shading_rate_control_test_check,
2329 }, {
2330    .init = (struct nvk_mme_mthd_data[]) {
2331       { NVK_SET_MME_SCRATCH(SHADING_RATE_CONTROL), 0 },
2332       { }
2333    },
2334    .params = (uint32_t[]) { 0x00030001 },
2335    .check = nvk_mme_set_shading_rate_control_test_check,
2336 }, {}};
2337 
2338 static VkExtent2D
2339 nvk_combine_fs_log2_rates(VkFragmentShadingRateCombinerOpKHR op,
2340                           VkExtent2D a_log2, VkExtent2D b_log2)
2341 {
2342    switch (op) {
2343    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
2344       return a_log2;
2345 
2346    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
2347       return b_log2;
2348 
2349    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
2350       return (VkExtent2D) {
2351          .width = MIN2(a_log2.width, b_log2.width),
2352          .height = MIN2(a_log2.height, b_log2.height),
2353       };
2354 
2355    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
2356       return (VkExtent2D) {
2357          .width = MAX2(a_log2.width, b_log2.width),
2358          .height = MAX2(a_log2.height, b_log2.height),
2359       };
2360 
2361    case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR:
2362       return (VkExtent2D) {
2363          .width = a_log2.width + b_log2.width,
2364          .height = a_log2.height + b_log2.height,
2365       };
2366 
2367    default:
2368       unreachable("Invalid FSR combiner op");
2369    }
2370 }
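
/* Editor's note -- illustrative example, not from the original source: with
 * VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR, rates multiply, which is an
 * addition in log2 space.  Combining a 2x2 rate (log2 1,1) with a 4x1 rate
 * (log2 2,0) gives log2 (3,1), i.e. 8x2, which
 * vk_to_nvc597_shading_rate_log2() below clamps to the supported 4x2 rate.
 */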
2371 
2372 static uint8_t
2373 vk_to_nvc597_shading_rate_log2(VkExtent2D rate_log2)
2374 {
2375    rate_log2.width = MIN2(rate_log2.width, 2);
2376    rate_log2.height = MIN2(rate_log2.height, 2);
2377    const uint8_t idx = (rate_log2.width << 2) | rate_log2.height;
2378 
2379    /* From the Vulkan 1.3.297 spec:
2380     *
2381     *    "A fragment shading rate Rxy representing any of Axy, Bxy or Cxy
2382     *    is clamped as follows. [...] From this list of supported rates,
2383     *    the following steps are applied in order, to select a single
2384     *    value:
2385     *
2386     *     1. Keep only rates where Rx' ≤ Rx and Ry' ≤ Ry.
2387     *
2388     *        - Implementations may also keep rates where Rx' ≤ Ry and
2389     *          Ry' ≤ Rx.
2390     *
2391     *     2. Keep only rates with the highest area (Rx' × Ry').
2392     *
2393     *     3. Keep only rates with the lowest aspect ratio (Rx' + Ry').
2394     *
2395     *     4. In cases where a wide (e.g. 4x1) and tall (e.g. 1x4) rate
2396     *        remain, the implementation may choose either rate. However, it
2397     *        must choose this rate consistently for the same shading rates,
2398     *        render pass transform, and combiner operations for the
2399     *        lifetime of the VkDevice.
2400     *
2401     * We have the following rates: 1x1, 2x1, 1x2, 2x2, 4x2, 2x4, 4x4.
2402     */
2403    static const uint8_t vk_to_nvc597[] = {
2404 #define NVC597_FSR(X) NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A_RATE_INDEX0_PS_##X
2405       NVC597_FSR(X1_PER_RASTER_PIXEL),
2406       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS),
2407       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x4 */
2408       NVC597_FSR(X1_PER_1X2_RASTER_PIXELS), /* 1x8 */
2409       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS),
2410       NVC597_FSR(X1_PER_2X2_RASTER_PIXELS),
2411       NVC597_FSR(X1_PER_2X4_RASTER_PIXELS),
2412       NVC597_FSR(X1_PER_2X4_RASTER_PIXELS), /* 2x8 */
2413       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 4x1 */
2414       NVC597_FSR(X1_PER_4X2_RASTER_PIXELS),
2415       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS),
2416       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 4x8 */
2417       NVC597_FSR(X1_PER_2X1_RASTER_PIXELS), /* 8x1 */
2418       NVC597_FSR(X1_PER_4X2_RASTER_PIXELS), /* 8x2 */
2419       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x4 */
2420       NVC597_FSR(X1_PER_4X4_RASTER_PIXELS), /* 8x8 */
2421 #undef NVC597_FSR
2422    };
2423 
2424    assert(idx < ARRAY_SIZE(vk_to_nvc597));
2425    return vk_to_nvc597[idx];
2426 }
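
/* Editor's note -- illustrative, not from the original source: the table
 * index is (width_log2 << 2) | height_log2 after clamping each axis to at
 * most log2(4).  For example, a 4x2 request (log2 2,1) lands on index 9 and
 * maps straight to the 4x2 rate, while an unsupported 1x4 request (log2 0,2)
 * lands on index 2 and is reduced to 1x2, the largest supported rate that
 * does not exceed it, per the spec rules quoted above.
 */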
2427 
2428 static void
2429 nvk_flush_fsr_state(struct nvk_cmd_buffer *cmd)
2430 {
2431    const struct vk_dynamic_graphics_state *dyn =
2432       &cmd->vk.dynamic_graphics_state;
2433 
2434    if (nvk_cmd_buffer_3d_cls(cmd) < TURING_A) {
2435       assert(vk_fragment_shading_rate_is_disabled(&dyn->fsr));
2436       return;
2437    }
2438 
2439    if (!BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))
2440       return;
2441 
2442    if (vk_fragment_shading_rate_is_disabled(&dyn->fsr)) {
2443       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2444       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2445       P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(false));
2446    } else {
2447       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 16 * 3);
2448 
2449       assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.width));
2450       assert(util_is_power_of_two_or_zero(dyn->fsr.fragment_size.height));
2451       const VkExtent2D state_fs_log2 = {
2452          .width = util_logbase2(dyn->fsr.fragment_size.width),
2453          .height = util_logbase2(dyn->fsr.fragment_size.height),
2454       };
2455 
2456       for (uint32_t prim_idx = 0; prim_idx < 16; prim_idx++) {
2457          const VkExtent2D prim_fs_log2 = {
2458             .width = (prim_idx >> 2) & 3,
2459             .height = prim_idx & 3,
2460          };
2461 
2462          const VkExtent2D state_prim_fs_log2 =
2463             nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[0],
2464                                       state_fs_log2, prim_fs_log2);
2465 
2466          uint8_t rates[16] = {};
2467          for (uint32_t att_idx = 0; att_idx < 16; att_idx++) {
2468             const VkExtent2D att_fs_log2 = {
2469                .width = (att_idx >> 2) & 3,
2470                .height = att_idx & 3,
2471             };
2472 
2473             const VkExtent2D fs_log2 =
2474                nvk_combine_fs_log2_rates(dyn->fsr.combiner_ops[1],
2475                                          state_prim_fs_log2, att_fs_log2);
2476 
2477             rates[att_idx] = vk_to_nvc597_shading_rate_log2(fs_log2);
2478          }
2479 
2480          P_MTHD(p, NVC597, SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(prim_idx));
2481          P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_A(p, prim_idx, {
2482             .rate_index0 = rates[0],
2483             .rate_index1 = rates[1],
2484             .rate_index2 = rates[2],
2485             .rate_index3 = rates[3],
2486             .rate_index4 = rates[4],
2487             .rate_index5 = rates[5],
2488             .rate_index6 = rates[6],
2489             .rate_index7 = rates[7],
2490          });
2491          P_NVC597_SET_VARIABLE_PIXEL_RATE_SHADING_INDEX_TO_RATE_B(p, prim_idx, {
2492             .rate_index8 = rates[8],
2493             .rate_index9 = rates[9],
2494             .rate_index10 = rates[10],
2495             .rate_index11 = rates[11],
2496             .rate_index12 = rates[12],
2497             .rate_index13 = rates[13],
2498             .rate_index14 = rates[14],
2499             .rate_index15 = rates[15],
2500          });
2501       }
2502 
2503       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
2504       P_INLINE_DATA(p, nvk_mme_shading_rate_control_enable(true));
2505    }
2506 }
2507 
2508 static uint32_t
2509 nvk_mme_anti_alias_init(void)
2510 {
2511    /* This is a valid value but we never set it so it ensures that the macro
2512     * will actually run the first time we set anything.
2513     */
2514    return 0xf;
2515 }
2516 
2517 uint32_t
2518 nvk_mme_anti_alias_min_sample_shading(float mss)
2519 {
2520    /* The value we want to compute in the MME is
2521     *
2522     *    passes = next_pow2(samples * minSampleShading)
2523     *
2524     * Since samples is already a power of two,
2525     *
2526     *    passes_log2 = log2_ceil(samples * minSampleShading)
2527     *                = log2_ceil(samples / (1.0 / minSampleShading))
2528     *                = samples_log2 - log2_floor(1.0 / minSampleShading)
2529     *
2530     * if we assume (1.0 / min_sample_shading) >= 1.0.  This last bit is
2531     * something we can compute in the MME as long as the float math on the
2532     * right-hand side happens on the CPU.
2533     */
2534    float rcp_mss = CLAMP(1.0 / mss, 1.0f, 16.0f);
2535    uint32_t rcp_mss_log2 = util_logbase2(floorf(rcp_mss));
2536 
2537    assert(rcp_mss_log2 != nvk_mme_anti_alias_init());
2538 
2539    return nvk_mme_val_mask(rcp_mss_log2 << 0, 0x000f);
2540 }
2541 
2542 static uint32_t
2543 nvk_mme_anti_alias_samples(uint32_t samples)
2544 {
2545    assert(util_is_power_of_two_or_zero(samples));
2546    const uint32_t samples_log2 = util_logbase2(MAX2(1, samples));
2547 
2548    return nvk_mme_val_mask(samples_log2 << 4, 0x00f0);
2549 }
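
/* Editor's note -- worked example, not from the original source: for
 * minSampleShading = 0.3, rcp_mss = 1/0.3 = 3.33 -> floor 3 -> log2 1, so
 * nvk_mme_anti_alias_min_sample_shading() packs rcp_mss_log2 = 1 into bits
 * 0..3.  With 8 samples, nvk_mme_anti_alias_samples() packs samples_log2 = 3
 * into bits 4..7.  The macro below then computes passes_log2 = 3 - 1 = 2,
 * i.e. 4 shading passes, which matches next_pow2(8 * 0.3) = 4.
 */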
2550 
2551 void
2552 nvk_mme_set_anti_alias(struct mme_builder *b)
2553 {
2554    struct mme_value val_mask = mme_load(b);
2555    struct mme_value old_anti_alias = nvk_mme_load_scratch(b, ANTI_ALIAS);
2556    struct mme_value anti_alias =
2557       nvk_mme_set_masked(b, old_anti_alias, val_mask);
2558    mme_free_reg(b, val_mask);
2559 
2560    mme_if(b, ine, anti_alias, old_anti_alias) {
2561       mme_free_reg(b, old_anti_alias);
2562       nvk_mme_store_scratch(b, ANTI_ALIAS, anti_alias);
2563 
2564       struct mme_value rcp_mss_log2 =
2565          mme_merge(b, mme_zero(), anti_alias, 0, 4, 0);
2566       struct mme_value samples_log2 =
2567          mme_merge(b, mme_zero(), anti_alias, 0, 4, 4);
2568       mme_free_reg(b, anti_alias);
2569 
2570       /* We've already done all the hard work on the CPU in
2571        * nvk_mme_anti_alias_min_sample_shading().  All we have to do here is
2572        * subtract the two log2 values and clamp so we don't go negative.
2573        */
2574       struct mme_value passes_log2 = mme_sub(b, samples_log2, rcp_mss_log2);
2575       mme_free_reg(b, rcp_mss_log2);
2576 
2577       /* passes = MAX(passes, 1) */
2578       struct mme_value neg = mme_srl(b, passes_log2, mme_imm(31));
2579       mme_if(b, ine, neg, mme_zero()) {
2580          mme_mov_to(b, passes_log2, mme_zero());
2581       }
2582       mme_free_reg(b, neg);
2583 
2584       /*
2585        * NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL {
2586        *    ...
2587        *    .centroid = passes > 1 ? CENTROID_PER_PASS
2588        *                           : CENTROID_PER_FRAGMENT,
2589        * }
2590        */
2591       struct mme_value aac = mme_mov(b,
2592          mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_FRAGMENT
2593                  << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2594       mme_if(b, ine, passes_log2, mme_zero()) {
2595          mme_mov_to(b, aac,
2596             mme_imm(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID_PER_PASS
2597                     << DRF_LO(NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL_CENTROID)));
2598       }
2599 
2600       struct mme_value passes = mme_sll(b, mme_imm(1), passes_log2);
2601       mme_merge_to(b, aac, aac, passes, 0, 4, 0);
2602       mme_free_reg(b, passes);
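      /* For example, 8 samples with minSampleShading = 0.5 gives passes = 4
       * and CENTROID_PER_PASS, i.e. aac = 0x14 (see the matching test case
       * below).
       */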
2603 
2604       mme_mthd(b, NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL);
2605       mme_emit(b, aac);
2606       mme_free_reg(b, aac);
2607 
2608       /* Now we need to emit sample masks per-sample. Annoyingly, we have to
2609        * pack these in pairs.
2610        */
2611       STATIC_ASSERT(sizeof(struct nak_sample_mask) == 2);
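      /* Each 32-bit constant holds two 16-bit masks packed as
       * mask[i] | (mask[i + 1] << 16), e.g. 0x00020001 for samples 0 and 1
       * in the one-sample-per-pass case below.
       */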
2612 
2613       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
2614       mme_emit(b, mme_imm(nvk_root_descriptor_offset(draw.sample_masks)));
2615       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
2616 
2619       struct mme_value samples_per_pass_log2 =
2620          mme_sub(b, samples_log2, passes_log2);
2621       mme_free_reg(b, samples_log2);
2622 
2623       mme_if(b, ieq, samples_per_pass_log2, mme_zero()) {
2624          /* One sample per pass, we can just blast it out */
2625          for (uint32_t i = 0; i < NVK_MAX_SAMPLES; i += 2) {
2626             uint32_t mask0 = 1 << i;
2627             uint32_t mask1 = 1 << (i + 1);
2628             mme_emit(b, mme_imm(mask0 | (mask1 << 16)));
2629          }
2630       }
2631 
2632       mme_if(b, ine, samples_per_pass_log2, mme_zero()) {
2633          mme_if(b, ieq, passes_log2, mme_zero()) {
2634             /* It's a single pass so we can use 0xffff */
2635             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++)
2636                mme_emit(b, mme_imm(~0));
2637          }
2638 
2639          mme_if(b, ieq, passes_log2, mme_imm(1)) {
2640             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2641                struct mme_value mask =
2642                   nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_2PASS_0, i);
2643                mme_emit(b, mask);
2644                mme_free_reg(b, mask);
2645             }
2646          }
2647 
2648          mme_if(b, ieq, passes_log2, mme_imm(2)) {
2649             for (uint32_t i = 0; i < NVK_MAX_SAMPLES / 2; i++) {
2650                struct mme_value mask =
2651                   nvk_mme_load_scratch_arr(b, SAMPLE_MASKS_4PASS_0, i);
2652                mme_emit(b, mask);
2653                mme_free_reg(b, mask);
2654             }
2655          }
2656       }
2657    }
2658 }
2659 
2660 const struct nvk_mme_test_case nvk_mme_set_anti_alias_tests[] = {{
2661    /* This case doesn't change the state so it should do nothing */
2662    .init = (struct nvk_mme_mthd_data[]) {
2663       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2664       { }
2665    },
2666    .params = (uint32_t[]) { 0xffff0000 },
2667    .expected = (struct nvk_mme_mthd_data[]) {
2668       { }
2669    },
2670 }, {
2671    /* Single sample, minSampleShading = 1.0 */
2672    .init = (struct nvk_mme_mthd_data[]) {
2673       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2674       { }
2675    },
2676    .params = (uint32_t[]) { 0xffff0000 },
2677    .expected = (struct nvk_mme_mthd_data[]) {
2678       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0 },
2679       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2680       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2681         nvk_root_descriptor_offset(draw.sample_masks) },
2682       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2683       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2684       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2685       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2686       { }
2687    },
2688 }, {
2689    /* Single sample, minSampleShading = 0.25 */
2690    .init = (struct nvk_mme_mthd_data[]) {
2691       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0xf },
2692       { }
2693    },
2694    .params = (uint32_t[]) { 0xffff0002 },
2695    .expected = (struct nvk_mme_mthd_data[]) {
2696       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x2 },
2697       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x1 },
2698       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2699         nvk_root_descriptor_offset(draw.sample_masks) },
2700       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x020001 },
2701       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x080004 },
2702       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x200010 },
2703       { NV9097_LOAD_CONSTANT_BUFFER(3), 0x800040 },
2704       { }
2705    },
2706 }, {
2707    /* 8 samples, minSampleShading = 0.5 */
2708    .init = (struct nvk_mme_mthd_data[]) {
2709       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x1 },
2710       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_0), 0x030003 },
2711       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_1), 0x0c000c },
2712       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_2), 0x300030 },
2713       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_4PASS_3), 0xc000c0 },
2714       { }
2715    },
2716    .params = (uint32_t[]) { 0x00f00030 },
2717    .expected = (struct nvk_mme_mthd_data[]) {
2718       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x31 },
2719       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x14 },
2720       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2721         nvk_root_descriptor_offset(draw.sample_masks) },
2722       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x030003 },
2723       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0c000c },
2724       { NV9097_LOAD_CONSTANT_BUFFER(2), 0x300030 },
2725       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xc000c0 },
2726       { }
2727    },
2728 }, {
2729    /* 8 samples, minSampleShading = 0.25 */
2730    .init = (struct nvk_mme_mthd_data[]) {
2731       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x30 },
2732       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_0), 0x0f000f },
2733       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_1), 0x0f000f },
2734       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_2), 0xf000f0 },
2735       { NVK_SET_MME_SCRATCH(SAMPLE_MASKS_2PASS_3), 0xf000f0 },
2736       { }
2737    },
2738    .params = (uint32_t[]) { 0x000f0002 },
2739    .expected = (struct nvk_mme_mthd_data[]) {
2740       { NVK_SET_MME_SCRATCH(ANTI_ALIAS), 0x32 },
2741       { NV9097_SET_HYBRID_ANTI_ALIAS_CONTROL, 0x12 },
2742       { NV9097_LOAD_CONSTANT_BUFFER_OFFSET,
2743         nvk_root_descriptor_offset(draw.sample_masks) },
2744       { NV9097_LOAD_CONSTANT_BUFFER(0), 0x0f000f },
2745       { NV9097_LOAD_CONSTANT_BUFFER(1), 0x0f000f },
2746       { NV9097_LOAD_CONSTANT_BUFFER(2), 0xf000f0 },
2747       { NV9097_LOAD_CONSTANT_BUFFER(3), 0xf000f0 },
2748       { }
2749    },
2750 }, {}};
2751 
2752 static VkSampleLocationEXT
2753 vk_sample_location(const struct vk_sample_locations_state *sl,
2754                    uint32_t x, uint32_t y, uint32_t s)
2755 {
2756    x = x % sl->grid_size.width;
2757    y = y % sl->grid_size.height;
2758 
2759    return sl->locations[(x + y * sl->grid_size.width) * sl->per_pixel + s];
2760 }
2761 
2762 static struct nak_sample_location
2763 vk_to_nak_sample_location(VkSampleLocationEXT loc)
2764 {
2765    return (struct nak_sample_location) {
2766       .x_u4 = util_bitpack_ufixed_clamp(loc.x, 0, 3, 4),
2767       .y_u4 = util_bitpack_ufixed_clamp(loc.y, 0, 3, 4),
2768    };
2769 }
2770 
2771 static void
2772 nvk_flush_ms_state(struct nvk_cmd_buffer *cmd)
2773 {
2774    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2775    const struct vk_dynamic_graphics_state *dyn =
2776       &cmd->vk.dynamic_graphics_state;
2777 
2778    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
2779       /* When we don't have any attachments, we can't know the sample count
2780        * from the render pass so we need to emit SET_ANTI_ALIAS here.  See the
2781        * comment in nvk_BeginRendering() for more details.
2782        */
2783       if (render->samples == 0) {
2784          /* Multisample information MAY be missing (rasterizationSamples == 0)
2785           * if rasterizer discard is enabled.  However, this isn't valid in
2786           * the hardware so always use at least one sample.
2787           */
2788          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2789          nvk_cmd_set_sample_layout(cmd, nil_choose_sample_layout(samples));
2790       } else {
2791          /* Multisample information MAY be missing (rasterizationSamples == 0)
2792           * if rasterizer discard is enabled.
2793           */
2794          assert(dyn->ms.rasterization_samples == 0 ||
2795                 dyn->ms.rasterization_samples == render->samples);
2796       }
2797    }
2798 
2799    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
2800        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE)) {
2801       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
2802       P_IMMD(p, NV9097, SET_ANTI_ALIAS_ALPHA_CONTROL, {
2803          .alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2804          .alpha_to_one      = dyn->ms.alpha_to_one_enable,
2805       });
2806    }
2807 
2808    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
2809        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
2810        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)) {
2811       const struct vk_sample_locations_state *sl;
2812       if (dyn->ms.sample_locations_enable) {
2813          sl = dyn->ms.sample_locations;
2814       } else {
2815          const uint32_t samples = MAX2(1, dyn->ms.rasterization_samples);
2816          sl = vk_standard_sample_locations_state(samples);
2817       }
2818 
2819       struct nak_sample_location push_sl[NVK_MAX_SAMPLES];
2820       for (uint32_t i = 0; i < sl->per_pixel; i++)
2821          push_sl[i] = vk_to_nak_sample_location(sl->locations[i]);
2822 
2823       nvk_descriptor_state_set_root_array(cmd, &cmd->state.gfx.descriptors,
2824                                           draw.sample_locations,
2825                                           0, NVK_MAX_SAMPLES, push_sl);
2826 
2827       if (nvk_cmd_buffer_3d_cls(cmd) >= MAXWELL_B) {
2828          struct nak_sample_location loc[16];
2829          for (uint32_t n = 0; n < ARRAY_SIZE(loc); n++) {
2830             const uint32_t s = n % sl->per_pixel;
2831             const uint32_t px = n / sl->per_pixel;
2832             const uint32_t x = px % 2;
2833             const uint32_t y = px / 2;
2834 
2835             loc[n] = vk_to_nak_sample_location(vk_sample_location(sl, x, y, s));
2836          }
2837 
2838          struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2839 
2840          P_MTHD(p, NVB197, SET_ANTI_ALIAS_SAMPLE_POSITIONS(0));
2841          for (uint32_t i = 0; i < 4; i++) {
2842             P_NVB197_SET_ANTI_ALIAS_SAMPLE_POSITIONS(p, i, {
2843                .x0 = loc[i * 4 + 0].x_u4,
2844                .y0 = loc[i * 4 + 0].y_u4,
2845                .x1 = loc[i * 4 + 1].x_u4,
2846                .y1 = loc[i * 4 + 1].y_u4,
2847                .x2 = loc[i * 4 + 2].x_u4,
2848                .y2 = loc[i * 4 + 2].y_u4,
2849                .x3 = loc[i * 4 + 3].x_u4,
2850                .y3 = loc[i * 4 + 3].y_u4,
2851             });
2852          }
2853       }
2854    }
2855 
2856    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
2857       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
2858       P_MTHD(p, NV9097, SET_SAMPLE_MASK_X0_Y0);
2859       P_NV9097_SET_SAMPLE_MASK_X0_Y0(p, dyn->ms.sample_mask & 0xffff);
2860       P_NV9097_SET_SAMPLE_MASK_X1_Y0(p, dyn->ms.sample_mask & 0xffff);
2861       P_NV9097_SET_SAMPLE_MASK_X0_Y1(p, dyn->ms.sample_mask & 0xffff);
2862       P_NV9097_SET_SAMPLE_MASK_X1_Y1(p, dyn->ms.sample_mask & 0xffff);
2863    }
2864 }
2865 
2866 static uint32_t
2867 vk_to_nv9097_compare_op(VkCompareOp vk_op)
2868 {
2869    ASSERTED static const uint16_t vk_to_nv9097[] = {
2870       [VK_COMPARE_OP_NEVER]            = NV9097_SET_DEPTH_FUNC_V_OGL_NEVER,
2871       [VK_COMPARE_OP_LESS]             = NV9097_SET_DEPTH_FUNC_V_OGL_LESS,
2872       [VK_COMPARE_OP_EQUAL]            = NV9097_SET_DEPTH_FUNC_V_OGL_EQUAL,
2873       [VK_COMPARE_OP_LESS_OR_EQUAL]    = NV9097_SET_DEPTH_FUNC_V_OGL_LEQUAL,
2874       [VK_COMPARE_OP_GREATER]          = NV9097_SET_DEPTH_FUNC_V_OGL_GREATER,
2875       [VK_COMPARE_OP_NOT_EQUAL]        = NV9097_SET_DEPTH_FUNC_V_OGL_NOTEQUAL,
2876       [VK_COMPARE_OP_GREATER_OR_EQUAL] = NV9097_SET_DEPTH_FUNC_V_OGL_GEQUAL,
2877       [VK_COMPARE_OP_ALWAYS]           = NV9097_SET_DEPTH_FUNC_V_OGL_ALWAYS,
2878    };
2879    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2880 
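   /* The OGL compare funcs are laid out as 0x200 + VkCompareOp, so a single
    * OR replaces the table lookup; e.g. VK_COMPARE_OP_LESS (1) becomes
    * NV9097_SET_DEPTH_FUNC_V_OGL_LESS (0x201).  The table above only backs
    * the debug assert.
    */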
2881    uint32_t nv9097_op = 0x200 | vk_op;
2882    assert(nv9097_op == vk_to_nv9097[vk_op]);
2883    return nv9097_op;
2884 }
2885 
2886 static uint32_t
2887 vk_to_nv9097_stencil_op(VkStencilOp vk_op)
2888 {
2889 #define OP(vk, nv) [VK_STENCIL_OP_##vk] = NV9097_SET_STENCIL_OP_FAIL_V_##nv
2890    ASSERTED static const uint16_t vk_to_nv9097[] = {
2891       OP(KEEP,                D3D_KEEP),
2892       OP(ZERO,                D3D_ZERO),
2893       OP(REPLACE,             D3D_REPLACE),
2894       OP(INCREMENT_AND_CLAMP, D3D_INCRSAT),
2895       OP(DECREMENT_AND_CLAMP, D3D_DECRSAT),
2896       OP(INVERT,              D3D_INVERT),
2897       OP(INCREMENT_AND_WRAP,  D3D_INCR),
2898       OP(DECREMENT_AND_WRAP,  D3D_DECR),
2899    };
2900    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
2901 #undef OP
2902 
2903    uint32_t nv9097_op = vk_op + 1;
2904    assert(nv9097_op == vk_to_nv9097[vk_op]);
2905    return nv9097_op;
2906 }
2907 
2908 static void
2909 nvk_flush_ds_state(struct nvk_cmd_buffer *cmd)
2910 {
2911    struct nv_push *p = nvk_cmd_buffer_push(cmd, 35);
2912 
2913    const struct nvk_rendering_state *render = &cmd->state.gfx.render;
2914    const struct vk_dynamic_graphics_state *dyn =
2915       &cmd->vk.dynamic_graphics_state;
2916 
2917    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE)) {
2918       bool enable = dyn->ds.depth.test_enable &&
2919                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2920       P_IMMD(p, NV9097, SET_DEPTH_TEST, enable);
2921    }
2922 
2923    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE)) {
2924       bool enable = dyn->ds.depth.write_enable &&
2925                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2926       P_IMMD(p, NV9097, SET_DEPTH_WRITE, enable);
2927    }
2928 
2929    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
2930       const uint32_t func = vk_to_nv9097_compare_op(dyn->ds.depth.compare_op);
2931       P_IMMD(p, NV9097, SET_DEPTH_FUNC, func);
2932    }
2933 
2934    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE)) {
2935       bool enable = dyn->ds.depth.bounds_test.enable &&
2936                     render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2937       P_IMMD(p, NV9097, SET_DEPTH_BOUNDS_TEST, enable);
2938    }
2939 
2940    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
2941       P_MTHD(p, NV9097, SET_DEPTH_BOUNDS_MIN);
2942       P_NV9097_SET_DEPTH_BOUNDS_MIN(p, fui(dyn->ds.depth.bounds_test.min));
2943       P_NV9097_SET_DEPTH_BOUNDS_MAX(p, fui(dyn->ds.depth.bounds_test.max));
2944    }
2945 
2946    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE)) {
2947       bool enable = dyn->ds.stencil.test_enable &&
2948                     render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2949       P_IMMD(p, NV9097, SET_STENCIL_TEST, enable);
2950    }
2951 
2952    const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
2953    const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
2954    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP)) {
2955       P_MTHD(p, NV9097, SET_STENCIL_OP_FAIL);
2956       P_NV9097_SET_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(front->op.fail));
2957       P_NV9097_SET_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(front->op.depth_fail));
2958       P_NV9097_SET_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(front->op.pass));
2959       P_NV9097_SET_STENCIL_FUNC(p, vk_to_nv9097_compare_op(front->op.compare));
2960 
2961       P_MTHD(p, NV9097, SET_BACK_STENCIL_OP_FAIL);
2962       P_NV9097_SET_BACK_STENCIL_OP_FAIL(p, vk_to_nv9097_stencil_op(back->op.fail));
2963       P_NV9097_SET_BACK_STENCIL_OP_ZFAIL(p, vk_to_nv9097_stencil_op(back->op.depth_fail));
2964       P_NV9097_SET_BACK_STENCIL_OP_ZPASS(p, vk_to_nv9097_stencil_op(back->op.pass));
2965       P_NV9097_SET_BACK_STENCIL_FUNC(p, vk_to_nv9097_compare_op(back->op.compare));
2966    }
2967 
2968    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK)) {
2969       P_IMMD(p, NV9097, SET_STENCIL_FUNC_MASK, front->compare_mask);
2970       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_MASK, back->compare_mask);
2971    }
2972 
2973    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK)) {
2974       P_IMMD(p, NV9097, SET_STENCIL_MASK, front->write_mask);
2975       P_IMMD(p, NV9097, SET_BACK_STENCIL_MASK, back->write_mask);
2976    }
2977 
2978    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
2979       P_IMMD(p, NV9097, SET_STENCIL_FUNC_REF, front->reference);
2980       P_IMMD(p, NV9097, SET_BACK_STENCIL_FUNC_REF, back->reference);
2981    }
2982 }
2983 
2984 static uint32_t
2985 vk_to_nv9097_logic_op(VkLogicOp vk_op)
2986 {
2987    ASSERTED uint16_t vk_to_nv9097[] = {
2988       [VK_LOGIC_OP_CLEAR]           = NV9097_SET_LOGIC_OP_FUNC_V_CLEAR,
2989       [VK_LOGIC_OP_AND]             = NV9097_SET_LOGIC_OP_FUNC_V_AND,
2990       [VK_LOGIC_OP_AND_REVERSE]     = NV9097_SET_LOGIC_OP_FUNC_V_AND_REVERSE,
2991       [VK_LOGIC_OP_COPY]            = NV9097_SET_LOGIC_OP_FUNC_V_COPY,
2992       [VK_LOGIC_OP_AND_INVERTED]    = NV9097_SET_LOGIC_OP_FUNC_V_AND_INVERTED,
2993       [VK_LOGIC_OP_NO_OP]           = NV9097_SET_LOGIC_OP_FUNC_V_NOOP,
2994       [VK_LOGIC_OP_XOR]             = NV9097_SET_LOGIC_OP_FUNC_V_XOR,
2995       [VK_LOGIC_OP_OR]              = NV9097_SET_LOGIC_OP_FUNC_V_OR,
2996       [VK_LOGIC_OP_NOR]             = NV9097_SET_LOGIC_OP_FUNC_V_NOR,
2997       [VK_LOGIC_OP_EQUIVALENT]      = NV9097_SET_LOGIC_OP_FUNC_V_EQUIV,
2998       [VK_LOGIC_OP_INVERT]          = NV9097_SET_LOGIC_OP_FUNC_V_INVERT,
2999       [VK_LOGIC_OP_OR_REVERSE]      = NV9097_SET_LOGIC_OP_FUNC_V_OR_REVERSE,
3000       [VK_LOGIC_OP_COPY_INVERTED]   = NV9097_SET_LOGIC_OP_FUNC_V_COPY_INVERTED,
3001       [VK_LOGIC_OP_OR_INVERTED]     = NV9097_SET_LOGIC_OP_FUNC_V_OR_INVERTED,
3002       [VK_LOGIC_OP_NAND]            = NV9097_SET_LOGIC_OP_FUNC_V_NAND,
3003       [VK_LOGIC_OP_SET]             = NV9097_SET_LOGIC_OP_FUNC_V_SET,
3004    };
3005    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3006 
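   /* Same trick as the compare ops: the hardware logic op enum is just
    * 0x1500 + VkLogicOp, e.g. VK_LOGIC_OP_XOR (6) becomes 0x1506, and the
    * table above only backs the debug assert.
    */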
3007    uint32_t nv9097_op = 0x1500 | vk_op;
3008    assert(nv9097_op == vk_to_nv9097[vk_op]);
3009    return nv9097_op;
3010 }
3011 
3012 static uint32_t
3013 vk_to_nv9097_blend_op(VkBlendOp vk_op)
3014 {
3015 #define OP(vk, nv) [VK_BLEND_OP_##vk] = NV9097_SET_BLEND_COLOR_OP_V_OGL_##nv
3016    ASSERTED uint16_t vk_to_nv9097[] = {
3017       OP(ADD,              FUNC_ADD),
3018       OP(SUBTRACT,         FUNC_SUBTRACT),
3019       OP(REVERSE_SUBTRACT, FUNC_REVERSE_SUBTRACT),
3020       OP(MIN,              MIN),
3021       OP(MAX,              MAX),
3022    };
3023    assert(vk_op < ARRAY_SIZE(vk_to_nv9097));
3024 #undef OP
3025 
3026    return vk_to_nv9097[vk_op];
3027 }
3028 
3029 static uint32_t
3030 vk_to_nv9097_blend_factor(VkBlendFactor vk_factor)
3031 {
3032 #define FACTOR(vk, nv) [VK_BLEND_FACTOR_##vk] = \
3033    NV9097_SET_BLEND_COLOR_SOURCE_COEFF_V_##nv
3034    ASSERTED uint16_t vk_to_nv9097[] = {
3035       FACTOR(ZERO,                     OGL_ZERO),
3036       FACTOR(ONE,                      OGL_ONE),
3037       FACTOR(SRC_COLOR,                OGL_SRC_COLOR),
3038       FACTOR(ONE_MINUS_SRC_COLOR,      OGL_ONE_MINUS_SRC_COLOR),
3039       FACTOR(DST_COLOR,                OGL_DST_COLOR),
3040       FACTOR(ONE_MINUS_DST_COLOR,      OGL_ONE_MINUS_DST_COLOR),
3041       FACTOR(SRC_ALPHA,                OGL_SRC_ALPHA),
3042       FACTOR(ONE_MINUS_SRC_ALPHA,      OGL_ONE_MINUS_SRC_ALPHA),
3043       FACTOR(DST_ALPHA,                OGL_DST_ALPHA),
3044       FACTOR(ONE_MINUS_DST_ALPHA,      OGL_ONE_MINUS_DST_ALPHA),
3045       FACTOR(CONSTANT_COLOR,           OGL_CONSTANT_COLOR),
3046       FACTOR(ONE_MINUS_CONSTANT_COLOR, OGL_ONE_MINUS_CONSTANT_COLOR),
3047       FACTOR(CONSTANT_ALPHA,           OGL_CONSTANT_ALPHA),
3048       FACTOR(ONE_MINUS_CONSTANT_ALPHA, OGL_ONE_MINUS_CONSTANT_ALPHA),
3049       FACTOR(SRC_ALPHA_SATURATE,       OGL_SRC_ALPHA_SATURATE),
3050       FACTOR(SRC1_COLOR,               OGL_SRC1COLOR),
3051       FACTOR(ONE_MINUS_SRC1_COLOR,     OGL_INVSRC1COLOR),
3052       FACTOR(SRC1_ALPHA,               OGL_SRC1ALPHA),
3053       FACTOR(ONE_MINUS_SRC1_ALPHA,     OGL_INVSRC1ALPHA),
3054    };
3055    assert(vk_factor < ARRAY_SIZE(vk_to_nv9097));
3056 #undef FACTOR
3057 
3058    return vk_to_nv9097[vk_factor];
3059 }
3060 
3061 void
3062 nvk_mme_set_write_mask(struct mme_builder *b)
3063 {
3064    struct mme_value count = mme_load(b);
3065    struct mme_value mask = mme_load(b);
3066 
3067    /*
3068     * mask is a bit field
3069     *
3070     * attachment index 88887777666655554444333322221111
3071     * component        abgrabgrabgrabgrabgrabgrabgrabgr
3072     */
3073 
3074    struct mme_value common_mask = mme_mov(b, mme_imm(1));
3075    struct mme_value first = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3076    struct mme_value i = mme_mov(b, mme_zero());
3077 
3078    mme_while(b, ine, i, count) {
3079       /*
3080          We call NV9097_SET_CT_WRITE per attachment. It expects a value of the form:
3081          0x0000 0000 0000 0000 000a 000b 000g 000r
3082 
3083          So for i=0 a mask of
3084          0x0000 0000 0000 0000 0000 0000 0000 1111
3085          becomes
3086          0x0000 0000 0000 0000 0001 0001 0001 0001
3087       */
3088 
3089       struct mme_value val = mme_merge(b, mme_zero(), mask, 0, 1, 0);
3090       mme_merge_to(b, val, val, mask, 4, 1, 1);
3091       mme_merge_to(b, val, val, mask, 8, 1, 2);
3092       mme_merge_to(b, val, val, mask, 12, 1, 3);
3093 
3094       mme_mthd_arr(b, NV9097_SET_CT_WRITE(0), i);
3095       mme_emit(b, val);
3096       mme_free_reg(b, val);
3097 
3098       /* Check if all masks are common */
3099       struct mme_value temp = mme_and(b, mask, mme_imm(BITFIELD_RANGE(0, 4)));
3100       mme_if(b, ine, first, temp) {
3101          mme_mov_to(b, common_mask, mme_zero());
3102       }
3103       mme_free_reg(b, temp);
3104 
3105       mme_srl_to(b, mask, mask, mme_imm(4));
3106 
3107       mme_add_to(b, i, i, mme_imm(1));
3108    }
3109 
3110    mme_mthd(b, NV9097_SET_SINGLE_CT_WRITE_CONTROL);
3111    mme_emit(b, common_mask);
3112 }
3113 
3114 static void
3115 nvk_flush_cb_state(struct nvk_cmd_buffer *cmd)
3116 {
3117    struct nvk_rendering_state *render = &cmd->state.gfx.render;
3118    const struct vk_dynamic_graphics_state *dyn =
3119       &cmd->vk.dynamic_graphics_state;
3120 
3121    struct nv_push *p =
3122       nvk_cmd_buffer_push(cmd, 15 + 10 * render->color_att_count);
3123 
3124    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE))
3125       P_IMMD(p, NV9097, SET_LOGIC_OP, dyn->cb.logic_op_enable);
3126 
3127    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP)) {
3128       const uint32_t func = vk_to_nv9097_logic_op(dyn->cb.logic_op);
3129       P_IMMD(p, NV9097, SET_LOGIC_OP_FUNC, func);
3130    }
3131 
3132    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES)) {
3133       for (uint8_t a = 0; a < render->color_att_count; a++) {
3134          P_IMMD(p, NV9097, SET_BLEND(a), dyn->cb.attachments[a].blend_enable);
3135       }
3136    }
3137 
3138    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
3139       for (uint8_t a = 0; a < render->color_att_count; a++) {
3140          const struct vk_color_blend_attachment_state *att =
3141             &dyn->cb.attachments[a];
3142          P_MTHD(p, NV9097, SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(a));
3143          P_NV9097_SET_BLEND_PER_TARGET_SEPARATE_FOR_ALPHA(p, a, ENABLE_TRUE);
3144          P_NV9097_SET_BLEND_PER_TARGET_COLOR_OP(p, a,
3145                vk_to_nv9097_blend_op(att->color_blend_op));
3146          P_NV9097_SET_BLEND_PER_TARGET_COLOR_SOURCE_COEFF(p, a,
3147                vk_to_nv9097_blend_factor(att->src_color_blend_factor));
3148          P_NV9097_SET_BLEND_PER_TARGET_COLOR_DEST_COEFF(p, a,
3149                vk_to_nv9097_blend_factor(att->dst_color_blend_factor));
3150          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_OP(p, a,
3151                vk_to_nv9097_blend_op(att->alpha_blend_op));
3152          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_SOURCE_COEFF(p, a,
3153                vk_to_nv9097_blend_factor(att->src_alpha_blend_factor));
3154          P_NV9097_SET_BLEND_PER_TARGET_ALPHA_DEST_COEFF(p, a,
3155                vk_to_nv9097_blend_factor(att->dst_alpha_blend_factor));
3156       }
3157    }
3158 
3159    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
3160        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
3161        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RP_ATTACHMENTS) ||
3162        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3163       uint32_t color_write_enables = 0x0;
3164       for (uint8_t a = 0; a < render->color_att_count; a++) {
3165          if (dyn->cb.color_write_enables & BITFIELD_BIT(a))
3166             color_write_enables |= 0xf << (4 * a);
3167       }
3168 
3169       uint32_t cb_att_write_mask = 0x0;
3170       for (uint8_t a = 0; a < render->color_att_count; a++)
3171          cb_att_write_mask |= dyn->cb.attachments[a].write_mask << (a * 4);
3172 
3173       uint32_t rp_att_write_mask = 0x0;
3174       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3175          if (dyn->rp.attachments & (MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << a))
3176             rp_att_write_mask |= 0xf << (4 * a);
3177       }
3178 
3179       uint32_t att_has_loc_mask = 0x0;
3180       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3181          if (dyn->cal.color_map[a] != MESA_VK_ATTACHMENT_UNUSED)
3182             att_has_loc_mask |= 0xf << (4 * a);
3183       }
3184 
3185       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_WRITE_MASK));
3186       P_INLINE_DATA(p, render->color_att_count);
3187       P_INLINE_DATA(p, color_write_enables &
3188                        cb_att_write_mask &
3189                        rp_att_write_mask &
3190                        att_has_loc_mask);
3191    }
3192 
3193    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP)) {
3194       int8_t loc_att[NVK_MAX_RTS] = { -1, -1, -1, -1, -1, -1, -1, -1};
3195       uint8_t max_loc = 0;
3196       uint32_t att_used = 0;
3197       for (uint8_t a = 0; a < MESA_VK_MAX_COLOR_ATTACHMENTS; a++) {
3198          if (dyn->cal.color_map[a] == MESA_VK_ATTACHMENT_UNUSED)
3199             continue;
3200 
3201          att_used |= BITFIELD_BIT(a);
3202 
3203          assert(dyn->cal.color_map[a] < NVK_MAX_RTS);
3204          loc_att[dyn->cal.color_map[a]] = a;
3205          max_loc = MAX2(max_loc, dyn->cal.color_map[a]);
3206       }
3207 
3208       for (uint8_t l = 0; l < NVK_MAX_RTS; l++) {
3209          if (loc_att[l] >= 0)
3210             continue;
3211 
3212          /* Just grab any color attachment.  The way we set up color targets
3213           * in BeginRenderPass ensures that every color target is either the
3214           * valid color target referenced by this render pass or a valid NULL
3215           * target.  If we end up mapping to some other target in this render
3216           * pass, the handling of att_has_loc_mask above will ensure that no
3217           * color writes actually happen.
3218           */
3219          uint8_t a = ffs(~att_used) - 1;
3220          att_used |= BITFIELD_BIT(a);
3221          loc_att[l] = a;
3222       }
3223 
3224       P_IMMD(p, NV9097, SET_CT_SELECT, {
3225          .target_count = max_loc + 1,
3226          .target0 = loc_att[0],
3227          .target1 = loc_att[1],
3228          .target2 = loc_att[2],
3229          .target3 = loc_att[3],
3230          .target4 = loc_att[4],
3231          .target5 = loc_att[5],
3232          .target6 = loc_att[6],
3233          .target7 = loc_att[7],
3234       });
3235    }
3236 
3237    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3238       P_MTHD(p, NV9097, SET_BLEND_CONST_RED);
3239       P_NV9097_SET_BLEND_CONST_RED(p,     fui(dyn->cb.blend_constants[0]));
3240       P_NV9097_SET_BLEND_CONST_GREEN(p,   fui(dyn->cb.blend_constants[1]));
3241       P_NV9097_SET_BLEND_CONST_BLUE(p,    fui(dyn->cb.blend_constants[2]));
3242       P_NV9097_SET_BLEND_CONST_ALPHA(p,   fui(dyn->cb.blend_constants[3]));
3243    }
3244 }
3245 
3246 void
3247 nvk_cmd_flush_gfx_dynamic_state(struct nvk_cmd_buffer *cmd)
3248 {
3249    struct vk_dynamic_graphics_state *dyn =
3250       &cmd->vk.dynamic_graphics_state;
3251 
3252    if (!vk_dynamic_graphics_state_any_dirty(dyn))
3253       return;
3254 
3255    nvk_flush_vi_state(cmd);
3256    nvk_flush_ia_state(cmd);
3257    nvk_flush_ts_state(cmd);
3258    nvk_flush_vp_state(cmd);
3259    nvk_flush_rs_state(cmd);
3260    nvk_flush_fsr_state(cmd);
3261    nvk_flush_ms_state(cmd);
3262    nvk_flush_ds_state(cmd);
3263    nvk_flush_cb_state(cmd);
3264 
3265    vk_dynamic_graphics_state_clear_dirty(dyn);
3266 }
3267 
3268 void
3269 nvk_mme_bind_cbuf_desc(struct mme_builder *b)
3270 {
3271    /* Bottom 4 bits are the group, the remaining bits are the slot */
3272    struct mme_value group_slot = mme_load(b);
3273 
3274    struct mme_value addr_lo, addr_hi, size;
3275    if (nvk_use_bindless_cbuf(b->devinfo)) {
3276       if (b->devinfo->cls_eng3d >= TURING_A) {
3277          struct mme_value64 addr = mme_load_addr64(b);
3278          mme_tu104_read_fifoed(b, addr, mme_imm(2));
3279       }
3280 
3281       /* Load the descriptor */
3282       struct mme_value desc_lo = mme_load(b);
3283       struct mme_value desc_hi = mme_load(b);
3284 
3285       /* The bottom 45 bits are addr >> 4 */
3286       addr_lo = mme_merge(b, mme_zero(), desc_lo, 4, 28, 0);
3287       addr_hi = mme_merge(b, mme_zero(), desc_lo, 0, 4, 28);
3288       mme_merge_to(b, addr_hi, addr_hi, desc_hi, 4, 13, 0);
3289 
3290       /* The top 19 bits are size >> 4 */
3291       size = mme_merge(b, mme_zero(), desc_hi, 4, 19, 13);
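      /* Illustrative layout of the bindless cbuf descriptor decoded above:
       *   bits [44:0]  : base address >> 4
       *   bits [63:45] : size >> 4
       * The merges reassemble both fields shifted left by 4 bits.
       */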
3292 
3293       mme_free_reg(b, desc_hi);
3294       mme_free_reg(b, desc_lo);
3295    } else {
3296       if (b->devinfo->cls_eng3d >= TURING_A) {
3297          struct mme_value64 addr = mme_load_addr64(b);
3298          mme_tu104_read_fifoed(b, addr, mme_imm(3));
3299       }
3300 
3301       /* Load the descriptor */
3302       addr_lo = mme_load(b);
3303       addr_hi = mme_load(b);
3304       size = mme_load(b);
3305    }
3306 
3307    struct mme_value cb = mme_alloc_reg(b);
3308    mme_if(b, ieq, size, mme_zero()) {
3309       /* Bottom bit is the valid bit, 8:4 are shader slot */
3310       mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
3311    }
3312 
3313    mme_if(b, ine, size, mme_zero()) {
3314       /* size = min(size, NVK_MAX_CBUF_SIZE) */
3315       assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
3316       struct mme_value is_large =
3317          mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
3318       mme_if(b, ine, is_large, mme_zero()) {
3319          mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
3320       }
3321 
3322       mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
3323       mme_emit(b, size);
3324       mme_emit(b, addr_hi);
3325       mme_emit(b, addr_lo);
3326 
3327       /* Bottom bit is the valid bit, 8:4 are shader slot */
3328       mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
3329    }
3330 
3331    mme_free_reg(b, addr_hi);
3332    mme_free_reg(b, addr_lo);
3333    mme_free_reg(b, size);
3334 
3335    /* The group comes in the bottom 4 bits in group_slot and we need to
3336     * combine it with the method.  However, unlike most array methods with a
3337     * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
3338     * dwords.  This means we need to also shift by 3.
3339     */
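   /* For example, group 2 maps to dword offset 2 * 8 = 16, which the merge
    * below produces by placing group_slot[3:0] at bit 3 of the method offset.
    */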
3340    struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
3341    mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
3342    mme_emit(b, cb);
3343 }
3344 
3345 void
3346 nvk_cmd_flush_gfx_cbufs(struct nvk_cmd_buffer *cmd)
3347 {
3348    struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
3349    struct nvk_physical_device *pdev = nvk_device_physical(dev);
3350    const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
3351    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3352 
3353    /* Find cbuf maps for the 5 cbuf groups */
3354    const struct nvk_shader *cbuf_shaders[5] = { NULL, };
3355    for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
3356       const struct nvk_shader *shader = cmd->state.gfx.shaders[stage];
3357       if (shader == NULL)
3358          continue;
3359 
3360       uint32_t group = nvk_cbuf_binding_for_stage(stage);
3361       assert(group < ARRAY_SIZE(cbuf_shaders));
3362       cbuf_shaders[group] = shader;
3363    }
3364 
3365    bool bound_any_cbuf = false;
3366    for (uint32_t g = 0; g < ARRAY_SIZE(cbuf_shaders); g++) {
3367       if (cbuf_shaders[g] == NULL)
3368          continue;
3369 
3370       const struct nvk_shader *shader = cbuf_shaders[g];
3371       const struct nvk_cbuf_map *cbuf_map = &shader->cbuf_map;
3372       struct nvk_cbuf_group *group = &cmd->state.gfx.cbuf_groups[g];
3373 
3374       /* We only bother to re-bind cbufs that are in use */
3375       const uint32_t rebind =
3376          group->dirty & BITFIELD_MASK(cbuf_map->cbuf_count);
3377       if (!rebind)
3378          continue;
3379 
3380       u_foreach_bit(c, rebind) {
3381          const struct nvk_cbuf *cbuf = &group->cbufs[c];
3382 
3383          /* We bind these at the very end */
3384          if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC)
3385             continue;
3386 
3387          bound_any_cbuf = true;
3388 
3389          struct nvk_buffer_address ba;
3390          if (nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba)) {
3391             assert(ba.base_addr % min_cbuf_alignment == 0);
3392             ba.size = align(ba.size, min_cbuf_alignment);
3393             ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
3394 
3395             struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3396 
3397             if (ba.size > 0) {
3398                P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
3399                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
3400                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
3401                P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
3402             }
3403 
3404             P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(g), {
3405                .valid = ba.size > 0,
3406                .shader_slot = c,
3407             });
3408          } else {
3409             uint64_t desc_addr =
3410                nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
3411 
3412             if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
3413                struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
3414 
3415                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3416                P_INLINE_DATA(p, g | (c << 4));
3417                P_INLINE_DATA(p, desc_addr >> 32);
3418                P_INLINE_DATA(p, desc_addr);
3419             } else {
3420                struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3421 
3422                P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
3423                P_INLINE_DATA(p, g | (c << 4));
3424 
3425                nv_push_update_count(p, 3);
3426                nvk_cmd_buffer_push_indirect(cmd, desc_addr, 12);
3427             }
3428          }
3429       }
3430 
3431       group->dirty &= ~rebind;
3432    }
3433 
3434    /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
3435     * always left pointing at the root descriptor table.  This way draw
3436     * parameters and similar MME root table updates always hit the root
3437     * descriptor table and not some random UBO.
3438     */
3439    if (bound_any_cbuf) {
3440       struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
3441       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SELECT_CB0));
3442       P_INLINE_DATA(p, 0);
3443    }
3444 }
3445 
3446 static void
3447 nvk_cmd_flush_gfx_state(struct nvk_cmd_buffer *cmd)
3448 {
3449    nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
3450    nvk_cmd_flush_gfx_dynamic_state(cmd);
3451    nvk_cmd_flush_gfx_shaders(cmd);
3452    nvk_cmd_flush_gfx_cbufs(cmd);
3453 }
3454 
3455 void
3456 nvk_mme_bind_ib(struct mme_builder *b)
3457 {
3458    struct mme_value64 addr = mme_load_addr64(b);
3459    struct mme_value size_B = mme_load(b);
3460 
3461    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3462    mme_if(b, ieq, addr_or, mme_zero()) {
3463       mme_mov_to(b, size_B, mme_zero());
3464    }
3465    mme_free_reg(b, addr_or);
3466 
3467    if (b->devinfo->cls_eng3d < TURING_A) {
3468       mme_if(b, ieq, size_B, mme_zero()) {
3469          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3470          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3471       }
3472    }
3473 
3474    mme_mthd(b, NV9097_SET_INDEX_BUFFER_A);
3475    mme_emit(b, addr.hi);
3476    mme_emit(b, addr.lo);
3477 
3478    if (b->devinfo->cls_eng3d >= TURING_A) {
3479       mme_mthd(b, NVC597_SET_INDEX_BUFFER_SIZE_A);
3480       mme_emit(b, mme_zero());
3481       mme_emit(b, size_B);
3482    } else {
3483       /* Convert to an end address */
3484       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3485       mme_add64_to(b, addr, addr, mme_imm64(-1));
3486 
3487       /* These emits continue to NV9097_SET_INDEX_BUFFER_C/_D by auto-increment */
3488       mme_emit(b, addr.hi);
3489       mme_emit(b, addr.lo);
3490    }
3491    mme_free_reg64(b, addr);
3492    mme_free_reg(b, size_B);
3493 
3494    struct mme_value fmt = mme_load(b);
3495    struct mme_value restart = mme_mov(b, mme_imm(UINT32_MAX));
3496    struct mme_value index_type = mme_mov(b,
3497       mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_FOUR_BYTES));
3498 
3499    /* The Vulkan and D3D enums don't overlap so we can handle both at the same
3500     * time with one MME macro.
3501     */
3502    UNUSED static const uint32_t DXGI_FORMAT_R32_UINT = 42;
3503    static const uint32_t DXGI_FORMAT_R16_UINT = 57;
3504    static const uint32_t DXGI_FORMAT_R8_UINT = 62;
3505 
3506    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT16)) {
3507       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3508       mme_mov_to(b, index_type,
3509                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3510    }
3511 
3512    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R16_UINT)) {
3513       mme_mov_to(b, restart, mme_imm(UINT16_MAX));
3514       mme_mov_to(b, index_type,
3515                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_TWO_BYTES));
3516    }
3517 
3518    mme_if(b, ieq, fmt, mme_imm(VK_INDEX_TYPE_UINT8_KHR)) {
3519       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3520       mme_mov_to(b, index_type,
3521                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3522    }
3523 
3524    mme_if(b, ieq, fmt, mme_imm(DXGI_FORMAT_R8_UINT)) {
3525       mme_mov_to(b, restart, mme_imm(UINT8_MAX));
3526       mme_mov_to(b, index_type,
3527                  mme_imm(NVC597_SET_INDEX_BUFFER_E_INDEX_SIZE_ONE_BYTE));
3528    }
3529 
3530    mme_mthd(b, NV9097_SET_DA_PRIMITIVE_RESTART_INDEX);
3531    mme_emit(b, restart);
3532 
3533    mme_mthd(b, NV9097_SET_INDEX_BUFFER_E);
3534    mme_emit(b, index_type);
3535 }
3536 
3537 VKAPI_ATTR void VKAPI_CALL
3538 nvk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3539                            VkBuffer _buffer,
3540                            VkDeviceSize offset,
3541                            VkDeviceSize size,
3542                            VkIndexType indexType)
3543 {
3544    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3545    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
3546    struct nvk_addr_range addr_range =
3547       nvk_buffer_addr_range(buffer, offset, size);
3548 
3549    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3550    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB));
3551    P_INLINE_DATA(p, addr_range.addr >> 32);
3552    P_INLINE_DATA(p, addr_range.addr);
3553    assert(addr_range.range <= UINT32_MAX);
3554    P_INLINE_DATA(p, addr_range.range);
3555    P_INLINE_DATA(p, indexType);
3556 }
3557 
3558 void
3559 nvk_mme_bind_vb(struct mme_builder *b)
3560 {
3561    struct mme_value vb_idx = mme_load(b);
3562    struct mme_value64 addr = mme_load_addr64(b);
3563    struct mme_value size_B = mme_load(b);
3564 
3565    struct mme_value addr_or = mme_or(b, addr.lo, addr.hi);
3566    mme_if(b, ieq, addr_or, mme_zero()) {
3567       mme_mov_to(b, size_B, mme_zero());
3568    }
3569    mme_free_reg(b, addr_or);
3570 
3571    if (b->devinfo->cls_eng3d < TURING_A) {
3572       mme_if(b, ieq, size_B, mme_zero()) {
3573          nvk_mme_load_scratch_to(b, addr.hi, ZERO_ADDR_HI);
3574          nvk_mme_load_scratch_to(b, addr.lo, ZERO_ADDR_LO);
3575       }
3576    }
3577 
3578    struct mme_value vb_idx4 = mme_sll(b, vb_idx, mme_imm(2));
3579    mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_A_LOCATION_A(0), vb_idx4);
3580    mme_free_reg(b, vb_idx4);
3581    mme_emit(b, addr.hi);
3582    mme_emit(b, addr.lo);
3583 
3584    if (b->devinfo->cls_eng3d >= TURING_A) {
3585       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3586       mme_mthd_arr(b, NVC597_SET_VERTEX_STREAM_SIZE_A(0), vb_idx2);
3587       mme_emit(b, mme_zero());
3588       mme_emit(b, size_B);
3589    } else {
3590       /* Convert to an end address */
3591       mme_add64_to(b, addr, addr, mme_value64(size_B, mme_zero()));
3592       mme_add64_to(b, addr, addr, mme_imm64(-1));
3593 
3594       struct mme_value vb_idx2 = mme_sll(b, vb_idx, mme_imm(1));
3595       mme_mthd_arr(b, NV9097_SET_VERTEX_STREAM_LIMIT_A_A(0), vb_idx2);
3596       mme_emit(b, addr.hi);
3597       mme_emit(b, addr.lo);
3598    }
3599 }
3600 
3601 static void
3602 nvk_mme_bind_vb_test_check(const struct nv_device_info *devinfo,
3603                            const struct nvk_mme_test_case *test,
3604                            const struct nvk_mme_mthd_data *results)
3605 {
3606    const uint32_t vb_idx = test->params[0];
3607    const uint32_t addr_hi = test->params[1];
3608    const uint32_t addr_lo = test->params[2];
3609 
3610    uint32_t size_B = test->params[3];
3611    if (addr_hi == 0 && addr_lo == 0)
3612       size_B = 0;
3613 
3614    assert(results[0].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_A(vb_idx));
3615    assert(results[1].mthd == NV9097_SET_VERTEX_STREAM_A_LOCATION_B(vb_idx));
3616 
3617    if (devinfo->cls_eng3d >= TURING_A) {
3618       assert(results[0].data == addr_hi);
3619       assert(results[1].data == addr_lo);
3620 
3621       assert(results[2].mthd == NVC597_SET_VERTEX_STREAM_SIZE_A(3));
3622       assert(results[3].mthd == NVC597_SET_VERTEX_STREAM_SIZE_B(3));
3623       assert(results[2].data == 0);
3624       assert(results[3].data == size_B);
3625    } else {
3626       uint64_t addr = ((uint64_t)addr_hi << 32) | addr_lo;
3627       if (size_B == 0)
3628          addr = ((uint64_t)test->init[0].data << 32) | test->init[1].data;
3629 
3630       assert(results[0].data == addr >> 32);
3631       assert(results[1].data == (uint32_t)addr);
3632 
3633       const uint64_t limit = (addr + size_B) - 1;
3634       assert(results[2].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_A(3));
3635       assert(results[3].mthd == NV9097_SET_VERTEX_STREAM_LIMIT_A_B(3));
3636       assert(results[2].data == limit >> 32);
3637       assert(results[3].data == (uint32_t)limit);
3638    }
3639 }
3640 
3641 const struct nvk_mme_test_case nvk_mme_bind_vb_tests[] = {{
3642    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0x10000 },
3643    .check = nvk_mme_bind_vb_test_check,
3644 }, {
3645    .init = (struct nvk_mme_mthd_data[]) {
3646       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3647       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3648       { }
3649    },
3650    .params = (uint32_t[]) { 3, 0xff3, 0xff4ab000, 0 },
3651    .check = nvk_mme_bind_vb_test_check,
3652 }, {
3653    .init = (struct nvk_mme_mthd_data[]) {
3654       { NVK_SET_MME_SCRATCH(ZERO_ADDR_HI), 0xff3 },
3655       { NVK_SET_MME_SCRATCH(ZERO_ADDR_LO), 0xff356000 },
3656       { }
3657    },
3658    .params = (uint32_t[]) { 3, 0, 0, 0x800 },
3659    .check = nvk_mme_bind_vb_test_check,
3660 }, {}};
3661 
3662 void
3663 nvk_cmd_bind_vertex_buffer(struct nvk_cmd_buffer *cmd, uint32_t vb_idx,
3664                            struct nvk_addr_range addr_range)
3665 {
3666    /* Used for meta save/restore */
3667    if (vb_idx == 0)
3668       cmd->state.gfx.vb0 = addr_range;
3669 
3670    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
3671    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB));
3672    P_INLINE_DATA(p, vb_idx);
3673    P_INLINE_DATA(p, addr_range.addr >> 32);
3674    P_INLINE_DATA(p, addr_range.addr);
3675    assert(addr_range.range <= UINT32_MAX);
3676    P_INLINE_DATA(p, addr_range.range);
3677 }
3678 
3679 VKAPI_ATTR void VKAPI_CALL
3680 nvk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3681                           uint32_t firstBinding,
3682                           uint32_t bindingCount,
3683                           const VkBuffer *pBuffers,
3684                           const VkDeviceSize *pOffsets,
3685                           const VkDeviceSize *pSizes,
3686                           const VkDeviceSize *pStrides)
3687 {
3688    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3689 
3690    if (pStrides) {
3691       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding,
3692                                         bindingCount, pStrides);
3693    }
3694 
3695    for (uint32_t i = 0; i < bindingCount; i++) {
3696       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
3697       uint32_t idx = firstBinding + i;
3698 
3699       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3700       const struct nvk_addr_range addr_range =
3701          nvk_buffer_addr_range(buffer, pOffsets[i], size);
3702 
3703       nvk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3704    }
3705 }
3706 
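/* Sets `val` both on the given 3D method and in cb0 at `cb0_offset`; this
 * assumes CONSTANT_BUFFER_SELECTOR still points at the root descriptor table
 * (see nvk_cmd_flush_gfx_cbufs()).  On Turing+, the current method value is
 * read back first so redundant updates can be skipped.
 */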
3707 static void
3708 nvk_mme_set_cb0_mthd(struct mme_builder *b,
3709                      uint16_t cb0_offset,
3710                      uint16_t mthd,
3711                      struct mme_value val)
3712 {
3713    if (b->devinfo->cls_eng3d >= TURING_A) {
3714       struct mme_value old = mme_state(b, mthd);
3715       mme_if(b, ine, old, val) {
3716          mme_mthd(b, mthd);
3717          mme_emit(b, val);
3718 
3719          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3720          mme_emit(b, mme_imm(cb0_offset));
3721          mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3722          mme_emit(b, val);
3723       }
3724       mme_free_reg(b, old);
3725    } else {
3726       /* Fermi is really tight on registers. Don't bother with the if and set
3727        * both unconditionally for now.
3728        */
3729       mme_mthd(b, mthd);
3730       mme_emit(b, val);
3731 
3732       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER_OFFSET);
3733       mme_emit(b, mme_imm(cb0_offset));
3734       mme_mthd(b, NV9097_LOAD_CONSTANT_BUFFER(0));
3735       mme_emit(b, val);
3736    }
3737 }
3738 
3739 static void
3740 nvk_mme_set_cb0_scratch(struct mme_builder *b,
3741                         uint16_t cb0_offset,
3742                         enum nvk_mme_scratch scratch,
3743                         struct mme_value val)
3744 {
3745    const uint16_t mthd = NV9097_SET_MME_SHADOW_SCRATCH(scratch);
3746    nvk_mme_set_cb0_mthd(b, cb0_offset, mthd, val);
3747 }
3748 
3749 struct mme_draw_params {
3750    struct mme_value base_vertex;
3751    struct mme_value first_vertex;
3752    struct mme_value first_instance;
3753    struct mme_value draw_index;
3754 };
3755 
3756 static void
3757 nvk_mme_build_set_draw_params(struct mme_builder *b,
3758                               const struct mme_draw_params *p)
3759 {
3760    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.base_vertex),
3761                            NVK_MME_SCRATCH_CB0_FIRST_VERTEX,
3762                            p->first_vertex);
3763    nvk_mme_set_cb0_mthd(b, nvk_root_descriptor_offset(draw.base_instance),
3764                         NV9097_SET_GLOBAL_BASE_INSTANCE_INDEX,
3765                         p->first_instance);
3766    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.draw_index),
3767                            NVK_MME_SCRATCH_CB0_DRAW_INDEX,
3768                            p->draw_index);
3769    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3770                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3771                            mme_zero());
3772 
3773    mme_mthd(b, NV9097_SET_GLOBAL_BASE_VERTEX_INDEX);
3774    mme_emit(b, p->base_vertex);
3775    mme_mthd(b, NV9097_SET_VERTEX_ID_BASE);
3776    mme_emit(b, p->base_vertex);
3777 }
3778 
3779 static void
3780 nvk_mme_emit_view_index(struct mme_builder *b, struct mme_value view_index)
3781 {
3782    /* Set the push constant */
3783    nvk_mme_set_cb0_scratch(b, nvk_root_descriptor_offset(draw.view_index),
3784                            NVK_MME_SCRATCH_CB0_VIEW_INDEX,
3785                            view_index);
3786 
3787    /* Set the layer to the view index */
3788    STATIC_ASSERT(DRF_LO(NV9097_SET_RT_LAYER_V) == 0);
3789    STATIC_ASSERT(NV9097_SET_RT_LAYER_CONTROL_V_SELECTS_LAYER == 0);
3790    mme_mthd(b, NV9097_SET_RT_LAYER);
3791    mme_emit(b, view_index);
3792 }
3793 
3794 static void
3795 nvk_mme_build_draw_loop(struct mme_builder *b,
3796                         struct mme_value instance_count,
3797                         struct mme_value first_vertex,
3798                         struct mme_value vertex_count)
3799 {
3800    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3801 
3802    mme_loop(b, instance_count) {
3803       mme_mthd(b, NV9097_BEGIN);
3804       mme_emit(b, begin);
3805 
3806       mme_mthd(b, NV9097_SET_VERTEX_ARRAY_START);
3807       mme_emit(b, first_vertex);
3808       mme_emit(b, vertex_count);
3809 
3810       mme_mthd(b, NV9097_END);
3811       mme_emit(b, mme_zero());
3812 
3813       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3814    }
3815 
3816    mme_free_reg(b, begin);
3817 }
3818 
3819 static void
3820 nvk_mme_build_draw(struct mme_builder *b,
3821                    struct mme_value draw_index)
3822 {
3823    /* These are in VkDrawIndirectCommand order */
3824    struct mme_value vertex_count = mme_load(b);
3825    struct mme_value instance_count = mme_load(b);
3826    struct mme_value first_vertex = mme_load(b);
3827    struct mme_value first_instance = mme_load(b);
3828 
3829    struct mme_draw_params params = {
3830       .first_vertex = first_vertex,
3831       .first_instance = first_instance,
3832       .draw_index = draw_index,
3833    };
3834    nvk_mme_build_set_draw_params(b, &params);
3835 
3836    mme_free_reg(b, first_instance);
3837 
3838    if (b->devinfo->cls_eng3d < TURING_A)
3839       nvk_mme_spill(b, DRAW_IDX, draw_index);
3840 
3841    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3842    mme_if(b, ieq, view_mask, mme_zero()) {
3843       mme_free_reg(b, view_mask);
3844 
3845       nvk_mme_build_draw_loop(b, instance_count,
3846                               first_vertex, vertex_count);
3847    }
3848 
3849    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3850    mme_if(b, ine, view_mask, mme_zero()) {
3851       mme_free_reg(b, view_mask);
3852 
3853       struct mme_value view = mme_mov(b, mme_zero());
3854       mme_while(b, ine, view, mme_imm(32)) {
3855          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3856          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3857          mme_free_reg(b, view_mask);
3858          mme_if(b, ine, has_view, mme_zero()) {
3859             mme_free_reg(b, has_view);
3860             nvk_mme_emit_view_index(b, view);
3861             nvk_mme_build_draw_loop(b, instance_count,
3862                                     first_vertex, vertex_count);
3863          }
3864 
3865          mme_add_to(b, view, view, mme_imm(1));
3866       }
3867       mme_free_reg(b, view);
3868    }
3869 
3870    mme_free_reg(b, instance_count);
3871    mme_free_reg(b, first_vertex);
3872    mme_free_reg(b, vertex_count);
3873 
3874    if (b->devinfo->cls_eng3d < TURING_A)
3875       nvk_mme_unspill(b, DRAW_IDX, draw_index);
3876 }
3877 
3878 void
3879 nvk_mme_draw(struct mme_builder *b)
3880 {
3881    struct mme_value draw_index = mme_load(b);
3882    nvk_mme_build_draw(b, draw_index);
3883 }
3884 
3885 VKAPI_ATTR void VKAPI_CALL
3886 nvk_CmdDraw(VkCommandBuffer commandBuffer,
3887             uint32_t vertexCount,
3888             uint32_t instanceCount,
3889             uint32_t firstVertex,
3890             uint32_t firstInstance)
3891 {
3892    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3893 
3894    nvk_cmd_flush_gfx_state(cmd);
3895 
3896    struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3897    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3898    P_INLINE_DATA(p, 0 /* draw_index */);
3899    P_INLINE_DATA(p, vertexCount);
3900    P_INLINE_DATA(p, instanceCount);
3901    P_INLINE_DATA(p, firstVertex);
3902    P_INLINE_DATA(p, firstInstance);
3903 }
3904 
3905 VKAPI_ATTR void VKAPI_CALL
3906 nvk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3907                     uint32_t drawCount,
3908                     const VkMultiDrawInfoEXT *pVertexInfo,
3909                     uint32_t instanceCount,
3910                     uint32_t firstInstance,
3911                     uint32_t stride)
3912 {
3913    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
3914 
3915    nvk_cmd_flush_gfx_state(cmd);
3916 
3917    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
3918       struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
3919       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW));
3920       P_INLINE_DATA(p, draw_index);
3921       P_INLINE_DATA(p, pVertexInfo->vertexCount);
3922       P_INLINE_DATA(p, instanceCount);
3923       P_INLINE_DATA(p, pVertexInfo->firstVertex);
3924       P_INLINE_DATA(p, firstInstance);
3925 
3926       pVertexInfo = ((void *)pVertexInfo) + stride;
3927    }
3928 }
3929 
3930 static void
3931 nvk_mme_build_draw_indexed_loop(struct mme_builder *b,
3932                                 struct mme_value instance_count,
3933                                 struct mme_value first_index,
3934                                 struct mme_value index_count)
3935 {
3936    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
3937 
3938    mme_loop(b, instance_count) {
3939       mme_mthd(b, NV9097_BEGIN);
3940       mme_emit(b, begin);
3941 
3942       mme_mthd(b, NV9097_SET_INDEX_BUFFER_F);
3943       mme_emit(b, first_index);
3944       mme_emit(b, index_count);
3945 
3946       mme_mthd(b, NV9097_END);
3947       mme_emit(b, mme_zero());
3948 
3949       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
3950    }
3951 
3952    mme_free_reg(b, begin);
3953 }
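
/* Instancing is implemented as one BEGIN/END pair per instance.  At the end
 * of each iteration the BEGIN value's INSTANCE_ID field is switched to
 * SUBSEQUENT so that, from the second instance on, the hardware increments
 * the instance ID instead of resetting it.  Roughly (illustrative sketch,
 * assuming the DRAW_BEGIN scratch value starts out with INSTANCE_ID = FIRST):
 *
 *    for (uint32_t i = 0; i < instance_count; i++) {
 *       BEGIN(begin);                            // FIRST only when i == 0
 *       SET_INDEX_BUFFER_F(first_index, index_count);
 *       END();
 *       set_field(&begin, INSTANCE_ID, SUBSEQUENT);
 *    }
 */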
3954 
3955 static void
3956 nvk_mme_build_draw_indexed(struct mme_builder *b,
3957                            struct mme_value draw_index)
3958 {
3959    /* These are in VkDrawIndexedIndirectCommand order */
3960    struct mme_value index_count = mme_load(b);
3961    struct mme_value instance_count = mme_load(b);
3962    struct mme_value first_index = mme_load(b);
3963    struct mme_value vertex_offset = mme_load(b);
3964    struct mme_value first_instance = mme_load(b);
3965 
3966    struct mme_draw_params params = {
3967       .base_vertex = vertex_offset,
3968       .first_vertex = vertex_offset,
3969       .first_instance = first_instance,
3970       .draw_index = draw_index,
3971    };
3972    nvk_mme_build_set_draw_params(b, &params);
3973 
3974    mme_free_reg(b, vertex_offset);
3975    mme_free_reg(b, first_instance);
3976 
3977    if (b->devinfo->cls_eng3d < TURING_A)
3978       nvk_mme_spill(b, DRAW_IDX, draw_index);
3979 
3980    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3981    mme_if(b, ieq, view_mask, mme_zero()) {
3982       mme_free_reg(b, view_mask);
3983 
3984       nvk_mme_build_draw_indexed_loop(b, instance_count,
3985                                       first_index, index_count);
3986    }
3987 
3988    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3989    mme_if(b, ine, view_mask, mme_zero()) {
3990       mme_free_reg(b, view_mask);
3991 
3992       struct mme_value view = mme_mov(b, mme_zero());
3993       mme_while(b, ine, view, mme_imm(32)) {
3994          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
3995          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
3996          mme_free_reg(b, view_mask);
3997          mme_if(b, ine, has_view, mme_zero()) {
3998             mme_free_reg(b, has_view);
3999             nvk_mme_emit_view_index(b, view);
4000             nvk_mme_build_draw_indexed_loop(b, instance_count,
4001                                             first_index, index_count);
4002          }
4003 
4004          mme_add_to(b, view, view, mme_imm(1));
4005       }
4006       mme_free_reg(b, view);
4007    }
4008 
4009    mme_free_reg(b, instance_count);
4010    mme_free_reg(b, first_index);
4011    mme_free_reg(b, index_count);
4012 
4013    if (b->devinfo->cls_eng3d < TURING_A)
4014       nvk_mme_unspill(b, DRAW_IDX, draw_index);
4015 }
4016 
4017 void
4018 nvk_mme_draw_indexed(struct mme_builder *b)
4019 {
4020    struct mme_value draw_index = mme_load(b);
4021    nvk_mme_build_draw_indexed(b, draw_index);
4022 }
4023 
4024 VKAPI_ATTR void VKAPI_CALL
4025 nvk_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4026                    uint32_t indexCount,
4027                    uint32_t instanceCount,
4028                    uint32_t firstIndex,
4029                    int32_t vertexOffset,
4030                    uint32_t firstInstance)
4031 {
4032    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4033 
4034    nvk_cmd_flush_gfx_state(cmd);
4035 
4036    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4037    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4038    P_INLINE_DATA(p, 0 /* draw_index */);
4039    P_INLINE_DATA(p, indexCount);
4040    P_INLINE_DATA(p, instanceCount);
4041    P_INLINE_DATA(p, firstIndex);
4042    P_INLINE_DATA(p, vertexOffset);
4043    P_INLINE_DATA(p, firstInstance);
4044 }
4045 
4046 VKAPI_ATTR void VKAPI_CALL
4047 nvk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
4048                            uint32_t drawCount,
4049                            const VkMultiDrawIndexedInfoEXT *pIndexInfo,
4050                            uint32_t instanceCount,
4051                            uint32_t firstInstance,
4052                            uint32_t stride,
4053                            const int32_t *pVertexOffset)
4054 {
4055    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4056 
4057    nvk_cmd_flush_gfx_state(cmd);
4058 
4059    for (uint32_t draw_index = 0; draw_index < drawCount; draw_index++) {
4060       const uint32_t vertex_offset =
4061          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
4062 
4063       struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4064       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED));
4065       P_INLINE_DATA(p, draw_index);
4066       P_INLINE_DATA(p, pIndexInfo->indexCount);
4067       P_INLINE_DATA(p, instanceCount);
4068       P_INLINE_DATA(p, pIndexInfo->firstIndex);
4069       P_INLINE_DATA(p, vertex_offset);
4070       P_INLINE_DATA(p, firstInstance);
4071 
4072       pIndexInfo = ((void *)pIndexInfo) + stride;
4073    }
4074 }
4075 
4076 void
4077 nvk_mme_draw_indirect(struct mme_builder *b)
4078 {
4079    if (b->devinfo->cls_eng3d >= TURING_A) {
4080       struct mme_value64 draw_addr = mme_load_addr64(b);
4081       struct mme_value draw_count = mme_load(b);
4082       struct mme_value stride = mme_load(b);
4083 
4084       struct mme_value draw = mme_mov(b, mme_zero());
4085       mme_while(b, ult, draw, draw_count) {
4086          mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4087 
4088          nvk_mme_build_draw(b, draw);
4089 
4090          mme_add_to(b, draw, draw, mme_imm(1));
4091          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4092       }
4093    } else {
4094       struct mme_value draw_count = mme_load(b);
4095       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
4096 
4097       struct mme_value draw = mme_mov(b, mme_zero());
4098       mme_while(b, ine, draw, draw_count) {
4099          nvk_mme_spill(b, DRAW_COUNT, draw_count);
4100 
4101          nvk_mme_build_draw(b, draw);
4102          mme_add_to(b, draw, draw, mme_imm(1));
4103 
4104          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4105          mme_loop(b, pad_dw) {
4106             mme_free_reg(b, mme_load(b));
4107          }
4108          mme_free_reg(b, pad_dw);
4109 
4110          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4111       }
4112    }
4113 }
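
/* Two very different strategies, depending on whether the MME can read
 * memory.  On Turing+, mme_tu104_read_fifoed() pulls the four dwords of each
 * VkDrawIndirectCommand from draw_addr into the MME data FIFO, where the
 * mme_load()s in nvk_mme_build_draw() consume them, and draw_addr is advanced
 * by the application-provided stride.  Pre-Turing MMEs cannot read memory, so
 * nvk_CmdDrawIndirect() below copies the indirect buffer contents straight
 * into the pushbuf behind the macro call instead; DRAW_PAD_DW holds the
 * number of padding dwords between consecutive records,
 * (stride - sizeof(VkDrawIndirectCommand)) / 4, which the inner mme_loop()
 * simply loads and discards.
 */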
4114 
4115 VKAPI_ATTR void VKAPI_CALL
4116 nvk_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4117                     VkBuffer _buffer,
4118                     VkDeviceSize offset,
4119                     uint32_t drawCount,
4120                     uint32_t stride)
4121 {
4122    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4123    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4124 
4125    /* From the Vulkan 1.3.238 spec:
4126     *
4127     *    VUID-vkCmdDrawIndirect-drawCount-00476
4128     *
4129     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
4130     *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
4131     *
4132     * and
4133     *
4134     *    "If drawCount is less than or equal to one, stride is ignored."
4135     */
4136    if (drawCount > 1) {
4137       assert(stride % 4 == 0);
4138       assert(stride >= sizeof(VkDrawIndirectCommand));
4139    } else {
4140       stride = sizeof(VkDrawIndirectCommand);
4141    }
4142 
4143    nvk_cmd_flush_gfx_state(cmd);
4144 
4145    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4146       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4147       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4148       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4149       P_INLINE_DATA(p, draw_addr >> 32);
4150       P_INLINE_DATA(p, draw_addr);
4151       P_INLINE_DATA(p, drawCount);
4152       P_INLINE_DATA(p, stride);
4153    } else {
4154       const uint32_t max_draws_per_push =
4155          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
4156 
4157       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4158       while (drawCount) {
4159          const uint32_t count = MIN2(drawCount, max_draws_per_push);
4160 
4161          struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4162          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT));
4163          P_INLINE_DATA(p, count);
4164          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndirectCommand)) / 4);
4165 
4166          uint64_t range = count * (uint64_t)stride;
4167          nv_push_update_count(p, range / 4);
4168          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4169 
4170          draw_addr += range;
4171          drawCount -= count;
4172       }
4173    }
4174 }
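
/* Pre-Turing, the indirect data travels inline in the pushbuf and a single
 * method header can only cover NV_PUSH_MAX_COUNT dwords, so large draw counts
 * are split into batches of max_draws_per_push.  As a worked example with the
 * tightly-packed stride of sizeof(VkDrawIndirectCommand) = 16 bytes, each
 * record costs 4 dwords, so roughly (NV_PUSH_MAX_COUNT - 3) / 4 draws fit in
 * one push; the 3 dwords subtracted are assumed here to cover the macro-call
 * overhead (the count and pad parameters), which this file does not spell out
 * explicitly.
 */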
4175 
4176 void
4177 nvk_mme_draw_indexed_indirect(struct mme_builder *b)
4178 {
4179    if (b->devinfo->cls_eng3d >= TURING_A) {
4180       struct mme_value64 draw_addr = mme_load_addr64(b);
4181       struct mme_value draw_count = mme_load(b);
4182       struct mme_value stride = mme_load(b);
4183 
4184       struct mme_value draw = mme_mov(b, mme_zero());
4185       mme_while(b, ult, draw, draw_count) {
4186          mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4187 
4188          nvk_mme_build_draw_indexed(b, draw);
4189 
4190          mme_add_to(b, draw, draw, mme_imm(1));
4191          mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4192       }
4193    } else {
4194       struct mme_value draw_count = mme_load(b);
4195       nvk_mme_load_to_scratch(b, DRAW_PAD_DW);
4196 
4197       struct mme_value draw = mme_mov(b, mme_zero());
4198       mme_while(b, ine, draw, draw_count) {
4199          nvk_mme_spill(b, DRAW_COUNT, draw_count);
4200 
4201          nvk_mme_build_draw_indexed(b, draw);
4202          mme_add_to(b, draw, draw, mme_imm(1));
4203 
4204          struct mme_value pad_dw = nvk_mme_load_scratch(b, DRAW_PAD_DW);
4205          mme_loop(b, pad_dw) {
4206             mme_free_reg(b, mme_load(b));
4207          }
4208          mme_free_reg(b, pad_dw);
4209 
4210          nvk_mme_unspill(b, DRAW_COUNT, draw_count);
4211       }
4212    }
4213 }
4214 
4215 VKAPI_ATTR void VKAPI_CALL
4216 nvk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
4217                            VkBuffer _buffer,
4218                            VkDeviceSize offset,
4219                            uint32_t drawCount,
4220                            uint32_t stride)
4221 {
4222    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4223    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4224 
4225    /* From the Vulkan 1.3.238 spec:
4226     *
4227     *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
4228     *
4229     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
4230     *    must be greater than or equal to sizeof(VkDrawIndexedIndirectCommand)"
4231     *
4232     * and
4233     *
4234     *    "If drawCount is less than or equal to one, stride is ignored."
4235     */
4236    if (drawCount > 1) {
4237       assert(stride % 4 == 0);
4238       assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
4239    } else {
4240       stride = sizeof(VkDrawIndexedIndirectCommand);
4241    }
4242 
4243    nvk_cmd_flush_gfx_state(cmd);
4244 
4245    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4246       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4247       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4248       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4249       P_INLINE_DATA(p, draw_addr >> 32);
4250       P_INLINE_DATA(p, draw_addr);
4251       P_INLINE_DATA(p, drawCount);
4252       P_INLINE_DATA(p, stride);
4253    } else {
4254       const uint32_t max_draws_per_push =
4255          ((NV_PUSH_MAX_COUNT - 3) * 4) / stride;
4256 
4257       uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4258       while (drawCount) {
4259          const uint32_t count = MIN2(drawCount, max_draws_per_push);
4260 
4261          struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
4262          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT));
4263          P_INLINE_DATA(p, count);
4264          P_INLINE_DATA(p, (stride - sizeof(VkDrawIndexedIndirectCommand)) / 4);
4265 
4266          uint64_t range = count * (uint64_t)stride;
4267          nv_push_update_count(p, range / 4);
4268          nvk_cmd_buffer_push_indirect(cmd, draw_addr, range);
4269 
4270          draw_addr += range;
4271          drawCount -= count;
4272       }
4273    }
4274 }
4275 
4276 void
4277 nvk_mme_draw_indirect_count(struct mme_builder *b)
4278 {
4279    if (b->devinfo->cls_eng3d < TURING_A)
4280       return;
4281 
4282    struct mme_value64 draw_addr = mme_load_addr64(b);
4283    struct mme_value64 draw_count_addr = mme_load_addr64(b);
4284    struct mme_value draw_max = mme_load(b);
4285    struct mme_value stride = mme_load(b);
4286 
4287    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4288    mme_free_reg64(b, draw_count_addr);
4289    struct mme_value draw_count_buf = mme_load(b);
4290 
4291    mme_if(b, ule, draw_count_buf, draw_max) {
4292       mme_mov_to(b, draw_max, draw_count_buf);
4293    }
4294    mme_free_reg(b, draw_count_buf);
4295 
4296    struct mme_value draw = mme_mov(b, mme_zero());
4297    mme_while(b, ult, draw, draw_max) {
4298       mme_tu104_read_fifoed(b, draw_addr, mme_imm(4));
4299 
4300       nvk_mme_build_draw(b, draw);
4301 
4302       mme_add_to(b, draw, draw, mme_imm(1));
4303       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4304    }
4305 }
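
/* The draw count is fetched from the count buffer through the MME data FIFO
 * and used to clamp the CPU-provided maximum, effectively
 * draw_max = MIN2(draw_max, *draw_count_addr), after which the loop matches
 * the Turing+ path of nvk_mme_draw_indirect().  The early return for
 * pre-Turing classes matches the assert in nvk_CmdDrawIndirectCount() below:
 * without mme_tu104_read_fifoed() the macro has no way to observe the count
 * at execution time, so indirect-count draws are not yet supported there.
 */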
4306 
4307 VKAPI_ATTR void VKAPI_CALL
4308 nvk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
4309                          VkBuffer _buffer,
4310                          VkDeviceSize offset,
4311                          VkBuffer countBuffer,
4312                          VkDeviceSize countBufferOffset,
4313                          uint32_t maxDrawCount,
4314                          uint32_t stride)
4315 {
4316    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4317    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4318    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4319 
4320    /* TODO: Indirect count draw pre-Turing */
4321    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4322 
4323    nvk_cmd_flush_gfx_state(cmd);
4324 
4325    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4326    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT_COUNT));
4327    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4328    P_INLINE_DATA(p, draw_addr >> 32);
4329    P_INLINE_DATA(p, draw_addr);
4330    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4331                                                  countBufferOffset);
4332    P_INLINE_DATA(p, draw_count_addr >> 32);
4333    P_INLINE_DATA(p, draw_count_addr);
4334    P_INLINE_DATA(p, maxDrawCount);
4335    P_INLINE_DATA(p, stride);
4336 }
4337 
4338 void
4339 nvk_mme_draw_indexed_indirect_count(struct mme_builder *b)
4340 {
4341    if (b->devinfo->cls_eng3d < TURING_A)
4342       return;
4343 
4344    struct mme_value64 draw_addr = mme_load_addr64(b);
4345    struct mme_value64 draw_count_addr = mme_load_addr64(b);
4346    struct mme_value draw_max = mme_load(b);
4347    struct mme_value stride = mme_load(b);
4348 
4349    mme_tu104_read_fifoed(b, draw_count_addr, mme_imm(1));
4350    mme_free_reg64(b, draw_count_addr);
4351    struct mme_value draw_count_buf = mme_load(b);
4352 
4353    mme_if(b, ule, draw_count_buf, draw_max) {
4354       mme_mov_to(b, draw_max, draw_count_buf);
4355    }
4356    mme_free_reg(b, draw_count_buf);
4357 
4358    struct mme_value draw = mme_mov(b, mme_zero());
4359    mme_while(b, ult, draw, draw_max) {
4360       mme_tu104_read_fifoed(b, draw_addr, mme_imm(5));
4361 
4362       nvk_mme_build_draw_indexed(b, draw);
4363 
4364       mme_add_to(b, draw, draw, mme_imm(1));
4365       mme_add64_to(b, draw_addr, draw_addr, mme_value64(stride, mme_zero()));
4366    }
4367 }
4368 
4369 VKAPI_ATTR void VKAPI_CALL
4370 nvk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
4371                                 VkBuffer _buffer,
4372                                 VkDeviceSize offset,
4373                                 VkBuffer countBuffer,
4374                                 VkDeviceSize countBufferOffset,
4375                                 uint32_t maxDrawCount,
4376                                 uint32_t stride)
4377 {
4378    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4379    VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);
4380    VK_FROM_HANDLE(nvk_buffer, count_buffer, countBuffer);
4381 
4382    /* TODO: Indexed indirect count draw pre-Turing */
4383    assert(nvk_cmd_buffer_3d_cls(cmd) >= TURING_A);
4384 
4385    nvk_cmd_flush_gfx_state(cmd);
4386 
4387    struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);
4388    P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT_COUNT));
4389    uint64_t draw_addr = nvk_buffer_address(buffer, offset);
4390    P_INLINE_DATA(p, draw_addr >> 32);
4391    P_INLINE_DATA(p, draw_addr);
4392    uint64_t draw_count_addr = nvk_buffer_address(count_buffer,
4393                                                  countBufferOffset);
4394    P_INLINE_DATA(p, draw_count_addr >> 32);
4395    P_INLINE_DATA(p, draw_count_addr);
4396    P_INLINE_DATA(p, maxDrawCount);
4397    P_INLINE_DATA(p, stride);
4398 }
4399 
4400 static void
4401 nvk_mme_xfb_draw_indirect_loop(struct mme_builder *b,
4402                                struct mme_value instance_count,
4403                                struct mme_value counter)
4404 {
4405    struct mme_value begin = nvk_mme_load_scratch(b, DRAW_BEGIN);
4406 
4407    mme_loop(b, instance_count) {
4408       mme_mthd(b, NV9097_BEGIN);
4409       mme_emit(b, begin);
4410 
4411       mme_mthd(b, NV9097_DRAW_AUTO);
4412       mme_emit(b, counter);
4413 
4414       mme_mthd(b, NV9097_END);
4415       mme_emit(b, mme_zero());
4416 
4417       mme_set_field_enum(b, begin, NV9097_BEGIN_INSTANCE_ID, SUBSEQUENT);
4418    }
4419 
4420    mme_free_reg(b, begin);
4421 }
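
/* Same per-instance BEGIN/END structure as nvk_mme_build_draw_indexed_loop(),
 * except the vertex count comes from NV9097_DRAW_AUTO, which derives it from
 * the transform feedback byte counter using the previously programmed
 * SET_DRAW_AUTO_START and SET_DRAW_AUTO_STRIDE values.
 */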
4422 
4423 void
4424 nvk_mme_xfb_draw_indirect(struct mme_builder *b)
4425 {
4426    struct mme_value instance_count = mme_load(b);
4427    struct mme_value first_instance = mme_load(b);
4428 
4429    if (b->devinfo->cls_eng3d >= TURING_A) {
4430       struct mme_value64 counter_addr = mme_load_addr64(b);
4431       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4432       mme_free_reg(b, counter_addr.lo);
4433       mme_free_reg(b, counter_addr.hi);
4434    }
4435    struct mme_value counter = mme_load(b);
4436 
4437    struct mme_draw_params params = {
4438       .first_instance = first_instance,
4439    };
4440    nvk_mme_build_set_draw_params(b, &params);
4441 
4442    mme_free_reg(b, first_instance);
4443 
4444    struct mme_value view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4445    mme_if(b, ieq, view_mask, mme_zero()) {
4446       mme_free_reg(b, view_mask);
4447 
4448       nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4449    }
4450 
4451    view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4452    mme_if(b, ine, view_mask, mme_zero()) {
4453       mme_free_reg(b, view_mask);
4454 
4455       struct mme_value view = mme_mov(b, mme_zero());
4456       mme_while(b, ine, view, mme_imm(32)) {
4457          view_mask = nvk_mme_load_scratch(b, VIEW_MASK);
4458          struct mme_value has_view = mme_bfe(b, view_mask, view, 1);
4459          mme_free_reg(b, view_mask);
4460          mme_if(b, ine, has_view, mme_zero()) {
4461             mme_free_reg(b, has_view);
4462             nvk_mme_emit_view_index(b, view);
4463             nvk_mme_xfb_draw_indirect_loop(b, instance_count, counter);
4464          }
4465 
4466          mme_add_to(b, view, view, mme_imm(1));
4467       }
4468    }
4469 
4470    mme_free_reg(b, instance_count);
4471    mme_free_reg(b, counter);
4472 }
4473 
4474 VKAPI_ATTR void VKAPI_CALL
4475 nvk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
4476                                 uint32_t instanceCount,
4477                                 uint32_t firstInstance,
4478                                 VkBuffer counterBuffer,
4479                                 VkDeviceSize counterBufferOffset,
4480                                 uint32_t counterOffset,
4481                                 uint32_t vertexStride)
4482 {
4483    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4484    VK_FROM_HANDLE(nvk_buffer, counter_buffer, counterBuffer);
4485 
4486    nvk_cmd_flush_gfx_state(cmd);
4487 
4488    uint64_t counter_addr = nvk_buffer_address(counter_buffer,
4489                                               counterBufferOffset);
4490 
4491    if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4492       struct nv_push *p = nvk_cmd_buffer_push(cmd, 9);
4493       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4494       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4495 
4496       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4497       P_INLINE_DATA(p, instanceCount);
4498       P_INLINE_DATA(p, firstInstance);
4499       P_INLINE_DATA(p, counter_addr >> 32);
4500       P_INLINE_DATA(p, counter_addr);
4501    } else {
4502       struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);
4503       P_IMMD(p, NV9097, SET_DRAW_AUTO_START, counterOffset);
4504       P_IMMD(p, NV9097, SET_DRAW_AUTO_STRIDE, vertexStride);
4505 
4506       P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_DRAW_INDIRECT));
4507       P_INLINE_DATA(p, instanceCount);
4508       P_INLINE_DATA(p, firstInstance);
4509       nv_push_update_count(p, 1);
4510       nvk_cmd_buffer_push_indirect(cmd, counter_addr, 4);
4511    }
4512 }
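
/* SET_DRAW_AUTO_START and SET_DRAW_AUTO_STRIDE tell the hardware how to turn
 * the transform feedback byte count into a vertex count for DRAW_AUTO.  The
 * only difference between the two paths is how the counter reaches the macro:
 * on Turing+ the macro reads the dword at counter_addr itself via the MME
 * data FIFO, while pre-Turing nvk_cmd_buffer_push_indirect() splices the
 * 4-byte counter value into the pushbuf so it arrives as the macro's final
 * mme_load().
 */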
4513 
4514 VKAPI_ATTR void VKAPI_CALL
4515 nvk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
4516                                        uint32_t firstBinding,
4517                                        uint32_t bindingCount,
4518                                        const VkBuffer *pBuffers,
4519                                        const VkDeviceSize *pOffsets,
4520                                        const VkDeviceSize *pSizes)
4521 {
4522    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4523 
4524    for (uint32_t i = 0; i < bindingCount; i++) {
4525       VK_FROM_HANDLE(nvk_buffer, buffer, pBuffers[i]);
4526       uint32_t idx = firstBinding + i;
4527       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
4528       struct nvk_addr_range addr_range =
4529          nvk_buffer_addr_range(buffer, pOffsets[i], size);
4530       assert(addr_range.range <= UINT32_MAX);
4531 
4532       struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
4533 
4534       P_MTHD(p, NV9097, SET_STREAM_OUT_BUFFER_ENABLE(idx));
4535       P_NV9097_SET_STREAM_OUT_BUFFER_ENABLE(p, idx, V_TRUE);
4536       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_A(p, idx, addr_range.addr >> 32);
4537       P_NV9097_SET_STREAM_OUT_BUFFER_ADDRESS_B(p, idx, addr_range.addr);
4538       P_NV9097_SET_STREAM_OUT_BUFFER_SIZE(p, idx, (uint32_t)addr_range.range);
4539    }
4540 
4541    // TODO: do we need to set SET_STREAM_OUT_BUFFER_ENABLE to V_FALSE?
4542 }
4543 
4544 void
4545 nvk_mme_xfb_counter_load(struct mme_builder *b)
4546 {
4547    struct mme_value buffer = mme_load(b);
4548 
4549    struct mme_value counter;
4550    if (b->devinfo->cls_eng3d >= TURING_A) {
4551       struct mme_value64 counter_addr = mme_load_addr64(b);
4552 
4553       mme_tu104_read_fifoed(b, counter_addr, mme_imm(1));
4554       mme_free_reg(b, counter_addr.lo);
4555       mme_free_reg(b, counter_addr.hi);
4556 
4557       counter = mme_load(b);
4558    } else {
4559       counter = mme_load(b);
4560    }
4561 
4562    mme_mthd_arr(b, NV9097_SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(0), buffer);
4563    mme_emit(b, counter);
4564 
4565    mme_free_reg(b, counter);
4566    mme_free_reg(b, buffer);
4567 }
4568 
4569 VKAPI_ATTR void VKAPI_CALL
4570 nvk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4571                                  uint32_t firstCounterBuffer,
4572                                  uint32_t counterBufferCount,
4573                                  const VkBuffer *pCounterBuffers,
4574                                  const VkDeviceSize *pCounterBufferOffsets)
4575 {
4576    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4577    const uint32_t max_buffers = 4;
4578 
4579    struct nv_push *p = nvk_cmd_buffer_push(cmd, 2 + 2 * max_buffers);
4580 
4581    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_TRUE);
4582    for (uint32_t i = 0; i < max_buffers; ++i) {
4583       P_IMMD(p, NV9097, SET_STREAM_OUT_BUFFER_LOAD_WRITE_POINTER(i), 0);
4584    }
4585 
4586    for (uint32_t i = 0; i < counterBufferCount; ++i) {
4587       if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
4588          continue;
4589 
4590       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4591       // index of counter buffer corresponds to index of transform feedback buffer
4592       uint32_t cb_idx = firstCounterBuffer + i;
4593       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4594       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4595 
4596       if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
4597          struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
4598          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4599          /* The STREAM_OUT_BUFFER_LOAD_WRITE_POINTER registers have an 8-dword stride */
4600          P_INLINE_DATA(p, cb_idx * 8);
4601          P_INLINE_DATA(p, cb_addr >> 32);
4602          P_INLINE_DATA(p, cb_addr);
4603       } else {
4604          struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
4605          P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_XFB_COUNTER_LOAD));
4606          P_INLINE_DATA(p, cb_idx);
4607          nv_push_update_count(p, 1);
4608          nvk_cmd_buffer_push_indirect(cmd, cb_addr, 4);
4609       }
4610    }
4611 }
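
/* Beginning transform feedback first zeroes every stream-out write pointer,
 * then, for each counter buffer the application provides, reloads the byte
 * count that nvk_CmdEndTransformFeedbackEXT() saved so that new output
 * resumes where the previous capture stopped.  As with the other indirect
 * paths, Turing+ lets NVK_MME_XFB_COUNTER_LOAD read the counter from memory,
 * while pre-Turing the counter dword is pushed inline after the buffer index.
 */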
4612 
4613 VKAPI_ATTR void VKAPI_CALL
4614 nvk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
4615                                uint32_t firstCounterBuffer,
4616                                uint32_t counterBufferCount,
4617                                const VkBuffer *pCounterBuffers,
4618                                const VkDeviceSize *pCounterBufferOffsets)
4619 {
4620    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4621 
4622    struct nv_push *p = nvk_cmd_buffer_push(cmd, 5 * counterBufferCount + 2);
4623 
4624    P_IMMD(p, NV9097, SET_STREAM_OUTPUT, ENABLE_FALSE);
4625 
4626    for (uint32_t i = 0; i < counterBufferCount; ++i) {
4627       if (pCounterBuffers == NULL || pCounterBuffers[i] == VK_NULL_HANDLE)
4628          continue;
4629 
4630       VK_FROM_HANDLE(nvk_buffer, buffer, pCounterBuffers[i]);
4631       // index of counter buffer corresponds to index of transform feedback buffer
4632       uint32_t cb_idx = firstCounterBuffer + i;
4633       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
4634       uint64_t cb_addr = nvk_buffer_address(buffer, offset);
4635 
4636       P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
4637       P_NV9097_SET_REPORT_SEMAPHORE_A(p, cb_addr >> 32);
4638       P_NV9097_SET_REPORT_SEMAPHORE_B(p, cb_addr);
4639       P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
4640       P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
4641          .operation = OPERATION_REPORT_ONLY,
4642          .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
4643          .report = REPORT_STREAMING_BYTE_COUNT,
4644          .sub_report = cb_idx,
4645          .structure_size = STRUCTURE_SIZE_ONE_WORD,
4646       });
4647    }
4648 }
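
/* Ending transform feedback disables stream output and then, for each
 * provided counter buffer, writes a REPORT_STREAMING_BYTE_COUNT semaphore
 * report (sub_report = cb_idx) as a single-word value at the counter buffer
 * address.  That saved byte count is what nvk_CmdBeginTransformFeedbackEXT()
 * reloads and what nvk_CmdDrawIndirectByteCountEXT() consumes via DRAW_AUTO.
 */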
4649 
4650 VKAPI_ATTR void VKAPI_CALL
4651 nvk_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
4652                                     const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
4653 {
4654    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4655    VK_FROM_HANDLE(nvk_buffer, buffer, pConditionalRenderingBegin->buffer);
4656 
4657    uint64_t addr = nvk_buffer_address(buffer, pConditionalRenderingBegin->offset);
4658    bool inverted = pConditionalRenderingBegin->flags &
4659       VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
4660 
4661    /* From the Vulkan 1.3.280 spec:
4662     *
4663     *    "If the 32-bit value at offset in buffer memory is zero,
4664     *     then the rendering commands are discarded,
4665     *     otherwise they are executed as normal."
4666     *
4667    * The hardware compares a 64-bit value, so we are required to copy it.
4668     */
4669    uint64_t tmp_addr;
4670    VkResult result = nvk_cmd_buffer_cond_render_alloc(cmd, &tmp_addr);
4671    if (result != VK_SUCCESS) {
4672       vk_command_buffer_set_error(&cmd->vk, result);
4673       return;
4674    }
4675 
4676    struct nv_push *p = nvk_cmd_buffer_push(cmd, 26);
4677 
4678    P_MTHD(p, NV90B5, OFFSET_IN_UPPER);
4679    P_NV90B5_OFFSET_IN_UPPER(p, addr >> 32);
4680    P_NV90B5_OFFSET_IN_LOWER(p, addr & 0xffffffff);
4681    P_NV90B5_OFFSET_OUT_UPPER(p, tmp_addr >> 32);
4682    P_NV90B5_OFFSET_OUT_LOWER(p, tmp_addr & 0xffffffff);
4683    P_NV90B5_PITCH_IN(p, 4);
4684    P_NV90B5_PITCH_OUT(p, 4);
4685    P_NV90B5_LINE_LENGTH_IN(p, 4);
4686    P_NV90B5_LINE_COUNT(p, 1);
4687 
4688    P_IMMD(p, NV90B5, SET_REMAP_COMPONENTS, {
4689       .dst_x = DST_X_SRC_X,
4690       .dst_y = DST_Y_SRC_X,
4691       .dst_z = DST_Z_NO_WRITE,
4692       .dst_w = DST_W_NO_WRITE,
4693       .component_size = COMPONENT_SIZE_ONE,
4694       .num_src_components = NUM_SRC_COMPONENTS_ONE,
4695       .num_dst_components = NUM_DST_COMPONENTS_TWO,
4696    });
4697 
4698    P_IMMD(p, NV90B5, LAUNCH_DMA, {
4699       .data_transfer_type = DATA_TRANSFER_TYPE_PIPELINED,
4700       .multi_line_enable = MULTI_LINE_ENABLE_TRUE,
4701       .flush_enable = FLUSH_ENABLE_TRUE,
4702       .src_memory_layout = SRC_MEMORY_LAYOUT_PITCH,
4703       .dst_memory_layout = DST_MEMORY_LAYOUT_PITCH,
4704       .remap_enable = REMAP_ENABLE_TRUE,
4705    });
4706 
4707    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4708    P_NV9097_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4709    P_NV9097_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4710    P_NV9097_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4711 
4712    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4713    P_NV90C0_SET_RENDER_ENABLE_A(p, tmp_addr >> 32);
4714    P_NV90C0_SET_RENDER_ENABLE_B(p, tmp_addr & 0xfffffff0);
4715    P_NV90C0_SET_RENDER_ENABLE_C(p, inverted ? MODE_RENDER_IF_EQUAL : MODE_RENDER_IF_NOT_EQUAL);
4716 }
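
/* The copy-engine remap above takes the single 32-bit source component and
 * writes it to both destination components (dst_x and dst_y both come from
 * src_x, with two one-dword destination components), so the 64-bit word at
 * tmp_addr ends up as the user's value replicated twice.  That word is zero
 * exactly when the 32-bit condition value is zero, which is what the 64-bit
 * SET_RENDER_ENABLE comparison appears to require; the same predicate is
 * programmed on the compute class (NV90C0) so dispatches honour conditional
 * rendering too.
 */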
4717 
4718 VKAPI_ATTR void VKAPI_CALL
4719 nvk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4720 {
4721    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
4722 
4723    struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
4724    P_MTHD(p, NV9097, SET_RENDER_ENABLE_A);
4725    P_NV9097_SET_RENDER_ENABLE_A(p, 0);
4726    P_NV9097_SET_RENDER_ENABLE_B(p, 0);
4727    P_NV9097_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4728 
4729    P_MTHD(p, NV90C0, SET_RENDER_ENABLE_A);
4730    P_NV90C0_SET_RENDER_ENABLE_A(p, 0);
4731    P_NV90C0_SET_RENDER_ENABLE_B(p, 0);
4732    P_NV90C0_SET_RENDER_ENABLE_C(p, MODE_TRUE);
4733 }
4734