1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file iris_state.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * This is the main state upload code.
31  *
32  * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33  * complex, or highly reusable state can be created once, and bound and
34  * rebound multiple times.  This is modeled with the pipe->create_*_state()
35  * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36  * streamed out on the fly, via pipe->set_*_state() hooks.
37  *
38  * OpenGL involves frequently mutating context state, which is mirrored in
39  * core Mesa by highly mutable data structures.  However, most applications
40  * typically draw the same things over and over - from frame to frame, most
41  * of the same objects are still visible and need to be redrawn.  So, rather
42  * than inventing new state all the time, applications usually mutate to swap
43  * between known states that we've seen before.
44  *
45  * Gallium isolates us from this mutation by tracking API state, and
46  * distilling it into a set of Constant State Objects, or CSOs.  Large,
47  * complex, or typically reusable state can be created once, then reused
48  * multiple times.  Drivers can create and store their own associated data.
49  * This create/bind model corresponds to the pipe->create_*_state() and
50  * pipe->bind_*_state() driver hooks.
51  *
52  * Some state is cheap to create, or expected to be highly dynamic.  Rather
53  * than creating and caching piles of CSOs for these, Gallium simply streams
54  * them out, via the pipe->set_*_state() driver hooks.
55  *
56  * To reduce draw time overhead, we try to compute as much state at create
57  * time as possible.  Wherever possible, we translate the Gallium pipe state
58  * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59  * we can simply memcpy them into a batch buffer.
60  *
61  * No hardware matches the abstraction perfectly, so some commands require
62  * information from multiple CSOs.  In this case, we can store two copies
63  * of the packet (one in each CSO), and simply | together their DWords at
64  * draw time.  Sometimes the second set is trivial (one or two fields), so
65  * we simply pack it at draw time.
66  *
67  * There are two main components in the file below.  First, the CSO hooks
68  * create/bind/track state.  The second are the draw-time upload functions,
69  * iris_upload_render_state() and iris_upload_compute_state(), which read
70  * the context state and emit the commands into the actual batch.
71  */
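/* An illustrative sketch of the create-time/draw-time split described above
 * (not actual driver code; helper and field names here are assumed):
 *
 *    // create time: bake the Gallium CSO into a partial 3DSTATE packet
 *    iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
 *       rr.CullMode = translate_cull_mode(state->cull_face);
 *       ...
 *    }
 *
 *    // draw time: OR the partial packets from two CSOs and emit the result
 *    uint32_t dwords[GENX(3DSTATE_RASTER_length)];
 *    for (int i = 0; i < GENX(3DSTATE_RASTER_length); i++)
 *       dwords[i] = cso_a->raster[i] | cso_b->raster[i];
 */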
72 
73 #include <stdio.h>
74 #include <errno.h>
75 
76 #ifdef HAVE_VALGRIND
77 #include <valgrind.h>
78 #include <memcheck.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83 
84 #include "pipe/p_defines.h"
85 #include "pipe/p_state.h"
86 #include "pipe/p_context.h"
87 #include "pipe/p_screen.h"
88 #include "util/u_dual_blend.h"
89 #include "util/u_inlines.h"
90 #include "util/format/u_format.h"
91 #include "util/u_framebuffer.h"
92 #include "util/u_transfer.h"
93 #include "util/u_upload_mgr.h"
94 #include "util/u_viewport.h"
95 #include "util/u_memory.h"
96 #include "util/u_trace_gallium.h"
97 #include "nir.h"
98 #include "intel/common/intel_aux_map.h"
99 #include "intel/common/intel_compute_slm.h"
100 #include "intel/common/intel_l3_config.h"
101 #include "intel/common/intel_sample_positions.h"
102 #include "intel/ds/intel_tracepoints.h"
103 #include "iris_batch.h"
104 #include "iris_context.h"
105 #include "iris_defines.h"
106 #include "iris_pipe.h"
107 #include "iris_resource.h"
108 #include "iris_utrace.h"
109 
110 #include "iris_genx_macros.h"
111 
112 #if GFX_VER >= 9
113 #include "intel/compiler/brw_compiler.h"
114 #include "intel/common/intel_genX_state_brw.h"
115 #else
116 #include "intel/compiler/elk/elk_compiler.h"
117 #include "intel/common/intel_genX_state_elk.h"
118 #endif
119 
120 #include "intel/common/intel_guardband.h"
121 #include "intel/common/intel_pixel_hash.h"
122 #include "intel/common/intel_tiled_render.h"
123 
124 /**
125  * Statically assert that PIPE_* enums match the hardware packets.
126  * (As long as they match, we don't need to translate them.)
127  */
128 UNUSED static void pipe_asserts()
129 {
130 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
131 
132    /* pipe_logicop happens to match the hardware. */
133    PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
134    PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
135    PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
136    PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
137    PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
138    PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
139    PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
140    PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
141    PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
142    PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
143    PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
144    PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
145    PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
146    PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
147    PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
148    PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
149 
150    /* pipe_blendfactor happens to match the hardware. */
151    PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
152    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
153    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
154    PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
155    PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
156    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
157    PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
158    PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
159    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
160    PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
161    PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
162    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
163    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
164    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
165    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
166    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
167    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
168    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
169    PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
170 
171    /* pipe_blend_func happens to match the hardware. */
172    PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
173    PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
174    PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
175    PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
176    PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
177 
178    /* pipe_stencil_op happens to match the hardware. */
179    PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
180    PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
181    PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
182    PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
183    PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
184    PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
185    PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
186    PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
187 
188    /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
189    PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
190    PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
191 #undef PIPE_ASSERT
192 }
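/* Because of the identities asserted above, later code can copy many pipe_*
 * enum values straight into hardware packet fields with no lookup table --
 * e.g. a blend state's rt[0].rgb_func value can land directly in a
 * ColorBlendFunction field.  (The field name here is illustrative.)
 */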
193 
194 static unsigned
195 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
196 {
197    static const unsigned map[] = {
198       [MESA_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
199       [MESA_PRIM_LINES]                    = _3DPRIM_LINELIST,
200       [MESA_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
201       [MESA_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
202       [MESA_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
203       [MESA_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
204       [MESA_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
205       [MESA_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
206       [MESA_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
207       [MESA_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
208       [MESA_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
209       [MESA_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
210       [MESA_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
211       [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
212       [MESA_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
213    };
214 
215    return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
216 }
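/* Worked example: the table stores _3DPRIM_PATCHLIST_1 minus one for
 * patches, so translate_prim_type(MESA_PRIM_PATCHES, 4) yields
 * _3DPRIM_PATCHLIST_4 -- assuming the _3DPRIM_PATCHLIST_n values are
 * consecutive, which is what this trick relies on.
 */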
217 
218 static unsigned
219 translate_compare_func(enum pipe_compare_func pipe_func)
220 {
221    static const unsigned map[] = {
222       [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
223       [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
224       [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
225       [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
226       [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
227       [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
228       [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
229       [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
230    };
231    return map[pipe_func];
232 }
233 
234 static unsigned
235 translate_shadow_func(enum pipe_compare_func pipe_func)
236 {
237    /* Gallium specifies the result of shadow comparisons as:
238     *
239     *    1 if ref <op> texel,
240     *    0 otherwise.
241     *
242     * The hardware does:
243     *
244     *    0 if texel <op> ref,
245     *    1 otherwise.
246     *
247     * So we need to flip the operator and also negate.
248     */
249    static const unsigned map[] = {
250       [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
251       [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
252       [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
253       [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
254       [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
255       [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
256       [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
257       [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
258    };
259    return map[pipe_func];
260 }
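/* Worked example: PIPE_FUNC_LESS means "pass (1) when ref < texel".  The
 * mapped PREFILTEROP_LEQUAL makes the hardware compute texel <= ref and
 * return 0 when true, i.e. 1 when texel > ref -- the same condition as
 * ref < texel, so Gallium's semantics are preserved.
 */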
261 
262 static unsigned
263 translate_cull_mode(unsigned pipe_face)
264 {
265    static const unsigned map[4] = {
266       [PIPE_FACE_NONE]           = CULLMODE_NONE,
267       [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
268       [PIPE_FACE_BACK]           = CULLMODE_BACK,
269       [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
270    };
271    return map[pipe_face];
272 }
273 
274 static unsigned
275 translate_fill_mode(unsigned pipe_polymode)
276 {
277    static const unsigned map[4] = {
278       [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
279       [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
280       [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
281       [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282    };
283    return map[pipe_polymode];
284 }
285 
286 static unsigned
287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289    static const unsigned map[] = {
290       [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291       [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
292       [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
293    };
294    return map[pipe_mip];
295 }
296 
297 static uint32_t
298 translate_wrap(unsigned pipe_wrap)
299 {
300    static const unsigned map[] = {
301       [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
302       [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
303       [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
304       [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
305       [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
306       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
307 
308       /* These are unsupported. */
309       [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
310       [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
311    };
312    return map[pipe_wrap];
313 }
314 
315 /**
316  * Allocate space for some indirect state.
317  *
318  * Return a pointer to the map (to fill it out) and a state ref (for
319  * referring to the state in GPU commands).
320  */
321 static void *
322 upload_state(struct u_upload_mgr *uploader,
323              struct iris_state_ref *ref,
324              unsigned size,
325              unsigned alignment)
326 {
327    void *p = NULL;
328    u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
329    return p;
330 }
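/* Typical (illustrative) call, assuming dynamic state that wants 64B
 * alignment:
 *
 *    struct iris_state_ref ref;
 *    uint32_t *map = upload_state(ice->state.dynamic_uploader, &ref,
 *                                 4 * GENX(BLEND_STATE_length), 64);
 *    if (map) {
 *       // ...fill out the state...
 *    }
 *
 * ref.res and ref.offset can then be used to point GPU commands at it.
 */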
331 
332 /**
333  * Stream out temporary/short-lived state.
334  *
335  * This allocates space, pins the BO, and includes the BO address in the
336  * returned offset (which works because all state lives in 32-bit memory
337  * zones).
338  */
339 static uint32_t *
340 stream_state(struct iris_batch *batch,
341              struct u_upload_mgr *uploader,
342              struct pipe_resource **out_res,
343              unsigned size,
344              unsigned alignment,
345              uint32_t *out_offset)
346 {
347    void *ptr = NULL;
348 
349    u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
350 
351    struct iris_bo *bo = iris_resource_bo(*out_res);
352    iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
353 
354    iris_record_state_size(batch->state_sizes,
355                           bo->address + *out_offset, size);
356 
357    *out_offset += iris_bo_offset_from_base_address(bo);
358 
359    return ptr;
360 }
361 
362 /**
363  * stream_state() + memcpy.
364  */
365 static uint32_t
366 emit_state(struct iris_batch *batch,
367            struct u_upload_mgr *uploader,
368            struct pipe_resource **out_res,
369            const void *data,
370            unsigned size,
371            unsigned alignment)
372 {
373    unsigned offset = 0;
374    uint32_t *map =
375       stream_state(batch, uploader, out_res, size, alignment, &offset);
376 
377    if (map)
378       memcpy(map, data, size);
379 
380    return offset;
381 }
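/* Illustrative use: stream a block of pre-packed DWords and get back an
 * offset whose BO address is already folded in by stream_state():
 *
 *    uint32_t offset = emit_state(batch, ice->state.dynamic_uploader,
 *                                 &resource, cc_dwords, sizeof(cc_dwords), 64);
 *
 * ('resource' and 'cc_dwords' are hypothetical; see the real uses later in
 * this file.)
 */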
382 
383 /**
384  * Did field 'x' change between 'old_cso' and 'new_cso'?
385  *
386  * (If so, we may want to set some dirty flags.)
387  */
388 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
389 #define cso_changed_memcmp(x) \
390    (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
391 #define cso_changed_memcmp_elts(x, n) \
392    (!old_cso || memcmp(old_cso->x, new_cso->x, n * sizeof(old_cso->x[0])) != 0)
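/* Illustrative use in a bind_*_state() hook (field and dirty-flag names are
 * assumed):
 *
 *    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
 *    struct iris_rasterizer_state *new_cso = state;
 *
 *    if (new_cso && cso_changed(line_width))
 *       ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
 *
 * The "!old_cso ||" guard means binding state for the first time always
 * counts as a change.
 */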
393 
394 static void
395 flush_before_state_base_change(struct iris_batch *batch)
396 {
397    /* Wa_14014427904 - We need additional invalidate/flush when
398     * emitting NP state commands with ATS-M in compute mode.
399     */
400    bool atsm_compute = intel_device_info_is_atsm(batch->screen->devinfo) &&
401                        batch->name == IRIS_BATCH_COMPUTE;
402    uint32_t np_state_wa_bits =
403       PIPE_CONTROL_CS_STALL |
404       PIPE_CONTROL_STATE_CACHE_INVALIDATE |
405       PIPE_CONTROL_CONST_CACHE_INVALIDATE |
406       PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
407       PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
408       PIPE_CONTROL_INSTRUCTION_INVALIDATE |
409       PIPE_CONTROL_FLUSH_HDC;
410 
411    /* Flush before emitting STATE_BASE_ADDRESS.
412     *
413     * This isn't documented anywhere in the PRM.  However, it seems to be
414     * necessary prior to changing the surface state base address.  We've
415     * seen issues in Vulkan where we get GPU hangs when using multi-level
416     * command buffers which clear depth, reset state base address, and then
417     * go render stuff.
418     *
419     * Normally, in GL, we would trust the kernel to do sufficient stalls
420     * and flushes prior to executing our batch.  However, it doesn't seem
421     * as if the kernel's flushing is always sufficient and we don't want to
422     * rely on it.
423     *
424     * We make this an end-of-pipe sync instead of a normal flush because we
425     * do not know the current status of the GPU.  On Haswell at least,
426     * having a fast-clear operation in flight at the same time as a normal
427     * rendering operation can cause hangs.  Since the kernel's flushing is
428     * insufficient, we need to ensure that any rendering operations from
429     * other processes are definitely complete before we try to do our own
430     * rendering.  It's a bit of a big hammer but it appears to work.
431     *
432     * Render target cache flush before SBA is required by Wa_18039438632.
433     */
434    iris_emit_end_of_pipe_sync(batch,
435                               "change STATE_BASE_ADDRESS (flushes)",
436                               (atsm_compute ? np_state_wa_bits : 0) |
437                               PIPE_CONTROL_RENDER_TARGET_FLUSH |
438                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
439                               PIPE_CONTROL_DATA_CACHE_FLUSH);
440 }
441 
442 static void
443 flush_after_state_base_change(struct iris_batch *batch)
444 {
445    const struct intel_device_info *devinfo = batch->screen->devinfo;
446    /* After re-setting the surface state base address, we have to do some
447     * cache flushing so that the sampler engine will pick up the new
448     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
449     * Shared Function > 3D Sampler > State > State Caching (page 96):
450     *
451     *    Coherency with system memory in the state cache, like the texture
452     *    cache is handled partially by software. It is expected that the
453     *    command stream or shader will issue Cache Flush operation or
454     *    Cache_Flush sampler message to ensure that the L1 cache remains
455     *    coherent with system memory.
456     *
457     *    [...]
458     *
459     *    Whenever the value of the Dynamic_State_Base_Addr,
460     *    Surface_State_Base_Addr are altered, the L1 state cache must be
461     *    invalidated to ensure the new surface or sampler state is fetched
462     *    from system memory.
463     *
464     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
465     * which, according to the PIPE_CONTROL instruction documentation in the
466     * Broadwell PRM:
467     *
468     *    Setting this bit is independent of any other bit in this packet.
469     *    This bit controls the invalidation of the L1 and L2 state caches
470     *    at the top of the pipe i.e. at the parsing time.
471     *
472     * Unfortunately, experimentation seems to indicate that state cache
473     * invalidation through a PIPE_CONTROL does nothing whatsoever in
474     * regards to surface state and binding tables.  In stead, it seems that
475     * invalidating the texture cache is what is actually needed.
476     *
477     * XXX:  As far as we have been able to determine through
478     * experimentation, shows that flush the texture cache appears to be
479     * sufficient.  The theory here is that all of the sampling/rendering
480     * units cache the binding table in the texture cache.  However, we have
481     * yet to be able to actually confirm this.
482     *
483     * Wa_16013000631:
484     *
485     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
486     *   or program pipe control with Instruction cache invalidate post
487     *   STATE_BASE_ADDRESS command"
488     */
489    iris_emit_end_of_pipe_sync(batch,
490                               "change STATE_BASE_ADDRESS (invalidates)",
491                               PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
492                               PIPE_CONTROL_CONST_CACHE_INVALIDATE |
493                               PIPE_CONTROL_STATE_CACHE_INVALIDATE |
494                               (intel_needs_workaround(devinfo, 16013000631) ?
495                                PIPE_CONTROL_INSTRUCTION_INVALIDATE : 0));
496 }
497 
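/* The register load/store helpers below all follow the same mi_builder
 * pattern: wrap the source and destination in mi_value handles (mi_reg32/64
 * for MMIO registers, mi_mem32/64 for buffer locations, mi_imm for
 * constants) and let mi_store() emit the appropriate MI_LOAD_REGISTER_* /
 * MI_STORE_REGISTER_MEM / MI_STORE_DATA_IMM command.
 */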
498 static void
499 iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,
500                          uint32_t src)
501 {
502    struct mi_builder b;
503    mi_builder_init(&b, batch->screen->devinfo, batch);
504    mi_store(&b, mi_reg32(dst), mi_reg32(src));
505 }
506 
507 static void
508 iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,
509                          uint32_t src)
510 {
511    struct mi_builder b;
512    mi_builder_init(&b, batch->screen->devinfo, batch);
513    mi_store(&b, mi_reg64(dst), mi_reg64(src));
514 }
515 
516 static void
517 iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,
518                          uint32_t val)
519 {
520    struct mi_builder b;
521    mi_builder_init(&b, batch->screen->devinfo, batch);
522    mi_store(&b, mi_reg32(reg), mi_imm(val));
523 }
524 
525 static void
526 iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
527                          uint64_t val)
528 {
529    struct mi_builder b;
530    mi_builder_init(&b, batch->screen->devinfo, batch);
531    mi_store(&b, mi_reg64(reg), mi_imm(val));
532 }
533 
534 /**
535  * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
536  */
537 static void
538 iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
539                          struct iris_bo *bo, uint32_t offset)
540 {
541    iris_batch_sync_region_start(batch);
542    struct mi_builder b;
543    mi_builder_init(&b, batch->screen->devinfo, batch);
544    struct mi_value src = mi_mem32(ro_bo(bo, offset));
545    mi_store(&b, mi_reg32(reg), src);
546    iris_batch_sync_region_end(batch);
547 }
548 
549 /**
550  * Load a 64-bit value from a buffer into a MMIO register via
551  * two MI_LOAD_REGISTER_MEM commands.
552  */
553 static void
554 iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
555                          struct iris_bo *bo, uint32_t offset)
556 {
557    iris_batch_sync_region_start(batch);
558    struct mi_builder b;
559    mi_builder_init(&b, batch->screen->devinfo, batch);
560    struct mi_value src = mi_mem64(ro_bo(bo, offset));
561    mi_store(&b, mi_reg64(reg), src);
562    iris_batch_sync_region_end(batch);
563 }
564 
565 static void
566 iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
567                           struct iris_bo *bo, uint32_t offset,
568                           bool predicated)
569 {
570    iris_batch_sync_region_start(batch);
571    struct mi_builder b;
572    mi_builder_init(&b, batch->screen->devinfo, batch);
573    struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
574    struct mi_value src = mi_reg32(reg);
575    if (predicated)
576       mi_store_if(&b, dst, src);
577    else
578       mi_store(&b, dst, src);
579    iris_batch_sync_region_end(batch);
580 }
581 
582 static void
583 iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
584                           struct iris_bo *bo, uint32_t offset,
585                           bool predicated)
586 {
587    iris_batch_sync_region_start(batch);
588    struct mi_builder b;
589    mi_builder_init(&b, batch->screen->devinfo, batch);
590    struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
591    struct mi_value src = mi_reg64(reg);
592    if (predicated)
593       mi_store_if(&b, dst, src);
594    else
595       mi_store(&b, dst, src);
596    iris_batch_sync_region_end(batch);
597 }
598 
599 static void
600 iris_store_data_imm32(struct iris_batch *batch,
601                       struct iris_bo *bo, uint32_t offset,
602                       uint32_t imm)
603 {
604    iris_batch_sync_region_start(batch);
605    struct mi_builder b;
606    mi_builder_init(&b, batch->screen->devinfo, batch);
607    struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
608    struct mi_value src = mi_imm(imm);
609    mi_store(&b, dst, src);
610    iris_batch_sync_region_end(batch);
611 }
612 
613 static void
614 iris_store_data_imm64(struct iris_batch *batch,
615                       struct iris_bo *bo, uint32_t offset,
616                       uint64_t imm)
617 {
618    iris_batch_sync_region_start(batch);
619    struct mi_builder b;
620    mi_builder_init(&b, batch->screen->devinfo, batch);
621    struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
622    struct mi_value src = mi_imm(imm);
623    mi_store(&b, dst, src);
624    iris_batch_sync_region_end(batch);
625 }
626 
627 static void
628 iris_copy_mem_mem(struct iris_batch *batch,
629                   struct iris_bo *dst_bo, uint32_t dst_offset,
630                   struct iris_bo *src_bo, uint32_t src_offset,
631                   unsigned bytes)
632 {
633    /* MI_COPY_MEM_MEM operates on DWords. */
634    assert(bytes % 4 == 0);
635    assert(dst_offset % 4 == 0);
636    assert(src_offset % 4 == 0);
637    iris_batch_sync_region_start(batch);
638 
639    for (unsigned i = 0; i < bytes; i += 4) {
640       iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
641          cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
642                                              IRIS_DOMAIN_OTHER_WRITE);
643          cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
644       }
645    }
646 
647    iris_batch_sync_region_end(batch);
648 }
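/* For example, copying an 8-byte query result between BOs with
 * iris_copy_mem_mem(batch, dst_bo, 0, src_bo, 0, 8) emits two
 * MI_COPY_MEM_MEM packets, one per DWord.
 */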
649 
650 static void
651 iris_rewrite_compute_walker_pc(struct iris_batch *batch,
652                                uint32_t *walker,
653                                struct iris_bo *bo,
654                                uint32_t offset)
655 {
656 #if GFX_VERx10 >= 125
657    struct iris_screen *screen = batch->screen;
658    struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
659 
660    uint32_t dwords[GENX(COMPUTE_WALKER_length)];
661 
662    _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
663       cw.body.PostSync.Operation = WriteTimestamp;
664       cw.body.PostSync.DestinationAddress = addr;
665       cw.body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
666    }
667 
668    for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
669       walker[i] |= dwords[i];
670 #else
671    unreachable("Unsupported");
672 #endif
673 }
674 
675 static void
676 emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
677 {
678    /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
679 #if GFX_VER < 20
680 
681 #if GFX_VER >= 8 && GFX_VER < 10
682    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
683     *
684     *   Software must clear the COLOR_CALC_STATE Valid field in
685     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
686     *   with Pipeline Select set to GPGPU.
687     *
688     * The internal hardware docs recommend the same workaround for Gfx9
689     * hardware too.
690     */
691    if (pipeline == GPGPU)
692       iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
693 #endif
694 
695 #if GFX_VER >= 12
696    /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
697     *
698     *   "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
699     *   are flushed through a stalling PIPE_CONTROL command prior to
700     *   programming of PIPELINE_SELECT command transitioning Pipeline Select
701     *   from 3D to GPGPU/Media.
702     *   Software must ensure HDC Pipeline flush and Generic Media State Clear
703     *   is issued through a stalling PIPE_CONTROL command prior to programming
704     *   of PIPELINE_SELECT command transitioning Pipeline Select from
705     *   GPGPU/Media to 3D."
706     *
707     * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
708     * because PIPE was not in MEDIA mode?!
709     */
710    enum pipe_control_flags flags = PIPE_CONTROL_CS_STALL |
711                                    PIPE_CONTROL_FLUSH_HDC;
712 
713    if (pipeline == GPGPU && batch->name == IRIS_BATCH_RENDER) {
714       flags |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
715                PIPE_CONTROL_DEPTH_CACHE_FLUSH;
716    } else {
717       flags |= PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH;
718    }
719    /* Wa_16013063087 -  State Cache Invalidate must be issued prior to
720     * PIPELINE_SELECT when switching from 3D to Compute.
721     *
722     * SW must do this by programming of PIPECONTROL with “CS Stall” followed
723     * by a PIPECONTROL with State Cache Invalidate bit set.
724     */
725    if (pipeline == GPGPU &&
726        intel_needs_workaround(batch->screen->devinfo, 16013063087))
727       flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
728 
729    iris_emit_pipe_control_flush(batch, "PIPELINE_SELECT flush", flags);
730 #else
731    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
732     * PIPELINE_SELECT [DevBWR+]":
733     *
734     *    "Project: DEVSNB+
735     *
736     *     Software must ensure all the write caches are flushed through a
737     *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
738     *     command to invalidate read only caches prior to programming
739     *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
740     */
741     iris_emit_pipe_control_flush(batch,
742                                  "workaround: PIPELINE_SELECT flushes (1/2)",
743                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
744                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
745                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
746                                  PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
747                                  PIPE_CONTROL_CS_STALL);
748 
749     iris_emit_pipe_control_flush(batch,
750                                  "workaround: PIPELINE_SELECT flushes (2/2)",
751                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
752                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
753                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
754                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
755 #endif
756 
757    iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
758 #if GFX_VER >= 9
759       sel.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
760 #if GFX_VER == 12
761       sel.MediaSamplerDOPClockGateEnable = true;
762 #endif /* if GFX_VER == 12 */
763 #endif /* if GFX_VER >= 9 */
764       sel.PipelineSelection = pipeline;
765    }
766 #endif /* if GFX_VER < 20 */
767 }
768 
769 UNUSED static void
770 init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
771 {
772 #if GFX_VER == 9
773    /* Project: DevGLK
774     *
775     *    "This chicken bit works around a hardware issue with barrier
776     *     logic encountered when switching between GPGPU and 3D pipelines.
777     *     To workaround the issue, this mode bit should be set after a
778     *     pipeline is selected."
779     */
780    iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
781       reg.GLKBarrierMode = value;
782       reg.GLKBarrierModeMask = 1;
783    }
784 #endif
785 }
786 
787 static void
788 init_state_base_address(struct iris_batch *batch)
789 {
790    struct isl_device *isl_dev = &batch->screen->isl_dev;
791    uint32_t mocs = isl_mocs(isl_dev, 0, false);
792    flush_before_state_base_change(batch);
793 
794    /* We program most base addresses once at context initialization time.
795     * Each base address points at a 4GB memory zone, and never needs to
796     * change.  See iris_bufmgr.h for a description of the memory zones.
797     *
798     * The one exception is Surface State Base Address, which needs to be
799     * updated occasionally.  See iris_binder.c for the details there.
800     */
801    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
802       sba.GeneralStateMOCS            = mocs;
803       sba.StatelessDataPortAccessMOCS = mocs;
804       sba.DynamicStateMOCS            = mocs;
805       sba.IndirectObjectMOCS          = mocs;
806       sba.InstructionMOCS             = mocs;
807       sba.SurfaceStateMOCS            = mocs;
808 #if GFX_VER >= 9
809       sba.BindlessSurfaceStateMOCS    = mocs;
810 #endif
811 
812       sba.GeneralStateBaseAddressModifyEnable   = true;
813       sba.DynamicStateBaseAddressModifyEnable   = true;
814       sba.IndirectObjectBaseAddressModifyEnable = true;
815       sba.InstructionBaseAddressModifyEnable    = true;
816       sba.GeneralStateBufferSizeModifyEnable    = true;
817       sba.DynamicStateBufferSizeModifyEnable    = true;
818       sba.SurfaceStateBaseAddressModifyEnable   = true;
819 #if GFX_VER >= 11
820       sba.BindlessSamplerStateMOCS    = mocs;
821 #endif
822       sba.IndirectObjectBufferSizeModifyEnable  = true;
823       sba.InstructionBuffersizeModifyEnable     = true;
824 
825       sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
826       sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
827       sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDER_START);
828 
829       sba.GeneralStateBufferSize   = 0xfffff;
830       sba.IndirectObjectBufferSize = 0xfffff;
831       sba.InstructionBufferSize    = 0xfffff;
832       sba.DynamicStateBufferSize   = 0xfffff;
833 #if GFX_VERx10 >= 125
834       sba.L1CacheControl = L1CC_WB;
835 #endif
836    }
837 
838    flush_after_state_base_change(batch);
839 }
840 
841 static void
842 iris_emit_l3_config(struct iris_batch *batch,
843                     const struct intel_l3_config *cfg)
844 {
845 #if GFX_VER < 20
846    assert(cfg || GFX_VER >= 12);
847 
848 #if GFX_VER >= 12
849 #define L3_ALLOCATION_REG GENX(L3ALLOC)
850 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
851 #else
852 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
853 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
854 #endif
855 
856    iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
857 #if GFX_VER < 11
858       reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
859 #endif
860 #if GFX_VER == 11
861       /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
862        * in L3CNTLREG register. The default setting of the bit is not the
863        * desirable behavior.
864        */
865       reg.ErrorDetectionBehaviorControl = true;
866       reg.UseFullWays = true;
867 #endif
868       if (GFX_VER < 12 || (cfg && cfg->n[INTEL_L3P_ALL] <= 126)) {
869          reg.URBAllocation = cfg->n[INTEL_L3P_URB];
870          reg.ROAllocation = cfg->n[INTEL_L3P_RO];
871          reg.DCAllocation = cfg->n[INTEL_L3P_DC];
872          reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
873       } else {
874          assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
875                           cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
876                           cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
877                           cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
878 #if GFX_VER >= 12
879          reg.L3FullWayAllocationEnable = true;
880 #endif
881       }
882    }
883 #endif /* GFX_VER < 20 */
884 }
885 
886 void
887 genX(emit_urb_config)(struct iris_batch *batch,
888                       bool has_tess_eval,
889                       bool has_geometry)
890 {
891    struct iris_screen *screen = batch->screen;
892    struct iris_context *ice = batch->ice;
893 
894    intel_get_urb_config(screen->devinfo,
895                         screen->l3_config_3d,
896                         has_tess_eval,
897                         has_geometry,
898                         &ice->shaders.urb.cfg,
899                         &ice->state.urb_deref_block_size,
900                         &ice->shaders.urb.constrained);
901 
902    genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
903 
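   /* 3DSTATE_URB_VS/HS/DS/GS (and the Gfx12+ _ALLOC_ variants) use
    * consecutive command sub-opcodes, so emitting the VS form and adding the
    * stage index to _3DCommandSubOpcode below retargets the packet at
    * stage 'i'.
    */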
904    for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
905 #if GFX_VER >= 12
906       iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
907          urb._3DCommandSubOpcode           += i;
908          urb.VSURBEntryAllocationSize       = ice->shaders.urb.cfg.size[i] - 1;
909          urb.VSURBStartingAddressSlice0     = ice->shaders.urb.cfg.start[i];
910          urb.VSURBStartingAddressSliceN     = ice->shaders.urb.cfg.start[i];
911          urb.VSNumberofURBEntriesSlice0     = ice->shaders.urb.cfg.entries[i];
912          urb.VSNumberofURBEntriesSliceN     = ice->shaders.urb.cfg.entries[i];
913       }
914 #else
915       iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
916          urb._3DCommandSubOpcode += i;
917          urb.VSURBStartingAddress     = ice->shaders.urb.cfg.start[i];
918          urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
919          urb.VSNumberofURBEntries     = ice->shaders.urb.cfg.entries[i];
920       }
921 #endif
922    }
923 }
924 
925 #if GFX_VER == 9
926 static void
927 iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
928 {
929    /* A fixed function pipe flush is required before modifying this field */
930    iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"
931                                             : "disable preemption",
932                               PIPE_CONTROL_RENDER_TARGET_FLUSH);
933 
934    /* enable object level preemption */
935    iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
936       reg.ReplayMode = enable;
937       reg.ReplayModeMask = true;
938    }
939 }
940 #endif
941 
942 static void
943 upload_pixel_hashing_tables(struct iris_batch *batch)
944 {
945    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
946    UNUSED struct iris_context *ice = batch->ice;
947    assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
948 
949 #if GFX_VER == 11
950    /* Gfx11 hardware has two pixel pipes at most. */
951    for (unsigned i = 2; i < ARRAY_SIZE(devinfo->ppipe_subslices); i++)
952       assert(devinfo->ppipe_subslices[i] == 0);
953 
954    if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
955       return;
956 
957    unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
958    uint32_t hash_address;
959    struct pipe_resource *tmp = NULL;
960    uint32_t *map =
961       stream_state(batch, ice->state.dynamic_uploader, &tmp,
962                    size, 64, &hash_address);
963    pipe_resource_reference(&tmp, NULL);
964 
965    const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
966    struct GENX(SLICE_HASH_TABLE) table;
967    intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
968 
969    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
970 
971    iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
972       ptr.SliceHashStatePointerValid = true;
973       ptr.SliceHashTableStatePointer = hash_address;
974    }
975 
976    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
977       mode.SliceHashingTableEnable = true;
978    }
979 
980 #elif GFX_VERx10 == 120
981    /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
982     * present with n active dual subslices.
983     */
984    unsigned ppipes_of[3] = {};
985 
986    for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
987       for (unsigned p = 0; p < 3; p++)
988          ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
989    }
990 
991    /* Gfx12 has three pixel pipes. */
992    for (unsigned p = 3; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
993       assert(devinfo->ppipe_subslices[p] == 0);
994 
995    if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
996       /* All three pixel pipes have the maximum number of active dual
997        * subslices, or there is only one active pixel pipe: Nothing to do.
998        */
999       return;
1000    }
1001 
1002    iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
1003       p.SliceHashControl[0] = TABLE_0;
1004 
1005       if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1006          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
1007       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1008          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
1009 
1010       if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
1011          intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
1012       else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1013          intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
1014       else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1015          intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
1016       else
1017          unreachable("Illegal fusing.");
1018    }
1019 
1020    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1021       p.SubsliceHashingTableEnable = true;
1022       p.SubsliceHashingTableEnableMask = true;
1023    }
1024 
1025 #elif GFX_VERx10 == 125
1026    struct pipe_screen *pscreen = &batch->screen->base;
1027    const unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
1028    const struct pipe_resource tmpl = {
1029      .target = PIPE_BUFFER,
1030      .format = PIPE_FORMAT_R8_UNORM,
1031      .bind = PIPE_BIND_CUSTOM,
1032      .usage = PIPE_USAGE_IMMUTABLE,
1033      .flags = IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE,
1034      .width0 = size,
1035      .height0 = 1,
1036      .depth0 = 1,
1037      .array_size = 1
1038    };
1039 
1040    pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
1041    ice->state.pixel_hashing_tables = pscreen->resource_create(pscreen, &tmpl);
1042 
1043    struct iris_resource *res = (struct iris_resource *)ice->state.pixel_hashing_tables;
1044    struct pipe_transfer *transfer = NULL;
1045    uint32_t *map = pipe_buffer_map_range(&ice->ctx, ice->state.pixel_hashing_tables,
1046                                          0, size, PIPE_MAP_WRITE,
1047                                          &transfer);
1048 
1049    /* Calculate the set of present pixel pipes, and another set of
1050     * present pixel pipes with 2 dual subslices enabled, the latter
1051     * will appear on the hashing table with twice the frequency of
1052     * pixel pipes with a single dual subslice present.
1053     */
1054    uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
1055    for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
1056       if (devinfo->ppipe_subslices[p])
1057          ppipe_mask1 |= (1u << p);
1058       if (devinfo->ppipe_subslices[p] > 1)
1059          ppipe_mask2 |= (1u << p);
1060    }
1061    assert(ppipe_mask1);
1062 
1063    struct GENX(SLICE_HASH_TABLE) table;
1064 
1065    /* Note that the hardware expects an array with 7 tables, each
1066     * table is intended to specify the pixel pipe hashing behavior for
1067     * every possible slice count between 2 and 8, however that doesn't
1068     * actually work, among other reasons due to hardware bugs that
1069     * will cause the GPU to erroneously access the table at the wrong
1070     * index in some cases, so in practice all 7 tables need to be
1071     * initialized to the same value.
1072     */
1073    for (unsigned i = 0; i < 7; i++)
1074       intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
1075                                           table.Entry[i][0]);
1076 
1077    GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
1078 
1079    pipe_buffer_unmap(&ice->ctx, transfer);
1080 
1081    iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_NONE);
1082    iris_record_state_size(batch->state_sizes, res->bo->address + res->offset, size);
1083 
1084    iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
1085       ptr.SliceHashStatePointerValid = true;
1086       ptr.SliceHashTableStatePointer = iris_bo_offset_from_base_address(res->bo) +
1087                                        res->offset;
1088    }
1089 
1090    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
1091       mode.SliceHashingTableEnable = true;
1092       mode.SliceHashingTableEnableMask = true;
1093       mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
1094                                     hashing32x32 : NormalMode);
1095       mode.CrossSliceHashingModeMask = -1;
1096    }
1097 #endif
1098 }
1099 
1100 static void
1101 iris_alloc_push_constants(struct iris_batch *batch)
1102 {
1103    const struct intel_device_info *devinfo = batch->screen->devinfo;
1104 
1105    /* For now, we set a static partitioning of the push constant area,
1106     * assuming that all stages could be in use.
1107     *
1108     * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1109     *       see if that improves performance by offering more space to
1110     *       the VS/FS when those aren't in use.  Also, try dynamically
1111     *       enabling/disabling it like i965 does.  This would be more
1112     *       stalls and may not actually help; we don't know yet.
1113     */
1114 
1115    /* Divide as equally as possible with any remainder given to FRAGMENT. */
1116    const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
1117    const unsigned stage_size = push_constant_kb / 5;
1118    const unsigned frag_size = push_constant_kb - 4 * stage_size;
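   /* For instance, with push_constant_kb == 32, each of VS/HS/DS/GS gets
    * stage_size = 6 KB and the fragment shader gets the remaining
    * frag_size = 8 KB.  (Arithmetic illustration only; the real size comes
    * from devinfo above.)
    */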
1119 
1120    for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1121       iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1122          alloc._3DCommandSubOpcode = 18 + i;
1123          alloc.ConstantBufferOffset = stage_size * i;
1124          alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
1125       }
1126    }
1127 
1128 #if GFX_VERx10 == 125
1129    /* DG2: Wa_22011440098
1130     * MTL: Wa_18022330953
1131     *
1132     * In 3D mode, after programming push constant alloc command immediately
1133     * program push constant command(ZERO length) without any commit between
1134     * them.
1135     */
1136    iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
1137       /* Update empty push constants for all stages (bitmask = 11111b) */
1138       c.ShaderUpdateEnable = 0x1f;
1139       c.MOCS = iris_mocs(NULL, &batch->screen->isl_dev, 0);
1140    }
1141 #endif
1142 }
1143 
1144 #if GFX_VER >= 12
1145 static void
1146 init_aux_map_state(struct iris_batch *batch);
1147 #endif
1148 
1149 /* This updates a register. Caller should stall the pipeline as needed. */
1150 static void
1151 iris_disable_rhwo_optimization(struct iris_batch *batch, bool disable)
1152 {
1153    assert(batch->screen->devinfo->verx10 == 120);
1154 #if GFX_VERx10 == 120
1155    iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1156       c1.RCCRHWOOptimizationDisable = disable;
1157       c1.RCCRHWOOptimizationDisableMask = true;
1158    };
1159 #endif
1160 }
1161 
1162 static void
1163 state_system_mem_fence_address_emit(struct iris_batch *batch)
1164 {
1165 #if GFX_VERx10 >= 200
1166    struct iris_screen *screen = batch->screen;
1167    struct iris_address addr = { .bo = iris_bufmgr_get_mem_fence_bo(screen->bufmgr) };
1168    iris_emit_cmd(batch, GENX(STATE_SYSTEM_MEM_FENCE_ADDRESS), mem_fence_addr) {
1169       mem_fence_addr.SystemMemoryFenceAddress = addr;
1170    }
1171 #endif
1172 }
1173 
1174 /**
1175  * Upload initial GPU state for any kind of context.
1176  *
1177  * These need to happen for both render and compute.
1178  */
1179 static void
1180 iris_init_common_context(struct iris_batch *batch)
1181 {
1182 #if GFX_VER == 11
1183    iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
1184       reg.HeaderlessMessageforPreemptableContexts = 1;
1185       reg.HeaderlessMessageforPreemptableContextsMask = 1;
1186    }
1187 
1188    /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
1189    iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
1190       reg.EnabledTexelOffsetPrecisionFix = 1;
1191       reg.EnabledTexelOffsetPrecisionFixMask = 1;
1192    }
1193 #endif
1194 
1195    /* Select 256B-aligned binding table mode on Icelake through Tigerlake,
1196     * which gives us larger binding table pointers, at the cost of higher
1197     * alignment requirements (bits 18:8 are valid instead of 15:5).  When
1198     * using this mode, we have to shift binding table pointers by 3 bits,
1199     * as they're still stored in the same bit-location in the field.
1200     */
1201 #if GFX_VER >= 11 && GFX_VERx10 < 125
1202    iris_emit_reg(batch, GENX(GT_MODE), reg) {
1203       reg.BindingTableAlignment = BTP_18_8;
1204       reg.BindingTableAlignmentMask = true;
1205    }
1206 #endif
1207 
1208 #if GFX_VERx10 == 125
1209    /* Even though L3 partial write merging is supposed to be enabled
1210     * by default on Gfx12.5 according to the hardware spec, i915
1211     * appears to accidentally clear the enables during context
1212     * initialization, so make sure to enable them here since partial
1213     * write merging has a large impact on rendering performance.
1214     */
1215    iris_emit_reg(batch, GENX(L3SQCREG5), reg) {
1216       reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
1217       reg.CompressiblePartialWriteMergeEnable = true;
1218       reg.CoherentPartialWriteMergeEnable = true;
1219       reg.CrossTilePartialWriteMergeEnable = true;
1220    }
1221 #endif
1222 
1223    state_system_mem_fence_address_emit(batch);
1224 }
1225 
1226 static void
1227 toggle_protected(struct iris_batch *batch)
1228 {
1229    struct iris_context *ice;
1230 
1231    if (batch->name == IRIS_BATCH_RENDER)
1232       ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_RENDER]);
1233    else if (batch->name == IRIS_BATCH_COMPUTE)
1234       ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_COMPUTE]);
1235    else
1236       unreachable("unhandled batch");
1237 
1238    if (!ice->protected)
1239       return;
1240 
1241 #if GFX_VER >= 12
1242    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1243       pc.CommandStreamerStallEnable = true;
1244       pc.RenderTargetCacheFlushEnable = true;
1245       pc.ProtectedMemoryDisable = true;
1246    }
1247    iris_emit_cmd(batch, GENX(MI_SET_APPID), appid) {
1248       /* Default value for single session. */
1249       appid.ProtectedMemoryApplicationID = 0xf;
1250       appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
1251    }
1252    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1253       pc.CommandStreamerStallEnable = true;
1254       pc.RenderTargetCacheFlushEnable = true;
1255       pc.ProtectedMemoryEnable = true;
1256    }
1257 #else
1258    unreachable("Not supported");
1259 #endif
1260 }
1261 
1262 #if GFX_VER >= 20
1263 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
1264 #else
1265 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
1266 #endif
1267 
1268 /**
1269  * Upload the initial GPU state for a render context.
1270  *
1271  * This sets some invariant state that needs to be programmed a particular
1272  * way, but we never actually change.
1273  */
1274 static void
1275 iris_init_render_context(struct iris_batch *batch)
1276 {
1277    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1278 
1279    iris_batch_sync_region_start(batch);
1280 
1281    emit_pipeline_select(batch, _3D);
1282 
1283    toggle_protected(batch);
1284 
1285    iris_emit_l3_config(batch, batch->screen->l3_config_3d);
1286 
1287    init_state_base_address(batch);
1288 
1289    iris_init_common_context(batch);
1290 
1291 #if GFX_VER >= 9
1292    iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1293       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1294       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1295    }
1296 #else
1297    iris_emit_reg(batch, GENX(INSTPM), reg) {
1298       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1299       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1300    }
1301 #endif
1302 
1303 #if GFX_VER == 9
1304    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1305       reg.FloatBlendOptimizationEnable = true;
1306       reg.FloatBlendOptimizationEnableMask = true;
1307       reg.MSCRAWHazardAvoidanceBit = true;
1308       reg.MSCRAWHazardAvoidanceBitMask = true;
1309       reg.PartialResolveDisableInVC = true;
1310       reg.PartialResolveDisableInVCMask = true;
1311    }
1312 
1313    if (devinfo->platform == INTEL_PLATFORM_GLK)
1314       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1315 #endif
1316 
1317 #if GFX_VER == 11
1318    iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1319       reg.L3DataPartialWriteMergingEnable = true;
1320       reg.ColorZPartialWriteMergingEnable = true;
1321       reg.URBPartialWriteMergingEnable = true;
1322       reg.TCDisable = true;
1323    }
1324 
1325    /* The hardware specification recommends disabling repacking for
1326     * compatibility with the decompression mechanism in the display controller.
1327     */
1328    if (devinfo->disable_ccs_repack) {
1329       iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1330          reg.DisableRepackingforCompression = true;
1331          reg.DisableRepackingforCompressionMask = true;
1332       }
1333    }
1334 #endif
1335 
1336 #if GFX_VER == 12
1337    iris_emit_reg(batch, GENX(FF_MODE2), reg) {
1338       /* On Alchemist, the FF_MODE2 docs for the GS timer say:
1339        *
1340        *    "The timer value must be set to 224."
1341        *
1342        * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
1343        * and that this is necessary to avoid hanging the HS/DS units.  It
1344        * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
1345        *
1346        * The HS timer docs also have the same quote for Alchemist.  I am
1347        * unaware of a reason it needs to be set to 224 on Tigerlake, but
1348        * we do so for consistency if nothing else.
1349        *
1350        * For the TDS timer value, the docs say:
1351        *
1352        *    "For best performance, a value of 4 should be programmed."
1353        *
1354        * i915 also sets it this way on Tigerlake due to workarounds.
1355        *
1356        * The default VS timer appears to be 0, so we leave it at that.
1357        */
1358       reg.GSTimerValue  = 224;
1359       reg.HSTimerValue  = 224;
1360       reg.TDSTimerValue = 4;
1361       reg.VSTimerValue  = 0;
1362    }
1363 #endif
1364 
1365 #if INTEL_NEEDS_WA_1508744258
1366    /* The suggested workaround is:
1367     *
1368     *    Disable RHWO by setting 0x7010[14] by default except during resolve
1369     *    pass.
1370     *
1371     * We implement global disabling of the optimization here and we toggle it
1372     * in iris_resolve_color.
1373     *
1374     * iris_init_compute_context is unmodified because we don't expect to
1375     * access the RCC in the compute context. iris_mcs_partial_resolve is
1376     * unmodified because that pass doesn't use a HW bit to perform the
1377     * resolve (related HSDs specifically call out the RenderTargetResolveType
1378     * field in the 3DSTATE_PS instruction).
1379     */
1380    iris_disable_rhwo_optimization(batch, true);
1381 #endif
1382 
1383 #if GFX_VERx10 == 120
1384    /* Wa_1806527549 says to disable the following HiZ optimization when the
1385     * depth buffer is D16_UNORM. We've found the WA to help with more depth
1386     * buffer configurations however, so we always disable it just to be safe.
1387     */
1388    iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
1389       reg.HZDepthTestLEGEOptimizationDisable = true;
1390       reg.HZDepthTestLEGEOptimizationDisableMask = true;
1391    }
1392 #endif
1393 
1394 #if GFX_VERx10 == 125
1395    iris_emit_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
1396       reg.TBIMRBatchSizeOverride = true;
1397       reg.TBIMROpenBatchEnable = true;
1398       reg.TBIMRFastClip = true;
1399       reg.TBIMRBatchSizeOverrideMask = true;
1400       reg.TBIMROpenBatchEnableMask = true;
1401       reg.TBIMRFastClipMask = true;
1402    }
1403 #endif
1404 
1405 #if GFX_VER >= 20
1406    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1407       p.DX10OGLBorderModeforYCRCB = true;
1408       p.DX10OGLBorderModeforYCRCBMask = true;
1409    }
1410 #endif
1411 
1412 #if GFX_VER >= 30
1413    iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
1414       cm.EnableVariableRegisterSizeAllocationMask = 1;
1415       cm.EnableVariableRegisterSizeAllocation = true;
1416    }
1417 #endif
1418 
1419    upload_pixel_hashing_tables(batch);
1420 
1421    /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1422     * changing it dynamically.  We set it to the maximum size here, and
1423     * instead include the render target dimensions in the viewport, so
1424     * viewport extents clipping takes care of pruning stray geometry.
1425     */
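   /* For example, with a 1920x1080 framebuffer we still program
    * XMax/YMax = 65535 below and rely on a viewport covering
    * [0, 1920) x [0, 1080) to prune stray geometry.
    */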
1426    iris_emit_cmd(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
1427       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1428       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1429    }
1430 
1431    /* Set the initial MSAA sample positions. */
1432    iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1433       INTEL_SAMPLE_POS_1X(pat._1xSample);
1434       INTEL_SAMPLE_POS_2X(pat._2xSample);
1435       INTEL_SAMPLE_POS_4X(pat._4xSample);
1436       INTEL_SAMPLE_POS_8X(pat._8xSample);
1437 #if GFX_VER >= 9
1438       INTEL_SAMPLE_POS_16X(pat._16xSample);
1439 #endif
1440    }
1441 
1442    /* Use the legacy AA line coverage computation. */
1443    iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1444 
1445    /* Disable chromakeying (it's for media) */
1446    iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1447 
1448    /* We want regular rendering, not special HiZ operations. */
1449    iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1450 
1451    /* No polygon stippling offsets are necessary. */
1452    /* TODO: may need to set an offset for origin-UL framebuffers */
1453    iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1454 
1455 #if GFX_VERx10 >= 125
1456    iris_emit_cmd(batch, GENX(3DSTATE_MESH_CONTROL), foo);
1457    iris_emit_cmd(batch, GENX(3DSTATE_TASK_CONTROL), foo);
1458 #endif
1459 
1460 #if INTEL_NEEDS_WA_14019857787
1461    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1462       p.EnableOOOreadsinRCPB = true;
1463       p.EnableOOOreadsinRCPBMask = true;
1464    }
1465 #endif
1466 
1467    iris_alloc_push_constants(batch);
1468 
1469 #if GFX_VER >= 12
1470    init_aux_map_state(batch);
1471 #endif
1472 
1473    iris_batch_sync_region_end(batch);
1474 }
1475 
1476 static void
1477 iris_init_compute_context(struct iris_batch *batch)
1478 {
1479    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1480 
1481    iris_batch_sync_region_start(batch);
1482 
1483    /* Wa_1607854226:
1484     *
1485     *  Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1486     */
1487 #if GFX_VERx10 == 120
1488    emit_pipeline_select(batch, _3D);
1489 #else
1490    emit_pipeline_select(batch, GPGPU);
1491 #endif
1492 
1493    toggle_protected(batch);
1494 
1495    iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1496 
1497    init_state_base_address(batch);
1498 
1499    iris_init_common_context(batch);
1500 
1501 #if GFX_VERx10 == 120
1502    emit_pipeline_select(batch, GPGPU);
1503 #endif
1504 
1505 #if GFX_VER == 9
1506    if (devinfo->platform == INTEL_PLATFORM_GLK)
1507       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1508 #endif
1509 
1510 #if GFX_VER >= 12
1511    init_aux_map_state(batch);
1512 #endif
1513 
1514 #if GFX_VERx10 >= 125
1515    /* Wa_14015782607 - Issue pipe control with HDC_flush and
1516     * untyped cache flush set to 1 when CCS has NP state update with
1517     * STATE_COMPUTE_MODE.
1518     */
1519    if (intel_needs_workaround(devinfo, 14015782607))
1520       iris_emit_pipe_control_flush(batch, "Wa_14015782607",
1521                                    PIPE_CONTROL_CS_STALL |
1522                                    PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1523                                    PIPE_CONTROL_FLUSH_HDC);
1524 
1525    /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
1526     * emitting NP state commands with ATS-M in compute mode.
1527     */
1528    if (intel_device_info_is_atsm(devinfo))
1529       iris_emit_pipe_control_flush(batch, "Wa_14014427904/22013045878",
1530                                    PIPE_CONTROL_CS_STALL |
1531                                    PIPE_CONTROL_STATE_CACHE_INVALIDATE |
1532                                    PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1533                                    PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1534                                    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1535                                    PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1536                                    PIPE_CONTROL_FLUSH_HDC);
1537 
1538    iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
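      /* The *Mask fields below gate which bits of this masked state packet
       * are actually updated; unmasked bits keep their previous value.
       */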
1539 #if GFX_VER >= 30
1540       cm.EnableVariableRegisterSizeAllocationMask = 1;
1541       cm.EnableVariableRegisterSizeAllocation = true;
1542 #endif
1543 #if GFX_VER >= 20
1544       cm.AsyncComputeThreadLimit = ACTL_Max8;
1545       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1546       cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
1547       cm.AsyncComputeThreadLimitMask = 0x7;
1548       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1549       cm.ZAsyncThrottlesettingsMask = 0x3;
1550 #else
1551       cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
1552       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1553       cm.PixelAsyncComputeThreadLimitMask = 0x7;
1554       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1555       if (intel_device_info_is_mtl_or_arl(devinfo)) {
1556          cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
1557          cm.ZAsyncThrottlesettingsMask = 0x3;
1558       }
1559 #endif
1560    }
1561 #endif
1562 
1563 #if GFX_VERx10 >= 125
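   /* CFE_STATE advertises the total compute thread budget: the per-subslice
    * thread limit scaled by the number of subslices present.
    */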
1564    iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
1565       cfe.MaximumNumberofThreads =
1566          devinfo->max_cs_threads * devinfo->subslice_total;
1567    }
1568 #endif
1569 
1570    iris_batch_sync_region_end(batch);
1571 }
1572 
1573 static void
1574 iris_init_copy_context(struct iris_batch *batch)
1575 {
1576    iris_batch_sync_region_start(batch);
1577 
1578 #if GFX_VER >= 12
1579    init_aux_map_state(batch);
1580 #endif
1581 
1582    state_system_mem_fence_address_emit(batch);
1583 
1584    iris_batch_sync_region_end(batch);
1585 }
1586 
1587 struct iris_vertex_buffer_state {
1588    /** The VERTEX_BUFFER_STATE hardware structure. */
1589    uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1590 
1591    /** The resource to source vertex data from. */
1592    struct pipe_resource *resource;
1593 
1594    int offset;
1595 };
1596 
1597 struct iris_depth_buffer_state {
1598    /* Depth/HiZ/Stencil related hardware packets. */
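   /* 3DSTATE_CLEAR_PARAMS no longer exists on Gfx20+, so the packet group
    * omits it there.
    */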
1599 #if GFX_VER < 20
1600    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1601                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1602                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
1603                     GENX(3DSTATE_CLEAR_PARAMS_length)];
1604 #else
1605    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1606                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1607                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
1608 #endif
1609 };
1610 
1611 #if INTEL_NEEDS_WA_1808121037
1612 enum iris_depth_reg_mode {
1613    IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
1614    IRIS_DEPTH_REG_MODE_D16_1X_MSAA,
1615    IRIS_DEPTH_REG_MODE_UNKNOWN,
1616 };
1617 #endif
1618 
1619 /**
1620  * Generation-specific context state (ice->state.genx->...).
1621  *
1622  * Most state can go in iris_context directly, but these encode hardware
1623  * packets which vary by generation.
1624  */
1625 struct iris_genx_state {
1626    struct iris_vertex_buffer_state vertex_buffers[33];
1627    uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1628 
1629    struct iris_depth_buffer_state depth_buffer;
1630 
1631    uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1632 
1633 #if GFX_VER == 8
1634    bool pma_fix_enabled;
1635 #endif
1636 
1637    /* Is object level preemption enabled? */
1638    bool object_preemption;
1639 
1640 #if INTEL_NEEDS_WA_1808121037
1641    enum iris_depth_reg_mode depth_reg_mode;
1642 #endif
1643 
1644    struct {
1645 #if GFX_VER == 8
1646       struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1647 #endif
1648    } shaders[MESA_SHADER_STAGES];
1649 };
1650 
1651 /**
1652  * The pipe->set_blend_color() driver hook.
1653  *
1654  * This corresponds to our COLOR_CALC_STATE.
1655  */
1656 static void
1657 iris_set_blend_color(struct pipe_context *ctx,
1658                      const struct pipe_blend_color *state)
1659 {
1660    struct iris_context *ice = (struct iris_context *) ctx;
1661 
1662    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1663    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1664    ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1665 }
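/* As a rough usage sketch (frontend side, not part of iris), a state
 * tracker would invoke the hook along these lines:
 *
 *    struct pipe_blend_color bc = { .color = { 1.0f, 0.5f, 0.25f, 1.0f } };
 *    ctx->set_blend_color(ctx, &bc);
 *
 * which simply lands in ice->state.blend_color above.
 */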
1666 
1667 /**
1668  * Gallium CSO for blend state (see pipe_blend_state).
1669  */
1670 struct iris_blend_state {
1671    /** Partial 3DSTATE_PS_BLEND */
1672    uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1673 
1674    /** Partial BLEND_STATE */
1675    uint32_t blend_state[GENX(BLEND_STATE_length) +
1676                         IRIS_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1677 
1678    bool alpha_to_coverage; /* for shader key */
1679 
1680    /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1681    uint8_t blend_enables;
1682 
1683    /** Bitfield of whether color writes are enabled for RT[i] */
1684    uint8_t color_write_enables;
1685 
1686    /** Does RT[0] use dual color blending? */
1687    bool dual_color_blending;
1688 
1689    int ps_dst_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1690    int ps_dst_alpha_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1691 };
1692 
1693 static enum pipe_blendfactor
1694 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1695 {
1696    if (alpha_to_one) {
1697       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1698          return PIPE_BLENDFACTOR_ONE;
1699 
1700       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1701          return PIPE_BLENDFACTOR_ZERO;
1702    }
1703 
1704    return f;
1705 }
1706 
1707 /**
1708  * The pipe->create_blend_state() driver hook.
1709  *
1710  * Translates a pipe_blend_state into iris_blend_state.
1711  */
1712 static void *
1713 iris_create_blend_state(struct pipe_context *ctx,
1714                         const struct pipe_blend_state *state)
1715 {
1716    struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
1717    uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1718 
1719    cso->blend_enables = 0;
1720    cso->color_write_enables = 0;
1721    STATIC_ASSERT(IRIS_MAX_DRAW_BUFFERS <= 8);
1722 
1723    cso->alpha_to_coverage = state->alpha_to_coverage;
1724 
1725    bool indep_alpha_blend = false;
1726 
1727    for (int i = 0; i < IRIS_MAX_DRAW_BUFFERS; i++) {
1728       const struct pipe_rt_blend_state *rt =
1729          &state->rt[state->independent_blend_enable ? i : 0];
1730 
1731       enum pipe_blendfactor src_rgb =
1732          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1733       enum pipe_blendfactor src_alpha =
1734          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1735       enum pipe_blendfactor dst_rgb =
1736          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1737       enum pipe_blendfactor dst_alpha =
1738          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1739 
1740       /* Stored separately in cso for dynamic emission. */
1741       cso->ps_dst_blend_factor[i] = (int) dst_rgb;
1742       cso->ps_dst_alpha_blend_factor[i] = (int) dst_alpha;
1743 
1744       if (rt->rgb_func != rt->alpha_func ||
1745           src_rgb != src_alpha || dst_rgb != dst_alpha)
1746          indep_alpha_blend = true;
1747 
1748       if (rt->blend_enable)
1749          cso->blend_enables |= 1u << i;
1750 
1751       if (rt->colormask)
1752          cso->color_write_enables |= 1u << i;
1753 
1754       iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1755          be.LogicOpEnable = state->logicop_enable;
1756          be.LogicOpFunction = state->logicop_func;
1757 
1758          be.PreBlendSourceOnlyClampEnable = false;
1759          be.ColorClampRange = COLORCLAMP_RTFORMAT;
1760          be.PreBlendColorClampEnable = true;
1761          be.PostBlendColorClampEnable = true;
1762 
1763          be.ColorBufferBlendEnable = rt->blend_enable;
1764 
1765          be.ColorBlendFunction          = rt->rgb_func;
1766          be.AlphaBlendFunction          = rt->alpha_func;
1767 
1768          /* The casts prevent warnings about implicit enum type conversions. */
1769          be.SourceBlendFactor           = (int) src_rgb;
1770          be.SourceAlphaBlendFactor      = (int) src_alpha;
1771 
1772          be.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
1773          be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1774          be.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
1775          be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1776       }
1777       blend_entry += GENX(BLEND_STATE_ENTRY_length);
1778    }
1779 
1780    iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1781       /* pb.HasWriteableRT is filled in at draw time.
1782        * pb.AlphaTestEnable is filled in at draw time.
1783        *
1784        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1785        * setting it when dual color blending without an appropriate shader.
1786        */
1787 
1788       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1789       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1790 
1791       /* The casts prevent warnings about implicit enum type conversions. */
1792       pb.SourceBlendFactor =
1793          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1794       pb.SourceAlphaBlendFactor =
1795          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1796    }
1797 
1798    iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1799       bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1800       bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1801       bs.AlphaToOneEnable = state->alpha_to_one;
1802       bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage_dither;
1803       bs.ColorDitherEnable = state->dither;
1804       /* bs.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1805    }
1806 
1807    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1808 
1809    return cso;
1810 }
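/* A rough sketch of the frontend-side lifecycle for this CSO (not iris
 * code), where "blend" is a hypothetical filled-in pipe_blend_state:
 *
 *    void *cso = ctx->create_blend_state(ctx, &blend);
 *    ctx->bind_blend_state(ctx, cso);
 *    ...
 *    ctx->delete_blend_state(ctx, cso);
 */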
1811 
1812 /**
1813  * The pipe->bind_blend_state() driver hook.
1814  *
1815  * Bind a blending CSO and flag related dirty bits.
1816  */
1817 static void
1818 iris_bind_blend_state(struct pipe_context *ctx, void *state)
1819 {
1820    struct iris_context *ice = (struct iris_context *) ctx;
1821    struct iris_blend_state *cso = state;
1822 
1823    ice->state.cso_blend = cso;
1824 
1825    ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1826    ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1827    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1828 
1829    if (GFX_VER == 8)
1830       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1831 }
1832 
1833 /**
1834  * Return true if the FS writes to any color outputs which are not disabled
1835  * via color masking.
1836  */
1837 static bool
1838 has_writeable_rt(const struct iris_blend_state *cso_blend,
1839                  const struct shader_info *fs_info)
1840 {
1841    if (!fs_info)
1842       return false;
1843 
1844    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1845 
1846    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1847       rt_outputs = (1 << IRIS_MAX_DRAW_BUFFERS) - 1;
1848 
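   /* Worked example: a shader writing FRAG_RESULT_DATA0 and DATA2 yields
    * rt_outputs = 0b101; with color_write_enables = 0b110 the overlap is
    * 0b100 (RT2), so we report a writeable render target.
    */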
1849    return cso_blend->color_write_enables & rt_outputs;
1850 }
1851 
1852 /**
1853  * Gallium CSO for depth, stencil, and alpha testing state.
1854  */
1855 struct iris_depth_stencil_alpha_state {
1856    /** Partial 3DSTATE_WM_DEPTH_STENCIL. */
1857    uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
1858 
1859 #if GFX_VER >= 12
1860    uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
1861 #endif
1862 
1863    /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */
1864    unsigned alpha_enabled:1;
1865    unsigned alpha_func:3;     /**< PIPE_FUNC_x */
1866    float alpha_ref_value;     /**< reference value */
1867 
1868    /** Outbound to resolve and cache set tracking. */
1869    bool depth_writes_enabled;
1870    bool stencil_writes_enabled;
1871 
1872    /** Outbound to Gfx8-9 PMA stall equations */
1873    bool depth_test_enabled;
1874 
1875    /** Tracking state of DS writes for Wa_18019816803. */
1876    bool ds_write_state;
1877 };
1878 
1879 /**
1880  * The pipe->create_depth_stencil_alpha_state() driver hook.
1881  *
1882  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1883  * testing state since we need pieces of it in a variety of places.
1884  */
1885 static void *
1886 iris_create_zsa_state(struct pipe_context *ctx,
1887                       const struct pipe_depth_stencil_alpha_state *state)
1888 {
1889    struct iris_depth_stencil_alpha_state *cso =
1890       malloc(sizeof(struct iris_depth_stencil_alpha_state));
1891 
1892    bool two_sided_stencil = state->stencil[1].enabled;
1893 
1894    bool depth_write_enabled = false;
1895    bool stencil_write_enabled = false;
1896 
1897    /* Depth writes enabled? */
1898    if (state->depth_writemask &&
1899       ((!state->depth_enabled) ||
1900       ((state->depth_func != PIPE_FUNC_NEVER) &&
1901         (state->depth_func != PIPE_FUNC_EQUAL))))
1902       depth_write_enabled = true;
1903 
1904    bool stencil_all_keep =
1905       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1906       state->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1907       state->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1908       (!two_sided_stencil ||
1909        (state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1910         state->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1911         state->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP));
1912 
1913    bool stencil_mask_zero =
1914       state->stencil[0].writemask == 0 ||
1915       (!two_sided_stencil || state->stencil[1].writemask == 0);
1916 
1917    bool stencil_func_never =
1918       state->stencil[0].func == PIPE_FUNC_NEVER &&
1919       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1920       (!two_sided_stencil ||
1921        (state->stencil[1].func == PIPE_FUNC_NEVER &&
1922         state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP));
1923 
1924    /* Stencil writes enabled? */
1925    if (state->stencil[0].writemask != 0 ||
1926       ((two_sided_stencil && state->stencil[1].writemask != 0) &&
1927        (!stencil_all_keep &&
1928         !stencil_mask_zero &&
1929         !stencil_func_never)))
1930       stencil_write_enabled = true;
1931 
1932    cso->ds_write_state = depth_write_enabled || stencil_write_enabled;
1933 
1934    cso->alpha_enabled = state->alpha_enabled;
1935    cso->alpha_func = state->alpha_func;
1936    cso->alpha_ref_value = state->alpha_ref_value;
1937    cso->depth_writes_enabled = state->depth_writemask;
1938    cso->depth_test_enabled = state->depth_enabled;
1939    cso->stencil_writes_enabled =
1940       state->stencil[0].writemask != 0 ||
1941       (two_sided_stencil && state->stencil[1].writemask != 0);
1942 
1943    /* gallium frontends need to optimize away EQUAL writes for us. */
1944    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1945 
1946    iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1947       wmds.StencilFailOp = state->stencil[0].fail_op;
1948       wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1949       wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1950       wmds.StencilTestFunction =
1951          translate_compare_func(state->stencil[0].func);
1952       wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1953       wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1954       wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1955       wmds.BackfaceStencilTestFunction =
1956          translate_compare_func(state->stencil[1].func);
1957       wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1958       wmds.DoubleSidedStencilEnable = two_sided_stencil;
1959       wmds.StencilTestEnable = state->stencil[0].enabled;
1960       wmds.StencilBufferWriteEnable =
1961          state->stencil[0].writemask != 0 ||
1962          (two_sided_stencil && state->stencil[1].writemask != 0);
1963       wmds.DepthTestEnable = state->depth_enabled;
1964       wmds.DepthBufferWriteEnable = state->depth_writemask;
1965       wmds.StencilTestMask = state->stencil[0].valuemask;
1966       wmds.StencilWriteMask = state->stencil[0].writemask;
1967       wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1968       wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1969       /* wmds.[Backface]StencilReferenceValue are merged later */
1970 #if GFX_VER >= 12
1971       wmds.StencilReferenceValueModifyDisable = true;
1972 #endif
1973    }
1974 
1975 #if GFX_VER >= 12
1976    iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1977       depth_bounds.DepthBoundsTestValueModifyDisable = false;
1978       depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1979       depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1980       depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1981       depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1982    }
1983 #endif
1984 
1985    return cso;
1986 }
1987 
1988 /**
1989  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1990  *
1991  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1992  */
1993 static void
1994 iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1995 {
1996    struct iris_context *ice = (struct iris_context *) ctx;
1997    struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1998    struct iris_depth_stencil_alpha_state *new_cso = state;
1999 
2000    if (new_cso) {
2001       if (cso_changed(alpha_ref_value))
2002          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
2003 
2004       if (cso_changed(alpha_enabled))
2005          ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
2006 
2007       if (cso_changed(alpha_func))
2008          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
2009 
2010       if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
2011          ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
2012 
2013       ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
2014       ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
2015 
2016       /* If the DS write state changed, we need to flag the DS write enable as dirty. */
2017       if (!old_cso || (ice->state.ds_write_state != new_cso->ds_write_state)) {
2018          ice->state.dirty |= IRIS_DIRTY_DS_WRITE_ENABLE;
2019          ice->state.ds_write_state = new_cso->ds_write_state;
2020       }
2021 
2022 #if GFX_VER >= 12
2023       if (cso_changed(depth_bounds))
2024          ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
2025 #endif
2026    }
2027 
2028    ice->state.cso_zsa = new_cso;
2029    ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2030    ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
2031    ice->state.stage_dirty |=
2032       ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
2033 
2034    if (GFX_VER == 8)
2035       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
2036 }
2037 
2038 #if GFX_VER == 8
2039 static bool
2040 want_pma_fix(struct iris_context *ice)
2041 {
2042    UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
2043    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2044    const struct iris_fs_data *fs_data =
2045       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
2046    const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
2047    const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
2048    const struct iris_blend_state *cso_blend = ice->state.cso_blend;
2049 
2050    /* In very specific combinations of state, we can instruct Gfx8-9 hardware
2051     * to avoid stalling at the pixel mask array.  The state equations are
2052     * documented in these places:
2053     *
2054     * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
2055     * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
2056     *
2057     * Both equations share some common elements:
2058     *
2059     *    no_hiz_op =
2060     *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2061     *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2062     *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2063     *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
2064     *
2065     *    killpixels =
2066     *       3DSTATE_WM::ForceKillPix != ForceOff &&
2067     *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2068     *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2069     *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2070     *        3DSTATE_PS_BLEND::AlphaTestEnable ||
2071     *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2072     *
2073     *    (Technically the stencil PMA treats ForceKillPix differently,
2074     *     but I think this is a documentation oversight, and we don't
2075     *     ever use it in this way, so it doesn't matter).
2076     *
2077     *    common_pma_fix =
2078     *       3DSTATE_WM::ForceThreadDispatch != 1 &&
2079     *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
2080     *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2081     *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2082     *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
2083     *       3DSTATE_PS_EXTRA::PixelShaderValid &&
2084     *       no_hiz_op
2085     *
2086     * These are always true:
2087     *
2088     *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
2089     *    3DSTATE_PS_EXTRA::PixelShaderValid
2090     *
2091     * Also, we never use the normal drawing path for HiZ ops; these are true:
2092     *
2093     *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2094     *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2095     *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2096     *      3DSTATE_WM_HZ_OP::StencilBufferClear)
2097     *
2098     * This happens sometimes:
2099     *
2100     *    3DSTATE_WM::ForceThreadDispatch != 1
2101     *
2102     * However, we choose to ignore it as it either agrees with the signal
2103     * (dispatch was already enabled, so nothing out of the ordinary), or
2104     * there are no framebuffer attachments (so no depth or HiZ anyway,
2105     * meaning the PMA signal will already be disabled).
2106     */
2107 
2108    if (!cso_fb->zsbuf)
2109       return false;
2110 
2111    struct iris_resource *zres, *sres;
2112    iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
2113 
2114    /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2115     * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2116     */
2117    if (!zres ||
2118        !iris_resource_level_has_hiz(devinfo, zres, cso_fb->zsbuf->u.tex.level))
2119       return false;
2120 
2121    /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
2122    if (fs_data->early_fragment_tests)
2123       return false;
2124 
2125    /* 3DSTATE_WM::ForceKillPix != ForceOff &&
2126     * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2127     *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2128     *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2129     *  3DSTATE_PS_BLEND::AlphaTestEnable ||
2130     *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2131     */
2132    bool killpixels = fs_data->uses_kill || fs_data->uses_omask ||
2133                      cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;
2134 
2135    /* The Gfx8 depth PMA equation becomes:
2136     *
2137     *    depth_writes =
2138     *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
2139     *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
2140     *
2141     *    stencil_writes =
2142     *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
2143     *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
2144     *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
2145     *
2146     *    Z_PMA_OPT =
2147     *       common_pma_fix &&
2148     *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
2149     *       ((killpixels && (depth_writes || stencil_writes)) ||
2150     *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
2151     *
2152     */
2153    if (!cso_zsa->depth_test_enabled)
2154       return false;
2155 
2156    return fs_data->computed_depth_mode != PSCDEPTH_OFF ||
2157           (killpixels && (cso_zsa->depth_writes_enabled ||
2158                           (sres && cso_zsa->stencil_writes_enabled)));
2159 }
2160 #endif
2161 
2162 void
2163 genX(update_pma_fix)(struct iris_context *ice,
2164                      struct iris_batch *batch,
2165                      bool enable)
2166 {
2167 #if GFX_VER == 8
2168    struct iris_genx_state *genx = ice->state.genx;
2169 
2170    if (genx->pma_fix_enabled == enable)
2171       return;
2172 
2173    genx->pma_fix_enabled = enable;
2174 
2175    /* According to the Broadwell PIPE_CONTROL documentation, software should
2176     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2177     * prior to the LRI.  If stencil buffer writes are enabled, then a Render
2178     * Cache Flush is also necessary.
2179     *
2180     * The Gfx9 docs say to use a depth stall rather than a command streamer
2181     * stall.  However, the hardware seems to violently disagree.  A full
2182     * command streamer stall seems to be needed in both cases.
2183     */
2184    iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2185                                 PIPE_CONTROL_CS_STALL |
2186                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2187                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2188 
2189    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
2190       reg.NPPMAFixEnable = enable;
2191       reg.NPEarlyZFailsDisable = enable;
2192       reg.NPPMAFixEnableMask = true;
2193       reg.NPEarlyZFailsDisableMask = true;
2194    }
2195 
2196    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2197     * Flush bits is often necessary.  We do it regardless because it's easier.
2198     * The render cache flush is also necessary if stencil writes are enabled.
2199     *
2200     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
2201     * flushes seem to work just as well.
2202     */
2203    iris_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
2204                                 PIPE_CONTROL_DEPTH_STALL |
2205                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2206                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2207 #endif
2208 }
2209 
2210 /**
2211  * Gallium CSO for rasterizer state.
2212  */
2213 struct iris_rasterizer_state {
2214    uint32_t sf[GENX(3DSTATE_SF_length)];
2215    uint32_t clip[GENX(3DSTATE_CLIP_length)];
2216    uint32_t raster[GENX(3DSTATE_RASTER_length)];
2217    uint32_t wm[GENX(3DSTATE_WM_length)];
2218    uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
2219 
2220    uint8_t num_clip_plane_consts;
2221    bool clip_halfz; /* for CC_VIEWPORT */
2222    bool depth_clip_near; /* for CC_VIEWPORT */
2223    bool depth_clip_far; /* for CC_VIEWPORT */
2224    bool flatshade; /* for shader state */
2225    bool flatshade_first; /* for stream output */
2226    bool clamp_fragment_color; /* for shader state */
2227    bool light_twoside; /* for shader state */
2228    bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
2229    bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
2230    bool line_smooth;
2231    bool line_stipple_enable;
2232    bool poly_stipple_enable;
2233    bool multisample;
2234    bool force_persample_interp;
2235    bool conservative_rasterization;
2236    bool fill_mode_point;
2237    bool fill_mode_line;
2238    bool fill_mode_point_or_line;
2239    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
2240    uint16_t sprite_coord_enable;
2241 };
2242 
2243 static float
2244 get_line_width(const struct pipe_rasterizer_state *state)
2245 {
2246    float line_width = state->line_width;
2247 
2248    /* From the OpenGL 4.4 spec:
2249     *
2250     * "The actual width of non-antialiased lines is determined by rounding
2251     *  the supplied width to the nearest integer, then clamping it to the
2252     *  implementation-dependent maximum non-antialiased line width."
2253     */
2254    if (!state->multisample && !state->line_smooth)
2255       line_width = roundf(state->line_width);
2256 
2257    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
2258       /* For 1 pixel line thickness or less, the general anti-aliasing
2259        * algorithm gives up, and a garbage line is generated.  Setting a
2260        * Line Width of 0.0 specifies the rasterization of the "thinnest"
2261        * (one-pixel-wide), non-antialiased lines.
2262        *
2263        * Lines rendered with zero Line Width are rasterized using the
2264        * "Grid Intersection Quantization" rules as specified by the
2265        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
2266        */
2267       line_width = 0.0f;
2268    }
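   /* Worked examples: a 1.4px aliased single-sampled line rounds to 1.0,
    * while a 1.2px smoothed line falls under the 1.5 threshold and uses the
    * zero-width "cosmetic" rule above.
    */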
2269 
2270    return line_width;
2271 }
2272 
2273 /**
2274  * The pipe->create_rasterizer_state() driver hook.
2275  */
2276 static void *
2277 iris_create_rasterizer_state(struct pipe_context *ctx,
2278                              const struct pipe_rasterizer_state *state)
2279 {
2280    struct iris_rasterizer_state *cso =
2281       malloc(sizeof(struct iris_rasterizer_state));
2282 
2283    cso->multisample = state->multisample;
2284    cso->force_persample_interp = state->force_persample_interp;
2285    cso->clip_halfz = state->clip_halfz;
2286    cso->depth_clip_near = state->depth_clip_near;
2287    cso->depth_clip_far = state->depth_clip_far;
2288    cso->flatshade = state->flatshade;
2289    cso->flatshade_first = state->flatshade_first;
2290    cso->clamp_fragment_color = state->clamp_fragment_color;
2291    cso->light_twoside = state->light_twoside;
2292    cso->rasterizer_discard = state->rasterizer_discard;
2293    cso->half_pixel_center = state->half_pixel_center;
2294    cso->sprite_coord_mode = state->sprite_coord_mode;
2295    cso->sprite_coord_enable = state->sprite_coord_enable;
2296    cso->line_smooth = state->line_smooth;
2297    cso->line_stipple_enable = state->line_stipple_enable;
2298    cso->poly_stipple_enable = state->poly_stipple_enable;
2299    cso->conservative_rasterization =
2300       state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
2301 
2302    cso->fill_mode_point =
2303       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2304       state->fill_back == PIPE_POLYGON_MODE_POINT;
2305    cso->fill_mode_line =
2306       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2307       state->fill_back == PIPE_POLYGON_MODE_LINE;
2308    cso->fill_mode_point_or_line =
2309       cso->fill_mode_point ||
2310       cso->fill_mode_line;
2311 
2312    if (state->clip_plane_enable != 0)
2313       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2314    else
2315       cso->num_clip_plane_consts = 0;
2316 
2317    float line_width = get_line_width(state);
2318 
2319    iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2320       sf.StatisticsEnable = true;
2321       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2322       sf.LineEndCapAntialiasingRegionWidth =
2323          state->line_smooth ? _10pixels : _05pixels;
2324       sf.LastPixelEnable = state->line_last_pixel;
2325       sf.LineWidth = line_width;
2326       sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
2327                              !state->point_quad_rasterization;
2328       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2329       sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
2330 
2331       if (state->flatshade_first) {
2332          sf.TriangleFanProvokingVertexSelect = 1;
2333       } else {
2334          sf.TriangleStripListProvokingVertexSelect = 2;
2335          sf.TriangleFanProvokingVertexSelect = 2;
2336          sf.LineStripListProvokingVertexSelect = 1;
2337       }
2338    }
2339 
2340    iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2341       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2342       rr.CullMode = translate_cull_mode(state->cull_face);
2343       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2344       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2345       rr.DXMultisampleRasterizationEnable = state->multisample;
2346       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2347       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2348       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2349       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2350       rr.GlobalDepthOffsetScale = state->offset_scale;
2351       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2352       rr.SmoothPointEnable = state->point_smooth;
2353       rr.ScissorRectangleEnable = state->scissor;
2354 #if GFX_VER >= 9
2355       rr.ViewportZNearClipTestEnable = state->depth_clip_near;
2356       rr.ViewportZFarClipTestEnable = state->depth_clip_far;
2357       rr.ConservativeRasterizationEnable =
2358          cso->conservative_rasterization;
2359 #else
2360       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2361 #endif
2362    }
2363 
2364    iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2365       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2366        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2367        */
2368       cl.EarlyCullEnable = true;
2369       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2370       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2371       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2372       cl.GuardbandClipTestEnable = true;
2373       cl.ClipEnable = true;
2374       cl.MinimumPointWidth = 0.125;
2375       cl.MaximumPointWidth = 255.875;
2376 
2377       if (state->flatshade_first) {
2378          cl.TriangleFanProvokingVertexSelect = 1;
2379       } else {
2380          cl.TriangleStripListProvokingVertexSelect = 2;
2381          cl.TriangleFanProvokingVertexSelect = 2;
2382          cl.LineStripListProvokingVertexSelect = 1;
2383       }
2384    }
2385 
2386    iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
2387       /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
2388        * filled in at draw time from the FS program.
2389        */
2390       wm.LineAntialiasingRegionWidth = _10pixels;
2391       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2392       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2393       wm.LineStippleEnable = state->line_stipple_enable;
2394       wm.PolygonStippleEnable = state->poly_stipple_enable;
2395    }
2396 
2397    /* Remap from 0..255 back to 1..256 */
2398    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2399 
2400    iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2401       if (state->line_stipple_enable) {
2402          line.LineStipplePattern = state->line_stipple_pattern;
2403          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2404          line.LineStippleRepeatCount = line_stipple_factor;
2405       }
2406    }
2407 
2408    return cso;
2409 }
2410 
2411 /**
2412  * The pipe->bind_rasterizer_state() driver hook.
2413  *
2414  * Bind a rasterizer CSO and flag related dirty bits.
2415  */
2416 static void
2417 iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2418 {
2419    struct iris_context *ice = (struct iris_context *) ctx;
2420    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
2421    struct iris_rasterizer_state *new_cso = state;
2422 
2423    if (new_cso) {
2424       /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2425       if (cso_changed_memcmp(line_stipple))
2426          ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
2427 
2428       if (cso_changed(half_pixel_center))
2429          ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
2430 
2431       if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
2432          ice->state.dirty |= IRIS_DIRTY_WM;
2433 
2434       if (cso_changed(rasterizer_discard))
2435          ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
2436 
2437       if (cso_changed(flatshade_first))
2438          ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
2439 
2440       if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
2441           cso_changed(clip_halfz))
2442          ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2443 
2444       if (cso_changed(sprite_coord_enable) ||
2445           cso_changed(sprite_coord_mode) ||
2446           cso_changed(light_twoside))
2447          ice->state.dirty |= IRIS_DIRTY_SBE;
2448 
2449       if (cso_changed(conservative_rasterization))
2450          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
2451    }
2452 
2453    ice->state.cso_rast = new_cso;
2454    ice->state.dirty |= IRIS_DIRTY_RASTER;
2455    ice->state.dirty |= IRIS_DIRTY_CLIP;
2456    ice->state.stage_dirty |=
2457       ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
2458 }
2459 
2460 /**
2461  * Return true if the given wrap mode requires the border color to exist.
2462  *
2463  * (We can skip uploading it if the sampler isn't going to use it.)
2464  */
2465 static bool
2466 wrap_mode_needs_border_color(unsigned wrap_mode)
2467 {
2468    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2469 }
2470 
2471 /**
2472  * Gallium CSO for sampler state.
2473  */
2474 struct iris_sampler_state {
2475    union pipe_color_union border_color;
2476    bool needs_border_color;
2477 
2478    uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
2479 
2480 #if GFX_VERx10 == 125
2481    /* Sampler state structure to use for 3D textures in order to
2482     * implement Wa_14014414195.
2483     */
2484    uint32_t sampler_state_3d[GENX(SAMPLER_STATE_length)];
2485 #endif
2486 };
2487 
2488 static void
2489 fill_sampler_state(uint32_t *sampler_state,
2490                    const struct pipe_sampler_state *state,
2491                    unsigned max_anisotropy)
2492 {
2493    float min_lod = state->min_lod;
2494    unsigned mag_img_filter = state->mag_img_filter;
2495 
2496    // XXX: explain this code ported from ilo...I don't get it at all...
2497    if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2498        state->min_lod > 0.0f) {
2499       min_lod = 0.0f;
2500       mag_img_filter = state->min_img_filter;
2501    }
2502 
2503    iris_pack_state(GENX(SAMPLER_STATE), sampler_state, samp) {
2504       samp.TCXAddressControlMode = translate_wrap(state->wrap_s);
2505       samp.TCYAddressControlMode = translate_wrap(state->wrap_t);
2506       samp.TCZAddressControlMode = translate_wrap(state->wrap_r);
2507       samp.CubeSurfaceControlMode = state->seamless_cube_map;
2508       samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2509       samp.MinModeFilter = state->min_img_filter;
2510       samp.MagModeFilter = mag_img_filter;
2511       samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2512       samp.MaximumAnisotropy = RATIO21;
2513 
2514       if (max_anisotropy >= 2) {
2515          if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2516 #if GFX_VER >= 30
2517             samp.MinModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2518 #else
2519             samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2520 #endif
2521             samp.AnisotropicAlgorithm = EWAApproximation;
2522          }
2523 
2524          if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR) {
2525 #if GFX_VER >= 30
2526             samp.MagModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2527 #else
2528             samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2529 #endif
2530          }
2531 
2532          samp.MaximumAnisotropy =
2533             MIN2((max_anisotropy - 2) / 2, RATIO161);
2534       }
2535 
2536       /* Set address rounding bits if not using nearest filtering. */
2537       if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2538          samp.UAddressMinFilterRoundingEnable = true;
2539          samp.VAddressMinFilterRoundingEnable = true;
2540          samp.RAddressMinFilterRoundingEnable = true;
2541       }
2542 
2543       if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2544          samp.UAddressMagFilterRoundingEnable = true;
2545          samp.VAddressMagFilterRoundingEnable = true;
2546          samp.RAddressMagFilterRoundingEnable = true;
2547       }
2548 
2549       if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2550          samp.ShadowFunction = translate_shadow_func(state->compare_func);
2551 
2552       const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2553 
2554       samp.LODPreClampMode = CLAMP_MODE_OGL;
2555       samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2556       samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2557       samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2558 
2559       /* .BorderColorPointer is filled in by iris_bind_sampler_states. */
2560    }
2561 }
2562 
2563 /**
2564  * The pipe->create_sampler_state() driver hook.
2565  *
2566  * We fill out SAMPLER_STATE (except for the border color pointer), and
2567  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2568  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2569  * all bound sampler states to be in contiguous memory.
2570  */
2571 static void *
2572 iris_create_sampler_state(struct pipe_context *ctx,
2573                           const struct pipe_sampler_state *state)
2574 {
2575    UNUSED struct iris_screen *screen = (void *)ctx->screen;
2576    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2577    struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
2578 
2579    if (!cso)
2580       return NULL;
2581 
2582    STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2583    STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2584 
2585    unsigned wrap_s = translate_wrap(state->wrap_s);
2586    unsigned wrap_t = translate_wrap(state->wrap_t);
2587    unsigned wrap_r = translate_wrap(state->wrap_r);
2588 
2589    memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2590 
2591    cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2592                              wrap_mode_needs_border_color(wrap_t) ||
2593                              wrap_mode_needs_border_color(wrap_r);
2594 
2595    fill_sampler_state(cso->sampler_state, state, state->max_anisotropy);
2596 
2597 #if GFX_VERx10 == 125
2598    /* Fill an extra sampler state structure with anisotropic filtering
2599     * disabled used to implement Wa_14014414195.
2600     */
2601    if (intel_needs_workaround(screen->devinfo, 14014414195))
2602       fill_sampler_state(cso->sampler_state_3d, state, 0);
2603 #endif
2604 
2605    return cso;
2606 }
2607 
2608 /**
2609  * The pipe->bind_sampler_states() driver hook.
2610  */
2611 static void
2612 iris_bind_sampler_states(struct pipe_context *ctx,
2613                          enum pipe_shader_type p_stage,
2614                          unsigned start, unsigned count,
2615                          void **states)
2616 {
2617    struct iris_context *ice = (struct iris_context *) ctx;
2618    gl_shader_stage stage = stage_from_pipe(p_stage);
2619    struct iris_shader_state *shs = &ice->state.shaders[stage];
2620 
2621    assert(start + count <= IRIS_MAX_SAMPLERS);
2622 
2623    bool dirty = false;
2624 
2625    for (int i = 0; i < count; i++) {
2626       struct iris_sampler_state *state = states ? states[i] : NULL;
2627       if (shs->samplers[start + i] != state) {
2628          shs->samplers[start + i] = state;
2629          dirty = true;
2630       }
2631    }
2632 
2633    if (dirty)
2634       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2635 }
2636 
2637 /**
2638  * Upload the sampler states into a contiguous area of GPU memory, for
2639  * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2640  *
2641  * Also fill out the border color state pointers.
2642  */
2643 static void
2644 iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
2645 {
2646    struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen;
2647    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
2648    struct iris_shader_state *shs = &ice->state.shaders[stage];
2649    struct iris_border_color_pool *border_color_pool =
2650       iris_bufmgr_get_border_color_pool(screen->bufmgr);
2651 
2652    /* We assume gallium frontends will call pipe->bind_sampler_states()
2653     * if the program's number of textures changes.
2654     */
2655    unsigned count = util_last_bit64(shader->bt.samplers_used_mask);
2656 
2657    if (!count)
2658       return;
2659 
2660    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2661     * in the dynamic state memory zone, so we can point to it via the
2662     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2663     */
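   /* GENX(SAMPLER_STATE_length) is a DWord count, so each sampler occupies
    * 4 * length bytes in the table.
    */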
2664    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2665    uint32_t *map =
2666       upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
2667    if (unlikely(!map))
2668       return;
2669 
2670    struct pipe_resource *res = shs->sampler_table.res;
2671    struct iris_bo *bo = iris_resource_bo(res);
2672 
2673    iris_record_state_size(ice->state.sizes,
2674                           bo->address + shs->sampler_table.offset, size);
2675 
2676    shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);
2677 
2678    ice->state.need_border_colors &= ~(1 << stage);
2679 
2680    for (int i = 0; i < count; i++) {
2681       struct iris_sampler_state *state = shs->samplers[i];
2682       struct iris_sampler_view *tex = shs->textures[i];
2683 
2684       if (!state) {
2685          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2686       } else {
2687          const uint32_t *sampler_state = state->sampler_state;
2688 
2689 #if GFX_VERx10 == 125
2690          if (intel_needs_workaround(screen->devinfo, 14014414195) &&
2691              tex && tex->res->base.b.target == PIPE_TEXTURE_3D) {
2692                sampler_state = state->sampler_state_3d;
2693          }
2694 #endif
2695 
2696          if (!state->needs_border_color) {
2697             memcpy(map, sampler_state, 4 * GENX(SAMPLER_STATE_length));
2698          } else {
2699             ice->state.need_border_colors |= 1 << stage;
2700 
2701             /* We may need to swizzle the border color for format faking.
2702              * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2703              * This means we need to move the border color's A channel into
2704              * the R or G channels so that those read swizzles will move it
2705              * back into A.
2706              */
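            /* As a concrete example (assuming an A8_UNORM resource faked as
             * R8_UNORM with a 000R read swizzle): a border color (0, 0, 0, a)
             * becomes (a, 0, 0, 0) below, so the read swizzle routes the
             * alpha value back into the A channel.
             */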
2707             union pipe_color_union *color = &state->border_color;
2708             union pipe_color_union tmp;
2709             if (tex) {
2710                enum pipe_format internal_format = tex->res->internal_format;
2711 
2712                if (util_format_is_alpha(internal_format)) {
2713                   unsigned char swz[4] = {
2714                      PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
2715                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2716                   };
2717                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2718                   color = &tmp;
2719                } else if (util_format_is_luminance_alpha(internal_format) &&
2720                           internal_format != PIPE_FORMAT_L8A8_SRGB) {
2721                   unsigned char swz[4] = {
2722                      PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
2723                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2724                   };
2725                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2726                   color = &tmp;
2727                }
2728             }
2729 
2730             /* Stream out the border color and merge the pointer. */
2731             uint32_t offset = iris_upload_border_color(border_color_pool,
2732                                                        color);
2733 
2734             uint32_t dynamic[GENX(SAMPLER_STATE_length)];
2735             iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
2736                dyns.BorderColorPointer = offset;
2737             }
2738 
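            /* dynamic[] only has BorderColorPointer set; every other field
             * is zero, so ORing it with the CSO-packed copy fills in the
             * border color pointer without disturbing the other fields.
             */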
2739             for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
2740                map[j] = sampler_state[j] | dynamic[j];
2741          }
2742       }
2743 
2744       map += GENX(SAMPLER_STATE_length);
2745    }
2746 }
2747 
2748 static enum isl_channel_select
2749 fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2750 {
2751    switch (swz) {
2752    case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2753    case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2754    case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2755    case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2756    case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2757    case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2758    default: unreachable("invalid swizzle");
2759    }
2760 }
2761 
2762 static void
2763 fill_buffer_surface_state(struct isl_device *isl_dev,
2764                           struct iris_resource *res,
2765                           void *map,
2766                           enum isl_format format,
2767                           struct isl_swizzle swizzle,
2768                           unsigned offset,
2769                           unsigned size,
2770                           isl_surf_usage_flags_t usage)
2771 {
2772    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2773    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2774 
2775    /* The ARB_texture_buffer_specification says:
2776     *
2777     *    "The number of texels in the buffer texture's texel array is given by
2778     *
2779     *       floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2780     *
2781     *     where <buffer_size> is the size of the buffer object, in basic
2782     *     machine units and <components> and <base_type> are the element count
2783     *     and base data type for elements, as specified in Table X.1.  The
2784     *     number of texels in the texel array is then clamped to the
2785     *     implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2786     *
2787     * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2788     * so that when ISL divides by stride to obtain the number of texels, that
2789     * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2790     */
2791    unsigned final_size =
2792       MIN3(size, res->bo->size - res->offset - offset,
2793            IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
2794 
2795    isl_buffer_fill_state(isl_dev, map,
2796                          .address = res->bo->address + res->offset + offset,
2797                          .size_B = final_size,
2798                          .format = format,
2799                          .swizzle = swizzle,
2800                          .stride_B = cpp,
2801                          .mocs = iris_mocs(res->bo, isl_dev, usage));
2802 }
2803 
2804 #define SURFACE_STATE_ALIGNMENT 64
2805 
2806 /**
2807  * Allocate several contiguous SURFACE_STATE structures, one for each
2808  * supported auxiliary surface mode.  This only allocates the CPU-side
2809  * copies; they will need to be uploaded later, once they're filled in.
2810  */
2811 static void
2812 alloc_surface_states(struct iris_surface_state *surf_state,
2813                      unsigned aux_usages)
2814 {
2815    enum { surf_size = 4 * GENX(RENDER_SURFACE_STATE_length) };
2816 
2817    /* If this changes, update this to explicitly align pointers */
2818    STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2819 
2820    assert(aux_usages != 0);
2821 
2822    /* In case we're re-allocating them... */
2823    free(surf_state->cpu);
2824 
2825    surf_state->aux_usages = aux_usages;
2826    surf_state->num_states = util_bitcount(aux_usages);
2827    surf_state->cpu = calloc(surf_state->num_states, surf_size);
2828    surf_state->ref.offset = 0;
2829    pipe_resource_reference(&surf_state->ref.res, NULL);
2830 
2831    assert(surf_state->cpu);
2832 }
2833 
2834 /**
2835  * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2836  */
2837 static void
2838 upload_surface_states(struct u_upload_mgr *mgr,
2839                       struct iris_surface_state *surf_state)
2840 {
2841    const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2842    const unsigned bytes = surf_state->num_states * surf_size;
2843 
2844    void *map =
2845       upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2846 
2847    surf_state->ref.offset +=
2848       iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2849 
2850    if (map)
2851       memcpy(map, surf_state->cpu, bytes);
2852 }
2853 
2854 /**
2855  * Update resource addresses in a set of SURFACE_STATE descriptors,
2856  * and re-upload them if necessary.
2857  */
2858 static bool
2859 update_surface_state_addrs(struct u_upload_mgr *mgr,
2860                            struct iris_surface_state *surf_state,
2861                            struct iris_bo *bo)
2862 {
2863    if (surf_state->bo_address == bo->address)
2864       return false;
2865 
2866    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2867    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2868 
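   /* surf_state->cpu is an array of dwords, so the field's bit offset
    * divided by 32 gives the index of the dword holding the 64-bit
    * Surface Base Address.
    */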
2869    uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2870 
2871    /* First, update the CPU copies.  We assume no other fields exist in
2872     * the QWord containing Surface Base Address.
2873     */
2874    for (unsigned i = 0; i < surf_state->num_states; i++) {
2875       *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2876       ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2877    }
2878 
2879    /* Next, upload the updated copies to a GPU buffer. */
2880    upload_surface_states(mgr, surf_state);
2881 
2882    surf_state->bo_address = bo->address;
2883 
2884    return true;
2885 }
2886 
2887 /* We should only use this function when we need to fill out surf with
2888  * information provided by the pipe_(image|sampler)_view.  This is only
2889  * necessary for the CL extension cl_khr_image2d_from_buffer, which is
2890  * why ISL_SURF_DIM_2D is hardcoded in the dim field.
2891  */
2892 static void
2893 fill_surf_for_tex2d_from_buffer(struct isl_device *isl_dev,
2894                                 enum isl_format format,
2895                                 unsigned width,
2896                                 unsigned height,
2897                                 unsigned row_stride,
2898                                 isl_surf_usage_flags_t usage,
2899                                 struct isl_surf *surf)
2900 {
2901    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2902    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2903 
2904    const struct isl_surf_init_info init_info = {
2905       .dim = ISL_SURF_DIM_2D,
2906       .format = format,
2907       .width = width,
2908       .height = height,
2909       .depth = 1,
2910       .levels = 1,
2911       .array_len = 1,
2912       .samples = 1,
2913       .min_alignment_B = 4,
2914       .row_pitch_B = row_stride * cpp,
2915       .usage = usage,
2916       .tiling_flags = ISL_TILING_LINEAR_BIT,
2917    };
2918 
2919    const bool isl_surf_created_successfully =
2920       isl_surf_init_s(isl_dev, surf, &init_info);
2921 
2922    assert(isl_surf_created_successfully);
2923 }
2924 
2925 static void
2926 fill_surface_state(struct isl_device *isl_dev,
2927                    void *map,
2928                    struct iris_resource *res,
2929                    struct isl_surf *surf,
2930                    struct isl_view *view,
2931                    unsigned aux_usage,
2932                    uint32_t extra_main_offset,
2933                    uint32_t tile_x_sa,
2934                    uint32_t tile_y_sa)
2935 {
2936    struct isl_surf_fill_state_info f = {
2937       .surf = surf,
2938       .view = view,
2939       .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2940       .address = res->bo->address + res->offset + extra_main_offset,
2941       .x_offset_sa = tile_x_sa,
2942       .y_offset_sa = tile_y_sa,
2943    };
2944 
2945    if (aux_usage != ISL_AUX_USAGE_NONE) {
2946       f.aux_surf = &res->aux.surf;
2947       f.aux_usage = aux_usage;
2948       f.clear_color = res->aux.clear_color;
2949 
2950       if (aux_usage == ISL_AUX_USAGE_MC)
2951          f.mc_format = iris_format_for_usage(isl_dev->info,
2952                                              res->external_format,
2953                                              surf->usage).fmt;
2954 
2955       if (res->aux.bo)
2956          f.aux_address = res->aux.bo->address + res->aux.offset;
2957 
2958       if (res->aux.clear_color_bo) {
2959          f.clear_address = res->aux.clear_color_bo->address +
2960                            res->aux.clear_color_offset;
2961          f.use_clear_address = isl_dev->info->ver > 9;
2962       }
2963    }
2964 
2965    isl_surf_fill_state_s(isl_dev, map, &f);
2966 }
2967 
2968 static void
2969 fill_surface_states(struct isl_device *isl_dev,
2970                     struct iris_surface_state *surf_state,
2971                     struct iris_resource *res,
2972                     struct isl_surf *surf,
2973                     struct isl_view *view,
2974                     uint64_t extra_main_offset,
2975                     uint32_t tile_x_sa,
2976                     uint32_t tile_y_sa)
2977 {
2978    void *map = surf_state->cpu;
2979    unsigned aux_modes = surf_state->aux_usages;
2980 
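   /* Fill one SURFACE_STATE per enabled aux usage, in ascending bit order,
    * packed back to back at SURFACE_STATE_ALIGNMENT intervals, so callers
    * can later pick the copy matching the aux usage actually in effect.
    */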
2981    while (aux_modes) {
2982       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2983 
2984       fill_surface_state(isl_dev, map, res, surf, view, aux_usage,
2985                          extra_main_offset, tile_x_sa, tile_y_sa);
2986 
2987       map += SURFACE_STATE_ALIGNMENT;
2988    }
2989 }
2990 
2991 /**
2992  * The pipe->create_sampler_view() driver hook.
2993  */
2994 static struct pipe_sampler_view *
2995 iris_create_sampler_view(struct pipe_context *ctx,
2996                          struct pipe_resource *tex,
2997                          const struct pipe_sampler_view *tmpl)
2998 {
2999    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3000    const struct intel_device_info *devinfo = screen->devinfo;
3001    struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
3002 
3003    if (!isv)
3004       return NULL;
3005 
3006    /* initialize base object */
3007    isv->base = *tmpl;
3008    isv->base.context = ctx;
3009    isv->base.texture = NULL;
3010    pipe_reference_init(&isv->base.reference, 1);
3011    pipe_resource_reference(&isv->base.texture, tex);
3012 
3013    if (util_format_is_depth_or_stencil(tmpl->format)) {
3014       struct iris_resource *zres, *sres;
3015       const struct util_format_description *desc =
3016          util_format_description(tmpl->format);
3017 
3018       iris_get_depth_stencil_resources(tex, &zres, &sres);
3019 
3020       tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
3021    }
3022 
3023    isv->res = (struct iris_resource *) tex;
3024 
3025    isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
3026 
3027    if (isv->base.target == PIPE_TEXTURE_CUBE ||
3028        isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
3029       usage |= ISL_SURF_USAGE_CUBE_BIT;
3030 
3031    const struct iris_format_info fmt =
3032       iris_format_for_usage(devinfo, tmpl->format, usage);
3033 
3034    isv->clear_color = isv->res->aux.clear_color;
3035 
3036    isv->view = (struct isl_view) {
3037       .format = fmt.fmt,
3038       .swizzle = (struct isl_swizzle) {
3039          .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
3040          .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
3041          .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
3042          .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
3043       },
3044       .usage = usage,
3045    };
3046 
3047    unsigned aux_usages = 0;
3048 
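   /* Either sample entirely without aux, or provide both a no-aux variant
    * and one for the resource's aux usage.  Aux is dropped entirely when
    * the view format can't be sampled with CCS_E, or when HiZ sampling
    * isn't usable for this resource.
    */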
3049    if ((isv->res->aux.usage == ISL_AUX_USAGE_CCS_D ||
3050         isv->res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3051         isv->res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3052        !isl_format_supports_ccs_e(devinfo, isv->view.format)) {
3053       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3054    } else if (isl_aux_usage_has_hiz(isv->res->aux.usage) &&
3055               !iris_sample_with_depth_aux(devinfo, isv->res)) {
3056       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3057    } else {
3058       aux_usages = 1 << ISL_AUX_USAGE_NONE |
3059                    1 << isv->res->aux.usage;
3060    }
3061 
3062    alloc_surface_states(&isv->surface_state, aux_usages);
3063    isv->surface_state.bo_address = isv->res->bo->address;
3064 
3065    /* Fill out SURFACE_STATE for this view. */
3066    if (tmpl->target != PIPE_BUFFER) {
3067       isv->view.base_level = tmpl->u.tex.first_level;
3068       isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
3069 
3070       if (tmpl->target == PIPE_TEXTURE_3D) {
3071          isv->view.base_array_layer = 0;
3072          isv->view.array_len = 1;
3073       } else {
3074 #if GFX_VER < 9
3075          /* Hardware older than skylake ignores this value */
3076          assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
3077 #endif
3078          isv->view.base_array_layer = tmpl->u.tex.first_layer;
3079          isv->view.array_len =
3080             tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3081       }
3082 
3083       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3084                           &isv->res->surf, &isv->view, 0, 0, 0);
3085    } else if (isv->base.is_tex2d_from_buf) {
3086       /* In case it's a 2D image created from a buffer, we should use
3087        * the fill_surface_states function with the image parameters
3088        * provided by the CL application.
3089        */
3090       isv->view.base_array_layer = 0;
3091       isv->view.array_len = 1;
3092 
3093       /* Create temp_surf and fill with values provided by CL application */
3094       struct isl_surf temp_surf;
3095       fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt.fmt,
3096                                       isv->base.u.tex2d_from_buf.width,
3097                                       isv->base.u.tex2d_from_buf.height,
3098                                       isv->base.u.tex2d_from_buf.row_stride,
3099                                       usage,
3100                                       &temp_surf);
3101 
3102       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3103                           &temp_surf, &isv->view, 0, 0, 0);
3104    } else {
3105       fill_buffer_surface_state(&screen->isl_dev, isv->res,
3106                                 isv->surface_state.cpu,
3107                                 isv->view.format, isv->view.swizzle,
3108                                 tmpl->u.buf.offset, tmpl->u.buf.size,
3109                                 ISL_SURF_USAGE_TEXTURE_BIT);
3110    }
3111 
3112    return &isv->base;
3113 }
3114 
3115 static void
3116 iris_sampler_view_destroy(struct pipe_context *ctx,
3117                           struct pipe_sampler_view *state)
3118 {
3119    struct iris_sampler_view *isv = (void *) state;
3120    pipe_resource_reference(&state->texture, NULL);
3121    pipe_resource_reference(&isv->surface_state.ref.res, NULL);
3122    free(isv->surface_state.cpu);
3123    free(isv);
3124 }
3125 
3126 /**
3127  * The pipe->create_surface() driver hook.
3128  *
3129  * In Gallium nomenclature, "surfaces" are a view of a resource that
3130  * can be bound as a render target or depth/stencil buffer.
3131  */
3132 static struct pipe_surface *
3133 iris_create_surface(struct pipe_context *ctx,
3134                     struct pipe_resource *tex,
3135                     const struct pipe_surface *tmpl)
3136 {
3137    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3138    const struct intel_device_info *devinfo = screen->devinfo;
3139 
3140    isl_surf_usage_flags_t usage = 0;
3141    if (tmpl->writable)
3142       usage = ISL_SURF_USAGE_STORAGE_BIT;
3143    else if (util_format_is_depth_or_stencil(tmpl->format))
3144       usage = ISL_SURF_USAGE_DEPTH_BIT;
3145    else
3146       usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
3147 
3148    const struct iris_format_info fmt =
3149       iris_format_for_usage(devinfo, tmpl->format, usage);
3150 
3151    if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
3152        !isl_format_supports_rendering(devinfo, fmt.fmt)) {
3153       /* Framebuffer validation will reject this invalid case, but it
3154        * hasn't had the opportunity yet.  In the meantime, we need to
3155        * avoid hitting ISL asserts about unsupported formats below.
3156        */
3157       return NULL;
3158    }
3159 
3160    struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
3161    struct iris_resource *res = (struct iris_resource *) tex;
3162 
3163    if (!surf)
3164       return NULL;
3165 
3166    uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3167 
3168    struct isl_view *view = &surf->view;
3169    *view = (struct isl_view) {
3170       .format = fmt.fmt,
3171       .base_level = tmpl->u.tex.level,
3172       .levels = 1,
3173       .base_array_layer = tmpl->u.tex.first_layer,
3174       .array_len = array_len,
3175       .swizzle = ISL_SWIZZLE_IDENTITY,
3176       .usage = usage,
3177    };
3178 
3179 #if GFX_VER == 8
3180    struct isl_view *read_view = &surf->read_view;
3181    *read_view = (struct isl_view) {
3182       .format = fmt.fmt,
3183       .base_level = tmpl->u.tex.level,
3184       .levels = 1,
3185       .base_array_layer = tmpl->u.tex.first_layer,
3186       .array_len = array_len,
3187       .swizzle = ISL_SWIZZLE_IDENTITY,
3188       .usage = ISL_SURF_USAGE_TEXTURE_BIT,
3189    };
3190 
3191    struct isl_surf read_surf = res->surf;
3192    uint64_t read_surf_offset_B = 0;
3193    uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
3194    if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
3195       /* The minimum array element field of the surface state structure is
3196        * ignored by the sampler unit for 3D textures on some hardware.  If the
3197        * render buffer is a single slice of a 3D texture, create a 2D texture
3198        * covering that slice.
3199        *
3200        * TODO: This only handles the case where we're rendering to a single
3201        * slice of an array texture.  If we have layered rendering combined
3202        * with non-coherent FB fetch and a non-zero base_array_layer, then
3203        * we're going to run into problems.
3204        *
3205        * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
3206        */
3207       isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
3208                               read_view->base_level,
3209                               0, read_view->base_array_layer,
3210                               &read_surf, &read_surf_offset_B,
3211                               &read_surf_tile_x_sa, &read_surf_tile_y_sa);
3212       read_view->base_level = 0;
3213       read_view->base_array_layer = 0;
3214       assert(read_view->array_len == 1);
3215    } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
3216       /* Convert 1D array textures to 2D arrays because shaders always provide
3217        * the array index coordinate at the Z component to avoid recompiles
3218        * when changing the texture target of the framebuffer.
3219        */
3220       assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
3221       read_surf.dim = ISL_SURF_DIM_2D;
3222    }
3223 #endif
3224 
3225    struct isl_surf isl_surf = res->surf;
3226    uint64_t offset_B = 0;
3227    uint32_t tile_x_el = 0, tile_y_el = 0;
3228    if (isl_format_is_compressed(res->surf.format)) {
3229       /* The resource has a compressed format, which is not renderable, but we
3230        * have a renderable view format.  We must be attempting to upload
3231        * blocks of compressed data via an uncompressed view.
3232        *
3233        * In this case, we can assume there are no auxiliary surfaces, a single
3234        * miplevel, and that the resource is single-sampled.  Gallium may try
3235        * and create an uncompressed view with multiple layers, however.
3236        */
3237       assert(res->aux.surf.size_B == 0);
3238       assert(res->surf.samples == 1);
3239       assert(view->levels == 1);
3240 
3241       bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev,
3242                                                &res->surf, view,
3243                                                &isl_surf, view, &offset_B,
3244                                                &tile_x_el, &tile_y_el);
3245 
3246       /* On Broadwell, HALIGN and VALIGN are specified in pixels and are
3247        * hard-coded to align to exactly the block size of the compressed
3248        * texture. This means that, when reinterpreted as a non-compressed
3249        * texture, the tile offsets may be anything.
3250        *
3251        * We need them to be multiples of 4 to be usable in RENDER_SURFACE_STATE,
3252        * so force the state tracker to take fallback paths if they're not.
3253        */
3254 #if GFX_VER == 8
3255       if (tile_x_el % 4 != 0 || tile_y_el % 4 != 0) {
3256          ok = false;
3257       }
3258 #endif
3259 
3260       if (!ok) {
3261          free(surf);
3262          return NULL;
3263       }
3264    }
3265 
3266    surf->clear_color = res->aux.clear_color;
3267 
3268    struct pipe_surface *psurf = &surf->base;
3269    pipe_reference_init(&psurf->reference, 1);
3270    pipe_resource_reference(&psurf->texture, tex);
3271    psurf->context = ctx;
3272    psurf->format = tmpl->format;
3273    psurf->width = isl_surf.logical_level0_px.width;
3274    psurf->height = isl_surf.logical_level0_px.height;
3275    psurf->texture = tex;
3276    psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
3277    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
3278    psurf->u.tex.level = tmpl->u.tex.level;
3279 
3280    /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
3281    if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
3282                           ISL_SURF_USAGE_STENCIL_BIT))
3283       return psurf;
3284 
3285    /* Fill out a SURFACE_STATE for each possible auxiliary surface mode and
3286     * return the pipe_surface.
3287     */
3288    unsigned aux_usages = 0;
3289 
3290    if ((res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3291         res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3292        !isl_format_supports_ccs_e(devinfo, view->format)) {
3293       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3294    } else {
3295       aux_usages = 1 << ISL_AUX_USAGE_NONE |
3296                    1 << res->aux.usage;
3297    }
3298 
3299    alloc_surface_states(&surf->surface_state, aux_usages);
3300    surf->surface_state.bo_address = res->bo->address;
3301    fill_surface_states(&screen->isl_dev, &surf->surface_state, res,
3302                        &isl_surf, view, offset_B, tile_x_el, tile_y_el);
3303 
3304 #if GFX_VER == 8
3305    alloc_surface_states(&surf->surface_state_read, aux_usages);
3306    surf->surface_state_read.bo_address = res->bo->address;
3307    fill_surface_states(&screen->isl_dev, &surf->surface_state_read, res,
3308                        &read_surf, read_view, read_surf_offset_B,
3309                        read_surf_tile_x_sa, read_surf_tile_y_sa);
3310 #endif
3311 
3312    return psurf;
3313 }
3314 
3315 #if GFX_VER < 9
3316 static void
3317 fill_default_image_param(struct isl_image_param *param)
3318 {
3319    memset(param, 0, sizeof(*param));
3320    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3321     * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3322     * detailed explanation of these parameters.
3323     */
3324    param->swizzling[0] = 0xff;
3325    param->swizzling[1] = 0xff;
3326 }
3327 
3328 static void
3329 fill_buffer_image_param(struct isl_image_param *param,
3330                         enum pipe_format pfmt,
3331                         unsigned size)
3332 {
3333    const unsigned cpp = util_format_get_blocksize(pfmt);
3334 
3335    fill_default_image_param(param);
3336    param->size[0] = size / cpp;
3337    param->stride[0] = cpp;
3338 }
3339 #else
3340 #define isl_surf_fill_image_param(x, ...)
3341 #define fill_default_image_param(x, ...)
3342 #define fill_buffer_image_param(x, ...)
3343 #endif
3344 
3345 /**
3346  * The pipe->set_shader_images() driver hook.
3347  */
3348 static void
3349 iris_set_shader_images(struct pipe_context *ctx,
3350                        enum pipe_shader_type p_stage,
3351                        unsigned start_slot, unsigned count,
3352                        unsigned unbind_num_trailing_slots,
3353                        const struct pipe_image_view *p_images)
3354 {
3355    struct iris_context *ice = (struct iris_context *) ctx;
3356    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3357    gl_shader_stage stage = stage_from_pipe(p_stage);
3358    struct iris_shader_state *shs = &ice->state.shaders[stage];
3359 #if GFX_VER == 8
3360    struct iris_genx_state *genx = ice->state.genx;
3361    struct isl_image_param *image_params = genx->shaders[stage].image_param;
3362 #endif
3363 
3364    shs->bound_image_views &=
3365       ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3366 
3367    for (unsigned i = 0; i < count; i++) {
3368       struct iris_image_view *iv = &shs->image[start_slot + i];
3369 
3370       if (p_images && p_images[i].resource) {
3371          const struct pipe_image_view *img = &p_images[i];
3372          struct iris_resource *res = (void *) img->resource;
3373 
3374          util_copy_image_view(&iv->base, img);
3375 
3376          shs->bound_image_views |= BITFIELD64_BIT(start_slot + i);
3377 
3378          res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3379          res->bind_stages |= 1 << stage;
3380 
3381          enum isl_format isl_fmt = iris_image_view_get_format(ice, img);
3382 
3383          unsigned aux_usages = 1 << ISL_AUX_USAGE_NONE;
3384 
3385          /* Gfx12+ supports render compression for images */
3386          if (GFX_VER >= 12 && isl_aux_usage_has_ccs_e(res->aux.usage))
3387             aux_usages |= 1 << ISL_AUX_USAGE_CCS_E;
3388 
3389          alloc_surface_states(&iv->surface_state, aux_usages);
3390          iv->surface_state.bo_address = res->bo->address;
3391 
3392          if (res->base.b.target != PIPE_BUFFER) {
3393             struct isl_view view = {
3394                .format = isl_fmt,
3395                .base_level = img->u.tex.level,
3396                .levels = 1,
3397                .base_array_layer = img->u.tex.first_layer,
3398                .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3399                .swizzle = ISL_SWIZZLE_IDENTITY,
3400                .usage = ISL_SURF_USAGE_STORAGE_BIT,
3401             };
3402 
3403             /* If using untyped fallback. */
3404             if (isl_fmt == ISL_FORMAT_RAW) {
3405                fill_buffer_surface_state(&screen->isl_dev, res,
3406                                          iv->surface_state.cpu,
3407                                          isl_fmt, ISL_SWIZZLE_IDENTITY,
3408                                          0, res->bo->size,
3409                                          ISL_SURF_USAGE_STORAGE_BIT);
3410             } else {
3411                fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3412                                    &res->surf, &view, 0, 0, 0);
3413             }
3414 
3415             isl_surf_fill_image_param(&screen->isl_dev,
3416                                       &image_params[start_slot + i],
3417                                       &res->surf, &view);
3418          } else if (img->access & PIPE_IMAGE_ACCESS_TEX2D_FROM_BUFFER) {
3419             /* In case it's a 2D image created from a buffer, we should
3420              * use the fill_surface_states function with the image
3421              * parameters provided by the CL application.
3422              */
3423             isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3424             struct isl_view view = {
3425                .format = isl_fmt,
3426                .base_level = 0,
3427                .levels = 1,
3428                .base_array_layer = 0,
3429                .array_len = 1,
3430                .swizzle = ISL_SWIZZLE_IDENTITY,
3431                .usage = usage,
3432             };
3433 
3434             /* Create temp_surf and fill with values provided by CL application */
3435             struct isl_surf temp_surf;
3436             enum isl_format fmt = iris_image_view_get_format(ice, img);
3437             fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt,
3438                                             img->u.tex2d_from_buf.width,
3439                                             img->u.tex2d_from_buf.height,
3440                                             img->u.tex2d_from_buf.row_stride,
3441                                             usage,
3442                                             &temp_surf);
3443 
3444             fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3445                                 &temp_surf, &view, 0, 0, 0);
3446             isl_surf_fill_image_param(&screen->isl_dev,
3447                                       &image_params[start_slot + i],
3448                                       &temp_surf, &view);
3449          } else {
3450             util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3451                            img->u.buf.offset + img->u.buf.size);
3452 
3453             fill_buffer_surface_state(&screen->isl_dev, res,
3454                                       iv->surface_state.cpu,
3455                                       isl_fmt, ISL_SWIZZLE_IDENTITY,
3456                                       img->u.buf.offset, img->u.buf.size,
3457                                       ISL_SURF_USAGE_STORAGE_BIT);
3458             fill_buffer_image_param(&image_params[start_slot + i],
3459                                     img->format, img->u.buf.size);
3460          }
3461 
3462          upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
3463       } else {
3464          pipe_resource_reference(&iv->base.resource, NULL);
3465          pipe_resource_reference(&iv->surface_state.ref.res, NULL);
3466          fill_default_image_param(&image_params[start_slot + i]);
3467       }
3468    }
3469 
3470    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3471    ice->state.dirty |=
3472       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3473                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3474 
3475    /* Broadwell also needs isl_image_params re-uploaded */
3476    if (GFX_VER < 9) {
3477       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3478       shs->sysvals_need_upload = true;
3479    }
3480 
3481    if (unbind_num_trailing_slots) {
3482       iris_set_shader_images(ctx, p_stage, start_slot + count,
3483                              unbind_num_trailing_slots, 0, NULL);
3484    }
3485 }
3486 
3487 UNUSED static bool
3488 is_sampler_view_3d(const struct iris_sampler_view *view)
3489 {
3490    return view && view->res->base.b.target == PIPE_TEXTURE_3D;
3491 }
3492 
3493 /**
3494  * The pipe->set_sampler_views() driver hook.
3495  */
3496 static void
3497 iris_set_sampler_views(struct pipe_context *ctx,
3498                        enum pipe_shader_type p_stage,
3499                        unsigned start, unsigned count,
3500                        unsigned unbind_num_trailing_slots,
3501                        bool take_ownership,
3502                        struct pipe_sampler_view **views)
3503 {
3504    struct iris_context *ice = (struct iris_context *) ctx;
3505    UNUSED struct iris_screen *screen = (void *) ctx->screen;
3506    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
3507    gl_shader_stage stage = stage_from_pipe(p_stage);
3508    struct iris_shader_state *shs = &ice->state.shaders[stage];
3509    unsigned i;
3510 
3511    if (count == 0 && unbind_num_trailing_slots == 0)
3512       return;
3513 
3514    BITSET_CLEAR_RANGE(shs->bound_sampler_views, start,
3515                       start + count + unbind_num_trailing_slots - 1);
3516 
3517    for (i = 0; i < count; i++) {
3518       struct pipe_sampler_view *pview = views ? views[i] : NULL;
3519       struct iris_sampler_view *view = (void *) pview;
3520 
3521 #if GFX_VERx10 == 125
3522       if (intel_needs_workaround(screen->devinfo, 14014414195)) {
3523          if (is_sampler_view_3d(shs->textures[start + i]) !=
3524              is_sampler_view_3d(view))
3525             ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3526       }
3527 #endif
3528 
3529       if (take_ownership) {
3530          pipe_sampler_view_reference((struct pipe_sampler_view **)
3531                                      &shs->textures[start + i], NULL);
3532          shs->textures[start + i] = (struct iris_sampler_view *)pview;
3533       } else {
3534          pipe_sampler_view_reference((struct pipe_sampler_view **)
3535                                      &shs->textures[start + i], pview);
3536       }
3537       if (view) {
3538          view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3539          view->res->bind_stages |= 1 << stage;
3540 
3541          BITSET_SET(shs->bound_sampler_views, start + i);
3542 
3543          update_surface_state_addrs(ice->state.surface_uploader,
3544                                     &view->surface_state, view->res->bo);
3545       }
3546    }
3547    for (; i < count + unbind_num_trailing_slots; i++) {
3548       pipe_sampler_view_reference((struct pipe_sampler_view **)
3549                                   &shs->textures[start + i], NULL);
3550    }
3551 
3552    ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
3553    ice->state.dirty |=
3554       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3555                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3556 }
3557 
3558 static void
3559 iris_set_compute_resources(struct pipe_context *ctx,
3560                            unsigned start, unsigned count,
3561                            struct pipe_surface **resources)
3562 {
3563    assert(count == 0);
3564 }
3565 
3566 static void
3567 iris_set_global_binding(struct pipe_context *ctx,
3568                         unsigned start_slot, unsigned count,
3569                         struct pipe_resource **resources,
3570                         uint32_t **handles)
3571 {
3572    struct iris_context *ice = (struct iris_context *) ctx;
3573 
3574    assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
3575    for (unsigned i = 0; i < count; i++) {
3576       if (resources && resources[i]) {
3577          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3578                                  resources[i]);
3579 
3580          struct iris_resource *res = (void *) resources[i];
3581          assert(res->base.b.target == PIPE_BUFFER);
3582          util_range_add(&res->base.b, &res->valid_buffer_range,
3583                         0, res->base.b.width0);
3584 
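         /* Each handle points at a 64-bit value that arrives holding an
          * offset into the buffer; add the BO's GPU address (plus the
          * resource offset) so it becomes an absolute address.  memcpy is
          * used in case the handle storage isn't 8-byte aligned.
          */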
3585          uint64_t addr = 0;
3586          memcpy(&addr, handles[i], sizeof(addr));
3587          addr += res->bo->address + res->offset;
3588          memcpy(handles[i], &addr, sizeof(addr));
3589       } else {
3590          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3591                                  NULL);
3592       }
3593    }
3594 
3595    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
3596 }
3597 
3598 /**
3599  * The pipe->set_tess_state() driver hook.
3600  */
3601 static void
3602 iris_set_tess_state(struct pipe_context *ctx,
3603                     const float default_outer_level[4],
3604                     const float default_inner_level[2])
3605 {
3606    struct iris_context *ice = (struct iris_context *) ctx;
3607    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3608 
3609    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3610    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3611 
3612    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
3613    shs->sysvals_need_upload = true;
3614 }
3615 
3616 static void
3617 iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3618 {
3619    struct iris_context *ice = (struct iris_context *) ctx;
3620 
3621    ice->state.patch_vertices = patch_vertices;
3622 }
3623 
3624 static void
3625 iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3626 {
3627    struct iris_surface *surf = (void *) p_surf;
3628    pipe_resource_reference(&p_surf->texture, NULL);
3629    pipe_resource_reference(&surf->surface_state.ref.res, NULL);
3630    pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
3631    free(surf->surface_state.cpu);
3632    free(surf->surface_state_read.cpu);
3633    free(surf);
3634 }
3635 
3636 static void
3637 iris_set_clip_state(struct pipe_context *ctx,
3638                     const struct pipe_clip_state *state)
3639 {
3640    struct iris_context *ice = (struct iris_context *) ctx;
3641    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3642    struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3643    struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3644 
3645    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3646 
3647    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
3648                              IRIS_STAGE_DIRTY_CONSTANTS_GS |
3649                              IRIS_STAGE_DIRTY_CONSTANTS_TES;
3650    shs->sysvals_need_upload = true;
3651    gshs->sysvals_need_upload = true;
3652    tshs->sysvals_need_upload = true;
3653 }
3654 
3655 /**
3656  * The pipe->set_polygon_stipple() driver hook.
3657  */
3658 static void
3659 iris_set_polygon_stipple(struct pipe_context *ctx,
3660                          const struct pipe_poly_stipple *state)
3661 {
3662    struct iris_context *ice = (struct iris_context *) ctx;
3663    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3664    ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
3665 }
3666 
3667 /**
3668  * The pipe->set_sample_mask() driver hook.
3669  */
3670 static void
3671 iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3672 {
3673    struct iris_context *ice = (struct iris_context *) ctx;
3674 
3675    /* We only support 16x MSAA, so we have 16 bits of sample mask.
3676     * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3677     */
3678    ice->state.sample_mask = sample_mask & 0xffff;
3679    ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
3680 }
3681 
3682 /**
3683  * The pipe->set_scissor_states() driver hook.
3684  *
3685  * This corresponds to our SCISSOR_RECT state structures.  It's an
3686  * exact match, so we just store them, and memcpy them out later.
3687  */
3688 static void
3689 iris_set_scissor_states(struct pipe_context *ctx,
3690                         unsigned start_slot,
3691                         unsigned num_scissors,
3692                         const struct pipe_scissor_state *rects)
3693 {
3694    struct iris_context *ice = (struct iris_context *) ctx;
3695 
3696    for (unsigned i = 0; i < num_scissors; i++) {
3697       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3698          /* If the scissor was out of bounds and got clamped to 0 width/height
3699           * at the bounds, the subtraction of 1 from maximums could produce a
3700           * negative number and thus not clip anything.  Instead, just provide
3701           * a min > max scissor inside the bounds, which produces the expected
3702           * no rendering.
3703           */
3704          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3705             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3706          };
3707       } else {
3708          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3709             .minx = rects[i].minx,     .miny = rects[i].miny,
3710             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3711          };
3712       }
3713    }
3714 
3715    ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3716 }
3717 
3718 /**
3719  * The pipe->set_stencil_ref() driver hook.
3720  *
3721  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3722  */
3723 static void
3724 iris_set_stencil_ref(struct pipe_context *ctx,
3725                      const struct pipe_stencil_ref state)
3726 {
3727    struct iris_context *ice = (struct iris_context *) ctx;
3728    memcpy(&ice->state.stencil_ref, &state, sizeof(state));
3729    if (GFX_VER >= 12)
3730       ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3731    else if (GFX_VER >= 9)
3732       ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3733    else
3734       ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3735 }
3736 
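/**
 * Compute one edge of a viewport along an axis: translate[axis] plus or
 * minus |scale[axis]|, depending on the requested sign.
 */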
3737 static float
3738 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3739 {
3740    return copysignf(state->scale[axis], sign) + state->translate[axis];
3741 }
3742 
3743 /**
3744  * The pipe->set_viewport_states() driver hook.
3745  *
3746  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3747  * the guardband yet, as we need the framebuffer dimensions, but we can
3748  * at least fill out the rest.
3749  */
3750 static void
3751 iris_set_viewport_states(struct pipe_context *ctx,
3752                          unsigned start_slot,
3753                          unsigned count,
3754                          const struct pipe_viewport_state *states)
3755 {
3756    struct iris_context *ice = (struct iris_context *) ctx;
3757    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3758 
3759    memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3760 
3761    /* Fix depth test misrenderings by lowering translated depth range */
3762    if (screen->driconf.lower_depth_range_rate != 1.0f)
3763       ice->state.viewports[start_slot].translate[2] *=
3764          screen->driconf.lower_depth_range_rate;
3765 
3766    ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3767 
3768    if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3769                                !ice->state.cso_rast->depth_clip_far))
3770       ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3771 }
3772 
3773 /**
3774  * The pipe->set_framebuffer_state() driver hook.
3775  *
3776  * Sets the current draw FBO, including color render targets, depth,
3777  * and stencil buffers.
3778  */
3779 static void
3780 iris_set_framebuffer_state(struct pipe_context *ctx,
3781                            const struct pipe_framebuffer_state *state)
3782 {
3783    struct iris_context *ice = (struct iris_context *) ctx;
3784    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3785    const struct intel_device_info *devinfo = screen->devinfo;
3786    struct isl_device *isl_dev = &screen->isl_dev;
3787    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3788    struct iris_resource *zres;
3789    struct iris_resource *stencil_res;
3790    struct iris_resource *new_res = NULL;
3791    struct pipe_box new_render_area;
3792 
3793    unsigned samples = util_framebuffer_get_num_samples(state);
3794    unsigned layers = util_framebuffer_get_num_layers(state);
3795 
3796    /* multiview not supported */
3797    assert(!state->viewmask);
3798 
3799    if (cso->samples != samples) {
3800       ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
3801 
3802       /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
3803       if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
3804          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
3805 
3806       /* We may need to emit blend state for Wa_14018912822. */
3807       if ((cso->samples > 1) != (samples > 1) &&
3808           intel_needs_workaround(devinfo, 14018912822)) {
3809          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3810          ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
3811       }
3812    }
3813 
3814    if (cso->nr_cbufs != state->nr_cbufs) {
3815       ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3816    }
3817 
3818    if ((cso->layers == 0) != (layers == 0)) {
3819       ice->state.dirty |= IRIS_DIRTY_CLIP;
3820    }
3821 
3822    if (state->nr_cbufs > 0 && state->cbufs[0])
3823       new_res = (struct iris_resource *)state->cbufs[0]->texture;
3824 
3825    if (new_res && new_res->use_damage) {
3826       new_render_area = new_res->damage;
3827    } else {
3828       new_render_area.x = 0;
3829       new_render_area.y = 0;
3830       new_render_area.z = 0;
3831       new_render_area.width = state->width;
3832       new_render_area.height = state->height;
3833       new_render_area.depth = 0;
3834    }
3835 
3836    if (memcmp(&ice->state.render_area, &new_render_area, sizeof(new_render_area))) {
3837       ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3838       ice->state.render_area = new_render_area;
3839    }
3840 
3841    if (cso->zsbuf || state->zsbuf) {
3842       ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
3843    }
3844 
3845    bool has_integer_rt = false;
3846    for (unsigned i = 0; i < state->nr_cbufs; i++) {
3847       if (state->cbufs[i]) {
3848          enum isl_format ifmt =
3849             isl_format_for_pipe_format(state->cbufs[i]->format);
3850          has_integer_rt |= isl_format_has_int_channel(ifmt);
3851       }
3852    }
3853 
3854    /* 3DSTATE_RASTER::AntialiasingEnable */
3855    if (has_integer_rt != ice->state.has_integer_rt ||
3856        cso->samples != samples) {
3857       ice->state.dirty |= IRIS_DIRTY_RASTER;
3858    }
3859 
3860    util_copy_framebuffer_state(cso, state);
3861    cso->samples = samples;
3862    cso->layers = layers;
3863 
3864    ice->state.has_integer_rt = has_integer_rt;
3865 
3866    struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
3867 
3868    struct isl_view view = {
3869       .base_level = 0,
3870       .levels = 1,
3871       .base_array_layer = 0,
3872       .array_len = 1,
3873       .swizzle = ISL_SWIZZLE_IDENTITY,
3874    };
3875 
3876    struct isl_depth_stencil_hiz_emit_info info = {
3877       .view = &view,
3878       .mocs = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_DEPTH_BIT),
3879    };
3880 
3881    if (cso->zsbuf) {
3882       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
3883                                        &stencil_res);
3884 
3885       view.base_level = cso->zsbuf->u.tex.level;
3886       view.base_array_layer = cso->zsbuf->u.tex.first_layer;
3887       view.array_len =
3888          cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
3889 
3890       if (zres) {
3891          view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
3892 
3893          info.depth_surf = &zres->surf;
3894          info.depth_address = zres->bo->address + zres->offset;
3895          info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);
3896 
3897          view.format = zres->surf.format;
3898 
3899          if (iris_resource_level_has_hiz(devinfo, zres, view.base_level)) {
3900             info.hiz_usage = zres->aux.usage;
3901             info.hiz_surf = &zres->aux.surf;
3902             info.hiz_address = zres->aux.bo->address + zres->aux.offset;
3903          }
3904 
3905          ice->state.hiz_usage = info.hiz_usage;
3906       }
3907 
3908       if (stencil_res) {
3909          view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
3910          info.stencil_aux_usage = stencil_res->aux.usage;
3911          info.stencil_surf = &stencil_res->surf;
3912          info.stencil_address = stencil_res->bo->address + stencil_res->offset;
3913          if (!zres) {
3914             view.format = stencil_res->surf.format;
3915             info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
3916          }
3917       }
3918    }
3919 
3920    isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
3921 
3922    /* Make a null surface for unbound buffers */
3923    void *null_surf_map =
3924       upload_state(ice->state.surface_uploader, &ice->state.null_fb,
3925                    4 * GENX(RENDER_SURFACE_STATE_length), 64);
3926    isl_null_fill_state(&screen->isl_dev, null_surf_map,
3927                        .size = isl_extent3d(MAX2(cso->width, 1),
3928                                             MAX2(cso->height, 1),
3929                                             cso->layers ? cso->layers : 1));
3930    ice->state.null_fb.offset +=
3931       iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));
3932 
3933    /* Render target change */
3934    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;
3935 
3936    ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
3937 
3938    ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3939 
3940    ice->state.stage_dirty |=
3941       ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
3942 
3943    if (GFX_VER == 8)
3944       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
3945 }
3946 
3947 /**
3948  * The pipe->set_constant_buffer() driver hook.
3949  *
3950  * This uploads any constant data in user buffers, and references
3951  * any UBO resources containing constant data.
3952  */
3953 static void
3954 iris_set_constant_buffer(struct pipe_context *ctx,
3955                          enum pipe_shader_type p_stage, unsigned index,
3956                          bool take_ownership,
3957                          const struct pipe_constant_buffer *input)
3958 {
3959    struct iris_context *ice = (struct iris_context *) ctx;
3960    gl_shader_stage stage = stage_from_pipe(p_stage);
3961    struct iris_shader_state *shs = &ice->state.shaders[stage];
3962    struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3963 
3964    /* TODO: Only do this if the buffer changes? */
3965    pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3966 
3967    if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3968       shs->bound_cbufs |= 1u << index;
3969 
3970       if (input->user_buffer) {
3971          void *map = NULL;
3972          pipe_resource_reference(&cbuf->buffer, NULL);
3973          u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3974                         &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3975 
3976          if (!cbuf->buffer) {
3977             /* Allocation was unsuccessful - just unbind */
3978             iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3979             return;
3980          }
3981 
3982          assert(map);
3983          memcpy(map, input->user_buffer, input->buffer_size);
3984       } else if (input->buffer) {
3985          if (cbuf->buffer != input->buffer) {
3986             ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3987                                  IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3988             shs->dirty_cbufs |= 1u << index;
3989          }
3990 
3991          if (take_ownership) {
3992             pipe_resource_reference(&cbuf->buffer, NULL);
3993             cbuf->buffer = input->buffer;
3994          } else {
3995             pipe_resource_reference(&cbuf->buffer, input->buffer);
3996          }
3997 
3998          cbuf->buffer_offset = input->buffer_offset;
3999       }
4000 
4001       cbuf->buffer_size =
4002          MIN2(input->buffer_size,
4003               iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
4004 
4005       struct iris_resource *res = (void *) cbuf->buffer;
4006       res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
4007       res->bind_stages |= 1 << stage;
4008    } else {
4009       shs->bound_cbufs &= ~(1u << index);
4010       pipe_resource_reference(&cbuf->buffer, NULL);
4011    }
4012 
4013    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
4014 }
4015 
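/**
 * Upload shader "system values" - driver-supplied built-in uniforms such as
 * user clip planes, default tessellation levels, image params (Gfx8), or
 * compute grid parameters - into the shader's last constant buffer slot.
 */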
4016 static void
4017 upload_sysvals(struct iris_context *ice,
4018                gl_shader_stage stage,
4019                const struct pipe_grid_info *grid)
4020 {
4021    UNUSED struct iris_genx_state *genx = ice->state.genx;
4022    struct iris_shader_state *shs = &ice->state.shaders[stage];
4023 
4024    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
4025    if (!shader || (shader->num_system_values == 0 &&
4026                    shader->kernel_input_size == 0))
4027       return;
4028 
4029    assert(shader->num_cbufs > 0);
4030 
4031    unsigned sysval_cbuf_index = shader->num_cbufs - 1;
4032    struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
4033    unsigned system_values_start =
4034       ALIGN(shader->kernel_input_size, sizeof(uint32_t));
4035    unsigned upload_size = system_values_start +
4036                           shader->num_system_values * sizeof(uint32_t);
4037    void *map = NULL;
4038 
4039    assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
4040    u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
4041                   &cbuf->buffer_offset, &cbuf->buffer, &map);
4042 
4043    if (shader->kernel_input_size > 0)
4044       memcpy(map, grid->input, shader->kernel_input_size);
4045 
4046    uint32_t *sysval_map = map + system_values_start;
4047    for (int i = 0; i < shader->num_system_values; i++) {
4048       uint32_t sysval = shader->system_values[i];
4049       uint32_t value = 0;
4050 
4051 #if GFX_VER >= 9
4052       #define COMPILER(x) BRW_##x
4053 #else
4054       #define COMPILER(x) ELK_##x
4055 #endif
4056 
4057       if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
4058 #if GFX_VER == 8
4059          unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
4060          unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
4061          struct isl_image_param *param =
4062             &genx->shaders[stage].image_param[img];
4063 
4064          assert(offset < sizeof(struct isl_image_param));
4065          value = ((uint32_t *) param)[offset];
4066 #endif
4067       } else if (sysval == COMPILER(PARAM_BUILTIN_ZERO)) {
4068          value = 0;
4069       } else if (COMPILER(PARAM_BUILTIN_IS_CLIP_PLANE(sysval))) {
4070          int plane = COMPILER(PARAM_BUILTIN_CLIP_PLANE_IDX(sysval));
4071          int comp  = COMPILER(PARAM_BUILTIN_CLIP_PLANE_COMP(sysval));
4072          value = fui(ice->state.clip_planes.ucp[plane][comp]);
4073       } else if (sysval == COMPILER(PARAM_BUILTIN_PATCH_VERTICES_IN)) {
4074          if (stage == MESA_SHADER_TESS_CTRL) {
4075             value = ice->state.vertices_per_patch;
4076          } else {
4077             assert(stage == MESA_SHADER_TESS_EVAL);
4078             const struct shader_info *tcs_info =
4079                iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
4080             if (tcs_info)
4081                value = tcs_info->tess.tcs_vertices_out;
4082             else
4083                value = ice->state.vertices_per_patch;
4084          }
4085       } else if (sysval >= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X) &&
4086                  sysval <= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_W)) {
4087          unsigned i = sysval - COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X);
4088          value = fui(ice->state.default_outer_level[i]);
4089       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_X)) {
4090          value = fui(ice->state.default_inner_level[0]);
4091       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_Y)) {
4092          value = fui(ice->state.default_inner_level[1]);
4093       } else if (sysval >= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X) &&
4094                  sysval <= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_Z)) {
4095          unsigned i = sysval - COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X);
4096          value = ice->state.last_block[i];
4097       } else if (sysval == COMPILER(PARAM_BUILTIN_WORK_DIM)) {
4098          value = grid->work_dim;
4099       } else {
4100          assert(!"unhandled system value");
4101       }
4102 
4103       *sysval_map++ = value;
4104    }
4105 
4106    cbuf->buffer_size = upload_size;
4107    iris_upload_ubo_ssbo_surf_state(ice, cbuf,
4108                                    &shs->constbuf_surf_state[sysval_cbuf_index],
4109                                    ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
4110 
4111    shs->sysvals_need_upload = false;
4112 }
4113 
4114 /**
4115  * The pipe->set_shader_buffers() driver hook.
4116  *
4117  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
4118  * SURFACE_STATE here, as the buffer offset may change each time.
4119  */
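/* A minimal usage sketch (hypothetical caller): bind one writable SSBO to
 * compute shader slot 0.
 *
 *    struct pipe_shader_buffer buf = {
 *       .buffer = my_ssbo,            // assumed pipe_resource
 *       .buffer_offset = 0,
 *       .buffer_size = 4096,
 *    };
 *    ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &buf, 0x1);
 */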
4120 static void
4121 iris_set_shader_buffers(struct pipe_context *ctx,
4122                         enum pipe_shader_type p_stage,
4123                         unsigned start_slot, unsigned count,
4124                         const struct pipe_shader_buffer *buffers,
4125                         unsigned writable_bitmask)
4126 {
4127    struct iris_context *ice = (struct iris_context *) ctx;
4128    gl_shader_stage stage = stage_from_pipe(p_stage);
4129    struct iris_shader_state *shs = &ice->state.shaders[stage];
4130 
4131    unsigned modified_bits = u_bit_consecutive(start_slot, count);
4132 
4133    shs->bound_ssbos &= ~modified_bits;
4134    shs->writable_ssbos &= ~modified_bits;
4135    shs->writable_ssbos |= writable_bitmask << start_slot;
4136 
4137    for (unsigned i = 0; i < count; i++) {
4138       if (buffers && buffers[i].buffer) {
4139          struct iris_resource *res = (void *) buffers[i].buffer;
4140          struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
4141          struct iris_state_ref *surf_state =
4142             &shs->ssbo_surf_state[start_slot + i];
4143          pipe_resource_reference(&ssbo->buffer, &res->base.b);
4144          ssbo->buffer_offset = buffers[i].buffer_offset;
4145          ssbo->buffer_size =
4146             MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
4147 
4148          shs->bound_ssbos |= 1 << (start_slot + i);
4149 
4150          isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
4151 
4152          iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
4153 
4154          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
4155          res->bind_stages |= 1 << stage;
4156 
4157          util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
4158                         ssbo->buffer_offset + ssbo->buffer_size);
4159       } else {
4160          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
4161          pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
4162                                  NULL);
4163       }
4164    }
4165 
4166    ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
4167                         IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
4168    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
4169 }
4170 
4171 static void
4172 iris_delete_state(struct pipe_context *ctx, void *state)
4173 {
4174    free(state);
4175 }
4176 
4177 /**
4178  * The pipe->set_vertex_buffers() driver hook.
4179  *
4180  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
4181  */
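/* A minimal usage sketch (hypothetical caller): bind a single vertex buffer
 * at binding index 0, starting 64 bytes into the resource.
 *
 *    struct pipe_vertex_buffer vb = {
 *       .buffer_offset = 64,
 *       .buffer.resource = my_vbo,    // assumed pipe_resource
 *    };
 *    ctx->set_vertex_buffers(ctx, 1, &vb);
 */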
4182 static void
4183 iris_set_vertex_buffers(struct pipe_context *ctx,
4184                         unsigned count,
4185                         const struct pipe_vertex_buffer *buffers)
4186 {
4187    struct iris_context *ice = (struct iris_context *) ctx;
4188    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4189    struct iris_genx_state *genx = ice->state.genx;
4190 
4191    unsigned last_count = util_last_bit64(ice->state.bound_vertex_buffers);
4192    ice->state.bound_vertex_buffers = 0;
4193 
4194    for (unsigned i = 0; i < count; i++) {
4195       const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
4196       struct iris_vertex_buffer_state *state =
4197          &genx->vertex_buffers[i];
4198 
4199       if (!buffer) {
4200          pipe_resource_reference(&state->resource, NULL);
4201          continue;
4202       }
4203 
4204       /* We may see user buffers that are NULL bindings. */
4205       assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
4206 
4207       if (buffer->buffer.resource &&
4208           state->resource != buffer->buffer.resource)
4209          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
4210 
4211       pipe_resource_reference(&state->resource, NULL);
4212       state->resource = buffer->buffer.resource;
4213 
4214       struct iris_resource *res = (void *) state->resource;
4215 
4216       state->offset = (int) buffer->buffer_offset;
4217 
4218       if (res) {
4219          ice->state.bound_vertex_buffers |= 1ull << i;
4220          res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4221       }
4222 
4223       iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
4224          vb.VertexBufferIndex = i;
4225          vb.AddressModifyEnable = true;
4226          /* vb.BufferPitch is merged in dynamically from VE state later */
4227          if (res) {
4228             vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
4229             vb.BufferStartingAddress =
4230                ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
4231             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4232                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4233 #if GFX_VER >= 12
4234             vb.L3BypassDisable       = true;
4235 #endif
4236          } else {
4237             vb.NullVertexBuffer = true;
4238             vb.MOCS = iris_mocs(NULL, &screen->isl_dev,
4239                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4240          }
4241       }
4242    }
4243 
4244    for (unsigned i = count; i < last_count; i++) {
4245       struct iris_vertex_buffer_state *state =
4246          &genx->vertex_buffers[i];
4247 
4248       pipe_resource_reference(&state->resource, NULL);
4249    }
4250 
4251    ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4252 }
4253 
4254 /**
4255  * Gallium CSO for vertex elements.
4256  */
4257 struct iris_vertex_element_state {
4258    uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
4259    uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
4260    uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
4261    uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
4262    uint32_t stride[PIPE_MAX_ATTRIBS];
4263    unsigned vb_count;
4264    unsigned count;
4265 };
4266 
4267 /**
4268  * The pipe->create_vertex_elements_state() driver hook.
4269  *
4270  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
4271  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
4272  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
4273  * needed.  If they are, we need information only known at draw time.
4274  * We set up edgeflag_ve and edgeflag_vfi as alternative versions of the
4275  * last 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING, to be used at
4276  * draw time if we detect that the Vertex Shader needs EdgeFlag.
4277  */
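/* A minimal usage sketch (hypothetical vertex layout): a vec3 position and a
 * vec2 texcoord interleaved in vertex buffer 0 with a 20-byte stride.
 *
 *    const struct pipe_vertex_element ve[2] = {
 *       { .src_offset = 0,  .vertex_buffer_index = 0, .src_stride = 20,
 *         .src_format = PIPE_FORMAT_R32G32B32_FLOAT },
 *       { .src_offset = 12, .vertex_buffer_index = 0, .src_stride = 20,
 *         .src_format = PIPE_FORMAT_R32G32_FLOAT },
 *    };
 *    void *cso = ctx->create_vertex_elements_state(ctx, 2, ve);
 *    ctx->bind_vertex_elements_state(ctx, cso);
 */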
4278 static void *
4279 iris_create_vertex_elements(struct pipe_context *ctx,
4280                             unsigned count,
4281                             const struct pipe_vertex_element *state)
4282 {
4283    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4284    const struct intel_device_info *devinfo = screen->devinfo;
4285    struct iris_vertex_element_state *cso =
4286       calloc(1, sizeof(struct iris_vertex_element_state));
4287 
4288    cso->count = count;
4289    cso->vb_count = 0;
4290 
4291    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
4292       ve.DWordLength =
4293          1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
4294    }
4295 
4296    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
4297    uint32_t *vfi_pack_dest = cso->vf_instancing;
4298 
4299    if (count == 0) {
4300       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4301          ve.Valid = true;
4302          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
4303          ve.Component0Control = VFCOMP_STORE_0;
4304          ve.Component1Control = VFCOMP_STORE_0;
4305          ve.Component2Control = VFCOMP_STORE_0;
4306          ve.Component3Control = VFCOMP_STORE_1_FP;
4307       }
4308 
4309       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4310       }
4311    }
4312 
4313    for (int i = 0; i < count; i++) {
4314       const struct iris_format_info fmt =
4315          iris_format_for_usage(devinfo, state[i].src_format, 0);
4316       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
4317                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
4318 
4319       switch (isl_format_get_num_channels(fmt.fmt)) {
4320       case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
4321       case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
4322       case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
4323       case 3:
4324          comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
4325                                                        : VFCOMP_STORE_1_FP;
4326          break;
4327       }
4328       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4329          ve.EdgeFlagEnable = false;
4330          ve.VertexBufferIndex = state[i].vertex_buffer_index;
4331          ve.Valid = true;
4332          ve.SourceElementOffset = state[i].src_offset;
4333          ve.SourceElementFormat = fmt.fmt;
4334          ve.Component0Control = comp[0];
4335          ve.Component1Control = comp[1];
4336          ve.Component2Control = comp[2];
4337          ve.Component3Control = comp[3];
4338       }
4339 
4340       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4341          vi.VertexElementIndex = i;
4342          vi.InstancingEnable = state[i].instance_divisor > 0;
4343          vi.InstanceDataStepRate = state[i].instance_divisor;
4344       }
4345 
4346       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
4347       vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
4348       cso->stride[state[i].vertex_buffer_index] = state[i].src_stride;
4349       cso->vb_count = MAX2(state[i].vertex_buffer_index + 1, cso->vb_count);
4350    }
4351 
4352    /* An alternative version of the last VE and VFI is stored so it
4353     * can be used at draw time in case the Vertex Shader uses EdgeFlag.
4354     */
4355    if (count) {
4356       const unsigned edgeflag_index = count - 1;
4357       const struct iris_format_info fmt =
4358          iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
4359       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
4360          ve.EdgeFlagEnable = true;
4361          ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
4362          ve.Valid = true;
4363          ve.SourceElementOffset = state[edgeflag_index].src_offset;
4364          ve.SourceElementFormat = fmt.fmt;
4365          ve.Component0Control = VFCOMP_STORE_SRC;
4366          ve.Component1Control = VFCOMP_STORE_0;
4367          ve.Component2Control = VFCOMP_STORE_0;
4368          ve.Component3Control = VFCOMP_STORE_0;
4369       }
4370       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
4371          /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
4372           * at draw time, as it should change if SGVs are emitted.
4373           */
4374          vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
4375          vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
4376       }
4377    }
4378 
4379    return cso;
4380 }
4381 
4382 /**
4383  * The pipe->bind_vertex_elements_state() driver hook.
4384  */
4385 static void
4386 iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
4387 {
4388    struct iris_context *ice = (struct iris_context *) ctx;
4389    struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
4390    struct iris_vertex_element_state *new_cso = state;
4391 
4392    /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
4393     * we need to re-emit it to ensure we're overriding the right one.
4394     */
4395    if (new_cso && cso_changed(count))
4396       ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
4397 
4398    ice->state.cso_vertex_elements = state;
4399    ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
4400    if (new_cso) {
4401       /* re-emit vertex buffer state if stride changes */
4402       if (cso_changed(vb_count) ||
4403           cso_changed_memcmp_elts(stride, new_cso->vb_count))
4404          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4405    }
4406 }
4407 
4408 /**
4409  * The pipe->create_stream_output_target() driver hook.
4410  *
4411  * "Target" here refers to a destination buffer.  We translate this into
4412  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4413  * know which buffer this represents, or whether we ought to zero the
4414  * write-offsets, or append.  Those are handled in the set() hook.
4415  */
4416 static struct pipe_stream_output_target *
4417 iris_create_stream_output_target(struct pipe_context *ctx,
4418                                  struct pipe_resource *p_res,
4419                                  unsigned buffer_offset,
4420                                  unsigned buffer_size)
4421 {
4422    struct iris_resource *res = (void *) p_res;
4423    struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
4424    if (!cso)
4425       return NULL;
4426 
4427    res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4428 
4429    pipe_reference_init(&cso->base.reference, 1);
4430    pipe_resource_reference(&cso->base.buffer, p_res);
4431    cso->base.buffer_offset = buffer_offset;
4432    cso->base.buffer_size = buffer_size;
4433    cso->base.context = ctx;
4434 
4435    util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4436                   buffer_offset + buffer_size);
4437 
4438    return &cso->base;
4439 }
4440 
4441 static void
4442 iris_stream_output_target_destroy(struct pipe_context *ctx,
4443                                   struct pipe_stream_output_target *state)
4444 {
4445    struct iris_stream_output_target *cso = (void *) state;
4446 
4447    pipe_resource_reference(&cso->base.buffer, NULL);
4448    pipe_resource_reference(&cso->offset.res, NULL);
4449 
4450    free(cso);
4451 }
4452 
4453 /**
4454  * The pipe->set_stream_output_targets() driver hook.
4455  *
4456  * At this point, we know which targets are bound to a particular index,
4457  * and also whether we want to append or start over.  We can finish the
4458  * 3DSTATE_SO_BUFFER packets we started earlier.
4459  */
4460 static void
4461 iris_set_stream_output_targets(struct pipe_context *ctx,
4462                                unsigned num_targets,
4463                                struct pipe_stream_output_target **targets,
4464                                const unsigned *offsets,
4465                                enum mesa_prim output_prim)
4466 {
4467    struct iris_context *ice = (struct iris_context *) ctx;
4468    struct iris_genx_state *genx = ice->state.genx;
4469    uint32_t *so_buffers = genx->so_buffers;
4470    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4471 
4472    const bool active = num_targets > 0;
4473    if (ice->state.streamout_active != active) {
4474       ice->state.streamout_active = active;
4475       ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
4476 
4477       /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4478        * it's a non-pipelined command.  If we're switching streamout on, we
4479        * may have missed emitting it earlier, so do so now.  (We're already
4480        * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4481        */
4482       if (active) {
4483          ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
4484       } else {
4485          for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4486             struct iris_stream_output_target *tgt =
4487                (void *) ice->state.so_target[i];
4488 
4489             if (tgt)
4490                iris_dirty_for_history(ice, (void *)tgt->base.buffer);
4491          }
4492       }
4493    }
4494 
4495    for (int i = 0; i < 4; i++) {
4496       pipe_so_target_reference(&ice->state.so_target[i],
4497                                i < num_targets ? targets[i] : NULL);
4498    }
4499 
4500    /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4501    if (!active)
4502       return;
4503 
4504    for (unsigned i = 0; i < 4; i++,
4505         so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
4506 
4507       struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
4508       unsigned offset = offsets[i];
4509 
4510       if (!tgt) {
4511          iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4512 #if GFX_VER < 12
4513             sob.SOBufferIndex = i;
4514 #else
4515             sob._3DCommandOpcode = 0;
4516             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4517 #endif
4518             sob.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
4519          }
4520          continue;
4521       }
4522 
4523       if (!tgt->offset.res)
4524          upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
4525 
4526       struct iris_resource *res = (void *) tgt->base.buffer;
4527 
4528       /* Note that offsets[i] will either be 0, causing us to zero
4529        * the value in the buffer, or 0xFFFFFFFF, which happens to mean
4530        * "continue appending at the existing offset."
4531        */
4532       assert(offset == 0 || offset == 0xFFFFFFFF);
4533 
4534       /* When we're first called with an offset of 0, we want the next
4535        * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
4536        * Any further times we emit those packets, we want to use 0xFFFFFFFF
4537        * to continue appending from the current offset.
4538        *
4539        * Note that we might be called by Begin (offset = 0), Pause, then
4540        * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
4541        * commands will actually be sent to the GPU).  In this case, we
4542        * don't want to append - we still want to do our initial zeroing.
4543        */
4544       if (offset == 0)
4545          tgt->zero_offset = true;
4546 
4547       iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4548 #if GFX_VER < 12
4549          sob.SOBufferIndex = i;
4550 #else
4551          sob._3DCommandOpcode = 0;
4552          sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4553 #endif
4554          sob.SurfaceBaseAddress =
4555             rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
4556                   IRIS_DOMAIN_OTHER_WRITE);
4557          sob.SOBufferEnable = true;
4558          sob.StreamOffsetWriteEnable = true;
4559          sob.StreamOutputBufferOffsetAddressEnable = true;
4560          sob.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4561                               ISL_SURF_USAGE_STREAM_OUT_BIT);
4562 
4563          sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
4564          sob.StreamOutputBufferOffsetAddress =
4565             rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
4566                         tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
4567          sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
4568       }
4569    }
4570 
4571    ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
4572 }
4573 
4574 /**
4575  * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4576  * 3DSTATE_STREAMOUT packets.
4577  *
4578  * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4579  * hardware to record.  We can create it entirely based on the shader, with
4580  * no dynamic state dependencies.
4581  *
4582  * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4583  * state-based settings.  We capture the shader-related ones here, and merge
4584  * the rest in at draw time.
4585  */
4586 static uint32_t *
4587 iris_create_so_decl_list(const struct pipe_stream_output_info *info,
4588                          const struct intel_vue_map *vue_map)
4589 {
4590    struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4591    int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4592    int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4593    int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4594    int max_decls = 0;
4595    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4596 
4597    memset(so_decl, 0, sizeof(so_decl));
4598 
4599    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
4600     * command feels strange -- each dword pair contains a SO_DECL per stream.
4601     */
4602    for (unsigned i = 0; i < info->num_outputs; i++) {
4603       const struct pipe_stream_output *output = &info->output[i];
4604       const int buffer = output->output_buffer;
4605       const int varying = output->register_index;
4606       const unsigned stream_id = output->stream;
4607       assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4608 
4609       buffer_mask[stream_id] |= 1 << buffer;
4610 
4611       assert(vue_map->varying_to_slot[varying] >= 0);
4612 
4613       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4614        * array.  Instead, it simply increments DstOffset for the following
4615        * input by the number of components that should be skipped.
4616        *
4617        * Our hardware is unusual in that it requires us to program SO_DECLs
4618        * for fake "hole" components, rather than simply taking the offset
4619        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
4620        * program as many size = 4 holes as we can, then a final hole to
4621        * accommodate the final 1, 2, or 3 remaining.
4622        */
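      /* For example, a 6-component gap becomes a size-4 hole
       * (ComponentMask 0xf) followed by a size-2 hole (ComponentMask 0x3).
       */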
4623       int skip_components = output->dst_offset - next_offset[buffer];
4624 
4625       while (skip_components > 0) {
4626          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4627             .HoleFlag = 1,
4628             .OutputBufferSlot = output->output_buffer,
4629             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4630          };
4631          skip_components -= 4;
4632       }
4633 
4634       next_offset[buffer] = output->dst_offset + output->num_components;
4635 
4636       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4637          .OutputBufferSlot = output->output_buffer,
4638          .RegisterIndex = vue_map->varying_to_slot[varying],
4639          .ComponentMask =
4640             ((1 << output->num_components) - 1) << output->start_component,
4641       };
4642 
4643       if (decls[stream_id] > max_decls)
4644          max_decls = decls[stream_id];
4645    }
4646 
4647    unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4648    uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4649    uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
4650 
4651    iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4652       int urb_entry_read_offset = 0;
4653       int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4654          urb_entry_read_offset;
4655 
4656       /* We always read the whole vertex.  This could be reduced at some
4657        * point by reading less and offsetting the register index in the
4658        * SO_DECLs.
4659        */
4660       sol.Stream0VertexReadOffset = urb_entry_read_offset;
4661       sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4662       sol.Stream1VertexReadOffset = urb_entry_read_offset;
4663       sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4664       sol.Stream2VertexReadOffset = urb_entry_read_offset;
4665       sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4666       sol.Stream3VertexReadOffset = urb_entry_read_offset;
4667       sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4668 
4669       /* Set buffer pitches; 0 means unbound. */
4670       sol.Buffer0SurfacePitch = 4 * info->stride[0];
4671       sol.Buffer1SurfacePitch = 4 * info->stride[1];
4672       sol.Buffer2SurfacePitch = 4 * info->stride[2];
4673       sol.Buffer3SurfacePitch = 4 * info->stride[3];
4674    }
4675 
4676    iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4677       list.DWordLength = 3 + 2 * max_decls - 2;
4678       list.StreamtoBufferSelects0 = buffer_mask[0];
4679       list.StreamtoBufferSelects1 = buffer_mask[1];
4680       list.StreamtoBufferSelects2 = buffer_mask[2];
4681       list.StreamtoBufferSelects3 = buffer_mask[3];
4682       list.NumEntries0 = decls[0];
4683       list.NumEntries1 = decls[1];
4684       list.NumEntries2 = decls[2];
4685       list.NumEntries3 = decls[3];
4686    }
4687 
4688    for (int i = 0; i < max_decls; i++) {
4689       iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4690          entry.Stream0Decl = so_decl[0][i];
4691          entry.Stream1Decl = so_decl[1][i];
4692          entry.Stream2Decl = so_decl[2][i];
4693          entry.Stream3Decl = so_decl[3][i];
4694       }
4695    }
4696 
4697    return map;
4698 }
4699 
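/**
 * Thin wrapper that dispatches to the brw (Gfx9+) or elk (Gfx8) compiler
 * helper.
 */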
4700 static inline int
4701 iris_compute_first_urb_slot_required(uint64_t inputs_read,
4702                                      const struct intel_vue_map *prev_stage_vue_map)
4703 {
4704 #if GFX_VER >= 9
4705    return brw_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4706 #else
4707    return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4708 #endif
4709 }
4710 
4711 static void
4712 iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
4713                                    const struct intel_vue_map *last_vue_map,
4714                                    bool two_sided_color,
4715                                    unsigned *out_offset,
4716                                    unsigned *out_length)
4717 {
4718    /* The compiler computes the first URB slot without considering COL/BFC
4719     * swizzling (because it doesn't know whether it's enabled), so we need
4720     * to do that here too.  This may result in a smaller offset, which
4721     * should be safe.
4722     */
4723    const unsigned first_slot =
4724       iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
4725 
4726    /* This becomes the URB read offset (counted in pairs of slots). */
4727    assert(first_slot % 2 == 0);
4728    *out_offset = first_slot / 2;
4729 
4730    /* We need to adjust the inputs read to account for front/back color
4731     * swizzling, as it can make the URB length longer.
4732     */
4733    for (int c = 0; c <= 1; c++) {
4734       if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
4735          /* If two sided color is enabled, the fragment shader's gl_Color
4736           * (COL0) input comes from either the gl_FrontColor (COL0) or
4737           * gl_BackColor (BFC0) input varyings.  Mark BFC as used, too.
4738           */
4739          if (two_sided_color)
4740             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4741 
4742          /* If front color isn't written, we opt to give them back color
4743           * instead of an undefined value.  Switch from COL to BFC.
4744           */
4745          if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
4746             fs_input_slots &= ~(VARYING_BIT_COL0 << c);
4747             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4748          }
4749       }
4750    }
4751 
4752    /* Compute the minimum URB Read Length necessary for the FS inputs.
4753     *
4754     * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4755     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4756     *
4757     * "This field should be set to the minimum length required to read the
4758     *  maximum source attribute.  The maximum source attribute is indicated
4759     *  by the maximum value of the enabled Attribute # Source Attribute if
4760     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4761     *  enable is not set.
4762     *  read_length = ceiling((max_source_attr + 1) / 2)
4763     *
4764     *  [errata] Corruption/Hang possible if length programmed larger than
4765     *  recommended"
4766     *
4767     * Similar text exists for Ivy Bridge.
4768     *
4769     * We find the last URB slot that's actually read by the FS.
4770     */
4771    unsigned last_read_slot = last_vue_map->num_slots - 1;
4772    while (last_read_slot > first_slot && !(fs_input_slots &
4773           (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4774       --last_read_slot;
4775 
4776    /* The URB read length is the difference of the two, counted in pairs. */
4777    *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
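   /* Worked example: if first_slot is 4 and the last slot read by the FS is
    * 9, then *out_offset = 4 / 2 = 2 and
    * *out_length = DIV_ROUND_UP(9 - 4 + 1, 2) = 3 (both in pairs of slots).
    */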
4778 }
4779 
4780 static void
4781 iris_emit_sbe_swiz(struct iris_batch *batch,
4782                    const struct iris_context *ice,
4783                    const struct intel_vue_map *vue_map,
4784                    unsigned urb_read_offset,
4785                    unsigned sprite_coord_enables)
4786 {
4787    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4788    const struct iris_fs_data *fs_data =
4789       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4790    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4791 
4792    /* XXX: this should be generated when putting programs in place */
4793 
4794    for (uint8_t idx = 0; idx < fs_data->urb_setup_attribs_count; idx++) {
4795       const uint8_t fs_attr = fs_data->urb_setup_attribs[idx];
4796       const int input_index = fs_data->urb_setup[fs_attr];
4797       if (input_index < 0 || input_index >= 16)
4798          continue;
4799 
4800       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4801          &attr_overrides[input_index];
4802       int slot = vue_map->varying_to_slot[fs_attr];
4803 
4804       /* Viewport and Layer are stored in the VUE header.  We need to override
4805        * them to zero if earlier stages didn't write them, as GL requires that
4806        * they read back as zero when not explicitly set.
4807        */
4808       switch (fs_attr) {
4809       case VARYING_SLOT_VIEWPORT:
4810       case VARYING_SLOT_LAYER:
4811          attr->ComponentOverrideX = true;
4812          attr->ComponentOverrideW = true;
4813          attr->ConstantSource = CONST_0000;
4814 
4815          if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4816             attr->ComponentOverrideY = true;
4817          if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4818             attr->ComponentOverrideZ = true;
4819          continue;
4820 
4821       default:
4822          break;
4823       }
4824 
4825       if (sprite_coord_enables & (1 << input_index))
4826          continue;
4827 
4828       /* If there was only a back color written but not front, use back
4829        * as the color instead of undefined.
4830        */
4831       if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4832          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4833       if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4834          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4835 
4836       /* Not written by the previous stage - undefined. */
4837       if (slot == -1) {
4838          attr->ComponentOverrideX = true;
4839          attr->ComponentOverrideY = true;
4840          attr->ComponentOverrideZ = true;
4841          attr->ComponentOverrideW = true;
4842          attr->ConstantSource = CONST_0001_FLOAT;
4843          continue;
4844       }
4845 
4846       /* Compute the location of the attribute relative to the read offset,
4847        * which is counted in 256-bit increments (two 128-bit VUE slots).
4848        */
4849       const int source_attr = slot - 2 * urb_read_offset;
4850       assert(source_attr >= 0 && source_attr <= 32);
4851       attr->SourceAttribute = source_attr;
4852 
4853       /* If we are doing two-sided color, and the VUE slot following this one
4854        * represents a back-facing color, then we need to instruct the SF unit
4855        * to do back-facing swizzling.
4856        */
4857       if (cso_rast->light_twoside &&
4858           ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4859             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4860            (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4861             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4862          attr->SwizzleSelect = INPUTATTR_FACING;
4863    }
4864 
4865    iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4866       for (int i = 0; i < 16; i++)
4867          sbes.Attribute[i] = attr_overrides[i];
4868    }
4869 }
4870 
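/**
 * Return true if the primitives reaching the rasterizer will be points,
 * based on the fill mode and the output topology of the last enabled
 * geometry stage (GS, TES, or the input primitive type).
 */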
4871 static bool
4872 iris_is_drawing_points(const struct iris_context *ice)
4873 {
4874    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4875 
4876    if (cso_rast->fill_mode_point) {
4877       return true;
4878    }
4879 
4880    if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4881       const struct iris_gs_data *gs_data =
4882          iris_gs_data(ice->shaders.prog[MESA_SHADER_GEOMETRY]);
4883       return gs_data->output_topology == _3DPRIM_POINTLIST;
4884    } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4885       const struct iris_tes_data *tes_data =
4886          iris_tes_data(ice->shaders.prog[MESA_SHADER_TESS_EVAL]);
4887       return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4888    } else {
4889       return ice->state.prim_mode == MESA_PRIM_POINTS;
4890    }
4891 }
4892 
4893 static unsigned
4894 iris_calculate_point_sprite_overrides(const struct iris_fs_data *fs_data,
4895                                       const struct iris_rasterizer_state *cso)
4896 {
4897    unsigned overrides = 0;
4898 
4899    if (fs_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4900       overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_PNTC];
4901 
4902    for (int i = 0; i < 8; i++) {
4903       if ((cso->sprite_coord_enable & (1 << i)) &&
4904           fs_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4905          overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_TEX0 + i];
4906    }
4907 
4908    return overrides;
4909 }
4910 
4911 static void
4912 iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4913 {
4914    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4915    const struct iris_fs_data *fs_data =
4916       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4917    const struct intel_vue_map *last_vue_map =
4918       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
4919 
4920    unsigned urb_read_offset, urb_read_length;
4921    iris_compute_sbe_urb_read_interval(fs_data->inputs,
4922                                       last_vue_map,
4923                                       cso_rast->light_twoside,
4924                                       &urb_read_offset, &urb_read_length);
4925 
4926    unsigned sprite_coord_overrides =
4927       iris_is_drawing_points(ice) ?
4928       iris_calculate_point_sprite_overrides(fs_data, cso_rast) : 0;
4929 
4930    iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4931       sbe.AttributeSwizzleEnable = true;
4932       sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
4933       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4934       sbe.VertexURBEntryReadOffset = urb_read_offset;
4935       sbe.VertexURBEntryReadLength = urb_read_length;
4936       sbe.ForceVertexURBEntryReadOffset = true;
4937       sbe.ForceVertexURBEntryReadLength = true;
4938       sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
4939       sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4940 #if GFX_VER >= 9
4941       for (int i = 0; i < 32; i++) {
4942          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4943       }
4944 #endif
4945 
4946       /* Ask the hardware to supply PrimitiveID if the fragment shader
4947        * reads it but a previous stage didn't write one.
4948        */
4949       if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
4950           last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
4951          sbe.PrimitiveIDOverrideAttributeSelect =
4952             fs_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
4953          sbe.PrimitiveIDOverrideComponentX = true;
4954          sbe.PrimitiveIDOverrideComponentY = true;
4955          sbe.PrimitiveIDOverrideComponentZ = true;
4956          sbe.PrimitiveIDOverrideComponentW = true;
4957       }
4958    }
4959 
4960    iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4961                       sprite_coord_overrides);
4962 }
4963 
4964 /* ------------------------------------------------------------------- */
4965 
4966 /**
4967  * Populate VS program key fields based on the current state.
4968  */
4969 static void
4970 iris_populate_vs_key(const struct iris_context *ice,
4971                      const struct shader_info *info,
4972                      gl_shader_stage last_stage,
4973                      struct iris_vs_prog_key *key)
4974 {
4975    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4976 
4977    if (info->clip_distance_array_size == 0 &&
4978        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4979        last_stage == MESA_SHADER_VERTEX)
4980       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4981 }
4982 
4983 /**
4984  * Populate TCS program key fields based on the current state.
4985  */
4986 static void
4987 iris_populate_tcs_key(const struct iris_context *ice,
4988                       struct iris_tcs_prog_key *key)
4989 {
4990 }
4991 
4992 /**
4993  * Populate TES program key fields based on the current state.
4994  */
4995 static void
4996 iris_populate_tes_key(const struct iris_context *ice,
4997                       const struct shader_info *info,
4998                       gl_shader_stage last_stage,
4999                       struct iris_tes_prog_key *key)
5000 {
5001    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
5002 
5003    if (info->clip_distance_array_size == 0 &&
5004        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
5005        last_stage == MESA_SHADER_TESS_EVAL)
5006       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
5007 }
5008 
5009 /**
5010  * Populate GS program key fields based on the current state.
5011  */
5012 static void
5013 iris_populate_gs_key(const struct iris_context *ice,
5014                      const struct shader_info *info,
5015                      gl_shader_stage last_stage,
5016                      struct iris_gs_prog_key *key)
5017 {
5018    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
5019 
5020    if (info->clip_distance_array_size == 0 &&
5021        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
5022        last_stage == MESA_SHADER_GEOMETRY)
5023       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
5024 }
5025 
5026 /**
5027  * Populate FS program key fields based on the current state.
5028  */
5029 static void
5030 iris_populate_fs_key(const struct iris_context *ice,
5031                      const struct shader_info *info,
5032                      struct iris_fs_prog_key *key)
5033 {
5034    struct iris_screen *screen = (void *) ice->ctx.screen;
5035    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
5036    const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
5037    const struct iris_rasterizer_state *rast = ice->state.cso_rast;
5038    const struct iris_blend_state *blend = ice->state.cso_blend;
5039 
5040    key->nr_color_regions = fb->nr_cbufs;
5041 
5042    key->clamp_fragment_color = rast->clamp_fragment_color;
5043 
5044    key->alpha_to_coverage = blend->alpha_to_coverage;
5045 
5046    key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
5047 
5048    key->flat_shade = rast->flatshade &&
5049       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
5050 
5051    key->persample_interp = rast->force_persample_interp;
5052    key->multisample_fbo = rast->multisample && fb->samples > 1;
5053 
5054    key->coherent_fb_fetch = GFX_VER >= 9 && GFX_VER < 20;
5055 
5056    key->force_dual_color_blend =
5057       screen->driconf.dual_color_blend_by_location &&
5058       (blend->blend_enables & 1) && blend->dual_color_blending;
5059 }
5060 
5061 static void
5062 iris_populate_cs_key(const struct iris_context *ice,
5063                      struct iris_cs_prog_key *key)
5064 {
5065 }
5066 
5067 static inline uint32_t
5068 encode_sampler_count(const struct iris_compiled_shader *shader)
5069 {
5070    /* We can potentially have way more than 32 samplers and that's ok.
5071     * However, the 3DSTATE_XS packets only have 3 bits to specify how
5072     * many to pre-fetch and all values above 4 are marked reserved.
5073     */
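   /* For example, a shader whose highest-used sampler index is 8 gives
    * count = 9, which encodes to DIV_ROUND_UP(9, 4) = 3 (pre-fetch in
    * groups of 4).
    */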
5074    uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
5075    return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
5076 }
5077 
5078 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
5079    pkt.KernelStartPointer = KSP(shader);                                  \
5080    pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
5081    pkt.SamplerCount = encode_sampler_count(shader);                       \
5082    pkt.FloatingPointMode = shader->use_alt_mode;                          \
5083                                                                           \
5084    pkt.DispatchGRFStartRegisterForURBData =                               \
5085       shader->dispatch_grf_start_reg;                                     \
5086    pkt.prefix##URBEntryReadLength = vue_data->urb_read_length;            \
5087    pkt.prefix##URBEntryReadOffset = 0;                                    \
5088                                                                           \
5089    pkt.StatisticsEnable = true;                                           \
5090    pkt.Enable           = true;                                           \
5091                                                                           \
5092    if (shader->total_scratch) {                                           \
5093       INIT_THREAD_SCRATCH_SIZE(pkt)                                       \
5094    }
5095 
5096 /* Note that on Gfx12HP we pass a scratch space surface state offset
5097  * shifted by 2 relative to the value specified on the BSpec, since
5098  * that allows the compiler to save a shift instruction while
5099  * constructing the extended descriptor for SS addressing.  That
5100  * worked because we limit the scratch surface state pool to 8 MB and
5101  * because we relied on the legacy (ExBSO=0) encoding of the extended
5102  * descriptor in order to save the shift, which is no longer supported
5103  * for the UGM shared function on Xe2 platforms, so we no longer
5104  * attempt to do that trick.
5105  */
5106 #define SCRATCH_SPACE_BUFFER_SHIFT (GFX_VER >= 20 ? 6 : 4)
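/* For example, a scratch surface state offset of 0x1000 would be programmed
 * as 0x1000 >> 4 = 0x100 on Gfx12.5, and as 0x1000 >> 6 = 0x40 on Xe2+.
 */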
5107 
5108 #if GFX_VERx10 >= 125
5109 #define INIT_THREAD_SCRATCH_SIZE(pkt)
5110 #define MERGE_SCRATCH_ADDR(name)                                          \
5111 {                                                                         \
5112    uint32_t pkt2[GENX(name##_length)] = {0};                              \
5113    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
5114       p.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;  \
5115    }                                                                      \
5116    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5117 }
5118 #else
5119 #define INIT_THREAD_SCRATCH_SIZE(pkt)                                     \
5120    pkt.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
5121 #define MERGE_SCRATCH_ADDR(name)                                          \
5122 {                                                                         \
5123    uint32_t pkt2[GENX(name##_length)] = {0};                              \
5124    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
5125       p.ScratchSpaceBasePointer =                                         \
5126          rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);                     \
5127    }                                                                      \
5128    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5129 }
5130 #endif
5131 
5132 
5133 /**
5134  * Encode most of 3DSTATE_VS based on the compiled shader.
5135  */
5136 static void
5137 iris_store_vs_state(const struct intel_device_info *devinfo,
5138                     struct iris_compiled_shader *shader)
5139 {
5140    struct iris_vue_data *vue_data = iris_vue_data(shader);
5141 
5142    iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
5143       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
5144       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
5145 #if GFX_VER < 20
5146       vs.SIMD8DispatchEnable = true;
5147 #endif
5148       vs.UserClipDistanceCullTestEnableBitmask =
5149          vue_data->cull_distance_mask;
5150 #if GFX_VER >= 30
5151       vs.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
5152 #endif
5153    }
5154 }
5155 
5156 /**
5157  * Encode most of 3DSTATE_HS based on the compiled shader.
5158  */
5159 static void
5160 iris_store_tcs_state(const struct intel_device_info *devinfo,
5161                      struct iris_compiled_shader *shader)
5162 {
5163    struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
5164    struct iris_vue_data *vue_data = &tcs_data->base;
5165 
5166    iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
5167       INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
5168 
5169 #if GFX_VER >= 12
5170       /* Wa_1604578095:
5171        *
5172        *    Hang occurs when the number of max threads is less than 2 times
5173        *    the number of instance count. The number of max threads must be
5174        *    more than 2 times the number of instance count.
5175        */
5176       assert((devinfo->max_tcs_threads / 2) > tcs_data->instances);
5177       hs.DispatchGRFStartRegisterForURBData = shader->dispatch_grf_start_reg & 0x1f;
5178       hs.DispatchGRFStartRegisterForURBData5 = shader->dispatch_grf_start_reg >> 5;
5179 #endif
5180 
5181       hs.InstanceCount = tcs_data->instances - 1;
5182       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
5183       hs.IncludeVertexHandles = true;
5184 
5185 #if GFX_VER == 12
5186       /* Patch Count threshold specifies the maximum number of patches that
5187        * will be accumulated before a thread dispatch is forced.
5188        */
5189       hs.PatchCountThreshold = tcs_data->patch_count_threshold;
5190 #endif
5191 
5192 #if GFX_VER >= 9
5193 #if GFX_VER < 20
5194       hs.DispatchMode = vue_data->dispatch_mode;
5195 #endif
5196       hs.IncludePrimitiveID = tcs_data->include_primitive_id;
5197 #endif
5198 
5199 #if GFX_VER >= 30
5200       hs.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
5201 #endif
5202    }
5203 }
5204 
5205 /**
5206  * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
5207  */
5208 static void
5209 iris_store_tes_state(const struct intel_device_info *devinfo,
5210                      struct iris_compiled_shader *shader)
5211 {
5212    struct iris_tes_data *tes_data = iris_tes_data(shader);
5213    struct iris_vue_data *vue_data = &tes_data->base;
5214 
5215    uint32_t *ds_state = (void *) shader->derived_data;
5216    uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
5217 
5218    iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
5219       INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
5220 
5221       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
5222       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
5223       ds.ComputeWCoordinateEnable =
5224          tes_data->domain == INTEL_TESS_DOMAIN_TRI;
5225 
5226 #if GFX_VER >= 12
5227       ds.PrimitiveIDNotRequired = !tes_data->include_primitive_id;
5228 #endif
5229       ds.UserClipDistanceCullTestEnableBitmask =
5230          vue_data->cull_distance_mask;
5231 
5232 #if GFX_VER >= 30
5233       ds.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
5234 #endif
5235    }
5236 
5237    iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
5238       te.Partitioning = tes_data->partitioning;
5239 #if GFX_VER >= 20
5240       te.NumberOfRegionsPerPatch = 2;
5241 #endif
5242       te.OutputTopology = tes_data->output_topology;
5243       te.TEDomain = tes_data->domain;
5244       te.TEEnable = true;
5245       te.MaximumTessellationFactorOdd = 63.0;
5246       te.MaximumTessellationFactorNotOdd = 64.0;
5247 #if GFX_VERx10 >= 125
5248       STATIC_ASSERT(TEDMODE_OFF == 0);
5249       if (intel_needs_workaround(devinfo, 14015055625)) {
5250          te.TessellationDistributionMode = TEDMODE_OFF;
5251       } else if (intel_needs_workaround(devinfo, 22012699309)) {
5252          te.TessellationDistributionMode = TEDMODE_RR_STRICT;
5253       } else {
5254          te.TessellationDistributionMode = TEDMODE_RR_FREE;
5255       }
5256 
5257    #if GFX_VER >= 20
5258       te.TessellationDistributionLevel = TEDLEVEL_REGION;
5259    #else
5260       te.TessellationDistributionLevel = TEDLEVEL_PATCH;
5261    #endif
5262       /* 64_TRIANGLES */
5263       te.SmallPatchThreshold = 3;
5264       /* 1K_TRIANGLES */
5265       te.TargetBlockSize = 8;
5266       /* 1K_TRIANGLES */
5267       te.LocalBOPAccumulatorThreshold = 1;
5268 #endif
5269    }
5270 }
5271 
5272 /**
5273  * Encode most of 3DSTATE_GS based on the compiled shader.
5274  */
5275 static void
5276 iris_store_gs_state(const struct intel_device_info *devinfo,
5277                     struct iris_compiled_shader *shader)
5278 {
5279    struct iris_gs_data *gs_data = iris_gs_data(shader);
5280    struct iris_vue_data *vue_data = &gs_data->base;
5281 
5282    iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
5283       INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
5284 
5285       gs.OutputVertexSize = gs_data->output_vertex_size_hwords * 2 - 1;
5286       gs.OutputTopology = gs_data->output_topology;
5287       gs.ControlDataHeaderSize = gs_data->control_data_header_size_hwords;
5288       gs.InstanceControl = gs_data->invocations - 1;
5289 #if GFX_VER < 20
5290       gs.DispatchMode = DISPATCH_MODE_SIMD8;
5291 #endif
5292       gs.IncludePrimitiveID = gs_data->include_primitive_id;
5293       gs.ControlDataFormat = gs_data->control_data_format;
5294       gs.ExpectedVertexCount = gs_data->vertices_in;
5295       gs.MaximumNumberofThreads =
5296          GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
5297                       : (devinfo->max_gs_threads - 1);
5298 
5299       if (gs_data->static_vertex_count != -1) {
5300          gs.StaticOutput = true;
5301          gs.StaticOutputVertexCount = gs_data->static_vertex_count;
5302       }
5303       gs.IncludeVertexHandles = vue_data->include_vue_handles;
5304 
5305       gs.UserClipDistanceCullTestEnableBitmask = vue_data->cull_distance_mask;
5306 
5307       const int urb_entry_write_offset = 1;
5308       const uint32_t urb_entry_output_length =
5309          DIV_ROUND_UP(vue_data->vue_map.num_slots, 2) - urb_entry_write_offset;
5310 
5311       gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
5312       gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
5313 
5314 #if GFX_VER >= 30
5315       gs.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
5316 #endif
5317    }
5318 }
5319 
5320 /**
5321  * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
5322  */
5323 static void
5324 iris_store_fs_state(const struct intel_device_info *devinfo,
5325                     struct iris_compiled_shader *shader)
5326 {
5327    struct iris_fs_data *fs_data = iris_fs_data(shader);
5328 
5329    uint32_t *ps_state = (void *) shader->derived_data;
5330    uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
5331 
5332    iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
5333       ps.VectorMaskEnable = fs_data->uses_vmask;
5334       ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
5335       ps.SamplerCount = encode_sampler_count(shader);
5336       ps.FloatingPointMode = shader->use_alt_mode;
5337       ps.MaximumNumberofThreadsPerPSD =
5338          devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
5339 
5340 #if GFX_VER < 20
5341       ps.PushConstantEnable = devinfo->needs_null_push_constant_tbimr_workaround ||
5342                               shader->ubo_ranges[0].length > 0;
5343 #endif
5344 
5345 #if GFX_VER >= 30
5346       ps.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
5347 #endif
5348 
5349       /* From the documentation for this packet:
5350        * "If the PS kernel does not need the Position XY Offsets to
5351        *  compute a Position Value, then this field should be programmed
5352        *  to POSOFFSET_NONE."
5353        *
5354        * "SW Recommendation: If the PS kernel needs the Position Offsets
5355        *  to compute a Position XY value, this field should match Position
5356        *  ZW Interpolation Mode to ensure a consistent position.xyzw
5357        *  computation."
5358        *
5359        * We only require XY sample offsets, so this recommendation doesn't
5360        * seem useful at the moment.  We might need it in the future.
5361        */
5362       ps.PositionXYOffsetSelect =
5363          fs_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
5364 
5365       if (shader->total_scratch) {
5366          INIT_THREAD_SCRATCH_SIZE(ps);
5367       }
5368    }
5369 
5370    iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
5371       psx.PixelShaderValid = true;
5372       psx.PixelShaderComputedDepthMode = fs_data->computed_depth_mode;
5373       psx.PixelShaderKillsPixel = fs_data->uses_kill;
5374 #if GFX_VER < 20
5375       psx.AttributeEnable = fs_data->num_varying_inputs != 0;
5376 #endif
5377       psx.PixelShaderUsesSourceDepth = fs_data->uses_src_depth;
5378       psx.PixelShaderUsesSourceW = fs_data->uses_src_w;
5379       psx.PixelShaderIsPerSample = fs_data->is_per_sample;
5380       psx.oMaskPresenttoRenderTarget = fs_data->uses_omask;
5381 
5382 #if GFX_VER >= 9
5383 #if GFX_VER >= 20
5384       assert(!fs_data->pulls_bary);
5385 #else
5386       psx.PixelShaderPullsBary = fs_data->pulls_bary;
5387 #endif
5388       psx.PixelShaderComputesStencil = fs_data->computed_stencil;
5389 #endif
5390 
5391 #if GFX_VER >= 11
5392       psx.PixelShaderRequiresSubpixelSampleOffsets =
5393          fs_data->uses_sample_offsets;
5394       psx.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
5395          fs_data->uses_npc_bary_coefficients;
5396       psx.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
5397          fs_data->uses_pc_bary_coefficients;
5398       psx.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
5399          fs_data->uses_depth_w_coefficients;
5400 #endif
5401    }
5402 }
5403 
5404 /**
5405  * Encode the INTERFACE_DESCRIPTOR_DATA based on the compiled compute
5406  * shader.  Like the other iris_store_xs_state() functions, the data written
5407  * here must match iris_derived_program_state_size().
5408  */
5409 static void
5410 iris_store_cs_state(const struct intel_device_info *devinfo,
5411                     struct iris_compiled_shader *shader)
5412 {
5413    struct iris_cs_data *cs_data = iris_cs_data(shader);
5414    void *map = shader->derived_data;
5415 
5416    iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
5417 #if GFX_VERx10 < 125
5418       desc.ConstantURBEntryReadLength = cs_data->push.per_thread.regs;
5419       desc.CrossThreadConstantDataReadLength =
5420          cs_data->push.cross_thread.regs;
5421 #else
5422       assert(cs_data->push.per_thread.regs == 0);
5423       assert(cs_data->push.cross_thread.regs == 0);
5424 #endif
5425 #if GFX_VERx10 <= 125
5426       desc.BarrierEnable = cs_data->uses_barrier;
5427 #endif
5428       /* Typically set to 0 to avoid prefetching on every thread dispatch. */
5429       desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
5430          0 : MIN2(shader->bt.size_bytes / 4, 31);
5431       desc.SamplerCount = encode_sampler_count(shader);
5432       /* TODO: Check if we are missing workarounds and enable mid-thread
5433        * preemption.
5434        *
5435        * We still have issues with mid-thread preemption (it was already
5436        * disabled by the kernel on gfx11, due to missing workarounds). It's
5437        * possible that we are just missing some workarounds, and could enable
5438        * it later, but for now let's disable it to fix a GPU hang in compute
5439        * in Car Chase (and possibly more).
5440        */
5441 #if GFX_VER >= 20
5442       desc.ThreadPreemption = false;
5443 #elif GFX_VER >= 12
5444       desc.ThreadPreemptionDisable = true;
5445 #endif
5446 #if GFX_VER >= 30
5447       desc.RegistersPerThread = ptl_register_blocks(
5448          shader->brw_prog_data->grf_used);
5449 #endif
5450    }
5451 }
5452 
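/**
 * Compute the size of the derived data (shader command packets) for a given
 * program cache ID.
 *
 * This must match the data written by the iris_store_xs_state() functions.
 */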
5453 static unsigned
5454 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
5455 {
5456    assert(cache_id <= IRIS_CACHE_BLORP);
5457 
5458    static const unsigned dwords[] = {
5459       [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
5460       [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
5461       [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
5462       [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
5463       [IRIS_CACHE_FS] =
5464          GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
5465       [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
5466       [IRIS_CACHE_BLORP] = 0,
5467    };
5468 
5469    return sizeof(uint32_t) * dwords[cache_id];
5470 }
5471 
5472 /**
5473  * Create any state packets corresponding to the given shader stage
5474  * (i.e. 3DSTATE_VS) and save them as "derived data" in the shader variant.
5475  * This means that we can look up a program in the in-memory cache and
5476  * get most of the state packet without having to reconstruct it.
5477  */
5478 static void
5479 iris_store_derived_program_state(const struct intel_device_info *devinfo,
5480                                  enum iris_program_cache_id cache_id,
5481                                  struct iris_compiled_shader *shader)
5482 {
5483    switch (cache_id) {
5484    case IRIS_CACHE_VS:
5485       iris_store_vs_state(devinfo, shader);
5486       break;
5487    case IRIS_CACHE_TCS:
5488       iris_store_tcs_state(devinfo, shader);
5489       break;
5490    case IRIS_CACHE_TES:
5491       iris_store_tes_state(devinfo, shader);
5492       break;
5493    case IRIS_CACHE_GS:
5494       iris_store_gs_state(devinfo, shader);
5495       break;
5496    case IRIS_CACHE_FS:
5497       iris_store_fs_state(devinfo, shader);
5498       break;
5499    case IRIS_CACHE_CS:
5500       iris_store_cs_state(devinfo, shader);
5501       break;
5502    case IRIS_CACHE_BLORP:
5503       break;
5504    }
5505 }
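
/* Illustrative sketch only (not the actual emit path): because the packets
 * are pre-packed into shader->derived_data, draw-time code can copy them
 * into the batch without re-packing, roughly:
 *
 *    struct iris_compiled_shader *vs = ice->shaders.prog[MESA_SHADER_VERTEX];
 *    iris_batch_emit(batch, vs->derived_data,
 *                    iris_derived_program_state_size(IRIS_CACHE_VS));
 *
 * The caller may still merge in fields that depend on other state before the
 * packet reaches the batch, since only most of it is prebuilt here.
 */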
5506 
5507 /* ------------------------------------------------------------------- */
5508 
5509 static const uint32_t push_constant_opcodes[] = {
5510    [MESA_SHADER_VERTEX]    = 21,
5511    [MESA_SHADER_TESS_CTRL] = 25, /* HS */
5512    [MESA_SHADER_TESS_EVAL] = 26, /* DS */
5513    [MESA_SHADER_GEOMETRY]  = 22,
5514    [MESA_SHADER_FRAGMENT]  = 23,
5515    [MESA_SHADER_COMPUTE]   = 0,
5516 };
5517 
5518 static uint32_t
5519 use_null_surface(struct iris_batch *batch, struct iris_context *ice)
5520 {
5521    struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
5522 
5523    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5524 
5525    return ice->state.unbound_tex.offset;
5526 }
5527 
5528 static uint32_t
5529 use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
5530 {
5531    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
5532    if (!ice->state.null_fb.res)
5533       return use_null_surface(batch, ice);
5534 
5535    struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
5536 
5537    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5538 
5539    return ice->state.null_fb.offset;
5540 }
5541 
5542 static uint32_t
5543 surf_state_offset_for_aux(unsigned aux_modes,
5544                           enum isl_aux_usage aux_usage)
5545 {
5546    assert(aux_modes & (1 << aux_usage));
5547    return SURFACE_STATE_ALIGNMENT *
5548           util_bitcount(aux_modes & ((1 << aux_usage) - 1));
5549 }
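
/* Worked example: if aux_modes == 0b1011 (surface states exist for three aux
 * variants) and aux_usage corresponds to bit 3, then two lower bits are set,
 * so the variant for that usage lives 2 * SURFACE_STATE_ALIGNMENT bytes into
 * the group of surface states.
 */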
5550 
5551 #if GFX_VER == 9
5552 static void
5553 surf_state_update_clear_value(struct iris_batch *batch,
5554                               struct iris_resource *res,
5555                               struct iris_surface_state *surf_state,
5556                               enum isl_aux_usage aux_usage)
5557 {
5558    struct isl_device *isl_dev = &batch->screen->isl_dev;
5559    struct iris_bo *state_bo = iris_resource_bo(surf_state->ref.res);
5560    uint64_t real_offset = surf_state->ref.offset + IRIS_MEMZONE_BINDER_START;
5561    uint32_t offset_into_bo = real_offset - state_bo->address;
5562    uint32_t clear_offset = offset_into_bo +
5563       isl_dev->ss.clear_value_offset +
5564       surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5565    uint32_t *color = res->aux.clear_color.u32;
5566 
5567    assert(isl_dev->ss.clear_value_size == 16);
5568 
5569    if (aux_usage == ISL_AUX_USAGE_HIZ) {
5570       iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
5571                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5572                                    state_bo, clear_offset, color[0]);
5573    } else {
5574       iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
5575                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5576                                    state_bo, clear_offset,
5577                                    (uint64_t) color[0] |
5578                                    (uint64_t) color[1] << 32);
5579       iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
5580                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5581                                    state_bo, clear_offset + 8,
5582                                    (uint64_t) color[2] |
5583                                    (uint64_t) color[3] << 32);
5584    }
5585 
5586    iris_emit_pipe_control_flush(batch,
5587                                 "update fast clear: state cache invalidate",
5588                                 PIPE_CONTROL_FLUSH_ENABLE |
5589                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
5590 }
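
/* For illustration (the numbers here are made up): if this resource's
 * surface states start at offset_into_bo == 0x2000, clear_value_offset were
 * 32, and the chosen aux_usage has one lower aux bit set in aux_usages, the
 * immediate writes above land at 0x2000 + 32 + SURFACE_STATE_ALIGNMENT
 * (plus 8 for the second half of a color clear value).
 */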
5591 #endif
5592 
5593 static void
5594 update_clear_value(struct iris_context *ice,
5595                    struct iris_batch *batch,
5596                    struct iris_resource *res,
5597                    struct iris_surface_state *surf_state,
5598                    struct isl_view *view)
5599 {
5600    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5601    UNUSED unsigned aux_modes = surf_state->aux_usages;
5602 
5603    /* We only need to update the clear color in the surface state for gfx8 and
5604     * gfx9. Newer gens can read it directly from the clear color state buffer.
5605     */
5606 #if GFX_VER == 9
5607    /* Skip updating the ISL_AUX_USAGE_NONE surface state */
5608    aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
5609 
5610    while (aux_modes) {
5611       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
5612 
5613       surf_state_update_clear_value(batch, res, surf_state, aux_usage);
5614    }
5615 #elif GFX_VER == 8
5616    /* TODO: Could update rather than re-filling */
5617    alloc_surface_states(surf_state, surf_state->aux_usages);
5618 
5619    fill_surface_states(isl_dev, surf_state, res, &res->surf, view, 0, 0, 0);
5620 
5621    upload_surface_states(ice->state.surface_uploader, surf_state);
5622 #endif
5623 }
5624 
5625 static uint32_t
5626 use_surface_state(struct iris_batch *batch,
5627                   struct iris_surface_state *surf_state,
5628                   enum isl_aux_usage aux_usage)
5629 {
5630    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->ref.res), false,
5631                       IRIS_DOMAIN_NONE);
5632 
5633    return surf_state->ref.offset +
5634           surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5635 }
5636 
5637 /**
5638  * Add a surface to the validation list, as well as the buffer containing
5639  * the corresponding SURFACE_STATE.
5640  *
5641  * Returns the binding table entry (offset to SURFACE_STATE).
5642  */
5643 static uint32_t
5644 use_surface(struct iris_context *ice,
5645             struct iris_batch *batch,
5646             struct pipe_surface *p_surf,
5647             bool writeable,
5648             enum isl_aux_usage aux_usage,
5649             bool is_read_surface,
5650             enum iris_domain access)
5651 {
5652    struct iris_surface *surf = (void *) p_surf;
5653    struct iris_resource *res = (void *) p_surf->texture;
5654 
5655    if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
5656       upload_surface_states(ice->state.surface_uploader,
5657                             &surf->surface_state_read);
5658    }
5659 
5660    if (!surf->surface_state.ref.res) {
5661       upload_surface_states(ice->state.surface_uploader,
5662                             &surf->surface_state);
5663    }
5664 
5665    if (memcmp(&res->aux.clear_color, &surf->clear_color,
5666               sizeof(surf->clear_color)) != 0) {
5667       update_clear_value(ice, batch, res, &surf->surface_state, &surf->view);
5668       if (GFX_VER == 8) {
5669          update_clear_value(ice, batch, res, &surf->surface_state_read,
5670                             &surf->read_view);
5671       }
5672       surf->clear_color = res->aux.clear_color;
5673    }
5674 
5675    if (res->aux.clear_color_bo)
5676       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
5677 
5678    if (res->aux.bo)
5679       iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
5680 
5681    iris_use_pinned_bo(batch, res->bo, writeable, access);
5682 
5683    if (GFX_VER == 8 && is_read_surface) {
5684       return use_surface_state(batch, &surf->surface_state_read, aux_usage);
5685    } else {
5686       return use_surface_state(batch, &surf->surface_state, aux_usage);
5687    }
5688 }
5689 
5690 static uint32_t
5691 use_sampler_view(struct iris_context *ice,
5692                  struct iris_batch *batch,
5693                  struct iris_sampler_view *isv)
5694 {
5695    enum isl_aux_usage aux_usage =
5696       iris_resource_texture_aux_usage(ice, isv->res, isv->view.format,
5697                                       isv->view.base_level, isv->view.levels);
5698 
5699    if (!isv->surface_state.ref.res)
5700       upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
5701 
5702    if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
5703               sizeof(isv->clear_color)) != 0) {
5704       update_clear_value(ice, batch, isv->res, &isv->surface_state,
5705                          &isv->view);
5706       isv->clear_color = isv->res->aux.clear_color;
5707    }
5708 
5709    if (isv->res->aux.clear_color_bo) {
5710       iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
5711                          false, IRIS_DOMAIN_SAMPLER_READ);
5712    }
5713 
5714    if (isv->res->aux.bo) {
5715       iris_use_pinned_bo(batch, isv->res->aux.bo,
5716                          false, IRIS_DOMAIN_SAMPLER_READ);
5717    }
5718 
5719    iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_SAMPLER_READ);
5720 
5721    return use_surface_state(batch, &isv->surface_state, aux_usage);
5722 }
5723 
5724 static uint32_t
5725 use_ubo_ssbo(struct iris_batch *batch,
5726              struct iris_context *ice,
5727              struct pipe_shader_buffer *buf,
5728              struct iris_state_ref *surf_state,
5729              bool writable, enum iris_domain access)
5730 {
5731    if (!buf->buffer || !surf_state->res)
5732       return use_null_surface(batch, ice);
5733 
5734    iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
5735    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
5736                       IRIS_DOMAIN_NONE);
5737 
5738    return surf_state->offset;
5739 }
5740 
5741 static uint32_t
5742 use_image(struct iris_batch *batch, struct iris_context *ice,
5743           struct iris_shader_state *shs, const struct shader_info *info,
5744           int i)
5745 {
5746    struct iris_image_view *iv = &shs->image[i];
5747    struct iris_resource *res = (void *) iv->base.resource;
5748 
5749    if (!res)
5750       return use_null_surface(batch, ice);
5751 
5752    bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5753 
5754    iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
5755 
5756    if (res->aux.bo)
5757       iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
5758 
5759    if (res->aux.clear_color_bo) {
5760       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false,
5761                          IRIS_DOMAIN_NONE);
5762    }
5763 
5764    enum isl_aux_usage aux_usage = shs->image_aux_usage[i];
5765 
5766    return use_surface_state(batch, &iv->surface_state, aux_usage);
5767 }
5768 
5769 #define push_bt_entry(addr) \
5770    assert(addr >= surf_base_offset); \
5771    assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
5772    if (!pin_only) bt_map[s++] = (addr) - surf_base_offset;
5773 
5774 #define bt_assert(section) \
5775    if (!pin_only && shader->bt.used_mask[section] != 0) \
5776       assert(shader->bt.offsets[section] == s);
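
/* For reference: push_bt_entry(addr) asserts that addr is a valid surface
 * state offset and, unless we are only pinning BOs, records
 * bt_map[s++] = addr - surf_base_offset.  bt_assert(group) merely checks
 * that the running index s matches the group's precomputed table offset.
 */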
5777 
5778 /**
5779  * Populate the binding table for a given shader stage.
5780  *
5781  * This fills out the table of pointers to surfaces required by the shader,
5782  * and also adds those buffers to the validation list so the kernel can make
5783  * them resident before running our batch.
5784  */
5785 static void
5786 iris_populate_binding_table(struct iris_context *ice,
5787                             struct iris_batch *batch,
5788                             gl_shader_stage stage,
5789                             bool pin_only)
5790 {
5791    const struct iris_binder *binder = &ice->state.binder;
5792    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5793    if (!shader)
5794       return;
5795 
5796    struct iris_binding_table *bt = &shader->bt;
5797    struct iris_shader_state *shs = &ice->state.shaders[stage];
5798    uint32_t surf_base_offset = GFX_VER < 11 ? binder->bo->address : 0;
5799 
5800    uint32_t *bt_map = binder->map + binder->bt_offset[stage];
5801    int s = 0;
5802 
5803    const struct shader_info *info = iris_get_shader_info(ice, stage);
5804    if (!info) {
5805       /* TCS passthrough doesn't need a binding table. */
5806       assert(stage == MESA_SHADER_TESS_CTRL);
5807       return;
5808    }
5809 
5810    if (stage == MESA_SHADER_COMPUTE &&
5811        shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
5812       /* surface for gl_NumWorkGroups */
5813       struct iris_state_ref *grid_data = &ice->state.grid_size;
5814       struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
5815       iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
5816                          IRIS_DOMAIN_PULL_CONSTANT_READ);
5817       iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
5818                          IRIS_DOMAIN_NONE);
5819       push_bt_entry(grid_state->offset);
5820    }
5821 
5822    if (stage == MESA_SHADER_FRAGMENT) {
5823       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5824       /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5825       if (cso_fb->nr_cbufs) {
5826          for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5827             uint32_t addr;
5828             if (cso_fb->cbufs[i]) {
5829                addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
5830                                   ice->state.draw_aux_usage[i], false,
5831                                   IRIS_DOMAIN_RENDER_WRITE);
5832             } else {
5833                addr = use_null_fb_surface(batch, ice);
5834             }
5835             push_bt_entry(addr);
5836          }
5837       } else if (bt->use_null_rt) {
5838          uint32_t addr = use_null_fb_surface(batch, ice);
5839          push_bt_entry(addr);
5840       }
5841    }
5842 
5843 #define foreach_surface_used(index, group) \
5844    bt_assert(group); \
5845    for (int index = 0; index < bt->sizes[group]; index++) \
5846       if (iris_group_index_to_bti(bt, group, index) != \
5847           IRIS_SURFACE_NOT_USED)
5848 
5849    foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5850       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5851       uint32_t addr;
5852       if (cso_fb->cbufs[i]) {
5853          addr = use_surface(ice, batch, cso_fb->cbufs[i],
5854                             false, ice->state.draw_aux_usage[i], true,
5855                             IRIS_DOMAIN_SAMPLER_READ);
5856          push_bt_entry(addr);
5857       }
5858    }
5859 
5860    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_LOW64) {
5861       struct iris_sampler_view *view = shs->textures[i];
5862       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5863                            : use_null_surface(batch, ice);
5864       push_bt_entry(addr);
5865    }
5866 
5867    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_HIGH64) {
5868       struct iris_sampler_view *view = shs->textures[64 + i];
5869       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5870                            : use_null_surface(batch, ice);
5871       push_bt_entry(addr);
5872    }
5873 
5874    foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5875       uint32_t addr = use_image(batch, ice, shs, info, i);
5876       push_bt_entry(addr);
5877    }
5878 
5879    foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5880       uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5881                                    &shs->constbuf_surf_state[i], false,
5882                                    IRIS_DOMAIN_PULL_CONSTANT_READ);
5883       push_bt_entry(addr);
5884    }
5885 
5886    foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5887       uint32_t addr =
5888          use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5889                       shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5890       push_bt_entry(addr);
5891    }
5892 
5893 #if 0
5894       /* XXX: YUV surfaces not implemented yet */
5895       bt_assert(plane_start[1], ...);
5896       bt_assert(plane_start[2], ...);
5897 #endif
5898 }
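
/* The table layout follows the loops above: a fragment shader gets its
 * render targets (or a null RT) first, then any render-target-read
 * surfaces, the low and high texture groups, images, UBOs, and finally
 * SSBOs.  A compute shader instead starts with the gl_NumWorkGroups
 * surface when the shader uses it.
 */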
5899 
5900 static void
5901 iris_use_optional_res(struct iris_batch *batch,
5902                       struct pipe_resource *res,
5903                       bool writeable,
5904                       enum iris_domain access)
5905 {
5906    if (res) {
5907       struct iris_bo *bo = iris_resource_bo(res);
5908       iris_use_pinned_bo(batch, bo, writeable, access);
5909    }
5910 }
5911 
5912 static void
5913 pin_depth_and_stencil_buffers(struct iris_batch *batch,
5914                               struct pipe_surface *zsbuf,
5915                               struct iris_depth_stencil_alpha_state *cso_zsa)
5916 {
5917    if (!zsbuf)
5918       return;
5919 
5920    struct iris_resource *zres, *sres;
5921    iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5922 
5923    if (zres) {
5924       iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5925                          IRIS_DOMAIN_DEPTH_WRITE);
5926       if (zres->aux.bo) {
5927          iris_use_pinned_bo(batch, zres->aux.bo,
5928                             cso_zsa->depth_writes_enabled,
5929                             IRIS_DOMAIN_DEPTH_WRITE);
5930       }
5931    }
5932 
5933    if (sres) {
5934       iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5935                          IRIS_DOMAIN_DEPTH_WRITE);
5936    }
5937 }
5938 
5939 static uint32_t
5940 pin_scratch_space(struct iris_context *ice,
5941                   struct iris_batch *batch,
5942                   const struct iris_compiled_shader *shader,
5943                   gl_shader_stage stage)
5944 {
5945    uint32_t scratch_addr = 0;
5946 
5947    if (shader->total_scratch > 0) {
5948       struct iris_bo *scratch_bo =
5949          iris_get_scratch_space(ice, shader->total_scratch, stage);
5950       iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5951 
5952 #if GFX_VERx10 >= 125
5953       const struct iris_state_ref *ref =
5954          iris_get_scratch_surf(ice, shader->total_scratch);
5955       iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5956                          false, IRIS_DOMAIN_NONE);
5957       scratch_addr = ref->offset +
5958                      iris_resource_bo(ref->res)->address -
5959                      IRIS_MEMZONE_SCRATCH_START;
5960       assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
5961 #else
5962       scratch_addr = scratch_bo->address;
5963 #endif
5964    }
5965 
5966    return scratch_addr;
5967 }
5968 
5969 /* ------------------------------------------------------------------- */
5970 
5971 /**
5972  * Pin any BOs which were installed by a previous batch, and restored
5973  * via the hardware logical context mechanism.
5974  *
5975  * We don't need to re-emit all state every batch - the hardware context
5976  * mechanism will save and restore it for us.  This includes pointers to
5977  * various BOs...which won't exist unless we ask the kernel to pin them
5978  * by adding them to the validation list.
5979  *
5980  * We can skip buffers if we've re-emitted those packets, as we're
5981  * overwriting those stale pointers with new ones, and don't actually
5982  * refer to the old BOs.
5983  */
5984 static void
5985 iris_restore_render_saved_bos(struct iris_context *ice,
5986                               struct iris_batch *batch,
5987                               const struct pipe_draw_info *draw)
5988 {
5989    struct iris_genx_state *genx = ice->state.genx;
5990 
5991    const uint64_t clean = ~ice->state.dirty;
5992    const uint64_t stage_clean = ~ice->state.stage_dirty;
5993 
5994    if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5995       iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5996                             IRIS_DOMAIN_NONE);
5997    }
5998 
5999    if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
6000       iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
6001                             IRIS_DOMAIN_NONE);
6002    }
6003 
6004    if (clean & IRIS_DIRTY_BLEND_STATE) {
6005       iris_use_optional_res(batch, ice->state.last_res.blend, false,
6006                             IRIS_DOMAIN_NONE);
6007    }
6008 
6009    if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
6010       iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
6011                             IRIS_DOMAIN_NONE);
6012    }
6013 
6014    if (clean & IRIS_DIRTY_SCISSOR_RECT) {
6015       iris_use_optional_res(batch, ice->state.last_res.scissor, false,
6016                             IRIS_DOMAIN_NONE);
6017    }
6018 
6019    if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
6020       for (int i = 0; i < 4; i++) {
6021          struct iris_stream_output_target *tgt =
6022             (void *) ice->state.so_target[i];
6023          if (tgt) {
6024             iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
6025                                true, IRIS_DOMAIN_OTHER_WRITE);
6026             iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
6027                                true, IRIS_DOMAIN_OTHER_WRITE);
6028          }
6029       }
6030    }
6031 
6032    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6033       if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6034          continue;
6035 
6036       struct iris_shader_state *shs = &ice->state.shaders[stage];
6037       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6038 
6039       if (!shader)
6040          continue;
6041 
6042       for (int i = 0; i < 4; i++) {
6043          const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6044 
6045          if (range->length == 0)
6046             continue;
6047 
6048          /* Range block is a binding table index, map back to UBO index. */
6049          unsigned block_index = iris_bti_to_group_index(
6050             &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6051          assert(block_index != IRIS_SURFACE_NOT_USED);
6052 
6053          struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6054          struct iris_resource *res = (void *) cbuf->buffer;
6055 
6056          if (res)
6057             iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
6058          else
6059             iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
6060                                IRIS_DOMAIN_OTHER_READ);
6061       }
6062    }
6063 
6064    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6065       if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6066          /* Re-pin any buffers referred to by the binding table. */
6067          iris_populate_binding_table(ice, batch, stage, true);
6068       }
6069    }
6070 
6071    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6072       struct iris_shader_state *shs = &ice->state.shaders[stage];
6073       struct pipe_resource *res = shs->sampler_table.res;
6074       if (res)
6075          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6076                             IRIS_DOMAIN_NONE);
6077    }
6078 
6079    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6080       if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
6081          struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6082 
6083          if (shader) {
6084             struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6085             iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6086 
6087             pin_scratch_space(ice, batch, shader, stage);
6088          }
6089       }
6090    }
6091 
6092    if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
6093        (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
6094       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6095       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
6096    }
6097 
6098    iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
6099                          IRIS_DOMAIN_VF_READ);
6100 
6101    if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
6102       uint64_t bound = ice->state.bound_vertex_buffers;
6103       while (bound) {
6104          const int i = u_bit_scan64(&bound);
6105          struct pipe_resource *res = genx->vertex_buffers[i].resource;
6106          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6107                             IRIS_DOMAIN_VF_READ);
6108       }
6109    }
6110 }
6111 
6112 static void
6113 iris_restore_compute_saved_bos(struct iris_context *ice,
6114                                struct iris_batch *batch,
6115                                const struct pipe_grid_info *grid)
6116 {
6117    const uint64_t stage_clean = ~ice->state.stage_dirty;
6118 
6119    const int stage = MESA_SHADER_COMPUTE;
6120    struct iris_shader_state *shs = &ice->state.shaders[stage];
6121 
6122    if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
6123       /* Re-pin any buffers referred to by the binding table. */
6124       iris_populate_binding_table(ice, batch, stage, true);
6125    }
6126 
6127    struct pipe_resource *sampler_res = shs->sampler_table.res;
6128    if (sampler_res)
6129       iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
6130                          IRIS_DOMAIN_NONE);
6131 
6132    if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
6133        (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
6134        (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
6135        (stage_clean & IRIS_STAGE_DIRTY_CS)) {
6136       iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
6137                             IRIS_DOMAIN_NONE);
6138    }
6139 
6140    if (stage_clean & IRIS_STAGE_DIRTY_CS) {
6141       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6142 
6143       if (shader) {
6144          struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6145          iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6146 
6147          if (GFX_VERx10 < 125) {
6148             struct iris_bo *curbe_bo =
6149                iris_resource_bo(ice->state.last_res.cs_thread_ids);
6150             iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
6151          }
6152 
6153          pin_scratch_space(ice, batch, shader, stage);
6154       }
6155    }
6156 }
6157 
6158 /**
6159  * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
6160  */
6161 static void
6162 iris_update_binder_address(struct iris_batch *batch,
6163                            struct iris_binder *binder)
6164 {
6165    if (batch->last_binder_address == binder->bo->address)
6166       return;
6167 
6168    struct isl_device *isl_dev = &batch->screen->isl_dev;
6169    uint32_t mocs = isl_mocs(isl_dev, 0, false);
6170 
6171    iris_batch_sync_region_start(batch);
6172 
6173 #if GFX_VER >= 11
6174    /* Use 3DSTATE_BINDING_TABLE_POOL_ALLOC on Icelake and later */
6175 
6176 #if GFX_VERx10 == 120
6177    /* Wa_1607854226:
6178     *
6179     *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
6180     *  mode by putting the pipeline temporarily in 3D mode..
6181     *  mode by putting the pipeline temporarily in 3D mode.
6182    if (batch->name == IRIS_BATCH_COMPUTE)
6183       emit_pipeline_select(batch, _3D);
6184 #endif
6185 
6186    iris_emit_pipe_control_flush(batch, "Stall for binder realloc",
6187                                 PIPE_CONTROL_CS_STALL);
6188 
6189    iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
6190       btpa.BindingTablePoolBaseAddress = ro_bo(binder->bo, 0);
6191       btpa.BindingTablePoolBufferSize = binder->size / 4096;
6192 #if GFX_VERx10 < 125
6193       btpa.BindingTablePoolEnable = true;
6194 #endif
6195       btpa.MOCS = mocs;
6196    }
6197 
6198 #if GFX_VERx10 == 120
6199    /* Wa_1607854226:
6200     *
6201     *  Put the pipeline back into compute mode.
6202     */
6203    if (batch->name == IRIS_BATCH_COMPUTE)
6204       emit_pipeline_select(batch, GPGPU);
6205 #endif
6206 #else
6207    /* Use STATE_BASE_ADDRESS on older platforms */
6208    flush_before_state_base_change(batch);
6209 
6210    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
6211       sba.SurfaceStateBaseAddressModifyEnable = true;
6212       sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
6213 
6214       /* The hardware appears to pay attention to the MOCS fields even
6215        * if you don't set the "Address Modify Enable" bit for the base.
6216        */
6217       sba.GeneralStateMOCS            = mocs;
6218       sba.StatelessDataPortAccessMOCS = mocs;
6219       sba.DynamicStateMOCS            = mocs;
6220       sba.IndirectObjectMOCS          = mocs;
6221       sba.InstructionMOCS             = mocs;
6222       sba.SurfaceStateMOCS            = mocs;
6223 #if GFX_VER >= 9
6224       sba.BindlessSurfaceStateMOCS    = mocs;
6225 #endif
6226 #if GFX_VERx10 >= 125
6227       sba.L1CacheControl = L1CC_WB;
6228 #endif
6229    }
6230 #endif
6231 
6232    flush_after_state_base_change(batch);
6233    iris_batch_sync_region_end(batch);
6234 
6235    batch->last_binder_address = binder->bo->address;
6236 }
6237 
6238 static inline void
6239 iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
6240                         bool window_space_position, float *zmin, float *zmax)
6241 {
6242    if (window_space_position) {
6243       *zmin = 0.f;
6244       *zmax = 1.f;
6245       return;
6246    }
6247    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
6248 }
6249 
6250 /* Wa_16018063123 */
6251 static inline void
6252 batch_emit_fast_color_dummy_blit(struct iris_batch *batch)
6253 {
6254 #if GFX_VERx10 >= 125
6255    iris_emit_cmd(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6256       blt.DestinationBaseAddress = batch->screen->workaround_address;
6257       blt.DestinationMOCS = iris_mocs(batch->screen->workaround_address.bo,
6258                                       &batch->screen->isl_dev,
6259                                       ISL_SURF_USAGE_BLITTER_DST_BIT);
6260       blt.DestinationPitch = 63;
6261       blt.DestinationX2 = 1;
6262       blt.DestinationY2 = 4;
6263       blt.DestinationSurfaceWidth = 1;
6264       blt.DestinationSurfaceHeight = 4;
6265       blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6266       blt.DestinationSurfaceQPitch = 4;
6267       blt.DestinationTiling = XY_TILE_LINEAR;
6268    }
6269 #endif
6270 }
6271 
6272 #if GFX_VER >= 12
6273 static void
6274 invalidate_aux_map_state_per_engine(struct iris_batch *batch)
6275 {
6276    uint64_t register_addr = 0;
6277 
6278    switch (batch->name) {
6279    case IRIS_BATCH_RENDER: {
6280       /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6281        * RCS engine idle sequence:
6282        *
6283        *    Gfx12+:
6284        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6285        *                      Target Cache Flush + Depth Cache
6286        *
6287        *    Gfx125+:
6288        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6289        *                      Target Cache Flush + Depth Cache + CCS flush
6290        */
6291       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6292                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
6293                                  PIPE_CONTROL_L3_FABRIC_FLUSH |
6294                                  PIPE_CONTROL_CS_STALL |
6295                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
6296                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
6297                                  (GFX_VERx10 == 125 ?
6298                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6299 
6300       register_addr = GENX(GFX_CCS_AUX_INV_num);
6301       break;
6302    }
6303    case IRIS_BATCH_COMPUTE: {
6304       /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6305        * Compute engine idle sequence:
6306        *
6307        *    Gfx12+:
6308        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall
6309        *
6310        *    Gfx125+:
6311        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
6312        */
6313       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6314                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
6315                                  PIPE_CONTROL_L3_FABRIC_FLUSH |
6316                                  PIPE_CONTROL_CS_STALL |
6317                                  (GFX_VERx10 == 125 ?
6318                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6319 
6320       register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
6321       break;
6322    }
6323    case IRIS_BATCH_BLITTER: {
6324 #if GFX_VERx10 >= 125
6325       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6326       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
6327          batch_emit_fast_color_dummy_blit(batch);
6328 
6329       /*
6330        * Notice we don't set the L3 Fabric Flush here, because we have
6331        * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6332        * documentation says:
6333        *
6334        *    "L3 Fabric Flush will ensure all the pending transactions in the
6335        *     L3 Fabric are flushed to global observation point. HW does
6336        *     implicit L3 Fabric Flush on all stalling flushes (both explicit
6337        *     and implicit) and on PIPECONTROL having Post Sync Operation
6338        *     enabled."
6339        *
6340        * Therefore setting L3 Fabric Flush here would be redundant.
6341        *
6342        * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6343        * Blitter engine idle sequence:
6344        *
6345        *    Gfx125+:
6346        *       MI_FLUSH_DW (dw0;b16 – flush CCS)
6347        */
6348       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
6349          fd.FlushCCS = true;
6350       }
6351       register_addr = GENX(BCS_CCS_AUX_INV_num);
6352 #endif
6353       break;
6354    }
6355    default:
6356       unreachable("Invalid batch for aux map invalidation");
6357       break;
6358    }
6359 
6360    if (register_addr != 0) {
6361       /* If the aux-map state number increased, then we need to rewrite the
6362        * register. Rewriting the register is used to both set the aux-map
6363        * translation table address, and also to invalidate any previously
6364        * cached translations.
6365        */
6366       iris_load_register_imm32(batch, register_addr, 1);
6367 
6368       /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6369        *
6370        *    "Poll Aux Invalidation bit once the invalidation is set (Register
6371        *     4208 bit 0)"
6372        */
6373       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6374          sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6375          sem.WaitMode = PollingMode;
6376          sem.RegisterPollMode = true;
6377          sem.SemaphoreDataDword = 0x0;
6378          sem.SemaphoreAddress = ro_bo(NULL, register_addr);
6379       }
6380    }
6381 }
6382 
6383 void
6384 genX(invalidate_aux_map_state)(struct iris_batch *batch)
6385 {
6386    struct iris_screen *screen = batch->screen;
6387    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6388    if (!aux_map_ctx)
6389       return;
6390    uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
6391    if (batch->last_aux_map_state != aux_map_state_num) {
6392       invalidate_aux_map_state_per_engine(batch);
6393       batch->last_aux_map_state = aux_map_state_num;
6394    }
6395 }
6396 
6397 static void
6398 init_aux_map_state(struct iris_batch *batch)
6399 {
6400    struct iris_screen *screen = batch->screen;
6401    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6402    if (!aux_map_ctx)
6403       return;
6404 
6405    uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
6406    assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
6407 
6408    uint32_t reg = 0;
6409    switch (batch->name) {
6410    case IRIS_BATCH_COMPUTE:
6411       if (iris_bufmgr_compute_engine_supported(screen->bufmgr)) {
6412          reg = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
6413          break;
6414       }
6415       /* fallthrough */
6416       FALLTHROUGH;
6417    case IRIS_BATCH_RENDER:
6418       reg = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
6419       break;
6420    case IRIS_BATCH_BLITTER:
6421 #if GFX_VERx10 >= 125
6422       reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
6423 #endif
6424       break;
6425    default:
6426       unreachable("Invalid batch for aux map init.");
6427    }
6428 
6429    if (reg)
6430       iris_load_register_imm64(batch, reg, base_addr);
6431 }
6432 #endif
6433 
6434 struct push_bos {
6435    struct {
6436       struct iris_address addr;
6437       uint32_t length;
6438    } buffers[4];
6439    int buffer_count;
6440    uint32_t max_length;
6441 };
6442 
6443 static void
6444 setup_constant_buffers(struct iris_context *ice,
6445                        struct iris_batch *batch,
6446                        int stage,
6447                        struct push_bos *push_bos)
6448 {
6449    struct iris_shader_state *shs = &ice->state.shaders[stage];
6450    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6451 
6452    uint32_t push_range_sum = 0;
6453 
6454    int n = 0;
6455    for (int i = 0; i < 4; i++) {
6456       const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6457 
6458       if (range->length == 0)
6459          continue;
6460 
6461       push_range_sum += range->length;
6462 
6463       if (range->length > push_bos->max_length)
6464          push_bos->max_length = range->length;
6465 
6466       /* Range block is a binding table index, map back to UBO index. */
6467       unsigned block_index = iris_bti_to_group_index(
6468          &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6469       assert(block_index != IRIS_SURFACE_NOT_USED);
6470 
6471       struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6472       struct iris_resource *res = (void *) cbuf->buffer;
6473 
6474       assert(cbuf->buffer_offset % 32 == 0);
6475 
6476       if (res)
6477          iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
6478 
6479       push_bos->buffers[n].length = range->length;
6480       push_bos->buffers[n].addr =
6481          res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
6482          : batch->screen->workaround_address;
6483       n++;
6484    }
6485 
6486    /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
6487     *
6488     *    "The sum of all four read length fields must be less than or
6489     *    equal to the size of 64."
6490     */
6491    assert(push_range_sum <= 64);
6492 
6493    push_bos->buffer_count = n;
6494 }
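
/* Example of the limit above: the read lengths are counted in 32-byte
 * registers, so ranges of {8, 8, 16, 32} sum to 64 (64 * 32 = 2048 bytes of
 * push constants) and pass the assert, while {32, 32, 16, 0} would not.
 */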
6495 
6496 static void
6497 emit_push_constant_packets(struct iris_context *ice,
6498                            struct iris_batch *batch,
6499                            int stage,
6500                            const struct push_bos *push_bos)
6501 {
6502    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
6503 
6504    iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
6505       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
6506 
6507 #if GFX_VER >= 9
6508       pkt.MOCS = isl_mocs(isl_dev, 0, false);
6509 #endif
6510 
6511       /* The Skylake PRM contains the following restriction:
6512        *
6513        *    "The driver must ensure The following case does not occur
6514        *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
6515        *     buffer 3 read length equal to zero committed followed by a
6516        *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
6517        *     zero committed."
6518        *
6519        * To avoid this, we program the buffers in the highest slots.
6520        * This way, slot 0 is only used if slot 3 is also used.
6521        */
6522       const int n = push_bos->buffer_count;
6523       assert(n <= 4);
6524       const unsigned shift = 4 - n;
6525       for (int i = 0; i < n; i++) {
6526          pkt.ConstantBody.ReadLength[i + shift] =
6527             push_bos->buffers[i].length;
6528          pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
6529       }
6530    }
6531 }
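
/* Example of the slot-shifting above: with push_bos->buffer_count == 2,
 * shift == 2, so the two buffers land in read-length slots 2 and 3.  Slot 0
 * is therefore only ever used when slot 3 is also in use, which is exactly
 * what the Skylake PRM restriction quoted above requires.
 */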
6532 
6533 #if GFX_VER >= 12
6534 static void
6535 emit_null_push_constant_tbimr_workaround(struct iris_batch *batch)
6536 {
6537    struct isl_device *isl_dev = &batch->screen->isl_dev;
6538    /* Pass a single-register push constant payload for the PS
6539     * stage even if empty, since PS invocations with zero push
6540     * constant cycles have been found to cause hangs with TBIMR
6541     * enabled.  See HSDES #22020184996.
6542     *
6543     * XXX - Use workaround infrastructure and final workaround
6544     *       when provided by hardware team.
6545     */
6546    const struct iris_address null_addr = {
6547       .bo = batch->screen->workaround_bo,
6548       .offset = 1024,
6549    };
6550    const uint32_t num_dwords = 2 + 2 * 1;
6551    uint32_t const_all[num_dwords];
6552    uint32_t *dw = &const_all[0];
6553 
6554    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6555       all.DWordLength = num_dwords - 2;
6556       all.MOCS = isl_mocs(isl_dev, 0, false);
6557       all.ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT);
6558       all.PointerBufferMask = 1;
6559    }
6560    dw += 2;
6561 
6562    _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), dw, data) {
6563       data.PointerToConstantBuffer = null_addr;
6564       data.ConstantBufferReadLength = 1;
6565    }
6566 
6567    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6568 }
6569 
6570 static void
6571 emit_push_constant_packet_all(struct iris_context *ice,
6572                               struct iris_batch *batch,
6573                               uint32_t shader_mask,
6574                               const struct push_bos *push_bos)
6575 {
6576    struct isl_device *isl_dev = &batch->screen->isl_dev;
6577 
6578    if (!push_bos) {
6579       if (batch->screen->devinfo->needs_null_push_constant_tbimr_workaround &&
6580           (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
6581          emit_null_push_constant_tbimr_workaround(batch);
6582          shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
6583       }
6584 
6585       if (shader_mask) {
6586          iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
6587             pc.ShaderUpdateEnable = shader_mask;
6588             pc.MOCS = iris_mocs(NULL, isl_dev, 0);
6589          }
6590       }
6591       return;
6592    }
6593 
6594    const uint32_t n = push_bos->buffer_count;
6595    const uint32_t max_pointers = 4;
6596    const uint32_t num_dwords = 2 + 2 * n;
6597    uint32_t const_all[2 + 2 * max_pointers];
6598    uint32_t *dw = &const_all[0];
6599 
6600    assert(n <= max_pointers);
6601    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6602       all.DWordLength = num_dwords - 2;
6603       all.MOCS = isl_mocs(isl_dev, 0, false);
6604       all.ShaderUpdateEnable = shader_mask;
6605       all.PointerBufferMask = (1 << n) - 1;
6606    }
6607    dw += 2;
6608 
6609    for (int i = 0; i < n; i++) {
6610       _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
6611                        dw + i * 2, data) {
6612          data.PointerToConstantBuffer = push_bos->buffers[i].addr;
6613          data.ConstantBufferReadLength = push_bos->buffers[i].length;
6614       }
6615    }
6616    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6617 }
6618 #endif
6619 
6620 void
6621 genX(emit_depth_state_workarounds)(struct iris_context *ice,
6622                                    struct iris_batch *batch,
6623                                    const struct isl_surf *surf)
6624 {
6625 #if INTEL_NEEDS_WA_1808121037
6626    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6627                                surf->samples == 1;
6628 
6629    switch (ice->state.genx->depth_reg_mode) {
6630    case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
6631       if (!is_d16_1x_msaa)
6632          return;
6633       break;
6634    case IRIS_DEPTH_REG_MODE_D16_1X_MSAA:
6635       if (is_d16_1x_msaa)
6636          return;
6637       break;
6638    case IRIS_DEPTH_REG_MODE_UNKNOWN:
6639       break;
6640    }
6641 
6642    /* We'll change some CHICKEN registers depending on the depth surface
6643     * format. Do a depth flush and stall so the pipeline is not using these
6644     * settings while we change the registers.
6645     */
6646    iris_emit_end_of_pipe_sync(batch,
6647                               "Workaround: Stop pipeline for Wa_1808121037",
6648                               PIPE_CONTROL_DEPTH_STALL |
6649                               PIPE_CONTROL_DEPTH_CACHE_FLUSH);
6650 
6651    /* Wa_1808121037
6652     *
6653     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6654     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6655     */
6656    iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6657       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6658       reg.HIZPlaneOptimizationdisablebitMask = true;
6659    }
6660 
6661    ice->state.genx->depth_reg_mode =
6662       is_d16_1x_msaa ? IRIS_DEPTH_REG_MODE_D16_1X_MSAA :
6663                        IRIS_DEPTH_REG_MODE_HW_DEFAULT;
6664 #endif
6665 }
6666 
6667 /* Calculate TBIMR tiling parameters adequate for the current pipeline
6668  * setup.  Return true if TBIMR should be enabled.
6669  */
6670 UNUSED static bool
6671 calculate_tile_dimensions(struct iris_context *ice,
6672                           unsigned *tile_width, unsigned *tile_height)
6673 {
6674    struct iris_screen *screen = (void *)ice->ctx.screen;
6675    const struct intel_device_info *devinfo = screen->devinfo;
6676 
6677    assert(GFX_VER == 12);
6678    const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
6679 
6680    /* Perform a rough calculation of the tile cache footprint of the
6681     * pixel pipeline, approximating it as the sum of the amount of
6682     * memory used per pixel by every render target, depth, stencil and
6683     * auxiliary surfaces bound to the pipeline.
6684     */
6685    unsigned pixel_size = 0;
6686 
6687    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
6688 
6689    if (cso->width == 0 || cso->height == 0)
6690       return false;
6691 
6692    for (unsigned i = 0; i < cso->nr_cbufs; i++) {
6693       const struct iris_surface *surf = (void *)cso->cbufs[i];
6694 
6695       if (surf) {
6696          const struct iris_resource *res = (void *)surf->base.texture;
6697 
6698          pixel_size += intel_calculate_surface_pixel_size(&res->surf);
6699 
6700          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6701           *       aux surface traffic.
6702           */
6703          if (ice->state.draw_aux_usage[i]) {
6704             pixel_size += intel_calculate_surface_pixel_size(&res->aux.surf);
6705 
6706             if (isl_aux_usage_has_ccs(res->aux.usage)) {
6707                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6708                                              &res->surf), aux_scale);
6709             }
6710          }
6711       }
6712    }
6713 
6714    if (cso->zsbuf) {
6715       struct iris_resource *zres;
6716       struct iris_resource *sres;
6717       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres, &sres);
6718 
6719       if (zres) {
6720          pixel_size += intel_calculate_surface_pixel_size(&zres->surf);
6721 
6722          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6723           *       aux surface traffic.
6724           */
6725          if (iris_resource_level_has_hiz(devinfo, zres, cso->zsbuf->u.tex.level)) {
6726             pixel_size += intel_calculate_surface_pixel_size(&zres->aux.surf);
6727 
6728             if (isl_aux_usage_has_ccs(zres->aux.usage)) {
6729                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6730                                              &zres->surf), aux_scale);
6731             }
6732          }
6733       }
6734 
6735       if (sres) {
6736          pixel_size += intel_calculate_surface_pixel_size(&sres->surf);
6737       }
6738    }
6739 
6740    /* Compute a tile layout that allows reasonable utilization of the
6741     * tile cache based on the per-pixel cache footprint estimated
6742     * above.
6743     */
6744    intel_calculate_tile_dimensions(devinfo, screen->l3_config_3d,
6745                                    32, 32, cso->width, cso->height, pixel_size,
6746                                    tile_width, tile_height);
6747 
6748    /* Perform TBIMR tile passes only if the framebuffer covers more
6749     * than a single tile.
6750     */
6751    return *tile_width < cso->width || *tile_height < cso->height;
6752 }
6753 
6754 static void
6755 iris_preemption_streamout_wa(struct iris_context *ice,
6756                              struct iris_batch *batch,
6757                              bool enable)
6758 {
6759 #if GFX_VERx10 >= 120
6760    if (!intel_needs_workaround(batch->screen->devinfo, 16013994831))
6761       return;
6762 
6763    iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
6764       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !enable;
6765       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
6766    }
6767 
6768    /* Emit CS_STALL and 250 noops. */
6769    iris_emit_pipe_control_flush(batch, "workaround: Wa_16013994831",
6770                                 PIPE_CONTROL_CS_STALL);
6771    for (unsigned i = 0; i < 250; i++)
6772       iris_emit_cmd(batch, GENX(MI_NOOP), noop);
6773 
6774    ice->state.genx->object_preemption = enable;
6775 #endif
6776 }
6777 
6778 static void
6779 shader_program_uses_primitive_id(struct iris_context *ice,
6780                                  struct iris_batch *batch,
6781                                  struct iris_compiled_shader *shader,
6782                                  gl_shader_stage stage,
6783                                  bool *uses_primitive_id)
6784 {
6785    switch (stage) {
6786    case MESA_SHADER_TESS_CTRL: {
6787       struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
6788       *uses_primitive_id |= tcs_data->include_primitive_id;
6789       break;
6790    }
6791    case MESA_SHADER_TESS_EVAL: {
6792       struct iris_tes_data *tes_data = iris_tes_data(shader);
6793       *uses_primitive_id |= tes_data->include_primitive_id;
6794       break;
6795    }
6796    default:
6797       break;
6798    }
6799 
6800    struct iris_compiled_shader *gs_shader =
6801       ice->shaders.prog[MESA_SHADER_GEOMETRY];
6802    const struct iris_gs_data *gs_data =
6803       gs_shader ? iris_gs_data(gs_shader) : NULL;
6804 
6805    *uses_primitive_id |= gs_data && gs_data->include_primitive_id;
6806 }
6807 
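/* Emit a throwaway pipeline setup and one dummy triangle-list draw per
 * slice for Wa_18020335297.  Clipping is set to REJECT_ALL and the vertex
 * elements source only constants, so nothing is fetched or rasterized.
 */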
6808 static void
6809 emit_wa_18020335297_dummy_draw(struct iris_batch *batch)
6810 {
6811 #if GFX_VERx10 >= 125
6812    iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
6813       vfg.DistributionMode = RR_STRICT;
6814    }
6815    iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6816       vf.GeometryDistributionEnable = true;
6817    }
6818 #endif
6819 
6820 #if GFX_VER >= 12
6821    iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
6822       pr.ReplicaMask = 1;
6823    }
6824 #endif
6825 
6826    iris_emit_cmd(batch, GENX(3DSTATE_RASTER), rr) {
6827       rr.CullMode = CULLMODE_NONE;
6828       rr.FrontFaceFillMode = FILL_MODE_SOLID;
6829       rr.BackFaceFillMode = FILL_MODE_SOLID;
6830    }
6831 
6832    iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { }
6833    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { }
6834 
6835 #if GFX_VER >= 11
6836    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs2) { }
6837 #endif
6838 
6839    iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
6840       clip.ClipEnable = true;
6841       clip.ClipMode = CLIPMODE_REJECT_ALL;
6842    }
6843 
6844    iris_emit_cmd(batch, GENX(3DSTATE_VS), vs) { }
6845    iris_emit_cmd(batch, GENX(3DSTATE_GS), gs) { }
6846    iris_emit_cmd(batch, GENX(3DSTATE_HS), hs) { }
6847    iris_emit_cmd(batch, GENX(3DSTATE_TE), te) { }
6848    iris_emit_cmd(batch, GENX(3DSTATE_DS), ds) { }
6849    iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so) { }
6850 
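   /* Two dummy vertex elements whose components are stored constants
    * ((0,0,0,0) and (0,0,1,1)), so the draws below need no vertex buffers.
    */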
6851    uint32_t vertex_elements[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
6852    uint32_t *ve_pack_dest = &vertex_elements[1];
6853 
6854    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), vertex_elements, ve) {
6855       ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
6856                        GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
6857    }
6858 
6859    for (int i = 0; i < 2; i++) {
6860       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6861          ve.Valid = true;
6862          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
6863          ve.Component0Control = VFCOMP_STORE_0;
6864          ve.Component1Control = VFCOMP_STORE_0;
6865          ve.Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6866          ve.Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6867       }
6868       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6869    }
6870 
6871    iris_batch_emit(batch, vertex_elements, sizeof(uint32_t) *
6872                    (1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)));
6873 
6874    iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6875       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
6876    }
6877 
6878    /* Emit dummy draw per slice. */
6879    for (unsigned i = 0; i < batch->screen->devinfo->num_slices; i++) {
6880       iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6881          prim.VertexCountPerInstance = 3;
6882          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
6883          prim.InstanceCount = 1;
6884          prim.VertexAccessType = SEQUENTIAL;
6885       }
6886    }
6887 }
6888 
6889 static void
6890 iris_upload_dirty_render_state(struct iris_context *ice,
6891                                struct iris_batch *batch,
6892                                const struct pipe_draw_info *draw,
6893                                bool skip_vb_params)
6894 {
6895    struct iris_screen *screen = batch->screen;
6896    struct iris_border_color_pool *border_color_pool =
6897       iris_bufmgr_get_border_color_pool(screen->bufmgr);
6898 
6899    /* Re-emit 3DSTATE_DS before any 3DPRIMITIVE when tessellation is on */
6900    if (intel_needs_workaround(batch->screen->devinfo, 22018402687) &&
6901        ice->shaders.prog[MESA_SHADER_TESS_EVAL])
6902       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TES;
6903 
6904    uint64_t dirty = ice->state.dirty;
6905    uint64_t stage_dirty = ice->state.stage_dirty;
6906 
6907    if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
6908        !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
6909       return;
6910 
6911    struct iris_genx_state *genx = ice->state.genx;
6912    struct iris_binder *binder = &ice->state.binder;
6913    struct iris_fs_data *fs_data =
6914       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
6915 
6916    /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
6917     * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
6918     */
6919    bool needs_wa_14018912822 =
6920       screen->driconf.intel_enable_wa_14018912822 &&
6921       intel_needs_workaround(batch->screen->devinfo, 14018912822) &&
6922       util_framebuffer_get_num_samples(&ice->state.framebuffer) > 1;
6923 
6924    if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
6925       const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6926       uint32_t cc_vp_address;
6927       bool wa_18020335297_applied = false;
6928 
6929       /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
6930       if (intel_needs_workaround(screen->devinfo, 18020335297) &&
6931           batch->name == IRIS_BATCH_RENDER &&
6932           ice->state.viewport_ptr_set) {
6933          emit_wa_18020335297_dummy_draw(batch);
6934          wa_18020335297_applied = true;
6935       }
6936 
6937       /* XXX: could avoid streaming for depth_clip [0,1] case. */
6938       uint32_t *cc_vp_map =
6939          stream_state(batch, ice->state.dynamic_uploader,
6940                       &ice->state.last_res.cc_vp,
6941                       4 * ice->state.num_viewports *
6942                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
6943       for (int i = 0; i < ice->state.num_viewports; i++) {
6944          float zmin, zmax;
6945          iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
6946                                  ice->state.window_space_position,
6947                                  &zmin, &zmax);
6948          if (cso_rast->depth_clip_near)
6949             zmin = 0.0;
6950          if (cso_rast->depth_clip_far)
6951             zmax = 1.0;
6952 
6953          iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
6954             ccv.MinimumDepth = zmin;
6955             ccv.MaximumDepth = zmax;
6956          }
6957 
6958          cc_vp_map += GENX(CC_VIEWPORT_length);
6959       }
6960 
6961       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
6962          ptr.CCViewportPointer = cc_vp_address;
6963       }
6964 
6965       if (wa_18020335297_applied) {
6966 #if GFX_VER >= 12
6967          iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { }
6968 #endif
6969          /* Dirty all emitted WA state to make sure that current real
6970           * state is restored.
6971           */
6972          dirty |= IRIS_DIRTY_VFG |
6973                   IRIS_DIRTY_VF |
6974                   IRIS_DIRTY_RASTER |
6975                   IRIS_DIRTY_VF_STATISTICS |
6976                   IRIS_DIRTY_VF_SGVS |
6977                   IRIS_DIRTY_CLIP |
6978                   IRIS_DIRTY_STREAMOUT |
6979                   IRIS_DIRTY_VERTEX_ELEMENTS |
6980                   IRIS_DIRTY_VF_TOPOLOGY;
6981 
6982          for (int stage = 0; stage < MESA_SHADER_FRAGMENT; stage++) {
6983             if (ice->shaders.prog[stage])
6984                stage_dirty |= (IRIS_STAGE_DIRTY_VS << stage);
6985          }
6986       }
6987       ice->state.viewport_ptr_set = true;
6988    }
6989 
6990    if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
6991       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6992       int32_t x_min, y_min, x_max, y_max;
6993       uint32_t sf_cl_vp_address;
6994       uint32_t *vp_map =
6995          stream_state(batch, ice->state.dynamic_uploader,
6996                       &ice->state.last_res.sf_cl_vp,
6997                       4 * ice->state.num_viewports *
6998                       GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
6999 
7000       x_min = ice->state.render_area.x;
7001       y_min = ice->state.render_area.y;
7002       x_max = ice->state.render_area.width;
7003       y_max = ice->state.render_area.height;
7004 
7005       for (unsigned i = 0; i < ice->state.num_viewports; i++) {
7006          const struct pipe_viewport_state *state = &ice->state.viewports[i];
7007          float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
7008 
7009          float vp_xmin = viewport_extent(state, 0, -1.0f);
7010          float vp_xmax = viewport_extent(state, 0,  1.0f);
7011          float vp_ymin = viewport_extent(state, 1, -1.0f);
7012          float vp_ymax = viewport_extent(state, 1,  1.0f);
7013 
7014          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
7015                                         state->scale[0], state->scale[1],
7016                                         state->translate[0], state->translate[1],
7017                                         &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
7018 
7019          iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
7020             vp.ViewportMatrixElementm00 = state->scale[0];
7021             vp.ViewportMatrixElementm11 = state->scale[1];
7022             vp.ViewportMatrixElementm22 = state->scale[2];
7023             vp.ViewportMatrixElementm30 = state->translate[0];
7024             vp.ViewportMatrixElementm31 = state->translate[1];
7025             vp.ViewportMatrixElementm32 = state->translate[2];
7026             vp.XMinClipGuardband = gb_xmin;
7027             vp.XMaxClipGuardband = gb_xmax;
7028             vp.YMinClipGuardband = gb_ymin;
7029             vp.YMaxClipGuardband = gb_ymax;
7030             vp.XMinViewPort = MAX2(vp_xmin, 0);
7031             vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
7032             vp.YMinViewPort = MAX2(vp_ymin, 0);
7033             vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
7034          }
7035 
7036          vp_map += GENX(SF_CLIP_VIEWPORT_length);
7037       }
7038 
7039       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
7040          ptr.SFClipViewportPointer = sf_cl_vp_address;
7041       }
7042    }
7043 
7044    if (dirty & IRIS_DIRTY_URB) {
7045       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
7046          if (!ice->shaders.prog[i]) {
7047             ice->shaders.urb.cfg.size[i] = 1;
7048          } else {
7049             struct iris_vue_data *vue_data =
7050                iris_vue_data(ice->shaders.prog[i]);
7051             ice->shaders.urb.cfg.size[i] = vue_data->urb_entry_size;
7052          }
7053          assert(ice->shaders.urb.cfg.size[i] != 0);
7054       }
7055 
7056       genX(emit_urb_config)(batch,
7057                             ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
7058                             ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL);
7059    }
7060 
7061    if (dirty & IRIS_DIRTY_BLEND_STATE) {
7062       struct iris_blend_state *cso_blend = ice->state.cso_blend;
7063       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7064       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7065 
7066       bool color_blend_zero = false;
7067       bool alpha_blend_zero = false;
7068 
7069       /* Always write at least one BLEND_STATE - the final RT message will
7070        * reference BLEND_STATE[0] even if there aren't color writes.  There
7071        * may still be alpha testing, computed depth, and so on.
7072        */
7073       const int rt_dwords =
7074          MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
7075 
7076       uint32_t blend_offset;
7077       uint32_t *blend_map =
7078          stream_state(batch, ice->state.dynamic_uploader,
7079                       &ice->state.last_res.blend,
7080                       96, 64, &blend_offset);
7081 
7082       /* Copy of blend entries for merging dynamic changes. */
7083       uint32_t blend_entries[4 * rt_dwords];
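      /* cso_blend->blend_state[] holds the BLEND_STATE header DWord at
       * index 0 and the per-RT entries starting at index 1, so copy the
       * entries from index 1 here.
       */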
7084       memcpy(blend_entries, &cso_blend->blend_state[1], sizeof(blend_entries));
7085 
7086       unsigned cbufs = MAX2(cso_fb->nr_cbufs, 1);
7087 
7088       uint32_t *blend_entry = blend_entries;
7089       for (unsigned i = 0; i < cbufs; i++) {
7090          int dst_blend_factor = cso_blend->ps_dst_blend_factor[i];
7091          int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[i];
7092          uint32_t entry[GENX(BLEND_STATE_ENTRY_length)];
7093          iris_pack_state(GENX(BLEND_STATE_ENTRY), entry, be) {
7094             if (needs_wa_14018912822) {
7095                if (dst_blend_factor == BLENDFACTOR_ZERO) {
7096                   dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7097                   color_blend_zero = true;
7098                }
7099                if (dst_alpha_blend_factor == BLENDFACTOR_ZERO) {
7100                   dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7101                   alpha_blend_zero = true;
7102                }
7103             }
7104             be.DestinationBlendFactor = dst_blend_factor;
7105             be.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7106          }
7107 
7108          /* Merge entry. */
7109          uint32_t *dst = blend_entry;
7110          uint32_t *src = entry;
7111          for (unsigned j = 0; j < GENX(BLEND_STATE_ENTRY_length); j++)
7112             *dst++ |= *src++;
7113 
7114          blend_entry += GENX(BLEND_STATE_ENTRY_length);
7115       }
7116 
7117       /* Blend constants modified for Wa_14018912822. */
7118       if (ice->state.color_blend_zero != color_blend_zero) {
7119          ice->state.color_blend_zero = color_blend_zero;
7120          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7121       }
7122       if (ice->state.alpha_blend_zero != alpha_blend_zero) {
7123          ice->state.alpha_blend_zero = alpha_blend_zero;
7124          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7125       }
7126 
7127       uint32_t blend_state_header;
7128       iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
7129          bs.AlphaTestEnable = cso_zsa->alpha_enabled;
7130          bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
7131       }
7132 
7133       blend_map[0] = blend_state_header | cso_blend->blend_state[0];
7134       memcpy(&blend_map[1], blend_entries, 4 * rt_dwords);
7135 
7136       iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
7137          ptr.BlendStatePointer = blend_offset;
7138          ptr.BlendStatePointerValid = true;
7139       }
7140    }
7141 
7142    if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
7143       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7144 #if GFX_VER == 8
7145       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7146 #endif
7147       uint32_t cc_offset;
7148       void *cc_map =
7149          stream_state(batch, ice->state.dynamic_uploader,
7150                       &ice->state.last_res.color_calc,
7151                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
7152                       64, &cc_offset);
7153       iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
7154          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
7155          cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
7156          cc.BlendConstantColorRed   = ice->state.color_blend_zero ?
7157             0.0 : ice->state.blend_color.color[0];
7158          cc.BlendConstantColorGreen = ice->state.color_blend_zero ?
7159             0.0 : ice->state.blend_color.color[1];
7160          cc.BlendConstantColorBlue  = ice->state.color_blend_zero ?
7161             0.0 : ice->state.blend_color.color[2];
7162          cc.BlendConstantColorAlpha = ice->state.alpha_blend_zero ?
7163             0.0 : ice->state.blend_color.color[3];
7164 #if GFX_VER == 8
7165 	 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
7166 	 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7167 #endif
7168       }
7169       iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7170          ptr.ColorCalcStatePointer = cc_offset;
7171          ptr.ColorCalcStatePointerValid = true;
7172       }
7173    }
7174 
7175 #if GFX_VERx10 == 125
7176    if (dirty & (IRIS_DIRTY_RENDER_BUFFER | IRIS_DIRTY_DEPTH_BUFFER)) {
7177       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7178       unsigned tile_width, tile_height;
7179 
7180       ice->state.use_tbimr = batch->screen->driconf.enable_tbimr &&
7181          calculate_tile_dimensions(ice, &tile_width, &tile_height);
7182 
7183       if (ice->state.use_tbimr) {
7184          /* Use a batch size of 128 polygons per slice as recommended
7185           * by BSpec 68436 "TBIMR Programming".
7186           */
7187          const unsigned num_slices = screen->devinfo->num_slices;
7188          const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
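         /* DIV_ROUND_UP(num_slices, 2) * 256 is 128 * num_slices rounded up
          * to a multiple of 256; TBIMRBatchSize below encodes it as
          * log2(batch_size) - 5, i.e. batch_size = 32 << TBIMRBatchSize.
          */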
7189 
7190          iris_emit_cmd(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
7191             tbimr.TileRectangleHeight = tile_height;
7192             tbimr.TileRectangleWidth = tile_width;
7193             tbimr.VerticalTileCount = DIV_ROUND_UP(cso_fb->height, tile_height);
7194             tbimr.HorizontalTileCount = DIV_ROUND_UP(cso_fb->width, tile_width);
7195             tbimr.TBIMRBatchSize = util_logbase2(batch_size) - 5;
7196             tbimr.TileBoxCheck = true;
7197          }
7198       }
7199    }
7200 #endif
7201 
7202    /* Wa_1604061319
7203     *
7204     *    3DSTATE_CONSTANT_* needs to be programmed before BTP_*
7205     *
7206     * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
7207     * any stage has a dirty binding table.
7208     */
7209    const bool emit_const_wa = GFX_VER >= 11 &&
7210       ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
7211        (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
7212 
7213 #if GFX_VER >= 12
7214    uint32_t nobuffer_stages = 0;
7215 #endif
7216 
7217    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7218       if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
7219           !emit_const_wa)
7220          continue;
7221 
7222       struct iris_shader_state *shs = &ice->state.shaders[stage];
7223       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7224 
7225       if (!shader)
7226          continue;
7227 
7228       if (shs->sysvals_need_upload)
7229          upload_sysvals(ice, stage, NULL);
7230 
7231       struct push_bos push_bos = {};
7232       setup_constant_buffers(ice, batch, stage, &push_bos);
7233 
7234 #if GFX_VER >= 12
7235       /* If this stage doesn't have any push constants, emit it later in a
7236        * single CONSTANT_ALL packet with all the other stages.
7237        */
7238       if (push_bos.buffer_count == 0) {
7239          nobuffer_stages |= 1 << stage;
7240          continue;
7241       }
7242 
7243       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
7244        * contains only 5 bits, so we can only use it for buffers smaller than
7245        * 32.
7246        *
7247        * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
7248        * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
7249        * for disabling stages, where all address bits are zero.  However, we
7250        * can't safely use it for general buffers with arbitrary addresses.
7251        * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
7252        * case.
7253        */
7254       if (push_bos.max_length < 32 && GFX_VERx10 > 120) {
7255          emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
7256          continue;
7257       }
7258 #endif
7259       emit_push_constant_packets(ice, batch, stage, &push_bos);
7260    }
7261 
7262 #if GFX_VER >= 12
7263    if (nobuffer_stages)
7264       /* Wa_16011448509: all address bits are zero */
7265       emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
7266 #endif
7267 
7268    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7269       /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
7270        * in order to commit constants.  TODO: Investigate "Disable Gather
7271        * at Set Shader" to go back to legacy mode...
7272        */
7273       if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
7274                           (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
7275                             << stage)) {
7276          iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
7277             ptr._3DCommandSubOpcode = 38 + stage;
7278             ptr.PointertoVSBindingTable =
7279                binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
7280          }
7281       }
7282    }
7283 
7284    if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
7285       // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
7286       // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
7287 
7288       /* The PIPE_CONTROL command description says:
7289        *
7290        *   "Whenever a Binding Table Index (BTI) used by a Render Target
7291        *    Message points to a different RENDER_SURFACE_STATE, SW must issue a
7292        *    Render Target Cache Flush by enabling this bit. When render target
7293        *    flush is set due to new association of BTI, PS Scoreboard Stall bit
7294        *    must be set in this packet."
7295        */
7296       // XXX: does this need to happen at 3DSTATE_BTP_PS time?
7297       iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
7298                                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
7299                                    PIPE_CONTROL_STALL_AT_SCOREBOARD);
7300    }
7301 
7302    if (dirty & IRIS_DIRTY_RENDER_BUFFER)
7303       trace_framebuffer_state(&batch->trace, NULL, &ice->state.framebuffer);
7304 
7305    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7306       if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
7307          iris_populate_binding_table(ice, batch, stage, false);
7308       }
7309    }
7310 
7311    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7312       if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
7313           !ice->shaders.prog[stage])
7314          continue;
7315 
7316       iris_upload_sampler_states(ice, stage);
7317 
7318       struct iris_shader_state *shs = &ice->state.shaders[stage];
7319       struct pipe_resource *res = shs->sampler_table.res;
7320       if (res)
7321          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
7322                             IRIS_DOMAIN_NONE);
7323 
7324       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
7325          ptr._3DCommandSubOpcode = 43 + stage;
7326          ptr.PointertoVSSamplerState = shs->sampler_table.offset;
7327       }
7328    }
7329 
7330    if (ice->state.need_border_colors)
7331       iris_use_pinned_bo(batch, border_color_pool->bo, false, IRIS_DOMAIN_NONE);
7332 
7333    if (dirty & IRIS_DIRTY_MULTISAMPLE) {
7334       iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
7335          ms.PixelLocation =
7336             ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
7337          if (ice->state.framebuffer.samples > 0)
7338             ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
7339       }
7340    }
7341 
7342    if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
7343       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
7344          ms.SampleMask = ice->state.sample_mask;
7345       }
7346    }
7347 
7348 #if GFX_VERx10 >= 125
7349    /* This is only used on >= gfx125 for dynamic 3DSTATE_TE and
7350     * 3DSTATE_VFG emission related workarounds.
7351     */
7352    bool program_uses_primitive_id = false;
7353 
7354    /* Check if FS stage will use primitive ID overrides. */
7355    const struct intel_vue_map *last_vue_map =
7356       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7357    if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
7358        last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
7359       program_uses_primitive_id = true;
7360    }
7361 #endif
7362 
7363    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7364       if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
7365          continue;
7366 
7367       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7368 
7369       if (shader) {
7370          struct iris_resource *cache = (void *) shader->assembly.res;
7371          iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
7372 
7373          uint32_t scratch_addr =
7374             pin_scratch_space(ice, batch, shader, stage);
7375 
7376 #if GFX_VERx10 >= 125
7377          shader_program_uses_primitive_id(ice, batch, shader, stage,
7378                                           &program_uses_primitive_id);
7379 #endif
7380 
7381          if (stage == MESA_SHADER_FRAGMENT) {
7382             UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
7383             struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7384 
7385             uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
7386             _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
7387 #if GFX_VER >= 9
7388                struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(shader->brw_prog_data);
7389 #else
7390                struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(shader->elk_prog_data);
7391 #endif
7392                intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
7393                                            wm_prog_data, util_framebuffer_get_num_samples(cso_fb),
7394                                            0 /* msaa_flags */);
7395 
7396 #if GFX_VER == 12
7397                assert(fs_data->dispatch_multi == 0 ||
7398                       (fs_data->dispatch_multi == 16 && fs_data->max_polygons == 2));
7399                ps.DualSIMD8DispatchEnable = fs_data->dispatch_multi;
7400                /* XXX - No major improvement observed from enabling
7401                 *       overlapping subspans, but it could be helpful
7402                 *       in theory when the requirements listed on the
7403                 *       BSpec page for 3DSTATE_PS_BODY are met.
7404                 */
7405                ps.OverlappingSubspansEnable = false;
7406 #endif
7407 
7408 #if GFX_VER >= 9
7409                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7410                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7411                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7412                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7413 #if GFX_VER < 20
7414                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7415                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7416 #endif
7417 
7418                ps.KernelStartPointer0 = KSP(shader) +
7419                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7420                ps.KernelStartPointer1 = KSP(shader) +
7421                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7422 #if GFX_VER < 20
7423                ps.KernelStartPointer2 = KSP(shader) +
7424                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7425 #endif
7426 #else
7427                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7428                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7429                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7430                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7431                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7432                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7433 
7434                ps.KernelStartPointer0 = KSP(shader) +
7435                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7436                ps.KernelStartPointer1 = KSP(shader) +
7437                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7438                ps.KernelStartPointer2 = KSP(shader) +
7439                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7440 #endif
7441 
7442 #if GFX_VERx10 >= 125
7443                ps.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7444 #else
7445                ps.ScratchSpaceBasePointer =
7446                   rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7447 #endif
7448             }
7449 
7450             uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
7451             iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
7452 #if GFX_VER >= 9
7453                if (!fs_data->uses_sample_mask)
7454                   psx.InputCoverageMaskState  = ICMS_NONE;
7455                else if (fs_data->post_depth_coverage)
7456                   psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
7457                else if (fs_data->inner_coverage &&
7458                         cso->conservative_rasterization)
7459                   psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
7460                else
7461                   psx.InputCoverageMaskState = ICMS_NORMAL;
7462 #else
7463                psx.PixelShaderUsesInputCoverageMask =
7464                   fs_data->uses_sample_mask;
7465 #endif
7466             }
7467 
7468             uint32_t *shader_ps = (uint32_t *) shader->derived_data;
7469             uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
7470             iris_emit_merge(batch, shader_ps, ps_state,
7471                             GENX(3DSTATE_PS_length));
7472             iris_emit_merge(batch, shader_psx, psx_state,
7473                             GENX(3DSTATE_PS_EXTRA_length));
7474 #if GFX_VERx10 >= 125
7475          } else if (stage == MESA_SHADER_TESS_EVAL) {
7476             uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
7477             iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
7478                if (intel_needs_workaround(screen->devinfo, 14015055625) &&
7479                    program_uses_primitive_id)
7480                   te.TessellationDistributionMode = TEDMODE_OFF;
7481                else if (intel_needs_workaround(screen->devinfo, 22012699309))
7482                   te.TessellationDistributionMode = TEDMODE_RR_STRICT;
7483                else
7484                   te.TessellationDistributionMode = TEDMODE_RR_FREE;
7485             }
7486 
7487             uint32_t ds_state[GENX(3DSTATE_DS_length)] = { 0 };
7488             iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
7489                if (scratch_addr)
7490                   ds.ScratchSpaceBuffer =
7491                      scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7492             }
7493 
7494             uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7495             uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7496 
7497             iris_emit_merge(batch, shader_ds, ds_state,
7498                             GENX(3DSTATE_DS_length));
7499             iris_emit_merge(batch, shader_te, te_state,
7500                             GENX(3DSTATE_TE_length));
7501 #endif
7502          } else if (stage == MESA_SHADER_GEOMETRY) {
7503             const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7504 
7505             uint32_t gs_state[GENX(3DSTATE_GS_length)] = { 0 };
7506             iris_pack_command(GENX(3DSTATE_GS), gs_state, gs) {
7507                gs.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7508 
7509                if (scratch_addr)
7510 #if GFX_VERx10 >= 125
7511                   gs.ScratchSpaceBuffer =
7512                      scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7513 #else
7514                   gs.ScratchSpaceBasePointer =
7515                      rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7516 #endif
7517             }
7518 
7519             uint32_t *shader_gs = (uint32_t *) shader->derived_data;
7520             iris_emit_merge(batch, shader_gs, gs_state,
7521                             GENX(3DSTATE_GS_length));
7522          } else if (scratch_addr) {
7523             uint32_t *pkt = (uint32_t *) shader->derived_data;
7524             switch (stage) {
7525             case MESA_SHADER_VERTEX:    MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
7526             case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
7527             case MESA_SHADER_TESS_EVAL: {
7528                uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7529                uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7530                iris_batch_emit(batch, shader_te, 4 * GENX(3DSTATE_TE_length));
7531                MERGE_SCRATCH_ADDR(3DSTATE_DS);
7532                break;
7533             }
7534             }
7535          } else {
7536             iris_batch_emit(batch, shader->derived_data,
7537                             iris_derived_program_state_size(stage));
7538          }
7539       } else {
7540          if (stage == MESA_SHADER_TESS_EVAL) {
7541             iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7542             iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
7543             iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7544          } else if (stage == MESA_SHADER_GEOMETRY) {
7545             iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
7546          }
7547       }
7548    }
7549 
7550 #if GFX_VERx10 >= 125
7551    /* Inspect program_uses_primitive_id state and dirty VFG if required. */
7552    if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
7553        program_uses_primitive_id != ice->state.uses_primitive_id) {
7554       dirty |= IRIS_DIRTY_VFG;
7555       ice->state.uses_primitive_id = program_uses_primitive_id;
7556    }
7557 #endif
7558 
7559    if (ice->state.streamout_active) {
7560       if (dirty & IRIS_DIRTY_SO_BUFFERS) {
7561          /* Wa_16011411144
7562           * SW must insert a PIPE_CONTROL cmd before and after the
7563           * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* state is
7564           * not combined with other state changes.
7565           */
7566          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7567             iris_emit_pipe_control_flush(batch,
7568                                          "SO pre change stall WA",
7569                                          PIPE_CONTROL_CS_STALL);
7570          }
7571 
7572          for (int i = 0; i < 4; i++) {
7573             struct iris_stream_output_target *tgt =
7574                (void *) ice->state.so_target[i];
7575             enum { dwords = GENX(3DSTATE_SO_BUFFER_length) };
7576             uint32_t *so_buffers = genx->so_buffers + i * dwords;
7577             bool zero_offset = false;
7578 
7579             if (tgt) {
7580                zero_offset = tgt->zero_offset;
7581                iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
7582                                   true, IRIS_DOMAIN_OTHER_WRITE);
7583                iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
7584                                   true, IRIS_DOMAIN_OTHER_WRITE);
7585             }
7586 
7587             if (zero_offset) {
7588                /* Skip the last DWord which contains "Stream Offset" of
7589                 * 0xFFFFFFFF and instead emit a dword of zero directly.
7590                 */
7591                STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
7592                              32 * (dwords - 1));
7593                const uint32_t zero = 0;
7594                iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
7595                iris_batch_emit(batch, &zero, sizeof(zero));
7596                tgt->zero_offset = false;
7597             } else {
7598                iris_batch_emit(batch, so_buffers, 4 * dwords);
7599             }
7600          }
7601 
7602          /* Wa_16011411144 */
7603          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7604             iris_emit_pipe_control_flush(batch,
7605                                          "SO post change stall WA",
7606                                          PIPE_CONTROL_CS_STALL);
7607          }
7608       }
7609 
7610       if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
7611          /* Wa_16011773973:
7612           * If SOL is enabled and SO_DECL state has to be programmed,
7613           *    1. Send 3D State SOL state with SOL disabled
7614           *    2. Send SO_DECL NP state
7615           *    3. Send 3D State SOL with SOL Enabled
7616           */
7617          if (intel_device_info_is_dg2(batch->screen->devinfo))
7618             iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7619 
7620          uint32_t *decl_list =
7621             ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
7622          iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
7623 
7624 #if GFX_VER >= 11 && GFX_VER < 20
7625          /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7626           * 3DSTATE_SO_DECL_LIST:
7627           *
7628           *    "Workaround: This command must be followed by a PIPE_CONTROL
7629           *     with CS Stall bit set."
7630           *
7631           * On DG2+ also known as Wa_1509820217.
7632           */
7633          iris_emit_pipe_control_flush(batch,
7634                                       "workaround: cs stall after so_decl",
7635                                       PIPE_CONTROL_CS_STALL);
7636 #endif
7637       }
7638 
7639       if (dirty & IRIS_DIRTY_STREAMOUT) {
7640          const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7641 
7642 #if GFX_VERx10 >= 120
7643          /* Wa_16013994831 - Disable preemption. */
7644          if (intel_needs_workaround(batch->screen->devinfo, 16013994831))
7645             iris_preemption_streamout_wa(ice, batch, false);
7646 #endif
7647 
7648          uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
7649          iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
7650             sol.SOFunctionEnable = true;
7651             sol.SOStatisticsEnable = true;
7652 
7653             sol.RenderingDisable = cso_rast->rasterizer_discard &&
7654                                    !ice->state.prims_generated_query_active;
7655             sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7656 
7657 
7658 #if INTEL_NEEDS_WA_18022508906
7659             /* Wa_14017076903 :
7660              *
7661              * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
7662              *
7663              * SOL_INT::Render_Enable =
7664              *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
7665              *   (
7666              *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
7667              *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
7668              *     !3DSTATE_STREAMOUT::API_Render_Disable &&
7669              *     (
7670              *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
7671              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
7672              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
7673              *       3DSTATE_PS_EXTRA::PS_Valid ||
7674              *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
7675              *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
7676              *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
7677              *     )
7678              *   )
7679              *
7680              * If SOL_INT::Render_Enable is false, the SO stage will not forward any
7681              * topologies down the pipeline. Which is not what we want for occlusion
7682              * queries.
7683              *
7684              * Here we force rendering to get SOL_INT::Render_Enable when occlusion
7685              * queries are active.
7686              */
7687             const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7688             if (!cso_rast->rasterizer_discard && ice->state.occlusion_query_active)
7689                sol.ForceRendering = Force_on;
7690 #endif
7691          }
7692 
7693          assert(ice->state.streamout);
7694 
7695          iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
7696                          GENX(3DSTATE_STREAMOUT_length));
7697       }
7698    } else {
7699       if (dirty & IRIS_DIRTY_STREAMOUT) {
7700 
7701 #if GFX_VERx10 >= 120
7702          /* Wa_16013994831 - Enable preemption. */
7703          if (!ice->state.genx->object_preemption)
7704             iris_preemption_streamout_wa(ice, batch, true);
7705 #endif
7706 
7707          iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7708       }
7709    }
7710 
7711    if (dirty & IRIS_DIRTY_CLIP) {
7712       struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7713       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7714 
7715       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
7716                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7717       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
7718          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
7719                     : ice->state.prim_is_points_or_lines);
7720       const struct intel_vue_map *last =
7721          &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7722 
7723       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
7724       iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
7725          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
7726          if (cso_rast->rasterizer_discard)
7727             cl.ClipMode = CLIPMODE_REJECT_ALL;
7728          else if (ice->state.window_space_position)
7729             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
7730          else
7731             cl.ClipMode = CLIPMODE_NORMAL;
7732 
7733          cl.PerspectiveDivideDisable = ice->state.window_space_position;
7734          cl.ViewportXYClipTestEnable = !points_or_lines;
7735 
7736          cl.NonPerspectiveBarycentricEnable = fs_data->uses_nonperspective_interp_modes;
7737 
7738          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1 ||
7739                                       !(last->slots_valid & VARYING_BIT_LAYER);
7740          cl.MaximumVPIndex = ice->state.num_viewports - 1;
7741       }
7742       iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
7743                       ARRAY_SIZE(cso_rast->clip));
7744    }
7745 
7746    if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
7747       /* From the Broadwell PRM, Volume 2, documentation for
7748        * 3DSTATE_RASTER, "Antialiasing Enable":
7749        *
7750        * "This field must be disabled if any of the render targets
7751        * have integer (UINT or SINT) surface format."
7752        *
7753        * Additionally internal documentation for Gfx12+ states:
7754        *
7755        * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
7756        *  FORCED_SAMPLE_COUNT > 1."
7757        */
7758       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7759       unsigned samples = util_framebuffer_get_num_samples(cso_fb);
7760       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7761 
7762       bool aa_enable = cso->line_smooth &&
7763                        !ice->state.has_integer_rt &&
7764                        !(batch->screen->devinfo->ver >= 12 && samples > 1);
7765 
7766       uint32_t dynamic_raster[GENX(3DSTATE_RASTER_length)];
7767       iris_pack_command(GENX(3DSTATE_RASTER), &dynamic_raster, raster) {
7768          raster.AntialiasingEnable = aa_enable;
7769       }
7770       iris_emit_merge(batch, cso->raster, dynamic_raster,
7771                       ARRAY_SIZE(cso->raster));
7772 
7773       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7774       iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7775          sf.ViewportTransformEnable = !ice->state.window_space_position;
7776 
7777 #if GFX_VER >= 12
7778          sf.DerefBlockSize = ice->state.urb_deref_block_size;
7779 #endif
7780       }
7781       iris_emit_merge(batch, cso->sf, dynamic_sf,
7782                       ARRAY_SIZE(dynamic_sf));
7783    }
7784 
7785    if (dirty & IRIS_DIRTY_WM) {
7786       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7787       uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
7788 
7789       iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
7790          wm.StatisticsEnable = ice->state.statistics_counters_enabled;
7791 
7792          wm.BarycentricInterpolationMode =
7793             iris_fs_barycentric_modes(ice->shaders.prog[MESA_SHADER_FRAGMENT], 0);
7794 
7795          if (fs_data->early_fragment_tests)
7796             wm.EarlyDepthStencilControl = EDSC_PREPS;
7797          else if (fs_data->has_side_effects)
7798             wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7799          else
7800             wm.EarlyDepthStencilControl = EDSC_NORMAL;
7801 
7802          /* We could skip this bit if color writes are enabled. */
7803          if (fs_data->has_side_effects || fs_data->uses_kill)
7804             wm.ForceThreadDispatchEnable = ForceON;
7805       }
7806       iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
7807    }
7808 
7809    if (dirty & IRIS_DIRTY_SBE) {
7810       iris_emit_sbe(batch, ice);
7811    }
7812 
7813    if (dirty & IRIS_DIRTY_PS_BLEND) {
7814       struct iris_blend_state *cso_blend = ice->state.cso_blend;
7815       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7816       const struct shader_info *fs_info =
7817          iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7818 
7819       int dst_blend_factor = cso_blend->ps_dst_blend_factor[0];
7820       int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[0];
7821 
7822       /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
7823        * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
7824        */
7825       if (needs_wa_14018912822) {
7826          if (ice->state.color_blend_zero)
7827             dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7828          if (ice->state.alpha_blend_zero)
7829             dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7830       }
7831 
7832       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7833       iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7834          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7835          pb.AlphaTestEnable = cso_zsa->alpha_enabled;
7836 
7837          pb.DestinationBlendFactor = dst_blend_factor;
7838          pb.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7839 
7840          /* The dual source blending docs caution against using SRC1 factors
7841           * when the shader doesn't use a dual source render target write.
7842           * Empirically, this can lead to GPU hangs, and the results are
7843           * undefined anyway, so simply disable blending to avoid the hang.
7844           */
7845          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7846             (!cso_blend->dual_color_blending || fs_data->dual_src_blend);
7847       }
7848 
7849       iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7850                       ARRAY_SIZE(cso_blend->ps_blend));
7851    }
7852 
7853    if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
7854       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7855 #if GFX_VER >= 9 && GFX_VER < 12
7856       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7857       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7858       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7859          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7860          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7861       }
7862       iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
7863 #else
7864       /* Use modify disable fields which allow us to emit packets
7865        * directly instead of merging them later.
7866        */
7867       iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
7868 #endif
7869 
7870       /* Depth or stencil write changed in cso. */
7871       if (intel_needs_workaround(batch->screen->devinfo, 18019816803) &&
7872           (dirty & IRIS_DIRTY_DS_WRITE_ENABLE)) {
7873          iris_emit_pipe_control_flush(
7874             batch, "workaround: PSS stall after DS write enable change",
7875             PIPE_CONTROL_PSS_STALL_SYNC);
7876       }
7877 
7878 #if GFX_VER >= 12
7879       iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
7880 #endif
7881    }
7882 
7883    if (dirty & IRIS_DIRTY_STENCIL_REF) {
7884 #if GFX_VER >= 12
7885       /* Use modify disable fields which allow us to emit packets
7886        * directly instead of merging them later.
7887        */
7888       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7889       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7890       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7891          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7892          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7893          wmds.StencilTestMaskModifyDisable = true;
7894          wmds.StencilWriteMaskModifyDisable = true;
7895          wmds.StencilStateModifyDisable = true;
7896          wmds.DepthStateModifyDisable = true;
7897       }
7898       iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
7899 #endif
7900    }
7901 
7902    if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
7903       /* Wa_1409725701:
7904        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
7905        *    stored as an array of up to 16 elements. The location of first
7906        *    element of the array, as specified by Pointer to SCISSOR_RECT,
7907        *    should be aligned to a 64-byte boundary."
7908        */
7909       uint32_t alignment = 64;
7910       uint32_t scissor_offset =
7911          emit_state(batch, ice->state.dynamic_uploader,
7912                     &ice->state.last_res.scissor,
7913                     ice->state.scissors,
7914                     sizeof(struct pipe_scissor_state) *
7915                     ice->state.num_viewports, alignment);
7916 
7917       iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7918          ptr.ScissorRectPointer = scissor_offset;
7919       }
7920    }
7921 
7922    if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
7923       struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
7924 
7925       /* Do not emit the cso yet. We may need to update clear params first. */
7926       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7927       struct iris_resource *zres = NULL, *sres = NULL;
7928       if (cso_fb->zsbuf) {
7929          iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
7930                                           &zres, &sres);
7931       }
7932 
7933       if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
7934 #if GFX_VER < 20
7935          uint32_t *clear_params =
7936             cso_z->packets + ARRAY_SIZE(cso_z->packets) -
7937             GENX(3DSTATE_CLEAR_PARAMS_length);
7938 
7939          iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
7940             clear.DepthClearValueValid = true;
7941             clear.DepthClearValue = zres->aux.clear_color.f32[0];
7942          }
7943 #endif
7944       }
7945 
7946       iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
7947 
7948       if (intel_needs_workaround(batch->screen->devinfo, 1408224581) ||
7949           intel_needs_workaround(batch->screen->devinfo, 14014097488) ||
7950           intel_needs_workaround(batch->screen->devinfo, 14016712196)) {
7951          /* Wa_1408224581
7952           *
7953           * Workaround (Gfx12LP A-step only): an additional PIPE_CONTROL
7954           * with a post-sync store-dword operation is required; i.e., emit
7955           * an extra pipe control after the stencil state whenever the
7956           * surface state bits of this state change.
7957           *
7958           * This also seems sufficient to handle Wa_14014097488 and
7959           * Wa_14016712196.
7960           */
7961          iris_emit_pipe_control_write(batch, "WA for depth/stencil state",
7962                                       PIPE_CONTROL_WRITE_IMMEDIATE,
7963                                       screen->workaround_address.bo,
7964                                       screen->workaround_address.offset, 0);
7965       }
7966 
7967       if (zres)
7968          genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
7969    }
7970 
7971    if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
7972       /* Listen for buffer changes, and also write enable changes. */
7973       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7974       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
7975    }
7976 
7977    if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
7978       iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7979          for (int i = 0; i < 32; i++) {
7980             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7981          }
7982       }
7983    }
7984 
7985    if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
7986       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7987       iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7988 #if GFX_VER >= 11
7989       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7990        * 3DSTATE_LINE_STIPPLE:
7991        *
7992        *    "Workaround: This command must be followed by a PIPE_CONTROL with
7993        *     CS Stall bit set."
7994        */
7995       iris_emit_pipe_control_flush(batch,
7996                                    "workaround: post 3DSTATE_LINE_STIPPLE",
7997                                    PIPE_CONTROL_CS_STALL);
7998 #endif
7999    }
8000 
8001    if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
8002       iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
8003          topo.PrimitiveTopologyType =
8004             translate_prim_type(draw->mode, ice->state.vertices_per_patch);
8005       }
8006    }
8007 
8008    if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
8009       int count = util_bitcount64(ice->state.bound_vertex_buffers);
8010       uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
8011 
8012       if (ice->state.vs_uses_draw_params && !skip_vb_params) {
8013          assert(ice->draw.draw_params.res);
8014 
8015          struct iris_vertex_buffer_state *state =
8016             &(ice->state.genx->vertex_buffers[count]);
8017          pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
8018          struct iris_resource *res = (void *) state->resource;
8019 
8020          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
8021             vb.VertexBufferIndex = count;
8022             vb.AddressModifyEnable = true;
8023             vb.BufferPitch = 0;
8024             vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
8025             vb.BufferStartingAddress =
8026                ro_bo(NULL, res->bo->address +
8027                            (int) ice->draw.draw_params.offset);
8028             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
8029                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8030 #if GFX_VER >= 12
8031             vb.L3BypassDisable       = true;
8032 #endif
8033          }
8034          dynamic_bound |= 1ull << count;
8035          count++;
8036       }
8037 
8038       if (ice->state.vs_uses_derived_draw_params && !skip_vb_params) {
8039          struct iris_vertex_buffer_state *state =
8040             &(ice->state.genx->vertex_buffers[count]);
8041          pipe_resource_reference(&state->resource,
8042                                  ice->draw.derived_draw_params.res);
8043          struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
8044 
8045          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
8046             vb.VertexBufferIndex = count;
8047             vb.AddressModifyEnable = true;
8048             vb.BufferPitch = 0;
8049             vb.BufferSize =
8050                res->bo->size - ice->draw.derived_draw_params.offset;
8051             vb.BufferStartingAddress =
8052                ro_bo(NULL, res->bo->address +
8053                            (int) ice->draw.derived_draw_params.offset);
8054             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
8055                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8056 #if GFX_VER >= 12
8057             vb.L3BypassDisable       = true;
8058 #endif
8059          }
8060          dynamic_bound |= 1ull << count;
8061          count++;
8062       }
8063 
8064       if (count) {
8065 #if GFX_VER >= 11
8066          /* Gfx11+ doesn't need the cache workaround below */
8067          uint64_t bound = dynamic_bound;
8068          while (bound) {
8069             const int i = u_bit_scan64(&bound);
8070             iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
8071                                   false, IRIS_DOMAIN_VF_READ);
8072          }
8073 #else
8074          /* The VF cache designers cut corners, and made the cache key's
8075           * <VertexBufferIndex, Memory Address> tuple only consider the bottom
8076           * 32 bits of the address.  If you have two vertex buffers which get
8077           * placed exactly 4 GiB apart and use them in back-to-back draw calls,
8078           * you can get collisions (even within a single batch).
8079           *
8080           * So, we need to do a VF cache invalidate if the buffer for a VB
8081           * slot changes [48:32] address bits from the previous time.
8082           */
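         /* Illustrative example: vertex buffers at 0x1_0000_1000 and
          * 0x2_0000_1000 share the same low 32 address bits, so the VF cache
          * cannot tell them apart.  Tracking the upper address bits per slot
          * below lets us invalidate only when those bits actually change.
          */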
8083          unsigned flush_flags = 0;
8084 
8085          uint64_t bound = dynamic_bound;
8086          while (bound) {
8087             const int i = u_bit_scan64(&bound);
8088             uint16_t high_bits = 0;
8089 
8090             struct iris_resource *res =
8091                (void *) genx->vertex_buffers[i].resource;
8092             if (res) {
8093                iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
8094 
8095                high_bits = res->bo->address >> 32ull;
8096                if (high_bits != ice->state.last_vbo_high_bits[i]) {
8097                   flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
8098                                  PIPE_CONTROL_CS_STALL;
8099                   ice->state.last_vbo_high_bits[i] = high_bits;
8100                }
8101             }
8102          }
8103 
8104          if (flush_flags) {
8105             iris_emit_pipe_control_flush(batch,
8106                                          "workaround: VF cache 32-bit key [VB]",
8107                                          flush_flags);
8108          }
8109 #endif
8110 
8111          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
8112 
8113          uint32_t *map =
8114             iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
8115          _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
8116             vb.DWordLength = (vb_dwords * count + 1) - 2;
8117          }
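         /* Worked example (lengths vary by GFX_VER): with a 4-DWord
          * VERTEX_BUFFER_STATE and count == 2, the packet is 1 header DWord
          * plus 8 state DWords = 9 total, so DWordLength = 9 - 2 = 7
          * (the usual "total length minus 2" bias).
          */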
8118          map += 1;
8119 
8120          const struct iris_vertex_element_state *cso_ve =
8121             ice->state.cso_vertex_elements;
8122 
8123          bound = dynamic_bound;
8124          while (bound) {
8125             const int i = u_bit_scan64(&bound);
8126 
8127             uint32_t vb_stride[GENX(VERTEX_BUFFER_STATE_length)];
8128             struct iris_bo *bo =
8129                iris_resource_bo(genx->vertex_buffers[i].resource);
8130             iris_pack_state(GENX(VERTEX_BUFFER_STATE), &vb_stride, vbs) {
8131                vbs.BufferPitch = cso_ve->stride[i];
8132                /* Unnecessary except to defeat the genxml nonzero checker */
8133                vbs.MOCS = iris_mocs(bo, &screen->isl_dev,
8134                                     ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8135             }
8136             for (unsigned d = 0; d < vb_dwords; d++)
8137                map[d] = genx->vertex_buffers[i].state[d] | vb_stride[d];
8138 
8139             map += vb_dwords;
8140          }
8141       }
8142    }
8143 
8144    if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
8145       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8146       const unsigned entries = MAX2(cso->count, 1);
8147       if (!(ice->state.vs_needs_sgvs_element ||
8148             ice->state.vs_uses_derived_draw_params ||
8149             ice->state.vs_needs_edge_flag)) {
8150          iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
8151                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
8152       } else {
8153          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
8154          const unsigned dyn_count = cso->count +
8155             ice->state.vs_needs_sgvs_element +
8156             ice->state.vs_uses_derived_draw_params;
8157 
8158          iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
8159                            &dynamic_ves, ve) {
8160             ve.DWordLength =
8161                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
8162          }
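         /* Same bias as for the vertex buffers: e.g. with a 2-DWord
          * VERTEX_ELEMENT_STATE and dyn_count == 3, the packet is
          * 1 + 6 = 7 DWords, giving DWordLength = 7 - 2 = 5.
          */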
8163          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
8164                 (cso->count - ice->state.vs_needs_edge_flag) *
8165                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
8166          uint32_t *ve_pack_dest =
8167             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
8168                          GENX(VERTEX_ELEMENT_STATE_length)];
8169 
8170          if (ice->state.vs_needs_sgvs_element) {
8171             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
8172                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
8173             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8174                ve.Valid = true;
8175                ve.VertexBufferIndex =
8176                   util_bitcount64(ice->state.bound_vertex_buffers);
8177                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8178                ve.Component0Control = base_ctrl;
8179                ve.Component1Control = base_ctrl;
8180                ve.Component2Control = VFCOMP_STORE_0;
8181                ve.Component3Control = VFCOMP_STORE_0;
8182             }
8183             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8184          }
8185          if (ice->state.vs_uses_derived_draw_params) {
8186             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8187                ve.Valid = true;
8188                ve.VertexBufferIndex =
8189                   util_bitcount64(ice->state.bound_vertex_buffers) +
8190                   ice->state.vs_uses_draw_params;
8191                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8192                ve.Component0Control = VFCOMP_STORE_SRC;
8193                ve.Component1Control = VFCOMP_STORE_SRC;
8194                ve.Component2Control = VFCOMP_STORE_0;
8195                ve.Component3Control = VFCOMP_STORE_0;
8196             }
8197             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8198          }
8199          if (ice->state.vs_needs_edge_flag) {
8200             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
8201                ve_pack_dest[i] = cso->edgeflag_ve[i];
8202          }
8203 
8204          iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
8205                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
8206       }
8207 
8208       if (!ice->state.vs_needs_edge_flag) {
8209          iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
8210                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8211       } else {
8212          assert(cso->count > 0);
8213          const unsigned edgeflag_index = cso->count - 1;
8214          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
8215          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
8216                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
8217 
8218          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
8219             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
8220          iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
8221             vi.VertexElementIndex = edgeflag_index +
8222                ice->state.vs_needs_sgvs_element +
8223                ice->state.vs_uses_derived_draw_params;
8224          }
8225          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
8226             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
8227 
8228          iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
8229                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8230       }
8231    }
8232 
8233    if (dirty & IRIS_DIRTY_VF_SGVS) {
8234       const struct iris_vs_data *vs_data =
8235          iris_vs_data(ice->shaders.prog[MESA_SHADER_VERTEX]);
8236       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8237 
8238       iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
8239          if (vs_data->uses_vertexid) {
8240             sgv.VertexIDEnable = true;
8241             sgv.VertexIDComponentNumber = 2;
8242             sgv.VertexIDElementOffset =
8243                cso->count - ice->state.vs_needs_edge_flag;
8244          }
8245 
8246          if (vs_data->uses_instanceid) {
8247             sgv.InstanceIDEnable = true;
8248             sgv.InstanceIDComponentNumber = 3;
8249             sgv.InstanceIDElementOffset =
8250                cso->count - ice->state.vs_needs_edge_flag;
8251          }
8252       }
8253    }
8254 
8255    if (dirty & IRIS_DIRTY_VF_STATISTICS) {
8256       iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
8257          vf.StatisticsEnable = true;
8258       }
8259    }
8260 
8261    if (dirty & IRIS_DIRTY_VF) {
8262 #if INTEL_WA_16012775297_GFX_VER
8263       /* Emit dummy VF statistics before each 3DSTATE_VF. */
8264       if (intel_needs_workaround(batch->screen->devinfo, 16012775297) &&
8265           (dirty & IRIS_DIRTY_VF_STATISTICS) == 0) {
8266          iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
8267             vfs.StatisticsEnable = true;
8268          }
8269       }
8270 #endif
8271 
8272       iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
8273 #if GFX_VERx10 >= 125
8274          vf.GeometryDistributionEnable = true;
8275 #endif
8276          if (draw->primitive_restart) {
8277             vf.IndexedDrawCutIndexEnable = true;
8278             vf.CutIndex = draw->restart_index;
8279          }
8280       }
8281    }
8282 
8283 #if GFX_VERx10 >= 125
8284    if (dirty & IRIS_DIRTY_VFG) {
8285       iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
8286          /* Gfx12.5: If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
8287          vfg.DistributionMode =
8288 #if GFX_VER < 20
8289             ice->shaders.prog[MESA_SHADER_TESS_EVAL] == NULL ? RR_FREE :
8290 #endif
8291                                                                RR_STRICT;
8292          if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
8293              program_uses_primitive_id)
8294             vfg.DistributionGranularity = InstanceLevelGranularity;
8295          else
8296             vfg.DistributionGranularity = BatchLevelGranularity;
8297 #if INTEL_WA_14014851047_GFX_VER
8298          vfg.GranularityThresholdDisable =
8299             intel_needs_workaround(batch->screen->devinfo, 14014851047);
8300 #endif
8301          vfg.ListCutIndexEnable = draw->primitive_restart;
8302          /* 192 vertices for TRILIST_ADJ */
8303          vfg.ListNBatchSizeScale = 0;
8304          /* Batch size of 384 vertices */
8305          vfg.List3BatchSizeScale = 2;
8306          /* Batch size of 128 vertices */
8307          vfg.List2BatchSizeScale = 1;
8308          /* Batch size of 128 vertices */
8309          vfg.List1BatchSizeScale = 2;
8310          /* Batch size of 256 vertices for STRIP topologies */
8311          vfg.StripBatchSizeScale = 3;
8312          /* 192 control points for PATCHLIST_3 */
8313          vfg.PatchBatchSizeScale = 1;
8314          /* 192 control points for PATCHLIST_3 */
8315          vfg.PatchBatchSizeMultiplier = 31;
8316       }
8317    }
8318 #endif
8319 
8320 #if GFX_VER == 8
8321    if (dirty & IRIS_DIRTY_PMA_FIX) {
8322       bool enable = want_pma_fix(ice);
8323       genX(update_pma_fix)(ice, batch, enable);
8324    }
8325 #endif
8326 
8327    if (ice->state.current_hash_scale != 1)
8328       genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
8329 
8330 #if GFX_VER >= 12
8331    genX(invalidate_aux_map_state)(batch);
8332 #endif
8333 }
8334 
8335 static void
8336 flush_vbos(struct iris_context *ice, struct iris_batch *batch)
8337 {
8338    struct iris_genx_state *genx = ice->state.genx;
8339    uint64_t bound = ice->state.bound_vertex_buffers;
8340    while (bound) {
8341       const int i = u_bit_scan64(&bound);
8342       struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
8343       iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
8344    }
8345 }
8346 
8347 static bool
8348 point_or_line_list(enum mesa_prim prim_type)
8349 {
8350    switch (prim_type) {
8351    case MESA_PRIM_POINTS:
8352    case MESA_PRIM_LINES:
8353    case MESA_PRIM_LINE_STRIP:
8354    case MESA_PRIM_LINES_ADJACENCY:
8355    case MESA_PRIM_LINE_STRIP_ADJACENCY:
8356    case MESA_PRIM_LINE_LOOP:
8357       return true;
8358    default:
8359       return false;
8360    }
8361    return false;
8362 }
8363 
8364 void
8365 genX(emit_breakpoint)(struct iris_batch *batch, bool emit_before_draw)
8366 {
8367    struct iris_context *ice = batch->ice;
8368    uint32_t draw_count = emit_before_draw ?
8369                          p_atomic_inc_return(&ice->draw_call_count) :
8370                          p_atomic_read(&ice->draw_call_count);
8371 
8372    if (((draw_count == intel_debug_bkp_before_draw_count &&
8373          emit_before_draw) ||
8374         (draw_count == intel_debug_bkp_after_draw_count &&
8375          !emit_before_draw)))  {
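      /* Stall the command streamer until the breakpoint BO contains 0x1
       * (presumably poked by the attached debugging tool), effectively
       * pausing the GPU right before or after the requested draw call.
       */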
8376       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
8377          sem.WaitMode            = PollingMode;
8378          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
8379          sem.SemaphoreDataDword  = 0x1;
8380          sem.SemaphoreAddress    = rw_bo(batch->screen->breakpoint_bo, 0,
8381                                          IRIS_DOMAIN_OTHER_WRITE);
8382       };
8383    }
8384 }
8385 
8386 void
8387 genX(emit_3dprimitive_was)(struct iris_batch *batch,
8388                            const struct pipe_draw_indirect_info *indirect,
8389                            uint32_t primitive_type,
8390                            uint32_t vertex_count)
8391 {
8392    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8393    UNUSED const struct iris_context *ice = batch->ice;
8394 
8395 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
8396    if (intel_needs_workaround(devinfo, 22014412737) &&
8397        (point_or_line_list(primitive_type) || indirect ||
8398         (vertex_count == 1 || vertex_count == 2))) {
8399       iris_emit_pipe_control_write(batch, "Wa_22014412737",
8400                                    PIPE_CONTROL_WRITE_IMMEDIATE,
8401                                    batch->screen->workaround_bo,
8402                                    batch->screen->workaround_address.offset,
8403                                    0ull);
8404       batch->num_3d_primitives_emitted = 0;
8405    } else if (intel_needs_workaround(devinfo, 16014538804)) {
8406       batch->num_3d_primitives_emitted++;
8407 
8408       /* Wa_16014538804 - Send empty/dummy pipe control after 3 3DPRIMITIVE. */
8409       if (batch->num_3d_primitives_emitted == 3) {
8410          iris_emit_pipe_control_flush(batch, "Wa_16014538804", 0);
8411          batch->num_3d_primitives_emitted = 0;
8412       }
8413    }
8414 #endif
8415 }
8416 
8417 void
8418 genX(urb_workaround)(struct iris_batch *batch,
8419                      const struct intel_urb_config *urb_cfg)
8420 {
8421 #if INTEL_NEEDS_WA_16014912113
8422    if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
8423                                MESA_SHADER_TESS_EVAL) &&
8424        batch->ice->shaders.last_urb.size[0] != 0) {
8425       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
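         /* Assumption: the per-stage URB packets use consecutive
          * _3DCommandSubOpcode values, so bumping the VS template's
          * sub-opcode below re-targets the same packet at HS/DS/GS.
          */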
8426 #if GFX_VER >= 12
8427          iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
8428             urb._3DCommandSubOpcode += i;
8429             urb.VSURBEntryAllocationSize =
8430                batch->ice->shaders.last_urb.size[i] - 1;
8431             urb.VSURBStartingAddressSlice0 =
8432                batch->ice->shaders.last_urb.start[i];
8433             urb.VSURBStartingAddressSliceN =
8434                batch->ice->shaders.last_urb.start[i];
8435             urb.VSNumberofURBEntriesSlice0 = i == 0 ? 256 : 0;
8436             urb.VSNumberofURBEntriesSliceN = i == 0 ? 256 : 0;
8437          }
8438 #else
8439          iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
8440             urb._3DCommandSubOpcode += i;
8441             urb.VSURBStartingAddress =
8442                batch->ice->shaders.last_urb.start[i];
8443             urb.VSURBEntryAllocationSize =
8444                batch->ice->shaders.last_urb.size[i] - 1;
8445             urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
8446          }
8447 #endif
8448       }
8449       iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8450          pc.HDCPipelineFlushEnable = true;
8451       }
8452    }
8453 #endif
8454 
8455    /* Update current urb config. */
8456    memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
8457           sizeof(struct intel_urb_config));
8458 }
8459 
8460 static void
8461 iris_emit_index_buffer(struct iris_context *ice,
8462                        struct iris_batch *batch,
8463                        const struct pipe_draw_info *draw,
8464                        const struct pipe_draw_start_count_bias *sc)
8465 {
8466    unsigned offset;
8467 
8468    if (draw->has_user_indices) {
8469       unsigned start_offset = draw->index_size * sc->start;
8470 
8471       u_upload_data(ice->ctx.const_uploader, start_offset,
8472                     sc->count * draw->index_size, 4,
8473                     (char*)draw->index.user + start_offset,
8474                     &offset, &ice->state.last_res.index_buffer);
8475       offset -= start_offset;
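      /* Rewinding by start_offset (above) makes BufferStartingAddress
       * correspond to element 0 of the application's index array, so the
       * draw's StartVertexLocation (sc->start) indexes it correctly.
       */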
8476    } else {
8477       struct iris_resource *res = (void *) draw->index.resource;
8478       res->bind_history |= PIPE_BIND_INDEX_BUFFER;
8479 
8480       pipe_resource_reference(&ice->state.last_res.index_buffer,
8481                               draw->index.resource);
8482       offset = 0;
8483 
8484       iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
8485    }
8486 
8487    struct iris_genx_state *genx = ice->state.genx;
8488    struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
8489 
8490    uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
8491    iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
8492       ib.IndexFormat = draw->index_size >> 1;
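      /* index_size is 1, 2, or 4 bytes, so the shift yields the hardware
       * encodings 0 (INDEX_BYTE), 1 (INDEX_WORD) and 2 (INDEX_DWORD).
       */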
8493       ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
8494                           ISL_SURF_USAGE_INDEX_BUFFER_BIT);
8495       ib.BufferSize = bo->size - offset;
8496       ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
8497 #if GFX_VER >= 12
8498       ib.L3BypassDisable       = true;
8499 #endif
8500    }
8501 
8502    if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
8503       memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
8504       iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
8505       iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
8506    }
8507 
8508 #if GFX_VER < 11
8509    /* The VF cache key only uses 32-bits, see vertex buffer comment above */
8510    uint16_t high_bits = bo->address >> 32ull;
8511    if (high_bits != ice->state.last_index_bo_high_bits) {
8512       iris_emit_pipe_control_flush(batch,
8513                                    "workaround: VF cache 32-bit key [IB]",
8514                                    PIPE_CONTROL_VF_CACHE_INVALIDATE |
8515                                    PIPE_CONTROL_CS_STALL);
8516       ice->state.last_index_bo_high_bits = high_bits;
8517    }
8518 #endif
8519 }
8520 
8521 
8522 static void
8523 iris_upload_render_state(struct iris_context *ice,
8524                          struct iris_batch *batch,
8525                          const struct pipe_draw_info *draw,
8526                          unsigned drawid_offset,
8527                          const struct pipe_draw_indirect_info *indirect,
8528                          const struct pipe_draw_start_count_bias *sc)
8529 {
8530    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8531    bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8532 
8533    trace_intel_begin_draw(&batch->trace);
8534 
8535    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8536       flush_vbos(ice, batch);
8537 
8538    iris_batch_sync_region_start(batch);
8539 
8540    /* Always pin the binder.  If we're emitting new binding table pointers,
8541     * we need it.  If not, we're probably inheriting old tables via the
8542     * context, and need it anyway.  Since true zero-bindings cases are
8543     * practically non-existent, just pin it and avoid last_res tracking.
8544     */
8545    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8546                       IRIS_DOMAIN_NONE);
8547 
8548    if (!batch->contains_draw) {
8549       if (GFX_VER == 12) {
8550          /* Re-emit constants when starting a new batch buffer in order to
8551           * work around push constant corruption on context switch.
8552           *
8553           * XXX - Provide hardware spec quotation when available.
8554           */
8555          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8556                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8557                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8558                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8559                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8560       }
8561       batch->contains_draw = true;
8562    }
8563 
8564    if (!batch->contains_draw_with_next_seqno) {
8565       iris_restore_render_saved_bos(ice, batch, draw);
8566       batch->contains_draw_with_next_seqno = true;
8567    }
8568 
8569    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8570     * Wa_16011107343 (same for gfx12)
8571     * We implement this by setting TCS dirty on each draw.
8572     */
8573    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8574        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8575       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8576    }
8577 
8578    iris_upload_dirty_render_state(ice, batch, draw, false);
8579 
8580    if (draw->index_size > 0)
8581       iris_emit_index_buffer(ice, batch, draw, sc);
8582 
8583    if (indirect) {
8584       struct mi_builder b;
8585       uint32_t mocs;
8586       mi_builder_init(&b, batch->screen->devinfo, batch);
8587 
8588 #define _3DPRIM_END_OFFSET          0x2420
8589 #define _3DPRIM_START_VERTEX        0x2430
8590 #define _3DPRIM_VERTEX_COUNT        0x2434
8591 #define _3DPRIM_INSTANCE_COUNT      0x2438
8592 #define _3DPRIM_START_INSTANCE      0x243C
8593 #define _3DPRIM_BASE_VERTEX         0x2440
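      /* These are the MMIO offsets of the draw-parameter registers that
       * 3DPRIMITIVE consumes when IndirectParameterEnable is set; the MI
       * stores below fill them from the indirect buffer (or stream output).
       */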
8594 
8595       if (!indirect->count_from_stream_output) {
8596          if (indirect->indirect_draw_count) {
8597             use_predicate = true;
8598 
8599             struct iris_bo *draw_count_bo =
8600                iris_resource_bo(indirect->indirect_draw_count);
8601             unsigned draw_count_offset =
8602                indirect->indirect_draw_count_offset;
8603             mocs = iris_mocs(draw_count_bo, &batch->screen->isl_dev, 0);
8604             mi_builder_set_mocs(&b, mocs);
8605 
8606             if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
8607                /* comparison = draw id < draw count */
8608                struct mi_value comparison =
8609                   mi_ult(&b, mi_imm(drawid_offset),
8610                              mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8611 
8612                /* predicate = comparison & conditional rendering predicate */
8613                mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
8614                             mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
8615             } else {
8616                uint32_t mi_predicate;
8617 
8618                /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
8619                mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(drawid_offset));
8620                /* Upload the current draw count from the draw parameters buffer
8621                 * to MI_PREDICATE_SRC0. Zero the top 32-bits of
8622                 * MI_PREDICATE_SRC0.
8623                 */
8624                mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
8625                         mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8626 
8627                if (drawid_offset == 0) {
8628                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
8629                                  MI_PREDICATE_COMBINEOP_SET |
8630                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8631                } else {
8632                   /* While draw_index < draw_count the predicate's result will be
8633                    *  (draw_index == draw_count) ^ TRUE = TRUE
8634                    * When draw_index == draw_count the result is
8635                    *  (TRUE) ^ TRUE = FALSE
8636                    * After this all results will be:
8637                    *  (FALSE) ^ FALSE = FALSE
8638                    */
8639                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
8640                                  MI_PREDICATE_COMBINEOP_XOR |
8641                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8642                }
8643                iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
8644             }
8645          }
8646          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8647          assert(bo);
8648 
8649          mocs = iris_mocs(bo, &batch->screen->isl_dev, 0);
8650          mi_builder_set_mocs(&b, mocs);
8651 
8652          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8653                   mi_mem32(ro_bo(bo, indirect->offset + 0)));
8654          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8655                   mi_mem32(ro_bo(bo, indirect->offset + 4)));
8656          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX),
8657                   mi_mem32(ro_bo(bo, indirect->offset + 8)));
8658          if (draw->index_size) {
8659             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX),
8660                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8661             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8662                      mi_mem32(ro_bo(bo, indirect->offset + 16)));
8663          } else {
8664             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8665                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8666             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8667          }
8668       } else if (indirect->count_from_stream_output) {
8669          struct iris_stream_output_target *so =
8670             (void *) indirect->count_from_stream_output;
8671          struct iris_bo *so_bo = iris_resource_bo(so->offset.res);
8672 
8673          mocs = iris_mocs(so_bo, &batch->screen->isl_dev, 0);
8674          mi_builder_set_mocs(&b, mocs);
8675 
8676          iris_emit_buffer_barrier_for(batch, so_bo, IRIS_DOMAIN_OTHER_READ);
8677 
8678          struct iris_address addr = ro_bo(so_bo, so->offset.offset);
8679          struct mi_value offset =
8680             mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
8681          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8682                       mi_udiv32_imm(&b, offset, so->stride));
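         /* i.e. the vertex count above is (bytes written to the SO buffer so
          * far, minus this target's starting offset) divided by the vertex
          * stride.
          */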
8683          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX), mi_imm(0));
8684          mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8685          mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE), mi_imm(0));
8686          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8687                   mi_imm(draw->instance_count));
8688       }
8689    }
8690 
8691    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8692 
8693    genX(maybe_emit_breakpoint)(batch, true);
8694 
8695    iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8696       prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8697       prim.PredicateEnable = use_predicate;
8698 #if GFX_VERx10 >= 125
8699       prim.TBIMREnable = ice->state.use_tbimr;
8700 #endif
8701       if (indirect) {
8702          prim.IndirectParameterEnable = true;
8703       } else {
8704          prim.StartInstanceLocation = draw->start_instance;
8705          prim.InstanceCount = draw->instance_count;
8706          prim.VertexCountPerInstance = sc->count;
8707 
8708          prim.StartVertexLocation = sc->start;
8709 
8710          if (draw->index_size) {
8711             prim.BaseVertexLocation += sc->index_bias;
8712          }
8713       }
8714    }
8715 
8716    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8717    genX(maybe_emit_breakpoint)(batch, false);
8718 
8719    iris_batch_sync_region_end(batch);
8720 
8721    uint32_t count = (sc) ? sc->count : 0;
8722    count *= draw->instance_count ? draw->instance_count : 1;
8723    trace_intel_end_draw(&batch->trace, count, 0, 0);
8724 }
8725 
8726 static void
8727 iris_upload_indirect_render_state(struct iris_context *ice,
8728                                   const struct pipe_draw_info *draw,
8729                                   const struct pipe_draw_indirect_info *indirect,
8730                                   const struct pipe_draw_start_count_bias *sc)
8731 {
8732 #if GFX_VERx10 >= 125
8733    assert(indirect);
8734 
8735    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8736    UNUSED struct iris_screen *screen = batch->screen;
8737    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8738    const bool use_predicate =
8739       ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8740 
8741    trace_intel_begin_draw(&batch->trace);
8742 
8743    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8744       flush_vbos(ice, batch);
8745 
8746    iris_batch_sync_region_start(batch);
8747 
8748    /* Always pin the binder.  If we're emitting new binding table pointers,
8749     * we need it.  If not, we're probably inheriting old tables via the
8750     * context, and need it anyway.  Since true zero-bindings cases are
8751     * practically non-existent, just pin it and avoid last_res tracking.
8752     */
8753    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8754                       IRIS_DOMAIN_NONE);
8755 
8756    if (!batch->contains_draw) {
8757       /* Re-emit constants when starting a new batch buffer in order to
8758        * work around push constant corruption on context switch.
8759        *
8760        * XXX - Provide hardware spec quotation when available.
8761        */
8762       ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8763                                  IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8764                                  IRIS_STAGE_DIRTY_CONSTANTS_TES |
8765                                  IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8766                                  IRIS_STAGE_DIRTY_CONSTANTS_FS);
8767       batch->contains_draw = true;
8768    }
8769 
8770    if (!batch->contains_draw_with_next_seqno) {
8771       iris_restore_render_saved_bos(ice, batch, draw);
8772       batch->contains_draw_with_next_seqno = true;
8773    }
8774 
8775    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8776     * Wa_16011107343 (same for gfx12)
8777     * We implement this by setting TCS dirty on each draw.
8778     */
8779    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8780        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8781       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8782    }
8783 
8784    iris_upload_dirty_render_state(ice, batch, draw, false);
8785 
8786    if (draw->index_size > 0)
8787       iris_emit_index_buffer(ice, batch, draw, sc);
8788 
8789    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8790 
8791    genX(maybe_emit_breakpoint)(batch, true);
8792 
8793    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
8794       ind.ArgumentFormat             =
8795          draw->index_size > 0 ? XI_DRAWINDEXED : XI_DRAW;
8796       ind.PredicateEnable            = use_predicate;
8797       ind.TBIMREnabled               = ice->state.use_tbimr;
8798       ind.MaxCount                   = indirect->draw_count;
8799 
8800       if (indirect->buffer) {
8801          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8802          ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
8803          ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
8804       } else {
8805          ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8806       }
8807 
8808       if (indirect->indirect_draw_count) {
8809          struct iris_bo *draw_count_bo      =
8810             iris_resource_bo(indirect->indirect_draw_count);
8811          ind.CountBufferIndirectEnable      = true;
8812          ind.CountBufferAddress             =
8813             ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
8814       }
8815    }
8816 
8817    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8818    genX(maybe_emit_breakpoint)(batch, false);
8819 
8820    iris_batch_sync_region_end(batch);
8821 
8822    uint32_t count = (sc) ? sc->count : 0;
8823    count *= draw->instance_count ? draw->instance_count : 1;
8824    trace_intel_end_draw(&batch->trace, count, 0, 0);
8825 #else
8826    unreachable("Unsupported path");
8827 #endif /* GFX_VERx10 >= 125 */
8828 }
8829 
8830 static void
8831 iris_upload_indirect_shader_render_state(struct iris_context *ice,
8832                                          const struct pipe_draw_info *draw,
8833                                          const struct pipe_draw_indirect_info *indirect,
8834                                          const struct pipe_draw_start_count_bias *sc)
8835 {
8836    assert(indirect);
8837 
8838    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8839    UNUSED struct iris_screen *screen = batch->screen;
8840    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8841 
8842    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8843       flush_vbos(ice, batch);
8844 
8845    iris_batch_sync_region_start(batch);
8846 
8847    /* Always pin the binder.  If we're emitting new binding table pointers,
8848     * we need it.  If not, we're probably inheriting old tables via the
8849     * context, and need it anyway.  Since true zero-bindings cases are
8850     * practically non-existent, just pin it and avoid last_res tracking.
8851     */
8852    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8853                       IRIS_DOMAIN_NONE);
8854 
8855    if (!batch->contains_draw) {
8856       if (GFX_VER == 12) {
8857          /* Re-emit constants when starting a new batch buffer in order to
8858           * work around push constant corruption on context switch.
8859           *
8860           * XXX - Provide hardware spec quotation when available.
8861           */
8862          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8863                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8864                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8865                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8866                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8867       }
8868       batch->contains_draw = true;
8869    }
8870 
8871    if (!batch->contains_draw_with_next_seqno) {
8872       iris_restore_render_saved_bos(ice, batch, draw);
8873       batch->contains_draw_with_next_seqno = true;
8874    }
8875 
8876    if (draw->index_size > 0)
8877       iris_emit_index_buffer(ice, batch, draw, sc);
8878 
8879    /* Make sure we have enough space to keep all the commands in a single BO
8880     * (because of the jumps).
8881     */
8882    iris_require_command_space(batch, 2000);
8883 
8884 #ifndef NDEBUG
8885    struct iris_bo *command_bo = batch->bo;
8886 #endif
8887 
8888    /* Jump point to generate more draws if we run out of space in the ring
8889     * buffer.
8890     */
8891    uint64_t gen_addr = iris_batch_current_address_u64(batch);
8892 
8893    iris_handle_always_flush_cache(batch);
8894 
8895 #if GFX_VER == 9
8896    iris_emit_pipe_control_flush(batch, "before generation",
8897                                 PIPE_CONTROL_VF_CACHE_INVALIDATE);
8898 #endif
8899 
8900    struct iris_address params_addr;
8901    struct iris_gen_indirect_params *params =
8902       genX(emit_indirect_generate)(batch, draw, indirect, sc,
8903                                    &params_addr);
8904 
8905    iris_emit_pipe_control_flush(batch, "after generation flush",
8906                                 ((ice->state.vs_uses_draw_params ||
8907                                   ice->state.vs_uses_derived_draw_params) ?
8908                                  PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
8909                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8910                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
8911                                 PIPE_CONTROL_CS_STALL);
8912 
8913    trace_intel_begin_draw(&batch->trace);
8914 
8915    /* Always pin the binder.  If we're emitting new binding table pointers,
8916     * we need it.  If not, we're probably inheriting old tables via the
8917     * context, and need it anyway.  Since true zero-bindings cases are
8918     * practically non-existent, just pin it and avoid last_res tracking.
8919     */
8920    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8921                       IRIS_DOMAIN_NONE);
8922 
8923    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8924     * Wa_16011107343 (same for gfx12)
8925     * We implement this by setting TCS dirty on each draw.
8926     */
8927    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8928        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8929       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8930    }
8931 
8932    iris_upload_dirty_render_state(ice, batch, draw, true);
8933 
8934    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8935 
8936    genX(maybe_emit_breakpoint)(batch, true);
8937 
8938 #if GFX_VER >= 12
8939    iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
8940       arb.PreParserDisableMask = true;
8941       arb.PreParserDisable = true;
8942    }
8943 #endif
8944 
8945    iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8946       bbs.AddressSpaceIndicator = ASI_PPGTT;
8947       bbs.BatchBufferStartAddress = (struct iris_address) {
8948          .bo = ice->draw.generation.ring_bo,
8949       };
8950    }
8951 
8952    /* Run the ring buffer one more time with the next set of commands */
8953    uint64_t inc_addr = iris_batch_current_address_u64(batch);
8954    {
8955       iris_emit_pipe_control_flush(batch,
8956                                    "post generated draws wait",
8957                                    PIPE_CONTROL_STALL_AT_SCOREBOARD |
8958                                    PIPE_CONTROL_CS_STALL);
8959 
8960       struct mi_builder b;
8961       mi_builder_init(&b, batch->screen->devinfo, batch);
8962 
8963       struct iris_address draw_base_addr = iris_address_add(
8964          params_addr,
8965          offsetof(struct iris_gen_indirect_params, draw_base));
8966 
8967       const uint32_t mocs =
8968          iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
8969       mi_builder_set_mocs(&b, mocs);
8970 
8971       mi_store(&b, mi_mem32(draw_base_addr),
8972                    mi_iadd(&b, mi_mem32(draw_base_addr),
8973                                mi_imm(params->ring_count)));
8974 
8975       iris_emit_pipe_control_flush(batch,
8976                                    "post generation base increment",
8977                                    PIPE_CONTROL_CS_STALL |
8978                                    PIPE_CONTROL_CONST_CACHE_INVALIDATE);
8979 
8980       iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8981          bbs.AddressSpaceIndicator = ASI_PPGTT;
8982          bbs.BatchBufferStartAddress = (struct iris_address) {
8983             .offset = gen_addr,
8984          };
8985       }
8986    }
8987 
8988    /* Exit of the ring buffer */
8989    uint64_t end_addr = iris_batch_current_address_u64(batch);
8990 
8991 #ifndef NDEBUG
8992    assert(command_bo == batch->bo);
8993 #endif
8994 
8995    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8996    genX(maybe_emit_breakpoint)(batch, false);
8997 
8998    iris_emit_pipe_control_flush(batch,
8999                                 "post generated draws wait",
9000                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9001                                 PIPE_CONTROL_CS_STALL);
9002 
9003    params->gen_addr = inc_addr;
9004    params->end_addr = end_addr;
9005 
9006    iris_batch_sync_region_end(batch);
9007 
9008    uint32_t count = (sc) ? sc->count : 0;
9009    count *= draw->instance_count ? draw->instance_count : 1;
9010    trace_intel_end_draw(&batch->trace, count, 0, 0);
9011 }
9012 
9013 static void
9014 iris_load_indirect_location(struct iris_context *ice,
9015                             struct iris_batch *batch,
9016                             const struct pipe_grid_info *grid)
9017 {
9018 #define GPGPU_DISPATCHDIMX 0x2500
9019 #define GPGPU_DISPATCHDIMY 0x2504
9020 #define GPGPU_DISPATCHDIMZ 0x2508
9021 
9022    assert(grid->indirect);
9023 
9024    struct iris_state_ref *grid_size = &ice->state.grid_size;
9025    struct iris_bo *bo = iris_resource_bo(grid_size->res);
9026    struct mi_builder b;
9027    mi_builder_init(&b, batch->screen->devinfo, batch);
9028    struct mi_value size_x = mi_mem32(ro_bo(bo, grid_size->offset + 0));
9029    struct mi_value size_y = mi_mem32(ro_bo(bo, grid_size->offset + 4));
9030    struct mi_value size_z = mi_mem32(ro_bo(bo, grid_size->offset + 8));
9031    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
9032    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
9033    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
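   /* The walker/dispatch command that follows is expected to be emitted with
    * its indirect-parameter bit set, so it reads the group counts from these
    * GPGPU_DISPATCHDIM* registers instead of from the command itself.
    */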
9034 }
9035 
9036 static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo)
9037 {
9038    // TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch
9039    return devinfo->has_indirect_unroll;
9040 }
9041 
9042 #if GFX_VERx10 >= 125
9043 
9044 static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
9045                                                 struct iris_batch *batch,
9046                                                 const struct pipe_grid_info *grid,
9047                                                 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd)
9048 {
9049    const struct iris_screen *screen = batch->screen;
9050    struct iris_compiled_shader *shader =
9051       ice->shaders.prog[MESA_SHADER_COMPUTE];
9052    const struct iris_cs_data *cs_data = iris_cs_data(shader);
9053    const struct intel_cs_dispatch_info dispatch =
9054       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9055    struct iris_bo *indirect = iris_resource_bo(grid->indirect);
9056    const int dispatch_size = dispatch.simd_size / 16;
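   /* dispatch.simd_size is typically 16 or 32 on these platforms, so this
    * maps to the SIMDSize/MessageSIMD encodings 1 (SIMD16) and 2 (SIMD32).
    */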
9057 
9058    struct GENX(COMPUTE_WALKER_BODY) body = {};
9059    body.SIMDSize            = dispatch_size;
9060    body.MessageSIMD         = dispatch_size;
9061    body.GenerateLocalID     = cs_data->generate_local_id != 0;
9062    body.EmitLocal           = cs_data->generate_local_id;
9063    body.WalkOrder           = cs_data->walk_order;
9064    body.TileLayout          = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9065                               TileY32bpe : Linear;
9066    body.LocalXMaximum       = grid->block[0] - 1;
9067    body.LocalYMaximum       = grid->block[1] - 1;
9068    body.LocalZMaximum       = grid->block[2] - 1;
9069    body.ExecutionMask       = dispatch.right_mask;
9070    body.PostSync.MOCS       = iris_mocs(NULL, &screen->isl_dev, 0);
9071    body.InterfaceDescriptor = idd;
9072    /* HSD 14016252163: Use of Morton walk order (and batching using a batch
9073     * size of 4) is expected to increase sampler cache hit rates by
9074     * increasing sample address locality within a subslice.
9075     */
9076 #if GFX_VER >= 30
9077    body.DispatchWalkOrder =
9078       cs_data->uses_sampler ? MortonWalk : LinearWalk;
9079    body.ThreadGroupBatchSize =
9080       cs_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1;
9081 #endif
9082 
9083    struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
9084    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {
9085       ind.PredicateEnable            =
9086          ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
9087       ind.MaxCount                   = 1;
9088       ind.COMPUTE_WALKER_BODY        = body;
9089       ind.ArgumentBufferStartAddress = indirect_bo;
9090       ind.MOCS                       =
9091          iris_mocs(indirect_bo.bo, &screen->isl_dev, 0);
9092    }
9093 }
9094 
9095 static void
9096 iris_upload_compute_walker(struct iris_context *ice,
9097                            struct iris_batch *batch,
9098                            const struct pipe_grid_info *grid)
9099 {
9100    const uint64_t stage_dirty = ice->state.stage_dirty;
9101    struct iris_screen *screen = batch->screen;
9102    const struct intel_device_info *devinfo = screen->devinfo;
9103    struct iris_binder *binder = &ice->state.binder;
9104    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9105    struct iris_compiled_shader *shader =
9106       ice->shaders.prog[MESA_SHADER_COMPUTE];
9107    const struct iris_cs_data *cs_data = iris_cs_data(shader);
9108    const struct intel_cs_dispatch_info dispatch =
9109       iris_get_cs_dispatch_info(devinfo, shader, grid->block);
9110 
9111    trace_intel_begin_compute(&batch->trace);
9112 
9113    if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
9114       iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
9115          cfe.MaximumNumberofThreads =
9116             devinfo->max_cs_threads * devinfo->subslice_total;
9117          uint32_t scratch_addr = pin_scratch_space(ice, batch, shader,
9118                                                    MESA_SHADER_COMPUTE);
9119          cfe.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
9120       }
9121    }
9122 
9123    struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
9124    idd.KernelStartPointer = KSP(shader);
9125    idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9126    idd.SharedLocalMemorySize =
9127       intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
9128    idd.PreferredSLMAllocationSize =
9129       intel_compute_preferred_slm_calc_encode_size(devinfo,
9130                                                    shader->total_shared,
9131                                                    dispatch.group_size,
9132                                                    dispatch.simd_size);
9133    idd.SamplerStatePointer = shs->sampler_table.offset;
9134    idd.SamplerCount = encode_sampler_count(shader);
9135    idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
9136    /* Typically set to 0 to avoid prefetching on every thread dispatch. */
9137    idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
9138       0 : MIN2(shader->bt.size_bytes / 4, 31);
9139    idd.NumberOfBarriers = cs_data->uses_barrier;
9140 #if GFX_VER >= 30
9141    idd.RegistersPerThread = ptl_register_blocks(shader->brw_prog_data->grf_used);
9142 #endif
9143 
9144    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9145 
9146    if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) {
9147       iris_emit_execute_indirect_dispatch(ice, batch, grid, idd);
9148    } else {
9149       if (grid->indirect)
9150          iris_load_indirect_location(ice, batch, grid);
9151 
9152       iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9153 
9154       ice->utrace.last_compute_walker =
9155          iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
9156 
9157       struct GENX(COMPUTE_WALKER_BODY) body = {
9158          .SIMDSize                       = dispatch.simd_size / 16,
9159          .MessageSIMD                    = dispatch.simd_size / 16,
9160          .LocalXMaximum                  = grid->block[0] - 1,
9161          .LocalYMaximum                  = grid->block[1] - 1,
9162          .LocalZMaximum                  = grid->block[2] - 1,
9163          .ThreadGroupIDXDimension        = grid->grid[0],
9164          .ThreadGroupIDYDimension        = grid->grid[1],
9165          .ThreadGroupIDZDimension        = grid->grid[2],
9166          .ExecutionMask                  = dispatch.right_mask,
9167          .PostSync.MOCS                  = iris_mocs(NULL, &screen->isl_dev, 0),
9168          .InterfaceDescriptor            = idd,
9169 
9170 #if GFX_VERx10 >= 125
9171          .GenerateLocalID = cs_data->generate_local_id != 0,
9172          .EmitLocal       = cs_data->generate_local_id,
9173          .WalkOrder       = cs_data->walk_order,
9174          .TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9175                        TileY32bpe : Linear,
9176 #endif
9177       };
9178 
9179       _iris_pack_command(batch, GENX(COMPUTE_WALKER),
9180                          ice->utrace.last_compute_walker, cw) {
9181          cw.IndirectParameterEnable        = grid->indirect;
9182          cw.body                           = body;
9183          assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0);
9184       }
9185    }
9186 
9187    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9188 }
9189 
9190 #else /* #if GFX_VERx10 >= 125 */
9191 
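/**
 * Upload compute dispatch state and emit GPGPU_WALKER via the legacy
 * MEDIA_VFE_STATE / MEDIA_INTERFACE_DESCRIPTOR_LOAD path (GFX_VERx10 < 125).
 */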
9192 static void
9193 iris_upload_gpgpu_walker(struct iris_context *ice,
9194                          struct iris_batch *batch,
9195                          const struct pipe_grid_info *grid)
9196 {
9197    const uint64_t stage_dirty = ice->state.stage_dirty;
9198    struct iris_screen *screen = batch->screen;
9199    const struct intel_device_info *devinfo = screen->devinfo;
9200    struct iris_binder *binder = &ice->state.binder;
9201    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9202    struct iris_uncompiled_shader *ish =
9203       ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
9204    struct iris_compiled_shader *shader =
9205       ice->shaders.prog[MESA_SHADER_COMPUTE];
9206    struct iris_cs_data *cs_data = iris_cs_data(shader);
9207    const struct intel_cs_dispatch_info dispatch =
9208       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9209 
9210    trace_intel_begin_compute(&batch->trace);
9211 
9212    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9213        cs_data->local_size[0] == 0 /* Variable local group size */) {
9214       /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
9215        *
9216        *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
9217        *    the only bits that are changed are scoreboard related: Scoreboard
9218        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
9219        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
9220        *    sufficient."
9221        */
9222       iris_emit_pipe_control_flush(batch,
9223                                    "workaround: stall before MEDIA_VFE_STATE",
9224                                    PIPE_CONTROL_CS_STALL);
9225 
9226       iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
9227          if (shader->total_scratch) {
9228             uint32_t scratch_addr =
9229                pin_scratch_space(ice, batch, shader, MESA_SHADER_COMPUTE);
9230 
9231             vfe.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
9232             vfe.ScratchSpaceBasePointer =
9233                rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
9234          }
9235 
9236          vfe.MaximumNumberofThreads =
9237             devinfo->max_cs_threads * devinfo->subslice_total - 1;
9238 #if GFX_VER < 11
9239          vfe.ResetGatewayTimer =
9240             Resettingrelativetimerandlatchingtheglobaltimestamp;
9241 #endif
9242 #if GFX_VER == 8
9243          vfe.BypassGatewayControl = true;
9244 #endif
9245          vfe.NumberofURBEntries = 2;
9246          vfe.URBEntryAllocationSize = 2;
9247 
9248          vfe.CURBEAllocationSize =
9249             ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
9250                   cs_data->push.cross_thread.regs, 2);
9251       }
9252    }
9253 
9254    /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
9255    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9256        cs_data->local_size[0] == 0 /* Variable local group size */) {
9257       uint32_t curbe_data_offset = 0;
9258       assert(cs_data->push.cross_thread.dwords == 0 &&
9259              cs_data->push.per_thread.dwords == 1 &&
9260              cs_data->first_param_is_builtin_subgroup_id);
9261       const unsigned push_const_size =
9262          iris_cs_push_const_total_size(shader, dispatch.threads);
9263       uint32_t *curbe_data_map =
9264          stream_state(batch, ice->state.dynamic_uploader,
9265                       &ice->state.last_res.cs_thread_ids,
9266                       ALIGN(push_const_size, 64), 64,
9267                       &curbe_data_offset);
9268       assert(curbe_data_map);
9269       memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
9270       iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
9271                                      curbe_data_map);
9272 
9273       iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
9274          curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
9275          curbe.CURBEDataStartAddress = curbe_data_offset;
9276       }
9277    }
9278 
9279    for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
9280       struct pipe_resource *res = ice->state.global_bindings[i];
9281       if (!res)
9282          break;
9283 
9284       iris_use_pinned_bo(batch, iris_resource_bo(res),
9285                          true, IRIS_DOMAIN_NONE);
9286    }
9287 
9288    if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
9289                       IRIS_STAGE_DIRTY_BINDINGS_CS |
9290                       IRIS_STAGE_DIRTY_CONSTANTS_CS |
9291                       IRIS_STAGE_DIRTY_CS)) {
9292       uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
9293 
9294       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
9295          idd.SharedLocalMemorySize =
9296             intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
9297          idd.KernelStartPointer =
9298             KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
9299          idd.SamplerStatePointer = shs->sampler_table.offset;
9300          idd.BindingTablePointer =
9301             binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
9302          idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9303       }
9304 
9305       for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
9306          desc[i] |= ((uint32_t *) shader->derived_data)[i];
9307 
9308       iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
9309          load.InterfaceDescriptorTotalLength =
9310             GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
9311          load.InterfaceDescriptorDataStartAddress =
9312             emit_state(batch, ice->state.dynamic_uploader,
9313                        &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
9314       }
9315    }
9316 
9317    if (grid->indirect)
9318       iris_load_indirect_location(ice, batch, grid);
9319 
9320    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9321 
9322    iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
9323       ggw.IndirectParameterEnable    = grid->indirect != NULL;
9324       ggw.SIMDSize                   = dispatch.simd_size / 16;
9325       ggw.ThreadDepthCounterMaximum  = 0;
9326       ggw.ThreadHeightCounterMaximum = 0;
9327       ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
9328       ggw.ThreadGroupIDXDimension    = grid->grid[0];
9329       ggw.ThreadGroupIDYDimension    = grid->grid[1];
9330       ggw.ThreadGroupIDZDimension    = grid->grid[2];
9331       ggw.RightExecutionMask         = dispatch.right_mask;
9332       ggw.BottomExecutionMask        = 0xffffffff;
9333    }
9334 
9335    iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
9336 
9337    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9338 }
9339 
9340 #endif /* #if GFX_VERx10 >= 125 */
9341 
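/**
 * Upload all dirty compute-related state (bindings, samplers, system values)
 * and emit the platform-appropriate walker command for a grid launch.
 */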
9342 static void
9343 iris_upload_compute_state(struct iris_context *ice,
9344                           struct iris_batch *batch,
9345                           const struct pipe_grid_info *grid)
9346 {
9347    struct iris_screen *screen = batch->screen;
9348    const uint64_t stage_dirty = ice->state.stage_dirty;
9349    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9350    struct iris_compiled_shader *shader =
9351       ice->shaders.prog[MESA_SHADER_COMPUTE];
9352    struct iris_border_color_pool *border_color_pool =
9353       iris_bufmgr_get_border_color_pool(screen->bufmgr);
9354 
9355    iris_batch_sync_region_start(batch);
9356 
9357    /* Always pin the binder.  If we're emitting new binding table pointers,
9358     * we need it.  If not, we're probably inheriting old tables via the
9359     * context, and need it anyway.  Since true zero-bindings cases are
9360     * practically non-existent, just pin it and avoid last_res tracking.
9361     */
9362    iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
9363 
9364    if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
9365         shs->sysvals_need_upload) ||
9366        shader->kernel_input_size > 0)
9367       upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
9368 
9369    if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
9370       iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
9371 
9372    if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
9373       iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
9374 
9375    iris_use_optional_res(batch, shs->sampler_table.res, false,
9376                          IRIS_DOMAIN_NONE);
9377    iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
9378                       IRIS_DOMAIN_NONE);
9379 
9380    if (ice->state.need_border_colors)
9381       iris_use_pinned_bo(batch, border_color_pool->bo, false,
9382                          IRIS_DOMAIN_NONE);
9383 
9384 #if GFX_VER >= 12
9385    genX(invalidate_aux_map_state)(batch);
9386 #endif
9387 
9388 #if GFX_VERx10 >= 125
9389    iris_upload_compute_walker(ice, batch, grid);
9390 #else
9391    iris_upload_gpgpu_walker(ice, batch, grid);
9392 #endif
9393 
9394    if (!batch->contains_draw_with_next_seqno) {
9395       iris_restore_compute_saved_bos(ice, batch, grid);
9396       batch->contains_draw_with_next_seqno = batch->contains_draw = true;
9397    }
9398 
9399    iris_batch_sync_region_end(batch);
9400 }
9401 
9402 /**
9403  * State module teardown.
9404  */
9405 static void
9406 iris_destroy_state(struct iris_context *ice)
9407 {
9408    struct iris_genx_state *genx = ice->state.genx;
9409 
9410    pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
9411 
9412    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
9413    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
9414    pipe_resource_reference(&ice->draw.generation.params.res, NULL);
9415    pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
9416 
9417    /* Loop over all VBOs, including ones for draw parameters */
9418    for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
9419       pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
9420    }
9421 
9422    free(ice->state.genx);
9423 
9424    for (int i = 0; i < 4; i++) {
9425       pipe_so_target_reference(&ice->state.so_target[i], NULL);
9426    }
9427 
9428    util_unreference_framebuffer_state(&ice->state.framebuffer);
9429 
9430    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
9431       struct iris_shader_state *shs = &ice->state.shaders[stage];
9432       pipe_resource_reference(&shs->sampler_table.res, NULL);
9433       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
9434          pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
9435          pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
9436       }
9437       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
9438          pipe_resource_reference(&shs->image[i].base.resource, NULL);
9439          pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
9440          free(shs->image[i].surface_state.cpu);
9441       }
9442       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
9443          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
9444          pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
9445       }
9446       for (int i = 0; i < IRIS_MAX_TEXTURES; i++) {
9447          pipe_sampler_view_reference((struct pipe_sampler_view **)
9448                                      &shs->textures[i], NULL);
9449       }
9450    }
9451 
9452    pipe_resource_reference(&ice->state.grid_size.res, NULL);
9453    pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
9454 
9455    pipe_resource_reference(&ice->state.null_fb.res, NULL);
9456    pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
9457 
9458    pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
9459    pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
9460    pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
9461    pipe_resource_reference(&ice->state.last_res.scissor, NULL);
9462    pipe_resource_reference(&ice->state.last_res.blend, NULL);
9463    pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
9464    pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
9465    pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
9466 }
9467 
9468 /* ------------------------------------------------------------------- */
9469 
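/**
 * A buffer resource's backing storage (and therefore its GPU address) has
 * changed.  Patch any addresses baked into saved state (vertex buffers,
 * stream output targets, constant buffers, SSBOs, sampler views, and image
 * views) and flag the affected state for re-emission.
 */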
9470 static void
9471 iris_rebind_buffer(struct iris_context *ice,
9472                    struct iris_resource *res)
9473 {
9474    struct pipe_context *ctx = &ice->ctx;
9475    struct iris_genx_state *genx = ice->state.genx;
9476 
9477    assert(res->base.b.target == PIPE_BUFFER);
9478 
9479    /* Buffers can't be framebuffer attachments, nor display related,
9480     * and we don't have upstream Clover support.
9481     */
9482    assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
9483                                  PIPE_BIND_RENDER_TARGET |
9484                                  PIPE_BIND_BLENDABLE |
9485                                  PIPE_BIND_DISPLAY_TARGET |
9486                                  PIPE_BIND_CURSOR |
9487                                  PIPE_BIND_COMPUTE_RESOURCE |
9488                                  PIPE_BIND_GLOBAL)));
9489 
9490    if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
9491       uint64_t bound_vbs = ice->state.bound_vertex_buffers;
9492       while (bound_vbs) {
9493          const int i = u_bit_scan64(&bound_vbs);
9494          struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
9495 
9496          /* Update the CPU struct */
9497          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
9498          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
9499          uint64_t *addr = (uint64_t *) &state->state[1];
9500          struct iris_bo *bo = iris_resource_bo(state->resource);
9501 
9502          if (*addr != bo->address + state->offset) {
9503             *addr = bo->address + state->offset;
9504             ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
9505                                 IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
9506          }
9507       }
9508    }
9509 
9510    /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
9511     * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
9512     *
9513     * There is also no need to handle these:
9514     * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
9515     * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
9516     */
9517 
9518    if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
9519       uint32_t *so_buffers = genx->so_buffers;
9520       for (unsigned i = 0; i < 4; i++,
9521            so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
9522 
9523          /* There are no other fields in bits 127:64 */
9524          uint64_t *addr = (uint64_t *) &so_buffers[2];
9525          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
9526          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
9527 
9528          struct pipe_stream_output_target *tgt = ice->state.so_target[i];
9529          if (tgt) {
9530             struct iris_bo *bo = iris_resource_bo(tgt->buffer);
9531             if (*addr != bo->address + tgt->buffer_offset) {
9532                *addr = bo->address + tgt->buffer_offset;
9533                ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
9534             }
9535          }
9536       }
9537    }
9538 
9539    for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
9540       struct iris_shader_state *shs = &ice->state.shaders[s];
9541       enum pipe_shader_type p_stage = stage_to_pipe(s);
9542 
9543       if (!(res->bind_stages & (1 << s)))
9544          continue;
9545 
9546       if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
9547          /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
9548          uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
9549          while (bound_cbufs) {
9550             const int i = u_bit_scan(&bound_cbufs);
9551             struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
9552             struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
9553 
9554             if (res->bo == iris_resource_bo(cbuf->buffer)) {
9555                pipe_resource_reference(&surf_state->res, NULL);
9556                shs->dirty_cbufs |= 1u << i;
9557                ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
9558                                     IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
9559                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
9560             }
9561          }
9562       }
9563 
9564       if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
9565          uint32_t bound_ssbos = shs->bound_ssbos;
9566          while (bound_ssbos) {
9567             const int i = u_bit_scan(&bound_ssbos);
9568             struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
9569 
9570             if (res->bo == iris_resource_bo(ssbo->buffer)) {
9571                struct pipe_shader_buffer buf = {
9572                   .buffer = &res->base.b,
9573                   .buffer_offset = ssbo->buffer_offset,
9574                   .buffer_size = ssbo->buffer_size,
9575                };
9576                iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
9577                                        (shs->writable_ssbos >> i) & 1);
9578             }
9579          }
9580       }
9581 
9582       if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
9583          int i;
9584          BITSET_FOREACH_SET(i, shs->bound_sampler_views, IRIS_MAX_TEXTURES) {
9585             struct iris_sampler_view *isv = shs->textures[i];
9586             struct iris_bo *bo = isv->res->bo;
9587 
9588             if (update_surface_state_addrs(ice->state.surface_uploader,
9589                                            &isv->surface_state, bo)) {
9590                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9591             }
9592          }
9593       }
9594 
9595       if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
9596          uint64_t bound_image_views = shs->bound_image_views;
9597          while (bound_image_views) {
9598             const int i = u_bit_scan64(&bound_image_views);
9599             struct iris_image_view *iv = &shs->image[i];
9600             struct iris_bo *bo = iris_resource_bo(iv->base.resource);
9601 
9602             if (update_surface_state_addrs(ice->state.surface_uploader,
9603                                            &iv->surface_state, bo)) {
9604                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9605             }
9606          }
9607       }
9608    }
9609 }
9610 
9611 /* ------------------------------------------------------------------- */
9612 
9613 /**
9614  * Introduce a batch synchronization boundary, and update its cache coherency
9615  * status to reflect the execution of a PIPE_CONTROL command with the
9616  * specified flags.
9617  */
9618 static void
9619 batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
9620 {
9621    const struct intel_device_info *devinfo = batch->screen->devinfo;
9622 
9623    iris_batch_sync_boundary(batch);
9624 
9625    if ((flags & PIPE_CONTROL_CS_STALL)) {
9626       if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9627          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9628 
9629       if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9630          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9631 
9632       if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
9633          /* A tile cache flush makes any C/Z data in L3 visible to memory. */
9634          const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
9635          const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
9636          batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
9637          batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
9638       }
9639 
9640       if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9641          /* HDC and DC flushes both flush the data cache out to L3 */
9642          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9643       }
9644 
9645       if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9646          /* A DC flush also flushes L3 data cache lines out to memory. */
9647          const unsigned i = IRIS_DOMAIN_DATA_WRITE;
9648          batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
9649       }
9650 
9651       if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9652          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9653 
9654       if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
9655                     PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
9656          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
9657          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9658          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9659          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
9660       }
9661    }
9662 
9663    if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9664       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9665 
9666    if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9667       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9668 
9669    if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH))
9670       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9671 
9672    if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9673       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9674 
9675    if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
9676       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
9677 
9678    if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE))
9679       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9680 
9681    /* Technically, to invalidate IRIS_DOMAIN_PULL_CONSTANT_READ, we need
9682     * both "Constant Cache Invalidate" and either "Texture Cache Invalidate"
9683     * or "Data Cache Flush" set, depending on the setting of
9684     * iris_indirect_ubos_use_sampler().
9685     *
9686     * However, "Data Cache Flush" and "Constant Cache Invalidate" will never
9687     * appear in the same PIPE_CONTROL command, because one is bottom-of-pipe
9688     * while the other is top-of-pipe.  Because we only look at one flush at
9689     * a time, we won't see both together.
9690     *
9691     * To deal with this, we mark it as invalidated when the constant cache
9692     * is invalidated, and trust the callers to also flush the other related
9693     * cache correctly at the same time.
9694     */
9695    if ((flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
9696       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9697 
9698    /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
9699 
9700    if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
9701       /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
9702        * domains will now be visible to those L3 clients.
9703        */
9704       for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
9705          if (!iris_domain_is_l3_coherent(devinfo, i))
9706             batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
9707       }
9708    }
9709 }
9710 
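/**
 * Translate the PIPE_CONTROL_WRITE_* flags into the hardware Post Sync
 * Operation field encoding.  At most one write flag is expected to be set.
 */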
9711 static unsigned
9712 flags_to_post_sync_op(uint32_t flags)
9713 {
9714    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
9715       return WriteImmediateData;
9716 
9717    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
9718       return WritePSDepthCount;
9719 
9720    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
9721       return WriteTimestamp;
9722 
9723    return 0;
9724 }
9725 
9726 /**
9727  * Do the given flags have a Post Sync or LRI Post Sync operation?
9728  */
9729 static enum pipe_control_flags
9730 get_post_sync_flags(enum pipe_control_flags flags)
9731 {
9732    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
9733             PIPE_CONTROL_WRITE_DEPTH_COUNT |
9734             PIPE_CONTROL_WRITE_TIMESTAMP |
9735             PIPE_CONTROL_LRI_POST_SYNC_OP;
9736 
9737    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
9738     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
9739     */
9740    assert(util_bitcount(flags) <= 1);
9741 
9742    return flags;
9743 }
9744 
9745 #define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)
9746 
9747 /**
9748  * Emit a series of PIPE_CONTROL commands, taking into account any
9749  * workarounds necessary to actually accomplish the caller's request.
9750  *
9751  * Unless otherwise noted, spec quotations in this function come from:
9752  *
9753  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
9754  * Restrictions for PIPE_CONTROL.
9755  *
9756  * You should not use this function directly.  Use the helpers in
9757  * iris_pipe_control.c instead, which may split the pipe control further.
9758  */
9759 static void
9760 iris_emit_raw_pipe_control(struct iris_batch *batch,
9761                            const char *reason,
9762                            uint32_t flags,
9763                            struct iris_bo *bo,
9764                            uint32_t offset,
9765                            uint64_t imm)
9766 {
9767    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
9768    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
9769    enum pipe_control_flags non_lri_post_sync_flags =
9770       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
9771 
9772 #if GFX_VER >= 12
9773    if (batch->name == IRIS_BATCH_BLITTER) {
9774       batch_mark_sync_for_pipe_control(batch, flags);
9775       iris_batch_sync_region_start(batch);
9776 
9777       assert(!(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT));
9778 
9779       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
9780       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
9781          batch_emit_fast_color_dummy_blit(batch);
9782 
9783       /* The blitter doesn't actually use PIPE_CONTROL; rather it uses the
9784        * MI_FLUSH_DW command.  However, all of our code is set up to flush
9785        * via emitting a pipe control, so we just translate it at this point,
9786        * even if it is a bit hacky.
9787        */
9788       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
9789          fd.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
9790          fd.ImmediateData = imm;
9791          fd.PostSyncOperation = flags_to_post_sync_op(flags);
9792 #if GFX_VERx10 >= 125
9793          /* TODO: This may not always be necessary */
9794          fd.FlushCCS = true;
9795 #endif
9796       }
9797       iris_batch_sync_region_end(batch);
9798       return;
9799    }
9800 #endif
9801 
9802    /* The "L3 Read Only Cache Invalidation Bit" docs say it "controls the
9803     * invalidation of the Geometry streams cached in L3 cache at the top
9804     * of the pipe".  In other words, index & vertex data that gets cached
9805     * in L3 when VERTEX_BUFFER_STATE::L3BypassDisable is set.
9806     *
9807     * Normally, invalidating L1/L2 read-only caches also invalidate their
9808     * related L3 cachelines, but this isn't the case for the VF cache.
9809     * Emulate it by setting the L3 Read Only bit when doing a VF invalidate.
9810     */
9811    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)
9812       flags |= PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
9813 
9814    /* Recursive PIPE_CONTROL workarounds --------------------------------
9815     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
9816     *
9817     * We do these first because we want to look at the original operation,
9818     * rather than any workarounds we set.
9819     */
9820    if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
9821       /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
9822        * lists several workarounds:
9823        *
9824        *    "Project: SKL, KBL, BXT
9825        *
9826        *     If the VF Cache Invalidation Enable is set to a 1 in a
9827        *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
9828        *     sets to 0, with the VF Cache Invalidation Enable set to 0
9829        *     needs to be sent prior to the PIPE_CONTROL with VF Cache
9830        *     Invalidation Enable set to a 1."
9831        */
9832       iris_emit_raw_pipe_control(batch,
9833                                  "workaround: recursive VF cache invalidate",
9834                                  0, NULL, 0, 0);
9835    }
9836 
9837    if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
9838       /* Project: SKL / Argument: LRI Post Sync Operation [23]
9839        *
9840        * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9841        *  programmed prior to programming a PIPECONTROL command with "LRI
9842        *  Post Sync Operation" in GPGPU mode of operation (i.e when
9843        *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
9844        *
9845        * The same text exists a few rows below for Post Sync Op.
9846        */
9847       iris_emit_raw_pipe_control(batch,
9848                                  "workaround: CS stall before gpgpu post-sync",
9849                                  PIPE_CONTROL_CS_STALL, bo, offset, imm);
9850    }
9851 
9852    /* "Flush Types" workarounds ---------------------------------------------
9853     * We do these now because they may add post-sync operations or CS stalls.
9854     */
9855 
9856    if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
9857       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
9858        *
9859        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
9860        *  'Write PS Depth Count' or 'Write Timestamp'."
9861        */
9862       if (!bo) {
9863          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9864          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9865          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9866          bo = batch->screen->workaround_address.bo;
9867          offset = batch->screen->workaround_address.offset;
9868       }
9869    }
9870 
9871    if (flags & PIPE_CONTROL_DEPTH_STALL) {
9872       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
9873        *
9874        *    "This bit must be DISABLED for operations other than writing
9875        *     PS_DEPTH_COUNT."
9876        *
9877        * This seems like nonsense.  An Ivybridge workaround requires us to
9878        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
9879        * operation.  Gfx8+ requires us to emit depth stalls and depth cache
9880        * flushes together.  So, it's hard to imagine this means anything other
9881        * than "we originally intended this to be used for PS_DEPTH_COUNT".
9882        *
9883        * We ignore the supposed restriction and do nothing.
9884        */
9885    }
9886 
9887    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
9888                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9889       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
9890        *
9891        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
9892        *     PS_DEPTH_COUNT or TIMESTAMP queries."
9893        *
9894        * TODO: Implement end-of-pipe checking.
9895        */
9896       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
9897                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
9898    }
9899 
9900    if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9901       /* From the PIPE_CONTROL instruction table, bit 1:
9902        *
9903        *    "This bit is ignored if Depth Stall Enable is set.
9904        *     Further, the render cache is not flushed even if Write Cache
9905        *     Flush Enable bit is set."
9906        *
9907        * We assert that the caller doesn't do this combination, to try and
9908        * prevent mistakes.  It shouldn't hurt the GPU, though.
9909        *
9910        * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
9911        * and "Render Target Flush" combo is explicitly required for BTI
9912        * update workarounds.
9913        */
9914       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
9915                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
9916    }
9917 
9918    /* PIPE_CONTROL page workarounds ------------------------------------- */
9919 
9920    if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
9921       /* From the PIPE_CONTROL page itself:
9922        *
9923        *    "IVB, HSW, BDW
9924        *     Restriction: Pipe_control with CS-stall bit set must be issued
9925        *     before a pipe-control command that has the State Cache
9926        *     Invalidate bit set."
9927        */
9928       flags |= PIPE_CONTROL_CS_STALL;
9929    }
9930 
9931    if (flags & PIPE_CONTROL_FLUSH_LLC) {
9932       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
9933        *
9934        *    "Project: ALL
9935        *     SW must always program Post-Sync Operation to "Write Immediate
9936        *     Data" when Flush LLC is set."
9937        *
9938        * For now, we just require the caller to do it.
9939        */
9940       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
9941    }
9942 
9943    /* Emulate a HDC flush with a full Data Cache Flush on older hardware which
9944     * doesn't support the new lightweight flush.
9945     */
9946 #if GFX_VER < 12
9947       if (flags & PIPE_CONTROL_FLUSH_HDC)
9948          flags |= PIPE_CONTROL_DATA_CACHE_FLUSH;
9949 #endif
9950 
9951    /* "Post-Sync Operation" workarounds -------------------------------- */
9952 
9953    /* Project: All / Argument: Global Snapshot Count Reset [19]
9954     *
9955     * "This bit must not be exercised on any product.
9956     *  Requires stall bit ([20] of DW1) set."
9957     *
9958     * We don't use this, so we just assert that it isn't used.  The
9959     * PIPE_CONTROL instruction page indicates that they intended this
9960     * as a debug feature and don't think it is useful in production,
9961     * but it may actually be usable, should we ever want to.
9962     */
9963    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
9964 
9965    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
9966                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
9967       /* Project: All / Arguments:
9968        *
9969        * - Generic Media State Clear [16]
9970        * - Indirect State Pointers Disable [16]
9971        *
9972        *    "Requires stall bit ([20] of DW1) set."
9973        *
9974        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
9975        * State Clear) says:
9976        *
9977        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
9978        *     programmed prior to programming a PIPECONTROL command with "Media
9979        *     State Clear" set in GPGPU mode of operation"
9980        *
9981        * This is a subset of the earlier rule, so there's nothing to do.
9982        */
9983       flags |= PIPE_CONTROL_CS_STALL;
9984    }
9985 
9986    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
9987       /* Project: All / Argument: Store Data Index
9988        *
9989        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9990        *  than '0'."
9991        *
9992        * For now, we just assert that the caller does this.  We might want to
9993        * automatically add a write to the workaround BO...
9994        */
9995       assert(non_lri_post_sync_flags != 0);
9996    }
9997 
9998    if (flags & PIPE_CONTROL_SYNC_GFDT) {
9999       /* Project: All / Argument: Sync GFDT
10000        *
10001        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
10002        *  than '0' or 0x2520[13] must be set."
10003        *
10004        * For now, we just assert that the caller does this.
10005        */
10006       assert(non_lri_post_sync_flags != 0);
10007    }
10008 
10009    if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
10010       /* Project: IVB+ / Argument: TLB inv
10011        *
10012        *    "Requires stall bit ([20] of DW1) set."
10013        *
10014        * Also, from the PIPE_CONTROL instruction table:
10015        *
10016        *    "Project: SKL+
10017        *     Post Sync Operation or CS stall must be set to ensure a TLB
10018        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
10019        *     cache to invalidate."
10020        *
10021        * This is not a subset of the earlier rule, so there's nothing to do.
10022        */
10023       flags |= PIPE_CONTROL_CS_STALL;
10024    }
10025 
10026    if (GFX_VER == 9 && devinfo->gt == 4) {
10027       /* TODO: The big Skylake GT4 post sync op workaround */
10028    }
10029 
10030    /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
10031 
10032    if (IS_COMPUTE_PIPELINE(batch)) {
10033       if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
10034          /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
10035           * PIPE_CONTROL, Flush Types:
10036           *   "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
10037           * For newer platforms this is documented in the PIPE_CONTROL
10038           * instruction page.
10039           */
10040          flags |= PIPE_CONTROL_CS_STALL;
10041       }
10042 
10043       if (GFX_VER == 8 && (post_sync_flags ||
10044                            (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
10045                                      PIPE_CONTROL_DEPTH_STALL |
10046                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
10047                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
10048                                      PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
10049          /* Project: BDW / Arguments:
10050           *
10051           * - LRI Post Sync Operation   [23]
10052           * - Post Sync Op              [15:14]
10053           * - Notify En                 [8]
10054           * - Depth Stall               [13]
10055           * - Render Target Cache Flush [12]
10056           * - Depth Cache Flush         [0]
10057           * - DC Flush Enable           [5]
10058           *
10059           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
10060           *     Workloads."
10061           */
10062          flags |= PIPE_CONTROL_CS_STALL;
10063 
10064          /* Also, from the PIPE_CONTROL instruction table, bit 20:
10065           *
10066           *    "Project: BDW
10067           *     This bit must be always set when PIPE_CONTROL command is
10068           *     programmed by GPGPU and MEDIA workloads, except for the cases
10069           *     when only Read Only Cache Invalidation bits are set (State
10070           *     Cache Invalidation Enable, Instruction cache Invalidation
10071           *     Enable, Texture Cache Invalidation Enable, Constant Cache
10072           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
10073           *     need not implemented when FF_DOP_CG is disable via "Fixed
10074           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
10075           *
10076           * It sounds like we could avoid CS stalls in some cases, but we
10077           * don't currently bother.  This list isn't exactly the list above,
10078           * either...
10079           */
10080       }
10081    }
10082 
10083    /* "Stall" workarounds ----------------------------------------------
10084     * These have to come after the earlier ones because we may have added
10085     * some additional CS stalls above.
10086     */
10087 
10088    if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
10089       /* Project: PRE-SKL, VLV, CHV
10090        *
10091        * "[All Stepping][All SKUs]:
10092        *
10093        *  One of the following must also be set:
10094        *
10095        *  - Render Target Cache Flush Enable ([12] of DW1)
10096        *  - Depth Cache Flush Enable ([0] of DW1)
10097        *  - Stall at Pixel Scoreboard ([1] of DW1)
10098        *  - Depth Stall ([13] of DW1)
10099        *  - Post-Sync Operation ([13] of DW1)
10100        *  - DC Flush Enable ([5] of DW1)"
10101        *
10102        * If we don't already have one of those bits set, we choose to add
10103        * "Stall at Pixel Scoreboard".  Some of the other bits require a
10104        * CS stall as a workaround (see above), which would send us into
10105        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
10106        * appears to be safe, so we choose that.
10107        */
10108       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
10109                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
10110                                PIPE_CONTROL_WRITE_IMMEDIATE |
10111                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
10112                                PIPE_CONTROL_WRITE_TIMESTAMP |
10113                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
10114                                PIPE_CONTROL_DEPTH_STALL |
10115                                PIPE_CONTROL_DATA_CACHE_FLUSH;
10116       if (!(flags & wa_bits))
10117          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
10118    }
10119 
10120    if (INTEL_NEEDS_WA_1409600907 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
10121       /* Wa_1409600907:
10122        *
10123        * "PIPE_CONTROL with Depth Stall Enable bit must be set
10124        * with any PIPE_CONTROL with Depth Flush Enable bit set."
10125        */
10126       flags |= PIPE_CONTROL_DEPTH_STALL;
10127    }
10128 
10129    /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
10130     * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
10131     * with CS_STALL Bit set (with No POST_SYNC ENABLED)
10132     */
10133    if (intel_device_info_is_adln(devinfo) &&
10134        IS_COMPUTE_PIPELINE(batch) &&
10135        flags_to_post_sync_op(flags) != NoWrite) {
10136       iris_emit_raw_pipe_control(batch, "Wa_14014966230",
10137                                  PIPE_CONTROL_CS_STALL, NULL, 0, 0);
10138    }
10139 
10140    batch_mark_sync_for_pipe_control(batch, flags);
10141 
10142 #if INTEL_NEEDS_WA_14010840176
10143    /* "If the intention of “constant cache invalidate” is
10144     *  to invalidate the L1 cache (which can cache constants), use “HDC
10145     *  pipeline flush” instead of Constant Cache invalidate command."
10146     *
10147     * "If L3 invalidate is needed, the w/a should be to set state invalidate
10148     * in the pipe control command, in addition to the HDC pipeline flush."
10149     */
10150    if (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) {
10151       flags &= ~PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10152       flags |= PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10153    }
10154 #endif
10155 
10156    /* Emit --------------------------------------------------------------- */
10157 
10158    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
10159       fprintf(stderr,
10160               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
10161               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
10162               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
10163               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
10164               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
10165               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
10166               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
10167               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
10168               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
10169               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
10170               (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
10171               (flags & PIPE_CONTROL_L3_FABRIC_FLUSH) ? "L3Fabric " : "",
10172               (flags & PIPE_CONTROL_CCS_CACHE_FLUSH) ? "CCS " : "",
10173               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
10174               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
10175               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
10176               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
10177               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
10178               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
10179               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
10180                  "SnapRes" : "",
10181               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
10182                   "ISPDis" : "",
10183               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
10184               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
10185               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
10186               (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
10187               (flags & PIPE_CONTROL_PSS_STALL_SYNC) ? "PSS " : "",
10188               (flags & PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH) ? "UntypedDataPortCache " : "",
10189               imm, reason);
10190    }
10191 
10192    iris_batch_sync_region_start(batch);
10193 
10194    const bool trace_pc =
10195       (flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CACHE_INVALIDATE_BITS)) != 0;
10196 
10197    if (trace_pc)
10198       trace_intel_begin_stall(&batch->trace);
10199 
10200    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
10201 #if GFX_VERx10 >= 125
10202       pc.PSSStallSyncEnable = flags & PIPE_CONTROL_PSS_STALL_SYNC;
10203 #endif
10204 #if GFX_VER == 12
10205       pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
10206       pc.L3FabricFlush = flags & PIPE_CONTROL_L3_FABRIC_FLUSH;
10207 #endif
10208 #if GFX_VER > 11
10209       pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
10210 #endif
10211 #if GFX_VERx10 >= 125
10212       pc.UntypedDataPortCacheFlushEnable =
10213          (flags & (PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
10214                    PIPE_CONTROL_FLUSH_HDC |
10215                    PIPE_CONTROL_DATA_CACHE_FLUSH)) &&
10216          IS_COMPUTE_PIPELINE(batch);
10217       pc.HDCPipelineFlushEnable |= pc.UntypedDataPortCacheFlushEnable;
10218       pc.CCSFlushEnable |= flags & PIPE_CONTROL_CCS_CACHE_FLUSH;
10219 #endif
10220       pc.LRIPostSyncOperation = NoLRIOperation;
10221       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
10222       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
10223       pc.StoreDataIndex = 0;
10224       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
10225 #if GFX_VERx10 < 125
10226       pc.GlobalSnapshotCountReset =
10227          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
10228 #endif
10229       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
10230 #if GFX_VERx10 < 200
10231       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
10232 #endif
10233       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
10234       pc.RenderTargetCacheFlushEnable =
10235          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
10236       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
10237       pc.StateCacheInvalidationEnable =
10238          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10239 #if GFX_VER >= 12
10240       pc.L3ReadOnlyCacheInvalidationEnable =
10241          flags & PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
10242 #endif
10243       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
10244       pc.ConstantCacheInvalidationEnable =
10245          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10246       pc.PostSyncOperation = flags_to_post_sync_op(flags);
10247       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
10248       pc.InstructionCacheInvalidateEnable =
10249          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
10250       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
10251       pc.IndirectStatePointersDisable =
10252          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
10253       pc.TextureCacheInvalidationEnable =
10254          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
10255       pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
10256       pc.ImmediateData = imm;
10257    }
10258 
10259    if (trace_pc) {
10260       trace_intel_end_stall(&batch->trace, flags,
10261                             iris_utrace_pipe_flush_bit_to_ds_stall_flag,
10262                             reason, 0, 0, 0);
10263    }
10264 
10265    iris_batch_sync_region_end(batch);
10266 }
10267 
10268 #if GFX_VER == 9
10269 /**
10270  * Preemption on Gfx9 has to be enabled or disabled in various cases.
10271  *
10272  * See these workarounds for preemption:
10273  *  - WaDisableMidObjectPreemptionForGSLineStripAdj
10274  *  - WaDisableMidObjectPreemptionForTrifanOrPolygon
10275  *  - WaDisableMidObjectPreemptionForLineLoop
10276  *  - WA#0798
10277  *
10278  * We don't put this in the vtable because it's only used on Gfx9.
10279  */
10280 void
10281 gfx9_toggle_preemption(struct iris_context *ice,
10282                        struct iris_batch *batch,
10283                        const struct pipe_draw_info *draw)
10284 {
10285    struct iris_genx_state *genx = ice->state.genx;
10286    bool object_preemption = true;
10287 
10288    /* WaDisableMidObjectPreemptionForGSLineStripAdj
10289     *
10290     *    "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
10291     *     and GS is enabled."
10292     */
10293    if (draw->mode == MESA_PRIM_LINE_STRIP_ADJACENCY &&
10294        ice->shaders.prog[MESA_SHADER_GEOMETRY])
10295       object_preemption = false;
10296 
10297    /* WaDisableMidObjectPreemptionForTrifanOrPolygon
10298     *
10299     *    "TriFan miscompare in Execlist Preemption test. Cut index that is
10300     *     on a previous context. End the previous, the resume another context
10301     *     with a tri-fan or polygon, and the vertex count is corrupted. If we
10302     *     prempt again we will cause corruption.
10303     *
10304     *     WA: Disable mid-draw preemption when draw-call has a tri-fan."
10305     */
10306    if (draw->mode == MESA_PRIM_TRIANGLE_FAN)
10307       object_preemption = false;
10308 
10309    /* WaDisableMidObjectPreemptionForLineLoop
10310     *
10311     *    "VF Stats Counters Missing a vertex when preemption enabled.
10312     *
10313     *     WA: Disable mid-draw preemption when the draw uses a lineloop
10314     *     topology."
10315     */
10316    if (draw->mode == MESA_PRIM_LINE_LOOP)
10317       object_preemption = false;
10318 
10319    /* WA#0798
10320     *
10321     *    "VF is corrupting GAFS data when preempted on an instance boundary
10322     *     and replayed with instancing enabled.
10323     *
10324     *     WA: Disable preemption when using instancing."
10325     */
10326    if (draw->instance_count > 1)
10327       object_preemption = false;
10328 
10329    if (genx->object_preemption != object_preemption) {
10330       iris_enable_obj_preemption(batch, object_preemption);
10331       genx->object_preemption = object_preemption;
10332    }
10333 }
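/* A minimal sketch of how a draw-time caller might use this (hypothetical,
 * not the verbatim call site): invoke it once per draw, before the
 * 3DPRIMITIVE is emitted, so the preemption setting matches the primitive
 * actually being drawn:
 *
 *    gfx9_toggle_preemption(ice, batch, draw);
 */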
10334 #endif
10335 
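/**
 * Called when previously-recorded GPU state can no longer be relied upon
 * (e.g. after the batch containing it has been flushed), so the genxml
 * tracking here is reset and the corresponding packets get re-emitted.
 */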
10336 static void
10337 iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
10338 {
10339    struct iris_genx_state *genx = ice->state.genx;
10340 
10341 #if INTEL_NEEDS_WA_1808121037
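   /* We no longer know which mode the hardware was left in, so make the
    * workaround code program it again before the next depth packet.
    */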
10342    genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
10343 #endif
10344 
10345    memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
10346 }
10347 
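/**
 * Emit MI_REPORT_PERF_COUNT, asking the hardware to write a snapshot of the
 * OA performance counters (tagged with \p report_id) into \p bo at
 * \p offset_in_bytes.
 */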
10348 static void
10349 iris_emit_mi_report_perf_count(struct iris_batch *batch,
10350                                struct iris_bo *bo,
10351                                uint32_t offset_in_bytes,
10352                                uint32_t report_id)
10353 {
10354    iris_batch_sync_region_start(batch);
10355    iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
10356       mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
10357                                    IRIS_DOMAIN_OTHER_WRITE);
10358       mi_rpc.ReportID = report_id;
10359    }
10360    iris_batch_sync_region_end(batch);
10361 }
10362 
10363 /**
10364  * Update the pixel hashing modes that determine the balancing of PS threads
10365  * across subslices and slices.
10366  *
10367  * \param width Width bound of the rendering area (already scaled down if \p
10368  *              scale is greater than 1).
10369  * \param height Height bound of the rendering area (already scaled down if \p
10370  *               scale is greater than 1).
10371  * \param scale The number of framebuffer samples that could potentially be
10372  *              affected by an individual channel of the PS thread.  This is
10373  *              typically one for single-sampled rendering, but for operations
10374  *              like CCS resolves and fast clears a single PS invocation may
10375  *              update a huge number of pixels, in which case a finer
10376  *              balancing is desirable in order to maximally utilize the
10377  *              bandwidth available.  UINT_MAX can be used as shorthand for
10378  *              "finest hashing mode available".
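 *
 * A hypothetical caller (sketch only, not a verbatim call site): a fast
 * clear or CCS resolve that touches many pixels per PS channel might ask
 * for the finest hashing available,
 *
 *    genX(emit_hashing_mode)(ice, batch, fb_width, fb_height, UINT_MAX);
 *
 * while ordinary single-sampled rendering would pass scale == 1.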
10379  */
10380 void
10381 genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
10382                         unsigned width, unsigned height, unsigned scale)
10383 {
10384 #if GFX_VER == 9
10385    const struct intel_device_info *devinfo = batch->screen->devinfo;
10386    const unsigned slice_hashing[] = {
10387       /* Because all Gfx9 platforms with more than one slice require
10388        * three-way subslice hashing, a single "normal" 16x16 slice hashing
10389        * block is guaranteed to suffer from substantial imbalance, with one
10390        * subslice receiving twice as much work as the other two in the
10391        * slice.
10392        *
10393        * The performance impact of that would be particularly severe when
10394        * three-way hashing is also in use for slice balancing (which is the
10395        * case for all Gfx9 GT4 platforms), because one of the slices
10396        * receives one every three 16x16 blocks in either direction, which
10397        * is roughly the periodicity of the underlying subslice imbalance
10398        * pattern ("roughly" because in reality the hardware's
10399        * implementation of three-way hashing doesn't do exact modulo 3
10400        * arithmetic, which somewhat decreases the magnitude of this effect
10401        * in practice).  This leads to a systematic subslice imbalance
10402        * within that slice regardless of the size of the primitive.  The
10403        * 32x32 hashing mode guarantees that the subslice imbalance within a
10404        * single slice hashing block is minimal, largely eliminating this
10405        * effect.
10406        */
10407       _32x32,
10408       /* Finest slice hashing mode available. */
10409       NORMAL
10410    };
10411    const unsigned subslice_hashing[] = {
10412       /* 16x16 would provide a slight cache locality benefit especially
10413        * visible in the sampler L1 cache efficiency of low-bandwidth
10414        * non-LLC platforms, but it comes at the cost of greater subslice
10415        * imbalance for primitives of dimensions approximately intermediate
10416        * between 16x4 and 16x16.
10417        */
10418       _16x4,
10419       /* Finest subslice hashing mode available. */
10420       _8x4
10421    };
10422    /* Dimensions of the smallest hashing block of a given hashing mode.  If
10423     * the rendering area is smaller than this there can't possibly be any
10424     * benefit from switching to this mode, so we optimize out the
10425     * transition.
10426     */
10427    const unsigned min_size[][2] = {
10428       { 16, 4 },
10429       { 8, 4 }
10430    };
10431    const unsigned idx = scale > 1;
10432 
10433    if (width > min_size[idx][0] || height > min_size[idx][1]) {
10434       iris_emit_raw_pipe_control(batch,
10435                                  "workaround: CS stall before GT_MODE LRI",
10436                                  PIPE_CONTROL_STALL_AT_SCOREBOARD |
10437                                  PIPE_CONTROL_CS_STALL,
10438                                  NULL, 0, 0);
10439 
10440       iris_emit_reg(batch, GENX(GT_MODE), reg) {
10441          reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
10442          reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
10443          reg.SubsliceHashing = subslice_hashing[idx];
10444          reg.SubsliceHashingMask = -1;
10445       }
10446 
10447       ice->state.current_hash_scale = scale;
10448    }
10449 #endif
10450 }
10451 
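/**
 * The frontend can ask us to turn subsequent rendering into a no-op.  When
 * a batch's no-op state actually changes, flag all state as dirty so that
 * everything is re-emitted once real work resumes.
 */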
10452 static void
10453 iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
10454 {
10455    struct iris_context *ice = (struct iris_context *) ctx;
10456 
10457    if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
10458       ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
10459       ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
10460    }
10461 
10462    if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
10463       ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
10464       ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
10465    }
10466 }
10467 
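/**
 * Fill in the genxml-specific entry points in the screen vtable.
 */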
10468 void
10469 genX(init_screen_state)(struct iris_screen *screen)
10470 {
10471    assert(screen->devinfo->verx10 == GFX_VERx10);
10472    screen->vtbl.destroy_state = iris_destroy_state;
10473    screen->vtbl.init_render_context = iris_init_render_context;
10474    screen->vtbl.init_compute_context = iris_init_compute_context;
10475    screen->vtbl.init_copy_context = iris_init_copy_context;
10476    screen->vtbl.upload_render_state = iris_upload_render_state;
10477    screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
10478    screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
10479    screen->vtbl.update_binder_address = iris_update_binder_address;
10480    screen->vtbl.upload_compute_state = iris_upload_compute_state;
10481    screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
10482    screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
10483    screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
10484    screen->vtbl.rebind_buffer = iris_rebind_buffer;
10485    screen->vtbl.load_register_reg32 = iris_load_register_reg32;
10486    screen->vtbl.load_register_reg64 = iris_load_register_reg64;
10487    screen->vtbl.load_register_imm32 = iris_load_register_imm32;
10488    screen->vtbl.load_register_imm64 = iris_load_register_imm64;
10489    screen->vtbl.load_register_mem32 = iris_load_register_mem32;
10490    screen->vtbl.load_register_mem64 = iris_load_register_mem64;
10491    screen->vtbl.store_register_mem32 = iris_store_register_mem32;
10492    screen->vtbl.store_register_mem64 = iris_store_register_mem64;
10493    screen->vtbl.store_data_imm32 = iris_store_data_imm32;
10494    screen->vtbl.store_data_imm64 = iris_store_data_imm64;
10495    screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
10496    screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
10497    screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
10498    screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
10499    screen->vtbl.populate_vs_key = iris_populate_vs_key;
10500    screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
10501    screen->vtbl.populate_tes_key = iris_populate_tes_key;
10502    screen->vtbl.populate_gs_key = iris_populate_gs_key;
10503    screen->vtbl.populate_fs_key = iris_populate_fs_key;
10504    screen->vtbl.populate_cs_key = iris_populate_cs_key;
10505    screen->vtbl.lost_genx_state = iris_lost_genx_state;
10506    screen->vtbl.disable_rhwo_optimization = iris_disable_rhwo_optimization;
10507 }
10508 
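/**
 * Install the Gallium context hooks and set up initial context state.
 */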
10509 void
10510 genX(init_state)(struct iris_context *ice)
10511 {
10512    struct pipe_context *ctx = &ice->ctx;
10513    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
10514 
10515    ctx->create_blend_state = iris_create_blend_state;
10516    ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
10517    ctx->create_rasterizer_state = iris_create_rasterizer_state;
10518    ctx->create_sampler_state = iris_create_sampler_state;
10519    ctx->create_sampler_view = iris_create_sampler_view;
10520    ctx->create_surface = iris_create_surface;
10521    ctx->create_vertex_elements_state = iris_create_vertex_elements;
10522    ctx->bind_blend_state = iris_bind_blend_state;
10523    ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
10524    ctx->bind_sampler_states = iris_bind_sampler_states;
10525    ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
10526    ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
10527    ctx->delete_blend_state = iris_delete_state;
10528    ctx->delete_depth_stencil_alpha_state = iris_delete_state;
10529    ctx->delete_rasterizer_state = iris_delete_state;
10530    ctx->delete_sampler_state = iris_delete_state;
10531    ctx->delete_vertex_elements_state = iris_delete_state;
10532    ctx->set_blend_color = iris_set_blend_color;
10533    ctx->set_clip_state = iris_set_clip_state;
10534    ctx->set_constant_buffer = iris_set_constant_buffer;
10535    ctx->set_shader_buffers = iris_set_shader_buffers;
10536    ctx->set_shader_images = iris_set_shader_images;
10537    ctx->set_sampler_views = iris_set_sampler_views;
10538    ctx->set_compute_resources = iris_set_compute_resources;
10539    ctx->set_global_binding = iris_set_global_binding;
10540    ctx->set_tess_state = iris_set_tess_state;
10541    ctx->set_patch_vertices = iris_set_patch_vertices;
10542    ctx->set_framebuffer_state = iris_set_framebuffer_state;
10543    ctx->set_polygon_stipple = iris_set_polygon_stipple;
10544    ctx->set_sample_mask = iris_set_sample_mask;
10545    ctx->set_scissor_states = iris_set_scissor_states;
10546    ctx->set_stencil_ref = iris_set_stencil_ref;
10547    ctx->set_vertex_buffers = iris_set_vertex_buffers;
10548    ctx->set_viewport_states = iris_set_viewport_states;
10549    ctx->sampler_view_destroy = iris_sampler_view_destroy;
10550    ctx->surface_destroy = iris_surface_destroy;
10551    ctx->draw_vbo = iris_draw_vbo;
10552    ctx->launch_grid = iris_launch_grid;
10553    ctx->create_stream_output_target = iris_create_stream_output_target;
10554    ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
10555    ctx->set_stream_output_targets = iris_set_stream_output_targets;
10556    ctx->set_frontend_noop = iris_set_frontend_noop;
10557 
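   /* Mark everything dirty so the first draw or dispatch uploads all state. */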
10558    ice->state.dirty = ~0ull;
10559    ice->state.stage_dirty = ~0ull;
10560 
10561    ice->state.statistics_counters_enabled = true;
10562 
10563    ice->state.sample_mask = 0xffff;
10564    ice->state.num_viewports = 1;
10565    ice->state.prim_mode = MESA_PRIM_COUNT;
10566    ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
10567    ice->draw.derived_params.drawid = -1;
10568 
10569 #if GFX_VERx10 >= 120
10570    ice->state.genx->object_preemption = true;
10571 #endif
10572 
10573    /* Make a 1x1x1 null surface for unbound textures */
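   /* GENX(RENDER_SURFACE_STATE_length) is a DWord count, so multiply by 4
    * to get bytes; the state is uploaded with 64B alignment.
    */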
10574    void *null_surf_map =
10575       upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
10576                    4 * GENX(RENDER_SURFACE_STATE_length), 64);
10577    isl_null_fill_state(&screen->isl_dev, null_surf_map,
10578                        .size = isl_extent3d(1, 1, 1));
10579    ice->state.unbound_tex.offset +=
10580       iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));
10581 
10582    /* Default all scissor rectangles to be empty regions. */
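   /* minx > maxx and miny > maxy encode an empty rectangle. */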
10583    for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
10584       ice->state.scissors[i] = (struct pipe_scissor_state) {
10585          .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
10586       };
10587    }
10588 }
10589