/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file iris_state.c
 *
 * ============================= GENXML CODE =============================
 *              [This file is compiled once per generation.]
 * =======================================================================
 *
 * This is the main state upload code.
 *
 * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
 * complex, or highly reusable state can be created once, and bound and
 * rebound multiple times.  This is modeled with the pipe->create_*_state()
 * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
 * streamed out on the fly, via pipe->set_*_state() hooks.
 *
 * OpenGL involves frequently mutating context state, which is mirrored in
 * core Mesa by highly mutable data structures.  However, most applications
 * typically draw the same things over and over - from frame to frame, most
 * of the same objects are still visible and need to be redrawn.  So, rather
 * than inventing new state all the time, applications usually mutate to swap
 * between known states that we've seen before.
 *
 * Gallium isolates us from this mutation by tracking API state, and
 * distilling it into a set of Constant State Objects, or CSOs.  Large,
 * complex, or typically reusable state can be created once, then reused
 * multiple times.  Drivers can create and store their own associated data.
 * This create/bind model corresponds to the pipe->create_*_state() and
 * pipe->bind_*_state() driver hooks.
 *
 * Some state is cheap to create, or expected to be highly dynamic.  Rather
 * than creating and caching piles of CSOs for these, Gallium simply streams
 * them out, via the pipe->set_*_state() driver hooks.
 *
 * To reduce draw time overhead, we try to compute as much state at create
 * time as possible.  Wherever possible, we translate the Gallium pipe state
 * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
 * we can simply memcpy them into a batch buffer.
 *
 * No hardware matches the abstraction perfectly, so some commands require
 * information from multiple CSOs.  In this case, we can store two copies
 * of the packet (one in each CSO), and simply | together their DWords at
 * draw time.  Sometimes the second set is trivial (one or two fields), so
 * we simply pack it at draw time.
 *
 * There are two main components in the file below.  First, the CSO hooks,
 * which create/bind/track state.  Second, the draw-time upload functions,
 * iris_upload_render_state() and iris_upload_compute_state(), which read
 * the context state and emit the commands into the actual batch.
 */
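
/*
 * To illustrate the "pack at create time, OR at draw time" idea, a create
 * hook can pre-pack a partial packet into its CSO, and the draw-time code
 * then combines it with another CSO's copy.  (This is only a sketch; the
 * array names below are illustrative, not the actual ones used later in
 * this file.)
 *
 *    uint32_t a[GENX(3DSTATE_SF_length)];   // packed in one CSO
 *    uint32_t b[GENX(3DSTATE_SF_length)];   // packed in another CSO
 *
 *    for (int i = 0; i < GENX(3DSTATE_SF_length); i++)
 *       batch_dwords[i] = a[i] | b[i];
 */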

#include <stdio.h>
#include <errno.h>

#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
#else
#define VG(x)
#endif

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/u_dual_blend.h"
#include "util/u_inlines.h"
#include "util/format/u_format.h"
#include "util/u_framebuffer.h"
#include "util/u_transfer.h"
#include "util/u_upload_mgr.h"
#include "util/u_viewport.h"
#include "util/u_memory.h"
#include "util/u_trace_gallium.h"
#include "nir.h"
#include "intel/common/intel_aux_map.h"
#include "intel/common/intel_compute_slm.h"
#include "intel/common/intel_l3_config.h"
#include "intel/common/intel_sample_positions.h"
#include "intel/ds/intel_tracepoints.h"
#include "iris_batch.h"
#include "iris_context.h"
#include "iris_defines.h"
#include "iris_pipe.h"
#include "iris_resource.h"
#include "iris_utrace.h"

#include "iris_genx_macros.h"

#if GFX_VER >= 9
#include "intel/compiler/brw_compiler.h"
#include "intel/common/intel_genX_state_brw.h"
#else
#include "intel/compiler/elk/elk_compiler.h"
#include "intel/common/intel_genX_state_elk.h"
#endif

#include "intel/common/intel_guardband.h"
#include "intel/common/intel_pixel_hash.h"
#include "intel/common/intel_tiled_render.h"

/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void
pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware. */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#undef PIPE_ASSERT
}

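/**
 * Translate a Mesa primitive type to a hardware 3DPRIM_* topology.
 *
 * For patches, the hardware encodes the patch vertex count in the topology
 * itself, so we add verts_per_patch to the _3DPRIM_PATCHLIST_1 base value.
 */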
static unsigned
translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [MESA_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
      [MESA_PRIM_LINES]                    = _3DPRIM_LINELIST,
      [MESA_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
      [MESA_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
      [MESA_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
      [MESA_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
      [MESA_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
      [MESA_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
      [MESA_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
      [MESA_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
      [MESA_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
      [MESA_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
      [MESA_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
      [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
      [MESA_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
   };

   return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
}

static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)
{
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
      [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
      [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
      [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
      [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
      [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
      [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
   };
   return map[pipe_func];
}

static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)
{
   /* Gallium specifies the result of shadow comparisons as:
    *
    *    1 if ref <op> texel,
    *    0 otherwise.
    *
    * The hardware does:
    *
    *    0 if texel <op> ref,
    *    1 otherwise.
    *
    * So we need to flip the operator and also negate.
    */
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
      [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
      [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
      [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
      [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
      [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
      [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
   };
   return map[pipe_func];
}

static unsigned
translate_cull_mode(unsigned pipe_face)
{
   static const unsigned map[4] = {
      [PIPE_FACE_NONE]           = CULLMODE_NONE,
      [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
      [PIPE_FACE_BACK]           = CULLMODE_BACK,
      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
   };
   return map[pipe_face];
}

static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
   static const unsigned map[4] = {
      [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
      [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
      [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
   };
   return map[pipe_polymode];
}

static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
{
   static const unsigned map[] = {
      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
      [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
      [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
   };
   return map[pipe_mip];
}

static uint32_t
translate_wrap(unsigned pipe_wrap)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
   return map[pipe_wrap];
}

/**
 * Allocate space for some indirect state.
 *
 * Return a pointer to the map (to fill it out) and a state ref (for
 * referring to the state in GPU commands).
 */
static void *
upload_state(struct u_upload_mgr *uploader,
             struct iris_state_ref *ref,
             unsigned size,
             unsigned alignment)
{
   void *p = NULL;
   u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
   return p;
}

/**
 * Stream out temporary/short-lived state.
 *
 * This allocates space, pins the BO, and includes the BO address in the
 * returned offset (which works because all state lives in 32-bit memory
 * zones).
 */
static uint32_t *
stream_state(struct iris_batch *batch,
             struct u_upload_mgr *uploader,
             struct pipe_resource **out_res,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   void *ptr = NULL;

   u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);

   struct iris_bo *bo = iris_resource_bo(*out_res);
   iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);

   iris_record_state_size(batch->state_sizes,
                          bo->address + *out_offset, size);

   *out_offset += iris_bo_offset_from_base_address(bo);

   return ptr;
}

/**
 * stream_state() + memcpy.
 */
static uint32_t
emit_state(struct iris_batch *batch,
           struct u_upload_mgr *uploader,
           struct pipe_resource **out_res,
           const void *data,
           unsigned size,
           unsigned alignment)
{
   unsigned offset = 0;
   uint32_t *map =
      stream_state(batch, uploader, out_res, size, alignment, &offset);

   if (map)
      memcpy(map, data, size);

   return offset;
}

/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
#define cso_changed_memcmp_elts(x, n) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, n * sizeof(old_cso->x[0])) != 0)
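
/* For example, a bind_*_state() hook might do (a sketch only; the field and
 * dirty-flag names here are illustrative):
 *
 *    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
 *    struct iris_rasterizer_state *new_cso = state;
 *
 *    if (cso_changed(line_width))
 *       ice->state.dirty |= IRIS_DIRTY_RASTER;
 */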

static void
flush_before_state_base_change(struct iris_batch *batch)
{
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   bool atsm_compute = intel_device_info_is_atsm(batch->screen->devinfo) &&
                       batch->name == IRIS_BATCH_COMPUTE;
   uint32_t np_state_wa_bits =
      PIPE_CONTROL_CS_STALL |
      PIPE_CONTROL_STATE_CACHE_INVALIDATE |
      PIPE_CONTROL_CONST_CACHE_INVALIDATE |
      PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
      PIPE_CONTROL_INSTRUCTION_INVALIDATE |
      PIPE_CONTROL_FLUSH_HDC;

   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    *
    * Render target cache flush before SBA is required by Wa_18039438632.
    */
   iris_emit_end_of_pipe_sync(batch,
                              "change STATE_BASE_ADDRESS (flushes)",
                              (atsm_compute ? np_state_wa_bits : 0) |
                              PIPE_CONTROL_RENDER_TARGET_FLUSH |
                              PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                              PIPE_CONTROL_DATA_CACHE_FLUSH);
}

static void
flush_after_state_base_change(struct iris_batch *batch)
{
   const struct intel_device_info *devinfo = batch->screen->devinfo;
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be sufficient.
    * The theory here is that all of the sampling/rendering units cache the
    * binding table in the texture cache.  However, we have yet to be able
    * to actually confirm this.
    *
    * Wa_16013000631:
    *
    *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
    *   or program pipe control with Instruction cache invalidate post
    *   STATE_BASE_ADDRESS command"
    */
   iris_emit_end_of_pipe_sync(batch,
                              "change STATE_BASE_ADDRESS (invalidates)",
                              PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                              PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                              PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                              (intel_needs_workaround(devinfo, 16013000631) ?
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE : 0));
}

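/**
 * Copy the value of one MMIO register to another (32-bit; a 64-bit
 * variant follows).
 */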
static void
iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,
                         uint32_t src)
{
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   mi_store(&b, mi_reg32(dst), mi_reg32(src));
}

static void
iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,
                         uint32_t src)
{
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   mi_store(&b, mi_reg64(dst), mi_reg64(src));
}

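/** Load an immediate 32-bit value into an MMIO register. */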
static void
iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,
                         uint32_t val)
{
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   mi_store(&b, mi_reg32(reg), mi_imm(val));
}

static void
iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
                         uint64_t val)
{
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   mi_store(&b, mi_reg64(reg), mi_imm(val));
}

/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
                         struct iris_bo *bo, uint32_t offset)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value src = mi_mem32(ro_bo(bo, offset));
   mi_store(&b, mi_reg32(reg), src);
   iris_batch_sync_region_end(batch);
}

/**
 * Load a 64-bit value from a buffer into a MMIO register via
 * two MI_LOAD_REGISTER_MEM commands.
 */
static void
iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
                         struct iris_bo *bo, uint32_t offset)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value src = mi_mem64(ro_bo(bo, offset));
   mi_store(&b, mi_reg64(reg), src);
   iris_batch_sync_region_end(batch);
}

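/**
 * Write the current value of a 32-bit MMIO register into a buffer,
 * optionally only when the MI predicate is satisfied.
 */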
static void
iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
                          struct iris_bo *bo, uint32_t offset,
                          bool predicated)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
   struct mi_value src = mi_reg32(reg);
   if (predicated)
      mi_store_if(&b, dst, src);
   else
      mi_store(&b, dst, src);
   iris_batch_sync_region_end(batch);
}

static void
iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
                          struct iris_bo *bo, uint32_t offset,
                          bool predicated)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
   struct mi_value src = mi_reg64(reg);
   if (predicated)
      mi_store_if(&b, dst, src);
   else
      mi_store(&b, dst, src);
   iris_batch_sync_region_end(batch);
}

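/** Store an immediate 32-bit value into a buffer at the given offset. */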
static void
iris_store_data_imm32(struct iris_batch *batch,
                      struct iris_bo *bo, uint32_t offset,
                      uint32_t imm)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
   struct mi_value src = mi_imm(imm);
   mi_store(&b, dst, src);
   iris_batch_sync_region_end(batch);
}

static void
iris_store_data_imm64(struct iris_batch *batch,
                      struct iris_bo *bo, uint32_t offset,
                      uint64_t imm)
{
   iris_batch_sync_region_start(batch);
   struct mi_builder b;
   mi_builder_init(&b, batch->screen->devinfo, batch);
   struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
   struct mi_value src = mi_imm(imm);
   mi_store(&b, dst, src);
   iris_batch_sync_region_end(batch);
}

static void
iris_copy_mem_mem(struct iris_batch *batch,
                  struct iris_bo *dst_bo, uint32_t dst_offset,
                  struct iris_bo *src_bo, uint32_t src_offset,
                  unsigned bytes)
{
   /* MI_COPY_MEM_MEM operates on DWords. */
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);
   iris_batch_sync_region_start(batch);

   for (unsigned i = 0; i < bytes; i += 4) {
      iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
         cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
                                             IRIS_DOMAIN_OTHER_WRITE);
         cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
      }
   }

   iris_batch_sync_region_end(batch);
}

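/**
 * Patch a previously packed COMPUTE_WALKER so that it performs a post-sync
 * timestamp write to the given buffer offset (Gfx 12.5+ only).
 */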
static void
iris_rewrite_compute_walker_pc(struct iris_batch *batch,
                               uint32_t *walker,
                               struct iris_bo *bo,
                               uint32_t offset)
{
#if GFX_VERx10 >= 125
   struct iris_screen *screen = batch->screen;
   struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);

   uint32_t dwords[GENX(COMPUTE_WALKER_length)];

   _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
      cw.body.PostSync.Operation = WriteTimestamp;
      cw.body.PostSync.DestinationAddress = addr;
      cw.body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
   }

   for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
      walker[i] |= dwords[i];
#else
   unreachable("Unsupported");
#endif
}

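/**
 * Emit PIPELINE_SELECT, preceded by the flushes and invalidations the
 * various PRMs require when switching between the 3D and GPGPU pipelines.
 */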
static void
emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
{
   /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
#if GFX_VER < 20

#if GFX_VER >= 8 && GFX_VER < 10
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 12
   /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
    *
    *   "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
    *   are flushed through a stalling PIPE_CONTROL command prior to
    *   programming of PIPELINE_SELECT command transitioning Pipeline Select
    *   from 3D to GPGPU/Media.
    *   Software must ensure HDC Pipeline flush and Generic Media State Clear
    *   is issued through a stalling PIPE_CONTROL command prior to programming
    *   of PIPELINE_SELECT command transitioning Pipeline Select from
    *   GPGPU/Media to 3D."
    *
    * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
    * because PIPE was not in MEDIA mode?!
    */
   enum pipe_control_flags flags = PIPE_CONTROL_CS_STALL |
                                   PIPE_CONTROL_FLUSH_HDC;

   if (pipeline == GPGPU && batch->name == IRIS_BATCH_RENDER) {
      flags |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
               PIPE_CONTROL_DEPTH_CACHE_FLUSH;
   } else {
      flags |= PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH;
   }
   /* Wa_16013063087 -  State Cache Invalidate must be issued prior to
    * PIPELINE_SELECT when switching from 3D to Compute.
    *
    * SW must do this by programming of PIPECONTROL with “CS Stall” followed
    * by a PIPECONTROL with State Cache Invalidate bit set.
    */
   if (pipeline == GPGPU &&
       intel_needs_workaround(batch->screen->devinfo, 16013063087))
      flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

   iris_emit_pipe_control_flush(batch, "PIPELINE_SELECT flush", flags);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
    iris_emit_pipe_control_flush(batch,
                                 "workaround: PIPELINE_SELECT flushes (1/2)",
                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
                                 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
                                 PIPE_CONTROL_CS_STALL);

    iris_emit_pipe_control_flush(batch,
                                 "workaround: PIPELINE_SELECT flushes (2/2)",
                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                 PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#endif

   iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
#if GFX_VER >= 9
      sel.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
#if GFX_VER == 12
      sel.MediaSamplerDOPClockGateEnable = true;
#endif /* if GFX_VER == 12 */
#endif /* if GFX_VER >= 9 */
      sel.PipelineSelection = pipeline;
   }
#endif /* if GFX_VER < 20 */
}

UNUSED static void
init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
{
#if GFX_VER == 9
   /* Project: DevGLK
    *
    *    "This chicken bit works around a hardware issue with barrier
    *     logic encountered when switching between GPGPU and 3D pipelines.
    *     To workaround the issue, this mode bit should be set after a
    *     pipeline is selected."
    */
   iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
      reg.GLKBarrierMode = value;
      reg.GLKBarrierModeMask = 1;
   }
#endif
}

static void
init_state_base_address(struct iris_batch *batch)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t mocs = isl_mocs(isl_dev, 0, false);
   flush_before_state_base_change(batch);

   /* We program most base addresses once at context initialization time.
    * Each base address points at a 4GB memory zone, and never needs to
    * change.  See iris_bufmgr.h for a description of the memory zones.
    *
    * The one exception is Surface State Base Address, which needs to be
    * updated occasionally.  See iris_binder.c for the details there.
    */
   iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
      sba.GeneralStateMOCS            = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
      sba.DynamicStateMOCS            = mocs;
      sba.IndirectObjectMOCS          = mocs;
      sba.InstructionMOCS             = mocs;
      sba.SurfaceStateMOCS            = mocs;
#if GFX_VER >= 9
      sba.BindlessSurfaceStateMOCS    = mocs;
#endif

      sba.GeneralStateBaseAddressModifyEnable   = true;
      sba.DynamicStateBaseAddressModifyEnable   = true;
      sba.IndirectObjectBaseAddressModifyEnable = true;
      sba.InstructionBaseAddressModifyEnable    = true;
      sba.GeneralStateBufferSizeModifyEnable    = true;
      sba.DynamicStateBufferSizeModifyEnable    = true;
      sba.SurfaceStateBaseAddressModifyEnable   = true;
#if GFX_VER >= 11
      sba.BindlessSamplerStateMOCS    = mocs;
#endif
      sba.IndirectObjectBufferSizeModifyEnable  = true;
      sba.InstructionBuffersizeModifyEnable     = true;

      sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
      sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
      sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDER_START);

      sba.GeneralStateBufferSize   = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize    = 0xfffff;
      sba.DynamicStateBufferSize   = 0xfffff;
#if GFX_VERx10 >= 125
      sba.L1CacheControl = L1CC_WB;
#endif
   }

   flush_after_state_base_change(batch);
}

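/** Program the hardware L3 cache partitioning (L3CNTLREG or L3ALLOC). */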
static void
iris_emit_l3_config(struct iris_batch *batch,
                    const struct intel_l3_config *cfg)
{
#if GFX_VER < 20
   assert(cfg || GFX_VER >= 12);

#if GFX_VER >= 12
#define L3_ALLOCATION_REG GENX(L3ALLOC)
#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
#else
#define L3_ALLOCATION_REG GENX(L3CNTLREG)
#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
#endif

   iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
#if GFX_VER < 11
      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
#endif
#if GFX_VER == 11
      /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
       * in L3CNTLREG register. The default setting of the bit is not the
       * desirable behavior.
       */
      reg.ErrorDetectionBehaviorControl = true;
      reg.UseFullWays = true;
#endif
      if (GFX_VER < 12 || (cfg && cfg->n[INTEL_L3P_ALL] <= 126)) {
         reg.URBAllocation = cfg->n[INTEL_L3P_URB];
         reg.ROAllocation = cfg->n[INTEL_L3P_RO];
         reg.DCAllocation = cfg->n[INTEL_L3P_DC];
         reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
      } else {
         assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
                          cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
                          cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
                          cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
#if GFX_VER >= 12
         reg.L3FullWayAllocationEnable = true;
#endif
      }
   }
#endif /* GFX_VER < 20 */
}

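/**
 * Compute a URB partitioning for the currently enabled stages and emit it
 * via 3DSTATE_URB_* (or 3DSTATE_URB_ALLOC_* on Gfx12+).
 */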
void
genX(emit_urb_config)(struct iris_batch *batch,
                      bool has_tess_eval,
                      bool has_geometry)
{
   struct iris_screen *screen = batch->screen;
   struct iris_context *ice = batch->ice;

   intel_get_urb_config(screen->devinfo,
                        screen->l3_config_3d,
                        has_tess_eval,
                        has_geometry,
                        &ice->shaders.urb.cfg,
                        &ice->state.urb_deref_block_size,
                        &ice->shaders.urb.constrained);

   genX(urb_workaround)(batch, &ice->shaders.urb.cfg);

   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode           += i;
         urb.VSURBEntryAllocationSize       = ice->shaders.urb.cfg.size[i] - 1;
         urb.VSURBStartingAddressSlice0     = ice->shaders.urb.cfg.start[i];
         urb.VSURBStartingAddressSliceN     = ice->shaders.urb.cfg.start[i];
         urb.VSNumberofURBEntriesSlice0     = ice->shaders.urb.cfg.entries[i];
         urb.VSNumberofURBEntriesSliceN     = ice->shaders.urb.cfg.entries[i];
      }
#else
      iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress     = ice->shaders.urb.cfg.start[i];
         urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
         urb.VSNumberofURBEntries     = ice->shaders.urb.cfg.entries[i];
      }
#endif
   }
}

#if GFX_VER == 9
static void
iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
{
   /* A fixed function pipe flush is required before modifying this field */
   iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"
                                            : "disable preemption",
                              PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* enable object level preemption */
   iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
      reg.ReplayMode = enable;
      reg.ReplayModeMask = true;
   }
}
#endif

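/**
 * Upload the pixel hashing tables used to distribute work across the
 * available pixel pipes on parts with asymmetric (fused) subslice configs.
 */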
static void
upload_pixel_hashing_tables(struct iris_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
   UNUSED struct iris_context *ice = batch->ice;
   assert(&ice->batches[IRIS_BATCH_RENDER] == batch);

#if GFX_VER == 11
   /* Gfx11 hardware has two pixel pipes at most. */
   for (unsigned i = 2; i < ARRAY_SIZE(devinfo->ppipe_subslices); i++)
      assert(devinfo->ppipe_subslices[i] == 0);

   if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
      return;

   unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
   uint32_t hash_address;
   struct pipe_resource *tmp = NULL;
   uint32_t *map =
      stream_state(batch, ice->state.dynamic_uploader, &tmp,
                   size, 64, &hash_address);
   pipe_resource_reference(&tmp, NULL);

   const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
   struct GENX(SLICE_HASH_TABLE) table;
   intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);

   GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);

   iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = hash_address;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
   }

#elif GFX_VERx10 == 120
   /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
    * present with n active dual subslices.
    */
   unsigned ppipes_of[3] = {};

   for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
      for (unsigned p = 0; p < 3; p++)
         ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
   }

   /* Gfx12 has three pixel pipes. */
   for (unsigned p = 3; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
      assert(devinfo->ppipe_subslices[p] == 0);

   if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
      /* All three pixel pipes have the maximum number of active dual
       * subslices, or there is only one active pixel pipe: Nothing to do.
       */
      return;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
      p.SliceHashControl[0] = TABLE_0;

      if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);

      if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
         intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
      else
         unreachable("Illegal fusing.");
   }

   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
      p.SubsliceHashingTableEnable = true;
      p.SubsliceHashingTableEnableMask = true;
   }

#elif GFX_VERx10 == 125
   struct pipe_screen *pscreen = &batch->screen->base;
   const unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
   const struct pipe_resource tmpl = {
     .target = PIPE_BUFFER,
     .format = PIPE_FORMAT_R8_UNORM,
     .bind = PIPE_BIND_CUSTOM,
     .usage = PIPE_USAGE_IMMUTABLE,
     .flags = IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE,
     .width0 = size,
     .height0 = 1,
     .depth0 = 1,
     .array_size = 1
   };

   pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
   ice->state.pixel_hashing_tables = pscreen->resource_create(pscreen, &tmpl);

   struct iris_resource *res = (struct iris_resource *)ice->state.pixel_hashing_tables;
   struct pipe_transfer *transfer = NULL;
   uint32_t *map = pipe_buffer_map_range(&ice->ctx, ice->state.pixel_hashing_tables,
                                         0, size, PIPE_MAP_WRITE,
                                         &transfer);

   /* Calculate the set of present pixel pipes, and another set of
    * present pixel pipes with 2 dual subslices enabled, the latter
    * will appear on the hashing table with twice the frequency of
    * pixel pipes with a single dual subslice present.
    */
   uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
   for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
      if (devinfo->ppipe_subslices[p])
         ppipe_mask1 |= (1u << p);
      if (devinfo->ppipe_subslices[p] > 1)
         ppipe_mask2 |= (1u << p);
   }
   assert(ppipe_mask1);

   struct GENX(SLICE_HASH_TABLE) table;

   /* Note that the hardware expects an array with 7 tables, each
    * table is intended to specify the pixel pipe hashing behavior for
    * every possible slice count between 2 and 8, however that doesn't
    * actually work, among other reasons due to hardware bugs that
    * will cause the GPU to erroneously access the table at the wrong
    * index in some cases, so in practice all 7 tables need to be
    * initialized to the same value.
    */
   for (unsigned i = 0; i < 7; i++)
      intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
                                          table.Entry[i][0]);

   GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);

   pipe_buffer_unmap(&ice->ctx, transfer);

   iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_NONE);
   iris_record_state_size(batch->state_sizes, res->bo->address + res->offset, size);

   iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
      ptr.SliceHashStatePointerValid = true;
      ptr.SliceHashTableStatePointer = iris_bo_offset_from_base_address(res->bo) +
                                       res->offset;
   }

   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
      mode.SliceHashingTableEnable = true;
      mode.SliceHashingTableEnableMask = true;
      mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
                                    hashing32x32 : NormalMode);
      mode.CrossSliceHashingModeMask = -1;
   }
#endif
}

static void
iris_alloc_push_constants(struct iris_batch *batch)
{
   const struct intel_device_info *devinfo = batch->screen->devinfo;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    *       see if that improves performance by offering more space to
    *       the VS/FS when those aren't in use.  Also, try dynamically
    *       enabling/disabling it like i965 does.  This would be more
    *       stalls and may not actually help; we don't know yet.
    */

   /* Divide as equally as possible with any remainder given to FRAGMENT. */
   const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
   const unsigned stage_size = push_constant_kb / 5;
   const unsigned frag_size = push_constant_kb - 4 * stage_size;

   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = stage_size * i;
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
      }
   }

#if GFX_VERx10 == 125
   /* DG2: Wa_22011440098
    * MTL: Wa_18022330953
    *
    * In 3D mode, after programming push constant alloc command immediately
    * program push constant command(ZERO length) without any commit between
    * them.
    */
   iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      c.ShaderUpdateEnable = 0x1f;
      c.MOCS = iris_mocs(NULL, &batch->screen->isl_dev, 0);
   }
#endif
}

#if GFX_VER >= 12
static void
init_aux_map_state(struct iris_batch *batch);
#endif

/* This updates a register. Caller should stall the pipeline as needed. */
static void
iris_disable_rhwo_optimization(struct iris_batch *batch, bool disable)
{
   assert(batch->screen->devinfo->verx10 == 120);
#if GFX_VERx10 == 120
   iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
      c1.RCCRHWOOptimizationDisable = disable;
      c1.RCCRHWOOptimizationDisableMask = true;
   };
#endif
}

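/** Emit STATE_SYSTEM_MEM_FENCE_ADDRESS (Xe2+) pointing at the global fence BO. */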
static void
state_system_mem_fence_address_emit(struct iris_batch *batch)
{
#if GFX_VERx10 >= 200
   struct iris_screen *screen = batch->screen;
   struct iris_address addr = { .bo = iris_bufmgr_get_mem_fence_bo(screen->bufmgr) };
   iris_emit_cmd(batch, GENX(STATE_SYSTEM_MEM_FENCE_ADDRESS), mem_fence_addr) {
      mem_fence_addr.SystemMemoryFenceAddress = addr;
   }
#endif
}

/**
 * Upload initial GPU state for any kind of context.
 *
 * These need to happen for both render and compute.
 */
static void
iris_init_common_context(struct iris_batch *batch)
{
#if GFX_VER == 11
   iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
      reg.HeaderlessMessageforPreemptableContexts = 1;
      reg.HeaderlessMessageforPreemptableContextsMask = 1;
   }

   /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
   iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
      reg.EnabledTexelOffsetPrecisionFix = 1;
      reg.EnabledTexelOffsetPrecisionFixMask = 1;
   }
#endif

   /* Select 256B-aligned binding table mode on Icelake through Tigerlake,
    * which gives us larger binding table pointers, at the cost of higher
    * alignment requirements (bits 18:8 are valid instead of 15:5).  When
    * using this mode, we have to shift binding table pointers by 3 bits,
    * as they're still stored in the same bit-location in the field.
    */
#if GFX_VER >= 11 && GFX_VERx10 < 125
   iris_emit_reg(batch, GENX(GT_MODE), reg) {
      reg.BindingTableAlignment = BTP_18_8;
      reg.BindingTableAlignmentMask = true;
   }
#endif

#if GFX_VERx10 == 125
   /* Even though L3 partial write merging is supposed to be enabled
    * by default on Gfx12.5 according to the hardware spec, i915
    * appears to accidentally clear the enables during context
    * initialization, so make sure to enable them here since partial
    * write merging has a large impact on rendering performance.
    */
   iris_emit_reg(batch, GENX(L3SQCREG5), reg) {
      reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
      reg.CompressiblePartialWriteMergeEnable = true;
      reg.CoherentPartialWriteMergeEnable = true;
      reg.CrossTilePartialWriteMergeEnable = true;
   }
#endif

   state_system_mem_fence_address_emit(batch);
}

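/**
 * If this context was created for protected content, emit the PIPE_CONTROLs
 * and MI_SET_APPID needed to re-enable protected memory on this batch.
 */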
1226 static void
toggle_protected(struct iris_batch * batch)1227 toggle_protected(struct iris_batch *batch)
1228 {
1229    struct iris_context *ice;
1230 
1231    if (batch->name == IRIS_BATCH_RENDER)
1232       ice =container_of(batch, struct iris_context, batches[IRIS_BATCH_RENDER]);
1233    else if (batch->name == IRIS_BATCH_COMPUTE)
1234       ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_COMPUTE]);
1235    else
1236       unreachable("unhandled batch");
1237 
1238    if (!ice->protected)
1239       return;
1240 
1241 #if GFX_VER >= 12
1242    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1243       pc.CommandStreamerStallEnable = true;
1244       pc.RenderTargetCacheFlushEnable = true;
1245       pc.ProtectedMemoryDisable = true;
1246    }
1247    iris_emit_cmd(batch, GENX(MI_SET_APPID), appid) {
1248       /* Default value for single session. */
1249       appid.ProtectedMemoryApplicationID = 0xf;
1250       appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
1251    }
1252    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1253       pc.CommandStreamerStallEnable = true;
1254       pc.RenderTargetCacheFlushEnable = true;
1255       pc.ProtectedMemoryEnable = true;
1256    }
1257 #else
1258    unreachable("Not supported");
1259 #endif
1260 }
1261 
1262 #if GFX_VER >= 20
1263 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
1264 #else
1265 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
1266 #endif
1267 
1268 /**
1269  * Upload the initial GPU state for a render context.
1270  *
1271  * This sets some invariant state that needs to be programmed a particular
1272  * way, but we never actually change.
1273  */
1274 static void
iris_init_render_context(struct iris_batch * batch)1275 iris_init_render_context(struct iris_batch *batch)
1276 {
1277    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1278 
1279    iris_batch_sync_region_start(batch);
1280 
1281    emit_pipeline_select(batch, _3D);
1282 
1283    toggle_protected(batch);
1284 
1285    iris_emit_l3_config(batch, batch->screen->l3_config_3d);
1286 
1287    init_state_base_address(batch);
1288 
1289    iris_init_common_context(batch);
1290 
1291 #if GFX_VER >= 9
1292    iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1293       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1294       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1295    }
1296 #else
1297    iris_emit_reg(batch, GENX(INSTPM), reg) {
1298       reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1299       reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1300    }
1301 #endif
1302 
1303 #if GFX_VER == 9
1304    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1305       reg.FloatBlendOptimizationEnable = true;
1306       reg.FloatBlendOptimizationEnableMask = true;
1307       reg.MSCRAWHazardAvoidanceBit = true;
1308       reg.MSCRAWHazardAvoidanceBitMask = true;
1309       reg.PartialResolveDisableInVC = true;
1310       reg.PartialResolveDisableInVCMask = true;
1311    }
1312 
1313    if (devinfo->platform == INTEL_PLATFORM_GLK)
1314       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1315 #endif
1316 
1317 #if GFX_VER == 11
1318    iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1319       reg.L3DataPartialWriteMergingEnable = true;
1320       reg.ColorZPartialWriteMergingEnable = true;
1321       reg.URBPartialWriteMergingEnable = true;
1322       reg.TCDisable = true;
1323    }
1324 
1325    /* The hardware specification recommends disabling repacking for
1326     * compatibility with the decompression mechanism in the display controller.
1327     */
1328    if (devinfo->disable_ccs_repack) {
1329       iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1330          reg.DisableRepackingforCompression = true;
1331          reg.DisableRepackingforCompressionMask = true;
1332       }
1333    }
1334 #endif
1335 
1336 #if GFX_VER == 12
1337    iris_emit_reg(batch, GENX(FF_MODE2), reg) {
1338       /* On Alchemist, the FF_MODE2 docs for the GS timer say:
1339        *
1340        *    "The timer value must be set to 224."
1341        *
1342        * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
1343        * and that this is necessary to avoid hanging the HS/DS units.  It
1344        * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
1345        *
1346        * The HS timer docs also have the same quote for Alchemist.  I am
1347        * unaware of a reason it needs to be set to 224 on Tigerlake, but
1348        * we do so for consistency if nothing else.
1349        *
1350        * For the TDS timer value, the docs say:
1351        *
1352        *    "For best performance, a value of 4 should be programmed."
1353        *
1354        * i915 also sets it this way on Tigerlake due to workarounds.
1355        *
1356        * The default VS timer appears to be 0, so we leave it at that.
1357        */
1358       reg.GSTimerValue  = 224;
1359       reg.HSTimerValue  = 224;
1360       reg.TDSTimerValue = 4;
1361       reg.VSTimerValue  = 0;
1362    }
1363 #endif
1364 
1365 #if INTEL_NEEDS_WA_1508744258
1366    /* The suggested workaround is:
1367     *
1368     *    Disable RHWO by setting 0x7010[14] by default except during resolve
1369     *    pass.
1370     *
1371     * We implement global disabling of the optimization here and we toggle it
1372     * in iris_resolve_color.
1373     *
1374     * iris_init_compute_context is unmodified because we don't expect to
1375     * access the RCC in the compute context. iris_mcs_partial_resolve is
1376     * unmodified because that pass doesn't use a HW bit to perform the
1377     * resolve (related HSDs specifically call out the RenderTargetResolveType
1378     * field in the 3DSTATE_PS instruction).
1379     */
1380    iris_disable_rhwo_optimization(batch, true);
1381 #endif
1382 
1383 #if GFX_VERx10 == 120
1384    /* Wa_1806527549 says to disable the following HiZ optimization when the
1385     * depth buffer is D16_UNORM. We've found the WA to help with more depth
1386     * buffer configurations however, so we always disable it just to be safe.
1387     */
1388    iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
1389       reg.HZDepthTestLEGEOptimizationDisable = true;
1390       reg.HZDepthTestLEGEOptimizationDisableMask = true;
1391    }
1392 #endif
1393 
1394 #if GFX_VERx10 == 125
1395    iris_emit_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
1396       reg.TBIMRBatchSizeOverride = true;
1397       reg.TBIMROpenBatchEnable = true;
1398       reg.TBIMRFastClip = true;
1399       reg.TBIMRBatchSizeOverrideMask = true;
1400       reg.TBIMROpenBatchEnableMask = true;
1401       reg.TBIMRFastClipMask = true;
1402    };
1403 #endif
1404 
1405 #if GFX_VER >= 20
1406    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1407       p.DX10OGLBorderModeforYCRCB = true;
1408       p.DX10OGLBorderModeforYCRCBMask = true;
1409    }
1410 #endif
1411 
1412    upload_pixel_hashing_tables(batch);
1413 
1414    /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1415     * changing it dynamically.  We set it to the maximum size here, and
1416     * instead include the render target dimensions in the viewport, so
1417     * viewport extents clipping takes care of pruning stray geometry.
1418     */
1419    iris_emit_cmd(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
1420       rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1421       rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1422    }
1423 
1424    /* Set the initial MSAA sample positions. */
1425    iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1426       INTEL_SAMPLE_POS_1X(pat._1xSample);
1427       INTEL_SAMPLE_POS_2X(pat._2xSample);
1428       INTEL_SAMPLE_POS_4X(pat._4xSample);
1429       INTEL_SAMPLE_POS_8X(pat._8xSample);
1430 #if GFX_VER >= 9
1431       INTEL_SAMPLE_POS_16X(pat._16xSample);
1432 #endif
1433    }
1434 
1435    /* Use the legacy AA line coverage computation. */
1436    iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1437 
1438    /* Disable chromakeying (it's for media) */
1439    iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1440 
1441    /* We want regular rendering, not special HiZ operations. */
1442    iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1443 
1444    /* No polygon stippling offsets are necessary. */
1445    /* TODO: may need to set an offset for origin-UL framebuffers */
1446    iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1447 
1448 #if GFX_VERx10 >= 125
1449    iris_emit_cmd(batch, GENX(3DSTATE_MESH_CONTROL), foo);
1450    iris_emit_cmd(batch, GENX(3DSTATE_TASK_CONTROL), foo);
1451 #endif
1452 
1453 #if INTEL_NEEDS_WA_14019857787
1454    iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1455       p.EnableOOOreadsinRCPB = true;
1456       p.EnableOOOreadsinRCPBMask = true;
1457    }
1458 #endif
1459 
1460    iris_alloc_push_constants(batch);
1461 
1462 #if GFX_VER >= 12
1463    init_aux_map_state(batch);
1464 #endif
1465 
1466    iris_batch_sync_region_end(batch);
1467 }
1468 
1469 static void
1470 iris_init_compute_context(struct iris_batch *batch)
1471 {
1472    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1473 
1474    iris_batch_sync_region_start(batch);
1475 
1476    /* Wa_1607854226:
1477     *
1478     *  Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1479     */
1480 #if GFX_VERx10 == 120
1481    emit_pipeline_select(batch, _3D);
1482 #else
1483    emit_pipeline_select(batch, GPGPU);
1484 #endif
1485 
1486    toggle_protected(batch);
1487 
1488    iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1489 
1490    init_state_base_address(batch);
1491 
1492    iris_init_common_context(batch);
1493 
1494 #if GFX_VERx10 == 120
1495    emit_pipeline_select(batch, GPGPU);
1496 #endif
1497 
1498 #if GFX_VER == 9
1499    if (devinfo->platform == INTEL_PLATFORM_GLK)
1500       init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1501 #endif
1502 
1503 #if GFX_VER >= 12
1504    init_aux_map_state(batch);
1505 #endif
1506 
1507 #if GFX_VERx10 >= 125
1508    /* Wa_14015782607 - Issue pipe control with HDC_flush and
1509     * untyped cache flush set to 1 when CCS has NP state update with
1510     * STATE_COMPUTE_MODE.
1511     */
1512    if (intel_needs_workaround(devinfo, 14015782607))
1513       iris_emit_pipe_control_flush(batch, "Wa_14015782607",
1514                                    PIPE_CONTROL_CS_STALL |
1515                                    PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1516                                    PIPE_CONTROL_FLUSH_HDC);
1517 
1518    /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
1519     * emitting NP state commands with ATS-M in compute mode.
1520     */
1521    if (intel_device_info_is_atsm(devinfo))
1522       iris_emit_pipe_control_flush(batch, "Wa_14014427904/22013045878",
1523                                    PIPE_CONTROL_CS_STALL |
1524                                    PIPE_CONTROL_STATE_CACHE_INVALIDATE |
1525                                    PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1526                                    PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1527                                    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1528                                    PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1529                                    PIPE_CONTROL_FLUSH_HDC);
1530 
1531    iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
1532 #if GFX_VER >= 20
1533       cm.AsyncComputeThreadLimit = ACTL_Max8;
1534       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1535       cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
1536       cm.AsyncComputeThreadLimitMask = 0x7;
1537       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1538       cm.ZAsyncThrottlesettingsMask = 0x3;
1539 #else
1540       cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
1541       cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1542       cm.PixelAsyncComputeThreadLimitMask = 0x7;
1543       cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1544       if (intel_device_info_is_mtl_or_arl(devinfo)) {
1545          cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
1546          cm.ZAsyncThrottlesettingsMask = 0x3;
1547       }
1548 #endif
1549    }
1550 #endif
1551 
1552 #if GFX_VERx10 >= 125
1553    iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
1554       cfe.MaximumNumberofThreads =
1555          devinfo->max_cs_threads * devinfo->subslice_total;
1556    }
1557 #endif
1558 
1559    iris_batch_sync_region_end(batch);
1560 }
1561 
1562 static void
1563 iris_init_copy_context(struct iris_batch *batch)
1564 {
1565    iris_batch_sync_region_start(batch);
1566 
1567 #if GFX_VER >= 12
1568    init_aux_map_state(batch);
1569 #endif
1570 
1571    state_system_mem_fence_address_emit(batch);
1572 
1573    iris_batch_sync_region_end(batch);
1574 }
1575 
1576 struct iris_vertex_buffer_state {
1577    /** The VERTEX_BUFFER_STATE hardware structure. */
1578    uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1579 
1580    /** The resource to source vertex data from. */
1581    struct pipe_resource *resource;
1582 
1583    int offset;
1584 };
1585 
1586 struct iris_depth_buffer_state {
1587    /* Depth/HiZ/Stencil related hardware packets. */
1588 #if GFX_VER < 20
1589    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1590                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1591                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
1592                     GENX(3DSTATE_CLEAR_PARAMS_length)];
1593 #else
1594    uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1595                     GENX(3DSTATE_STENCIL_BUFFER_length) +
1596                     GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
1597 #endif
1598 };
1599 
1600 #if INTEL_NEEDS_WA_1808121037
1601 enum iris_depth_reg_mode {
1602    IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
1603    IRIS_DEPTH_REG_MODE_D16_1X_MSAA,
1604    IRIS_DEPTH_REG_MODE_UNKNOWN,
1605 };
1606 #endif
1607 
1608 /**
1609  * Generation-specific context state (ice->state.genx->...).
1610  *
1611  * Most state can go in iris_context directly, but these encode hardware
1612  * packets which vary by generation.
1613  */
1614 struct iris_genx_state {
1615    struct iris_vertex_buffer_state vertex_buffers[33];
1616    uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1617 
1618    struct iris_depth_buffer_state depth_buffer;
1619 
1620    uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1621 
1622 #if GFX_VER == 8
1623    bool pma_fix_enabled;
1624 #endif
1625 
1626    /* Is object level preemption enabled? */
1627    bool object_preemption;
1628 
1629 #if INTEL_NEEDS_WA_1808121037
1630    enum iris_depth_reg_mode depth_reg_mode;
1631 #endif
1632 
1633    struct {
1634 #if GFX_VER == 8
1635       struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1636 #endif
1637    } shaders[MESA_SHADER_STAGES];
1638 };
1639 
1640 /**
1641  * The pipe->set_blend_color() driver hook.
1642  *
1643  * This corresponds to our COLOR_CALC_STATE.
1644  */
1645 static void
1646 iris_set_blend_color(struct pipe_context *ctx,
1647                      const struct pipe_blend_color *state)
1648 {
1649    struct iris_context *ice = (struct iris_context *) ctx;
1650 
1651    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1652    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1653    ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1654 }
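
/* Illustrative sketch only, not driver code: a gallium frontend reaches the
 * hook above roughly like this (the constant values are made up):
 *
 *    struct pipe_blend_color bc = { .color = { 1.0f, 0.5f, 0.25f, 1.0f } };
 *    pipe->set_blend_color(pipe, &bc);
 *
 * The values are picked up on the next COLOR_CALC_STATE upload via the
 * IRIS_DIRTY_COLOR_CALC_STATE bit set above.
 */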
1655 
1656 /**
1657  * Gallium CSO for blend state (see pipe_blend_state).
1658  */
1659 struct iris_blend_state {
1660    /** Partial 3DSTATE_PS_BLEND */
1661    uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1662 
1663    /** Partial BLEND_STATE */
1664    uint32_t blend_state[GENX(BLEND_STATE_length) +
1665                         IRIS_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1666 
1667    bool alpha_to_coverage; /* for shader key */
1668 
1669    /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1670    uint8_t blend_enables;
1671 
1672    /** Bitfield of whether color writes are enabled for RT[i] */
1673    uint8_t color_write_enables;
1674 
1675    /** Does RT[0] use dual color blending? */
1676    bool dual_color_blending;
1677 
1678    int ps_dst_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1679    int ps_dst_alpha_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1680 };
1681 
1682 static enum pipe_blendfactor
1683 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1684 {
1685    if (alpha_to_one) {
1686       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1687          return PIPE_BLENDFACTOR_ONE;
1688 
1689       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1690          return PIPE_BLENDFACTOR_ZERO;
1691    }
1692 
1693    return f;
1694 }
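
/* Worked example (illustrative): with alpha_to_one enabled,
 * PIPE_BLENDFACTOR_SRC1_ALPHA becomes ONE and INV_SRC1_ALPHA becomes ZERO,
 * since the dual-source alpha is forced to 1.0; any other factor (e.g.
 * PIPE_BLENDFACTOR_SRC_ALPHA) is returned unchanged.
 */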
1695 
1696 /**
1697  * The pipe->create_blend_state() driver hook.
1698  *
1699  * Translates a pipe_blend_state into iris_blend_state.
1700  */
1701 static void *
1702 iris_create_blend_state(struct pipe_context *ctx,
1703                         const struct pipe_blend_state *state)
1704 {
1705    struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
1706    uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1707 
1708    cso->blend_enables = 0;
1709    cso->color_write_enables = 0;
1710    STATIC_ASSERT(IRIS_MAX_DRAW_BUFFERS <= 8);
1711 
1712    cso->alpha_to_coverage = state->alpha_to_coverage;
1713 
1714    bool indep_alpha_blend = false;
1715 
1716    for (int i = 0; i < IRIS_MAX_DRAW_BUFFERS; i++) {
1717       const struct pipe_rt_blend_state *rt =
1718          &state->rt[state->independent_blend_enable ? i : 0];
1719 
1720       enum pipe_blendfactor src_rgb =
1721          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1722       enum pipe_blendfactor src_alpha =
1723          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1724       enum pipe_blendfactor dst_rgb =
1725          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1726       enum pipe_blendfactor dst_alpha =
1727          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1728 
1729       /* Stored separately in cso for dynamic emission. */
1730       cso->ps_dst_blend_factor[i] = (int) dst_rgb;
1731       cso->ps_dst_alpha_blend_factor[i] = (int) dst_alpha;
1732 
1733       if (rt->rgb_func != rt->alpha_func ||
1734           src_rgb != src_alpha || dst_rgb != dst_alpha)
1735          indep_alpha_blend = true;
1736 
1737       if (rt->blend_enable)
1738          cso->blend_enables |= 1u << i;
1739 
1740       if (rt->colormask)
1741          cso->color_write_enables |= 1u << i;
1742 
1743       iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1744          be.LogicOpEnable = state->logicop_enable;
1745          be.LogicOpFunction = state->logicop_func;
1746 
1747          be.PreBlendSourceOnlyClampEnable = false;
1748          be.ColorClampRange = COLORCLAMP_RTFORMAT;
1749          be.PreBlendColorClampEnable = true;
1750          be.PostBlendColorClampEnable = true;
1751 
1752          be.ColorBufferBlendEnable = rt->blend_enable;
1753 
1754          be.ColorBlendFunction          = rt->rgb_func;
1755          be.AlphaBlendFunction          = rt->alpha_func;
1756 
1757          /* The casts prevent warnings about implicit enum type conversions. */
1758          be.SourceBlendFactor           = (int) src_rgb;
1759          be.SourceAlphaBlendFactor      = (int) src_alpha;
1760 
1761          be.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
1762          be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1763          be.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
1764          be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1765       }
1766       blend_entry += GENX(BLEND_STATE_ENTRY_length);
1767    }
1768 
1769    iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1770       /* pb.HasWriteableRT is filled in at draw time.
1771        * pb.AlphaTestEnable is filled in at draw time.
1772        *
1773        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1774        * setting it when dual color blending without an appropriate shader.
1775        */
1776 
1777       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1778       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1779 
1780       /* The casts prevent warnings about implicit enum type conversions. */
1781       pb.SourceBlendFactor =
1782          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1783       pb.SourceAlphaBlendFactor =
1784          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1785    }
1786 
1787    iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1788       bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1789       bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1790       bs.AlphaToOneEnable = state->alpha_to_one;
1791       bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage_dither;
1792       bs.ColorDitherEnable = state->dither;
1793       /* bs.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1794    }
1795 
1796    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1797 
1798    return cso;
1799 }
1800 
1801 /**
1802  * The pipe->bind_blend_state() driver hook.
1803  *
1804  * Bind a blending CSO and flag related dirty bits.
1805  */
1806 static void
1807 iris_bind_blend_state(struct pipe_context *ctx, void *state)
1808 {
1809    struct iris_context *ice = (struct iris_context *) ctx;
1810    struct iris_blend_state *cso = state;
1811 
1812    ice->state.cso_blend = cso;
1813 
1814    ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1815    ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1816    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1817 
1818    if (GFX_VER == 8)
1819       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1820 }
1821 
1822 /**
1823  * Return true if the FS writes to any color outputs which are not disabled
1824  * via color masking.
1825  */
1826 static bool
1827 has_writeable_rt(const struct iris_blend_state *cso_blend,
1828                  const struct shader_info *fs_info)
1829 {
1830    if (!fs_info)
1831       return false;
1832 
1833    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1834 
1835    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1836       rt_outputs = (1 << IRIS_MAX_DRAW_BUFFERS) - 1;
1837 
1838    return cso_blend->color_write_enables & rt_outputs;
1839 }
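
/* Worked example (illustrative): a fragment shader writing
 * FRAG_RESULT_DATA0 and FRAG_RESULT_DATA2 yields rt_outputs = 0b101; with
 * color_write_enables = 0b001 the function returns true, while a colormask
 * of zero on every bound RT returns false.  Writing FRAG_RESULT_COLOR
 * broadcasts to all render targets, so every enable bit counts.
 */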
1840 
1841 /**
1842  * Gallium CSO for depth, stencil, and alpha testing state.
1843  */
1844 struct iris_depth_stencil_alpha_state {
1845    /** Partial 3DSTATE_WM_DEPTH_STENCIL. */
1846    uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
1847 
1848 #if GFX_VER >= 12
1849    uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
1850 #endif
1851 
1852    /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */
1853    unsigned alpha_enabled:1;
1854    unsigned alpha_func:3;     /**< PIPE_FUNC_x */
1855    float alpha_ref_value;     /**< reference value */
1856 
1857    /** Outbound to resolve and cache set tracking. */
1858    bool depth_writes_enabled;
1859    bool stencil_writes_enabled;
1860 
1861    /** Outbound to Gfx8-9 PMA stall equations */
1862    bool depth_test_enabled;
1863 
1864    /** Tracking state of DS writes for Wa_18019816803. */
1865    bool ds_write_state;
1866 };
1867 
1868 /**
1869  * The pipe->create_depth_stencil_alpha_state() driver hook.
1870  *
1871  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1872  * testing state since we need pieces of it in a variety of places.
1873  */
1874 static void *
1875 iris_create_zsa_state(struct pipe_context *ctx,
1876                       const struct pipe_depth_stencil_alpha_state *state)
1877 {
1878    struct iris_depth_stencil_alpha_state *cso =
1879       malloc(sizeof(struct iris_depth_stencil_alpha_state));
1880 
1881    bool two_sided_stencil = state->stencil[1].enabled;
1882 
1883    bool depth_write_enabled = false;
1884    bool stencil_write_enabled = false;
1885 
1886    /* Depth writes enabled? */
1887    if (state->depth_writemask &&
1888       ((!state->depth_enabled) ||
1889       ((state->depth_func != PIPE_FUNC_NEVER) &&
1890         (state->depth_func != PIPE_FUNC_EQUAL))))
1891       depth_write_enabled = true;
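   /* E.g. (illustrative): a nonzero write mask with depth_func == PIPE_FUNC_LESS
    * counts as a depth write; with PIPE_FUNC_NEVER or PIPE_FUNC_EQUAL the
    * test can never change the buffer, so it is treated as non-writing.
    */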
1892 
1893    bool stencil_all_keep =
1894       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1895       state->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1896       state->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1897       (!two_sided_stencil ||
1898        (state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1899         state->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1900         state->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP));
1901 
1902    bool stencil_mask_zero =
1903       state->stencil[0].writemask == 0 ||
1904       (!two_sided_stencil || state->stencil[1].writemask  == 0);
1905 
1906    bool stencil_func_never =
1907       state->stencil[0].func == PIPE_FUNC_NEVER &&
1908       state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1909       (!two_sided_stencil ||
1910        (state->stencil[1].func == PIPE_FUNC_NEVER &&
1911         state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP));
1912 
1913    /* Stencil writes enabled? */
1914    if (state->stencil[0].writemask != 0 ||
1915       ((two_sided_stencil && state->stencil[1].writemask != 0) &&
1916        (!stencil_all_keep &&
1917         !stencil_mask_zero &&
1918         !stencil_func_never)))
1919       stencil_write_enabled = true;
1920 
1921    cso->ds_write_state = depth_write_enabled || stencil_write_enabled;
1922 
1923    cso->alpha_enabled = state->alpha_enabled;
1924    cso->alpha_func = state->alpha_func;
1925    cso->alpha_ref_value = state->alpha_ref_value;
1926    cso->depth_writes_enabled = state->depth_writemask;
1927    cso->depth_test_enabled = state->depth_enabled;
1928    cso->stencil_writes_enabled =
1929       state->stencil[0].writemask != 0 ||
1930       (two_sided_stencil && state->stencil[1].writemask != 0);
1931 
1932    /* gallium frontends need to optimize away EQUAL writes for us. */
1933    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1934 
1935    iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1936       wmds.StencilFailOp = state->stencil[0].fail_op;
1937       wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1938       wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1939       wmds.StencilTestFunction =
1940          translate_compare_func(state->stencil[0].func);
1941       wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1942       wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1943       wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1944       wmds.BackfaceStencilTestFunction =
1945          translate_compare_func(state->stencil[1].func);
1946       wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1947       wmds.DoubleSidedStencilEnable = two_sided_stencil;
1948       wmds.StencilTestEnable = state->stencil[0].enabled;
1949       wmds.StencilBufferWriteEnable =
1950          state->stencil[0].writemask != 0 ||
1951          (two_sided_stencil && state->stencil[1].writemask != 0);
1952       wmds.DepthTestEnable = state->depth_enabled;
1953       wmds.DepthBufferWriteEnable = state->depth_writemask;
1954       wmds.StencilTestMask = state->stencil[0].valuemask;
1955       wmds.StencilWriteMask = state->stencil[0].writemask;
1956       wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1957       wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1958       /* wmds.[Backface]StencilReferenceValue are merged later */
1959 #if GFX_VER >= 12
1960       wmds.StencilReferenceValueModifyDisable = true;
1961 #endif
1962    }
1963 
1964 #if GFX_VER >= 12
1965    iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1966       depth_bounds.DepthBoundsTestValueModifyDisable = false;
1967       depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1968       depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1969       depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1970       depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1971    }
1972 #endif
1973 
1974    return cso;
1975 }
1976 
1977 /**
1978  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1979  *
1980  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1981  */
1982 static void
1983 iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1984 {
1985    struct iris_context *ice = (struct iris_context *) ctx;
1986    struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1987    struct iris_depth_stencil_alpha_state *new_cso = state;
1988 
1989    if (new_cso) {
1990       if (cso_changed(alpha_ref_value))
1991          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1992 
1993       if (cso_changed(alpha_enabled))
1994          ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
1995 
1996       if (cso_changed(alpha_func))
1997          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1998 
1999       if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
2000          ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
2001 
2002       ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
2003       ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
2004 
2005       /* If the DS write state changed, flag the DS write enable dirty. */
2006       if (!old_cso || (ice->state.ds_write_state != new_cso->ds_write_state)) {
2007          ice->state.dirty |= IRIS_DIRTY_DS_WRITE_ENABLE;
2008          ice->state.ds_write_state = new_cso->ds_write_state;
2009       }
2010 
2011 #if GFX_VER >= 12
2012       if (cso_changed(depth_bounds))
2013          ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
2014 #endif
2015    }
2016 
2017    ice->state.cso_zsa = new_cso;
2018    ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2019    ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
2020    ice->state.stage_dirty |=
2021       ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
2022 
2023    if (GFX_VER == 8)
2024       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
2025 }
2026 
2027 #if GFX_VER == 8
2028 static bool
2029 want_pma_fix(struct iris_context *ice)
2030 {
2031    UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
2032    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2033    const struct iris_fs_data *fs_data =
2034       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
2035    const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
2036    const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
2037    const struct iris_blend_state *cso_blend = ice->state.cso_blend;
2038 
2039    /* In very specific combinations of state, we can instruct Gfx8-9 hardware
2040     * to avoid stalling at the pixel mask array.  The state equations are
2041     * documented in these places:
2042     *
2043     * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
2044     * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
2045     *
2046     * Both equations share some common elements:
2047     *
2048     *    no_hiz_op =
2049     *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2050     *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2051     *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2052     *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
2053     *
2054     *    killpixels =
2055     *       3DSTATE_WM::ForceKillPix != ForceOff &&
2056     *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2057     *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2058     *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2059     *        3DSTATE_PS_BLEND::AlphaTestEnable ||
2060     *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2061     *
2062     *    (Technically the stencil PMA treats ForceKillPix differently,
2063     *     but I think this is a documentation oversight, and we don't
2064     *     ever use it in this way, so it doesn't matter).
2065     *
2066     *    common_pma_fix =
2067     *       3DSTATE_WM::ForceThreadDispatch != 1 &&
2068     *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
2069     *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2070     *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2071     *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
2072     *       3DSTATE_PS_EXTRA::PixelShaderValid &&
2073     *       no_hiz_op
2074     *
2075     * These are always true:
2076     *
2077     *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
2078     *    3DSTATE_PS_EXTRA::PixelShaderValid
2079     *
2080     * Also, we never use the normal drawing path for HiZ ops; these are true:
2081     *
2082     *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2083     *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2084     *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2085     *      3DSTATE_WM_HZ_OP::StencilBufferClear)
2086     *
2087     * This happens sometimes:
2088     *
2089     *    3DSTATE_WM::ForceThreadDispatch != 1
2090     *
2091     * However, we choose to ignore it as it either agrees with the signal
2092     * (dispatch was already enabled, so nothing out of the ordinary), or
2093     * there are no framebuffer attachments (so no depth or HiZ anyway,
2094     * meaning the PMA signal will already be disabled).
2095     */
2096 
2097    if (!cso_fb->zsbuf)
2098       return false;
2099 
2100    struct iris_resource *zres, *sres;
2101    iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
2102 
2103    /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2104     * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2105     */
2106    if (!zres ||
2107        !iris_resource_level_has_hiz(devinfo, zres, cso_fb->zsbuf->u.tex.level))
2108       return false;
2109 
2110    /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
2111    if (fs_data->early_fragment_tests)
2112       return false;
2113 
2114    /* 3DSTATE_WM::ForceKillPix != ForceOff &&
2115     * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2116     *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2117     *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2118     *  3DSTATE_PS_BLEND::AlphaTestEnable ||
2119     *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2120     */
2121    bool killpixels = fs_data->uses_kill || fs_data->uses_omask ||
2122                      cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;
2123 
2124    /* The Gfx8 depth PMA equation becomes:
2125     *
2126     *    depth_writes =
2127     *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
2128     *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
2129     *
2130     *    stencil_writes =
2131     *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
2132     *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
2133     *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
2134     *
2135     *    Z_PMA_OPT =
2136     *       common_pma_fix &&
2137     *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
2138     *       ((killpixels && (depth_writes || stencil_writes)) ||
2139     *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
2140     *
2141     */
2142    if (!cso_zsa->depth_test_enabled)
2143       return false;
2144 
2145    return fs_data->computed_depth_mode != PSCDEPTH_OFF ||
2146           (killpixels && (cso_zsa->depth_writes_enabled ||
2147                           (sres && cso_zsa->stencil_writes_enabled)));
2148 }
2149 #endif
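
/* Worked example (illustrative): on Gfx8 with a HiZ-enabled depth buffer,
 * depth test on, no early fragment tests, a fragment shader that uses
 * discard (killpixels) and depth writes enabled, every term of the Z_PMA_OPT
 * equation above is satisfied, so want_pma_fix() returns true and
 * genX(update_pma_fix)() below switches CACHE_MODE_1 accordingly.
 */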
2150 
2151 void
2152 genX(update_pma_fix)(struct iris_context *ice,
2153                      struct iris_batch *batch,
2154                      bool enable)
2155 {
2156 #if GFX_VER == 8
2157    struct iris_genx_state *genx = ice->state.genx;
2158 
2159    if (genx->pma_fix_enabled == enable)
2160       return;
2161 
2162    genx->pma_fix_enabled = enable;
2163 
2164    /* According to the Broadwell PIPE_CONTROL documentation, software should
2165     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2166     * prior to the LRI.  If stencil buffer writes are enabled, then a Render
2167     * Cache Flush is also necessary.
2168     *
2169     * The Gfx9 docs say to use a depth stall rather than a command streamer
2170     * stall.  However, the hardware seems to violently disagree.  A full
2171     * command streamer stall seems to be needed in both cases.
2172     */
2173    iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2174                                 PIPE_CONTROL_CS_STALL |
2175                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2176                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2177 
2178    iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
2179       reg.NPPMAFixEnable = enable;
2180       reg.NPEarlyZFailsDisable = enable;
2181       reg.NPPMAFixEnableMask = true;
2182       reg.NPEarlyZFailsDisableMask = true;
2183    }
2184 
2185    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2186     * Flush bits is often necessary.  We do it regardless because it's easier.
2187     * The render cache flush is also necessary if stencil writes are enabled.
2188     *
2189     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
2190     * flushes seem to work just as well.
2191     */
2192    iris_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
2193                                 PIPE_CONTROL_DEPTH_STALL |
2194                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2195                                 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2196 #endif
2197 }
2198 
2199 /**
2200  * Gallium CSO for rasterizer state.
2201  */
2202 struct iris_rasterizer_state {
2203    uint32_t sf[GENX(3DSTATE_SF_length)];
2204    uint32_t clip[GENX(3DSTATE_CLIP_length)];
2205    uint32_t raster[GENX(3DSTATE_RASTER_length)];
2206    uint32_t wm[GENX(3DSTATE_WM_length)];
2207    uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
2208 
2209    uint8_t num_clip_plane_consts;
2210    bool clip_halfz; /* for CC_VIEWPORT */
2211    bool depth_clip_near; /* for CC_VIEWPORT */
2212    bool depth_clip_far; /* for CC_VIEWPORT */
2213    bool flatshade; /* for shader state */
2214    bool flatshade_first; /* for stream output */
2215    bool clamp_fragment_color; /* for shader state */
2216    bool light_twoside; /* for shader state */
2217    bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
2218    bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
2219    bool line_smooth;
2220    bool line_stipple_enable;
2221    bool poly_stipple_enable;
2222    bool multisample;
2223    bool force_persample_interp;
2224    bool conservative_rasterization;
2225    bool fill_mode_point;
2226    bool fill_mode_line;
2227    bool fill_mode_point_or_line;
2228    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
2229    uint16_t sprite_coord_enable;
2230 };
2231 
2232 static float
2233 get_line_width(const struct pipe_rasterizer_state *state)
2234 {
2235    float line_width = state->line_width;
2236 
2237    /* From the OpenGL 4.4 spec:
2238     *
2239     * "The actual width of non-antialiased lines is determined by rounding
2240     *  the supplied width to the nearest integer, then clamping it to the
2241     *  implementation-dependent maximum non-antialiased line width."
2242     */
2243    if (!state->multisample && !state->line_smooth)
2244       line_width = roundf(state->line_width);
2245 
2246    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
2247       /* For 1 pixel line thickness or less, the general anti-aliasing
2248        * algorithm gives up, and a garbage line is generated.  Setting a
2249        * Line Width of 0.0 specifies the rasterization of the "thinnest"
2250        * (one-pixel-wide), non-antialiased lines.
2251        *
2252        * Lines rendered with zero Line Width are rasterized using the
2253        * "Grid Intersection Quantization" rules as specified by the
2254        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
2255        */
2256       line_width = 0.0f;
2257    }
2258 
2259    return line_width;
2260 }
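
/* Worked examples (illustrative): with multisampling and smoothing off, a
 * requested width of 1.4 rounds to 1.0; with line smoothing on and a width
 * below 1.5, the width is forced to 0.0 so the hardware falls back to
 * "cosmetic" grid-intersection rasterization; multisampled lines keep the
 * requested width unchanged.
 */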
2261 
2262 /**
2263  * The pipe->create_rasterizer_state() driver hook.
2264  */
2265 static void *
2266 iris_create_rasterizer_state(struct pipe_context *ctx,
2267                              const struct pipe_rasterizer_state *state)
2268 {
2269    struct iris_rasterizer_state *cso =
2270       malloc(sizeof(struct iris_rasterizer_state));
2271 
2272    cso->multisample = state->multisample;
2273    cso->force_persample_interp = state->force_persample_interp;
2274    cso->clip_halfz = state->clip_halfz;
2275    cso->depth_clip_near = state->depth_clip_near;
2276    cso->depth_clip_far = state->depth_clip_far;
2277    cso->flatshade = state->flatshade;
2278    cso->flatshade_first = state->flatshade_first;
2279    cso->clamp_fragment_color = state->clamp_fragment_color;
2280    cso->light_twoside = state->light_twoside;
2281    cso->rasterizer_discard = state->rasterizer_discard;
2282    cso->half_pixel_center = state->half_pixel_center;
2283    cso->sprite_coord_mode = state->sprite_coord_mode;
2284    cso->sprite_coord_enable = state->sprite_coord_enable;
2285    cso->line_smooth = state->line_smooth;
2286    cso->line_stipple_enable = state->line_stipple_enable;
2287    cso->poly_stipple_enable = state->poly_stipple_enable;
2288    cso->conservative_rasterization =
2289       state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
2290 
2291    cso->fill_mode_point =
2292       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2293       state->fill_back == PIPE_POLYGON_MODE_POINT;
2294    cso->fill_mode_line =
2295       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2296       state->fill_back == PIPE_POLYGON_MODE_LINE;
2297    cso->fill_mode_point_or_line =
2298       cso->fill_mode_point ||
2299       cso->fill_mode_line;
2300 
2301    if (state->clip_plane_enable != 0)
2302       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2303    else
2304       cso->num_clip_plane_consts = 0;
2305 
2306    float line_width = get_line_width(state);
2307 
2308    iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2309       sf.StatisticsEnable = true;
2310       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2311       sf.LineEndCapAntialiasingRegionWidth =
2312          state->line_smooth ? _10pixels : _05pixels;
2313       sf.LastPixelEnable = state->line_last_pixel;
2314       sf.LineWidth = line_width;
2315       sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
2316                              !state->point_quad_rasterization;
2317       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2318       sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
2319 
2320       if (state->flatshade_first) {
2321          sf.TriangleFanProvokingVertexSelect = 1;
2322       } else {
2323          sf.TriangleStripListProvokingVertexSelect = 2;
2324          sf.TriangleFanProvokingVertexSelect = 2;
2325          sf.LineStripListProvokingVertexSelect = 1;
2326       }
2327    }
2328 
2329    iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2330       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2331       rr.CullMode = translate_cull_mode(state->cull_face);
2332       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2333       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2334       rr.DXMultisampleRasterizationEnable = state->multisample;
2335       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2336       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2337       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2338       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2339       rr.GlobalDepthOffsetScale = state->offset_scale;
2340       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2341       rr.SmoothPointEnable = state->point_smooth;
2342       rr.ScissorRectangleEnable = state->scissor;
2343 #if GFX_VER >= 9
2344       rr.ViewportZNearClipTestEnable = state->depth_clip_near;
2345       rr.ViewportZFarClipTestEnable = state->depth_clip_far;
2346       rr.ConservativeRasterizationEnable =
2347          cso->conservative_rasterization;
2348 #else
2349       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2350 #endif
2351    }
2352 
2353    iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2354       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2355        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2356        */
2357       cl.EarlyCullEnable = true;
2358       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2359       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2360       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2361       cl.GuardbandClipTestEnable = true;
2362       cl.ClipEnable = true;
2363       cl.MinimumPointWidth = 0.125;
2364       cl.MaximumPointWidth = 255.875;
2365 
2366       if (state->flatshade_first) {
2367          cl.TriangleFanProvokingVertexSelect = 1;
2368       } else {
2369          cl.TriangleStripListProvokingVertexSelect = 2;
2370          cl.TriangleFanProvokingVertexSelect = 2;
2371          cl.LineStripListProvokingVertexSelect = 1;
2372       }
2373    }
2374 
2375    iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
2376       /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
2377        * filled in at draw time from the FS program.
2378        */
2379       wm.LineAntialiasingRegionWidth = _10pixels;
2380       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2381       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2382       wm.LineStippleEnable = state->line_stipple_enable;
2383       wm.PolygonStippleEnable = state->poly_stipple_enable;
2384    }
2385 
2386    /* Remap from 0..255 back to 1..256 */
2387    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2388 
2389    iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2390       if (state->line_stipple_enable) {
2391          line.LineStipplePattern = state->line_stipple_pattern;
2392          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2393          line.LineStippleRepeatCount = line_stipple_factor;
2394       }
2395    }
2396 
2397    return cso;
2398 }
2399 
2400 /**
2401  * The pipe->bind_rasterizer_state() driver hook.
2402  *
2403  * Bind a rasterizer CSO and flag related dirty bits.
2404  */
2405 static void
2406 iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2407 {
2408    struct iris_context *ice = (struct iris_context *) ctx;
2409    struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
2410    struct iris_rasterizer_state *new_cso = state;
2411 
2412    if (new_cso) {
2413       /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2414       if (cso_changed_memcmp(line_stipple))
2415          ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
2416 
2417       if (cso_changed(half_pixel_center))
2418          ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
2419 
2420       if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
2421          ice->state.dirty |= IRIS_DIRTY_WM;
2422 
2423       if (cso_changed(rasterizer_discard))
2424          ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
2425 
2426       if (cso_changed(flatshade_first))
2427          ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
2428 
2429       if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
2430           cso_changed(clip_halfz))
2431          ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2432 
2433       if (cso_changed(sprite_coord_enable) ||
2434           cso_changed(sprite_coord_mode) ||
2435           cso_changed(light_twoside))
2436          ice->state.dirty |= IRIS_DIRTY_SBE;
2437 
2438       if (cso_changed(conservative_rasterization))
2439          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
2440    }
2441 
2442    ice->state.cso_rast = new_cso;
2443    ice->state.dirty |= IRIS_DIRTY_RASTER;
2444    ice->state.dirty |= IRIS_DIRTY_CLIP;
2445    ice->state.stage_dirty |=
2446       ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
2447 }
2448 
2449 /**
2450  * Return true if the given wrap mode requires the border color to exist.
2451  *
2452  * (We can skip uploading it if the sampler isn't going to use it.)
2453  */
2454 static bool
2455 wrap_mode_needs_border_color(unsigned wrap_mode)
2456 {
2457    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2458 }
2459 
2460 /**
2461  * Gallium CSO for sampler state.
2462  */
2463 struct iris_sampler_state {
2464    union pipe_color_union border_color;
2465    bool needs_border_color;
2466 
2467    uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
2468 
2469 #if GFX_VERx10 == 125
2470    /* Sampler state structure to use for 3D textures in order to
2471     * implement Wa_14014414195.
2472     */
2473    uint32_t sampler_state_3d[GENX(SAMPLER_STATE_length)];
2474 #endif
2475 };
2476 
2477 static void
2478 fill_sampler_state(uint32_t *sampler_state,
2479                    const struct pipe_sampler_state *state,
2480                    unsigned max_anisotropy)
2481 {
2482    float min_lod = state->min_lod;
2483    unsigned mag_img_filter = state->mag_img_filter;
2484 
2485    // XXX: explain this code ported from ilo...I don't get it at all...
2486    if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2487        state->min_lod > 0.0f) {
2488       min_lod = 0.0f;
2489       mag_img_filter = state->min_img_filter;
2490    }
2491 
2492    iris_pack_state(GENX(SAMPLER_STATE), sampler_state, samp) {
2493       samp.TCXAddressControlMode = translate_wrap(state->wrap_s);
2494       samp.TCYAddressControlMode = translate_wrap(state->wrap_t);
2495       samp.TCZAddressControlMode = translate_wrap(state->wrap_r);
2496       samp.CubeSurfaceControlMode = state->seamless_cube_map;
2497       samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2498       samp.MinModeFilter = state->min_img_filter;
2499       samp.MagModeFilter = mag_img_filter;
2500       samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2501       samp.MaximumAnisotropy = RATIO21;
2502 
2503       if (max_anisotropy >= 2) {
2504          if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2505 #if GFX_VER >= 30
2506             samp.MinModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2507 #else
2508             samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2509 #endif
2510             samp.AnisotropicAlgorithm = EWAApproximation;
2511          }
2512 
2513          if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR) {
2514 #if GFX_VER >= 30
2515             samp.MagModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2516 #else
2517             samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2518 #endif
2519          }
2520 
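         /* Illustrative mapping (assuming the HW encoding runs RATIO21 = 0
          * up to RATIO161 = 7): a pipe max_anisotropy of 2 encodes as 0,
          * 4 as 1, 8 as 3 and 16 as 7, clamped by the MIN2 below.
          */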
2521          samp.MaximumAnisotropy =
2522             MIN2((max_anisotropy - 2) / 2, RATIO161);
2523       }
2524 
2525       /* Set address rounding bits if not using nearest filtering. */
2526       if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2527          samp.UAddressMinFilterRoundingEnable = true;
2528          samp.VAddressMinFilterRoundingEnable = true;
2529          samp.RAddressMinFilterRoundingEnable = true;
2530       }
2531 
2532       if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2533          samp.UAddressMagFilterRoundingEnable = true;
2534          samp.VAddressMagFilterRoundingEnable = true;
2535          samp.RAddressMagFilterRoundingEnable = true;
2536       }
2537 
2538       if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2539          samp.ShadowFunction = translate_shadow_func(state->compare_func);
2540 
2541       const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2542 
2543       samp.LODPreClampMode = CLAMP_MODE_OGL;
2544       samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2545       samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2546       samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2547 
2548       /* .BorderColorPointer is filled in by iris_upload_sampler_states. */
2549    }
2550 }
2551 
2552 /**
2553  * The pipe->create_sampler_state() driver hook.
2554  *
2555  * We fill out SAMPLER_STATE (except for the border color pointer), and
2556  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2557  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2558  * all bound sampler states to be in contiguous memory.
2559  */
2560 static void *
2561 iris_create_sampler_state(struct pipe_context *ctx,
2562                           const struct pipe_sampler_state *state)
2563 {
2564    UNUSED struct iris_screen *screen = (void *)ctx->screen;
2565    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2566    struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
2567 
2568    if (!cso)
2569       return NULL;
2570 
2571    STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2572    STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2573 
2574    unsigned wrap_s = translate_wrap(state->wrap_s);
2575    unsigned wrap_t = translate_wrap(state->wrap_t);
2576    unsigned wrap_r = translate_wrap(state->wrap_r);
2577 
2578    memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2579 
2580    cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2581                              wrap_mode_needs_border_color(wrap_t) ||
2582                              wrap_mode_needs_border_color(wrap_r);
2583 
2584    fill_sampler_state(cso->sampler_state, state, state->max_anisotropy);
2585 
2586 #if GFX_VERx10 == 125
2587    /* Fill an extra sampler state structure with anisotropic filtering
2588     * disabled used to implement Wa_14014414195.
2589     */
2590    if (intel_needs_workaround(screen->devinfo, 14014414195))
2591       fill_sampler_state(cso->sampler_state_3d, state, 0);
2592 #endif
2593 
2594    return cso;
2595 }
2596 
2597 /**
2598  * The pipe->bind_sampler_states() driver hook.
2599  */
2600 static void
2601 iris_bind_sampler_states(struct pipe_context *ctx,
2602                          enum pipe_shader_type p_stage,
2603                          unsigned start, unsigned count,
2604                          void **states)
2605 {
2606    struct iris_context *ice = (struct iris_context *) ctx;
2607    gl_shader_stage stage = stage_from_pipe(p_stage);
2608    struct iris_shader_state *shs = &ice->state.shaders[stage];
2609 
2610    assert(start + count <= IRIS_MAX_SAMPLERS);
2611 
2612    bool dirty = false;
2613 
2614    for (int i = 0; i < count; i++) {
2615       struct iris_sampler_state *state = states ? states[i] : NULL;
2616       if (shs->samplers[start + i] != state) {
2617          shs->samplers[start + i] = state;
2618          dirty = true;
2619       }
2620    }
2621 
2622    if (dirty)
2623       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2624 }
2625 
2626 /**
2627  * Upload the sampler states into a contiguous area of GPU memory, for
2628  * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2629  *
2630  * Also fill out the border color state pointers.
2631  */
2632 static void
2633 iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
2634 {
2635    struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen;
2636    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
2637    struct iris_shader_state *shs = &ice->state.shaders[stage];
2638    struct iris_border_color_pool *border_color_pool =
2639       iris_bufmgr_get_border_color_pool(screen->bufmgr);
2640 
2641    /* We assume gallium frontends will call pipe->bind_sampler_states()
2642     * if the program's number of textures changes.
2643     */
2644    unsigned count = util_last_bit64(shader->bt.samplers_used_mask);
2645 
2646    if (!count)
2647       return;
2648 
2649    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2650     * in the dynamic state memory zone, so we can point to it via the
2651     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2652     */
2653    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
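   /* For example (assuming SAMPLER_STATE is 4 DWords, i.e. 16 bytes, on
    * these generations), three live samplers need a 48-byte table,
    * uploaded with 32-byte alignment below.
    */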
2654    uint32_t *map =
2655       upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
2656    if (unlikely(!map))
2657       return;
2658 
2659    struct pipe_resource *res = shs->sampler_table.res;
2660    struct iris_bo *bo = iris_resource_bo(res);
2661 
2662    iris_record_state_size(ice->state.sizes,
2663                           bo->address + shs->sampler_table.offset, size);
2664 
2665    shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);
2666 
2667    ice->state.need_border_colors &= ~(1 << stage);
2668 
2669    for (int i = 0; i < count; i++) {
2670       struct iris_sampler_state *state = shs->samplers[i];
2671       struct iris_sampler_view *tex = shs->textures[i];
2672 
2673       if (!state) {
2674          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2675       } else {
2676          const uint32_t *sampler_state = state->sampler_state;
2677 
2678 #if GFX_VERx10 == 125
2679          if (intel_needs_workaround(screen->devinfo, 14014414195) &&
2680              tex && tex->res->base.b.target == PIPE_TEXTURE_3D) {
2681                sampler_state = state->sampler_state_3d;
2682          }
2683 #endif
2684 
2685          if (!state->needs_border_color) {
2686             memcpy(map, sampler_state, 4 * GENX(SAMPLER_STATE_length));
2687          } else {
2688             ice->state.need_border_colors |= 1 << stage;
2689 
2690             /* We may need to swizzle the border color for format faking.
2691              * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2692              * This means we need to move the border color's A channel into
2693              * the R or G channels so that those read swizzles will move it
2694              * back into A.
2695              */
2696             union pipe_color_union *color = &state->border_color;
2697             union pipe_color_union tmp;
2698             if (tex) {
2699                enum pipe_format internal_format = tex->res->internal_format;
2700 
2701                if (util_format_is_alpha(internal_format)) {
2702                   unsigned char swz[4] = {
2703                      PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
2704                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2705                   };
2706                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2707                   color = &tmp;
2708                } else if (util_format_is_luminance_alpha(internal_format) &&
2709                           internal_format != PIPE_FORMAT_L8A8_SRGB) {
2710                   unsigned char swz[4] = {
2711                      PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
2712                      PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2713                   };
2714                   util_format_apply_color_swizzle(&tmp, color, swz, true);
2715                   color = &tmp;
2716                }
2717             }
2718 
2719             /* Stream out the border color and merge the pointer. */
2720             uint32_t offset = iris_upload_border_color(border_color_pool,
2721                                                        color);
2722 
2723             uint32_t dynamic[GENX(SAMPLER_STATE_length)];
2724             iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
2725                dyns.BorderColorPointer = offset;
2726             }
2727 
2728             for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
2729                map[j] = sampler_state[j] | dynamic[j];
2730          }
2731       }
2732 
2733       map += GENX(SAMPLER_STATE_length);
2734    }
2735 }
2736 
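/**
 * Compose a pipe_swizzle from the sampler view with the format's own channel
 * swizzle, so that formats faked with a different hardware layout (e.g. alpha
 * stored in red) still return the expected channels.
 */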
2737 static enum isl_channel_select
2738 fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2739 {
2740    switch (swz) {
2741    case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2742    case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2743    case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2744    case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2745    case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2746    case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2747    default: unreachable("invalid swizzle");
2748    }
2749 }
2750 
2751 static void
2752 fill_buffer_surface_state(struct isl_device *isl_dev,
2753                           struct iris_resource *res,
2754                           void *map,
2755                           enum isl_format format,
2756                           struct isl_swizzle swizzle,
2757                           unsigned offset,
2758                           unsigned size,
2759                           isl_surf_usage_flags_t usage)
2760 {
2761    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2762    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2763 
2764    /* The ARB_texture_buffer_object specification says:
2765     *
2766     *    "The number of texels in the buffer texture's texel array is given by
2767     *
2768     *       floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2769     *
2770     *     where <buffer_size> is the size of the buffer object, in basic
2771     *     machine units and <components> and <base_type> are the element count
2772     *     and base data type for elements, as specified in Table X.1.  The
2773     *     number of texels in the texel array is then clamped to the
2774     *     implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2775     *
2776     * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2777     * so that when ISL divides by stride to obtain the number of texels, that
2778     * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2779     */
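   /* For example, a buffer texture with an RGBA32F view (cpp == 16) is
    * clamped both to the bytes remaining in the BO after res->offset + offset
    * and to IRIS_MAX_TEXTURE_BUFFER_SIZE * 16 bytes.
    */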
2780    unsigned final_size =
2781       MIN3(size, res->bo->size - res->offset - offset,
2782            IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
2783 
2784    isl_buffer_fill_state(isl_dev, map,
2785                          .address = res->bo->address + res->offset + offset,
2786                          .size_B = final_size,
2787                          .format = format,
2788                          .swizzle = swizzle,
2789                          .stride_B = cpp,
2790                          .mocs = iris_mocs(res->bo, isl_dev, usage));
2791 }
2792 
2793 #define SURFACE_STATE_ALIGNMENT 64
2794 
2795 /**
2796  * Allocate several contiguous SURFACE_STATE structures, one for each
2797  * supported auxiliary surface mode.  This only allocates the CPU-side
2798  * copy, they will need to be uploaded later after they're filled in.
2799  */
2800 static void
2801 alloc_surface_states(struct iris_surface_state *surf_state,
2802                      unsigned aux_usages)
2803 {
2804    enum { surf_size = 4 * GENX(RENDER_SURFACE_STATE_length) };
2805 
2806    /* If this changes, update this to explicitly align pointers */
2807    STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2808 
2809    assert(aux_usages != 0);
2810 
2811    /* In case we're re-allocating them... */
2812    free(surf_state->cpu);
2813 
2814    surf_state->aux_usages = aux_usages;
2815    surf_state->num_states = util_bitcount(aux_usages);
2816    surf_state->cpu = calloc(surf_state->num_states, surf_size);
2817    surf_state->ref.offset = 0;
2818    pipe_resource_reference(&surf_state->ref.res, NULL);
2819 
2820    assert(surf_state->cpu);
2821 }
2822 
2823 /**
2824  * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2825  */
2826 static void
2827 upload_surface_states(struct u_upload_mgr *mgr,
2828                       struct iris_surface_state *surf_state)
2829 {
2830    const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2831    const unsigned bytes = surf_state->num_states * surf_size;
2832 
2833    void *map =
2834       upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2835 
2836    surf_state->ref.offset +=
2837       iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2838 
2839    if (map)
2840       memcpy(map, surf_state->cpu, bytes);
2841 }
2842 
2843 /**
2844  * Update resource addresses in a set of SURFACE_STATE descriptors,
2845  * and re-upload them if necessary.
2846  */
2847 static bool
2848 update_surface_state_addrs(struct u_upload_mgr *mgr,
2849                            struct iris_surface_state *surf_state,
2850                            struct iris_bo *bo)
2851 {
2852    if (surf_state->bo_address == bo->address)
2853       return false;
2854 
2855    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2856    STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2857 
2858    uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2859 
2860    /* First, update the CPU copies.  We assume no other fields exist in
2861     * the QWord containing Surface Base Address.
2862     */
2863    for (unsigned i = 0; i < surf_state->num_states; i++) {
2864       *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2865       ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2866    }
2867 
2868    /* Next, upload the updated copies to a GPU buffer. */
2869    upload_surface_states(mgr, surf_state);
2870 
2871    surf_state->bo_address = bo->address;
2872 
2873    return true;
2874 }
2875 
2876 /* Use this function only when surf needs to be filled out with information
2877  * provided by the pipe_(image|sampler)_view.  This is only necessary for the
2878  * CL extension cl_khr_image2d_from_buffer, which is why ISL_SURF_DIM_2D is
2879  * hard-coded in the dim field.
2880  */
2881 static void
2882 fill_surf_for_tex2d_from_buffer(struct isl_device *isl_dev,
2883                                 enum isl_format format,
2884                                 unsigned width,
2885                                 unsigned height,
2886                                 unsigned row_stride,
2887                                 isl_surf_usage_flags_t usage,
2888                                 struct isl_surf *surf)
2889 {
2890    const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2891    const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2892 
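   /* row_stride is given in texels; ISL wants the pitch in bytes. */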
2893    const struct isl_surf_init_info init_info = {
2894       .dim = ISL_SURF_DIM_2D,
2895       .format = format,
2896       .width = width,
2897       .height = height,
2898       .depth = 1,
2899       .levels = 1,
2900       .array_len = 1,
2901       .samples = 1,
2902       .min_alignment_B = 4,
2903       .row_pitch_B = row_stride * cpp,
2904       .usage = usage,
2905       .tiling_flags = ISL_TILING_LINEAR_BIT,
2906    };
2907 
2908    const bool isl_surf_created_successfully =
2909       isl_surf_init_s(isl_dev, surf, &init_info);
2910 
2911    assert(isl_surf_created_successfully);
2912 }
2913 
2914 static void
2915 fill_surface_state(struct isl_device *isl_dev,
2916                    void *map,
2917                    struct iris_resource *res,
2918                    struct isl_surf *surf,
2919                    struct isl_view *view,
2920                    unsigned aux_usage,
2921                    uint32_t extra_main_offset,
2922                    uint32_t tile_x_sa,
2923                    uint32_t tile_y_sa)
2924 {
2925    struct isl_surf_fill_state_info f = {
2926       .surf = surf,
2927       .view = view,
2928       .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2929       .address = res->bo->address + res->offset + extra_main_offset,
2930       .x_offset_sa = tile_x_sa,
2931       .y_offset_sa = tile_y_sa,
2932    };
2933 
2934    if (aux_usage != ISL_AUX_USAGE_NONE) {
2935       f.aux_surf = &res->aux.surf;
2936       f.aux_usage = aux_usage;
2937       f.clear_color = res->aux.clear_color;
2938 
2939       if (aux_usage == ISL_AUX_USAGE_MC)
2940          f.mc_format = iris_format_for_usage(isl_dev->info,
2941                                              res->external_format,
2942                                              surf->usage).fmt;
2943 
2944       if (res->aux.bo)
2945          f.aux_address = res->aux.bo->address + res->aux.offset;
2946 
2947       if (res->aux.clear_color_bo) {
2948          f.clear_address = res->aux.clear_color_bo->address +
2949                            res->aux.clear_color_offset;
2950          f.use_clear_address = isl_dev->info->ver > 9;
2951       }
2952    }
2953 
2954    isl_surf_fill_state_s(isl_dev, map, &f);
2955 }
2956 
2957 static void
2958 fill_surface_states(struct isl_device *isl_dev,
2959                     struct iris_surface_state *surf_state,
2960                     struct iris_resource *res,
2961                     struct isl_surf *surf,
2962                     struct isl_view *view,
2963                     uint64_t extra_main_offset,
2964                     uint32_t tile_x_sa,
2965                     uint32_t tile_y_sa)
2966 {
2967    void *map = surf_state->cpu;
2968    unsigned aux_modes = surf_state->aux_usages;
2969 
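   /* Emit one SURFACE_STATE per enabled aux usage, in bit order, matching
    * the contiguous layout set up by alloc_surface_states().
    */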
2970    while (aux_modes) {
2971       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2972 
2973       fill_surface_state(isl_dev, map, res, surf, view, aux_usage,
2974                          extra_main_offset, tile_x_sa, tile_y_sa);
2975 
2976       map += SURFACE_STATE_ALIGNMENT;
2977    }
2978 }
2979 
2980 /**
2981  * The pipe->create_sampler_view() driver hook.
2982  */
2983 static struct pipe_sampler_view *
2984 iris_create_sampler_view(struct pipe_context *ctx,
2985                          struct pipe_resource *tex,
2986                          const struct pipe_sampler_view *tmpl)
2987 {
2988    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2989    const struct intel_device_info *devinfo = screen->devinfo;
2990    struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
2991 
2992    if (!isv)
2993       return NULL;
2994 
2995    /* initialize base object */
2996    isv->base = *tmpl;
2997    isv->base.context = ctx;
2998    isv->base.texture = NULL;
2999    pipe_reference_init(&isv->base.reference, 1);
3000    pipe_resource_reference(&isv->base.texture, tex);
3001 
3002    if (util_format_is_depth_or_stencil(tmpl->format)) {
3003       struct iris_resource *zres, *sres;
3004       const struct util_format_description *desc =
3005          util_format_description(tmpl->format);
3006 
3007       iris_get_depth_stencil_resources(tex, &zres, &sres);
3008 
3009       tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
3010    }
3011 
3012    isv->res = (struct iris_resource *) tex;
3013 
3014    isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
3015 
3016    if (isv->base.target == PIPE_TEXTURE_CUBE ||
3017        isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
3018       usage |= ISL_SURF_USAGE_CUBE_BIT;
3019 
3020    const struct iris_format_info fmt =
3021       iris_format_for_usage(devinfo, tmpl->format, usage);
3022 
3023    isv->clear_color = isv->res->aux.clear_color;
3024 
3025    isv->view = (struct isl_view) {
3026       .format = fmt.fmt,
3027       .swizzle = (struct isl_swizzle) {
3028          .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
3029          .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
3030          .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
3031          .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
3032       },
3033       .usage = usage,
3034    };
3035 
3036    unsigned aux_usages = 0;
3037 
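   /* Decide which aux usages need surface states: if the view format can't
    * be sampled with the resource's CCS compression, or HiZ sampling isn't
    * possible, fall back to ISL_AUX_USAGE_NONE only; otherwise keep both the
    * uncompressed variant and the resource's aux usage.
    */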
3038    if ((isv->res->aux.usage == ISL_AUX_USAGE_CCS_D ||
3039         isv->res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3040         isv->res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3041        !isl_format_supports_ccs_e(devinfo, isv->view.format)) {
3042       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3043    } else if (isl_aux_usage_has_hiz(isv->res->aux.usage) &&
3044               !iris_sample_with_depth_aux(devinfo, isv->res)) {
3045       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3046    } else {
3047       aux_usages = 1 << ISL_AUX_USAGE_NONE |
3048                    1 << isv->res->aux.usage;
3049    }
3050 
3051    alloc_surface_states(&isv->surface_state, aux_usages);
3052    isv->surface_state.bo_address = isv->res->bo->address;
3053 
3054    /* Fill out SURFACE_STATE for this view. */
3055    if (tmpl->target != PIPE_BUFFER) {
3056       isv->view.base_level = tmpl->u.tex.first_level;
3057       isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
3058 
3059       if (tmpl->target == PIPE_TEXTURE_3D) {
3060          isv->view.base_array_layer = 0;
3061          isv->view.array_len = 1;
3062       } else {
3063 #if GFX_VER < 9
3064          /* Hardware older than skylake ignores this value */
3065          assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
3066 #endif
3067          isv->view.base_array_layer = tmpl->u.tex.first_layer;
3068          isv->view.array_len =
3069             tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3070       }
3071 
3072       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3073                           &isv->res->surf, &isv->view, 0, 0, 0);
3074    } else if (isv->base.is_tex2d_from_buf) {
3075       /* This is a 2D image created from a buffer, so fill the surface
3076        * states using the image parameters provided by the CL application.
3078        */
3079       isv->view.base_array_layer = 0;
3080       isv->view.array_len = 1;
3081 
3082       /* Create temp_surf and fill with values provided by CL application */
3083       struct isl_surf temp_surf;
3084       fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt.fmt,
3085                                       isv->base.u.tex2d_from_buf.width,
3086                                       isv->base.u.tex2d_from_buf.height,
3087                                       isv->base.u.tex2d_from_buf.row_stride,
3088                                       usage,
3089                                       &temp_surf);
3090 
3091       fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3092                           &temp_surf, &isv->view, 0, 0, 0);
3093    } else {
3094       fill_buffer_surface_state(&screen->isl_dev, isv->res,
3095                                 isv->surface_state.cpu,
3096                                 isv->view.format, isv->view.swizzle,
3097                                 tmpl->u.buf.offset, tmpl->u.buf.size,
3098                                 ISL_SURF_USAGE_TEXTURE_BIT);
3099    }
3100 
3101    return &isv->base;
3102 }
3103 
3104 static void
3105 iris_sampler_view_destroy(struct pipe_context *ctx,
3106                           struct pipe_sampler_view *state)
3107 {
3108    struct iris_sampler_view *isv = (void *) state;
3109    pipe_resource_reference(&state->texture, NULL);
3110    pipe_resource_reference(&isv->surface_state.ref.res, NULL);
3111    free(isv->surface_state.cpu);
3112    free(isv);
3113 }
3114 
3115 /**
3116  * The pipe->create_surface() driver hook.
3117  *
3118  * In Gallium nomenclature, "surfaces" are a view of a resource that
3119  * can be bound as a render target or depth/stencil buffer.
3120  */
3121 static struct pipe_surface *
3122 iris_create_surface(struct pipe_context *ctx,
3123                     struct pipe_resource *tex,
3124                     const struct pipe_surface *tmpl)
3125 {
3126    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3127    const struct intel_device_info *devinfo = screen->devinfo;
3128 
3129    isl_surf_usage_flags_t usage = 0;
3130    if (tmpl->writable)
3131       usage = ISL_SURF_USAGE_STORAGE_BIT;
3132    else if (util_format_is_depth_or_stencil(tmpl->format))
3133       usage = ISL_SURF_USAGE_DEPTH_BIT;
3134    else
3135       usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
3136 
3137    const struct iris_format_info fmt =
3138       iris_format_for_usage(devinfo, tmpl->format, usage);
3139 
3140    if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
3141        !isl_format_supports_rendering(devinfo, fmt.fmt)) {
3142       /* Framebuffer validation will reject this invalid case, but it
3143        * hasn't had the opportunity yet.  In the meantime, we need to
3144        * avoid hitting ISL asserts about unsupported formats below.
3145        */
3146       return NULL;
3147    }
3148 
3149    struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
3150    struct iris_resource *res = (struct iris_resource *) tex;
3151 
3152    if (!surf)
3153       return NULL;
3154 
3155    uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3156 
3157    struct isl_view *view = &surf->view;
3158    *view = (struct isl_view) {
3159       .format = fmt.fmt,
3160       .base_level = tmpl->u.tex.level,
3161       .levels = 1,
3162       .base_array_layer = tmpl->u.tex.first_layer,
3163       .array_len = array_len,
3164       .swizzle = ISL_SWIZZLE_IDENTITY,
3165       .usage = usage,
3166    };
3167 
3168 #if GFX_VER == 8
3169    struct isl_view *read_view = &surf->read_view;
3170    *read_view = (struct isl_view) {
3171       .format = fmt.fmt,
3172       .base_level = tmpl->u.tex.level,
3173       .levels = 1,
3174       .base_array_layer = tmpl->u.tex.first_layer,
3175       .array_len = array_len,
3176       .swizzle = ISL_SWIZZLE_IDENTITY,
3177       .usage = ISL_SURF_USAGE_TEXTURE_BIT,
3178    };
3179 
3180    struct isl_surf read_surf = res->surf;
3181    uint64_t read_surf_offset_B = 0;
3182    uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
3183    if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
3184       /* The minimum array element field of the surface state structure is
3185        * ignored by the sampler unit for 3D textures on some hardware.  If the
3186        * render buffer is a single slice of a 3D texture, create a 2D texture
3187        * covering that slice.
3188        *
3189        * TODO: This only handles the case where we're rendering to a single
3190        * slice of an array texture.  If we have layered rendering combined
3191        * with non-coherent FB fetch and a non-zero base_array_layer, then
3192        * we're going to run into problems.
3193        *
3194        * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
3195        */
3196       isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
3197                               read_view->base_level,
3198                               0, read_view->base_array_layer,
3199                               &read_surf, &read_surf_offset_B,
3200                               &read_surf_tile_x_sa, &read_surf_tile_y_sa);
3201       read_view->base_level = 0;
3202       read_view->base_array_layer = 0;
3203       assert(read_view->array_len == 1);
3204    } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
3205       /* Convert 1D array textures to 2D arrays because shaders always provide
3206        * the array index coordinate at the Z component to avoid recompiles
3207        * when changing the texture target of the framebuffer.
3208        */
3209       assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
3210       read_surf.dim = ISL_SURF_DIM_2D;
3211    }
3212 #endif
3213 
3214    struct isl_surf isl_surf = res->surf;
3215    uint64_t offset_B = 0;
3216    uint32_t tile_x_el = 0, tile_y_el = 0;
3217    if (isl_format_is_compressed(res->surf.format)) {
3218       /* The resource has a compressed format, which is not renderable, but we
3219        * have a renderable view format.  We must be attempting to upload
3220        * blocks of compressed data via an uncompressed view.
3221        *
3222        * In this case, we can assume there are no auxiliary surfaces, a single
3223        * miplevel, and that the resource is single-sampled.  Gallium may try
3224        * and create an uncompressed view with multiple layers, however.
3225        */
3226       assert(res->aux.surf.size_B == 0);
3227       assert(res->surf.samples == 1);
3228       assert(view->levels == 1);
3229 
3230       bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev,
3231                                                &res->surf, view,
3232                                                &isl_surf, view, &offset_B,
3233                                                &tile_x_el, &tile_y_el);
3234 
3235       /* On Broadwell, HALIGN and VALIGN are specified in pixels and are
3236        * hard-coded to align to exactly the block size of the compressed
3237        * texture. This means that, when reinterpreted as a non-compressed
3238        * texture, the tile offsets may be anything.
3239        *
3240        * We need them to be multiples of 4 to be usable in RENDER_SURFACE_STATE,
3241        * so force the state tracker to take fallback paths if they're not.
3242        */
3243 #if GFX_VER == 8
3244       if (tile_x_el % 4 != 0 || tile_y_el % 4 != 0) {
3245          ok = false;
3246       }
3247 #endif
3248 
3249       if (!ok) {
3250          free(surf);
3251          return NULL;
3252       }
3253    }
3254 
3255    surf->clear_color = res->aux.clear_color;
3256 
3257    struct pipe_surface *psurf = &surf->base;
3258    pipe_reference_init(&psurf->reference, 1);
3259    pipe_resource_reference(&psurf->texture, tex);
3260    psurf->context = ctx;
3261    psurf->format = tmpl->format;
3262    psurf->width = isl_surf.logical_level0_px.width;
3263    psurf->height = isl_surf.logical_level0_px.height;
3264    psurf->texture = tex;
3265    psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
3266    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
3267    psurf->u.tex.level = tmpl->u.tex.level;
3268 
3269    /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
3270    if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
3271                           ISL_SURF_USAGE_STENCIL_BIT))
3272       return psurf;
3273 
3274    /* Fill out a SURFACE_STATE for each possible auxiliary surface mode and
3275     * return the pipe_surface.
3276     */
3277    unsigned aux_usages = 0;
3278 
3279    if ((res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3280         res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3281        !isl_format_supports_ccs_e(devinfo, view->format)) {
3282       aux_usages = 1 << ISL_AUX_USAGE_NONE;
3283    } else {
3284       aux_usages = 1 << ISL_AUX_USAGE_NONE |
3285                    1 << res->aux.usage;
3286    }
3287 
3288    alloc_surface_states(&surf->surface_state, aux_usages);
3289    surf->surface_state.bo_address = res->bo->address;
3290    fill_surface_states(&screen->isl_dev, &surf->surface_state, res,
3291                        &isl_surf, view, offset_B, tile_x_el, tile_y_el);
3292 
3293 #if GFX_VER == 8
3294    alloc_surface_states(&surf->surface_state_read, aux_usages);
3295    surf->surface_state_read.bo_address = res->bo->address;
3296    fill_surface_states(&screen->isl_dev, &surf->surface_state_read, res,
3297                        &read_surf, read_view, read_surf_offset_B,
3298                        read_surf_tile_x_sa, read_surf_tile_y_sa);
3299 #endif
3300 
3301    return psurf;
3302 }
3303 
3304 #if GFX_VER < 9
3305 static void
3306 fill_default_image_param(struct isl_image_param *param)
3307 {
3308    memset(param, 0, sizeof(*param));
3309    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3310     * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3311     * detailed explanation of these parameters.
3312     */
3313    param->swizzling[0] = 0xff;
3314    param->swizzling[1] = 0xff;
3315 }
3316 
3317 static void
3318 fill_buffer_image_param(struct isl_image_param *param,
3319                         enum pipe_format pfmt,
3320                         unsigned size)
3321 {
3322    const unsigned cpp = util_format_get_blocksize(pfmt);
3323 
3324    fill_default_image_param(param);
3325    param->size[0] = size / cpp;
3326    param->stride[0] = cpp;
3327 }
3328 #else
3329 #define isl_surf_fill_image_param(x, ...)
3330 #define fill_default_image_param(x, ...)
3331 #define fill_buffer_image_param(x, ...)
3332 #endif
3333 
3334 /**
3335  * The pipe->set_shader_images() driver hook.
3336  */
3337 static void
3338 iris_set_shader_images(struct pipe_context *ctx,
3339                        enum pipe_shader_type p_stage,
3340                        unsigned start_slot, unsigned count,
3341                        unsigned unbind_num_trailing_slots,
3342                        const struct pipe_image_view *p_images)
3343 {
3344    struct iris_context *ice = (struct iris_context *) ctx;
3345    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3346    gl_shader_stage stage = stage_from_pipe(p_stage);
3347    struct iris_shader_state *shs = &ice->state.shaders[stage];
3348 #if GFX_VER == 8
3349    struct iris_genx_state *genx = ice->state.genx;
3350    struct isl_image_param *image_params = genx->shaders[stage].image_param;
3351 #endif
3352 
3353    shs->bound_image_views &=
3354       ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3355 
3356    for (unsigned i = 0; i < count; i++) {
3357       struct iris_image_view *iv = &shs->image[start_slot + i];
3358 
3359       if (p_images && p_images[i].resource) {
3360          const struct pipe_image_view *img = &p_images[i];
3361          struct iris_resource *res = (void *) img->resource;
3362 
3363          util_copy_image_view(&iv->base, img);
3364 
3365          shs->bound_image_views |= BITFIELD64_BIT(start_slot + i);
3366 
3367          res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3368          res->bind_stages |= 1 << stage;
3369 
3370          enum isl_format isl_fmt = iris_image_view_get_format(ice, img);
3371 
3372          unsigned aux_usages = 1 << ISL_AUX_USAGE_NONE;
3373 
3374          /* Gfx12+ supports render compression for images */
3375          if (GFX_VER >= 12 && isl_aux_usage_has_ccs_e(res->aux.usage))
3376             aux_usages |= 1 << ISL_AUX_USAGE_CCS_E;
3377 
3378          alloc_surface_states(&iv->surface_state, aux_usages);
3379          iv->surface_state.bo_address = res->bo->address;
3380 
3381          if (res->base.b.target != PIPE_BUFFER) {
3382             struct isl_view view = {
3383                .format = isl_fmt,
3384                .base_level = img->u.tex.level,
3385                .levels = 1,
3386                .base_array_layer = img->u.tex.first_layer,
3387                .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3388                .swizzle = ISL_SWIZZLE_IDENTITY,
3389                .usage = ISL_SURF_USAGE_STORAGE_BIT,
3390             };
3391 
3392             /* Untyped (ISL_FORMAT_RAW) fallback: use a raw buffer surface. */
3393             if (isl_fmt == ISL_FORMAT_RAW) {
3394                fill_buffer_surface_state(&screen->isl_dev, res,
3395                                          iv->surface_state.cpu,
3396                                          isl_fmt, ISL_SWIZZLE_IDENTITY,
3397                                          0, res->bo->size,
3398                                          ISL_SURF_USAGE_STORAGE_BIT);
3399             } else {
3400                fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3401                                    &res->surf, &view, 0, 0, 0);
3402             }
3403 
3404             isl_surf_fill_image_param(&screen->isl_dev,
3405                                       &image_params[start_slot + i],
3406                                       &res->surf, &view);
3407          } else if (img->access & PIPE_IMAGE_ACCESS_TEX2D_FROM_BUFFER) {
3408             /* This is a 2D image created from a buffer, so fill the
3409              * surface states using the image parameters provided by the
3410              * CL application.
3411              */
3412             isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3413             struct isl_view view = {
3414                .format = isl_fmt,
3415                .base_level = 0,
3416                .levels = 1,
3417                .base_array_layer = 0,
3418                .array_len = 1,
3419                .swizzle = ISL_SWIZZLE_IDENTITY,
3420                .usage = usage,
3421             };
3422 
3423             /* Create temp_surf and fill with values provided by CL application */
3424             struct isl_surf temp_surf;
3425             enum isl_format fmt = iris_image_view_get_format(ice, img);
3426             fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt,
3427                                             img->u.tex2d_from_buf.width,
3428                                             img->u.tex2d_from_buf.height,
3429                                             img->u.tex2d_from_buf.row_stride,
3430                                             usage,
3431                                             &temp_surf);
3432 
3433             fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3434                                 &temp_surf, &view, 0, 0, 0);
3435             isl_surf_fill_image_param(&screen->isl_dev,
3436                                       &image_params[start_slot + i],
3437                                       &temp_surf, &view);
3438          } else {
3439             util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3440                            img->u.buf.offset + img->u.buf.size);
3441 
3442             fill_buffer_surface_state(&screen->isl_dev, res,
3443                                       iv->surface_state.cpu,
3444                                       isl_fmt, ISL_SWIZZLE_IDENTITY,
3445                                       img->u.buf.offset, img->u.buf.size,
3446                                       ISL_SURF_USAGE_STORAGE_BIT);
3447             fill_buffer_image_param(&image_params[start_slot + i],
3448                                     img->format, img->u.buf.size);
3449          }
3450 
3451          upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
3452       } else {
3453          pipe_resource_reference(&iv->base.resource, NULL);
3454          pipe_resource_reference(&iv->surface_state.ref.res, NULL);
3455          fill_default_image_param(&image_params[start_slot + i]);
3456       }
3457    }
3458 
3459    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3460    ice->state.dirty |=
3461       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3462                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3463 
3464    /* Broadwell also needs isl_image_params re-uploaded */
3465    if (GFX_VER < 9) {
3466       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3467       shs->sysvals_need_upload = true;
3468    }
3469 
3470    if (unbind_num_trailing_slots) {
3471       iris_set_shader_images(ctx, p_stage, start_slot + count,
3472                              unbind_num_trailing_slots, 0, NULL);
3473    }
3474 }
3475 
3476 UNUSED static bool
3477 is_sampler_view_3d(const struct iris_sampler_view *view)
3478 {
3479    return view && view->res->base.b.target == PIPE_TEXTURE_3D;
3480 }
3481 
3482 /**
3483  * The pipe->set_sampler_views() driver hook.
3484  */
3485 static void
3486 iris_set_sampler_views(struct pipe_context *ctx,
3487                        enum pipe_shader_type p_stage,
3488                        unsigned start, unsigned count,
3489                        unsigned unbind_num_trailing_slots,
3490                        bool take_ownership,
3491                        struct pipe_sampler_view **views)
3492 {
3493    struct iris_context *ice = (struct iris_context *) ctx;
3494    UNUSED struct iris_screen *screen = (void *) ctx->screen;
3495    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
3496    gl_shader_stage stage = stage_from_pipe(p_stage);
3497    struct iris_shader_state *shs = &ice->state.shaders[stage];
3498    unsigned i;
3499 
3500    if (count == 0 && unbind_num_trailing_slots == 0)
3501       return;
3502 
3503    BITSET_CLEAR_RANGE(shs->bound_sampler_views, start,
3504                       start + count + unbind_num_trailing_slots - 1);
3505 
3506    for (i = 0; i < count; i++) {
3507       struct pipe_sampler_view *pview = views ? views[i] : NULL;
3508       struct iris_sampler_view *view = (void *) pview;
3509 
3510 #if GFX_VERx10 == 125
3511       if (intel_needs_workaround(screen->devinfo, 14014414195)) {
3512          if (is_sampler_view_3d(shs->textures[start + i]) !=
3513              is_sampler_view_3d(view))
3514             ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3515       }
3516 #endif
3517 
3518       if (take_ownership) {
3519          pipe_sampler_view_reference((struct pipe_sampler_view **)
3520                                      &shs->textures[start + i], NULL);
3521          shs->textures[start + i] = (struct iris_sampler_view *)pview;
3522       } else {
3523          pipe_sampler_view_reference((struct pipe_sampler_view **)
3524                                      &shs->textures[start + i], pview);
3525       }
3526       if (view) {
3527          view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3528          view->res->bind_stages |= 1 << stage;
3529 
3530          BITSET_SET(shs->bound_sampler_views, start + i);
3531 
3532          update_surface_state_addrs(ice->state.surface_uploader,
3533                                     &view->surface_state, view->res->bo);
3534       }
3535    }
3536    for (; i < count + unbind_num_trailing_slots; i++) {
3537       pipe_sampler_view_reference((struct pipe_sampler_view **)
3538                                   &shs->textures[start + i], NULL);
3539    }
3540 
3541    ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
3542    ice->state.dirty |=
3543       stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3544                                    : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3545 }
3546 
3547 static void
3548 iris_set_compute_resources(struct pipe_context *ctx,
3549                            unsigned start, unsigned count,
3550                            struct pipe_surface **resources)
3551 {
3552    assert(count == 0);
3553 }
3554 
3555 static void
3556 iris_set_global_binding(struct pipe_context *ctx,
3557                         unsigned start_slot, unsigned count,
3558                         struct pipe_resource **resources,
3559                         uint32_t **handles)
3560 {
3561    struct iris_context *ice = (struct iris_context *) ctx;
3562 
3563    assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
3564    for (unsigned i = 0; i < count; i++) {
3565       if (resources && resources[i]) {
3566          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3567                                  resources[i]);
3568 
3569          struct iris_resource *res = (void *) resources[i];
3570          assert(res->base.b.target == PIPE_BUFFER);
3571          util_range_add(&res->base.b, &res->valid_buffer_range,
3572                         0, res->base.b.width0);
3573 
3574          uint64_t addr = 0;
3575          memcpy(&addr, handles[i], sizeof(addr));
3576          addr += res->bo->address + res->offset;
3577          memcpy(handles[i], &addr, sizeof(addr));
3578       } else {
3579          pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3580                                  NULL);
3581       }
3582    }
3583 
3584    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
3585 }
3586 
3587 /**
3588  * The pipe->set_tess_state() driver hook.
3589  */
3590 static void
3591 iris_set_tess_state(struct pipe_context *ctx,
3592                     const float default_outer_level[4],
3593                     const float default_inner_level[2])
3594 {
3595    struct iris_context *ice = (struct iris_context *) ctx;
3596    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3597 
3598    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3599    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3600 
3601    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
3602    shs->sysvals_need_upload = true;
3603 }
3604 
3605 static void
3606 iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3607 {
3608    struct iris_context *ice = (struct iris_context *) ctx;
3609 
3610    ice->state.patch_vertices = patch_vertices;
3611 }
3612 
3613 static void
3614 iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3615 {
3616    struct iris_surface *surf = (void *) p_surf;
3617    pipe_resource_reference(&p_surf->texture, NULL);
3618    pipe_resource_reference(&surf->surface_state.ref.res, NULL);
3619    pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
3620    free(surf->surface_state.cpu);
3621    free(surf->surface_state_read.cpu);
3622    free(surf);
3623 }
3624 
3625 static void
3626 iris_set_clip_state(struct pipe_context *ctx,
3627                     const struct pipe_clip_state *state)
3628 {
3629    struct iris_context *ice = (struct iris_context *) ctx;
3630    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3631    struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3632    struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3633 
3634    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3635 
3636    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
3637                              IRIS_STAGE_DIRTY_CONSTANTS_GS |
3638                              IRIS_STAGE_DIRTY_CONSTANTS_TES;
3639    shs->sysvals_need_upload = true;
3640    gshs->sysvals_need_upload = true;
3641    tshs->sysvals_need_upload = true;
3642 }
3643 
3644 /**
3645  * The pipe->set_polygon_stipple() driver hook.
3646  */
3647 static void
3648 iris_set_polygon_stipple(struct pipe_context *ctx,
3649                          const struct pipe_poly_stipple *state)
3650 {
3651    struct iris_context *ice = (struct iris_context *) ctx;
3652    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3653    ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
3654 }
3655 
3656 /**
3657  * The pipe->set_sample_mask() driver hook.
3658  */
3659 static void
3660 iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3661 {
3662    struct iris_context *ice = (struct iris_context *) ctx;
3663 
3664    /* We only support 16x MSAA, so we have 16 bits of sample mask.
3665     * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3666     */
3667    ice->state.sample_mask = sample_mask & 0xffff;
3668    ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
3669 }
3670 
3671 /**
3672  * The pipe->set_scissor_states() driver hook.
3673  *
3674  * This corresponds to our SCISSOR_RECT state structures.  It's an
3675  * exact match, so we just store them, and memcpy them out later.
3676  */
3677 static void
3678 iris_set_scissor_states(struct pipe_context *ctx,
3679                         unsigned start_slot,
3680                         unsigned num_scissors,
3681                         const struct pipe_scissor_state *rects)
3682 {
3683    struct iris_context *ice = (struct iris_context *) ctx;
3684 
3685    for (unsigned i = 0; i < num_scissors; i++) {
3686       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3687          /* If the scissor was out of bounds and got clamped to 0 width/height
3688           * at the bounds, the subtraction of 1 from maximums could produce a
3689           * negative number and thus not clip anything.  Instead, just provide
3690           * a min > max scissor inside the bounds, which produces the expected
3691           * no rendering.
3692           */
3693          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3694             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3695          };
3696       } else {
3697          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3698             .minx = rects[i].minx,     .miny = rects[i].miny,
3699             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3700          };
3701       }
3702    }
3703 
3704    ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3705 }
3706 
3707 /**
3708  * The pipe->set_stencil_ref() driver hook.
3709  *
3710  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3711  */
3712 static void
3713 iris_set_stencil_ref(struct pipe_context *ctx,
3714                      const struct pipe_stencil_ref state)
3715 {
3716    struct iris_context *ice = (struct iris_context *) ctx;
3717    memcpy(&ice->state.stencil_ref, &state, sizeof(state));
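   /* The reference value lives in a different packet on each generation, so
    * flag whichever piece of state carries it.
    */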
3718    if (GFX_VER >= 12)
3719       ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3720    else if (GFX_VER >= 9)
3721       ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3722    else
3723       ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3724 }
3725 
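/**
 * Return one edge of the viewport along an axis: translate +/- scale.
 */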
3726 static float
3727 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3728 {
3729    return copysignf(state->scale[axis], sign) + state->translate[axis];
3730 }
3731 
3732 /**
3733  * The pipe->set_viewport_states() driver hook.
3734  *
3735  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3736  * the guardband yet, as we need the framebuffer dimensions, but we can
3737  * at least fill out the rest.
3738  */
3739 static void
3740 iris_set_viewport_states(struct pipe_context *ctx,
3741                          unsigned start_slot,
3742                          unsigned count,
3743                          const struct pipe_viewport_state *states)
3744 {
3745    struct iris_context *ice = (struct iris_context *) ctx;
3746    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3747 
3748    memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3749 
3750    /* Fix depth test misrenderings by lowering translated depth range */
3751    if (screen->driconf.lower_depth_range_rate != 1.0f)
3752       ice->state.viewports[start_slot].translate[2] *=
3753          screen->driconf.lower_depth_range_rate;
3754 
3755    ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3756 
3757    if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3758                                !ice->state.cso_rast->depth_clip_far))
3759       ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3760 }
3761 
3762 /**
3763  * The pipe->set_framebuffer_state() driver hook.
3764  *
3765  * Sets the current draw FBO, including color render targets, depth,
3766  * and stencil buffers.
3767  */
3768 static void
3769 iris_set_framebuffer_state(struct pipe_context *ctx,
3770                            const struct pipe_framebuffer_state *state)
3771 {
3772    struct iris_context *ice = (struct iris_context *) ctx;
3773    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3774    const struct intel_device_info *devinfo = screen->devinfo;
3775    struct isl_device *isl_dev = &screen->isl_dev;
3776    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3777    struct iris_resource *zres;
3778    struct iris_resource *stencil_res;
3779    struct iris_resource *new_res = NULL;
3780    struct pipe_box new_render_area;
3781 
3782    unsigned samples = util_framebuffer_get_num_samples(state);
3783    unsigned layers = util_framebuffer_get_num_layers(state);
3784 
3785    /* multiview not supported */
3786    assert(!state->viewmask);
3787 
3788    if (cso->samples != samples) {
3789       ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
3790 
3791       /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
3792       if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
3793          ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
3794 
3795       /* We may need to emit blend state for Wa_14018912822. */
3796       if ((cso->samples > 1) != (samples > 1) &&
3797           intel_needs_workaround(devinfo, 14018912822)) {
3798          ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3799          ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
3800       }
3801    }
3802 
3803    if (cso->nr_cbufs != state->nr_cbufs) {
3804       ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3805    }
3806 
3807    if ((cso->layers == 0) != (layers == 0)) {
3808       ice->state.dirty |= IRIS_DIRTY_CLIP;
3809    }
3810 
3811    if (state->nr_cbufs > 0 && state->cbufs[0])
3812       new_res = (struct iris_resource *)state->cbufs[0]->texture;
3813 
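   /* Restrict the render area to the damage region of the first color
    * attachment when one is set; otherwise use the full framebuffer size.
    */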
3814    if (new_res && new_res->use_damage) {
3815       new_render_area = new_res->damage;
3816    } else {
3817       new_render_area.x = 0;
3818       new_render_area.y = 0;
3819       new_render_area.z = 0;
3820       new_render_area.width = state->width;
3821       new_render_area.height = state->height;
3822       new_render_area.depth = 0;
3823    }
3824 
3825    if (memcmp(&ice->state.render_area, &new_render_area, sizeof(new_render_area))) {
3826       ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3827       ice->state.render_area = new_render_area;
3828    }
3829 
3830    if (cso->zsbuf || state->zsbuf) {
3831       ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
3832    }
3833 
3834    bool has_integer_rt = false;
3835    for (unsigned i = 0; i < state->nr_cbufs; i++) {
3836       if (state->cbufs[i]) {
3837          enum isl_format ifmt =
3838             isl_format_for_pipe_format(state->cbufs[i]->format);
3839          has_integer_rt |= isl_format_has_int_channel(ifmt);
3840       }
3841    }
3842 
3843    /* 3DSTATE_RASTER::AntialiasingEnable */
3844    if (has_integer_rt != ice->state.has_integer_rt ||
3845        cso->samples != samples) {
3846       ice->state.dirty |= IRIS_DIRTY_RASTER;
3847    }
3848 
3849    util_copy_framebuffer_state(cso, state);
3850    cso->samples = samples;
3851    cso->layers = layers;
3852 
3853    ice->state.has_integer_rt = has_integer_rt;
3854 
3855    struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
3856 
3857    struct isl_view view = {
3858       .base_level = 0,
3859       .levels = 1,
3860       .base_array_layer = 0,
3861       .array_len = 1,
3862       .swizzle = ISL_SWIZZLE_IDENTITY,
3863    };
3864 
3865    struct isl_depth_stencil_hiz_emit_info info = {
3866       .view = &view,
3867       .mocs = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_DEPTH_BIT),
3868    };
3869 
3870    if (cso->zsbuf) {
3871       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
3872                                        &stencil_res);
3873 
3874       view.base_level = cso->zsbuf->u.tex.level;
3875       view.base_array_layer = cso->zsbuf->u.tex.first_layer;
3876       view.array_len =
3877          cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
3878 
3879       if (zres) {
3880          view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
3881 
3882          info.depth_surf = &zres->surf;
3883          info.depth_address = zres->bo->address + zres->offset;
3884          info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);
3885 
3886          view.format = zres->surf.format;
3887 
3888          if (iris_resource_level_has_hiz(devinfo, zres, view.base_level)) {
3889             info.hiz_usage = zres->aux.usage;
3890             info.hiz_surf = &zres->aux.surf;
3891             info.hiz_address = zres->aux.bo->address + zres->aux.offset;
3892          }
3893 
3894          ice->state.hiz_usage = info.hiz_usage;
3895       }
3896 
3897       if (stencil_res) {
3898          view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
3899          info.stencil_aux_usage = stencil_res->aux.usage;
3900          info.stencil_surf = &stencil_res->surf;
3901          info.stencil_address = stencil_res->bo->address + stencil_res->offset;
3902          if (!zres) {
3903             view.format = stencil_res->surf.format;
3904             info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
3905          }
3906       }
3907    }
3908 
3909    isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
3910 
3911    /* Make a null surface for unbound buffers */
3912    void *null_surf_map =
3913       upload_state(ice->state.surface_uploader, &ice->state.null_fb,
3914                    4 * GENX(RENDER_SURFACE_STATE_length), 64);
3915    isl_null_fill_state(&screen->isl_dev, null_surf_map,
3916                        .size = isl_extent3d(MAX2(cso->width, 1),
3917                                             MAX2(cso->height, 1),
3918                                             cso->layers ? cso->layers : 1));
3919    ice->state.null_fb.offset +=
3920       iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));
3921 
3922    /* Render target change */
3923    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;
3924 
3925    ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
3926 
3927    ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3928 
3929    ice->state.stage_dirty |=
3930       ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
3931 
3932    if (GFX_VER == 8)
3933       ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
3934 }
3935 
3936 /**
3937  * The pipe->set_constant_buffer() driver hook.
3938  *
3939  * This uploads any constant data in user buffers, and references
3940  * any UBO resources containing constant data.
3941  */
3942 static void
3943 iris_set_constant_buffer(struct pipe_context *ctx,
3944                          enum pipe_shader_type p_stage, unsigned index,
3945                          bool take_ownership,
3946                          const struct pipe_constant_buffer *input)
3947 {
3948    struct iris_context *ice = (struct iris_context *) ctx;
3949    gl_shader_stage stage = stage_from_pipe(p_stage);
3950    struct iris_shader_state *shs = &ice->state.shaders[stage];
3951    struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3952 
3953    /* TODO: Only do this if the buffer changes? */
3954    pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3955 
3956    if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3957       shs->bound_cbufs |= 1u << index;
3958 
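      /* User-pointer constant data is streamed into a GPU buffer via the
       * const uploader; actual pipe_resource UBOs are referenced directly
       * below.
       */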
3959       if (input->user_buffer) {
3960          void *map = NULL;
3961          pipe_resource_reference(&cbuf->buffer, NULL);
3962          u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3963                         &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3964 
3965          if (!cbuf->buffer) {
3966             /* Allocation was unsuccessful - just unbind */
3967             iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3968             return;
3969          }
3970 
3971          assert(map);
3972          memcpy(map, input->user_buffer, input->buffer_size);
3973       } else if (input->buffer) {
3974          if (cbuf->buffer != input->buffer) {
3975             ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3976                                  IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3977             shs->dirty_cbufs |= 1u << index;
3978          }
3979 
3980          if (take_ownership) {
3981             pipe_resource_reference(&cbuf->buffer, NULL);
3982             cbuf->buffer = input->buffer;
3983          } else {
3984             pipe_resource_reference(&cbuf->buffer, input->buffer);
3985          }
3986 
3987          cbuf->buffer_offset = input->buffer_offset;
3988       }
3989 
3990       cbuf->buffer_size =
3991          MIN2(input->buffer_size,
3992               iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3993 
3994       struct iris_resource *res = (void *) cbuf->buffer;
3995       res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3996       res->bind_stages |= 1 << stage;
3997    } else {
3998       shs->bound_cbufs &= ~(1u << index);
3999       pipe_resource_reference(&cbuf->buffer, NULL);
4000    }
4001 
4002    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
4003 }
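
/* Illustrative usage (not from this file; names are hypothetical): a state
 * tracker binds a UBO for the fragment stage roughly like this:
 *
 *    struct pipe_constant_buffer cb = {
 *       .buffer = ubo_resource,
 *       .buffer_offset = 0,
 *       .buffer_size = 256,
 *    };
 *    ctx->set_constant_buffer(ctx, PIPE_SHADER_FRAGMENT, 1, false, &cb);
 *
 * Passing input == NULL (or a zero buffer_size) unbinds the slot, which is
 * handled by the else branch above.
 */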
4004 
4005 static void
4006 upload_sysvals(struct iris_context *ice,
4007                gl_shader_stage stage,
4008                const struct pipe_grid_info *grid)
4009 {
4010    UNUSED struct iris_genx_state *genx = ice->state.genx;
4011    struct iris_shader_state *shs = &ice->state.shaders[stage];
4012 
4013    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
4014    if (!shader || (shader->num_system_values == 0 &&
4015                    shader->kernel_input_size == 0))
4016       return;
4017 
4018    assert(shader->num_cbufs > 0);
4019 
4020    unsigned sysval_cbuf_index = shader->num_cbufs - 1;
4021    struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
4022    unsigned system_values_start =
4023       ALIGN(shader->kernel_input_size, sizeof(uint32_t));
4024    unsigned upload_size = system_values_start +
4025                           shader->num_system_values * sizeof(uint32_t);
4026    void *map = NULL;
4027 
4028    assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
4029    u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
4030                   &cbuf->buffer_offset, &cbuf->buffer, &map);
4031 
4032    if (shader->kernel_input_size > 0)
4033       memcpy(map, grid->input, shader->kernel_input_size);
4034 
4035    uint32_t *sysval_map = map + system_values_start;
4036    for (int i = 0; i < shader->num_system_values; i++) {
4037       uint32_t sysval = shader->system_values[i];
4038       uint32_t value = 0;
4039 
4040 #if GFX_VER >= 9
4041       #define COMPILER(x) BRW_##x
4042 #else
4043       #define COMPILER(x) ELK_##x
4044 #endif
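      /* With the conditional defines above, COMPILER(PARAM_BUILTIN_ZERO), for
       * example, expands to BRW_PARAM_BUILTIN_ZERO on Gfx9+ and to
       * ELK_PARAM_BUILTIN_ZERO on older generations.
       */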
4045 
4046       if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
4047 #if GFX_VER == 8
4048          unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
4049          unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
4050          struct isl_image_param *param =
4051             &genx->shaders[stage].image_param[img];
4052 
4053          assert(offset < sizeof(struct isl_image_param));
4054          value = ((uint32_t *) param)[offset];
4055 #endif
4056       } else if (sysval == COMPILER(PARAM_BUILTIN_ZERO)) {
4057          value = 0;
4058       } else if (COMPILER(PARAM_BUILTIN_IS_CLIP_PLANE(sysval))) {
4059          int plane = COMPILER(PARAM_BUILTIN_CLIP_PLANE_IDX(sysval));
4060          int comp  = COMPILER(PARAM_BUILTIN_CLIP_PLANE_COMP(sysval));
4061          value = fui(ice->state.clip_planes.ucp[plane][comp]);
4062       } else if (sysval == COMPILER(PARAM_BUILTIN_PATCH_VERTICES_IN)) {
4063          if (stage == MESA_SHADER_TESS_CTRL) {
4064             value = ice->state.vertices_per_patch;
4065          } else {
4066             assert(stage == MESA_SHADER_TESS_EVAL);
4067             const struct shader_info *tcs_info =
4068                iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
4069             if (tcs_info)
4070                value = tcs_info->tess.tcs_vertices_out;
4071             else
4072                value = ice->state.vertices_per_patch;
4073          }
4074       } else if (sysval >= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X) &&
4075                  sysval <= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_W)) {
4076          unsigned i = sysval - COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X);
4077          value = fui(ice->state.default_outer_level[i]);
4078       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_X)) {
4079          value = fui(ice->state.default_inner_level[0]);
4080       } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_Y)) {
4081          value = fui(ice->state.default_inner_level[1]);
4082       } else if (sysval >= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X) &&
4083                  sysval <= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_Z)) {
4084          unsigned i = sysval - COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X);
4085          value = ice->state.last_block[i];
4086       } else if (sysval == COMPILER(PARAM_BUILTIN_WORK_DIM)) {
4087          value = grid->work_dim;
4088       } else {
4089          assert(!"unhandled system value");
4090       }
4091 
4092       *sysval_map++ = value;
4093    }
4094 
4095    cbuf->buffer_size = upload_size;
4096    iris_upload_ubo_ssbo_surf_state(ice, cbuf,
4097                                    &shs->constbuf_surf_state[sysval_cbuf_index],
4098                                    ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
4099 
4100    shs->sysvals_need_upload = false;
4101 }
4102 
4103 /**
4104  * The pipe->set_shader_buffers() driver hook.
4105  *
4106  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
4107  * SURFACE_STATE here, as the buffer offset may change each time.
4108  */
4109 static void
4110 iris_set_shader_buffers(struct pipe_context *ctx,
4111                         enum pipe_shader_type p_stage,
4112                         unsigned start_slot, unsigned count,
4113                         const struct pipe_shader_buffer *buffers,
4114                         unsigned writable_bitmask)
4115 {
4116    struct iris_context *ice = (struct iris_context *) ctx;
4117    gl_shader_stage stage = stage_from_pipe(p_stage);
4118    struct iris_shader_state *shs = &ice->state.shaders[stage];
4119 
4120    unsigned modified_bits = u_bit_consecutive(start_slot, count);
4121 
4122    shs->bound_ssbos &= ~modified_bits;
4123    shs->writable_ssbos &= ~modified_bits;
4124    shs->writable_ssbos |= writable_bitmask << start_slot;
4125 
4126    for (unsigned i = 0; i < count; i++) {
4127       if (buffers && buffers[i].buffer) {
4128          struct iris_resource *res = (void *) buffers[i].buffer;
4129          struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
4130          struct iris_state_ref *surf_state =
4131             &shs->ssbo_surf_state[start_slot + i];
4132          pipe_resource_reference(&ssbo->buffer, &res->base.b);
4133          ssbo->buffer_offset = buffers[i].buffer_offset;
4134          ssbo->buffer_size =
4135             MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
4136 
4137          shs->bound_ssbos |= 1 << (start_slot + i);
4138 
4139          isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
4140 
4141          iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
4142 
4143          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
4144          res->bind_stages |= 1 << stage;
4145 
4146          util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
4147                         ssbo->buffer_offset + ssbo->buffer_size);
4148       } else {
4149          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
4150          pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
4151                                  NULL);
4152       }
4153    }
4154 
4155    ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
4156                         IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
4157    ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
4158 }
4159 
4160 static void
4161 iris_delete_state(struct pipe_context *ctx, void *state)
4162 {
4163    free(state);
4164 }
4165 
4166 /**
4167  * The pipe->set_vertex_buffers() driver hook.
4168  *
4169  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
4170  */
4171 static void
4172 iris_set_vertex_buffers(struct pipe_context *ctx,
4173                         unsigned count,
4174                         const struct pipe_vertex_buffer *buffers)
4175 {
4176    struct iris_context *ice = (struct iris_context *) ctx;
4177    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4178    struct iris_genx_state *genx = ice->state.genx;
4179 
4180    unsigned last_count = util_last_bit64(ice->state.bound_vertex_buffers);
4181    ice->state.bound_vertex_buffers = 0;
4182 
4183    for (unsigned i = 0; i < count; i++) {
4184       const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
4185       struct iris_vertex_buffer_state *state =
4186          &genx->vertex_buffers[i];
4187 
4188       if (!buffer) {
4189          pipe_resource_reference(&state->resource, NULL);
4190          continue;
4191       }
4192 
4193       /* We may see user buffers that are NULL bindings. */
4194       assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
4195 
4196       if (buffer->buffer.resource &&
4197           state->resource != buffer->buffer.resource)
4198          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
4199 
4200       pipe_resource_reference(&state->resource, NULL);
4201       state->resource = buffer->buffer.resource;
4202 
4203       struct iris_resource *res = (void *) state->resource;
4204 
4205       state->offset = (int) buffer->buffer_offset;
4206 
4207       if (res) {
4208          ice->state.bound_vertex_buffers |= 1ull << i;
4209          res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4210       }
4211 
4212       iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
4213          vb.VertexBufferIndex = i;
4214          vb.AddressModifyEnable = true;
4215          /* vb.BufferPitch is merged in dynamically from VE state later */
4216          if (res) {
4217             vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
4218             vb.BufferStartingAddress =
4219                ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
4220             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4221                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4222 #if GFX_VER >= 12
4223             vb.L3BypassDisable       = true;
4224 #endif
4225          } else {
4226             vb.NullVertexBuffer = true;
4227             vb.MOCS = iris_mocs(NULL, &screen->isl_dev,
4228                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4229          }
4230       }
4231    }
4232 
4233    for (unsigned i = count; i < last_count; i++) {
4234       struct iris_vertex_buffer_state *state =
4235          &genx->vertex_buffers[i];
4236 
4237       pipe_resource_reference(&state->resource, NULL);
4238    }
4239 
4240    ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4241 }
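
/* Illustrative usage (not from this file; names are hypothetical): a state
 * tracker binds a single vertex buffer roughly like this:
 *
 *    struct pipe_vertex_buffer vb = {
 *       .buffer_offset = 0,
 *       .is_user_buffer = false,
 *       .buffer.resource = vbo_resource,
 *    };
 *    ctx->set_vertex_buffers(ctx, 1, &vb);
 *
 * Note that the loop above adopts the incoming resource reference rather
 * than adding a new one, and any previously bound slots at index >= count
 * are unreferenced by the trailing loop.
 */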
4242 
4243 /**
4244  * Gallium CSO for vertex elements.
4245  */
4246 struct iris_vertex_element_state {
4247    uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
4248    uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
4249    uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
4250    uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
4251    uint32_t stride[PIPE_MAX_ATTRIBS];
4252    unsigned vb_count;
4253    unsigned count;
4254 };
4255 
4256 /**
4257  * The pipe->create_vertex_elements_state() driver hook.
4258  *
4259  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
4260  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
4261  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
4262  * needed. In these cases we will need information available at draw time.
4263  * We setup edgeflag_ve and edgeflag_vfi as alternatives last
4264  * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
4265  * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
4266  */
4267 static void *
4268 iris_create_vertex_elements(struct pipe_context *ctx,
4269                             unsigned count,
4270                             const struct pipe_vertex_element *state)
4271 {
4272    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4273    const struct intel_device_info *devinfo = screen->devinfo;
4274    struct iris_vertex_element_state *cso =
4275       calloc(1, sizeof(struct iris_vertex_element_state));
4276 
4277    cso->count = count;
4278    cso->vb_count = 0;
4279 
4280    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
4281       ve.DWordLength =
4282          1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
4283    }
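   /* Illustrative arithmetic (assuming VERTEX_ELEMENT_STATE is 2 DWords per
    * element): with count == 3, the bias above yields 1 + 2 * 3 - 2 = 5.
    */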
4284 
4285    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
4286    uint32_t *vfi_pack_dest = cso->vf_instancing;
4287 
4288    if (count == 0) {
4289       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4290          ve.Valid = true;
4291          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
4292          ve.Component0Control = VFCOMP_STORE_0;
4293          ve.Component1Control = VFCOMP_STORE_0;
4294          ve.Component2Control = VFCOMP_STORE_0;
4295          ve.Component3Control = VFCOMP_STORE_1_FP;
4296       }
4297 
4298       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4299       }
4300    }
4301 
4302    for (int i = 0; i < count; i++) {
4303       const struct iris_format_info fmt =
4304          iris_format_for_usage(devinfo, state[i].src_format, 0);
4305       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
4306                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
4307 
4308       switch (isl_format_get_num_channels(fmt.fmt)) {
4309       case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
4310       case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
4311       case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
4312       case 3:
4313          comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
4314                                                        : VFCOMP_STORE_1_FP;
4315          break;
4316       }
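      /* Illustrative example: a two-channel format such as R32G32_FLOAT takes
       * X and Y from the buffer, while the switch above fills Z with 0
       * (VFCOMP_STORE_0) and W with 1.0 (VFCOMP_STORE_1_FP).
       */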
4317       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4318          ve.EdgeFlagEnable = false;
4319          ve.VertexBufferIndex = state[i].vertex_buffer_index;
4320          ve.Valid = true;
4321          ve.SourceElementOffset = state[i].src_offset;
4322          ve.SourceElementFormat = fmt.fmt;
4323          ve.Component0Control = comp[0];
4324          ve.Component1Control = comp[1];
4325          ve.Component2Control = comp[2];
4326          ve.Component3Control = comp[3];
4327       }
4328 
4329       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4330          vi.VertexElementIndex = i;
4331          vi.InstancingEnable = state[i].instance_divisor > 0;
4332          vi.InstanceDataStepRate = state[i].instance_divisor;
4333       }
4334 
4335       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
4336       vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
4337       cso->stride[state[i].vertex_buffer_index] = state[i].src_stride;
4338       cso->vb_count = MAX2(state[i].vertex_buffer_index + 1, cso->vb_count);
4339    }
4340 
4341    /* An alternative version of the last VE and VFI is stored so it
4342     * can be used at draw time in case the Vertex Shader uses EdgeFlag.
4343     */
4344    if (count) {
4345       const unsigned edgeflag_index = count - 1;
4346       const struct iris_format_info fmt =
4347          iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
4348       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
4349          ve.EdgeFlagEnable = true;
4350          ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
4351          ve.Valid = true;
4352          ve.SourceElementOffset = state[edgeflag_index].src_offset;
4353          ve.SourceElementFormat = fmt.fmt;
4354          ve.Component0Control = VFCOMP_STORE_SRC;
4355          ve.Component1Control = VFCOMP_STORE_0;
4356          ve.Component2Control = VFCOMP_STORE_0;
4357          ve.Component3Control = VFCOMP_STORE_0;
4358       }
4359       iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
4360          /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
4361           * at draw time, as it should change if SGVs are emitted.
4362           */
4363          vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
4364          vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
4365       }
4366    }
4367 
4368    return cso;
4369 }
4370 
4371 /**
4372  * The pipe->bind_vertex_elements_state() driver hook.
4373  */
4374 static void
4375 iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
4376 {
4377    struct iris_context *ice = (struct iris_context *) ctx;
4378    struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
4379    struct iris_vertex_element_state *new_cso = state;
4380 
4381    /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
4382     * we need to re-emit it to ensure we're overriding the right one.
4383     */
4384    if (new_cso && cso_changed(count))
4385       ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
4386 
4387    ice->state.cso_vertex_elements = state;
4388    ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
4389    if (new_cso) {
4390       /* re-emit vertex buffer state if stride changes */
4391       if (cso_changed(vb_count) ||
4392           cso_changed_memcmp_elts(stride, new_cso->vb_count))
4393          ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4394    }
4395 }
4396 
4397 /**
4398  * The pipe->create_stream_output_target() driver hook.
4399  *
4400  * "Target" here refers to a destination buffer.  We translate this into
4401  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4402  * know which buffer this represents, or whether we ought to zero the
4403  * write-offsets, or append.  Those are handled in the set() hook.
4404  */
4405 static struct pipe_stream_output_target *
4406 iris_create_stream_output_target(struct pipe_context *ctx,
4407                                  struct pipe_resource *p_res,
4408                                  unsigned buffer_offset,
4409                                  unsigned buffer_size)
4410 {
4411    struct iris_resource *res = (void *) p_res;
4412    struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
4413    if (!cso)
4414       return NULL;
4415 
4416    res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4417 
4418    pipe_reference_init(&cso->base.reference, 1);
4419    pipe_resource_reference(&cso->base.buffer, p_res);
4420    cso->base.buffer_offset = buffer_offset;
4421    cso->base.buffer_size = buffer_size;
4422    cso->base.context = ctx;
4423 
4424    util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4425                   buffer_offset + buffer_size);
4426 
4427    return &cso->base;
4428 }
4429 
4430 static void
4431 iris_stream_output_target_destroy(struct pipe_context *ctx,
4432                                   struct pipe_stream_output_target *state)
4433 {
4434    struct iris_stream_output_target *cso = (void *) state;
4435 
4436    pipe_resource_reference(&cso->base.buffer, NULL);
4437    pipe_resource_reference(&cso->offset.res, NULL);
4438 
4439    free(cso);
4440 }
4441 
4442 /**
4443  * The pipe->set_stream_output_targets() driver hook.
4444  *
4445  * At this point, we know which targets are bound to a particular index,
4446  * and also whether we want to append or start over.  We can finish the
4447  * 3DSTATE_SO_BUFFER packets we started earlier.
4448  */
4449 static void
4450 iris_set_stream_output_targets(struct pipe_context *ctx,
4451                                unsigned num_targets,
4452                                struct pipe_stream_output_target **targets,
4453                                const unsigned *offsets,
4454                                enum mesa_prim output_prim)
4455 {
4456    struct iris_context *ice = (struct iris_context *) ctx;
4457    struct iris_genx_state *genx = ice->state.genx;
4458    uint32_t *so_buffers = genx->so_buffers;
4459    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4460 
4461    const bool active = num_targets > 0;
4462    if (ice->state.streamout_active != active) {
4463       ice->state.streamout_active = active;
4464       ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
4465 
4466       /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4467        * it's a non-pipelined command.  If we're switching streamout on, we
4468        * may have missed emitting it earlier, so do so now.  (We're already
4469        * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4470        */
4471       if (active) {
4472          ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
4473       } else {
4474          for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4475             struct iris_stream_output_target *tgt =
4476                (void *) ice->state.so_target[i];
4477 
4478             if (tgt)
4479                iris_dirty_for_history(ice, (void *)tgt->base.buffer);
4480          }
4481       }
4482    }
4483 
4484    for (int i = 0; i < 4; i++) {
4485       pipe_so_target_reference(&ice->state.so_target[i],
4486                                i < num_targets ? targets[i] : NULL);
4487    }
4488 
4489    /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4490    if (!active)
4491       return;
4492 
4493    for (unsigned i = 0; i < 4; i++,
4494         so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
4495 
4496       struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
4497       unsigned offset = offsets[i];
4498 
4499       if (!tgt) {
4500          iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4501 #if GFX_VER < 12
4502             sob.SOBufferIndex = i;
4503 #else
4504             sob._3DCommandOpcode = 0;
4505             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4506 #endif
4507             sob.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
4508          }
4509          continue;
4510       }
4511 
4512       if (!tgt->offset.res)
4513          upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
4514 
4515       struct iris_resource *res = (void *) tgt->base.buffer;
4516 
4517       /* Note that offsets[i] will either be 0, causing us to zero
4518        * the value in the buffer, or 0xFFFFFFFF, which happens to mean
4519        * "continue appending at the existing offset."
4520        */
4521       assert(offset == 0 || offset == 0xFFFFFFFF);
4522 
4523       /* When we're first called with an offset of 0, we want the next
4524        * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
4525        * Any further times we emit those packets, we want to use 0xFFFFFFFF
4526        * to continue appending from the current offset.
4527        *
4528        * Note that we might be called by Begin (offset = 0), Pause, then
4529        * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
4530        * commands will actually be sent to the GPU).  In this case, we
4531        * don't want to append - we still want to do our initial zeroing.
4532        */
4533       if (offset == 0)
4534          tgt->zero_offset = true;
4535 
4536       iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4537 #if GFX_VER < 12
4538          sob.SOBufferIndex = i;
4539 #else
4540          sob._3DCommandOpcode = 0;
4541          sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4542 #endif
4543          sob.SurfaceBaseAddress =
4544             rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
4545                   IRIS_DOMAIN_OTHER_WRITE);
4546          sob.SOBufferEnable = true;
4547          sob.StreamOffsetWriteEnable = true;
4548          sob.StreamOutputBufferOffsetAddressEnable = true;
4549          sob.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4550                               ISL_SURF_USAGE_STREAM_OUT_BIT);
4551 
4552          sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
4553          sob.StreamOutputBufferOffsetAddress =
4554             rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
4555                         tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
4556          sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
4557       }
4558    }
4559 
4560    ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
4561 }
4562 
4563 /**
4564  * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4565  * 3DSTATE_STREAMOUT packets.
4566  *
4567  * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4568  * hardware to record.  We can create it entirely based on the shader, with
4569  * no dynamic state dependencies.
4570  *
4571  * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4572  * state-based settings.  We capture the shader-related ones here, and merge
4573  * the rest in at draw time.
4574  */
4575 static uint32_t *
4576 iris_create_so_decl_list(const struct pipe_stream_output_info *info,
4577                          const struct intel_vue_map *vue_map)
4578 {
4579    struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4580    int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4581    int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4582    int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4583    int max_decls = 0;
4584    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4585 
4586    memset(so_decl, 0, sizeof(so_decl));
4587 
4588    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
4589     * command feels strange -- each dword pair contains a SO_DECL per stream.
4590     */
4591    for (unsigned i = 0; i < info->num_outputs; i++) {
4592       const struct pipe_stream_output *output = &info->output[i];
4593       const int buffer = output->output_buffer;
4594       const int varying = output->register_index;
4595       const unsigned stream_id = output->stream;
4596       assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4597 
4598       buffer_mask[stream_id] |= 1 << buffer;
4599 
4600       assert(vue_map->varying_to_slot[varying] >= 0);
4601 
4602       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4603        * array.  Instead, it simply increments DstOffset for the following
4604        * input by the number of components that should be skipped.
4605        *
4606        * Our hardware is unusual in that it requires us to program SO_DECLs
4607        * for fake "hole" components, rather than simply taking the offset
4608        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
4609        * program as many size = 4 holes as we can, then a final hole to
4610        * accommodate the final 1, 2, or 3 remaining.
4611        */
4612       int skip_components = output->dst_offset - next_offset[buffer];
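      /* Illustrative (hypothetical) values: if the previous output ended at
       * DWord 2 and this one starts at DWord 8, skip_components is 6, so the
       * loop below emits one hole SO_DECL with ComponentMask 0xf (4 skipped
       * components) followed by one with ComponentMask 0x3 (the remaining 2).
       */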
4613 
4614       while (skip_components > 0) {
4615          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4616             .HoleFlag = 1,
4617             .OutputBufferSlot = output->output_buffer,
4618             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4619          };
4620          skip_components -= 4;
4621       }
4622 
4623       next_offset[buffer] = output->dst_offset + output->num_components;
4624 
4625       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4626          .OutputBufferSlot = output->output_buffer,
4627          .RegisterIndex = vue_map->varying_to_slot[varying],
4628          .ComponentMask =
4629             ((1 << output->num_components) - 1) << output->start_component,
4630       };
4631 
4632       if (decls[stream_id] > max_decls)
4633          max_decls = decls[stream_id];
4634    }
4635 
4636    unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4637    uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4638    uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
4639 
4640    iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4641       int urb_entry_read_offset = 0;
4642       int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4643          urb_entry_read_offset;
4644 
4645       /* We always read the whole vertex.  This could be reduced at some
4646        * point by reading less and offsetting the register index in the
4647        * SO_DECLs.
4648        */
4649       sol.Stream0VertexReadOffset = urb_entry_read_offset;
4650       sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4651       sol.Stream1VertexReadOffset = urb_entry_read_offset;
4652       sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4653       sol.Stream2VertexReadOffset = urb_entry_read_offset;
4654       sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4655       sol.Stream3VertexReadOffset = urb_entry_read_offset;
4656       sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4657 
4658       /* Set buffer pitches; 0 means unbound. */
4659       sol.Buffer0SurfacePitch = 4 * info->stride[0];
4660       sol.Buffer1SurfacePitch = 4 * info->stride[1];
4661       sol.Buffer2SurfacePitch = 4 * info->stride[2];
4662       sol.Buffer3SurfacePitch = 4 * info->stride[3];
4663    }
4664 
4665    iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4666       list.DWordLength = 3 + 2 * max_decls - 2;
4667       list.StreamtoBufferSelects0 = buffer_mask[0];
4668       list.StreamtoBufferSelects1 = buffer_mask[1];
4669       list.StreamtoBufferSelects2 = buffer_mask[2];
4670       list.StreamtoBufferSelects3 = buffer_mask[3];
4671       list.NumEntries0 = decls[0];
4672       list.NumEntries1 = decls[1];
4673       list.NumEntries2 = decls[2];
4674       list.NumEntries3 = decls[3];
4675    }
4676 
4677    for (int i = 0; i < max_decls; i++) {
4678       iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4679          entry.Stream0Decl = so_decl[0][i];
4680          entry.Stream1Decl = so_decl[1][i];
4681          entry.Stream2Decl = so_decl[2][i];
4682          entry.Stream3Decl = so_decl[3][i];
4683       }
4684    }
4685 
4686    return map;
4687 }
4688 
4689 static inline int
4690 iris_compute_first_urb_slot_required(uint64_t inputs_read,
4691                                      const struct intel_vue_map *prev_stage_vue_map)
4692 {
4693 #if GFX_VER >= 9
4694    return brw_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4695 #else
4696    return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4697 #endif
4698 }
4699 
4700 static void
4701 iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
4702                                    const struct intel_vue_map *last_vue_map,
4703                                    bool two_sided_color,
4704                                    unsigned *out_offset,
4705                                    unsigned *out_length)
4706 {
4707    /* The compiler computes the first URB slot without considering COL/BFC
4708     * swizzling (because it doesn't know whether it's enabled), so we need
4709     * to do that here too.  This may result in a smaller offset, which
4710     * should be safe.
4711     */
4712    const unsigned first_slot =
4713       iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
4714 
4715    /* This becomes the URB read offset (counted in pairs of slots). */
4716    assert(first_slot % 2 == 0);
4717    *out_offset = first_slot / 2;
4718 
4719    /* We need to adjust the inputs read to account for front/back color
4720     * swizzling, as it can make the URB length longer.
4721     */
4722    for (int c = 0; c <= 1; c++) {
4723       if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
4724          /* If two sided color is enabled, the fragment shader's gl_Color
4725           * (COL0) input comes from either the gl_FrontColor (COL0) or
4726           * gl_BackColor (BFC0) input varyings.  Mark BFC as used, too.
4727           */
4728          if (two_sided_color)
4729             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4730 
4731          /* If front color isn't written, we opt to give them back color
4732           * instead of an undefined value.  Switch from COL to BFC.
4733           */
4734          if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
4735             fs_input_slots &= ~(VARYING_BIT_COL0 << c);
4736             fs_input_slots |= (VARYING_BIT_BFC0 << c);
4737          }
4738       }
4739    }
4740 
4741    /* Compute the minimum URB Read Length necessary for the FS inputs.
4742     *
4743     * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4744     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4745     *
4746     * "This field should be set to the minimum length required to read the
4747     *  maximum source attribute.  The maximum source attribute is indicated
4748     *  by the maximum value of the enabled Attribute # Source Attribute if
4749     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4750     *  enable is not set.
4751     *  read_length = ceiling((max_source_attr + 1) / 2)
4752     *
4753     *  [errata] Corruption/Hang possible if length programmed larger than
4754     *  recommended"
4755     *
4756     * Similar text exists for Ivy Bridge.
4757     *
4758     * We find the last URB slot that's actually read by the FS.
4759     */
4760    unsigned last_read_slot = last_vue_map->num_slots - 1;
4761    while (last_read_slot > first_slot && !(fs_input_slots &
4762           (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4763       --last_read_slot;
4764 
4765    /* The URB read length is the difference of the two, counted in pairs. */
4766    *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
4767 }
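
/* Illustrative (hypothetical) numbers for the function above: if the first
 * FS-read slot is 4 and the last one read is 9, the URB read offset is
 * 4 / 2 = 2 (counted in pairs of slots) and the read length is
 * DIV_ROUND_UP(9 - 4 + 1, 2) = 3 pairs.
 */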
4768 
4769 static void
4770 iris_emit_sbe_swiz(struct iris_batch *batch,
4771                    const struct iris_context *ice,
4772                    const struct intel_vue_map *vue_map,
4773                    unsigned urb_read_offset,
4774                    unsigned sprite_coord_enables)
4775 {
4776    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4777    const struct iris_fs_data *fs_data =
4778       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4779    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4780 
4781    /* XXX: this should be generated when putting programs in place */
4782 
4783    for (uint8_t idx = 0; idx < fs_data->urb_setup_attribs_count; idx++) {
4784       const uint8_t fs_attr = fs_data->urb_setup_attribs[idx];
4785       const int input_index = fs_data->urb_setup[fs_attr];
4786       if (input_index < 0 || input_index >= 16)
4787          continue;
4788 
4789       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4790          &attr_overrides[input_index];
4791       int slot = vue_map->varying_to_slot[fs_attr];
4792 
4793       /* Viewport and Layer are stored in the VUE header.  We need to override
4794        * them to zero if earlier stages didn't write them, as GL requires that
4795        * they read back as zero when not explicitly set.
4796        */
4797       switch (fs_attr) {
4798       case VARYING_SLOT_VIEWPORT:
4799       case VARYING_SLOT_LAYER:
4800          attr->ComponentOverrideX = true;
4801          attr->ComponentOverrideW = true;
4802          attr->ConstantSource = CONST_0000;
4803 
4804          if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4805             attr->ComponentOverrideY = true;
4806          if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4807             attr->ComponentOverrideZ = true;
4808          continue;
4809 
4810       default:
4811          break;
4812       }
4813 
4814       if (sprite_coord_enables & (1 << input_index))
4815          continue;
4816 
4817       /* If there was only a back color written but not front, use back
4818        * as the color instead of undefined.
4819        */
4820       if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4821          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4822       if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4823          slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4824 
4825       /* Not written by the previous stage - undefined. */
4826       if (slot == -1) {
4827          attr->ComponentOverrideX = true;
4828          attr->ComponentOverrideY = true;
4829          attr->ComponentOverrideZ = true;
4830          attr->ComponentOverrideW = true;
4831          attr->ConstantSource = CONST_0001_FLOAT;
4832          continue;
4833       }
4834 
4835       /* Compute the location of the attribute relative to the read offset,
4836        * which is counted in 256-bit increments (two 128-bit VUE slots).
4837        */
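      /* Illustrative (hypothetical) numbers: with urb_read_offset = 2, a
       * varying living in VUE slot 6 yields source_attr = 6 - 2 * 2 = 2.
       */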
4838       const int source_attr = slot - 2 * urb_read_offset;
4839       assert(source_attr >= 0 && source_attr <= 32);
4840       attr->SourceAttribute = source_attr;
4841 
4842       /* If we are doing two-sided color, and the VUE slot following this one
4843        * represents a back-facing color, then we need to instruct the SF unit
4844        * to do back-facing swizzling.
4845        */
4846       if (cso_rast->light_twoside &&
4847           ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4848             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4849            (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4850             vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4851          attr->SwizzleSelect = INPUTATTR_FACING;
4852    }
4853 
4854    iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4855       for (int i = 0; i < 16; i++)
4856          sbes.Attribute[i] = attr_overrides[i];
4857    }
4858 }
4859 
4860 static bool
4861 iris_is_drawing_points(const struct iris_context *ice)
4862 {
4863    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4864 
4865    if (cso_rast->fill_mode_point) {
4866       return true;
4867    }
4868 
4869    if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4870       const struct iris_gs_data *gs_data =
4871          iris_gs_data(ice->shaders.prog[MESA_SHADER_GEOMETRY]);
4872       return gs_data->output_topology == _3DPRIM_POINTLIST;
4873    } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4874       const struct iris_tes_data *tes_data =
4875          iris_tes_data(ice->shaders.prog[MESA_SHADER_TESS_EVAL]);
4876       return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4877    } else {
4878       return ice->state.prim_mode == MESA_PRIM_POINTS;
4879    }
4880 }
4881 
4882 static unsigned
4883 iris_calculate_point_sprite_overrides(const struct iris_fs_data *fs_data,
4884                                       const struct iris_rasterizer_state *cso)
4885 {
4886    unsigned overrides = 0;
4887 
4888    if (fs_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4889       overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_PNTC];
4890 
4891    for (int i = 0; i < 8; i++) {
4892       if ((cso->sprite_coord_enable & (1 << i)) &&
4893           fs_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4894          overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_TEX0 + i];
4895    }
4896 
4897    return overrides;
4898 }
4899 
4900 static void
4901 iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4902 {
4903    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4904    const struct iris_fs_data *fs_data =
4905       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4906    const struct intel_vue_map *last_vue_map =
4907       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
4908 
4909    unsigned urb_read_offset, urb_read_length;
4910    iris_compute_sbe_urb_read_interval(fs_data->inputs,
4911                                       last_vue_map,
4912                                       cso_rast->light_twoside,
4913                                       &urb_read_offset, &urb_read_length);
4914 
4915    unsigned sprite_coord_overrides =
4916       iris_is_drawing_points(ice) ?
4917       iris_calculate_point_sprite_overrides(fs_data, cso_rast) : 0;
4918 
4919    iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4920       sbe.AttributeSwizzleEnable = true;
4921       sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
4922       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4923       sbe.VertexURBEntryReadOffset = urb_read_offset;
4924       sbe.VertexURBEntryReadLength = urb_read_length;
4925       sbe.ForceVertexURBEntryReadOffset = true;
4926       sbe.ForceVertexURBEntryReadLength = true;
4927       sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
4928       sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4929 #if GFX_VER >= 9
4930       for (int i = 0; i < 32; i++) {
4931          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4932       }
4933 #endif
4934 
4935       /* Ask the hardware to supply PrimitiveID if the fragment shader
4936        * reads it but a previous stage didn't write one.
4937        */
4938       if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
4939           last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
4940          sbe.PrimitiveIDOverrideAttributeSelect =
4941             fs_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
4942          sbe.PrimitiveIDOverrideComponentX = true;
4943          sbe.PrimitiveIDOverrideComponentY = true;
4944          sbe.PrimitiveIDOverrideComponentZ = true;
4945          sbe.PrimitiveIDOverrideComponentW = true;
4946       }
4947    }
4948 
4949    iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4950                       sprite_coord_overrides);
4951 }
4952 
4953 /* ------------------------------------------------------------------- */
4954 
4955 /**
4956  * Populate VS program key fields based on the current state.
4957  */
4958 static void
4959 iris_populate_vs_key(const struct iris_context *ice,
4960                      const struct shader_info *info,
4961                      gl_shader_stage last_stage,
4962                      struct iris_vs_prog_key *key)
4963 {
4964    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4965 
4966    if (info->clip_distance_array_size == 0 &&
4967        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4968        last_stage == MESA_SHADER_VERTEX)
4969       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4970 }
4971 
4972 /**
4973  * Populate TCS program key fields based on the current state.
4974  */
4975 static void
4976 iris_populate_tcs_key(const struct iris_context *ice,
4977                       struct iris_tcs_prog_key *key)
4978 {
4979 }
4980 
4981 /**
4982  * Populate TES program key fields based on the current state.
4983  */
4984 static void
4985 iris_populate_tes_key(const struct iris_context *ice,
4986                       const struct shader_info *info,
4987                       gl_shader_stage last_stage,
4988                       struct iris_tes_prog_key *key)
4989 {
4990    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4991 
4992    if (info->clip_distance_array_size == 0 &&
4993        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4994        last_stage == MESA_SHADER_TESS_EVAL)
4995       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4996 }
4997 
4998 /**
4999  * Populate GS program key fields based on the current state.
5000  */
5001 static void
5002 iris_populate_gs_key(const struct iris_context *ice,
5003                      const struct shader_info *info,
5004                      gl_shader_stage last_stage,
5005                      struct iris_gs_prog_key *key)
5006 {
5007    const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
5008 
5009    if (info->clip_distance_array_size == 0 &&
5010        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
5011        last_stage == MESA_SHADER_GEOMETRY)
5012       key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
5013 }
5014 
5015 /**
5016  * Populate FS program key fields based on the current state.
5017  */
5018 static void
5019 iris_populate_fs_key(const struct iris_context *ice,
5020                      const struct shader_info *info,
5021                      struct iris_fs_prog_key *key)
5022 {
5023    struct iris_screen *screen = (void *) ice->ctx.screen;
5024    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
5025    const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
5026    const struct iris_rasterizer_state *rast = ice->state.cso_rast;
5027    const struct iris_blend_state *blend = ice->state.cso_blend;
5028 
5029    key->nr_color_regions = fb->nr_cbufs;
5030 
5031    key->clamp_fragment_color = rast->clamp_fragment_color;
5032 
5033    key->alpha_to_coverage = blend->alpha_to_coverage;
5034 
5035    key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
5036 
5037    key->flat_shade = rast->flatshade &&
5038       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
5039 
5040    key->persample_interp = rast->force_persample_interp;
5041    key->multisample_fbo = rast->multisample && fb->samples > 1;
5042 
5043    key->coherent_fb_fetch = GFX_VER >= 9 && GFX_VER < 20;
5044 
5045    key->force_dual_color_blend =
5046       screen->driconf.dual_color_blend_by_location &&
5047       (blend->blend_enables & 1) && blend->dual_color_blending;
5048 }
5049 
5050 static void
5051 iris_populate_cs_key(const struct iris_context *ice,
5052                      struct iris_cs_prog_key *key)
5053 {
5054 }
5055 
5056 static inline uint32_t
5057 encode_sampler_count(const struct iris_compiled_shader *shader)
5058 {
5059    /* We can potentially have way more than 32 samplers and that's ok.
5060     * However, the 3DSTATE_XS packets only have 3 bits to specify how
5061     * many to pre-fetch and all values above 4 are marked reserved.
5062     */
5063    uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
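   /* Illustrative (hypothetical) example: if samplers 0..4 are in use,
    * util_last_bit64() above returns 5 and DIV_ROUND_UP(5, 4) = 2, asking the
    * hardware to pre-fetch sampler state in groups of four.
    */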
5064    return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
5065 }
5066 
5067 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
5068    pkt.KernelStartPointer = KSP(shader);                                  \
5069    pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
5070    pkt.SamplerCount = encode_sampler_count(shader);                       \
5071    pkt.FloatingPointMode = shader->use_alt_mode;                          \
5072                                                                           \
5073    pkt.DispatchGRFStartRegisterForURBData =                               \
5074       shader->dispatch_grf_start_reg;                                     \
5075    pkt.prefix##URBEntryReadLength = vue_data->urb_read_length;            \
5076    pkt.prefix##URBEntryReadOffset = 0;                                    \
5077                                                                           \
5078    pkt.StatisticsEnable = true;                                           \
5079    pkt.Enable           = true;                                           \
5080                                                                           \
5081    if (shader->total_scratch) {                                           \
5082       INIT_THREAD_SCRATCH_SIZE(pkt)                                       \
5083    }
5084 
5085 /* Note that on Gfx12HP we pass a scratch space surface state offset
5086  * shifted by 2 relative to the value specified on the BSpec, since
5087  * that allows the compiler to save a shift instruction while
5088  * constructing the extended descriptor for SS addressing.  That
5089  * worked because we limit the scratch surface state pool to 8 MB and
5090  * because we relied on the legacy (ExBSO=0) encoding of the extended
5091  * descriptor in order to save the shift, which is no longer supported
5092  * for the UGM shared function on Xe2 platforms, so we no longer
5093  * attempt to do that trick.
5094  */
5095 #define SCRATCH_SPACE_BUFFER_SHIFT (GFX_VER >= 20 ? 6 : 4)
5096 
5097 #if GFX_VERx10 >= 125
5098 #define INIT_THREAD_SCRATCH_SIZE(pkt)
5099 #define MERGE_SCRATCH_ADDR(name)                                          \
5100 {                                                                         \
5101    uint32_t pkt2[GENX(name##_length)] = {0};                              \
5102    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
5103       p.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;  \
5104    }                                                                      \
5105    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5106 }
5107 #else
5108 #define INIT_THREAD_SCRATCH_SIZE(pkt)                                     \
5109    pkt.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
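/* Illustrative (hypothetical) example of the encoding above: with
 * shader->total_scratch == 2048 bytes, ffs(2048) - 11 == 1, i.e. the field
 * requests 1 KB << 1 = 2 KB of scratch space per thread.
 */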
5110 #define MERGE_SCRATCH_ADDR(name)                                          \
5111 {                                                                         \
5112    uint32_t pkt2[GENX(name##_length)] = {0};                              \
5113    _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
5114       p.ScratchSpaceBasePointer =                                         \
5115          rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);                     \
5116    }                                                                      \
5117    iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
5118 }
5119 #endif
5120 
5121 
5122 /**
5123  * Encode most of 3DSTATE_VS based on the compiled shader.
5124  */
5125 static void
5126 iris_store_vs_state(const struct intel_device_info *devinfo,
5127                     struct iris_compiled_shader *shader)
5128 {
5129    struct iris_vue_data *vue_data = iris_vue_data(shader);
5130 
5131    iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
5132       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
5133       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
5134 #if GFX_VER < 20
5135       vs.SIMD8DispatchEnable = true;
5136 #endif
5137       vs.UserClipDistanceCullTestEnableBitmask =
5138          vue_data->cull_distance_mask;
5139    }
5140 }
5141 
5142 /**
5143  * Encode most of 3DSTATE_HS based on the compiled shader.
5144  */
5145 static void
5146 iris_store_tcs_state(const struct intel_device_info *devinfo,
5147                      struct iris_compiled_shader *shader)
5148 {
5149    struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
5150    struct iris_vue_data *vue_data = &tcs_data->base;
5151 
5152    iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
5153       INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
5154 
5155 #if GFX_VER >= 12
5156       /* Wa_1604578095:
5157        *
5158        *    Hang occurs when the number of max threads is less than 2 times
5159        *    the number of instance count. The number of max threads must be
5160        *    more than 2 times the number of instance count.
5161        */
5162       assert((devinfo->max_tcs_threads / 2) > tcs_data->instances);
5163       hs.DispatchGRFStartRegisterForURBData = shader->dispatch_grf_start_reg & 0x1f;
5164       hs.DispatchGRFStartRegisterForURBData5 = shader->dispatch_grf_start_reg >> 5;
5165 #endif
5166 
5167       hs.InstanceCount = tcs_data->instances - 1;
5168       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
5169       hs.IncludeVertexHandles = true;
5170 
5171 #if GFX_VER == 12
5172       /* Patch Count threshold specifies the maximum number of patches that
5173        * will be accumulated before a thread dispatch is forced.
5174        */
5175       hs.PatchCountThreshold = tcs_data->patch_count_threshold;
5176 #endif
5177 
5178 #if GFX_VER >= 9
5179 #if GFX_VER < 20
5180       hs.DispatchMode = vue_data->dispatch_mode;
5181 #endif
5182       hs.IncludePrimitiveID = tcs_data->include_primitive_id;
5183 #endif
5184    }
5185 }
5186 
5187 /**
5188  * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
5189  */
5190 static void
5191 iris_store_tes_state(const struct intel_device_info *devinfo,
5192                      struct iris_compiled_shader *shader)
5193 {
5194    struct iris_tes_data *tes_data = iris_tes_data(shader);
5195    struct iris_vue_data *vue_data = &tes_data->base;
5196 
5197    uint32_t *ds_state = (void *) shader->derived_data;
5198    uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
5199 
5200    iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
5201       INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
5202 
5203       ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
5204       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
5205       ds.ComputeWCoordinateEnable =
5206          tes_data->domain == INTEL_TESS_DOMAIN_TRI;
5207 
5208 #if GFX_VER >= 12
5209       ds.PrimitiveIDNotRequired = !tes_data->include_primitive_id;
5210 #endif
5211       ds.UserClipDistanceCullTestEnableBitmask =
5212          vue_data->cull_distance_mask;
5213    }
5214 
5215    iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
5216       te.Partitioning = tes_data->partitioning;
5217 #if GFX_VER >= 20
5218       te.NumberOfRegionsPerPatch = 2;
5219 #endif
5220       te.OutputTopology = tes_data->output_topology;
5221       te.TEDomain = tes_data->domain;
5222       te.TEEnable = true;
5223       te.MaximumTessellationFactorOdd = 63.0;
5224       te.MaximumTessellationFactorNotOdd = 64.0;
5225 #if GFX_VERx10 >= 125
5226       STATIC_ASSERT(TEDMODE_OFF == 0);
5227       if (intel_needs_workaround(devinfo, 14015055625)) {
5228          te.TessellationDistributionMode = TEDMODE_OFF;
5229       } else if (intel_needs_workaround(devinfo, 22012699309)) {
5230          te.TessellationDistributionMode = TEDMODE_RR_STRICT;
5231       } else {
5232          te.TessellationDistributionMode = TEDMODE_RR_FREE;
5233       }
5234 
5235    #if GFX_VER >= 20
5236       te.TessellationDistributionLevel = TEDLEVEL_REGION;
5237    #else
5238       te.TessellationDistributionLevel = TEDLEVEL_PATCH;
5239    #endif
5240       /* 64_TRIANGLES */
5241       te.SmallPatchThreshold = 3;
5242       /* 1K_TRIANGLES */
5243       te.TargetBlockSize = 8;
5244       /* 1K_TRIANGLES */
5245       te.LocalBOPAccumulatorThreshold = 1;
5246 #endif
5247    }
5248 }
5249 
5250 /**
5251  * Encode most of 3DSTATE_GS based on the compiled shader.
5252  */
5253 static void
5254 iris_store_gs_state(const struct intel_device_info *devinfo,
5255                     struct iris_compiled_shader *shader)
5256 {
5257    struct iris_gs_data *gs_data = iris_gs_data(shader);
5258    struct iris_vue_data *vue_data = &gs_data->base;
5259 
5260    iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
5261       INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
5262 
5263       gs.OutputVertexSize = gs_data->output_vertex_size_hwords * 2 - 1;
5264       gs.OutputTopology = gs_data->output_topology;
5265       gs.ControlDataHeaderSize = gs_data->control_data_header_size_hwords;
5266       gs.InstanceControl = gs_data->invocations - 1;
5267 #if GFX_VER < 20
5268       gs.DispatchMode = DISPATCH_MODE_SIMD8;
5269 #endif
5270       gs.IncludePrimitiveID = gs_data->include_primitive_id;
5271       gs.ControlDataFormat = gs_data->control_data_format;
5272       gs.ReorderMode = TRAILING;
5273       gs.ExpectedVertexCount = gs_data->vertices_in;
5274       gs.MaximumNumberofThreads =
5275          GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
5276                       : (devinfo->max_gs_threads - 1);
5277 
5278       if (gs_data->static_vertex_count != -1) {
5279          gs.StaticOutput = true;
5280          gs.StaticOutputVertexCount = gs_data->static_vertex_count;
5281       }
5282       gs.IncludeVertexHandles = vue_data->include_vue_handles;
5283 
5284       gs.UserClipDistanceCullTestEnableBitmask = vue_data->cull_distance_mask;
5285 
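      /* The URB entry output read offset and length below are in 256-bit
       * units, i.e. pairs of VUE slots.  An offset of 1 skips the two
       * header slots; the length covers the rest of the VUE in whole pairs.
       */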
5286       const int urb_entry_write_offset = 1;
5287       const uint32_t urb_entry_output_length =
5288          DIV_ROUND_UP(vue_data->vue_map.num_slots, 2) - urb_entry_write_offset;
5289 
5290       gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
5291       gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
5292    }
5293 }
5294 
5295 /**
5296  * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
5297  */
5298 static void
5299 iris_store_fs_state(const struct intel_device_info *devinfo,
5300                     struct iris_compiled_shader *shader)
5301 {
5302    struct iris_fs_data *fs_data = iris_fs_data(shader);
5303 
5304    uint32_t *ps_state = (void *) shader->derived_data;
5305    uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
5306 
5307    iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
5308       ps.VectorMaskEnable = fs_data->uses_vmask;
5309       ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
5310       ps.SamplerCount = encode_sampler_count(shader);
5311       ps.FloatingPointMode = shader->use_alt_mode;
5312       ps.MaximumNumberofThreadsPerPSD =
5313          devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
5314 
5315 #if GFX_VER < 20
5316       ps.PushConstantEnable = devinfo->needs_null_push_constant_tbimr_workaround ||
5317                               shader->ubo_ranges[0].length > 0;
5318 #endif
5319 
5320       /* From the documentation for this packet:
5321        * "If the PS kernel does not need the Position XY Offsets to
5322        *  compute a Position Value, then this field should be programmed
5323        *  to POSOFFSET_NONE."
5324        *
5325        * "SW Recommendation: If the PS kernel needs the Position Offsets
5326        *  to compute a Position XY value, this field should match Position
5327        *  ZW Interpolation Mode to ensure a consistent position.xyzw
5328        *  computation."
5329        *
5330        * We only require XY sample offsets, so this recommendation doesn't
5331        * look useful at the moment.  We might need it in the future.
5332        */
5333       ps.PositionXYOffsetSelect =
5334          fs_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
5335 
5336       if (shader->total_scratch) {
5337          INIT_THREAD_SCRATCH_SIZE(ps);
5338       }
5339    }
5340 
5341    iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
5342       psx.PixelShaderValid = true;
5343       psx.PixelShaderComputedDepthMode = fs_data->computed_depth_mode;
5344       psx.PixelShaderKillsPixel = fs_data->uses_kill;
5345 #if GFX_VER < 20
5346       psx.AttributeEnable = fs_data->num_varying_inputs != 0;
5347 #endif
5348       psx.PixelShaderUsesSourceDepth = fs_data->uses_src_depth;
5349       psx.PixelShaderUsesSourceW = fs_data->uses_src_w;
5350       psx.PixelShaderIsPerSample = fs_data->is_per_sample;
5351       psx.oMaskPresenttoRenderTarget = fs_data->uses_omask;
5352 
5353 #if GFX_VER >= 9
5354 #if GFX_VER >= 20
5355       assert(!fs_data->pulls_bary);
5356 #else
5357       psx.PixelShaderPullsBary = fs_data->pulls_bary;
5358 #endif
5359       psx.PixelShaderComputesStencil = fs_data->computed_stencil;
5360 #endif
5361 
5362 #if GFX_VER >= 11
5363       psx.PixelShaderRequiresSubpixelSampleOffsets =
5364          fs_data->uses_sample_offsets;
5365       psx.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
5366          fs_data->uses_npc_bary_coefficients;
5367       psx.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
5368          fs_data->uses_pc_bary_coefficients;
5369       psx.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
5370          fs_data->uses_depth_w_coefficients;
5371 #endif
5372    }
5373 }
5374 
5375 /**
5376  * Encode most of INTERFACE_DESCRIPTOR_DATA based on the compiled shader.
5377  *
5378  * The amount of data written must match iris_derived_program_state_size().
5379  */
5380 static void
5381 iris_store_cs_state(const struct intel_device_info *devinfo,
5382                     struct iris_compiled_shader *shader)
5383 {
5384    struct iris_cs_data *cs_data = iris_cs_data(shader);
5385    void *map = shader->derived_data;
5386 
5387    iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
5388 #if GFX_VERx10 < 125
5389       desc.ConstantURBEntryReadLength = cs_data->push.per_thread.regs;
5390       desc.CrossThreadConstantDataReadLength =
5391          cs_data->push.cross_thread.regs;
5392 #else
5393       assert(cs_data->push.per_thread.regs == 0);
5394       assert(cs_data->push.cross_thread.regs == 0);
5395 #endif
5396 #if GFX_VERx10 <= 125
5397       desc.BarrierEnable = cs_data->uses_barrier;
5398 #endif
5399       /* Typically set to 0 to avoid prefetching on every thread dispatch. */
5400       desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
5401          0 : MIN2(shader->bt.size_bytes / 4, 31);
5402       desc.SamplerCount = encode_sampler_count(shader);
5403       /* TODO: Check if we are missing workarounds and enable mid-thread
5404        * preemption.
5405        *
5406        * We still have issues with mid-thread preemption (it was already
5407        * disabled by the kernel on gfx11, due to missing workarounds). It's
5408        * possible that we are just missing some workarounds, and could enable
5409        * it later, but for now let's disable it to fix a GPU hang in compute in
5410        * Car Chase (and possibly more).
5411        */
5412 #if GFX_VER >= 20
5413       desc.ThreadPreemption = false;
5414 #elif GFX_VER >= 12
5415       desc.ThreadPreemptionDisable = true;
5416 #endif
5417    }
5418 }
5419 
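/**
 * Compute the size of the derived data (shader command packets).
 *
 * This must match the data written by the iris_store_xs_state() functions.
 */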
5420 static unsigned
5421 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
5422 {
5423    assert(cache_id <= IRIS_CACHE_BLORP);
5424 
5425    static const unsigned dwords[] = {
5426       [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
5427       [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
5428       [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
5429       [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
5430       [IRIS_CACHE_FS] =
5431          GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
5432       [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
5433       [IRIS_CACHE_BLORP] = 0,
5434    };
5435 
5436    return sizeof(uint32_t) * dwords[cache_id];
5437 }
5438 
5439 /**
5440  * Create any state packets corresponding to the given shader stage
5441  * (e.g. 3DSTATE_VS) and save them as "derived data" in the shader variant.
5442  * This means that we can look up a program in the in-memory cache and
5443  * get most of the state packet without having to reconstruct it.
5444  */
5445 static void
5446 iris_store_derived_program_state(const struct intel_device_info *devinfo,
5447                                  enum iris_program_cache_id cache_id,
5448                                  struct iris_compiled_shader *shader)
5449 {
5450    switch (cache_id) {
5451    case IRIS_CACHE_VS:
5452       iris_store_vs_state(devinfo, shader);
5453       break;
5454    case IRIS_CACHE_TCS:
5455       iris_store_tcs_state(devinfo, shader);
5456       break;
5457    case IRIS_CACHE_TES:
5458       iris_store_tes_state(devinfo, shader);
5459       break;
5460    case IRIS_CACHE_GS:
5461       iris_store_gs_state(devinfo, shader);
5462       break;
5463    case IRIS_CACHE_FS:
5464       iris_store_fs_state(devinfo, shader);
5465       break;
5466    case IRIS_CACHE_CS:
5467       iris_store_cs_state(devinfo, shader);
5468       break;
5469    case IRIS_CACHE_BLORP:
5470       break;
5471    }
5472 }
5473 
5474 /* ------------------------------------------------------------------- */
5475 
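/* _3DCommandSubOpcode values for the 3DSTATE_CONSTANT_* packets.  The packets
 * share a single layout, so emit_push_constant_packets() packs a
 * 3DSTATE_CONSTANT_VS template and patches in the per-stage sub-opcode.
 */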
5476 static const uint32_t push_constant_opcodes[] = {
5477    [MESA_SHADER_VERTEX]    = 21,
5478    [MESA_SHADER_TESS_CTRL] = 25, /* HS */
5479    [MESA_SHADER_TESS_EVAL] = 26, /* DS */
5480    [MESA_SHADER_GEOMETRY]  = 22,
5481    [MESA_SHADER_FRAGMENT]  = 23,
5482    [MESA_SHADER_COMPUTE]   = 0,
5483 };
5484 
5485 static uint32_t
5486 use_null_surface(struct iris_batch *batch, struct iris_context *ice)
5487 {
5488    struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
5489 
5490    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5491 
5492    return ice->state.unbound_tex.offset;
5493 }
5494 
5495 static uint32_t
5496 use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
5497 {
5498    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
5499    if (!ice->state.null_fb.res)
5500       return use_null_surface(batch, ice);
5501 
5502    struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
5503 
5504    iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5505 
5506    return ice->state.null_fb.offset;
5507 }
5508 
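/* A resource's SURFACE_STATEs for its supported aux usages are uploaded
 * back-to-back, SURFACE_STATE_ALIGNMENT bytes apart, in ascending aux-usage
 * order.  The offset for a given usage is therefore the number of enabled
 * lower aux modes times the alignment; e.g. if aux_modes covers NONE and
 * CCS_E, asking for CCS_E counts the one mode below it and yields
 * 1 * SURFACE_STATE_ALIGNMENT.
 */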
5509 static uint32_t
5510 surf_state_offset_for_aux(unsigned aux_modes,
5511                           enum isl_aux_usage aux_usage)
5512 {
5513    assert(aux_modes & (1 << aux_usage));
5514    return SURFACE_STATE_ALIGNMENT *
5515           util_bitcount(aux_modes & ((1 << aux_usage) - 1));
5516 }
5517 
5518 #if GFX_VER == 9
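/* Gfx9: patch the packed clear color directly into each already-uploaded
 * SURFACE_STATE using post-sync PIPE_CONTROL writes, then invalidate the
 * state cache so the updated SURFACE_STATE is re-read.
 */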
5519 static void
5520 surf_state_update_clear_value(struct iris_batch *batch,
5521                               struct iris_resource *res,
5522                               struct iris_surface_state *surf_state,
5523                               enum isl_aux_usage aux_usage)
5524 {
5525    struct isl_device *isl_dev = &batch->screen->isl_dev;
5526    struct iris_bo *state_bo = iris_resource_bo(surf_state->ref.res);
5527    uint64_t real_offset = surf_state->ref.offset + IRIS_MEMZONE_BINDER_START;
5528    uint32_t offset_into_bo = real_offset - state_bo->address;
5529    uint32_t clear_offset = offset_into_bo +
5530       isl_dev->ss.clear_value_offset +
5531       surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5532    uint32_t *color = res->aux.clear_color.u32;
5533 
5534    assert(isl_dev->ss.clear_value_size == 16);
5535 
5536    if (aux_usage == ISL_AUX_USAGE_HIZ) {
5537       iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
5538                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5539                                    state_bo, clear_offset, color[0]);
5540    } else {
5541       iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
5542                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5543                                    state_bo, clear_offset,
5544                                    (uint64_t) color[0] |
5545                                    (uint64_t) color[1] << 32);
5546       iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
5547                                    PIPE_CONTROL_WRITE_IMMEDIATE,
5548                                    state_bo, clear_offset + 8,
5549                                    (uint64_t) color[2] |
5550                                    (uint64_t) color[3] << 32);
5551    }
5552 
5553    iris_emit_pipe_control_flush(batch,
5554                                 "update fast clear: state cache invalidate",
5555                                 PIPE_CONTROL_FLUSH_ENABLE |
5556                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
5557 }
5558 #endif
5559 
5560 static void
5561 update_clear_value(struct iris_context *ice,
5562                    struct iris_batch *batch,
5563                    struct iris_resource *res,
5564                    struct iris_surface_state *surf_state,
5565                    struct isl_view *view)
5566 {
5567    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5568    UNUSED unsigned aux_modes = surf_state->aux_usages;
5569 
5570    /* We only need to update the clear color in the surface state for gfx8 and
5571     * gfx9. Newer gens can read it directly from the clear color state buffer.
5572     */
5573 #if GFX_VER == 9
5574    /* Skip updating the ISL_AUX_USAGE_NONE surface state */
5575    aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
5576 
5577    while (aux_modes) {
5578       enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
5579 
5580       surf_state_update_clear_value(batch, res, surf_state, aux_usage);
5581    }
5582 #elif GFX_VER == 8
5583    /* TODO: Could update rather than re-filling */
5584    alloc_surface_states(surf_state, surf_state->aux_usages);
5585 
5586    fill_surface_states(isl_dev, surf_state, res, &res->surf, view, 0, 0, 0);
5587 
5588    upload_surface_states(ice->state.surface_uploader, surf_state);
5589 #endif
5590 }
5591 
5592 static uint32_t
5593 use_surface_state(struct iris_batch *batch,
5594                   struct iris_surface_state *surf_state,
5595                   enum isl_aux_usage aux_usage)
5596 {
5597    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->ref.res), false,
5598                       IRIS_DOMAIN_NONE);
5599 
5600    return surf_state->ref.offset +
5601           surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5602 }
5603 
5604 /**
5605  * Add a surface to the validation list, as well as the buffer containing
5606  * the corresponding SURFACE_STATE.
5607  *
5608  * Returns the binding table entry (offset to SURFACE_STATE).
5609  */
5610 static uint32_t
5611 use_surface(struct iris_context *ice,
5612             struct iris_batch *batch,
5613             struct pipe_surface *p_surf,
5614             bool writeable,
5615             enum isl_aux_usage aux_usage,
5616             bool is_read_surface,
5617             enum iris_domain access)
5618 {
5619    struct iris_surface *surf = (void *) p_surf;
5620    struct iris_resource *res = (void *) p_surf->texture;
5621 
5622    if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
5623       upload_surface_states(ice->state.surface_uploader,
5624                             &surf->surface_state_read);
5625    }
5626 
5627    if (!surf->surface_state.ref.res) {
5628       upload_surface_states(ice->state.surface_uploader,
5629                             &surf->surface_state);
5630    }
5631 
5632    if (memcmp(&res->aux.clear_color, &surf->clear_color,
5633               sizeof(surf->clear_color)) != 0) {
5634       update_clear_value(ice, batch, res, &surf->surface_state, &surf->view);
5635       if (GFX_VER == 8) {
5636          update_clear_value(ice, batch, res, &surf->surface_state_read,
5637                             &surf->read_view);
5638       }
5639       surf->clear_color = res->aux.clear_color;
5640    }
5641 
5642    if (res->aux.clear_color_bo)
5643       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
5644 
5645    if (res->aux.bo)
5646       iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
5647 
5648    iris_use_pinned_bo(batch, res->bo, writeable, access);
5649 
5650    if (GFX_VER == 8 && is_read_surface) {
5651       return use_surface_state(batch, &surf->surface_state_read, aux_usage);
5652    } else {
5653       return use_surface_state(batch, &surf->surface_state, aux_usage);
5654    }
5655 }
5656 
5657 static uint32_t
5658 use_sampler_view(struct iris_context *ice,
5659                  struct iris_batch *batch,
5660                  struct iris_sampler_view *isv)
5661 {
5662    enum isl_aux_usage aux_usage =
5663       iris_resource_texture_aux_usage(ice, isv->res, isv->view.format,
5664                                       isv->view.base_level, isv->view.levels);
5665 
5666    if (!isv->surface_state.ref.res)
5667       upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
5668 
5669    if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
5670               sizeof(isv->clear_color)) != 0) {
5671       update_clear_value(ice, batch, isv->res, &isv->surface_state,
5672                          &isv->view);
5673       isv->clear_color = isv->res->aux.clear_color;
5674    }
5675 
5676    if (isv->res->aux.clear_color_bo) {
5677       iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
5678                          false, IRIS_DOMAIN_SAMPLER_READ);
5679    }
5680 
5681    if (isv->res->aux.bo) {
5682       iris_use_pinned_bo(batch, isv->res->aux.bo,
5683                          false, IRIS_DOMAIN_SAMPLER_READ);
5684    }
5685 
5686    iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_SAMPLER_READ);
5687 
5688    return use_surface_state(batch, &isv->surface_state, aux_usage);
5689 }
5690 
5691 static uint32_t
5692 use_ubo_ssbo(struct iris_batch *batch,
5693              struct iris_context *ice,
5694              struct pipe_shader_buffer *buf,
5695              struct iris_state_ref *surf_state,
5696              bool writable, enum iris_domain access)
5697 {
5698    if (!buf->buffer || !surf_state->res)
5699       return use_null_surface(batch, ice);
5700 
5701    iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
5702    iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
5703                       IRIS_DOMAIN_NONE);
5704 
5705    return surf_state->offset;
5706 }
5707 
5708 static uint32_t
5709 use_image(struct iris_batch *batch, struct iris_context *ice,
5710           struct iris_shader_state *shs, const struct shader_info *info,
5711           int i)
5712 {
5713    struct iris_image_view *iv = &shs->image[i];
5714    struct iris_resource *res = (void *) iv->base.resource;
5715 
5716    if (!res)
5717       return use_null_surface(batch, ice);
5718 
5719    bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5720 
5721    iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
5722 
5723    if (res->aux.bo)
5724       iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
5725 
5726    if (res->aux.clear_color_bo) {
5727       iris_use_pinned_bo(batch, res->aux.clear_color_bo, false,
5728                          IRIS_DOMAIN_NONE);
5729    }
5730 
5731    enum isl_aux_usage aux_usage = shs->image_aux_usage[i];
5732 
5733    return use_surface_state(batch, &iv->surface_state, aux_usage);
5734 }
5735 
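/* push_bt_entry() appends a SURFACE_STATE offset to the current binding
 * table (unless we're only pinning BOs), and bt_assert() checks that the
 * running entry index matches the precomputed offsets in iris_binding_table.
 */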
5736 #define push_bt_entry(addr) \
5737    assert(addr >= surf_base_offset); \
5738    assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
5739    if (!pin_only) bt_map[s++] = (addr) - surf_base_offset;
5740 
5741 #define bt_assert(section) \
5742    if (!pin_only && shader->bt.used_mask[section] != 0) \
5743       assert(shader->bt.offsets[section] == s);
5744 
5745 /**
5746  * Populate the binding table for a given shader stage.
5747  *
5748  * This fills out the table of pointers to surfaces required by the shader,
5749  * and also adds those buffers to the validation list so the kernel can make
5750  * resident before running our batch.
5751  */
5752 static void
5753 iris_populate_binding_table(struct iris_context *ice,
5754                             struct iris_batch *batch,
5755                             gl_shader_stage stage,
5756                             bool pin_only)
5757 {
5758    const struct iris_binder *binder = &ice->state.binder;
5759    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5760    if (!shader)
5761       return;
5762 
5763    struct iris_binding_table *bt = &shader->bt;
5764    struct iris_shader_state *shs = &ice->state.shaders[stage];
5765    uint32_t surf_base_offset = GFX_VER < 11 ? binder->bo->address : 0;
5766 
5767    uint32_t *bt_map = binder->map + binder->bt_offset[stage];
5768    int s = 0;
5769 
5770    const struct shader_info *info = iris_get_shader_info(ice, stage);
5771    if (!info) {
5772       /* TCS passthrough doesn't need a binding table. */
5773       assert(stage == MESA_SHADER_TESS_CTRL);
5774       return;
5775    }
5776 
5777    if (stage == MESA_SHADER_COMPUTE &&
5778        shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
5779       /* surface for gl_NumWorkGroups */
5780       struct iris_state_ref *grid_data = &ice->state.grid_size;
5781       struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
5782       iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
5783                          IRIS_DOMAIN_PULL_CONSTANT_READ);
5784       iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
5785                          IRIS_DOMAIN_NONE);
5786       push_bt_entry(grid_state->offset);
5787    }
5788 
5789    if (stage == MESA_SHADER_FRAGMENT) {
5790       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5791       /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5792       if (cso_fb->nr_cbufs) {
5793          for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5794             uint32_t addr;
5795             if (cso_fb->cbufs[i]) {
5796                addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
5797                                   ice->state.draw_aux_usage[i], false,
5798                                   IRIS_DOMAIN_RENDER_WRITE);
5799             } else {
5800                addr = use_null_fb_surface(batch, ice);
5801             }
5802             push_bt_entry(addr);
5803          }
5804       } else if (bt->use_null_rt) {
5805          uint32_t addr = use_null_fb_surface(batch, ice);
5806          push_bt_entry(addr);
5807       }
5808    }
5809 
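/* Iterate over the slots of a binding table group, skipping any index that
 * isn't mapped to a binding table entry.
 */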
5810 #define foreach_surface_used(index, group) \
5811    bt_assert(group); \
5812    for (int index = 0; index < bt->sizes[group]; index++) \
5813       if (iris_group_index_to_bti(bt, group, index) != \
5814           IRIS_SURFACE_NOT_USED)
5815 
5816    foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5817       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5818       uint32_t addr;
5819       if (cso_fb->cbufs[i]) {
5820          addr = use_surface(ice, batch, cso_fb->cbufs[i],
5821                             false, ice->state.draw_aux_usage[i], true,
5822                             IRIS_DOMAIN_SAMPLER_READ);
5823          push_bt_entry(addr);
5824       }
5825    }
5826 
5827    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_LOW64) {
5828       struct iris_sampler_view *view = shs->textures[i];
5829       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5830                            : use_null_surface(batch, ice);
5831       push_bt_entry(addr);
5832    }
5833 
5834    foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_HIGH64) {
5835       struct iris_sampler_view *view = shs->textures[64 + i];
5836       uint32_t addr = view ? use_sampler_view(ice, batch, view)
5837                            : use_null_surface(batch, ice);
5838       push_bt_entry(addr);
5839    }
5840 
5841    foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5842       uint32_t addr = use_image(batch, ice, shs, info, i);
5843       push_bt_entry(addr);
5844    }
5845 
5846    foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5847       uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5848                                    &shs->constbuf_surf_state[i], false,
5849                                    IRIS_DOMAIN_PULL_CONSTANT_READ);
5850       push_bt_entry(addr);
5851    }
5852 
5853    foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5854       uint32_t addr =
5855          use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5856                       shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5857       push_bt_entry(addr);
5858    }
5859 
5860 #if 0
5861       /* XXX: YUV surfaces not implemented yet */
5862       bt_assert(plane_start[1], ...);
5863       bt_assert(plane_start[2], ...);
5864 #endif
5865 }
5866 
5867 static void
5868 iris_use_optional_res(struct iris_batch *batch,
5869                       struct pipe_resource *res,
5870                       bool writeable,
5871                       enum iris_domain access)
5872 {
5873    if (res) {
5874       struct iris_bo *bo = iris_resource_bo(res);
5875       iris_use_pinned_bo(batch, bo, writeable, access);
5876    }
5877 }
5878 
5879 static void
5880 pin_depth_and_stencil_buffers(struct iris_batch *batch,
5881                               struct pipe_surface *zsbuf,
5882                               struct iris_depth_stencil_alpha_state *cso_zsa)
5883 {
5884    if (!zsbuf)
5885       return;
5886 
5887    struct iris_resource *zres, *sres;
5888    iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5889 
5890    if (zres) {
5891       iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5892                          IRIS_DOMAIN_DEPTH_WRITE);
5893       if (zres->aux.bo) {
5894          iris_use_pinned_bo(batch, zres->aux.bo,
5895                             cso_zsa->depth_writes_enabled,
5896                             IRIS_DOMAIN_DEPTH_WRITE);
5897       }
5898    }
5899 
5900    if (sres) {
5901       iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5902                          IRIS_DOMAIN_DEPTH_WRITE);
5903    }
5904 }
5905 
5906 static uint32_t
5907 pin_scratch_space(struct iris_context *ice,
5908                   struct iris_batch *batch,
5909                   const struct iris_compiled_shader *shader,
5910                   gl_shader_stage stage)
5911 {
5912    uint32_t scratch_addr = 0;
5913 
5914    if (shader->total_scratch > 0) {
5915       struct iris_bo *scratch_bo =
5916          iris_get_scratch_space(ice, shader->total_scratch, stage);
5917       iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5918 
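      /* On Gfx12.5+, scratch is referenced through a scratch surface state,
       * so the packets take a 64B-aligned offset into the scratch memzone
       * rather than a raw address (hence the assert below); older platforms
       * use the scratch BO's address directly.
       */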
5919 #if GFX_VERx10 >= 125
5920       const struct iris_state_ref *ref =
5921          iris_get_scratch_surf(ice, shader->total_scratch);
5922       iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5923                          false, IRIS_DOMAIN_NONE);
5924       scratch_addr = ref->offset +
5925                      iris_resource_bo(ref->res)->address -
5926                      IRIS_MEMZONE_SCRATCH_START;
5927       assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
5928 #else
5929       scratch_addr = scratch_bo->address;
5930 #endif
5931    }
5932 
5933    return scratch_addr;
5934 }
5935 
5936 /* ------------------------------------------------------------------- */
5937 
5938 /**
5939  * Pin any BOs which were installed by a previous batch, and restored
5940  * via the hardware logical context mechanism.
5941  *
5942  * We don't need to re-emit all state every batch - the hardware context
5943  * mechanism will save and restore it for us.  This includes pointers to
5944  * various BOs...which won't be resident unless we ask the kernel to pin them
5945  * by adding them to the validation list.
5946  *
5947  * We can skip buffers if we've re-emitted those packets, as we're
5948  * overwriting those stale pointers with new ones, and don't actually
5949  * refer to the old BOs.
5950  */
5951 static void
5952 iris_restore_render_saved_bos(struct iris_context *ice,
5953                               struct iris_batch *batch,
5954                               const struct pipe_draw_info *draw)
5955 {
5956    struct iris_genx_state *genx = ice->state.genx;
5957 
5958    const uint64_t clean = ~ice->state.dirty;
5959    const uint64_t stage_clean = ~ice->state.stage_dirty;
5960 
5961    if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5962       iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5963                             IRIS_DOMAIN_NONE);
5964    }
5965 
5966    if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
5967       iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
5968                             IRIS_DOMAIN_NONE);
5969    }
5970 
5971    if (clean & IRIS_DIRTY_BLEND_STATE) {
5972       iris_use_optional_res(batch, ice->state.last_res.blend, false,
5973                             IRIS_DOMAIN_NONE);
5974    }
5975 
5976    if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
5977       iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
5978                             IRIS_DOMAIN_NONE);
5979    }
5980 
5981    if (clean & IRIS_DIRTY_SCISSOR_RECT) {
5982       iris_use_optional_res(batch, ice->state.last_res.scissor, false,
5983                             IRIS_DOMAIN_NONE);
5984    }
5985 
5986    if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
5987       for (int i = 0; i < 4; i++) {
5988          struct iris_stream_output_target *tgt =
5989             (void *) ice->state.so_target[i];
5990          if (tgt) {
5991             iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
5992                                true, IRIS_DOMAIN_OTHER_WRITE);
5993             iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
5994                                true, IRIS_DOMAIN_OTHER_WRITE);
5995          }
5996       }
5997    }
5998 
5999    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6000       if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6001          continue;
6002 
6003       struct iris_shader_state *shs = &ice->state.shaders[stage];
6004       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6005 
6006       if (!shader)
6007          continue;
6008 
6009       for (int i = 0; i < 4; i++) {
6010          const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6011 
6012          if (range->length == 0)
6013             continue;
6014 
6015          /* Range block is a binding table index, map back to UBO index. */
6016          unsigned block_index = iris_bti_to_group_index(
6017             &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6018          assert(block_index != IRIS_SURFACE_NOT_USED);
6019 
6020          struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6021          struct iris_resource *res = (void *) cbuf->buffer;
6022 
6023          if (res)
6024             iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
6025          else
6026             iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
6027                                IRIS_DOMAIN_OTHER_READ);
6028       }
6029    }
6030 
6031    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6032       if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6033          /* Re-pin any buffers referred to by the binding table. */
6034          iris_populate_binding_table(ice, batch, stage, true);
6035       }
6036    }
6037 
6038    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6039       struct iris_shader_state *shs = &ice->state.shaders[stage];
6040       struct pipe_resource *res = shs->sampler_table.res;
6041       if (res)
6042          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6043                             IRIS_DOMAIN_NONE);
6044    }
6045 
6046    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6047       if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
6048          struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6049 
6050          if (shader) {
6051             struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6052             iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6053 
6054             pin_scratch_space(ice, batch, shader, stage);
6055          }
6056       }
6057    }
6058 
6059    if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
6060        (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
6061       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6062       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
6063    }
6064 
6065    iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
6066                          IRIS_DOMAIN_VF_READ);
6067 
6068    if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
6069       uint64_t bound = ice->state.bound_vertex_buffers;
6070       while (bound) {
6071          const int i = u_bit_scan64(&bound);
6072          struct pipe_resource *res = genx->vertex_buffers[i].resource;
6073          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6074                             IRIS_DOMAIN_VF_READ);
6075       }
6076    }
6077 }
6078 
6079 static void
6080 iris_restore_compute_saved_bos(struct iris_context *ice,
6081                                struct iris_batch *batch,
6082                                const struct pipe_grid_info *grid)
6083 {
6084    const uint64_t stage_clean = ~ice->state.stage_dirty;
6085 
6086    const int stage = MESA_SHADER_COMPUTE;
6087    struct iris_shader_state *shs = &ice->state.shaders[stage];
6088 
6089    if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
6090       /* Re-pin any buffers referred to by the binding table. */
6091       iris_populate_binding_table(ice, batch, stage, true);
6092    }
6093 
6094    struct pipe_resource *sampler_res = shs->sampler_table.res;
6095    if (sampler_res)
6096       iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
6097                          IRIS_DOMAIN_NONE);
6098 
6099    if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
6100        (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
6101        (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
6102        (stage_clean & IRIS_STAGE_DIRTY_CS)) {
6103       iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
6104                             IRIS_DOMAIN_NONE);
6105    }
6106 
6107    if (stage_clean & IRIS_STAGE_DIRTY_CS) {
6108       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6109 
6110       if (shader) {
6111          struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6112          iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6113 
6114          if (GFX_VERx10 < 125) {
6115             struct iris_bo *curbe_bo =
6116                iris_resource_bo(ice->state.last_res.cs_thread_ids);
6117             iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
6118          }
6119 
6120          pin_scratch_space(ice, batch, shader, stage);
6121       }
6122    }
6123 }
6124 
6125 /**
6126  * Possibly emit STATE_BASE_ADDRESS (or 3DSTATE_BINDING_TABLE_POOL_ALLOC on Gfx11+) to point at a new binder address.
6127  */
6128 static void
6129 iris_update_binder_address(struct iris_batch *batch,
6130                            struct iris_binder *binder)
6131 {
6132    if (batch->last_binder_address == binder->bo->address)
6133       return;
6134 
6135    struct isl_device *isl_dev = &batch->screen->isl_dev;
6136    uint32_t mocs = isl_mocs(isl_dev, 0, false);
6137 
6138    iris_batch_sync_region_start(batch);
6139 
6140 #if GFX_VER >= 11
6141    /* Use 3DSTATE_BINDING_TABLE_POOL_ALLOC on Icelake and later */
6142 
6143 #if GFX_VERx10 == 120
6144    /* Wa_1607854226:
6145     *
6146     *  Work around non-pipelined state not applying in MEDIA/GPGPU pipeline
6147     *  mode by temporarily putting the pipeline in 3D mode.
6148     */
6149    if (batch->name == IRIS_BATCH_COMPUTE)
6150       emit_pipeline_select(batch, _3D);
6151 #endif
6152 
6153    iris_emit_pipe_control_flush(batch, "Stall for binder realloc",
6154                                 PIPE_CONTROL_CS_STALL);
6155 
6156    iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
6157       btpa.BindingTablePoolBaseAddress = ro_bo(binder->bo, 0);
6158       btpa.BindingTablePoolBufferSize = binder->size / 4096;
6159 #if GFX_VERx10 < 125
6160       btpa.BindingTablePoolEnable = true;
6161 #endif
6162       btpa.MOCS = mocs;
6163    }
6164 
6165 #if GFX_VERx10 == 120
6166    /* Wa_1607854226:
6167     *
6168     *  Put the pipeline back into compute mode.
6169     */
6170    if (batch->name == IRIS_BATCH_COMPUTE)
6171       emit_pipeline_select(batch, GPGPU);
6172 #endif
6173 #else
6174    /* Use STATE_BASE_ADDRESS on older platforms */
6175    flush_before_state_base_change(batch);
6176 
6177    iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
6178       sba.SurfaceStateBaseAddressModifyEnable = true;
6179       sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
6180 
6181       /* The hardware appears to pay attention to the MOCS fields even
6182        * if you don't set the "Address Modify Enable" bit for the base.
6183        */
6184       sba.GeneralStateMOCS            = mocs;
6185       sba.StatelessDataPortAccessMOCS = mocs;
6186       sba.DynamicStateMOCS            = mocs;
6187       sba.IndirectObjectMOCS          = mocs;
6188       sba.InstructionMOCS             = mocs;
6189       sba.SurfaceStateMOCS            = mocs;
6190 #if GFX_VER >= 9
6191       sba.BindlessSurfaceStateMOCS    = mocs;
6192 #endif
6193 #if GFX_VERx10 >= 125
6194       sba.L1CacheControl = L1CC_WB;
6195 #endif
6196    }
6197 #endif
6198 
6199    flush_after_state_base_change(batch);
6200    iris_batch_sync_region_end(batch);
6201 
6202    batch->last_binder_address = binder->bo->address;
6203 }
6204 
6205 static inline void
6206 iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
6207                         bool window_space_position, float *zmin, float *zmax)
6208 {
6209    if (window_space_position) {
6210       *zmin = 0.f;
6211       *zmax = 1.f;
6212       return;
6213    }
6214    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
6215 }
6216 
6217 /* Wa_16018063123 */
6218 static inline void
6219 batch_emit_fast_color_dummy_blit(struct iris_batch *batch)
6220 {
6221 #if GFX_VERx10 >= 125
6222    iris_emit_cmd(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6223       blt.DestinationBaseAddress = batch->screen->workaround_address;
6224       blt.DestinationMOCS = iris_mocs(batch->screen->workaround_address.bo,
6225                                       &batch->screen->isl_dev,
6226                                       ISL_SURF_USAGE_BLITTER_DST_BIT);
6227       blt.DestinationPitch = 63;
6228       blt.DestinationX2 = 1;
6229       blt.DestinationY2 = 4;
6230       blt.DestinationSurfaceWidth = 1;
6231       blt.DestinationSurfaceHeight = 4;
6232       blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6233       blt.DestinationSurfaceQPitch = 4;
6234       blt.DestinationTiling = XY_TILE_LINEAR;
6235    }
6236 #endif
6237 }
6238 
6239 #if GFX_VER >= 12
6240 static void
6241 invalidate_aux_map_state_per_engine(struct iris_batch *batch)
6242 {
6243    uint64_t register_addr = 0;
6244 
6245    switch (batch->name) {
6246    case IRIS_BATCH_RENDER: {
6247       /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6248        * RCS engine idle sequence:
6249        *
6250        *    Gfx12+:
6251        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6252        *                      Target Cache Flush + Depth Cache
6253        *
6254        *    Gfx125+:
6255        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6256        *                      Target Cache Flush + Depth Cache + CCS flush
6257        */
6258       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6259                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
6260                                  PIPE_CONTROL_L3_FABRIC_FLUSH |
6261                                  PIPE_CONTROL_CS_STALL |
6262                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
6263                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
6264                                  (GFX_VERx10 == 125 ?
6265                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6266 
6267       register_addr = GENX(GFX_CCS_AUX_INV_num);
6268       break;
6269    }
6270    case IRIS_BATCH_COMPUTE: {
6271       /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6272        * Compute engine idle sequence:
6273        *
6274        *    Gfx12+:
6275        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall
6276        *
6277        *    Gfx125+:
6278        *       PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
6279        */
6280       iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6281                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
6282                                  PIPE_CONTROL_L3_FABRIC_FLUSH |
6283                                  PIPE_CONTROL_CS_STALL |
6284                                  (GFX_VERx10 == 125 ?
6285                                   PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6286 
6287       register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
6288       break;
6289    }
6290    case IRIS_BATCH_BLITTER: {
6291 #if GFX_VERx10 >= 125
6292       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6293       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
6294          batch_emit_fast_color_dummy_blit(batch);
6295 
6296       /*
6297        * Notice we don't set the L3 Fabric Flush here, because we have
6298        * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6299        * documentation says:
6300        *
6301        *    "L3 Fabric Flush will ensure all the pending transactions in the
6302        *     L3 Fabric are flushed to global observation point. HW does
6303        *     implicit L3 Fabric Flush on all stalling flushes (both explicit
6304        *     and implicit) and on PIPECONTROL having Post Sync Operation
6305        *     enabled."
6306        *
6307        * Therefore setting L3 Fabric Flush here would be redundant.
6308        *
6309        * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6310        * Blitter engine idle sequence:
6311        *
6312        *    Gfx125+:
6313        *       MI_FLUSH_DW (dw0;b16 – flush CCS)
6314        */
6315       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
6316          fd.FlushCCS = true;
6317       }
6318       register_addr = GENX(BCS_CCS_AUX_INV_num);
6319 #endif
6320       break;
6321    }
6322    default:
6323       unreachable("Invalid batch for aux map invalidation");
6324       break;
6325    }
6326 
6327    if (register_addr != 0) {
6328       /* If the aux-map state number increased, then we need to rewrite the
6329        * register. Rewriting the register is used to both set the aux-map
6330        * translation table address, and also to invalidate any previously
6331        * cached translations.
6332        */
6333       iris_load_register_imm32(batch, register_addr, 1);
6334 
6335       /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6336        *
6337        *    "Poll Aux Invalidation bit once the invalidation is set (Register
6338        *     4208 bit 0)"
6339        */
6340       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6341          sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6342          sem.WaitMode = PollingMode;
6343          sem.RegisterPollMode = true;
6344          sem.SemaphoreDataDword = 0x0;
6345          sem.SemaphoreAddress = ro_bo(NULL, register_addr);
6346       }
6347    }
6348 }
6349 
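/* Invalidate the hardware aux-map TLB for this batch's engine if the aux-map
 * tables have changed since this batch last invalidated (tracked by the
 * aux-map context's state number).
 */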
6350 void
6351 genX(invalidate_aux_map_state)(struct iris_batch *batch)
6352 {
6353    struct iris_screen *screen = batch->screen;
6354    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6355    if (!aux_map_ctx)
6356       return;
6357    uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
6358    if (batch->last_aux_map_state != aux_map_state_num) {
6359       invalidate_aux_map_state_per_engine(batch);
6360       batch->last_aux_map_state = aux_map_state_num;
6361    }
6362 }
6363 
6364 static void
6365 init_aux_map_state(struct iris_batch *batch)
6366 {
6367    struct iris_screen *screen = batch->screen;
6368    void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6369    if (!aux_map_ctx)
6370       return;
6371 
6372    uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
6373    assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
6374 
6375    uint32_t reg = 0;
6376    switch (batch->name) {
6377    case IRIS_BATCH_COMPUTE:
6378       if (iris_bufmgr_compute_engine_supported(screen->bufmgr)) {
6379          reg = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
6380          break;
6381       }
6382       /* fallthrough */
6383       FALLTHROUGH;
6384    case IRIS_BATCH_RENDER:
6385       reg = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
6386       break;
6387    case IRIS_BATCH_BLITTER:
6388 #if GFX_VERx10 >= 125
6389       reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
6390 #endif
6391       break;
6392    default:
6393       unreachable("Invalid batch for aux map init.");
6394    }
6395 
6396    if (reg)
6397       iris_load_register_imm64(batch, reg, base_addr);
6398 }
6399 #endif
6400 
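/* Up to four push constant ranges (address plus read length in 256-bit
 * units), gathered by setup_constant_buffers() and later emitted via
 * 3DSTATE_CONSTANT_* / 3DSTATE_CONSTANT_ALL.
 */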
6401 struct push_bos {
6402    struct {
6403       struct iris_address addr;
6404       uint32_t length;
6405    } buffers[4];
6406    int buffer_count;
6407    uint32_t max_length;
6408 };
6409 
6410 static void
6411 setup_constant_buffers(struct iris_context *ice,
6412                        struct iris_batch *batch,
6413                        int stage,
6414                        struct push_bos *push_bos)
6415 {
6416    struct iris_shader_state *shs = &ice->state.shaders[stage];
6417    struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6418 
6419    uint32_t push_range_sum = 0;
6420 
6421    int n = 0;
6422    for (int i = 0; i < 4; i++) {
6423       const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6424 
6425       if (range->length == 0)
6426          continue;
6427 
6428       push_range_sum += range->length;
6429 
6430       if (range->length > push_bos->max_length)
6431          push_bos->max_length = range->length;
6432 
6433       /* Range block is a binding table index, map back to UBO index. */
6434       unsigned block_index = iris_bti_to_group_index(
6435          &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6436       assert(block_index != IRIS_SURFACE_NOT_USED);
6437 
6438       struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6439       struct iris_resource *res = (void *) cbuf->buffer;
6440 
6441       assert(cbuf->buffer_offset % 32 == 0);
6442 
6443       if (res)
6444          iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
6445 
6446       push_bos->buffers[n].length = range->length;
6447       push_bos->buffers[n].addr =
6448          res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
6449          : batch->screen->workaround_address;
6450       n++;
6451    }
6452 
6453    /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
6454     *
6455     *    "The sum of all four read length fields must be less than or
6456     *    equal to the size of 64."
6457     */
6458    assert(push_range_sum <= 64);
6459 
6460    push_bos->buffer_count = n;
6461 }
6462 
6463 static void
6464 emit_push_constant_packets(struct iris_context *ice,
6465                            struct iris_batch *batch,
6466                            int stage,
6467                            const struct push_bos *push_bos)
6468 {
6469    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
6470 
6471    iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
6472       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
6473 
6474 #if GFX_VER >= 9
6475       pkt.MOCS = isl_mocs(isl_dev, 0, false);
6476 #endif
6477 
6478       /* The Skylake PRM contains the following restriction:
6479        *
6480        *    "The driver must ensure The following case does not occur
6481        *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
6482        *     buffer 3 read length equal to zero committed followed by a
6483        *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
6484        *     zero committed."
6485        *
6486        * To avoid this, we program the buffers in the highest slots.
6487        * This way, slot 0 is only used if slot 3 is also used.
6488        */
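      /* For example, with two push ranges (n == 2) the shift below is 2, so
       * the ranges land in slots 2 and 3 while slots 0 and 1 stay empty.
       */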
6489       const int n = push_bos->buffer_count;
6490       assert(n <= 4);
6491       const unsigned shift = 4 - n;
6492       for (int i = 0; i < n; i++) {
6493          pkt.ConstantBody.ReadLength[i + shift] =
6494             push_bos->buffers[i].length;
6495          pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
6496       }
6497    }
6498 }
6499 
6500 #if GFX_VER >= 12
6501 static void
6502 emit_null_push_constant_tbimr_workaround(struct iris_batch *batch)
6503 {
6504    struct isl_device *isl_dev = &batch->screen->isl_dev;
6505    /* Pass a single-register push constant payload for the PS
6506     * stage even if empty, since PS invocations with zero push
6507     * constant cycles have been found to cause hangs with TBIMR
6508     * enabled.  See HSDES #22020184996.
6509     *
6510     * XXX - Use workaround infrastructure and final workaround
6511     *       when provided by hardware team.
6512     */
6513    const struct iris_address null_addr = {
6514       .bo = batch->screen->workaround_bo,
6515       .offset = 1024,
6516    };
6517    const uint32_t num_dwords = 2 + 2 * 1;
6518    uint32_t const_all[num_dwords];
6519    uint32_t *dw = &const_all[0];
6520 
6521    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6522       all.DWordLength = num_dwords - 2;
6523       all.MOCS = isl_mocs(isl_dev, 0, false);
6524       all.ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT);
6525       all.PointerBufferMask = 1;
6526    }
6527    dw += 2;
6528 
6529    _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), dw, data) {
6530       data.PointerToConstantBuffer = null_addr;
6531       data.ConstantBufferReadLength = 1;
6532    }
6533 
6534    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6535 }
6536 
6537 static void
6538 emit_push_constant_packet_all(struct iris_context *ice,
6539                               struct iris_batch *batch,
6540                               uint32_t shader_mask,
6541                               const struct push_bos *push_bos)
6542 {
6543    struct isl_device *isl_dev = &batch->screen->isl_dev;
6544 
6545    if (!push_bos) {
6546       if (batch->screen->devinfo->needs_null_push_constant_tbimr_workaround &&
6547           (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
6548          emit_null_push_constant_tbimr_workaround(batch);
6549          shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
6550       }
6551 
6552       if (shader_mask) {
6553          iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
6554             pc.ShaderUpdateEnable = shader_mask;
6555             pc.MOCS = iris_mocs(NULL, isl_dev, 0);
6556          }
6557       }
6558       return;
6559    }
6560 
6561    const uint32_t n = push_bos->buffer_count;
6562    const uint32_t max_pointers = 4;
6563    const uint32_t num_dwords = 2 + 2 * n;
6564    uint32_t const_all[2 + 2 * max_pointers];
6565    uint32_t *dw = &const_all[0];
6566 
6567    assert(n <= max_pointers);
6568    iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6569       all.DWordLength = num_dwords - 2;
6570       all.MOCS = isl_mocs(isl_dev, 0, false);
6571       all.ShaderUpdateEnable = shader_mask;
6572       all.PointerBufferMask = (1 << n) - 1;
6573    }
6574    dw += 2;
6575 
6576    for (int i = 0; i < n; i++) {
6577       _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
6578                        dw + i * 2, data) {
6579          data.PointerToConstantBuffer = push_bos->buffers[i].addr;
6580          data.ConstantBufferReadLength = push_bos->buffers[i].length;
6581       }
6582    }
6583    iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6584 }
6585 #endif
6586 
6587 void
6588 genX(emit_depth_state_workarounds)(struct iris_context *ice,
6589                                    struct iris_batch *batch,
6590                                    const struct isl_surf *surf)
6591 {
6592 #if INTEL_NEEDS_WA_1808121037
6593    const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6594                                surf->samples == 1;
6595 
6596    switch (ice->state.genx->depth_reg_mode) {
6597    case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
6598       if (!is_d16_1x_msaa)
6599          return;
6600       break;
6601    case IRIS_DEPTH_REG_MODE_D16_1X_MSAA:
6602       if (is_d16_1x_msaa)
6603          return;
6604       break;
6605    case IRIS_DEPTH_REG_MODE_UNKNOWN:
6606       break;
6607    }
6608 
6609    /* We'll change some CHICKEN registers depending on the depth surface
6610     * format. Do a depth flush and stall so the pipeline is not using these
6611     * settings while we change the registers.
6612     */
6613    iris_emit_end_of_pipe_sync(batch,
6614                               "Workaround: Stop pipeline for Wa_1808121037",
6615                               PIPE_CONTROL_DEPTH_STALL |
6616                               PIPE_CONTROL_DEPTH_CACHE_FLUSH);
6617 
6618    /* Wa_1808121037
6619     *
6620     * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6621     * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6622     */
6623    iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6624       reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6625       reg.HIZPlaneOptimizationdisablebitMask = true;
6626    }
6627 
6628    ice->state.genx->depth_reg_mode =
6629       is_d16_1x_msaa ? IRIS_DEPTH_REG_MODE_D16_1X_MSAA :
6630                        IRIS_DEPTH_REG_MODE_HW_DEFAULT;
6631 #endif
6632 }
6633 
6634 /* Calculate TBIMR tiling parameters adequate for the current pipeline
6635  * setup.  Return true if TBIMR should be enabled.
6636  */
6637 UNUSED static bool
6638 calculate_tile_dimensions(struct iris_context *ice,
6639                           unsigned *tile_width, unsigned *tile_height)
6640 {
6641    struct iris_screen *screen = (void *)ice->ctx.screen;
6642    const struct intel_device_info *devinfo = screen->devinfo;
6643 
6644    assert(GFX_VER == 12);
6645    const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
6646 
6647    /* Perform a rough calculation of the tile cache footprint of the
6648     * pixel pipeline, approximating it as the sum of the amount of
6649     * memory used per pixel by every render target, depth, stencil and
6650     * auxiliary surfaces bound to the pipeline.
6651     */
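   /* Rough example, assuming intel_calculate_surface_pixel_size() reports
    * bytes per pixel: a single RGBA8 render target with CCS plus a D16
    * depth buffer would contribute roughly 4 + 4 / aux_scale + 2 bytes.
    */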
6652    unsigned pixel_size = 0;
6653 
6654    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
6655 
6656    if (cso->width == 0 || cso->height == 0)
6657       return false;
6658 
6659    for (unsigned i = 0; i < cso->nr_cbufs; i++) {
6660       const struct iris_surface *surf = (void *)cso->cbufs[i];
6661 
6662       if (surf) {
6663          const struct iris_resource *res = (void *)surf->base.texture;
6664 
6665          pixel_size += intel_calculate_surface_pixel_size(&res->surf);
6666 
6667          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6668           *       aux surface traffic.
6669           */
6670          if (ice->state.draw_aux_usage[i]) {
6671             pixel_size += intel_calculate_surface_pixel_size(&res->aux.surf);
6672 
6673             if (isl_aux_usage_has_ccs(res->aux.usage)) {
6674                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6675                                              &res->surf), aux_scale);
6676             }
6677          }
6678       }
6679    }
6680 
6681    if (cso->zsbuf) {
6682       struct iris_resource *zres;
6683       struct iris_resource *sres;
6684       iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres, &sres);
6685 
6686       if (zres) {
6687          pixel_size += intel_calculate_surface_pixel_size(&zres->surf);
6688 
6689          /* XXX - Pessimistic, in some cases it might be helpful to neglect
6690           *       aux surface traffic.
6691           */
6692          if (iris_resource_level_has_hiz(devinfo, zres, cso->zsbuf->u.tex.level)) {
6693             pixel_size += intel_calculate_surface_pixel_size(&zres->aux.surf);
6694 
6695             if (isl_aux_usage_has_ccs(zres->aux.usage)) {
6696                pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6697                                              &zres->surf), aux_scale);
6698             }
6699          }
6700       }
6701 
6702       if (sres) {
6703          pixel_size += intel_calculate_surface_pixel_size(&sres->surf);
6704       }
6705    }
6706 
6707    /* Compute a tile layout that allows reasonable utilization of the
6708     * tile cache based on the per-pixel cache footprint estimated
6709     * above.
6710     */
6711    intel_calculate_tile_dimensions(devinfo, screen->l3_config_3d,
6712                                    32, 32, cso->width, cso->height, pixel_size,
6713                                    tile_width, tile_height);
6714 
6715    /* Perform TBIMR tile passes only if the framebuffer covers more
6716     * than a single tile.
6717     */
6718    return *tile_width < cso->width || *tile_height < cso->height;
6719 }
6720 
6721 static void
6722 iris_preemption_streamout_wa(struct iris_context *ice,
6723                              struct iris_batch *batch,
6724                              bool enable)
6725 {
6726 #if GFX_VERx10 >= 120
6727    if (!intel_needs_workaround(batch->screen->devinfo, 16013994831))
6728       return;
6729 
6730    iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
6731       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !enable;
6732       reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
6733    }
6734 
6735    /* Emit CS_STALL and 250 noops. */
6736    iris_emit_pipe_control_flush(batch, "workaround: Wa_16013994831",
6737                                 PIPE_CONTROL_CS_STALL);
6738    for (unsigned i = 0; i < 250; i++)
6739       iris_emit_cmd(batch, GENX(MI_NOOP), noop);
6740 
6741    ice->state.genx->object_preemption = enable;
6742 #endif
6743 }
6744 
6745 static void
6746 shader_program_uses_primitive_id(struct iris_context *ice,
6747                                  struct iris_batch *batch,
6748                                  struct iris_compiled_shader *shader,
6749                                  gl_shader_stage stage,
6750                                  bool *uses_primitive_id)
6751 {
6752    switch (stage) {
6753    case MESA_SHADER_TESS_CTRL: {
6754       struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
6755       *uses_primitive_id |= tcs_data->include_primitive_id;
6756       break;
6757    }
6758    case MESA_SHADER_TESS_EVAL: {
6759       struct iris_tes_data *tes_data = iris_tes_data(shader);
6760       *uses_primitive_id |= tes_data->include_primitive_id;
6761       break;
6762    }
6763    default:
6764       break;
6765    }
6766 
6767    struct iris_compiled_shader *gs_shader =
6768       ice->shaders.prog[MESA_SHADER_GEOMETRY];
6769    const struct iris_gs_data *gs_data =
6770       gs_shader ? iris_gs_data(gs_shader) : NULL;
6771 
6772    *uses_primitive_id |= gs_data && gs_data->include_primitive_id;
6773 }
6774 
6775 static void
6776 emit_wa_18020335297_dummy_draw(struct iris_batch *batch)
6777 {
6778 #if GFX_VERx10 >= 125
6779    iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
6780       vfg.DistributionMode = RR_STRICT;
6781    }
6782    iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6783       vf.GeometryDistributionEnable = true;
6784    }
6785 #endif
6786 
6787 #if GFX_VER >= 12
6788    iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
6789       pr.ReplicaMask = 1;
6790    }
6791 #endif
6792 
6793    iris_emit_cmd(batch, GENX(3DSTATE_RASTER), rr) {
6794       rr.CullMode = CULLMODE_NONE;
6795       rr.FrontFaceFillMode = FILL_MODE_SOLID;
6796       rr.BackFaceFillMode = FILL_MODE_SOLID;
6797    }
6798 
6799    iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { }
6800    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { }
6801 
6802 #if GFX_VER >= 11
6803    iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs2) { }
6804 #endif
6805 
6806    iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
6807       clip.ClipEnable = true;
6808       clip.ClipMode = CLIPMODE_REJECT_ALL;
6809    }
6810 
6811    iris_emit_cmd(batch, GENX(3DSTATE_VS), vs) { }
6812    iris_emit_cmd(batch, GENX(3DSTATE_GS), gs) { }
6813    iris_emit_cmd(batch, GENX(3DSTATE_HS), hs) { }
6814    iris_emit_cmd(batch, GENX(3DSTATE_TE), te) { }
6815    iris_emit_cmd(batch, GENX(3DSTATE_DS), ds) { }
6816    iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so) { }
6817 
6818    uint32_t vertex_elements[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
6819    uint32_t *ve_pack_dest = &vertex_elements[1];
6820 
6821    iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), vertex_elements, ve) {
6822       ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
6823                        GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
6824    }
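   /* With 2-dword VERTEX_ELEMENT_STATE entries and the usual length bias
    * of 2, the DWordLength above should evaluate to 1 + 2 * 2 - 2 = 3 for
    * the two dummy elements packed below.
    */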
6825 
6826    for (int i = 0; i < 2; i++) {
6827       iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6828          ve.Valid = true;
6829          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
6830          ve.Component0Control = VFCOMP_STORE_0;
6831          ve.Component1Control = VFCOMP_STORE_0;
6832          ve.Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6833          ve.Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6834       }
6835       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6836    }
6837 
6838    iris_batch_emit(batch, vertex_elements, sizeof(uint32_t) *
6839                    (1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)));
6840 
6841    iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6842       topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
6843    }
6844 
6845    /* Emit dummy draw per slice. */
6846    for (unsigned i = 0; i < batch->screen->devinfo->num_slices; i++) {
6847       iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6848          prim.VertexCountPerInstance = 3;
6849          prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
6850          prim.InstanceCount = 1;
6851          prim.VertexAccessType = SEQUENTIAL;
6852       }
6853    }
6854 }
6855 
6856 static void
6857 iris_upload_dirty_render_state(struct iris_context *ice,
6858                                struct iris_batch *batch,
6859                                const struct pipe_draw_info *draw,
6860                                bool skip_vb_params)
6861 {
6862    struct iris_screen *screen = batch->screen;
6863    struct iris_border_color_pool *border_color_pool =
6864       iris_bufmgr_get_border_color_pool(screen->bufmgr);
6865 
6866    /* Re-emit 3DSTATE_DS before any 3DPRIMITIVE when tessellation is on */
6867    if (intel_needs_workaround(batch->screen->devinfo, 22018402687) &&
6868        ice->shaders.prog[MESA_SHADER_TESS_EVAL])
6869       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TES;
6870 
6871    uint64_t dirty = ice->state.dirty;
6872    uint64_t stage_dirty = ice->state.stage_dirty;
6873 
6874    if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
6875        !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
6876       return;
6877 
6878    struct iris_genx_state *genx = ice->state.genx;
6879    struct iris_binder *binder = &ice->state.binder;
6880    struct iris_fs_data *fs_data =
6881       iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
6882 
6883    /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
6884     * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
6885     */
6886    bool needs_wa_14018912822 =
6887       screen->driconf.intel_enable_wa_14018912822 &&
6888       intel_needs_workaround(batch->screen->devinfo, 14018912822) &&
6889       util_framebuffer_get_num_samples(&ice->state.framebuffer) > 1;
6890 
6891    if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
6892       const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6893       uint32_t cc_vp_address;
6894       bool wa_18020335297_applied = false;
6895 
6896       /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
6897       if (intel_needs_workaround(screen->devinfo, 18020335297) &&
6898           batch->name == IRIS_BATCH_RENDER &&
6899           ice->state.viewport_ptr_set) {
6900          emit_wa_18020335297_dummy_draw(batch);
6901          wa_18020335297_applied = true;
6902       }
6903 
6904       /* XXX: could avoid streaming for depth_clip [0,1] case. */
6905       uint32_t *cc_vp_map =
6906          stream_state(batch, ice->state.dynamic_uploader,
6907                       &ice->state.last_res.cc_vp,
6908                       4 * ice->state.num_viewports *
6909                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
6910       for (int i = 0; i < ice->state.num_viewports; i++) {
6911          float zmin, zmax;
6912          iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
6913                                  ice->state.window_space_position,
6914                                  &zmin, &zmax);
6915          if (cso_rast->depth_clip_near)
6916             zmin = 0.0;
6917          if (cso_rast->depth_clip_far)
6918             zmax = 1.0;
6919 
6920          iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
6921             ccv.MinimumDepth = zmin;
6922             ccv.MaximumDepth = zmax;
6923          }
6924 
6925          cc_vp_map += GENX(CC_VIEWPORT_length);
6926       }
6927 
6928       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
6929          ptr.CCViewportPointer = cc_vp_address;
6930       }
6931 
6932       if (wa_18020335297_applied) {
6933 #if GFX_VER >= 12
6934          iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { }
6935 #endif
6936          /* Dirty all emitted WA state to make sure that current real
6937           * state is restored.
6938           */
6939          dirty |= IRIS_DIRTY_VFG |
6940                   IRIS_DIRTY_VF |
6941                   IRIS_DIRTY_RASTER |
6942                   IRIS_DIRTY_VF_STATISTICS |
6943                   IRIS_DIRTY_VF_SGVS |
6944                   IRIS_DIRTY_CLIP |
6945                   IRIS_DIRTY_STREAMOUT |
6946                   IRIS_DIRTY_VERTEX_ELEMENTS |
6947                   IRIS_DIRTY_VF_TOPOLOGY;
6948 
6949          for (int stage = 0; stage < MESA_SHADER_FRAGMENT; stage++) {
6950             if (ice->shaders.prog[stage])
6951                stage_dirty |= (IRIS_STAGE_DIRTY_VS << stage);
6952          }
6953       }
6954       ice->state.viewport_ptr_set = true;
6955    }
6956 
6957    if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
6958       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6959       int32_t x_min, y_min, x_max, y_max;
6960       uint32_t sf_cl_vp_address;
6961       uint32_t *vp_map =
6962          stream_state(batch, ice->state.dynamic_uploader,
6963                       &ice->state.last_res.sf_cl_vp,
6964                       4 * ice->state.num_viewports *
6965                       GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
6966 
6967       x_min = ice->state.render_area.x;
6968       y_min = ice->state.render_area.y;
6969       x_max = ice->state.render_area.width;
6970       y_max = ice->state.render_area.height;
6971 
6972       for (unsigned i = 0; i < ice->state.num_viewports; i++) {
6973          const struct pipe_viewport_state *state = &ice->state.viewports[i];
6974          float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
6975 
6976          float vp_xmin = viewport_extent(state, 0, -1.0f);
6977          float vp_xmax = viewport_extent(state, 0,  1.0f);
6978          float vp_ymin = viewport_extent(state, 1, -1.0f);
6979          float vp_ymax = viewport_extent(state, 1,  1.0f);
6980 
6981          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
6982                                         state->scale[0], state->scale[1],
6983                                         state->translate[0], state->translate[1],
6984                                         &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
6985 
6986          iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
6987             vp.ViewportMatrixElementm00 = state->scale[0];
6988             vp.ViewportMatrixElementm11 = state->scale[1];
6989             vp.ViewportMatrixElementm22 = state->scale[2];
6990             vp.ViewportMatrixElementm30 = state->translate[0];
6991             vp.ViewportMatrixElementm31 = state->translate[1];
6992             vp.ViewportMatrixElementm32 = state->translate[2];
6993             vp.XMinClipGuardband = gb_xmin;
6994             vp.XMaxClipGuardband = gb_xmax;
6995             vp.YMinClipGuardband = gb_ymin;
6996             vp.YMaxClipGuardband = gb_ymax;
6997             vp.XMinViewPort = MAX2(vp_xmin, 0);
6998             vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6999             vp.YMinViewPort = MAX2(vp_ymin, 0);
7000             vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
7001          }
7002 
7003          vp_map += GENX(SF_CLIP_VIEWPORT_length);
7004       }
7005 
7006       iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
7007          ptr.SFClipViewportPointer = sf_cl_vp_address;
7008       }
7009    }
7010 
7011    if (dirty & IRIS_DIRTY_URB) {
7012       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
7013          if (!ice->shaders.prog[i]) {
7014             ice->shaders.urb.cfg.size[i] = 1;
7015          } else {
7016             struct iris_vue_data *vue_data =
7017                iris_vue_data(ice->shaders.prog[i]);
7018             ice->shaders.urb.cfg.size[i] = vue_data->urb_entry_size;
7019          }
7020          assert(ice->shaders.urb.cfg.size[i] != 0);
7021       }
7022 
7023       genX(emit_urb_config)(batch,
7024                             ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
7025                             ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL);
7026    }
7027 
7028    if (dirty & IRIS_DIRTY_BLEND_STATE) {
7029       struct iris_blend_state *cso_blend = ice->state.cso_blend;
7030       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7031       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7032 
7033       bool color_blend_zero = false;
7034       bool alpha_blend_zero = false;
7035 
7036       /* Always write at least one BLEND_STATE - the final RT message will
7037        * reference BLEND_STATE[0] even if there aren't color writes.  There
7038        * may still be alpha testing, computed depth, and so on.
7039        */
7040       const int rt_dwords =
7041          MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
7042 
7043       uint32_t blend_offset;
7044       uint32_t *blend_map =
7045          stream_state(batch, ice->state.dynamic_uploader,
7046                       &ice->state.last_res.blend,
7047                       96, 64, &blend_offset);
7048 
7049       /* Copy of blend entries for merging dynamic changes. */
7050       uint32_t blend_entries[4 * rt_dwords];
7051       memcpy(blend_entries, &cso_blend->blend_state[1], sizeof(blend_entries));
7052 
7053       unsigned cbufs = MAX2(cso_fb->nr_cbufs, 1);
7054 
7055       uint32_t *blend_entry = blend_entries;
7056       for (unsigned i = 0; i < cbufs; i++) {
7057          int dst_blend_factor = cso_blend->ps_dst_blend_factor[i];
7058          int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[i];
7059          uint32_t entry[GENX(BLEND_STATE_ENTRY_length)];
7060          iris_pack_state(GENX(BLEND_STATE_ENTRY), entry, be) {
7061             if (needs_wa_14018912822) {
7062                if (dst_blend_factor == BLENDFACTOR_ZERO) {
7063                   dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7064                   color_blend_zero = true;
7065                }
7066                if (dst_alpha_blend_factor == BLENDFACTOR_ZERO) {
7067                   dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7068                   alpha_blend_zero = true;
7069                }
7070             }
7071             be.DestinationBlendFactor = dst_blend_factor;
7072             be.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7073          }
7074 
7075          /* Merge entry. */
7076          uint32_t *dst = blend_entry;
7077          uint32_t *src = entry;
7078          for (unsigned j = 0; j < GENX(BLEND_STATE_ENTRY_length); j++)
7079             *dst++ |= *src++;
7080 
7081          blend_entry += GENX(BLEND_STATE_ENTRY_length);
7082       }
7083 
7084       /* Blend constants modified for Wa_14018912822. */
7085       if (ice->state.color_blend_zero != color_blend_zero) {
7086          ice->state.color_blend_zero = color_blend_zero;
7087          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7088       }
7089       if (ice->state.alpha_blend_zero != alpha_blend_zero) {
7090          ice->state.alpha_blend_zero = alpha_blend_zero;
7091          ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7092       }
7093 
7094       uint32_t blend_state_header;
7095       iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
7096          bs.AlphaTestEnable = cso_zsa->alpha_enabled;
7097          bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
7098       }
7099 
7100       blend_map[0] = blend_state_header | cso_blend->blend_state[0];
7101       memcpy(&blend_map[1], blend_entries, 4 * rt_dwords);
7102 
7103       iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
7104          ptr.BlendStatePointer = blend_offset;
7105          ptr.BlendStatePointerValid = true;
7106       }
7107    }
7108 
7109    if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
7110       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7111 #if GFX_VER == 8
7112       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7113 #endif
7114       uint32_t cc_offset;
7115       void *cc_map =
7116          stream_state(batch, ice->state.dynamic_uploader,
7117                       &ice->state.last_res.color_calc,
7118                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
7119                       64, &cc_offset);
7120       iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
7121          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
7122          cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
7123          cc.BlendConstantColorRed   = ice->state.color_blend_zero ?
7124             0.0 : ice->state.blend_color.color[0];
7125          cc.BlendConstantColorGreen = ice->state.color_blend_zero ?
7126             0.0 : ice->state.blend_color.color[1];
7127          cc.BlendConstantColorBlue  = ice->state.color_blend_zero ?
7128             0.0 : ice->state.blend_color.color[2];
7129          cc.BlendConstantColorAlpha = ice->state.alpha_blend_zero ?
7130             0.0 : ice->state.blend_color.color[3];
7131 #if GFX_VER == 8
7132          cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
7133          cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7134 #endif
7135       }
7136       iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7137          ptr.ColorCalcStatePointer = cc_offset;
7138          ptr.ColorCalcStatePointerValid = true;
7139       }
7140    }
7141 
7142 #if GFX_VERx10 == 125
7143    if (dirty & (IRIS_DIRTY_RENDER_BUFFER | IRIS_DIRTY_DEPTH_BUFFER)) {
7144       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7145       unsigned tile_width, tile_height;
7146 
7147       ice->state.use_tbimr = batch->screen->driconf.enable_tbimr &&
7148          calculate_tile_dimensions(ice, &tile_width, &tile_height);
7149 
7150       if (ice->state.use_tbimr) {
7151          /* Use a batch size of 128 polygons per slice as recommended
7152           * by BSpec 68436 "TBIMR Programming".
7153           */
7154          const unsigned num_slices = screen->devinfo->num_slices;
7155          const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
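         /* For example, a one- or two-slice part gives batch_size = 256,
          * and the TBIMRBatchSize field below encodes log2(256) - 5 = 3.
          */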
7156 
7157          iris_emit_cmd(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
7158             tbimr.TileRectangleHeight = tile_height;
7159             tbimr.TileRectangleWidth = tile_width;
7160             tbimr.VerticalTileCount = DIV_ROUND_UP(cso_fb->height, tile_height);
7161             tbimr.HorizontalTileCount = DIV_ROUND_UP(cso_fb->width, tile_width);
7162             tbimr.TBIMRBatchSize = util_logbase2(batch_size) - 5;
7163             tbimr.TileBoxCheck = true;
7164          }
7165       }
7166    }
7167 #endif
7168 
7169    /* Wa_1604061319
7170     *
7171     *    3DSTATE_CONSTANT_* needs to be programmed before BTP_*
7172     *
7173     * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
7174     * any stage has a dirty binding table.
7175     */
7176    const bool emit_const_wa = GFX_VER >= 11 &&
7177       ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
7178        (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
7179 
7180 #if GFX_VER >= 12
7181    uint32_t nobuffer_stages = 0;
7182 #endif
7183 
7184    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7185       if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
7186           !emit_const_wa)
7187          continue;
7188 
7189       struct iris_shader_state *shs = &ice->state.shaders[stage];
7190       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7191 
7192       if (!shader)
7193          continue;
7194 
7195       if (shs->sysvals_need_upload)
7196          upload_sysvals(ice, stage, NULL);
7197 
7198       struct push_bos push_bos = {};
7199       setup_constant_buffers(ice, batch, stage, &push_bos);
7200 
7201 #if GFX_VER >= 12
7202       /* If this stage doesn't have any push constants, emit it later in a
7203        * single CONSTANT_ALL packet with all the other stages.
7204        */
7205       if (push_bos.buffer_count == 0) {
7206          nobuffer_stages |= 1 << stage;
7207          continue;
7208       }
7209 
7210       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
7211        * contains only 5 bits, so we can only use it for buffers smaller than
7212        * 32.
7213        *
7214        * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
7215        * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
7216        * for disabling stages, where all address bits are zero.  However, we
7217        * can't safely use it for general buffers with arbitrary addresses.
7218        * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
7219        * case.
7220        */
7221       if (push_bos.max_length < 32 && GFX_VERx10 > 120) {
7222          emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
7223          continue;
7224       }
7225 #endif
7226       emit_push_constant_packets(ice, batch, stage, &push_bos);
7227    }
7228 
7229 #if GFX_VER >= 12
7230    if (nobuffer_stages)
7231       /* Wa_16011448509: all address bits are zero */
7232       emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
7233 #endif
7234 
7235    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7236       /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
7237        * in order to commit constants.  TODO: Investigate "Disable Gather
7238        * at Set Shader" to go back to legacy mode...
7239        */
7240       if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
7241                           (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
7242                             << stage)) {
7243          iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
7244             ptr._3DCommandSubOpcode = 38 + stage;
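            /* The per-stage BINDING_TABLE_POINTERS packets use consecutive
             * sub-opcodes, so offsetting the VS packet's sub-opcode by the
             * stage index retargets it; the same trick is used for the
             * sampler state pointers below.
             */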
7245             ptr.PointertoVSBindingTable =
7246                binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
7247          }
7248       }
7249    }
7250 
7251    if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
7252       // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
7253       // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
7254 
7255       /* The PIPE_CONTROL command description says:
7256        *
7257        *   "Whenever a Binding Table Index (BTI) used by a Render Target
7258        *    Message points to a different RENDER_SURFACE_STATE, SW must issue a
7259        *    Render Target Cache Flush by enabling this bit. When render target
7260        *    flush is set due to new association of BTI, PS Scoreboard Stall bit
7261        *    must be set in this packet."
7262        */
7263       // XXX: does this need to happen at 3DSTATE_BTP_PS time?
7264       iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
7265                                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
7266                                    PIPE_CONTROL_STALL_AT_SCOREBOARD);
7267    }
7268 
7269    if (dirty & IRIS_DIRTY_RENDER_BUFFER)
7270       trace_framebuffer_state(&batch->trace, NULL, &ice->state.framebuffer);
7271 
7272    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7273       if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
7274          iris_populate_binding_table(ice, batch, stage, false);
7275       }
7276    }
7277 
7278    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7279       if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
7280           !ice->shaders.prog[stage])
7281          continue;
7282 
7283       iris_upload_sampler_states(ice, stage);
7284 
7285       struct iris_shader_state *shs = &ice->state.shaders[stage];
7286       struct pipe_resource *res = shs->sampler_table.res;
7287       if (res)
7288          iris_use_pinned_bo(batch, iris_resource_bo(res), false,
7289                             IRIS_DOMAIN_NONE);
7290 
7291       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
7292          ptr._3DCommandSubOpcode = 43 + stage;
7293          ptr.PointertoVSSamplerState = shs->sampler_table.offset;
7294       }
7295    }
7296 
7297    if (ice->state.need_border_colors)
7298       iris_use_pinned_bo(batch, border_color_pool->bo, false, IRIS_DOMAIN_NONE);
7299 
7300    if (dirty & IRIS_DIRTY_MULTISAMPLE) {
7301       iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
7302          ms.PixelLocation =
7303             ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
7304          if (ice->state.framebuffer.samples > 0)
7305             ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
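            /* NumberofMultisamples is log2 of the sample count, e.g.
             * ffs(4) - 1 = 2 for 4x MSAA.
             */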
7306       }
7307    }
7308 
7309    if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
7310       iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
7311          ms.SampleMask = ice->state.sample_mask;
7312       }
7313    }
7314 
7315 #if GFX_VERx10 >= 125
7316    /* This is only used on >= gfx125 for dynamic 3DSTATE_TE and
7317     * 3DSTATE_VFG emission related workarounds.
7318     */
7319    bool program_uses_primitive_id = false;
7320 
7321    /* Check if FS stage will use primitive ID overrides. */
7322    const struct intel_vue_map *last_vue_map =
7323       &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7324    if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
7325        last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
7326       program_uses_primitive_id = true;
7327    }
7328 #endif
7329 
7330    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7331       if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
7332          continue;
7333 
7334       struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7335 
7336       if (shader) {
7337          struct iris_resource *cache = (void *) shader->assembly.res;
7338          iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
7339 
7340          uint32_t scratch_addr =
7341             pin_scratch_space(ice, batch, shader, stage);
7342 
7343 #if GFX_VERx10 >= 125
7344          shader_program_uses_primitive_id(ice, batch, shader, stage,
7345                                           &program_uses_primitive_id);
7346 #endif
7347 
7348          if (stage == MESA_SHADER_FRAGMENT) {
7349             UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
7350             struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7351 
7352             uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
7353             _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
7354 #if GFX_VER >= 9
7355                struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(shader->brw_prog_data);
7356 #else
7357                struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(shader->elk_prog_data);
7358 #endif
7359                intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
7360                                            wm_prog_data, util_framebuffer_get_num_samples(cso_fb),
7361                                            0 /* msaa_flags */);
7362 
7363 #if GFX_VER == 12
7364                assert(fs_data->dispatch_multi == 0 ||
7365                       (fs_data->dispatch_multi == 16 && fs_data->max_polygons == 2));
7366                ps.DualSIMD8DispatchEnable = fs_data->dispatch_multi;
7367                /* XXX - No major improvement observed from enabling
7368                 *       overlapping subspans, but it could be helpful
7369                 *       in theory when the requirements listed on the
7370                 *       BSpec page for 3DSTATE_PS_BODY are met.
7371                 */
7372                ps.OverlappingSubspansEnable = false;
7373 #endif
7374 
7375 #if GFX_VER >= 9
7376                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7377                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7378                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7379                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7380 #if GFX_VER < 20
7381                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7382                   brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7383 #endif
7384 
7385                ps.KernelStartPointer0 = KSP(shader) +
7386                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7387                ps.KernelStartPointer1 = KSP(shader) +
7388                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7389 #if GFX_VER < 20
7390                ps.KernelStartPointer2 = KSP(shader) +
7391                   brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7392 #endif
7393 #else
7394                ps.DispatchGRFStartRegisterForConstantSetupData0 =
7395                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7396                ps.DispatchGRFStartRegisterForConstantSetupData1 =
7397                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7398                ps.DispatchGRFStartRegisterForConstantSetupData2 =
7399                   elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7400 
7401                ps.KernelStartPointer0 = KSP(shader) +
7402                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7403                ps.KernelStartPointer1 = KSP(shader) +
7404                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7405                ps.KernelStartPointer2 = KSP(shader) +
7406                   elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7407 #endif
7408 
7409 #if GFX_VERx10 >= 125
7410                ps.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7411 #else
7412                ps.ScratchSpaceBasePointer =
7413                   rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7414 #endif
7415             }
7416 
7417             uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
7418             iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
7419 #if GFX_VER >= 9
7420                if (!fs_data->uses_sample_mask)
7421                   psx.InputCoverageMaskState  = ICMS_NONE;
7422                else if (fs_data->post_depth_coverage)
7423                   psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
7424                else if (fs_data->inner_coverage &&
7425                         cso->conservative_rasterization)
7426                   psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
7427                else
7428                   psx.InputCoverageMaskState = ICMS_NORMAL;
7429 #else
7430                psx.PixelShaderUsesInputCoverageMask =
7431                   fs_data->uses_sample_mask;
7432 #endif
7433             }
7434 
7435             uint32_t *shader_ps = (uint32_t *) shader->derived_data;
7436             uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
7437             iris_emit_merge(batch, shader_ps, ps_state,
7438                             GENX(3DSTATE_PS_length));
7439             iris_emit_merge(batch, shader_psx, psx_state,
7440                             GENX(3DSTATE_PS_EXTRA_length));
7441 #if GFX_VERx10 >= 125
7442          } else if (stage == MESA_SHADER_TESS_EVAL) {
7443             uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
7444             iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
7445                if (intel_needs_workaround(screen->devinfo, 14015055625) &&
7446                    program_uses_primitive_id)
7447                   te.TessellationDistributionMode = TEDMODE_OFF;
7448                else if (intel_needs_workaround(screen->devinfo, 22012699309))
7449                   te.TessellationDistributionMode = TEDMODE_RR_STRICT;
7450                else
7451                   te.TessellationDistributionMode = TEDMODE_RR_FREE;
7452             }
7453 
7454             uint32_t ds_state[GENX(3DSTATE_DS_length)] = { 0 };
7455             iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
7456                if (scratch_addr)
7457                   ds.ScratchSpaceBuffer =
7458                      scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7459             }
7460 
7461             uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7462             uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7463 
7464             iris_emit_merge(batch, shader_ds, ds_state,
7465                             GENX(3DSTATE_DS_length));
7466             iris_emit_merge(batch, shader_te, te_state,
7467                             GENX(3DSTATE_TE_length));
7468 #endif
7469          } else if (scratch_addr) {
7470             uint32_t *pkt = (uint32_t *) shader->derived_data;
7471             switch (stage) {
7472             case MESA_SHADER_VERTEX:    MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
7473             case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
7474             case MESA_SHADER_TESS_EVAL: {
7475                uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7476                uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7477                iris_batch_emit(batch, shader_te, 4 * GENX(3DSTATE_TE_length));
7478                MERGE_SCRATCH_ADDR(3DSTATE_DS);
7479                break;
7480             }
7481             case MESA_SHADER_GEOMETRY:  MERGE_SCRATCH_ADDR(3DSTATE_GS); break;
7482             }
7483          } else {
7484             iris_batch_emit(batch, shader->derived_data,
7485                             iris_derived_program_state_size(stage));
7486          }
7487       } else {
7488          if (stage == MESA_SHADER_TESS_EVAL) {
7489             iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7490             iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
7491             iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7492          } else if (stage == MESA_SHADER_GEOMETRY) {
7493             iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
7494          }
7495       }
7496    }
7497 
7498 #if GFX_VERx10 >= 125
7499    /* Inspect program_uses_primitive_id state and dirty VFG if required. */
7500    if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
7501        program_uses_primitive_id != ice->state.uses_primitive_id) {
7502       dirty |= IRIS_DIRTY_VFG;
7503       ice->state.uses_primitive_id = program_uses_primitive_id;
7504    }
7505 #endif
7506 
7507    if (ice->state.streamout_active) {
7508       if (dirty & IRIS_DIRTY_SO_BUFFERS) {
7509          /* Wa_16011411144
7510           * SW must insert a PIPE_CONTROL cmd before and after the
7511           * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* state is
7512           * not combined with other state changes.
7513           */
7514          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7515             iris_emit_pipe_control_flush(batch,
7516                                          "SO pre change stall WA",
7517                                          PIPE_CONTROL_CS_STALL);
7518          }
7519 
7520          for (int i = 0; i < 4; i++) {
7521             struct iris_stream_output_target *tgt =
7522                (void *) ice->state.so_target[i];
7523             enum { dwords = GENX(3DSTATE_SO_BUFFER_length) };
7524             uint32_t *so_buffers = genx->so_buffers + i * dwords;
7525             bool zero_offset = false;
7526 
7527             if (tgt) {
7528                zero_offset = tgt->zero_offset;
7529                iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
7530                                   true, IRIS_DOMAIN_OTHER_WRITE);
7531                iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
7532                                   true, IRIS_DOMAIN_OTHER_WRITE);
7533             }
7534 
7535             if (zero_offset) {
7536                /* Skip the last DWord which contains "Stream Offset" of
7537                 * 0xFFFFFFFF and instead emit a dword of zero directly.
7538                 */
7539                STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
7540                              32 * (dwords - 1));
7541                const uint32_t zero = 0;
7542                iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
7543                iris_batch_emit(batch, &zero, sizeof(zero));
7544                tgt->zero_offset = false;
7545             } else {
7546                iris_batch_emit(batch, so_buffers, 4 * dwords);
7547             }
7548          }
7549 
7550          /* Wa_16011411144 */
7551          if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7552             iris_emit_pipe_control_flush(batch,
7553                                          "SO post change stall WA",
7554                                          PIPE_CONTROL_CS_STALL);
7555          }
7556       }
7557 
7558       if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
7559          /* Wa_16011773973:
7560           * If SOL is enabled and SO_DECL state has to be programmed,
7561           *    1. Send 3D State SOL state with SOL disabled
7562           *    2. Send SO_DECL NP state
7563           *    3. Send 3D State SOL with SOL Enabled
7564           */
7565          if (intel_device_info_is_dg2(batch->screen->devinfo))
7566             iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7567 
7568          uint32_t *decl_list =
7569             ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
7570          iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
7571 
7572 #if GFX_VER >= 11 && GFX_VER < 20
7573          /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7574           * 3DSTATE_SO_DECL_LIST:
7575           *
7576           *    "Workaround: This command must be followed by a PIPE_CONTROL
7577           *     with CS Stall bit set."
7578           *
7579           * On DG2+ also known as Wa_1509820217.
7580           */
7581          iris_emit_pipe_control_flush(batch,
7582                                       "workaround: cs stall after so_decl",
7583                                       PIPE_CONTROL_CS_STALL);
7584 #endif
7585       }
7586 
7587       if (dirty & IRIS_DIRTY_STREAMOUT) {
7588          const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7589 
7590 #if GFX_VERx10 >= 120
7591          /* Wa_16013994831 - Disable preemption. */
7592          if (intel_needs_workaround(batch->screen->devinfo, 16013994831))
7593             iris_preemption_streamout_wa(ice, batch, false);
7594 #endif
7595 
7596          uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
7597          iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
7598             sol.SOFunctionEnable = true;
7599             sol.SOStatisticsEnable = true;
7600 
7601             sol.RenderingDisable = cso_rast->rasterizer_discard &&
7602                                    !ice->state.prims_generated_query_active;
7603             sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7604 
7605 
7606 #if INTEL_NEEDS_WA_18022508906
7607             /* Wa_14017076903 :
7608              *
7609              * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
7610              *
7611              * SOL_INT::Render_Enable =
7612              *   (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
7613              *   (
7614              *     (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
7615              *     !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
7616              *     !3DSTATE_STREAMOUT::API_Render_Disable &&
7617              *     (
7618              *       3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
7619              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
7620              *       3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
7621              *       3DSTATE_PS_EXTRA::PS_Valid ||
7622              *       3DSTATE_WM::Legacy Depth_Buffer_Clear ||
7623              *       3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
7624              *       3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
7625              *     )
7626              *   )
7627              *
7628              * If SOL_INT::Render_Enable is false, the SO stage will not forward any
7629              * topologies down the pipeline. Which is not what we want for occlusion
7630              * queries.
7631              *
7632              * Here we force rendering to get SOL_INT::Render_Enable when occlusion
7633              * queries are active.
7634              */
7635             const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7636             if (!cso_rast->rasterizer_discard && ice->state.occlusion_query_active)
7637                sol.ForceRendering = Force_on;
7638 #endif
7639          }
7640 
7641          assert(ice->state.streamout);
7642 
7643          iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
7644                          GENX(3DSTATE_STREAMOUT_length));
7645       }
7646    } else {
7647       if (dirty & IRIS_DIRTY_STREAMOUT) {
7648 
7649 #if GFX_VERx10 >= 120
7650          /* Wa_16013994831 - Enable preemption. */
7651          if (!ice->state.genx->object_preemption)
7652             iris_preemption_streamout_wa(ice, batch, true);
7653 #endif
7654 
7655          iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7656       }
7657    }
7658 
7659    if (dirty & IRIS_DIRTY_CLIP) {
7660       struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7661       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7662 
7663       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
7664                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7665       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
7666          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
7667                     : ice->state.prim_is_points_or_lines);
7668       const struct intel_vue_map *last =
7669          &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7670 
7671       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
7672       iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
7673          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
7674          if (cso_rast->rasterizer_discard)
7675             cl.ClipMode = CLIPMODE_REJECT_ALL;
7676          else if (ice->state.window_space_position)
7677             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
7678          else
7679             cl.ClipMode = CLIPMODE_NORMAL;
7680 
7681          cl.PerspectiveDivideDisable = ice->state.window_space_position;
7682          cl.ViewportXYClipTestEnable = !points_or_lines;
7683 
7684          cl.NonPerspectiveBarycentricEnable = fs_data->uses_nonperspective_interp_modes;
7685 
7686          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1 ||
7687                                       !(last->slots_valid & VARYING_BIT_LAYER);
7688          cl.MaximumVPIndex = ice->state.num_viewports - 1;
7689       }
7690       iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
7691                       ARRAY_SIZE(cso_rast->clip));
7692    }
7693 
7694    if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
7695       /* From the Broadwell PRM, Volume 2, documentation for
7696        * 3DSTATE_RASTER, "Antialiasing Enable":
7697        *
7698        * "This field must be disabled if any of the render targets
7699        * have integer (UINT or SINT) surface format."
7700        *
7701        * Additionally internal documentation for Gfx12+ states:
7702        *
7703        * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
7704        *  FORCED_SAMPLE_COUNT > 1."
7705        */
7706       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7707       unsigned samples = util_framebuffer_get_num_samples(cso_fb);
7708       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7709 
7710       bool aa_enable = cso->line_smooth &&
7711                        !ice->state.has_integer_rt &&
7712                        !(batch->screen->devinfo->ver >= 12 && samples > 1);
7713 
7714       uint32_t dynamic_raster[GENX(3DSTATE_RASTER_length)];
7715       iris_pack_command(GENX(3DSTATE_RASTER), &dynamic_raster, raster) {
7716          raster.AntialiasingEnable = aa_enable;
7717       }
7718       iris_emit_merge(batch, cso->raster, dynamic_raster,
7719                       ARRAY_SIZE(cso->raster));
7720 
7721       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7722       iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7723          sf.ViewportTransformEnable = !ice->state.window_space_position;
7724 
7725 #if GFX_VER >= 12
7726          sf.DerefBlockSize = ice->state.urb_deref_block_size;
7727 #endif
7728       }
7729       iris_emit_merge(batch, cso->sf, dynamic_sf,
7730                       ARRAY_SIZE(dynamic_sf));
7731    }
7732 
7733    if (dirty & IRIS_DIRTY_WM) {
7734       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7735       uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
7736 
7737       iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
7738          wm.StatisticsEnable = ice->state.statistics_counters_enabled;
7739 
7740          wm.BarycentricInterpolationMode =
7741             iris_fs_barycentric_modes(ice->shaders.prog[MESA_SHADER_FRAGMENT], 0);
7742 
7743          if (fs_data->early_fragment_tests)
7744             wm.EarlyDepthStencilControl = EDSC_PREPS;
7745          else if (fs_data->has_side_effects)
7746             wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7747          else
7748             wm.EarlyDepthStencilControl = EDSC_NORMAL;
7749 
7750          /* We could skip this bit if color writes are enabled. */
7751          if (fs_data->has_side_effects || fs_data->uses_kill)
7752             wm.ForceThreadDispatchEnable = ForceON;
7753       }
7754       iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
7755    }
7756 
7757    if (dirty & IRIS_DIRTY_SBE) {
7758       iris_emit_sbe(batch, ice);
7759    }
7760 
7761    if (dirty & IRIS_DIRTY_PS_BLEND) {
7762       struct iris_blend_state *cso_blend = ice->state.cso_blend;
7763       struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7764       const struct shader_info *fs_info =
7765          iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7766 
7767       int dst_blend_factor = cso_blend->ps_dst_blend_factor[0];
7768       int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[0];
7769 
7770       /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
7771        * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
7772        */
7773       if (needs_wa_14018912822) {
7774          if (ice->state.color_blend_zero)
7775             dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7776          if (ice->state.alpha_blend_zero)
7777             dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7778       }
7779 
7780       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7781       iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7782          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7783          pb.AlphaTestEnable = cso_zsa->alpha_enabled;
7784 
7785          pb.DestinationBlendFactor = dst_blend_factor;
7786          pb.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7787 
7788          /* The dual source blending docs caution against using SRC1 factors
7789           * when the shader doesn't use a dual source render target write.
7790           * Empirically, this can lead to GPU hangs, and the results are
7791           * undefined anyway, so simply disable blending to avoid the hang.
7792           */
7793          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7794             (!cso_blend->dual_color_blending || fs_data->dual_src_blend);
7795       }
7796 
7797       iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7798                       ARRAY_SIZE(cso_blend->ps_blend));
7799    }
7800 
7801    if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
7802       struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7803 #if GFX_VER >= 9 && GFX_VER < 12
7804       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7805       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7806       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7807          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7808          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7809       }
7810       iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
7811 #else
7812       /* Use modify disable fields which allow us to emit packets
7813        * directly instead of merging them later.
7814        */
7815       iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
7816 #endif
7817 
7818       /* Depth or stencil write changed in cso. */
7819       if (intel_needs_workaround(batch->screen->devinfo, 18019816803) &&
7820           (dirty & IRIS_DIRTY_DS_WRITE_ENABLE)) {
7821          iris_emit_pipe_control_flush(
7822             batch, "workaround: PSS stall after DS write enable change",
7823             PIPE_CONTROL_PSS_STALL_SYNC);
7824       }
7825 
7826 #if GFX_VER >= 12
7827       iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
7828 #endif
7829    }
7830 
7831    if (dirty & IRIS_DIRTY_STENCIL_REF) {
7832 #if GFX_VER >= 12
7833       /* Use modify disable fields which allow us to emit packets
7834        * directly instead of merging them later.
7835        */
7836       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7837       uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7838       iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7839          wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7840          wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7841          wmds.StencilTestMaskModifyDisable = true;
7842          wmds.StencilWriteMaskModifyDisable = true;
7843          wmds.StencilStateModifyDisable = true;
7844          wmds.DepthStateModifyDisable = true;
7845       }
7846       iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
7847 #endif
7848    }
7849 
7850    if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
7851       /* Wa_1409725701:
7852        *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
7853        *    stored as an array of up to 16 elements. The location of first
7854        *    element of the array, as specified by Pointer to SCISSOR_RECT,
7855        *    should be aligned to a 64-byte boundary."
7856        */
7857       uint32_t alignment = 64;
7858       uint32_t scissor_offset =
7859          emit_state(batch, ice->state.dynamic_uploader,
7860                     &ice->state.last_res.scissor,
7861                     ice->state.scissors,
7862                     sizeof(struct pipe_scissor_state) *
7863                     ice->state.num_viewports, alignment);
7864 
7865       iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7866          ptr.ScissorRectPointer = scissor_offset;
7867       }
7868    }
7869 
7870    if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
7871       struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
7872 
7873       /* Do not emit the cso yet. We may need to update clear params first. */
7874       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7875       struct iris_resource *zres = NULL, *sres = NULL;
7876       if (cso_fb->zsbuf) {
7877          iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
7878                                           &zres, &sres);
7879       }
7880 
7881       if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
7882 #if GFX_VER < 20
7883          uint32_t *clear_params =
7884             cso_z->packets + ARRAY_SIZE(cso_z->packets) -
7885             GENX(3DSTATE_CLEAR_PARAMS_length);
7886 
7887          iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
7888             clear.DepthClearValueValid = true;
7889             clear.DepthClearValue = zres->aux.clear_color.f32[0];
7890          }
7891 #endif
7892       }
7893 
7894       iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
7895 
7896       if (intel_needs_workaround(batch->screen->devinfo, 1408224581) ||
7897           intel_needs_workaround(batch->screen->devinfo, 14014097488) ||
7898           intel_needs_workaround(batch->screen->devinfo, 14016712196)) {
7899          /* Wa_1408224581
7900           *
7901           * Workaround (Gfx12LP A-step only): an additional PIPE_CONTROL
7902           * with post-sync = store dword operation is required, i.e. emit
7903           * an extra pipe control after the stencil state whenever the
7904           * surface state bits of this state change.
7905           *
7906           * This also seems sufficient to handle Wa_14014097488 and
7907           * Wa_14016712196.
7908           */
7909          iris_emit_pipe_control_write(batch, "WA for depth/stencil state",
7910                                       PIPE_CONTROL_WRITE_IMMEDIATE,
7911                                       screen->workaround_address.bo,
7912                                       screen->workaround_address.offset, 0);
7913       }
7914 
7915       if (zres)
7916          genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
7917    }
7918 
7919    if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
7920       /* Listen for buffer changes, and also write enable changes. */
7921       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7922       pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
7923    }
7924 
7925    if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
7926       iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7927          for (int i = 0; i < 32; i++) {
7928             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7929          }
7930       }
7931    }
7932 
7933    if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
7934       struct iris_rasterizer_state *cso = ice->state.cso_rast;
7935       iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7936 #if GFX_VER >= 11
7937       /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7938        * 3DSTATE_LINE_STIPPLE:
7939        *
7940        *    "Workaround: This command must be followed by a PIPE_CONTROL with
7941        *     CS Stall bit set."
7942        */
7943       iris_emit_pipe_control_flush(batch,
7944                                    "workaround: post 3DSTATE_LINE_STIPPLE",
7945                                    PIPE_CONTROL_CS_STALL);
7946 #endif
7947    }
7948 
7949    if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
7950       iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7951          topo.PrimitiveTopologyType =
7952             translate_prim_type(draw->mode, ice->state.vertices_per_patch);
7953       }
7954    }
7955 
7956    if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
7957       int count = util_bitcount64(ice->state.bound_vertex_buffers);
7958       uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
7959 
7960       if (ice->state.vs_uses_draw_params && !skip_vb_params) {
7961          assert(ice->draw.draw_params.res);
7962 
7963          struct iris_vertex_buffer_state *state =
7964             &(ice->state.genx->vertex_buffers[count]);
7965          pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
7966          struct iris_resource *res = (void *) state->resource;
7967 
7968          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7969             vb.VertexBufferIndex = count;
7970             vb.AddressModifyEnable = true;
7971             vb.BufferPitch = 0;
7972             vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
7973             vb.BufferStartingAddress =
7974                ro_bo(NULL, res->bo->address +
7975                            (int) ice->draw.draw_params.offset);
7976             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7977                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7978 #if GFX_VER >= 12
7979             vb.L3BypassDisable       = true;
7980 #endif
7981          }
7982          dynamic_bound |= 1ull << count;
7983          count++;
7984       }
7985 
7986       if (ice->state.vs_uses_derived_draw_params && !skip_vb_params) {
7987          struct iris_vertex_buffer_state *state =
7988             &(ice->state.genx->vertex_buffers[count]);
7989          pipe_resource_reference(&state->resource,
7990                                  ice->draw.derived_draw_params.res);
7991          struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
7992 
7993          iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7994            vb.VertexBufferIndex = count;
7995             vb.AddressModifyEnable = true;
7996             vb.BufferPitch = 0;
7997             vb.BufferSize =
7998                res->bo->size - ice->draw.derived_draw_params.offset;
7999             vb.BufferStartingAddress =
8000                ro_bo(NULL, res->bo->address +
8001                            (int) ice->draw.derived_draw_params.offset);
8002             vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
8003                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8004 #if GFX_VER >= 12
8005             vb.L3BypassDisable       = true;
8006 #endif
8007          }
8008          dynamic_bound |= 1ull << count;
8009          count++;
8010       }
8011 
8012       if (count) {
8013 #if GFX_VER >= 11
8014          /* Gfx11+ doesn't need the cache workaround below */
8015          uint64_t bound = dynamic_bound;
8016          while (bound) {
8017             const int i = u_bit_scan64(&bound);
8018             iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
8019                                   false, IRIS_DOMAIN_VF_READ);
8020          }
8021 #else
8022          /* The VF cache designers cut corners, and made the cache key's
8023           * <VertexBufferIndex, Memory Address> tuple only consider the bottom
8024           * 32 bits of the address.  If you have two vertex buffers which get
8025           * placed exactly 4 GiB apart and use them in back-to-back draw calls,
8026           * you can get collisions (even within a single batch).
8027           *
8028           * So, we need to do a VF cache invalidate if the buffer for a VB
8029           * slot changes [48:32] address bits from the previous time.
8030           */
8031          unsigned flush_flags = 0;
8032 
8033          uint64_t bound = dynamic_bound;
8034          while (bound) {
8035             const int i = u_bit_scan64(&bound);
8036             uint16_t high_bits = 0;
8037 
8038             struct iris_resource *res =
8039                (void *) genx->vertex_buffers[i].resource;
8040             if (res) {
8041                iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
8042 
8043                high_bits = res->bo->address >> 32ull;
8044                if (high_bits != ice->state.last_vbo_high_bits[i]) {
8045                   flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
8046                                  PIPE_CONTROL_CS_STALL;
8047                   ice->state.last_vbo_high_bits[i] = high_bits;
8048                }
8049             }
8050          }
8051 
8052          if (flush_flags) {
8053             iris_emit_pipe_control_flush(batch,
8054                                          "workaround: VF cache 32-bit key [VB]",
8055                                          flush_flags);
8056          }
8057 #endif
8058 
8059          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
8060 
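         /* The packet below is one 3DSTATE_VERTEX_BUFFERS header DWord
          * followed by vb_dwords of VERTEX_BUFFER_STATE per bound buffer
          * (DWordLength is the total minus 2, per the usual command length
          * bias).  Each buffer's pre-packed state is OR'd with the
          * dynamically packed stride in the loop further down.
          */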
8061          uint32_t *map =
8062             iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
8063          _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
8064             vb.DWordLength = (vb_dwords * count + 1) - 2;
8065          }
8066          map += 1;
8067 
8068          const struct iris_vertex_element_state *cso_ve =
8069             ice->state.cso_vertex_elements;
8070 
8071          bound = dynamic_bound;
8072          while (bound) {
8073             const int i = u_bit_scan64(&bound);
8074 
8075             uint32_t vb_stride[GENX(VERTEX_BUFFER_STATE_length)];
8076             struct iris_bo *bo =
8077                iris_resource_bo(genx->vertex_buffers[i].resource);
8078             iris_pack_state(GENX(VERTEX_BUFFER_STATE), &vb_stride, vbs) {
8079                vbs.BufferPitch = cso_ve->stride[i];
8080                /* Unnecessary except to defeat the genxml nonzero checker */
8081                vbs.MOCS = iris_mocs(bo, &screen->isl_dev,
8082                                     ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8083             }
8084             for (unsigned d = 0; d < vb_dwords; d++)
8085                map[d] = genx->vertex_buffers[i].state[d] | vb_stride[d];
8086 
8087             map += vb_dwords;
8088          }
8089       }
8090    }
8091 
8092    if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
8093       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8094       const unsigned entries = MAX2(cso->count, 1);
8095       if (!(ice->state.vs_needs_sgvs_element ||
8096             ice->state.vs_uses_derived_draw_params ||
8097             ice->state.vs_needs_edge_flag)) {
8098          iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
8099                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
8100       } else {
8101          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
8102          const unsigned dyn_count = cso->count +
8103             ice->state.vs_needs_sgvs_element +
8104             ice->state.vs_uses_derived_draw_params;
8105 
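         /* dynamic_ves layout: the 3DSTATE_VERTEX_ELEMENTS header DWord,
          * then the application's elements (minus the edge flag, if any),
          * then the optional SGVS and derived-draw-params elements, with
          * the edge flag element re-appended last.
          */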
8106          iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
8107                            &dynamic_ves, ve) {
8108             ve.DWordLength =
8109                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
8110          }
8111          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
8112                 (cso->count - ice->state.vs_needs_edge_flag) *
8113                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
8114          uint32_t *ve_pack_dest =
8115             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
8116                          GENX(VERTEX_ELEMENT_STATE_length)];
8117 
8118          if (ice->state.vs_needs_sgvs_element) {
8119             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
8120                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
8121             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8122                ve.Valid = true;
8123                ve.VertexBufferIndex =
8124                   util_bitcount64(ice->state.bound_vertex_buffers);
8125                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8126                ve.Component0Control = base_ctrl;
8127                ve.Component1Control = base_ctrl;
8128                ve.Component2Control = VFCOMP_STORE_0;
8129                ve.Component3Control = VFCOMP_STORE_0;
8130             }
8131             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8132          }
8133          if (ice->state.vs_uses_derived_draw_params) {
8134             iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8135                ve.Valid = true;
8136                ve.VertexBufferIndex =
8137                   util_bitcount64(ice->state.bound_vertex_buffers) +
8138                   ice->state.vs_uses_draw_params;
8139                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8140                ve.Component0Control = VFCOMP_STORE_SRC;
8141                ve.Component1Control = VFCOMP_STORE_SRC;
8142                ve.Component2Control = VFCOMP_STORE_0;
8143                ve.Component3Control = VFCOMP_STORE_0;
8144             }
8145             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8146          }
8147          if (ice->state.vs_needs_edge_flag) {
8148             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
8149                ve_pack_dest[i] = cso->edgeflag_ve[i];
8150          }
8151 
8152          iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
8153                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
8154       }
8155 
8156       if (!ice->state.vs_needs_edge_flag) {
8157          iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
8158                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8159       } else {
8160          assert(cso->count > 0);
8161          const unsigned edgeflag_index = cso->count - 1;
8162          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
8163          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
8164                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
8165 
8166          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
8167             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
8168          iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
8169             vi.VertexElementIndex = edgeflag_index +
8170                ice->state.vs_needs_sgvs_element +
8171                ice->state.vs_uses_derived_draw_params;
8172          }
8173          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
8174             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
8175 
8176          iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
8177                          entries * GENX(3DSTATE_VF_INSTANCING_length));
8178       }
8179    }
8180 
8181    if (dirty & IRIS_DIRTY_VF_SGVS) {
8182       const struct iris_vs_data *vs_data =
8183          iris_vs_data(ice->shaders.prog[MESA_SHADER_VERTEX]);
8184       struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8185 
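      /* VertexID and InstanceID share a single extra vertex element
       * (components .z and .w), placed right after the application's
       * elements; the edge flag element, if present, stays last, hence
       * the vs_needs_edge_flag adjustment to the element offset.
       */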
8186       iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
8187          if (vs_data->uses_vertexid) {
8188             sgv.VertexIDEnable = true;
8189             sgv.VertexIDComponentNumber = 2;
8190             sgv.VertexIDElementOffset =
8191                cso->count - ice->state.vs_needs_edge_flag;
8192          }
8193 
8194          if (vs_data->uses_instanceid) {
8195             sgv.InstanceIDEnable = true;
8196             sgv.InstanceIDComponentNumber = 3;
8197             sgv.InstanceIDElementOffset =
8198                cso->count - ice->state.vs_needs_edge_flag;
8199          }
8200       }
8201    }
8202 
8203    if (dirty & IRIS_DIRTY_VF_STATISTICS) {
8204       iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
8205          vf.StatisticsEnable = true;
8206       }
8207    }
8208 
8209    if (dirty & IRIS_DIRTY_VF) {
8210 #if INTEL_WA_16012775297_GFX_VER
8211       /* Emit dummy VF statistics before each 3DSTATE_VF. */
8212       if (intel_needs_workaround(batch->screen->devinfo, 16012775297) &&
8213           (dirty & IRIS_DIRTY_VF_STATISTICS) == 0) {
8214          iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
8215             vfs.StatisticsEnable = true;
8216          }
8217       }
8218 #endif
8219 
8220       iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
8221 #if GFX_VERx10 >= 125
8222          vf.GeometryDistributionEnable = true;
8223 #endif
8224          if (draw->primitive_restart) {
8225             vf.IndexedDrawCutIndexEnable = true;
8226             vf.CutIndex = draw->restart_index;
8227          }
8228       }
8229    }
8230 
8231 #if GFX_VERx10 >= 125
8232    if (dirty & IRIS_DIRTY_VFG) {
8233       iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
8234          /* Gfx12.5: If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
8235          vfg.DistributionMode =
8236 #if GFX_VER < 20
8237             ice->shaders.prog[MESA_SHADER_TESS_EVAL] == NULL ? RR_FREE :
8238 #endif
8239                                                                RR_STRICT;
8240          if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
8241              program_uses_primitive_id)
8242             vfg.DistributionGranularity = InstanceLevelGranularity;
8243          else
8244             vfg.DistributionGranularity = BatchLevelGranularity;
8245 #if INTEL_WA_14014851047_GFX_VER
8246          vfg.GranularityThresholdDisable =
8247             intel_needs_workaround(batch->screen->devinfo, 14014851047);
8248 #endif
8249          vfg.ListCutIndexEnable = draw->primitive_restart;
8250          /* 192 vertices for TRILIST_ADJ */
8251          vfg.ListNBatchSizeScale = 0;
8252          /* Batch size of 384 vertices */
8253          vfg.List3BatchSizeScale = 2;
8254          /* Batch size of 128 vertices */
8255          vfg.List2BatchSizeScale = 1;
8256          /* Batch size of 128 vertices */
8257          vfg.List1BatchSizeScale = 2;
8258          /* Batch size of 256 vertices for STRIP topologies */
8259          vfg.StripBatchSizeScale = 3;
8260          /* 192 control points for PATCHLIST_3 */
8261          vfg.PatchBatchSizeScale = 1;
8262          /* 192 control points for PATCHLIST_3 */
8263          vfg.PatchBatchSizeMultiplier = 31;
8264       }
8265    }
8266 #endif
8267 
8268 #if GFX_VER == 8
8269    if (dirty & IRIS_DIRTY_PMA_FIX) {
8270       bool enable = want_pma_fix(ice);
8271       genX(update_pma_fix)(ice, batch, enable);
8272    }
8273 #endif
8274 
8275    if (ice->state.current_hash_scale != 1)
8276       genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
8277 
8278 #if GFX_VER >= 12
8279    genX(invalidate_aux_map_state)(batch);
8280 #endif
8281 }
8282 
8283 static void
8284 flush_vbos(struct iris_context *ice, struct iris_batch *batch)
8285 {
8286    struct iris_genx_state *genx = ice->state.genx;
8287    uint64_t bound = ice->state.bound_vertex_buffers;
8288    while (bound) {
8289       const int i = u_bit_scan64(&bound);
8290       struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
8291       iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
8292    }
8293 }
8294 
8295 static bool
8296 point_or_line_list(enum mesa_prim prim_type)
8297 {
8298    switch (prim_type) {
8299    case MESA_PRIM_POINTS:
8300    case MESA_PRIM_LINES:
8301    case MESA_PRIM_LINE_STRIP:
8302    case MESA_PRIM_LINES_ADJACENCY:
8303    case MESA_PRIM_LINE_STRIP_ADJACENCY:
8304    case MESA_PRIM_LINE_LOOP:
8305       return true;
8306    default:
8307       return false;
8308    }
8309    return false;
8310 }
8311 
8312 void
8313 genX(emit_breakpoint)(struct iris_batch *batch, bool emit_before_draw)
8314 {
8315    struct iris_context *ice = batch->ice;
8316    uint32_t draw_count = emit_before_draw ?
8317                          p_atomic_inc_return(&ice->draw_call_count) :
8318                          p_atomic_read(&ice->draw_call_count);
8319 
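   /* When the draw count matches the requested debug breakpoint, emit a
    * semaphore wait that polls until something (e.g. a debugger) writes
    * 0x1 into the breakpoint BO, effectively pausing the GPU at this draw.
    */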
8320    if (((draw_count == intel_debug_bkp_before_draw_count &&
8321          emit_before_draw) ||
8322         (draw_count == intel_debug_bkp_after_draw_count &&
8323          !emit_before_draw)))  {
8324       iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
8325          sem.WaitMode            = PollingMode;
8326          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
8327          sem.SemaphoreDataDword  = 0x1;
8328          sem.SemaphoreAddress    = rw_bo(batch->screen->breakpoint_bo, 0,
8329                                          IRIS_DOMAIN_OTHER_WRITE);
8330       };
8331    }
8332 }
8333 
8334 void
8335 genX(emit_3dprimitive_was)(struct iris_batch *batch,
8336                            const struct pipe_draw_indirect_info *indirect,
8337                            uint32_t primitive_type,
8338                            uint32_t vertex_count)
8339 {
8340    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8341    UNUSED const struct iris_context *ice = batch->ice;
8342 
8343 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
8344    if (intel_needs_workaround(devinfo, 22014412737) &&
8345        (point_or_line_list(primitive_type) || indirect ||
8346         (vertex_count == 1 || vertex_count == 2))) {
8347       iris_emit_pipe_control_write(batch, "Wa_22014412737",
8348                                    PIPE_CONTROL_WRITE_IMMEDIATE,
8349                                    batch->screen->workaround_bo,
8350                                    batch->screen->workaround_address.offset,
8351                                    0ull);
8352       batch->num_3d_primitives_emitted = 0;
8353    } else if (intel_needs_workaround(devinfo, 16014538804)) {
8354       batch->num_3d_primitives_emitted++;
8355 
8356       /* Wa_16014538804 - Send empty/dummy pipe control after 3 3DPRIMITIVE. */
8357       if (batch->num_3d_primitives_emitted == 3) {
8358          iris_emit_pipe_control_flush(batch, "Wa_16014538804", 0);
8359          batch->num_3d_primitives_emitted = 0;
8360       }
8361    }
8362 #endif
8363 }
8364 
8365 void
8366 genX(urb_workaround)(struct iris_batch *batch,
8367                      const struct intel_urb_config *urb_cfg)
8368 {
8369 #if INTEL_NEEDS_WA_16014912113
8370    if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
8371                                MESA_SHADER_TESS_EVAL) &&
8372        batch->ice->shaders.last_urb.size[0] != 0) {
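      /* When the TES URB setup changes, re-program the previous URB
       * allocation for VS..GS with 256 entries on VS and 0 on the other
       * stages, then issue an HDC pipeline flush below.
       */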
8373       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
8374 #if GFX_VER >= 12
8375          iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
8376             urb._3DCommandSubOpcode += i;
8377             urb.VSURBEntryAllocationSize =
8378                batch->ice->shaders.last_urb.size[i] - 1;
8379             urb.VSURBStartingAddressSlice0 =
8380                batch->ice->shaders.last_urb.start[i];
8381             urb.VSURBStartingAddressSliceN =
8382                batch->ice->shaders.last_urb.start[i];
8383             urb.VSNumberofURBEntriesSlice0 = i == 0 ? 256 : 0;
8384             urb.VSNumberofURBEntriesSliceN = i == 0 ? 256 : 0;
8385          }
8386 #else
8387          iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
8388             urb._3DCommandSubOpcode += i;
8389             urb.VSURBStartingAddress =
8390                batch->ice->shaders.last_urb.start[i];
8391             urb.VSURBEntryAllocationSize =
8392                batch->ice->shaders.last_urb.size[i] - 1;
8393             urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
8394          }
8395 #endif
8396       }
8397       iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8398          pc.HDCPipelineFlushEnable = true;
8399       }
8400    }
8401 #endif
8402 
8403    /* Update current urb config. */
8404    memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
8405           sizeof(struct intel_urb_config));
8406 }
8407 
8408 static void
8409 iris_emit_index_buffer(struct iris_context *ice,
8410                        struct iris_batch *batch,
8411                        const struct pipe_draw_info *draw,
8412                        const struct pipe_draw_start_count_bias *sc)
8413 {
8414    unsigned offset;
8415 
8416    if (draw->has_user_indices) {
8417       unsigned start_offset = draw->index_size * sc->start;
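      /* Upload only the slice of user index data this draw references; the
       * returned offset is then biased back by start_offset so that
       * sc->start still indexes the right element in the uploaded buffer.
       */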
8418 
8419       u_upload_data(ice->ctx.const_uploader, start_offset,
8420                     sc->count * draw->index_size, 4,
8421                     (char*)draw->index.user + start_offset,
8422                     &offset, &ice->state.last_res.index_buffer);
8423       offset -= start_offset;
8424    } else {
8425       struct iris_resource *res = (void *) draw->index.resource;
8426       res->bind_history |= PIPE_BIND_INDEX_BUFFER;
8427 
8428       pipe_resource_reference(&ice->state.last_res.index_buffer,
8429                               draw->index.resource);
8430       offset = 0;
8431 
8432       iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
8433    }
8434 
8435    struct iris_genx_state *genx = ice->state.genx;
8436    struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
8437 
8438    uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
8439    iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
8440       ib.IndexFormat = draw->index_size >> 1;
8441       ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
8442                           ISL_SURF_USAGE_INDEX_BUFFER_BIT);
8443       ib.BufferSize = bo->size - offset;
8444       ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
8445 #if GFX_VER >= 12
8446       ib.L3BypassDisable       = true;
8447 #endif
8448    }
8449 
8450    if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
8451       memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
8452       iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
8453       iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
8454    }
8455 
8456 #if GFX_VER < 11
8457    /* The VF cache key only uses 32-bits, see vertex buffer comment above */
8458    uint16_t high_bits = bo->address >> 32ull;
8459    if (high_bits != ice->state.last_index_bo_high_bits) {
8460       iris_emit_pipe_control_flush(batch,
8461                                    "workaround: VF cache 32-bit key [IB]",
8462                                    PIPE_CONTROL_VF_CACHE_INVALIDATE |
8463                                    PIPE_CONTROL_CS_STALL);
8464       ice->state.last_index_bo_high_bits = high_bits;
8465    }
8466 #endif
8467 }
8468 
8469 
8470 static void
8471 iris_upload_render_state(struct iris_context *ice,
8472                          struct iris_batch *batch,
8473                          const struct pipe_draw_info *draw,
8474                          unsigned drawid_offset,
8475                          const struct pipe_draw_indirect_info *indirect,
8476                          const struct pipe_draw_start_count_bias *sc)
8477 {
8478    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8479    bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8480 
8481    trace_intel_begin_draw(&batch->trace);
8482 
8483    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8484       flush_vbos(ice, batch);
8485 
8486    iris_batch_sync_region_start(batch);
8487 
8488    /* Always pin the binder.  If we're emitting new binding table pointers,
8489     * we need it.  If not, we're probably inheriting old tables via the
8490     * context, and need it anyway.  Since true zero-bindings cases are
8491     * practically non-existent, just pin it and avoid last_res tracking.
8492     */
8493    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8494                       IRIS_DOMAIN_NONE);
8495 
8496    if (!batch->contains_draw) {
8497       if (GFX_VER == 12) {
8498          /* Re-emit constants when starting a new batch buffer in order to
8499           * work around push constant corruption on context switch.
8500           *
8501           * XXX - Provide hardware spec quotation when available.
8502           */
8503          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8504                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8505                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8506                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8507                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8508       }
8509       batch->contains_draw = true;
8510    }
8511 
8512    if (!batch->contains_draw_with_next_seqno) {
8513       iris_restore_render_saved_bos(ice, batch, draw);
8514       batch->contains_draw_with_next_seqno = true;
8515    }
8516 
8517    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8518     * Wa_16011107343 (same for gfx12)
8519     * We implement this by setting TCS dirty on each draw.
8520     */
8521    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8522        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8523       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8524    }
8525 
8526    iris_upload_dirty_render_state(ice, batch, draw, false);
8527 
8528    if (draw->index_size > 0)
8529       iris_emit_index_buffer(ice, batch, draw, sc);
8530 
8531    if (indirect) {
8532       struct mi_builder b;
8533       uint32_t mocs;
8534       mi_builder_init(&b, batch->screen->devinfo, batch);
8535 
8536 #define _3DPRIM_END_OFFSET          0x2420
8537 #define _3DPRIM_START_VERTEX        0x2430
8538 #define _3DPRIM_VERTEX_COUNT        0x2434
8539 #define _3DPRIM_INSTANCE_COUNT      0x2438
8540 #define _3DPRIM_START_INSTANCE      0x243C
8541 #define _3DPRIM_BASE_VERTEX         0x2440
8542 
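      /* These MMIO registers hold the 3DPRIMITIVE parameters; the stores
       * below fill them from the indirect buffer (or the stream-output
       * counter) so the draw can be emitted with IndirectParameterEnable.
       */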
8543       if (!indirect->count_from_stream_output) {
8544          if (indirect->indirect_draw_count) {
8545             use_predicate = true;
8546 
8547             struct iris_bo *draw_count_bo =
8548                iris_resource_bo(indirect->indirect_draw_count);
8549             unsigned draw_count_offset =
8550                indirect->indirect_draw_count_offset;
8551             mocs = iris_mocs(draw_count_bo, &batch->screen->isl_dev, 0);
8552             mi_builder_set_mocs(&b, mocs);
8553 
8554             if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
8555                /* comparison = draw id < draw count */
8556                struct mi_value comparison =
8557                   mi_ult(&b, mi_imm(drawid_offset),
8558                              mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8559 
8560                /* predicate = comparison & conditional rendering predicate */
8561                mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
8562                             mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
8563             } else {
8564                uint32_t mi_predicate;
8565 
8566                /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
8567                mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(drawid_offset));
8568                /* Upload the current draw count from the draw parameters buffer
8569                 * to MI_PREDICATE_SRC0. Zero the top 32-bits of
8570                 * MI_PREDICATE_SRC0.
8571                 */
8572                mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
8573                         mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8574 
8575                if (drawid_offset == 0) {
8576                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
8577                                  MI_PREDICATE_COMBINEOP_SET |
8578                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8579                } else {
8580                   /* While draw_index < draw_count the predicate's result will be
8581                    *  (draw_index == draw_count) ^ TRUE = TRUE
8582                    * When draw_index == draw_count the result is
8583                    *  (TRUE) ^ TRUE = FALSE
8584                    * After this all results will be:
8585                    *  (FALSE) ^ FALSE = FALSE
8586                    */
8587                   mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
8588                                  MI_PREDICATE_COMBINEOP_XOR |
8589                                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8590                }
8591                iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
8592             }
8593          }
8594          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8595          assert(bo);
8596 
8597          mocs = iris_mocs(bo, &batch->screen->isl_dev, 0);
8598          mi_builder_set_mocs(&b, mocs);
8599 
8600          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8601                   mi_mem32(ro_bo(bo, indirect->offset + 0)));
8602          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8603                   mi_mem32(ro_bo(bo, indirect->offset + 4)));
8604          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX),
8605                   mi_mem32(ro_bo(bo, indirect->offset + 8)));
8606          if (draw->index_size) {
8607             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX),
8608                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8609             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8610                      mi_mem32(ro_bo(bo, indirect->offset + 16)));
8611          } else {
8612             mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8613                      mi_mem32(ro_bo(bo, indirect->offset + 12)));
8614             mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8615          }
8616       } else if (indirect->count_from_stream_output) {
8617          struct iris_stream_output_target *so =
8618             (void *) indirect->count_from_stream_output;
8619          struct iris_bo *so_bo = iris_resource_bo(so->offset.res);
8620 
8621          mocs = iris_mocs(so_bo, &batch->screen->isl_dev, 0);
8622          mi_builder_set_mocs(&b, mocs);
8623 
8624          iris_emit_buffer_barrier_for(batch, so_bo, IRIS_DOMAIN_OTHER_READ);
8625 
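         /* vertex count = (current SO write offset - buffer_offset) / stride,
          * computed on the GPU with MI math since the count is not known on
          * the CPU.
          */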
8626          struct iris_address addr = ro_bo(so_bo, so->offset.offset);
8627          struct mi_value offset =
8628             mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
8629          mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8630                       mi_udiv32_imm(&b, offset, so->stride));
8631          mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX), mi_imm(0));
8632          mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8633          mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE), mi_imm(0));
8634          mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8635                   mi_imm(draw->instance_count));
8636       }
8637    }
8638 
8639    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8640 
8641    genX(maybe_emit_breakpoint)(batch, true);
8642 
8643    iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8644       prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8645       prim.PredicateEnable = use_predicate;
8646 #if GFX_VERx10 >= 125
8647       prim.TBIMREnable = ice->state.use_tbimr;
8648 #endif
8649       if (indirect) {
8650          prim.IndirectParameterEnable = true;
8651       } else {
8652          prim.StartInstanceLocation = draw->start_instance;
8653          prim.InstanceCount = draw->instance_count;
8654          prim.VertexCountPerInstance = sc->count;
8655 
8656          prim.StartVertexLocation = sc->start;
8657 
8658          if (draw->index_size) {
8659             prim.BaseVertexLocation += sc->index_bias;
8660          }
8661       }
8662    }
8663 
8664    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8665    genX(maybe_emit_breakpoint)(batch, false);
8666 
8667    iris_batch_sync_region_end(batch);
8668 
8669    uint32_t count = (sc) ? sc->count : 0;
8670    count *= draw->instance_count ? draw->instance_count : 1;
8671    trace_intel_end_draw(&batch->trace, count, 0, 0);
8672 }
8673 
8674 static void
8675 iris_upload_indirect_render_state(struct iris_context *ice,
8676                                   const struct pipe_draw_info *draw,
8677                                   const struct pipe_draw_indirect_info *indirect,
8678                                   const struct pipe_draw_start_count_bias *sc)
8679 {
8680 #if GFX_VERx10 >= 125
8681    assert(indirect);
8682 
8683    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8684    UNUSED struct iris_screen *screen = batch->screen;
8685    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8686    const bool use_predicate =
8687       ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8688 
8689    trace_intel_begin_draw(&batch->trace);
8690 
8691    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8692       flush_vbos(ice, batch);
8693 
8694    iris_batch_sync_region_start(batch);
8695 
8696    /* Always pin the binder.  If we're emitting new binding table pointers,
8697     * we need it.  If not, we're probably inheriting old tables via the
8698     * context, and need it anyway.  Since true zero-bindings cases are
8699     * practically non-existent, just pin it and avoid last_res tracking.
8700     */
8701    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8702                       IRIS_DOMAIN_NONE);
8703 
8704    if (!batch->contains_draw) {
8705       /* Re-emit constants when starting a new batch buffer in order to
8706        * work around push constant corruption on context switch.
8707        *
8708        * XXX - Provide hardware spec quotation when available.
8709        */
8710       ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8711                                  IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8712                                  IRIS_STAGE_DIRTY_CONSTANTS_TES |
8713                                  IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8714                                  IRIS_STAGE_DIRTY_CONSTANTS_FS);
8715       batch->contains_draw = true;
8716    }
8717 
8718    if (!batch->contains_draw_with_next_seqno) {
8719       iris_restore_render_saved_bos(ice, batch, draw);
8720       batch->contains_draw_with_next_seqno = true;
8721    }
8722 
8723    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8724     * Wa_16011107343 (same for gfx12)
8725     * We implement this by setting TCS dirty on each draw.
8726     */
8727    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8728        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8729       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8730    }
8731 
8732    iris_upload_dirty_render_state(ice, batch, draw, false);
8733 
8734    if (draw->index_size > 0)
8735       iris_emit_index_buffer(ice, batch, draw, sc);
8736 
8737    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8738 
8739    genX(maybe_emit_breakpoint)(batch, true);
8740 
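   /* On Gfx12.5+ the indirect arguments (and optional draw-count buffer)
    * are consumed directly by EXECUTE_INDIRECT_DRAW, so no MI_PREDICATE /
    * mi_builder register setup is needed here.
    */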
8741    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
8742       ind.ArgumentFormat             =
8743          draw->index_size > 0 ? XI_DRAWINDEXED : XI_DRAW;
8744       ind.PredicateEnable            = use_predicate;
8745       ind.TBIMREnabled               = ice->state.use_tbimr;
8746       ind.MaxCount                   = indirect->draw_count;
8747 
8748       if (indirect->buffer) {
8749          struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8750          ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
8751          ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
8752       } else {
8753          ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8754       }
8755 
8756       if (indirect->indirect_draw_count) {
8757          struct iris_bo *draw_count_bo      =
8758             iris_resource_bo(indirect->indirect_draw_count);
8759          ind.CountBufferIndirectEnable      = true;
8760          ind.CountBufferAddress             =
8761             ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
8762       }
8763    }
8764 
8765    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8766    genX(maybe_emit_breakpoint)(batch, false);
8767 
8768    iris_batch_sync_region_end(batch);
8769 
8770    uint32_t count = (sc) ? sc->count : 0;
8771    count *= draw->instance_count ? draw->instance_count : 1;
8772    trace_intel_end_draw(&batch->trace, count, 0, 0);
8773 #else
8774    unreachable("Unsupported path");
8775 #endif /* GFX_VERx10 >= 125 */
8776 }
8777 
8778 static void
8779 iris_upload_indirect_shader_render_state(struct iris_context *ice,
8780                                          const struct pipe_draw_info *draw,
8781                                          const struct pipe_draw_indirect_info *indirect,
8782                                          const struct pipe_draw_start_count_bias *sc)
8783 {
8784    assert(indirect);
8785 
8786    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8787    UNUSED struct iris_screen *screen = batch->screen;
8788    UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8789 
8790    if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8791       flush_vbos(ice, batch);
8792 
8793    iris_batch_sync_region_start(batch);
8794 
8795    /* Always pin the binder.  If we're emitting new binding table pointers,
8796     * we need it.  If not, we're probably inheriting old tables via the
8797     * context, and need it anyway.  Since true zero-bindings cases are
8798     * practically non-existent, just pin it and avoid last_res tracking.
8799     */
8800    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8801                       IRIS_DOMAIN_NONE);
8802 
8803    if (!batch->contains_draw) {
8804       if (GFX_VER == 12) {
8805          /* Re-emit constants when starting a new batch buffer in order to
8806           * work around push constant corruption on context switch.
8807           *
8808           * XXX - Provide hardware spec quotation when available.
8809           */
8810          ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
8811                                     IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8812                                     IRIS_STAGE_DIRTY_CONSTANTS_TES |
8813                                     IRIS_STAGE_DIRTY_CONSTANTS_GS  |
8814                                     IRIS_STAGE_DIRTY_CONSTANTS_FS);
8815       }
8816       batch->contains_draw = true;
8817    }
8818 
8819    if (!batch->contains_draw_with_next_seqno) {
8820       iris_restore_render_saved_bos(ice, batch, draw);
8821       batch->contains_draw_with_next_seqno = true;
8822    }
8823 
8824    if (draw->index_size > 0)
8825       iris_emit_index_buffer(ice, batch, draw, sc);
8826 
8827    /* Make sure we have enough space to keep all the commands in a single BO
8828     * (because of the jumps).
8829     */
8830    iris_require_command_space(batch, 2000);
8831 
8832 #ifndef NDEBUG
8833    struct iris_bo *command_bo = batch->bo;
8834 #endif
8835 
8836    /* Jump point to generate more draws if we run out of space in the ring
8837     * buffer.
8838     */
8839    uint64_t gen_addr = iris_batch_current_address_u64(batch);
8840 
8841    iris_handle_always_flush_cache(batch);
8842 
8843 #if GFX_VER == 9
8844    iris_emit_pipe_control_flush(batch, "before generation",
8845                                 PIPE_CONTROL_VF_CACHE_INVALIDATE);
8846 #endif
8847 
8848    struct iris_address params_addr;
8849    struct iris_gen_indirect_params *params =
8850       genX(emit_indirect_generate)(batch, draw, indirect, sc,
8851                                    &params_addr);
8852 
8853    iris_emit_pipe_control_flush(batch, "after generation flush",
8854                                 ((ice->state.vs_uses_draw_params ||
8855                                   ice->state.vs_uses_derived_draw_params) ?
8856                                  PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
8857                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8858                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
8859                                 PIPE_CONTROL_CS_STALL);
8860 
8861    trace_intel_begin_draw(&batch->trace);
8862 
8863    /* Always pin the binder.  If we're emitting new binding table pointers,
8864     * we need it.  If not, we're probably inheriting old tables via the
8865     * context, and need it anyway.  Since true zero-bindings cases are
8866     * practically non-existent, just pin it and avoid last_res tracking.
8867     */
8868    iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8869                       IRIS_DOMAIN_NONE);
8870 
8871    /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8872     * Wa_16011107343 (same for gfx12)
8873     * We implement this by setting TCS dirty on each draw.
8874     */
8875    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8876        ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8877       ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8878    }
8879 
8880    iris_upload_dirty_render_state(ice, batch, draw, true);
8881 
8882    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8883 
8884    genX(maybe_emit_breakpoint)(batch, true);
8885 
8886 #if GFX_VER >= 12
8887    iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
8888       arb.PreParserDisableMask = true;
8889       arb.PreParserDisable = true;
8890    }
8891 #endif
8892 
8893    iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8894       bbs.AddressSpaceIndicator = ASI_PPGTT;
8895       bbs.BatchBufferStartAddress = (struct iris_address) {
8896          .bo = ice->draw.generation.ring_bo,
8897       };
8898    }
8899 
8900    /* Run the ring buffer one more time with the next set of commands */
8901    uint64_t inc_addr = iris_batch_current_address_u64(batch);
8902    {
8903       iris_emit_pipe_control_flush(batch,
8904                                    "post generated draws wait",
8905                                    PIPE_CONTROL_STALL_AT_SCOREBOARD |
8906                                    PIPE_CONTROL_CS_STALL);
8907 
8908       struct mi_builder b;
8909       mi_builder_init(&b, batch->screen->devinfo, batch);
8910 
8911       struct iris_address draw_base_addr = iris_address_add(
8912          params_addr,
8913          offsetof(struct iris_gen_indirect_params, draw_base));
8914 
8915       const uint32_t mocs =
8916          iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
8917       mi_builder_set_mocs(&b, mocs);
8918 
8919       mi_store(&b, mi_mem32(draw_base_addr),
8920                    mi_iadd(&b, mi_mem32(draw_base_addr),
8921                                mi_imm(params->ring_count)));
8922 
8923       iris_emit_pipe_control_flush(batch,
8924                                    "post generation base increment",
8925                                    PIPE_CONTROL_CS_STALL |
8926                                    PIPE_CONTROL_CONST_CACHE_INVALIDATE);
8927 
8928       iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8929          bbs.AddressSpaceIndicator = ASI_PPGTT;
8930          bbs.BatchBufferStartAddress = (struct iris_address) {
8931             .offset = gen_addr,
8932          };
8933       }
8934    }
8935 
8936    /* Exit of the ring buffer */
8937    uint64_t end_addr = iris_batch_current_address_u64(batch);
8938 
8939 #ifndef NDEBUG
8940    assert(command_bo == batch->bo);
8941 #endif
8942 
8943    genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8944    genX(maybe_emit_breakpoint)(batch, false);
8945 
8946    iris_emit_pipe_control_flush(batch,
8947                                 "post generated draws wait",
8948                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8949                                 PIPE_CONTROL_CS_STALL);
8950 
8951    params->gen_addr = inc_addr;
8952    params->end_addr = end_addr;
8953 
8954    iris_batch_sync_region_end(batch);
8955 
8956    uint32_t count = (sc) ? sc->count : 0;
8957    count *= draw->instance_count ? draw->instance_count : 1;
8958    trace_intel_end_draw(&batch->trace, count, 0, 0);
8959 }
8960 
8961 static void
8962 iris_load_indirect_location(struct iris_context *ice,
8963                             struct iris_batch *batch,
8964                             const struct pipe_grid_info *grid)
8965 {
8966 #define GPGPU_DISPATCHDIMX 0x2500
8967 #define GPGPU_DISPATCHDIMY 0x2504
8968 #define GPGPU_DISPATCHDIMZ 0x2508
8969 
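   /* Load the dispatch dimensions from the indirect grid-size buffer into
    * the GPGPU dispatch registers, which the walker command reads when
    * IndirectParameterEnable is set.
    */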
8970    assert(grid->indirect);
8971 
8972    struct iris_state_ref *grid_size = &ice->state.grid_size;
8973    struct iris_bo *bo = iris_resource_bo(grid_size->res);
8974    struct mi_builder b;
8975    mi_builder_init(&b, batch->screen->devinfo, batch);
8976    struct mi_value size_x = mi_mem32(ro_bo(bo, grid_size->offset + 0));
8977    struct mi_value size_y = mi_mem32(ro_bo(bo, grid_size->offset + 4));
8978    struct mi_value size_z = mi_mem32(ro_bo(bo, grid_size->offset + 8));
8979    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
8980    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
8981    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
8982 }
8983 
8984 static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo)
8985 {
8986    // TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch
8987    return devinfo->has_indirect_unroll;
8988 }
8989 
8990 #if GFX_VERx10 >= 125
8991 
8992 static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
8993                                                 struct iris_batch *batch,
8994                                                 const struct pipe_grid_info *grid,
8995                                                 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd)
8996 {
8997    const struct iris_screen *screen = batch->screen;
8998    struct iris_compiled_shader *shader =
8999       ice->shaders.prog[MESA_SHADER_COMPUTE];
9000    const struct iris_cs_data *cs_data = iris_cs_data(shader);
9001    const struct intel_cs_dispatch_info dispatch =
9002       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9003    struct iris_bo *indirect = iris_resource_bo(grid->indirect);
9004    const int dispatch_size = dispatch.simd_size / 16;
9005 
9006    struct GENX(COMPUTE_WALKER_BODY) body = {};
9007    body.SIMDSize            = dispatch_size;
9008    body.MessageSIMD         = dispatch_size;
9009    body.GenerateLocalID     = cs_data->generate_local_id != 0;
9010    body.EmitLocal           = cs_data->generate_local_id;
9011    body.WalkOrder           = cs_data->walk_order;
9012    body.TileLayout          = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9013                               TileY32bpe : Linear;
9014    body.LocalXMaximum       = grid->block[0] - 1;
9015    body.LocalYMaximum       = grid->block[1] - 1;
9016    body.LocalZMaximum       = grid->block[2] - 1;
9017    body.ExecutionMask       = dispatch.right_mask;
9018    body.PostSync.MOCS       = iris_mocs(NULL, &screen->isl_dev, 0);
9019    body.InterfaceDescriptor = idd;
9020    /* HSD 14016252163: Use of Morton walk order (and batching using a batch
9021     * size of 4) is expected to increase sampler cache hit rates by
9022     * increasing sample address locality within a subslice.
9023     */
9024 #if GFX_VER >= 30
9025    body.DispatchWalkOrder =
9026       cs_data->uses_sampler ? MortonWalk : LinearWalk;
9027    body.ThreadGroupBatchSize =
9028       cs_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1;
9029 #endif
9030 
9031    struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
9032    iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {
9033       ind.PredicateEnable            =
9034          ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
9035       ind.MaxCount                   = 1;
9036       ind.COMPUTE_WALKER_BODY        = body;
9037       ind.ArgumentBufferStartAddress = indirect_bo;
9038       ind.MOCS                       =
9039          iris_mocs(indirect_bo.bo, &screen->isl_dev, 0);
9040    }
9041 }
9042 
9043 static void
9044 iris_upload_compute_walker(struct iris_context *ice,
9045                            struct iris_batch *batch,
9046                            const struct pipe_grid_info *grid)
9047 {
9048    const uint64_t stage_dirty = ice->state.stage_dirty;
9049    struct iris_screen *screen = batch->screen;
9050    const struct intel_device_info *devinfo = screen->devinfo;
9051    struct iris_binder *binder = &ice->state.binder;
9052    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9053    struct iris_compiled_shader *shader =
9054       ice->shaders.prog[MESA_SHADER_COMPUTE];
9055    const struct iris_cs_data *cs_data = iris_cs_data(shader);
9056    const struct intel_cs_dispatch_info dispatch =
9057       iris_get_cs_dispatch_info(devinfo, shader, grid->block);
9058 
9059    trace_intel_begin_compute(&batch->trace);
9060 
9061    if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
9062       iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
9063          cfe.MaximumNumberofThreads =
9064             devinfo->max_cs_threads * devinfo->subslice_total;
9065          uint32_t scratch_addr = pin_scratch_space(ice, batch, shader,
9066                                                    MESA_SHADER_COMPUTE);
9067          cfe.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
9068       }
9069    }
9070 
9071    struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
9072    idd.KernelStartPointer = KSP(shader);
9073    idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9074    idd.SharedLocalMemorySize =
9075       intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
9076    idd.PreferredSLMAllocationSize =
9077       intel_compute_preferred_slm_calc_encode_size(devinfo,
9078                                                    shader->total_shared,
9079                                                    dispatch.group_size,
9080                                                    dispatch.simd_size);
9081    idd.SamplerStatePointer = shs->sampler_table.offset;
9082    idd.SamplerCount = encode_sampler_count(shader);
9083    idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
9084    /* Typically set to 0 to avoid prefetching on every thread dispatch. */
9085    idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
9086       0 : MIN2(shader->bt.size_bytes / 4, 31);
9087    idd.NumberOfBarriers = cs_data->uses_barrier;
9088 
9089    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9090 
9091    if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) {
9092       iris_emit_execute_indirect_dispatch(ice, batch, grid, idd);
9093    } else {
9094       if (grid->indirect)
9095          iris_load_indirect_location(ice, batch, grid);
9096 
9097       iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9098 
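      /* (Added commentary.)  Reserve the COMPUTE_WALKER dwords up front and
       * remember where they live for the u_trace machinery; the command is
       * packed into that space below once the walker body is assembled.
       */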
9099       ice->utrace.last_compute_walker =
9100          iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
9101 
9102       struct GENX(COMPUTE_WALKER_BODY) body = {
9103          .SIMDSize                       = dispatch.simd_size / 16,
9104          .MessageSIMD                    = dispatch.simd_size / 16,
9105          .LocalXMaximum                  = grid->block[0] - 1,
9106          .LocalYMaximum                  = grid->block[1] - 1,
9107          .LocalZMaximum                  = grid->block[2] - 1,
9108          .ThreadGroupIDXDimension        = grid->grid[0],
9109          .ThreadGroupIDYDimension        = grid->grid[1],
9110          .ThreadGroupIDZDimension        = grid->grid[2],
9111          .ExecutionMask                  = dispatch.right_mask,
9112          .PostSync.MOCS                  = iris_mocs(NULL, &screen->isl_dev, 0),
9113          .InterfaceDescriptor            = idd,
9114 
9115 #if GFX_VERx10 >= 125
9116          .GenerateLocalID = cs_data->generate_local_id != 0,
9117          .EmitLocal       = cs_data->generate_local_id,
9118          .WalkOrder       = cs_data->walk_order,
9119          .TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9120                        TileY32bpe : Linear,
9121 #endif
9122       };
9123 
9124       _iris_pack_command(batch, GENX(COMPUTE_WALKER),
9125                          ice->utrace.last_compute_walker, cw) {
9126          cw.IndirectParameterEnable        = grid->indirect;
9127          cw.body                           = body;
9128          assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0);
9129       }
9130    }
9131 
9132    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9133 }
9134 
9135 #else /* #if GFX_VERx10 >= 125 */
9136 
9137 static void
9138 iris_upload_gpgpu_walker(struct iris_context *ice,
9139                          struct iris_batch *batch,
9140                          const struct pipe_grid_info *grid)
9141 {
9142    const uint64_t stage_dirty = ice->state.stage_dirty;
9143    struct iris_screen *screen = batch->screen;
9144    const struct intel_device_info *devinfo = screen->devinfo;
9145    struct iris_binder *binder = &ice->state.binder;
9146    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9147    struct iris_uncompiled_shader *ish =
9148       ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
9149    struct iris_compiled_shader *shader =
9150       ice->shaders.prog[MESA_SHADER_COMPUTE];
9151    struct iris_cs_data *cs_data = iris_cs_data(shader);
9152    const struct intel_cs_dispatch_info dispatch =
9153       iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9154 
9155    trace_intel_begin_compute(&batch->trace);
9156 
9157    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9158        cs_data->local_size[0] == 0 /* Variable local group size */) {
9159       /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
9160        *
9161        *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
9162        *    the only bits that are changed are scoreboard related: Scoreboard
9163        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
9164        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
9165        *    sufficient."
9166        */
9167       iris_emit_pipe_control_flush(batch,
9168                                    "workaround: stall before MEDIA_VFE_STATE",
9169                                    PIPE_CONTROL_CS_STALL);
9170 
9171       iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
9172          if (shader->total_scratch) {
9173             uint32_t scratch_addr =
9174                pin_scratch_space(ice, batch, shader, MESA_SHADER_COMPUTE);
9175 
9176             vfe.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
9177             vfe.ScratchSpaceBasePointer =
9178                rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
9179          }
9180 
9181          vfe.MaximumNumberofThreads =
9182             devinfo->max_cs_threads * devinfo->subslice_total - 1;
9183 #if GFX_VER < 11
9184          vfe.ResetGatewayTimer =
9185             Resettingrelativetimerandlatchingtheglobaltimestamp;
9186 #endif
9187 #if GFX_VER == 8
9188          vfe.BypassGatewayControl = true;
9189 #endif
9190          vfe.NumberofURBEntries = 2;
9191          vfe.URBEntryAllocationSize = 2;
9192 
9193          vfe.CURBEAllocationSize =
9194             ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
9195                   cs_data->push.cross_thread.regs, 2);
9196       }
9197    }
9198 
9199    /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
9200    if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9201        cs_data->local_size[0] == 0 /* Variable local group size */) {
9202       uint32_t curbe_data_offset = 0;
9203       assert(cs_data->push.cross_thread.dwords == 0 &&
9204              cs_data->push.per_thread.dwords == 1 &&
9205              cs_data->first_param_is_builtin_subgroup_id);
9206       const unsigned push_const_size =
9207          iris_cs_push_const_total_size(shader, dispatch.threads);
9208       uint32_t *curbe_data_map =
9209          stream_state(batch, ice->state.dynamic_uploader,
9210                       &ice->state.last_res.cs_thread_ids,
9211                       ALIGN(push_const_size, 64), 64,
9212                       &curbe_data_offset);
9213       assert(curbe_data_map);
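      /* (Added commentary.)  Fill the buffer with a 0x5a pattern first,
       * presumably so any slots not written by the push constant fill below
       * are easy to spot, then write the real per-thread data.
       */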
9214       memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
9215       iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
9216                                      curbe_data_map);
9217 
9218       iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
9219          curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
9220          curbe.CURBEDataStartAddress = curbe_data_offset;
9221       }
9222    }
9223 
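   /* (Added commentary.)  Pin every bound global buffer so it stays
    * resident for this dispatch; the list is terminated by the first NULL
    * entry.
    */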
9224    for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
9225       struct pipe_resource *res = ice->state.global_bindings[i];
9226       if (!res)
9227          break;
9228 
9229       iris_use_pinned_bo(batch, iris_resource_bo(res),
9230                          true, IRIS_DOMAIN_NONE);
9231    }
9232 
9233    if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
9234                       IRIS_STAGE_DIRTY_BINDINGS_CS |
9235                       IRIS_STAGE_DIRTY_CONSTANTS_CS |
9236                       IRIS_STAGE_DIRTY_CS)) {
9237       uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
9238 
9239       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
9240          idd.SharedLocalMemorySize =
9241             intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
9242          idd.KernelStartPointer =
9243             KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
9244          idd.SamplerStatePointer = shs->sampler_table.offset;
9245          idd.BindingTablePointer =
9246             binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
9247          idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9248       }
9249 
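      /* (Added commentary.)  Merge in the descriptor fields that were
       * pre-packed when the shader variant was created and stashed in
       * shader->derived_data.
       */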
9250       for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
9251          desc[i] |= ((uint32_t *) shader->derived_data)[i];
9252 
9253       iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
9254          load.InterfaceDescriptorTotalLength =
9255             GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
9256          load.InterfaceDescriptorDataStartAddress =
9257             emit_state(batch, ice->state.dynamic_uploader,
9258                        &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
9259       }
9260    }
9261 
9262    if (grid->indirect)
9263       iris_load_indirect_location(ice, batch, grid);
9264 
9265    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9266 
9267    iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
9268       ggw.IndirectParameterEnable    = grid->indirect != NULL;
9269       ggw.SIMDSize                   = dispatch.simd_size / 16;
9270       ggw.ThreadDepthCounterMaximum  = 0;
9271       ggw.ThreadHeightCounterMaximum = 0;
9272       ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
9273       ggw.ThreadGroupIDXDimension    = grid->grid[0];
9274       ggw.ThreadGroupIDYDimension    = grid->grid[1];
9275       ggw.ThreadGroupIDZDimension    = grid->grid[2];
9276       ggw.RightExecutionMask         = dispatch.right_mask;
9277       ggw.BottomExecutionMask        = 0xffffffff;
9278    }
9279 
9280    iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
9281 
9282    trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9283 }
9284 
9285 #endif /* #if GFX_VERx10 >= 125 */
9286 
9287 static void
9288 iris_upload_compute_state(struct iris_context *ice,
9289                           struct iris_batch *batch,
9290                           const struct pipe_grid_info *grid)
9291 {
9292    struct iris_screen *screen = batch->screen;
9293    const uint64_t stage_dirty = ice->state.stage_dirty;
9294    struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9295    struct iris_compiled_shader *shader =
9296       ice->shaders.prog[MESA_SHADER_COMPUTE];
9297    struct iris_border_color_pool *border_color_pool =
9298       iris_bufmgr_get_border_color_pool(screen->bufmgr);
9299 
9300    iris_batch_sync_region_start(batch);
9301 
9302    /* Always pin the binder.  If we're emitting new binding table pointers,
9303     * we need it.  If not, we're probably inheriting old tables via the
9304     * context, and need it anyway.  Since true zero-bindings cases are
9305     * practically non-existent, just pin it and avoid last_res tracking.
9306     */
9307    iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
9308 
9309    if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
9310         shs->sysvals_need_upload) ||
9311        shader->kernel_input_size > 0)
9312       upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
9313 
9314    if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
9315       iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
9316 
9317    if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
9318       iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
9319 
9320    iris_use_optional_res(batch, shs->sampler_table.res, false,
9321                          IRIS_DOMAIN_NONE);
9322    iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
9323                       IRIS_DOMAIN_NONE);
9324 
9325    if (ice->state.need_border_colors)
9326       iris_use_pinned_bo(batch, border_color_pool->bo, false,
9327                          IRIS_DOMAIN_NONE);
9328 
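   /* (Added commentary.)  Gfx12+ keeps compression metadata in an aux
    * translation table; invalidate any stale cached entries before
    * dispatching if the table has changed.
    */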
9329 #if GFX_VER >= 12
9330    genX(invalidate_aux_map_state)(batch);
9331 #endif
9332 
9333 #if GFX_VERx10 >= 125
9334    iris_upload_compute_walker(ice, batch, grid);
9335 #else
9336    iris_upload_gpgpu_walker(ice, batch, grid);
9337 #endif
9338 
9339    if (!batch->contains_draw_with_next_seqno) {
9340       iris_restore_compute_saved_bos(ice, batch, grid);
9341       batch->contains_draw_with_next_seqno = batch->contains_draw = true;
9342    }
9343 
9344    iris_batch_sync_region_end(batch);
9345 }
9346 
9347 /**
9348  * State module teardown.
9349  */
9350 static void
9351 iris_destroy_state(struct iris_context *ice)
9352 {
9353    struct iris_genx_state *genx = ice->state.genx;
9354 
9355    pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
9356 
9357    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
9358    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
9359    pipe_resource_reference(&ice->draw.generation.params.res, NULL);
9360    pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
9361 
9362    /* Loop over all VBOs, including ones for draw parameters */
9363    for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
9364       pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
9365    }
9366 
9367    free(ice->state.genx);
9368 
9369    for (int i = 0; i < 4; i++) {
9370       pipe_so_target_reference(&ice->state.so_target[i], NULL);
9371    }
9372 
9373    util_unreference_framebuffer_state(&ice->state.framebuffer);
9374 
9375    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
9376       struct iris_shader_state *shs = &ice->state.shaders[stage];
9377       pipe_resource_reference(&shs->sampler_table.res, NULL);
9378       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
9379          pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
9380          pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
9381       }
9382       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
9383          pipe_resource_reference(&shs->image[i].base.resource, NULL);
9384          pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
9385          free(shs->image[i].surface_state.cpu);
9386       }
9387       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
9388          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
9389          pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
9390       }
9391       for (int i = 0; i < IRIS_MAX_TEXTURES; i++) {
9392          pipe_sampler_view_reference((struct pipe_sampler_view **)
9393                                      &shs->textures[i], NULL);
9394       }
9395    }
9396 
9397    pipe_resource_reference(&ice->state.grid_size.res, NULL);
9398    pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
9399 
9400    pipe_resource_reference(&ice->state.null_fb.res, NULL);
9401    pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
9402 
9403    pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
9404    pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
9405    pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
9406    pipe_resource_reference(&ice->state.last_res.scissor, NULL);
9407    pipe_resource_reference(&ice->state.last_res.blend, NULL);
9408    pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
9409    pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
9410    pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
9411 }
9412 
9413 /* ------------------------------------------------------------------- */
9414 
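/**
 * (Added commentary.)  A buffer resource has been given new backing storage,
 * and therefore a new GPU address.  Walk every piece of state that may have
 * baked in the old address (vertex buffers, stream output targets, constant
 * buffers, SSBOs, sampler views, and images) and update it or flag it for
 * re-emission.
 */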
9415 static void
9416 iris_rebind_buffer(struct iris_context *ice,
9417                    struct iris_resource *res)
9418 {
9419    struct pipe_context *ctx = &ice->ctx;
9420    struct iris_genx_state *genx = ice->state.genx;
9421 
9422    assert(res->base.b.target == PIPE_BUFFER);
9423 
9424    /* Buffers can't be framebuffer attachments, nor display related,
9425     * and we don't have upstream Clover support.
9426     */
9427    assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
9428                                  PIPE_BIND_RENDER_TARGET |
9429                                  PIPE_BIND_BLENDABLE |
9430                                  PIPE_BIND_DISPLAY_TARGET |
9431                                  PIPE_BIND_CURSOR |
9432                                  PIPE_BIND_COMPUTE_RESOURCE |
9433                                  PIPE_BIND_GLOBAL)));
9434 
9435    if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
9436       uint64_t bound_vbs = ice->state.bound_vertex_buffers;
9437       while (bound_vbs) {
9438          const int i = u_bit_scan64(&bound_vbs);
9439          struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
9440 
9441          /* Update the CPU struct */
9442          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
9443          STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
9444          uint64_t *addr = (uint64_t *) &state->state[1];
9445          struct iris_bo *bo = iris_resource_bo(state->resource);
9446 
9447          if (*addr != bo->address + state->offset) {
9448             *addr = bo->address + state->offset;
9449             ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
9450                                 IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
9451          }
9452       }
9453    }
9454 
9455    /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
9456     * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
9457     *
9458     * There is also no need to handle these:
9459     * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
9460     * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
9461     */
9462 
9463    if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
9464       uint32_t *so_buffers = genx->so_buffers;
9465       for (unsigned i = 0; i < 4; i++,
9466            so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
9467 
9468          /* There are no other fields in bits 127:64 */
9469          uint64_t *addr = (uint64_t *) &so_buffers[2];
9470          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
9471          STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
9472 
9473          struct pipe_stream_output_target *tgt = ice->state.so_target[i];
9474          if (tgt) {
9475             struct iris_bo *bo = iris_resource_bo(tgt->buffer);
9476             if (*addr != bo->address + tgt->buffer_offset) {
9477                *addr = bo->address + tgt->buffer_offset;
9478                ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
9479             }
9480          }
9481       }
9482    }
9483 
9484    for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
9485       struct iris_shader_state *shs = &ice->state.shaders[s];
9486       enum pipe_shader_type p_stage = stage_to_pipe(s);
9487 
9488       if (!(res->bind_stages & (1 << s)))
9489          continue;
9490 
9491       if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
9492          /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
9493          uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
9494          while (bound_cbufs) {
9495             const int i = u_bit_scan(&bound_cbufs);
9496             struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
9497             struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
9498 
9499             if (res->bo == iris_resource_bo(cbuf->buffer)) {
9500                pipe_resource_reference(&surf_state->res, NULL);
9501                shs->dirty_cbufs |= 1u << i;
9502                ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
9503                                     IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
9504                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
9505             }
9506          }
9507       }
9508 
9509       if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
9510          uint32_t bound_ssbos = shs->bound_ssbos;
9511          while (bound_ssbos) {
9512             const int i = u_bit_scan(&bound_ssbos);
9513             struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
9514 
9515             if (res->bo == iris_resource_bo(ssbo->buffer)) {
9516                struct pipe_shader_buffer buf = {
9517                   .buffer = &res->base.b,
9518                   .buffer_offset = ssbo->buffer_offset,
9519                   .buffer_size = ssbo->buffer_size,
9520                };
9521                iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
9522                                        (shs->writable_ssbos >> i) & 1);
9523             }
9524          }
9525       }
9526 
9527       if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
9528          int i;
9529          BITSET_FOREACH_SET(i, shs->bound_sampler_views, IRIS_MAX_TEXTURES) {
9530             struct iris_sampler_view *isv = shs->textures[i];
9531             struct iris_bo *bo = isv->res->bo;
9532 
9533             if (update_surface_state_addrs(ice->state.surface_uploader,
9534                                            &isv->surface_state, bo)) {
9535                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9536             }
9537          }
9538       }
9539 
9540       if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
9541          uint64_t bound_image_views = shs->bound_image_views;
9542          while (bound_image_views) {
9543             const int i = u_bit_scan64(&bound_image_views);
9544             struct iris_image_view *iv = &shs->image[i];
9545             struct iris_bo *bo = iris_resource_bo(iv->base.resource);
9546 
9547             if (update_surface_state_addrs(ice->state.surface_uploader,
9548                                            &iv->surface_state, bo)) {
9549                ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9550             }
9551          }
9552       }
9553    }
9554 }
9555 
9556 /* ------------------------------------------------------------------- */
9557 
9558 /**
9559  * Introduce a batch synchronization boundary, and update its cache coherency
9560  * status to reflect the execution of a PIPE_CONTROL command with the
9561  * specified flags.
9562  */
9563 static void
9564 batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
9565 {
9566    const struct intel_device_info *devinfo = batch->screen->devinfo;
9567 
9568    iris_batch_sync_boundary(batch);
9569 
9570    if ((flags & PIPE_CONTROL_CS_STALL)) {
9571       if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9572          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9573 
9574       if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9575          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9576 
9577       if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
9578          /* A tile cache flush makes any C/Z data in L3 visible to memory. */
9579          const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
9580          const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
9581          batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
9582          batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
9583       }
9584 
9585       if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9586          /* HDC and DC flushes both flush the data cache out to L3 */
9587          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9588       }
9589 
9590       if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9591          /* A DC flush also flushes L3 data cache lines out to memory. */
9592          const unsigned i = IRIS_DOMAIN_DATA_WRITE;
9593          batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
9594       }
9595 
9596       if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9597          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9598 
9599       if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
9600                     PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
9601          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
9602          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9603          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9604          iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
9605       }
9606    }
9607 
9608    if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9609       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9610 
9611    if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9612       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9613 
9614    if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH))
9615       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9616 
9617    if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9618       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9619 
9620    if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
9621       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
9622 
9623    if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE))
9624       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9625 
9626    /* Technically, to invalidate IRIS_DOMAIN_PULL_CONSTANT_READ, we need
9627     * both "Constant Cache Invalidate" and either "Texture Cache Invalidate"
9628     * or "Data Cache Flush" set, depending on the setting of
9629     * iris_indirect_ubos_use_sampler().
9630     *
9631     * However, "Data Cache Flush" and "Constant Cache Invalidate" will never
9632     * appear in the same PIPE_CONTROL command, because one is bottom-of-pipe
9633     * while the other is top-of-pipe.  Because we only look at one flush at
9634     * a time, we won't see both together.
9635     *
9636     * To deal with this, we mark it as invalidated when the constant cache
9637     * is invalidated, and trust the callers to also flush the other related
9638     * cache correctly at the same time.
9639     */
9640    if ((flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
9641       iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9642 
9643    /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
9644 
9645    if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
9646       /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
9647        * domains will now be visible to those L3 clients.
9648        */
9649       for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
9650          if (!iris_domain_is_l3_coherent(devinfo, i))
9651             batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
9652       }
9653    }
9654 }
9655 
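/**
 * (Added commentary.)  Translate PIPE_CONTROL write flags into the
 * hardware's "Post Sync Operation" field encoding, or 0 for no operation.
 */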
9656 static unsigned
9657 flags_to_post_sync_op(uint32_t flags)
9658 {
9659    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
9660       return WriteImmediateData;
9661 
9662    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
9663       return WritePSDepthCount;
9664 
9665    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
9666       return WriteTimestamp;
9667 
9668    return 0;
9669 }
9670 
9671 /**
9672  * Do the given flags have a Post Sync or LRI Post Sync operation?
9673  */
9674 static enum pipe_control_flags
9675 get_post_sync_flags(enum pipe_control_flags flags)
9676 {
9677    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
9678             PIPE_CONTROL_WRITE_DEPTH_COUNT |
9679             PIPE_CONTROL_WRITE_TIMESTAMP |
9680             PIPE_CONTROL_LRI_POST_SYNC_OP;
9681 
9682    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
9683     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
9684     */
9685    assert(util_bitcount(flags) <= 1);
9686 
9687    return flags;
9688 }
9689 
9690 #define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)
9691 
9692 /**
9693  * Emit a series of PIPE_CONTROL commands, taking into account any
9694  * workarounds necessary to actually accomplish the caller's request.
9695  *
9696  * Unless otherwise noted, spec quotations in this function come from:
9697  *
9698  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
9699  * Restrictions for PIPE_CONTROL.
9700  *
9701  * You should not use this function directly.  Use the helpers in
9702  * iris_pipe_control.c instead, which may split the pipe control further.
9703  */
9704 static void
9705 iris_emit_raw_pipe_control(struct iris_batch *batch,
9706                            const char *reason,
9707                            uint32_t flags,
9708                            struct iris_bo *bo,
9709                            uint32_t offset,
9710                            uint64_t imm)
9711 {
9712    UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
9713    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
9714    enum pipe_control_flags non_lri_post_sync_flags =
9715       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
9716 
9717 #if GFX_VER >= 12
9718    if (batch->name == IRIS_BATCH_BLITTER) {
9719       batch_mark_sync_for_pipe_control(batch, flags);
9720       iris_batch_sync_region_start(batch);
9721 
9722       assert(!(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT));
9723 
9724       /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
9725       if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
9726          batch_emit_fast_color_dummy_blit(batch);
9727 
9728       /* The blitter doesn't actually use PIPE_CONTROL; rather it uses the
9729        * MI_FLUSH_DW command.  However, all of our code is set up to flush
9730        * via emitting a pipe control, so we just translate it at this point,
9731        * even if it is a bit hacky.
9732        */
9733       iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
9734          fd.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
9735          fd.ImmediateData = imm;
9736          fd.PostSyncOperation = flags_to_post_sync_op(flags);
9737 #if GFX_VERx10 >= 125
9738          /* TODO: This may not always be necessary */
9739          fd.FlushCCS = true;
9740 #endif
9741       }
9742       iris_batch_sync_region_end(batch);
9743       return;
9744    }
9745 #endif
9746 
9747    /* The "L3 Read Only Cache Invalidation Bit" docs say it "controls the
9748     * invalidation of the Geometry streams cached in L3 cache at the top
9749     * of the pipe".  In other words, index & vertex data that gets cached
9750     * in L3 when VERTEX_BUFFER_STATE::L3BypassDisable is set.
9751     *
9752     * Normally, invalidating L1/L2 read-only caches also invalidate their
9753     * related L3 cachelines, but this isn't the case for the VF cache.
9754     * Emulate it by setting the L3 Read Only bit when doing a VF invalidate.
9755     */
9756    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)
9757       flags |= PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
9758 
9759    /* Recursive PIPE_CONTROL workarounds --------------------------------
9760     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
9761     *
9762     * We do these first because we want to look at the original operation,
9763     * rather than any workarounds we set.
9764     */
9765    if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
9766       /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
9767        * lists several workarounds:
9768        *
9769        *    "Project: SKL, KBL, BXT
9770        *
9771        *     If the VF Cache Invalidation Enable is set to a 1 in a
9772        *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
9773        *     sets to 0, with the VF Cache Invalidation Enable set to 0
9774        *     needs to be sent prior to the PIPE_CONTROL with VF Cache
9775        *     Invalidation Enable set to a 1."
9776        */
9777       iris_emit_raw_pipe_control(batch,
9778                                  "workaround: recursive VF cache invalidate",
9779                                  0, NULL, 0, 0);
9780    }
9781 
9782    if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
9783       /* Project: SKL / Argument: LRI Post Sync Operation [23]
9784        *
9785        * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9786        *  programmed prior to programming a PIPECONTROL command with "LRI
9787        *  Post Sync Operation" in GPGPU mode of operation (i.e when
9788        *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
9789        *
9790        * The same text exists a few rows below for Post Sync Op.
9791        */
9792       iris_emit_raw_pipe_control(batch,
9793                                  "workaround: CS stall before gpgpu post-sync",
9794                                  PIPE_CONTROL_CS_STALL, bo, offset, imm);
9795    }
9796 
9797    /* "Flush Types" workarounds ---------------------------------------------
9798     * We do these now because they may add post-sync operations or CS stalls.
9799     */
9800 
9801    if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
9802       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
9803        *
9804        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
9805        *  'Write PS Depth Count' or 'Write Timestamp'."
9806        */
9807       if (!bo) {
9808          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9809          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9810          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9811          bo = batch->screen->workaround_address.bo;
9812          offset = batch->screen->workaround_address.offset;
9813       }
9814    }
9815 
9816    if (flags & PIPE_CONTROL_DEPTH_STALL) {
9817       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
9818        *
9819        *    "This bit must be DISABLED for operations other than writing
9820        *     PS_DEPTH_COUNT."
9821        *
9822        * This seems like nonsense.  An Ivybridge workaround requires us to
9823        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
9824        * operation.  Gfx8+ requires us to emit depth stalls and depth cache
9825        * flushes together.  So, it's hard to imagine this means anything other
9826        * than "we originally intended this to be used for PS_DEPTH_COUNT".
9827        *
9828        * We ignore the supposed restriction and do nothing.
9829        */
9830    }
9831 
9832    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
9833                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9834       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
9835        *
9836        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
9837        *     PS_DEPTH_COUNT or TIMESTAMP queries."
9838        *
9839        * TODO: Implement end-of-pipe checking.
9840        */
9841       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
9842                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
9843    }
9844 
9845    if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9846       /* From the PIPE_CONTROL instruction table, bit 1:
9847        *
9848        *    "This bit is ignored if Depth Stall Enable is set.
9849        *     Further, the render cache is not flushed even if Write Cache
9850        *     Flush Enable bit is set."
9851        *
9852        * We assert that the caller doesn't do this combination, to try and
9853        * prevent mistakes.  It shouldn't hurt the GPU, though.
9854        *
9855        * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
9856        * and "Render Target Flush" combo is explicitly required for BTI
9857        * update workarounds.
9858        */
9859       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
9860                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
9861    }
9862 
9863    /* PIPE_CONTROL page workarounds ------------------------------------- */
9864 
9865    if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
9866       /* From the PIPE_CONTROL page itself:
9867        *
9868        *    "IVB, HSW, BDW
9869        *     Restriction: Pipe_control with CS-stall bit set must be issued
9870        *     before a pipe-control command that has the State Cache
9871        *     Invalidate bit set."
9872        */
9873       flags |= PIPE_CONTROL_CS_STALL;
9874    }
9875 
9876    if (flags & PIPE_CONTROL_FLUSH_LLC) {
9877       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
9878        *
9879        *    "Project: ALL
9880        *     SW must always program Post-Sync Operation to "Write Immediate
9881        *     Data" when Flush LLC is set."
9882        *
9883        * For now, we just require the caller to do it.
9884        */
9885       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
9886    }
9887 
9888    /* Emulate a HDC flush with a full Data Cache Flush on older hardware which
9889     * doesn't support the new lightweight flush.
9890     */
9891 #if GFX_VER < 12
9892       if (flags & PIPE_CONTROL_FLUSH_HDC)
9893          flags |= PIPE_CONTROL_DATA_CACHE_FLUSH;
9894 #endif
9895 
9896    /* "Post-Sync Operation" workarounds -------------------------------- */
9897 
9898    /* Project: All / Argument: Global Snapshot Count Reset [19]
9899     *
9900     * "This bit must not be exercised on any product.
9901     *  Requires stall bit ([20] of DW1) set."
9902     *
9903     * We don't use this, so we just assert that it isn't used.  The
9904     * PIPE_CONTROL instruction page indicates that they intended this
9905     * as a debug feature and don't think it is useful in production,
9906     * but it may actually be usable, should we ever want to.
9907     */
9908    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
9909 
9910    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
9911                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
9912       /* Project: All / Arguments:
9913        *
9914        * - Generic Media State Clear [16]
9915        * - Indirect State Pointers Disable [16]
9916        *
9917        *    "Requires stall bit ([20] of DW1) set."
9918        *
9919        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
9920        * State Clear) says:
9921        *
9922        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
9923        *     programmed prior to programming a PIPECONTROL command with "Media
9924        *     State Clear" set in GPGPU mode of operation"
9925        *
9926        * This is a subset of the earlier rule, so there's nothing to do.
9927        */
9928       flags |= PIPE_CONTROL_CS_STALL;
9929    }
9930 
9931    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
9932       /* Project: All / Argument: Store Data Index
9933        *
9934        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9935        *  than '0'."
9936        *
9937        * For now, we just assert that the caller does this.  We might want to
9938        * automatically add a write to the workaround BO...
9939        */
9940       assert(non_lri_post_sync_flags != 0);
9941    }
9942 
9943    if (flags & PIPE_CONTROL_SYNC_GFDT) {
9944       /* Project: All / Argument: Sync GFDT
9945        *
9946        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9947        *  than '0' or 0x2520[13] must be set."
9948        *
9949        * For now, we just assert that the caller does this.
9950        */
9951       assert(non_lri_post_sync_flags != 0);
9952    }
9953 
9954    if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
9955       /* Project: IVB+ / Argument: TLB inv
9956        *
9957        *    "Requires stall bit ([20] of DW1) set."
9958        *
9959        * Also, from the PIPE_CONTROL instruction table:
9960        *
9961        *    "Project: SKL+
9962        *     Post Sync Operation or CS stall must be set to ensure a TLB
9963        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
9964        *     cache to invalidate."
9965        *
9966        * This is not a subset of the earlier rule, so there's nothing to do.
9967        */
9968       flags |= PIPE_CONTROL_CS_STALL;
9969    }
9970 
9971    if (GFX_VER == 9 && devinfo->gt == 4) {
9972       /* TODO: The big Skylake GT4 post sync op workaround */
9973    }
9974 
9975    /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
9976 
9977    if (IS_COMPUTE_PIPELINE(batch)) {
9978       if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
9979          /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
9980           * PIPE_CONTROL, Flush Types:
9981           *   "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
9982           * For newer platforms this is documented in the PIPE_CONTROL
9983           * instruction page.
9984           */
9985          flags |= PIPE_CONTROL_CS_STALL;
9986       }
9987 
9988       if (GFX_VER == 8 && (post_sync_flags ||
9989                            (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
9990                                      PIPE_CONTROL_DEPTH_STALL |
9991                                      PIPE_CONTROL_RENDER_TARGET_FLUSH |
9992                                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9993                                      PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
9994          /* Project: BDW / Arguments:
9995           *
9996           * - LRI Post Sync Operation   [23]
9997           * - Post Sync Op              [15:14]
9998           * - Notify En                 [8]
9999           * - Depth Stall               [13]
10000           * - Render Target Cache Flush [12]
10001           * - Depth Cache Flush         [0]
10002           * - DC Flush Enable           [5]
10003           *
10004           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
10005           *     Workloads."
10006           */
10007          flags |= PIPE_CONTROL_CS_STALL;
10008 
10009          /* Also, from the PIPE_CONTROL instruction table, bit 20:
10010           *
10011           *    "Project: BDW
10012           *     This bit must be always set when PIPE_CONTROL command is
10013           *     programmed by GPGPU and MEDIA workloads, except for the cases
10014           *     when only Read Only Cache Invalidation bits are set (State
10015           *     Cache Invalidation Enable, Instruction cache Invalidation
10016           *     Enable, Texture Cache Invalidation Enable, Constant Cache
10017           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
10018           *     need not implemented when FF_DOP_CG is disable via "Fixed
10019           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
10020           *
10021           * It sounds like we could avoid CS stalls in some cases, but we
10022           * don't currently bother.  This list isn't exactly the list above,
10023           * either...
10024           */
10025       }
10026    }
10027 
10028    /* "Stall" workarounds ----------------------------------------------
10029     * These have to come after the earlier ones because we may have added
10030     * some additional CS stalls above.
10031     */
10032 
10033    if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
10034       /* Project: PRE-SKL, VLV, CHV
10035        *
10036        * "[All Stepping][All SKUs]:
10037        *
10038        *  One of the following must also be set:
10039        *
10040        *  - Render Target Cache Flush Enable ([12] of DW1)
10041        *  - Depth Cache Flush Enable ([0] of DW1)
10042        *  - Stall at Pixel Scoreboard ([1] of DW1)
10043        *  - Depth Stall ([13] of DW1)
10044        *  - Post-Sync Operation ([13] of DW1)
10045        *  - DC Flush Enable ([5] of DW1)"
10046        *
10047        * If we don't already have one of those bits set, we choose to add
10048        * "Stall at Pixel Scoreboard".  Some of the other bits require a
10049        * CS stall as a workaround (see above), which would send us into
10050        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
10051        * appears to be safe, so we choose that.
10052        */
10053       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
10054                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
10055                                PIPE_CONTROL_WRITE_IMMEDIATE |
10056                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
10057                                PIPE_CONTROL_WRITE_TIMESTAMP |
10058                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
10059                                PIPE_CONTROL_DEPTH_STALL |
10060                                PIPE_CONTROL_DATA_CACHE_FLUSH;
10061       if (!(flags & wa_bits))
10062          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
10063    }
10064 
10065    if (INTEL_NEEDS_WA_1409600907 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
10066       /* Wa_1409600907:
10067        *
10068        * "PIPE_CONTROL with Depth Stall Enable bit must be set
10069        * with any PIPE_CONTROL with Depth Flush Enable bit set.
10070        * with any PIPE_CONTROL with Depth Flush Enable bit set."
10071       flags |= PIPE_CONTROL_DEPTH_STALL;
10072    }
10073 
10074    /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
10075     * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
10076     * with CS_STALL Bit set (with No POST_SYNC ENABLED)
10077     */
10078    if (intel_device_info_is_adln(devinfo) &&
10079        IS_COMPUTE_PIPELINE(batch) &&
10080        flags_to_post_sync_op(flags) != NoWrite) {
10081       iris_emit_raw_pipe_control(batch, "Wa_14014966230",
10082                                  PIPE_CONTROL_CS_STALL, NULL, 0, 0);
10083    }
10084 
10085    batch_mark_sync_for_pipe_control(batch, flags);
10086 
10087 #if INTEL_NEEDS_WA_14010840176
10088    /* "If the intention of “constant cache invalidate” is
10089     *  to invalidate the L1 cache (which can cache constants), use “HDC
10090     *  pipeline flush” instead of Constant Cache invalidate command."
10091     *
10092     * "If L3 invalidate is needed, the w/a should be to set state invalidate
10093     * in the pipe control command, in addition to the HDC pipeline flush."
10094     */
10095    if (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) {
10096       flags &= ~PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10097       flags |= PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10098    }
10099 #endif
10100 
10101    /* Emit --------------------------------------------------------------- */
10102 
10103    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
10104       fprintf(stderr,
10105               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
10106               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
10107               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
10108               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
10109               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
10110               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
10111               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
10112               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
10113               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
10114               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
10115               (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
10116               (flags & PIPE_CONTROL_L3_FABRIC_FLUSH) ? "L3Fabric " : "",
10117               (flags & PIPE_CONTROL_CCS_CACHE_FLUSH) ? "CCS " : "",
10118               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
10119               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
10120               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
10121               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
10122               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
10123               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
10124               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
10125                  "SnapRes" : "",
10126               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
10127                   "ISPDis" : "",
10128               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
10129               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
10130               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
10131               (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
10132               (flags & PIPE_CONTROL_PSS_STALL_SYNC) ? "PSS " : "",
10133               (flags & PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH) ? "UntypedDataPortCache " : "",
10134               imm, reason);
10135    }
10136 
10137    iris_batch_sync_region_start(batch);
10138 
10139    const bool trace_pc =
10140       (flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CACHE_INVALIDATE_BITS)) != 0;
10141 
10142    if (trace_pc)
10143       trace_intel_begin_stall(&batch->trace);
10144 
10145    iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
10146 #if GFX_VERx10 >= 125
10147       pc.PSSStallSyncEnable = flags & PIPE_CONTROL_PSS_STALL_SYNC;
10148 #endif
10149 #if GFX_VER == 12
10150       pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
10151       pc.L3FabricFlush = flags & PIPE_CONTROL_L3_FABRIC_FLUSH;
10152 #endif
10153 #if GFX_VER > 11
10154       pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
10155 #endif
10156 #if GFX_VERx10 >= 125
10157       pc.UntypedDataPortCacheFlushEnable =
10158          (flags & (PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
10159                    PIPE_CONTROL_FLUSH_HDC |
10160                    PIPE_CONTROL_DATA_CACHE_FLUSH)) &&
10161          IS_COMPUTE_PIPELINE(batch);
10162       pc.HDCPipelineFlushEnable |= pc.UntypedDataPortCacheFlushEnable;
10163       pc.CCSFlushEnable |= flags & PIPE_CONTROL_CCS_CACHE_FLUSH;
10164 #endif
10165       pc.LRIPostSyncOperation = NoLRIOperation;
10166       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
10167       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
10168       pc.StoreDataIndex = 0;
10169       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
10170 #if GFX_VERx10 < 125
10171       pc.GlobalSnapshotCountReset =
10172          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
10173 #endif
10174       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
10175 #if GFX_VERx10 < 200
10176       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
10177 #endif
10178       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
10179       pc.RenderTargetCacheFlushEnable =
10180          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
10181       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
10182       pc.StateCacheInvalidationEnable =
10183          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10184 #if GFX_VER >= 12
10185       pc.L3ReadOnlyCacheInvalidationEnable =
10186          flags & PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
10187 #endif
10188       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
10189       pc.ConstantCacheInvalidationEnable =
10190          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10191       pc.PostSyncOperation = flags_to_post_sync_op(flags);
10192       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
10193       pc.InstructionCacheInvalidateEnable =
10194          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
10195       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
10196       pc.IndirectStatePointersDisable =
10197          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
10198       pc.TextureCacheInvalidationEnable =
10199          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
10200       pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
10201       pc.ImmediateData = imm;
10202    }
10203 
10204    if (trace_pc) {
10205       trace_intel_end_stall(&batch->trace, flags,
10206                             iris_utrace_pipe_flush_bit_to_ds_stall_flag,
10207                             reason,0,0,0);
10208    }
10209 
10210    iris_batch_sync_region_end(batch);
10211 }
10212 
10213 #if GFX_VER == 9
10214 /**
10215  * Preemption on Gfx9 has to be enabled or disabled in various cases.
10216  *
10217  * See these workarounds for preemption:
10218  *  - WaDisableMidObjectPreemptionForGSLineStripAdj
10219  *  - WaDisableMidObjectPreemptionForTrifanOrPolygon
10220  *  - WaDisableMidObjectPreemptionForLineLoop
10221  *  - WA#0798
10222  *
10223  * We don't put this in the vtable because it's only used on Gfx9.
10224  */
10225 void
10226 gfx9_toggle_preemption(struct iris_context *ice,
10227                        struct iris_batch *batch,
10228                        const struct pipe_draw_info *draw)
10229 {
10230    struct iris_genx_state *genx = ice->state.genx;
10231    bool object_preemption = true;
10232 
10233    /* WaDisableMidObjectPreemptionForGSLineStripAdj
10234     *
10235     *    "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
10236     *     and GS is enabled."
10237     */
10238    if (draw->mode == MESA_PRIM_LINE_STRIP_ADJACENCY &&
10239        ice->shaders.prog[MESA_SHADER_GEOMETRY])
10240       object_preemption = false;
10241 
10242    /* WaDisableMidObjectPreemptionForTrifanOrPolygon
10243     *
10244     *    "TriFan miscompare in Execlist Preemption test. Cut index that is
10245     *     on a previous context. End the previous, then resume another context
10246     *     with a tri-fan or polygon, and the vertex count is corrupted. If we
10247     *     preempt again we will cause corruption.
10248     *
10249     *     WA: Disable mid-draw preemption when draw-call has a tri-fan."
10250     */
10251    if (draw->mode == MESA_PRIM_TRIANGLE_FAN)
10252       object_preemption = false;
10253 
10254    /* WaDisableMidObjectPreemptionForLineLoop
10255     *
10256     *    "VF Stats Counters Missing a vertex when preemption enabled.
10257     *
10258     *     WA: Disable mid-draw preemption when the draw uses a lineloop
10259     *     topology."
10260     */
10261    if (draw->mode == MESA_PRIM_LINE_LOOP)
10262       object_preemption = false;
10263 
10264    /* WA#0798
10265     *
10266     *    "VF is corrupting GAFS data when preempted on an instance boundary
10267     *     and replayed with instancing enabled.
10268     *
10269     *     WA: Disable preemption when using instancing."
10270     */
10271    if (draw->instance_count > 1)
10272       object_preemption = false;
10273 
10274    if (genx->object_preemption != object_preemption) {
10275       iris_enable_obj_preemption(batch, object_preemption);
10276       genx->object_preemption = object_preemption;
10277    }
10278 }
10279 #endif
10280 
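/**
 * (Added commentary.)  Reset GenX-specific cached state when the context's
 * GPU state is lost, so stale tracking such as the last index buffer isn't
 * trusted on the next batch.
 */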
10281 static void
10282 iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
10283 {
10284    struct iris_genx_state *genx = ice->state.genx;
10285 
10286 #if INTEL_NEEDS_WA_1808121037
10287    genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
10288 #endif
10289 
10290    memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
10291 }
10292 
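/**
 * (Added commentary.)  Emit MI_REPORT_PERF_COUNT, which writes a snapshot of
 * the GPU performance counters to \p bo at \p offset_in_bytes, tagged with
 * \p report_id.
 */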
10293 static void
10294 iris_emit_mi_report_perf_count(struct iris_batch *batch,
10295                                struct iris_bo *bo,
10296                                uint32_t offset_in_bytes,
10297                                uint32_t report_id)
10298 {
10299    iris_batch_sync_region_start(batch);
10300    iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
10301       mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
10302                                    IRIS_DOMAIN_OTHER_WRITE);
10303       mi_rpc.ReportID = report_id;
10304    }
10305    iris_batch_sync_region_end(batch);
10306 }
10307 
10308 /**
10309  * Update the pixel hashing modes that determine the balancing of PS threads
10310  * across subslices and slices.
10311  *
10312  * \param width Width bound of the rendering area (already scaled down if \p
10313  *              scale is greater than 1).
10314  * \param height Height bound of the rendering area (already scaled down if \p
10315  *               scale is greater than 1).
10316  * \param scale The number of framebuffer samples that could potentially be
10317  *              affected by an individual channel of the PS thread.  This is
10318  *              typically one for single-sampled rendering, but for operations
10319  *              like CCS resolves and fast clears a single PS invocation may
10320  *              update a huge number of pixels, in which case a finer
10321  *              balancing is desirable in order to maximally utilize the
10322  *              bandwidth available.  UINT_MAX can be used as shorthand for
10323  *              "finest hashing mode available".
10324  */
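/* A minimal illustrative call (the variable names here are hypothetical):
 * an ordinary single-sampled draw covering the framebuffer would pass a
 * scale of 1, keeping the coarser hashing modes, while resolve- or
 * fast-clear-style operations pass a larger scale (or UINT_MAX) to select
 * the finest modes:
 *
 *    genX(emit_hashing_mode)(ice, batch, fb_width, fb_height, 1);
 */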
void
genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
                        unsigned width, unsigned height, unsigned scale)
{
#if GFX_VER == 9
   const struct intel_device_info *devinfo = batch->screen->devinfo;
   const unsigned slice_hashing[] = {
      /* Because all Gfx9 platforms with more than one slice require
       * three-way subslice hashing, a single "normal" 16x16 slice hashing
       * block is guaranteed to suffer from substantial imbalance, with one
       * subslice receiving twice as much work as the other two in the
       * slice.
       *
       * The performance impact of that would be particularly severe when
       * three-way hashing is also in use for slice balancing (which is the
       * case for all Gfx9 GT4 platforms), because one of the slices
       * receives one every three 16x16 blocks in either direction, which
       * is roughly the periodicity of the underlying subslice imbalance
       * pattern ("roughly" because in reality the hardware's
       * implementation of three-way hashing doesn't do exact modulo 3
       * arithmetic, which somewhat decreases the magnitude of this effect
       * in practice).  This leads to a systematic subslice imbalance
       * within that slice regardless of the size of the primitive.  The
       * 32x32 hashing mode guarantees that the subslice imbalance within a
       * single slice hashing block is minimal, largely eliminating this
       * effect.
       */
      _32x32,
      /* Finest slice hashing mode available. */
      NORMAL
   };
   const unsigned subslice_hashing[] = {
      /* 16x16 would provide a slight cache locality benefit especially
       * visible in the sampler L1 cache efficiency of low-bandwidth
       * non-LLC platforms, but it comes at the cost of greater subslice
       * imbalance for primitives of dimensions approximately intermediate
       * between 16x4 and 16x16.
       */
      _16x4,
      /* Finest subslice hashing mode available. */
      _8x4
   };
   /* Dimensions of the smallest hashing block of a given hashing mode.  If
    * the rendering area is smaller than this there can't possibly be any
    * benefit from switching to this mode, so we optimize out the
    * transition.
    */
   const unsigned min_size[][2] = {
      { 16, 4 },
      { 8, 4 }
   };
   const unsigned idx = scale > 1;

   if (width > min_size[idx][0] || height > min_size[idx][1]) {
      iris_emit_raw_pipe_control(batch,
                                 "workaround: CS stall before GT_MODE LRI",
                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                 PIPE_CONTROL_CS_STALL,
                                 NULL, 0, 0);

      iris_emit_reg(batch, GENX(GT_MODE), reg) {
         reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
         reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
         reg.SubsliceHashing = subslice_hashing[idx];
         reg.SubsliceHashingMask = -1;
      };

      ice->state.current_hash_scale = scale;
   }
#endif
}

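/**
 * Switch the render and compute batches into or out of no-op mode at the
 * frontend's request, flagging all state dirty for any batch that reports
 * its state needs to be re-emitted.
 */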
static void
iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
{
   struct iris_context *ice = (struct iris_context *) ctx;

   if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
      ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
      ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
   }

   if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
      ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
      ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
   }
}

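/**
 * Install the generation-specific implementations of the driver's internal
 * state upload and command emission hooks into the screen vtable.
 */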
void
genX(init_screen_state)(struct iris_screen *screen)
{
   assert(screen->devinfo->verx10 == GFX_VERx10);
   screen->vtbl.destroy_state = iris_destroy_state;
   screen->vtbl.init_render_context = iris_init_render_context;
   screen->vtbl.init_compute_context = iris_init_compute_context;
   screen->vtbl.init_copy_context = iris_init_copy_context;
   screen->vtbl.upload_render_state = iris_upload_render_state;
   screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
   screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
   screen->vtbl.update_binder_address = iris_update_binder_address;
   screen->vtbl.upload_compute_state = iris_upload_compute_state;
   screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
   screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
   screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = iris_rebind_buffer;
   screen->vtbl.load_register_reg32 = iris_load_register_reg32;
   screen->vtbl.load_register_reg64 = iris_load_register_reg64;
   screen->vtbl.load_register_imm32 = iris_load_register_imm32;
   screen->vtbl.load_register_imm64 = iris_load_register_imm64;
   screen->vtbl.load_register_mem32 = iris_load_register_mem32;
   screen->vtbl.load_register_mem64 = iris_load_register_mem64;
   screen->vtbl.store_register_mem32 = iris_store_register_mem32;
   screen->vtbl.store_register_mem64 = iris_store_register_mem64;
   screen->vtbl.store_data_imm32 = iris_store_data_imm32;
   screen->vtbl.store_data_imm64 = iris_store_data_imm64;
   screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
   screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
   screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
   screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
   screen->vtbl.populate_vs_key = iris_populate_vs_key;
   screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
   screen->vtbl.populate_tes_key = iris_populate_tes_key;
   screen->vtbl.populate_gs_key = iris_populate_gs_key;
   screen->vtbl.populate_fs_key = iris_populate_fs_key;
   screen->vtbl.populate_cs_key = iris_populate_cs_key;
   screen->vtbl.lost_genx_state = iris_lost_genx_state;
   screen->vtbl.disable_rhwo_optimization = iris_disable_rhwo_optimization;
}

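/**
 * Install the generation-specific Gallium pipe_context hooks and set up the
 * initial context state: default dirty flags, a null surface for unbound
 * textures, and empty scissor rectangles.
 */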
void
genX(init_state)(struct iris_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;
   struct iris_screen *screen = (struct iris_screen *)ctx->screen;

   ctx->create_blend_state = iris_create_blend_state;
   ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
   ctx->create_rasterizer_state = iris_create_rasterizer_state;
   ctx->create_sampler_state = iris_create_sampler_state;
   ctx->create_sampler_view = iris_create_sampler_view;
   ctx->create_surface = iris_create_surface;
   ctx->create_vertex_elements_state = iris_create_vertex_elements;
   ctx->bind_blend_state = iris_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
   ctx->bind_sampler_states = iris_bind_sampler_states;
   ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
   ctx->delete_blend_state = iris_delete_state;
   ctx->delete_depth_stencil_alpha_state = iris_delete_state;
   ctx->delete_rasterizer_state = iris_delete_state;
   ctx->delete_sampler_state = iris_delete_state;
   ctx->delete_vertex_elements_state = iris_delete_state;
   ctx->set_blend_color = iris_set_blend_color;
   ctx->set_clip_state = iris_set_clip_state;
   ctx->set_constant_buffer = iris_set_constant_buffer;
   ctx->set_shader_buffers = iris_set_shader_buffers;
   ctx->set_shader_images = iris_set_shader_images;
   ctx->set_sampler_views = iris_set_sampler_views;
   ctx->set_compute_resources = iris_set_compute_resources;
   ctx->set_global_binding = iris_set_global_binding;
   ctx->set_tess_state = iris_set_tess_state;
   ctx->set_patch_vertices = iris_set_patch_vertices;
   ctx->set_framebuffer_state = iris_set_framebuffer_state;
   ctx->set_polygon_stipple = iris_set_polygon_stipple;
   ctx->set_sample_mask = iris_set_sample_mask;
   ctx->set_scissor_states = iris_set_scissor_states;
   ctx->set_stencil_ref = iris_set_stencil_ref;
   ctx->set_vertex_buffers = iris_set_vertex_buffers;
   ctx->set_viewport_states = iris_set_viewport_states;
   ctx->sampler_view_destroy = iris_sampler_view_destroy;
   ctx->surface_destroy = iris_surface_destroy;
   ctx->draw_vbo = iris_draw_vbo;
   ctx->launch_grid = iris_launch_grid;
   ctx->create_stream_output_target = iris_create_stream_output_target;
   ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
   ctx->set_stream_output_targets = iris_set_stream_output_targets;
   ctx->set_frontend_noop = iris_set_frontend_noop;

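   /* Flag everything as dirty so the first draw and dispatch re-emit all of
    * the required state.
    */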
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xffff;
   ice->state.num_viewports = 1;
   ice->state.prim_mode = MESA_PRIM_COUNT;
   ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
   ice->draw.derived_params.drawid = -1;

#if GFX_VERx10 >= 120
   ice->state.genx->object_preemption = true;
#endif

   /* Make a 1x1x1 null surface for unbound textures */
   void *null_surf_map =
      upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
                   4 * GENX(RENDER_SURFACE_STATE_length), 64);
   isl_null_fill_state(&screen->isl_dev, null_surf_map,
                       .size = isl_extent3d(1, 1, 1));
   ice->state.unbound_tex.offset +=
      iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}