• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file crocus_state.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * This is the main state upload code.
31  *
32  * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33  * complex, or highly reusable state can be created once, and bound and
34  * rebound multiple times.  This is modeled with the pipe->create_*_state()
35  * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36  * streamed out on the fly, via pipe->set_*_state() hooks.
37  *
38  * OpenGL involves frequently mutating context state, which is mirrored in
39  * core Mesa by highly mutable data structures.  However, most applications
40  * typically draw the same things over and over - from frame to frame, most
41  * of the same objects are still visible and need to be redrawn.  So, rather
42  * than inventing new state all the time, applications usually mutate to swap
43  * between known states that we've seen before.
44  *
45  * Gallium isolates us from this mutation by tracking API state, and
46  * distilling it into a set of Constant State Objects, or CSOs.  Large,
47  * complex, or typically reusable state can be created once, then reused
48  * multiple times.  Drivers can create and store their own associated data.
49  * This create/bind model corresponds to the pipe->create_*_state() and
50  * pipe->bind_*_state() driver hooks.
51  *
52  * Some state is cheap to create, or expected to be highly dynamic.  Rather
53  * than creating and caching piles of CSOs for these, Gallium simply streams
54  * them out, via the pipe->set_*_state() driver hooks.
55  *
56  * To reduce draw time overhead, we try to compute as much state at create
57  * time as possible.  Wherever possible, we translate the Gallium pipe state
58  * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59  * we can simply memcpy them into a batch buffer.
60  *
61  * No hardware matches the abstraction perfectly, so some commands require
62  * information from multiple CSOs.  In this case, we can store two copies
63  * of the packet (one in each CSO), and simply | together their DWords at
64  * draw time.  Sometimes the second set is trivial (one or two fields), so
65  * we simply pack it at draw time.
66  *
67  * There are two main components in the file below.  First, the CSO hooks
68  * create/bind/track state.  The second are the draw-time upload functions,
69  * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70  * the context state and emit the commands into the actual batch.
71  */
72 
73 #include <errno.h>
74 #include <stdio.h>
75 
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83 
84 #include "drm-uapi/i915_drm.h"
85 #include "intel/common/intel_l3_config.h"
86 #include "intel/common/intel_sample_positions.h"
87 #include "intel/compiler/elk/elk_compiler.h"
88 #include "compiler/shader_info.h"
89 #include "pipe/p_context.h"
90 #include "pipe/p_defines.h"
91 #include "pipe/p_screen.h"
92 #include "pipe/p_state.h"
93 #include "util/format/u_format.h"
94 #include "util/half_float.h"
95 #include "util/u_dual_blend.h"
96 #include "util/u_framebuffer.h"
97 #include "util/u_helpers.h"
98 #include "util/u_inlines.h"
99 #include "util/u_memory.h"
100 #include "util/u_prim.h"
101 #include "util/u_transfer.h"
102 #include "util/u_upload_mgr.h"
103 #include "util/u_viewport.h"
104 #include "crocus_batch.h"
105 #include "crocus_context.h"
106 #include "crocus_defines.h"
107 #include "crocus_pipe.h"
108 #include "crocus_resource.h"
109 
110 #include "crocus_genx_macros.h"
111 #include "intel/common/intel_genX_state_elk.h"
112 #include "intel/common/intel_guardband.h"
113 #include "main/macros.h" /* UNCLAMPED_* */
114 
/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 *
 * This function is never called (hence UNUSED); every PIPE_ASSERT expands
 * to a compile-time STATIC_ASSERT, so the whole body generates no code.
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware.  Note that Gallium's
    * INCR/DECR are saturating (hardware INCRSAT/DECRSAT) while its
    * INCR_WRAP/DECR_WRAP are the hardware's wrapping INCR/DECR — the
    * values line up even though the naming conventions differ.
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
186 
187 static unsigned
translate_prim_type(enum mesa_prim prim,uint8_t verts_per_patch)188 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
189 {
190    static const unsigned map[] = {
191       [MESA_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
192       [MESA_PRIM_LINES]                    = _3DPRIM_LINELIST,
193       [MESA_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
194       [MESA_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
195       [MESA_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
196       [MESA_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
197       [MESA_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
198       [MESA_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
199       [MESA_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
200       [MESA_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
201 #if GFX_VER >= 6
202       [MESA_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
203       [MESA_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
204       [MESA_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
205       [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
206 #endif
207 #if GFX_VER >= 7
208       [MESA_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
209 #endif
210    };
211 
212    return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
213 }
214 
215 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)216 translate_compare_func(enum pipe_compare_func pipe_func)
217 {
218    static const unsigned map[] = {
219       [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
220       [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
221       [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
222       [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
223       [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
224       [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
225       [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
226       [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
227    };
228    return map[pipe_func];
229 }
230 
231 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)232 translate_shadow_func(enum pipe_compare_func pipe_func)
233 {
234    /* Gallium specifies the result of shadow comparisons as:
235     *
236     *    1 if ref <op> texel,
237     *    0 otherwise.
238     *
239     * The hardware does:
240     *
241     *    0 if texel <op> ref,
242     *    1 otherwise.
243     *
244     * So we need to flip the operator and also negate.
245     */
246    static const unsigned map[] = {
247       [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
248       [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
249       [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
250       [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
251       [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
252       [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
253       [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
254       [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
255    };
256    return map[pipe_func];
257 }
258 
259 static unsigned
translate_cull_mode(unsigned pipe_face)260 translate_cull_mode(unsigned pipe_face)
261 {
262    static const unsigned map[4] = {
263       [PIPE_FACE_NONE]           = CULLMODE_NONE,
264       [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
265       [PIPE_FACE_BACK]           = CULLMODE_BACK,
266       [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
267    };
268    return map[pipe_face];
269 }
270 
#if GFX_VER >= 6
/**
 * Convert pipe_polygon_mode to the 3DSTATE_SF FILL_MODE_* enum.
 *
 * PIPE_POLYGON_MODE_FILL_RECTANGLE has no dedicated hardware mode and is
 * rendered as ordinary solid fill.
 */
static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
   switch (pipe_polymode) {
   case PIPE_POLYGON_MODE_FILL:           return FILL_MODE_SOLID;
   case PIPE_POLYGON_MODE_LINE:           return FILL_MODE_WIREFRAME;
   case PIPE_POLYGON_MODE_POINT:          return FILL_MODE_POINT;
   case PIPE_POLYGON_MODE_FILL_RECTANGLE: return FILL_MODE_SOLID;
   }
   unreachable("invalid polygon mode");
}
#endif
284 
285 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)286 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
287 {
288    static const unsigned map[] = {
289       [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
290       [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
291       [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
292    };
293    return map[pipe_mip];
294 }
295 
/**
 * Translate a pipe_tex_wrap mode to the hardware TCM_* texture coordinate
 * mode.
 *
 * @param either_nearest whether nearest filtering is in use — presumably
 *        set by the sampler-state caller when min/mag filters are nearest;
 *        TODO confirm against the caller.  Only consulted on pre-Gen8 for
 *        the legacy GL_CLAMP case below.
 */
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
#if GFX_VER == 8
      /* Gen8 has a half-border mode matching GL_CLAMP's blend of edge
       * and border texels. */
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
   /* With nearest filtering, GL_CLAMP behaves like CLAMP_TO_EDGE, so use
    * the plain clamp mode instead of border clamping. */
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}
321 
/**
 * Carve `size` bytes (at `alignment`) out of the batch's statebuffer.
 * Equivalent of elk_state_batch().
 *
 * If the allocation would cross the nominal statebuffer size (STATE_SZ)
 * and the batch is allowed to wrap, the batch is flushed and the offset
 * recomputed in the fresh buffer.  Otherwise, if it would overflow the
 * current BO, the BO is grown by 1.5x (capped at MAX_STATE_SIZE).
 *
 * Returns a CPU pointer to the reserved space; the statebuffer offset of
 * that space is stored in *out_offset.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   /* Optional bookkeeping for INTEL_DEBUG state-size tracking. */
   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   /* The map is a uint32_t array, so convert the byte offset to dwords. */
   return (uint32_t *)batch->state.map + (offset >> 2);
}
351 
/**
 * stream_state() + memcpy: reserve statebuffer space, copy `data` into
 * it, and return the statebuffer offset of the copy.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   unsigned state_offset = 0;
   uint32_t *dest = stream_state(batch, size, alignment, &state_offset);

   if (dest != NULL)
      memcpy(dest, data, size);

   return state_offset;
}
367 
368 #if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS, pointing the fixed-function pipeline
 * stages (VS, optional GS, CLIP, SF, WM, CC) at their indirect state
 * packets in the statebuffer (Gen4/5 only).
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      /* Clip is always enabled; the other stages are unconditional too. */
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
392 
393 #endif
/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)  A NULL old_cso (nothing
 * previously bound) counts as "changed".
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
/* Aggregate/array variant: compares the whole field with memcmp. */
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
402 
/**
 * Emit an end-of-pipe sync before changing STATE_BASE_ADDRESS (Gen6+).
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
437 
/**
 * Emit the cache invalidations required after changing STATE_BASE_ADDRESS
 * (Gen6+), so stale surface/sampler state is not fetched.
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
487 
488 #if GFX_VER >= 6
/**
 * Store a 32-bit MMIO register into a buffer via MI_STORE_REGISTER_MEM.
 *
 * Predicated stores are only available on Haswell (gen7.5) and later;
 * requesting predication on earlier hardware is a programming error.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
505 
/**
 * Store a 64-bit MMIO register pair into a buffer as two consecutive
 * 32-bit MI_STORE_REGISTER_MEM stores (low half first).
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   for (unsigned half = 0; half <= 4; half += 4) {
      crocus_store_register_mem32(batch, reg + half, bo, offset + half,
                                  predicated);
   }
}
514 #endif
515 
516 #if GFX_VER >= 7
/* Emit MI_LOAD_REGISTER_IMM: write a 32-bit immediate to an MMIO register. */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord      = val;
   }
}
/* Convenience wrapper that resolves a genxml register name to its MMIO
 * offset (the GENX(..._num) constant). */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
526 
527 #if GFX_VERx10 >= 75
/* Emit MI_LOAD_REGISTER_REG: copy one 32-bit MMIO register to another
 * (Haswell+ only). */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
536 
/* Copy a 32-bit MMIO register to another register. */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
543 
/* Copy a 64-bit MMIO register pair to another pair, one 32-bit half at a
 * time (low dword first). */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   for (unsigned half = 0; half <= 4; half += 4)
      _crocus_emit_lrr(batch, dst + half, src + half);
}
551 #endif
552 
/* Write a 32-bit immediate to an MMIO register. */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
559 
/* Write a 64-bit immediate to an MMIO register pair: low dword to `reg`,
 * high dword to `reg + 4`. */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
   _crocus_emit_lri(batch, reg + 4, val >> 32);
}
567 
/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}
580 
/**
 * Load a 64-bit value from a buffer into an MMIO register pair via
 * two MI_LOAD_REGISTER_MEM commands (low dword first).
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   for (unsigned half = 0; half <= 4; half += 4)
      crocus_load_register_mem32(batch, reg + half, bo, offset + half);
}
592 
593 #if GFX_VERx10 >= 75
/* Write a 32-bit immediate into a buffer via MI_STORE_DATA_IMM
 * (Haswell+). */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
      /* NOTE(review): this guard looks redundant — the enclosing
       * GFX_VERx10 >= 75 block already implies GFX_VER >= 6. */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
606 
/* Write a 64-bit immediate into a buffer via a hand-packed
 * MI_STORE_DATA_IMM (Haswell+). */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength excludes the first two DWords of the command. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
624 #endif
625 
/* Copy `bytes` bytes between buffers by bouncing each DWord through a
 * scratch MMIO register (MI_LOAD_REGISTER_MEM + MI_STORE_REGISTER_MEM).
 * All sizes/offsets must be DWord-aligned.
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

/* Scratch register used as the bounce buffer; clobbered by this copy. */
#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
644 #endif
645 
/**
 * Gallium CSO for rasterizer state.
 *
 * Holds the original pipe state plus pre-packed 3DSTATE commands that can
 * be copied straight into the batch at draw time.
 */
struct crocus_rasterizer_state {
   /* The original Gallium state, kept for cso_changed() comparisons. */
   struct pipe_rasterizer_state cso;
#if GFX_VER >= 6
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* Number of user clip plane constants needed by the clip state. */
   uint8_t num_clip_plane_consts;
   /* True when the fill mode rasterizes points or lines (affects e.g.
    * provoking-vertex/coverage handling at draw time). */
   bool fill_mode_point_or_line;
};
663 
#if GFX_VER <= 5
/* Indices into the limits[] table below, one per fixed-function URB
 * section on Gen4/5: vertex shader, geometry shader, clipper, strips &
 * fans, and constant buffer. */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per-section URB sizing constraints, in entries and 64-byte units
 * (presumably the hardware's URB row granularity — TODO confirm). */
static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t  max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                        /* gs */
   { 5, 10,  1, 5 },                        /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                        /* cs */
};
683 
/* Lay the URB sections out back-to-back (VS, GS, CLIP, SF, CS) using the
 * current entry counts/sizes in ice->urb, and return whether the whole
 * layout fits within the available URB size.  Note GS and CLIP entries
 * use the VS entry size (vsize). */
static bool check_urb_layout(struct crocus_context *ice)
{
   ice->urb.vs_start = 0;
   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;

   return ice->urb.cs_start + ice->urb.nr_cs_entries *
      ice->urb.csize <= ice->urb.size;
}
695 
696 
/**
 * Recompute the gen4/5 fixed-function URB partition (the "URB fence")
 * when the requested per-entry sizes change.
 *
 * \param csize   requested constant (CS) URB entry size
 * \param vsize   requested VS/GS/CLIP URB entry size
 * \param sfsize  requested SF URB entry size
 *
 * Returns true if the layout was recomputed (the caller must emit a new
 * URB_FENCE), false if the current fences already satisfy the request.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   /* Clamp each requested size up to the hardware minimum entry size. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recalculate if any entry size grew, or if we are currently
    * constrained and a size shrank (a chance to escape constrained mode).
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start from the preferred (performance-oriented) entry counts. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      if (GFX_VER == 5) {
         /* Ironlake: first try larger VS/SF allocations. */
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         /* G45: first try a larger VS allocation. */
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      if (!check_urb_layout(ice)) {
         /* The preferred counts don't fit; fall back to the minimums. */
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
791 
/**
 * Emit URB_FENCE, programming the fence (end) address of each URB section
 * from the layout previously computed by crocus_calculate_urb_fence().
 */
static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
   uint32_t urb_fence[3];
   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
      urb.VSUnitURBReallocationRequest = 1;
      urb.GSUnitURBReallocationRequest = 1;
      urb.CLIPUnitURBReallocationRequest = 1;
      urb.SFUnitURBReallocationRequest = 1;
      urb.VFEUnitURBReallocationRequest = 1;
      urb.CSUnitURBReallocationRequest = 1;

      /* Each unit's fence is the start offset of the following section;
       * the final (CS) fence is the total URB size.
       */
      urb.VSFence = batch->ice->urb.gs_start;
      urb.GSFence = batch->ice->urb.clip_start;
      urb.CLIPFence = batch->ice->urb.sf_start;
      urb.SFFence = batch->ice->urb.cs_start;
      urb.CSFence = batch->ice->urb.size;
   }

   /* erratum: URB_FENCE must not cross a 64byte cacheline */
   if ((crocus_batch_bytes_used(batch) & 15) > 12) {
      /* Pad with zero (MI_NOOP) dwords so the 3-dword packet below does
       * not straddle the boundary.
       *
       * NOTE(review): `pad` is computed in bytes but the loop advances one
       * dword per iteration (inherited from i965, where the count was in
       * dwords) — confirm the units of crocus_batch_bytes_used(); the
       * over-padding is harmless but looks unintended.
       */
      int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
      do {
         *(uint32_t *)batch->command.map_next = 0;
         batch->command.map_next += sizeof(uint32_t);
      } while (--pad);
   }

   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}
822 
/**
 * Compute the layout of the CURBE (constant URB entry) used for gen4/5
 * push constants: FS constants first, then clip planes, then VS constants.
 * Sizes/offsets are tracked in 512-bit units (see the "256-bit -> 512-bit"
 * conversions below).
 *
 * Returns true if the layout changed (the CURBE must be re-uploaded),
 * false if the existing layout still fits.
 */
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   /* Fragment shader push constant requirement. */
   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   /* Clip planes: six fixed planes plus any enabled user planes, four
    * floats each, rounded up to 16-float units.
    */
   if (ice->state.cso_rast->cso.clip_plane_enable) {
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   /* Vertex shader push constant requirement. */
   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in elk_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * elk_vec4.cpp we're loading up to 32 registers of push constants.  An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize: only recompute when a section grew, the clip section
    * changed size, or we're wasting more than 3/4 of a large allocation.
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout:
       */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
908 
909 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)910 upload_shader_consts(struct crocus_context *ice,
911                      gl_shader_stage stage,
912                      uint32_t *map,
913                      unsigned start)
914 {
915    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
916    struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
917    uint32_t *cmap;
918    bool found = false;
919    unsigned offset = start * 16;
920    int total = 0;
921    for (int i = 0; i < 4; i++) {
922       const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
923 
924       if (range->length == 0)
925          continue;
926 
927       unsigned block_index = crocus_bti_to_group_index(
928          &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
929       unsigned len = range->length * 8 * sizeof(float);
930       unsigned start = range->start * 8 * sizeof(float);
931       struct pipe_transfer *transfer;
932 
933       cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
934                                    ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
935                                    PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
936       if (cmap)
937          memcpy(&map[offset + (total * 8)], cmap, len);
938       pipe_buffer_unmap(&ice->ctx, transfer);
939       total += range->length;
940       found = true;
941    }
942 
943    if (stage == MESA_SHADER_VERTEX && !found) {
944       /* The pre-gen6 VS requires that some push constants get loaded no
945        * matter what, or the GPU would hang.
946        */
947       unsigned len = 16;
948       memset(&map[offset], 0, len);
949    }
950 }
951 
/* Plane equations for the six fixed clip-space planes (near/far and the
 * four frustum sides).  These are uploaded ahead of any user clip planes
 * in the clip section of the CURBE (see gen4_upload_curbe()).
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
960 
/**
 * Upload the gen4/5 CURBE push-constant buffer and emit CONSTANT_BUFFER to
 * point the hardware at it.  The section layout (wm/clip/vs start and
 * size, in 512-bit units) comes from calculate_curbe_offsets().
 */
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;
   const unsigned buf_sz = sz * 16 * sizeof(float);

   /* Nothing to upload — still emit CONSTANT_BUFFER below (invalid if no
    * buffer resource exists).
    */
   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* User clip planes follow the six fixed planes; `i` keeps counting. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Disabled debug dump of the whole CURBE contents. */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug.  The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set.  We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1053 #endif
1054 
1055 #if GFX_VER >= 7
1056 
1057 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
1058 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
1059 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
1060 
/**
 * Program the L3 cache partitioning described by \p cfg.
 *
 * Emits the three-step drain/invalidate/drain PIPE_CONTROL sequence the
 * docs require before touching the L3 registers, then writes the
 * generation-specific register set (L3CNTLREG on gen8; L3SQCREG1 and
 * L3CNTLREG2/3 on gen7, plus the HSW atomics chicken bits).
 */
static void
setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
{
#if GFX_VER == 7
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   /* Which clients get a cache partition under this config. */
   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
                       cfg->n[INTEL_L3P_ALL];
   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_slm = cfg->n[INTEL_L3P_SLM];
#endif

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    */
   crocus_emit_pipe_control_flush(batch, "l3_config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches.  Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes.  This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

#if GFX_VER == 8
   assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
   crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB];
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
      reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
   }
#else
   assert(!cfg->n[INTEL_L3P_ALL]);

   /* When enabled SLM only uses a portion of the L3 on half of the banks,
    * the matching space on the remaining banks has to be allocated to a
    * client (URB for all validated configurations) set to the
    * lower-bandwidth 2-bank address hashing mode.
    */
   const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);

   /* Minimum number of ways that can be allocated to the URB. */
   const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);

   uint32_t l3sqcr1, l3cr2, l3cr3;

   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
      reg.ConvertDC_UC = !has_dc;
      reg.ConvertIS_UC = !has_is;
      reg.ConvertC_UC = !has_c;
      reg.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
      reg.L3SQGeneralPriorityCreditInitialization =
         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
   };

   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
      reg.SLMEnable = has_slm;
      reg.URBLowBandwidth = urb_low_bw;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if !(GFX_VERx10 == 75)
      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
   };

   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
      reg.ISLowBandwidth = 0;
      reg.CAllocation = cfg->n[INTEL_L3P_C];
      reg.CLowBandwidth = 0;
      reg.TAllocation = cfg->n[INTEL_L3P_T];
      reg.TLowBandwidth = 0;
   };

   /* Set up the L3 partitioning. */
   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);

#if GFX_VERx10 == 75
   /* TODO: Fail screen creation if command parser version < 4 */
   uint32_t scratch1, chicken3;
   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
      reg.L3AtomicDisableMask = true;
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_emit_lri(batch, SCRATCH1, scratch1);
   crocus_emit_lri(batch, CHICKEN3, chicken3);
#endif
#endif
}
1191 
1192 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1193 emit_l3_state(struct crocus_batch *batch, bool compute)
1194 {
1195    const struct intel_l3_config *const cfg =
1196       compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1197 
1198    setup_l3_config(batch, cfg);
1199    if (INTEL_DEBUG(DEBUG_L3)) {
1200       intel_dump_l3_config(cfg, stderr);
1201    }
1202 }
1203 
1204 /**
1205  * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1206  */
1207 static void
gen7_emit_cs_stall_flush(struct crocus_batch * batch)1208 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1209 {
1210    crocus_emit_pipe_control_write(batch,
1211                                   "workaround",
1212                                   PIPE_CONTROL_CS_STALL
1213                                   | PIPE_CONTROL_WRITE_IMMEDIATE,
1214                                   batch->ice->workaround_bo,
1215                                   batch->ice->workaround_offset, 0);
1216 }
1217 #endif
1218 
/**
 * Emit PIPELINE_SELECT (switching between the 3D and GPGPU pipelines),
 * surrounded by the generation-specific flush/invalidate workarounds the
 * docs require before and after the switch.
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   /* IVB-only: after switching back to 3D, stall and emit a degenerate
    * point-list primitive before real rendering resumes.
    */
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1288 
1289 /**
1290  * The following diagram shows how we partition the URB:
1291  *
1292  *        16kB or 32kB               Rest of the URB space
1293  *   __________-__________   _________________-_________________
1294  *  /                     \ /                                   \
1295  * +-------------------------------------------------------------+
1296  * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
1297  * |       Constants       |               Entries               |
1298  * +-------------------------------------------------------------+
1299  *
1300  * Notably, push constants must be stored at the beginning of the URB
1301  * space, while entries can be stored anywhere.  Ivybridge and Haswell
1302  * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1303  * doubles this (32kB).
1304  *
1305  * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1306  * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
1307  * sized in increments of 2kB.
1308  *
1309  * Currently we split the constant buffer space evenly among whatever stages
1310  * are active.  This is probably not ideal, but simple.
1311  *
1312  * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1313  * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1314  * Haswell GT3 has 512kB of URB space.
1315  *
1316  * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1317  * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1318  */
1319 #if GFX_VER >= 7
1320 static void
crocus_alloc_push_constants(struct crocus_batch * batch)1321 crocus_alloc_push_constants(struct crocus_batch *batch)
1322 {
1323    const unsigned push_constant_kb =
1324       batch->screen->devinfo.max_constant_urb_size_kb;
1325    unsigned size_per_stage = push_constant_kb / 5;
1326 
1327    /* For now, we set a static partitioning of the push constant area,
1328     * assuming that all stages could be in use.
1329     *
1330     * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1331     *       see if that improves performance by offering more space to
1332     *       the VS/FS when those aren't in use.  Also, try dynamically
1333     *       enabling/disabling it like i965 does.  This would be more
1334     *       stalls and may not actually help; we don't know yet.
1335     */
1336    for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1337       crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1338          alloc._3DCommandSubOpcode = 18 + i;
1339          alloc.ConstantBufferOffset = size_per_stage * i;
1340          alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
1341       }
1342    }
1343 
1344    /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
1345     *
1346     *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
1347     *     in the ring after this instruction.
1348     *
1349     * No such restriction exists for Haswell or Baytrail.
1350     */
1351    if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
1352       gen7_emit_cs_stall_flush(batch);
1353 }
1354 #endif
1355 
1356 /**
1357  * Upload the initial GPU state for a render context.
1358  *
1359  * This sets some invariant state that needs to be programmed a particular
1360  * way, but we never actually change.
1361  */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   /* Select the 3D pipeline (with the required flush workarounds). */
   emit_pipeline_select(batch, _3D);

   /* Zero the system instruction pointer. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Make CONSTANT_BUFFER use absolute addresses rather than
    * dynamic-state-base-relative offsets.
    */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1409 
1410 #if GFX_VER >= 7
1411 static void
crocus_init_compute_context(struct crocus_batch * batch)1412 crocus_init_compute_context(struct crocus_batch *batch)
1413 {
1414    UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1415 
1416    emit_pipeline_select(batch, GPGPU);
1417 
1418 #if GFX_VER >= 7
1419    emit_l3_state(batch, true);
1420 #endif
1421 }
1422 #endif
1423 
1424 /**
1425  * Generation-specific context state (ice->state.genx->...).
1426  *
1427  * Most state can go in crocus_context directly, but these encode hardware
1428  * packets which vary by generation.
1429  */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      /* Per-stage image descriptors for bound shader images (gen7+). */
      struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Current on/off state of the gen8 PMA fix (tracked so we only
    * reprogram it on transitions).
    */
   bool pma_fix_enabled;
#endif
};
1441 
1442 /**
1443  * The pipe->set_blend_color() driver hook.
1444  *
1445  * This corresponds to our COLOR_CALC_STATE.
1446  */
1447 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1448 crocus_set_blend_color(struct pipe_context *ctx,
1449                        const struct pipe_blend_color *state)
1450 {
1451    struct crocus_context *ice = (struct crocus_context *) ctx;
1452 
1453    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1454    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1455 #if GFX_VER <= 5
1456    ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1457 #else
1458    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1459 #endif
1460 }
1461 
1462 /**
1463  * Gallium CSO for blend state (see pipe_blend_state).
1464  */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND (pre-packed at CSO creation time) */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** copy of BLEND_STATE (the gallium template this CSO was created from) */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};
1483 
1484 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1485 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1486 {
1487    if (alpha_to_one) {
1488       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1489          return PIPE_BLENDFACTOR_ONE;
1490 
1491       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1492          return PIPE_BLENDFACTOR_ZERO;
1493    }
1494 
1495    return f;
1496 }
1497 
/* Gen6+ has per-RT BLEND_STATE entries, while gen4/5 fold blend state into
 * COLOR_CALC_STATE.  This typedef lets set_blend_entry_bits() below fill
 * either packet with common code.
 */
#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
1503 
1504 static bool
can_emit_logic_op(struct crocus_context * ice)1505 can_emit_logic_op(struct crocus_context *ice)
1506 {
1507    /* all pre gen8 have logicop restricted to unorm */
1508    enum pipe_format pformat = PIPE_FORMAT_NONE;
1509    for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1510       if (ice->state.framebuffer.cbufs[i]) {
1511          pformat = ice->state.framebuffer.cbufs[i]->format;
1512          break;
1513       }
1514    }
1515    return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1516 }
1517 
/**
 * Fill out the blend controls for one render target: a BLEND_STATE entry
 * on Gen6+, or the blend portion of COLOR_CALC_STATE on Gen4/5 (see the
 * BLEND_ENTRY_GENXML typedef).
 *
 * Returns true if alpha blending uses a different function or factors than
 * RGB blending, i.e. whether IndependentAlphaBlendEnable is needed.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* Without independent blend, RT[0]'s settings apply to every target. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   /* Rewrite SRC1_ALPHA factors if alpha-to-one is forced on. */
   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-Gen8 can only do logic ops on UNORM targets; skip otherwise. */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* Avoid enabling blending on RT[0] when the state asks for dual
          * color blending but the bound FS doesn't actually write src1.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction          = rt->rgb_func;
      entry->AlphaBlendFunction          = rt->alpha_func;
      /* The casts prevent warnings about implicit enum type conversions. */
      entry->SourceBlendFactor           = (int) src_rgb;
      entry->SourceAlphaBlendFactor      = (int) src_alpha;
      entry->DestinationBlendFactor      = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle having ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending
    * (ONE/ZERO with ADD is a no-op write of the source color).
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1581 
1582 /**
1583  * The pipe->create_blend_state() driver hook.
1584  *
1585  * Translates a pipe_blend_state into crocus_blend_state.
1586  */
1587 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1588 crocus_create_blend_state(struct pipe_context *ctx,
1589                           const struct pipe_blend_state *state)
1590 {
1591    struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1592 
1593    cso->blend_enables = 0;
1594    cso->color_write_enables = 0;
1595    STATIC_ASSERT(ELK_MAX_DRAW_BUFFERS <= 8);
1596 
1597    cso->cso = *state;
1598    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1599 
1600 #if GFX_VER == 8
1601    bool indep_alpha_blend = false;
1602 #endif
1603    for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
1604       const struct pipe_rt_blend_state *rt =
1605          &state->rt[state->independent_blend_enable ? i : 0];
1606       if (rt->blend_enable)
1607          cso->blend_enables |= 1u << i;
1608       if (rt->colormask)
1609          cso->color_write_enables |= 1u << i;
1610 #if GFX_VER == 8
1611       enum pipe_blendfactor src_rgb =
1612          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1613       enum pipe_blendfactor src_alpha =
1614          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1615       enum pipe_blendfactor dst_rgb =
1616          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1617       enum pipe_blendfactor dst_alpha =
1618          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1619 
1620       if (rt->rgb_func != rt->alpha_func ||
1621           src_rgb != src_alpha || dst_rgb != dst_alpha)
1622          indep_alpha_blend = true;
1623 #endif
1624    }
1625 
1626 #if GFX_VER == 8
1627    crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1628       /* pb.HasWriteableRT is filled in at draw time.
1629        * pb.AlphaTestEnable is filled in at draw time.
1630        *
1631        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1632        * setting it when dual color blending without an appropriate shader.
1633        */
1634 
1635       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1636       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1637 
1638       /* The casts prevent warnings about implicit enum type conversions. */
1639       pb.SourceBlendFactor =
1640          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1641       pb.SourceAlphaBlendFactor =
1642          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1643       pb.DestinationBlendFactor =
1644          (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1645       pb.DestinationAlphaBlendFactor =
1646          (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1647    }
1648 #endif
1649    return cso;
1650 }
1651 
1652 /**
1653  * The pipe->bind_blend_state() driver hook.
1654  *
1655  * Bind a blending CSO and flag related dirty bits.
1656  */
1657 static void
crocus_bind_blend_state(struct pipe_context * ctx,void * state)1658 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1659 {
1660    struct crocus_context *ice = (struct crocus_context *) ctx;
1661    struct crocus_blend_state *cso = state;
1662 
1663    ice->state.cso_blend = cso;
1664    ice->state.blend_enables = cso ? cso->blend_enables : 0;
1665 
1666    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1667    ice->state.dirty |= CROCUS_DIRTY_WM;
1668 #if GFX_VER >= 6
1669    ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1670 #endif
1671 #if GFX_VER >= 7
1672    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1673 #endif
1674 #if GFX_VER == 8
1675    ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1676    ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1677 #endif
1678    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1679    ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1680    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1681 }
1682 
1683 /**
1684  * Return true if the FS writes to any color outputs which are not disabled
1685  * via color masking.
1686  */
1687 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1688 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1689                  const struct shader_info *fs_info)
1690 {
1691    if (!fs_info)
1692       return false;
1693 
1694    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1695 
1696    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1697       rt_outputs = (1 << ELK_MAX_DRAW_BUFFERS) - 1;
1698 
1699    return cso_blend->color_write_enables & rt_outputs;
1700 }
1701 
1702 /**
1703  * Gallium CSO for depth, stencil, and alpha testing state.
1704  */
struct crocus_depth_stencil_alpha_state {
   /** Copy of the Gallium depth/stencil/alpha state. */
   struct pipe_depth_stencil_alpha_state cso;

   /** Does the depth writemask allow depth writes? (for resolve tracking) */
   bool depth_writes_enabled;
   /** Can any enabled stencil face write? (front, or back if two-sided) */
   bool stencil_writes_enabled;
};
1711 
1712 /**
1713  * The pipe->create_depth_stencil_alpha_state() driver hook.
1714  *
1715  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1716  * testing state since we need pieces of it in a variety of places.
1717  */
1718 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1719 crocus_create_zsa_state(struct pipe_context *ctx,
1720                         const struct pipe_depth_stencil_alpha_state *state)
1721 {
1722    struct crocus_depth_stencil_alpha_state *cso =
1723       malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1724 
1725    bool two_sided_stencil = state->stencil[1].enabled;
1726    cso->cso = *state;
1727 
1728    cso->depth_writes_enabled = state->depth_writemask;
1729    cso->stencil_writes_enabled =
1730       state->stencil[0].writemask != 0 ||
1731       (two_sided_stencil && state->stencil[1].writemask != 0);
1732 
1733    /* The state tracker needs to optimize away EQUAL writes for us. */
1734    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1735 
1736    return cso;
1737 }
1738 
1739 /**
1740  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1741  *
1742  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1743  */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* cso_changed() compares a field of old_cso vs. new_cso so we only
       * flag the packets that actually consume the changed value.
       */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      /* Depth-write changes can require depth resolves/flushes. */
      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      /* Gen4/5 pack depth/stencil state into COLOR_CALC_STATE. */
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
1790 
#if GFX_VER == 8
/**
 * Evaluate whether the current state satisfies the Gfx8 depth PMA fix
 * equation (see the CACHE_MODE_1 references below).  The result is fed to
 * genX(crocus_update_pma_fix) whenever relevant state changes.
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct elk_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1914 void
genX(crocus_update_pma_fix)1915 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1916                             struct crocus_batch *batch,
1917                             bool enable)
1918 {
1919 #if GFX_VER == 8
1920    struct crocus_genx_state *genx = ice->state.genx;
1921 
1922    if (genx->pma_fix_enabled == enable)
1923       return;
1924 
1925    genx->pma_fix_enabled = enable;
1926 
1927    /* According to the Broadwell PIPE_CONTROL documentation, software should
1928     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1929     * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
1930     *
1931     * The Gfx9 docs say to use a depth stall rather than a command streamer
1932     * stall.  However, the hardware seems to violently disagree.  A full
1933     * command streamer stall seems to be needed in both cases.
1934     */
1935    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1936                                   PIPE_CONTROL_CS_STALL |
1937                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1938                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1939 
1940    crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1941       reg.NPPMAFixEnable = enable;
1942       reg.NPEarlyZFailsDisable = enable;
1943       reg.NPPMAFixEnableMask = true;
1944       reg.NPEarlyZFailsDisableMask = true;
1945    }
1946 
1947    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1948     * Flush bits is often necessary.  We do it regardless because it's easier.
1949     * The render cache flush is also necessary if stencil writes are enabled.
1950     *
1951     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1952     * flushes seem to work just as well.
1953     */
1954    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1955                                   PIPE_CONTROL_DEPTH_STALL |
1956                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1957                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1958 #endif
1959 }
1960 
1961 static float
get_line_width(const struct pipe_rasterizer_state * state)1962 get_line_width(const struct pipe_rasterizer_state *state)
1963 {
1964    float line_width = state->line_width;
1965 
1966    /* From the OpenGL 4.4 spec:
1967     *
1968     * "The actual width of non-antialiased lines is determined by rounding
1969     *  the supplied width to the nearest integer, then clamping it to the
1970     *  implementation-dependent maximum non-antialiased line width."
1971     */
1972    if (!state->multisample && !state->line_smooth)
1973       line_width = roundf(state->line_width);
1974 
1975    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1976       /* For 1 pixel line thickness or less, the general anti-aliasing
1977        * algorithm gives up, and a garbage line is generated.  Setting a
1978        * Line Width of 0.0 specifies the rasterization of the "thinnest"
1979        * (one-pixel-wide), non-antialiased lines.
1980        *
1981        * Lines rendered with zero Line Width are rasterized using the
1982        * "Grid Intersection Quantization" rules as specified by the
1983        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1984        */
1985       /* hack around this for gfx4/5 fps counters in hud. */
1986       line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1987    }
1988    return line_width;
1989 }
1990 
1991 /**
1992  * The pipe->create_rasterizer_state() driver hook.
1993  */
1994 static void *
crocus_create_rasterizer_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)1995 crocus_create_rasterizer_state(struct pipe_context *ctx,
1996                                const struct pipe_rasterizer_state *state)
1997 {
1998    struct crocus_rasterizer_state *cso =
1999       malloc(sizeof(struct crocus_rasterizer_state));
2000 
2001    cso->fill_mode_point_or_line =
2002       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2003       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2004       state->fill_back == PIPE_POLYGON_MODE_LINE ||
2005       state->fill_back == PIPE_POLYGON_MODE_POINT;
2006 
2007    if (state->clip_plane_enable != 0)
2008       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2009    else
2010       cso->num_clip_plane_consts = 0;
2011 
2012    cso->cso = *state;
2013 
2014 #if GFX_VER >= 6
2015    float line_width = get_line_width(state);
2016 
2017    crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2018       sf.StatisticsEnable = true;
2019       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2020       sf.LineEndCapAntialiasingRegionWidth =
2021          state->line_smooth ? _10pixels : _05pixels;
2022       sf.LastPixelEnable = state->line_last_pixel;
2023 #if GFX_VER <= 7
2024       sf.AntialiasingEnable = state->line_smooth;
2025 #endif
2026 #if GFX_VER == 8
2027       struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2028       if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2029          sf.CHVLineWidth = line_width;
2030       else
2031          sf.LineWidth = line_width;
2032 #else
2033       sf.LineWidth = line_width;
2034 #endif
2035       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2036       sf.PointWidth = state->point_size;
2037 
2038       if (state->flatshade_first) {
2039          sf.TriangleFanProvokingVertexSelect = 1;
2040       } else {
2041          sf.TriangleStripListProvokingVertexSelect = 2;
2042          sf.TriangleFanProvokingVertexSelect = 2;
2043          sf.LineStripListProvokingVertexSelect = 1;
2044       }
2045 
2046 #if GFX_VER == 6
2047       sf.AttributeSwizzleEnable = true;
2048       if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2049          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2050       else
2051          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2052 #endif
2053 
2054 #if GFX_VER <= 7
2055       sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2056 
2057 #if GFX_VER >= 6
2058       sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2059       sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2060       sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2061       sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2062       sf.GlobalDepthOffsetScale = state->offset_scale;
2063       sf.GlobalDepthOffsetClamp = state->offset_clamp;
2064 
2065       sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2066       sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2067 #endif
2068 
2069       sf.CullMode = translate_cull_mode(state->cull_face);
2070       sf.ScissorRectangleEnable = true;
2071 
2072 #if GFX_VERx10 == 75
2073       sf.LineStippleEnable = state->line_stipple_enable;
2074 #endif
2075 #endif
2076    }
2077 #endif
2078 
2079 #if GFX_VER == 8
2080    crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2081       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2082       rr.CullMode = translate_cull_mode(state->cull_face);
2083       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2084       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2085       rr.DXMultisampleRasterizationEnable = state->multisample;
2086       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2087       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2088       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2089       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2090       rr.GlobalDepthOffsetScale = state->offset_scale;
2091       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2092       rr.SmoothPointEnable = state->point_smooth;
2093       rr.AntialiasingEnable = state->line_smooth;
2094       rr.ScissorRectangleEnable = state->scissor;
2095       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2096    }
2097 #endif
2098 
2099 #if GFX_VER >= 6
2100    crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2101       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2102        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2103        */
2104 #if GFX_VER >= 7
2105       cl.EarlyCullEnable = true;
2106 #endif
2107 
2108 #if GFX_VER == 7
2109       cl.FrontWinding = state->front_ccw ? 1 : 0;
2110       cl.CullMode = translate_cull_mode(state->cull_face);
2111 #endif
2112       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2113 #if GFX_VER < 8
2114       cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2115 #endif
2116       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2117       cl.GuardbandClipTestEnable = true;
2118       cl.ClipEnable = true;
2119       cl.MinimumPointWidth = 0.125;
2120       cl.MaximumPointWidth = 255.875;
2121 
2122 #if GFX_VER == 8
2123       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2124 #endif
2125 
2126       if (state->flatshade_first) {
2127          cl.TriangleFanProvokingVertexSelect = 1;
2128       } else {
2129          cl.TriangleStripListProvokingVertexSelect = 2;
2130          cl.TriangleFanProvokingVertexSelect = 2;
2131          cl.LineStripListProvokingVertexSelect = 1;
2132       }
2133    }
2134 #endif
2135 
2136    /* Remap from 0..255 back to 1..256 */
2137    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2138 
2139    crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2140       if (state->line_stipple_enable) {
2141          line.LineStipplePattern = state->line_stipple_pattern;
2142          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2143          line.LineStippleRepeatCount = line_stipple_factor;
2144       }
2145    }
2146 
2147    return cso;
2148 }
2149 
2150 /**
2151  * The pipe->bind_rasterizer_state() driver hook.
2152  *
2153  * Bind a rasterizer CSO and flag related dirty bits.
2154  */
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
   struct crocus_rasterizer_state *new_cso = state;

   if (new_cso) {
      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
      if (cso_changed_memcmp(line_stipple))
         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
#if GFX_VER >= 6
      /* cso_changed() compares old_cso vs. new_cso fields so only packets
       * that consume the changed value get re-emitted.
       */
      if (cso_changed(cso.half_pixel_center))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
      if (cso_changed(cso.multisample))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#else
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif

      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
         ice->state.dirty |= CROCUS_DIRTY_WM;

#if GFX_VER >= 6
      if (cso_changed(cso.rasterizer_discard))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;

      if (cso_changed(cso.flatshade_first))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif

      /* Depth clip / clip-halfz affect the depth range in CC_VIEWPORT. */
      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
          cso_changed(cso.clip_halfz))
         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;

#if GFX_VER >= 7
      if (cso_changed(cso.sprite_coord_enable) ||
          cso_changed(cso.sprite_coord_mode) ||
          cso_changed(cso.light_twoside))
         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
      /* Gen4/5 pass clip planes to the clip program via the CURBE. */
      if (cso_changed(cso.clip_plane_enable))
         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   }

   /* Raster and clip packets always consume the rasterizer CSO. */
   ice->state.cso_rast = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
   ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}
2217 
2218 /**
2219  * Return true if the given wrap mode requires the border color to exist.
2220  *
2221  * (We can skip uploading it if the sampler isn't going to use it.)
2222  */
2223 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2224 wrap_mode_needs_border_color(unsigned wrap_mode)
2225 {
2226 #if GFX_VER == 8
2227    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2228 #else
2229    return wrap_mode == TCM_CLAMP_BORDER;
2230 #endif
2231 }
2232 
2233 /**
2234  * Gallium CSO for sampler state.
2235  */
struct crocus_sampler_state {
   /** Copy of the Gallium sampler state. */
   struct pipe_sampler_state pstate;
   /** Border color, copied so it can be uploaded at bind time. */
   union pipe_color_union border_color;
   /** True if any wrap mode samples the border color (skip upload if not). */
   bool needs_border_color;
   /** Wrap modes translated to hardware TCM_* values. */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /** Mag filter / min LOD after the no-mipmap workaround (see create hook). */
   unsigned mag_img_filter;
   float min_lod;
};
2246 
2247 /**
2248  * The pipe->create_sampler_state() driver hook.
2249  *
2250  * We fill out SAMPLER_STATE (except for the border color pointer), and
2251  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2252  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
2254  */
static void *
crocus_create_sampler_state(struct pipe_context *ctx,
                            const struct pipe_sampler_state *state)
{
   struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);

   if (!cso)
      return NULL;

   /* Filter enums match the hardware encodings directly. */
   STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
   STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);

   /* Wrap-mode translation differs when either filter is nearest. */
   bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
   cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
   cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
   cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);

   cso->pstate = *state;

   memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));

   /* Only upload a border color if some wrap mode actually samples it. */
   cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
                             wrap_mode_needs_border_color(cso->wrap_t) ||
                             wrap_mode_needs_border_color(cso->wrap_r);

   cso->min_lod = state->min_lod;
   cso->mag_img_filter = state->mag_img_filter;

   // XXX: explain this code ported from ilo...I don't get it at all...
   /* NOTE(review): inherited workaround — when mipmap filtering is off but
    * min_lod > 0, it appears to force LOD 0 and use the minification filter
    * for magnification; presumably the hardware would otherwise still apply
    * the positive LOD clamp with MIPFILTER_NONE. TODO: confirm against ilo
    * history / PRM.
    */
   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
       state->min_lod > 0.0f) {
      cso->min_lod = 0.0f;
      cso->mag_img_filter = state->min_img_filter;
   }

   return cso;
}
2293 
2294 /**
2295  * The pipe->bind_sampler_states() driver hook.
2296  */
2297 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2298 crocus_bind_sampler_states(struct pipe_context *ctx,
2299                            enum pipe_shader_type p_stage,
2300                            unsigned start, unsigned count,
2301                            void **states)
2302 {
2303    struct crocus_context *ice = (struct crocus_context *) ctx;
2304    gl_shader_stage stage = stage_from_pipe(p_stage);
2305    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2306 
2307    assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2308 
2309    bool dirty = false;
2310 
2311    for (int i = 0; i < count; i++) {
2312       if (shs->samplers[start + i] != states[i]) {
2313          shs->samplers[start + i] = states[i];
2314          dirty = true;
2315       }
2316    }
2317 
2318    if (dirty) {
2319 #if GFX_VER <= 5
2320       if (p_stage == PIPE_SHADER_FRAGMENT)
2321          ice->state.dirty |= CROCUS_DIRTY_WM;
2322       else if (p_stage == PIPE_SHADER_VERTEX)
2323          ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2324 #endif
2325       ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2326       ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2327    }
2328 }
2329 
/**
 * Wrap-mode overrides applied while packing SAMPLER_STATE
 * (see crocus_upload_sampler_state()).
 */
enum samp_workaround {
   SAMP_NORMAL,       /* no override; use the CSO's translated wrap modes */
   SAMP_CUBE_CLAMP,   /* force all three coordinates to TCM_CLAMP */
   SAMP_CUBE_CUBE,    /* force all three coordinates to TCM_CUBE */
   SAMP_T_WRAP,       /* force wrap_t to TCM_WRAP (1D sampling quirk) */
};
2336 
/**
 * Pack a single SAMPLER_STATE structure into the memory at \p map.
 *
 * \param batch               used on Gen4/5 to emit a relocation for the
 *                            border color pointer.
 * \param cso                 the sampler CSO being uploaded.
 * \param border_color_offset dynamic-state offset of the previously
 *                            streamed SAMPLER_BORDER_COLOR_STATE
 *                            (0 when the sampler needs none).
 * \param samp_workaround     wrap-mode override (see enum samp_workaround).
 * \param first_level         view's first miplevel; only consumed by the
 *                            Gen6 BaseMipLevel field.
 * \param map                 CPU destination for the packed state.
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Apply the caller-requested wrap override (1D / cube map quirks --
    * see crocus_upload_sampler_states() for why each is needed).
    */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         /* The field encodes the ratio as (ratio - 2) / 2, capped at
          * RATIO161 (16:1).
          */
         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* LOD fields clamp at 14 on Gen7+, 13 on earlier generations. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

#if GFX_VER < 6
      /* Gen4/5 need a relocation into the batch's state buffer... */
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      /* ...later gens take a plain dynamic-state offset. */
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2438 
/**
 * Stream out a SAMPLER_BORDER_COLOR_STATE for \p cso, storing its
 * dynamic-state offset in \p *bc_offset.
 *
 * \param tex may be NULL; when present it is used to detect faked A/LA
 *            formats (which need the border color re-swizzled) and pure
 *            integer formats (which need special packing on Haswell).
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         /* A -> faked R: border becomes (0, 0, 0, A-as-seen-via-000R). */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         /* LA -> faked RG: replicate L, keep A where the R00G read
          * swizzle will find it. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Alignment requirements vary: 64B on Gen8, 512B for Haswell integer
    * border colors, 32B otherwise. */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Helper macros so BORDER_COLOR_ATTR below can assign with an optional
 * narrowing cast per channel width. */
#define ASSIGN(dst, src)                        \
   do {                                         \
      dst = src;                                \
   } while (0)

#define ASSIGNu16(dst, src)                     \
   do {                                         \
      dst = (uint16_t)src;                      \
   } while (0)

#define ASSIGNu8(dst, src)                      \
   do {                                         \
      dst = (uint8_t)src;                       \
   } while (0)

/* Expand to four per-channel assignments for one field family. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
   macro(state.BorderColor ## _color_type ## Red, src[0]);      \
   macro(state.BorderColor ## _color_type ## Green, src[1]);    \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);     \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      /* Pack into the field family matching the channel bit width. */
      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gen5/6 store the border color in every representation at once;
    * fill out each family from the float values. */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src)            \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* Derive the 8-bit snorm values from the 16-bit ones. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2586 
2587 /**
2588  * Upload the sampler states into a contiguous area of GPU memory, for
2589  * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2590  *
2591  * Also fill out the border color state pointers.
2592  */
2593 static void
crocus_upload_sampler_states(struct crocus_context * ice,struct crocus_batch * batch,gl_shader_stage stage)2594 crocus_upload_sampler_states(struct crocus_context *ice,
2595                              struct crocus_batch *batch, gl_shader_stage stage)
2596 {
2597    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2598    const struct shader_info *info = crocus_get_shader_info(ice, stage);
2599 
2600    /* We assume the state tracker will call pipe->bind_sampler_states()
2601     * if the program's number of textures changes.
2602     */
2603    unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2604 
2605    if (!count)
2606       return;
2607 
2608    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2609     * in the dynamic state memory zone, so we can point to it via the
2610     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2611     */
2612    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2613    uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2614 
2615    if (unlikely(!map))
2616       return;
2617 
2618    for (int i = 0; i < count; i++) {
2619       struct crocus_sampler_state *state = shs->samplers[i];
2620       struct crocus_sampler_view *tex = shs->textures[i];
2621 
2622       if (!state || !tex) {
2623          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2624       } else {
2625          unsigned border_color_offset = 0;
2626          if (state->needs_border_color) {
2627             crocus_upload_border_color(batch, state, tex, &border_color_offset);
2628          }
2629 
2630          enum samp_workaround wa = SAMP_NORMAL;
2631          /* There's a bug in 1D texture sampling - it actually pays
2632           * attention to the wrap_t value, though it should not.
2633           * Override the wrap_t value here to GL_REPEAT to keep
2634           * any nonexistent border pixels from floating in.
2635           */
2636          if (tex->base.target == PIPE_TEXTURE_1D)
2637             wa = SAMP_T_WRAP;
2638          else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2639                   tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2640             /* Cube maps must use the same wrap mode for all three coordinate
2641              * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
2642              *
2643              * Ivybridge and Baytrail seem to have problems with CUBE mode and
2644              * integer formats.  Fall back to CLAMP for now.
2645              */
2646             if (state->pstate.seamless_cube_map &&
2647                 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2648                wa = SAMP_CUBE_CUBE;
2649             else
2650                wa = SAMP_CUBE_CLAMP;
2651          }
2652 
2653          uint32_t first_level = 0;
2654          if (tex->base.target != PIPE_BUFFER)
2655             first_level = tex->base.u.tex.first_level;
2656 
2657          crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2658       }
2659 
2660       map += GENX(SAMPLER_STATE_length);
2661    }
2662 }
2663 
/**
 * The pipe->create_sampler_view() driver hook.
 *
 * Wraps a resource in a crocus_sampler_view: resolves depth/stencil
 * sub-resources, combines format-fake and user swizzles, and fills out
 * the isl_view (plus a second gather view for per-gen gather4 quirks).
 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   /* For combined depth/stencil, point at whichever sub-resource (Z or S)
    * the view's format actually samples. */
   if (util_format_is_depth_or_stencil(tmpl->format)) {
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;

      /* On Gen7, sample stencil through its shadow copy when present. */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (GFX_VER == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Compose the user's view swizzle with the format's fake swizzles. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (GFX_VER < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      /* Haswell+ can apply the swizzle in SURFACE_STATE. */
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;

      /* Hardware older than skylake ignores this value */
      assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);

      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* NOTE(review): presumably a gather4 workaround for RG32 formats on
    * Gen7 -- gather reads use the _LD variant and re-apply the swizzle.
    * Confirm against the original i965/elk workaround. */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value.  For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif

   return &isv->base;
}
2806 
2807 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2808 crocus_sampler_view_destroy(struct pipe_context *ctx,
2809                             struct pipe_sampler_view *state)
2810 {
2811    struct crocus_sampler_view *isv = (void *) state;
2812    pipe_resource_reference(&state->texture, NULL);
2813    free(isv);
2814 }
2815 
/**
 * The pipe->create_surface() driver hook.
 *
 * In Gallium nomenclature, "surfaces" are a view of a resource that
 * can be bound as a render target or depth/stencil buffer.
 */
static struct pipe_surface *
crocus_create_surface(struct pipe_context *ctx,
                      struct pipe_resource *tex,
                      const struct pipe_surface *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;

   /* Pick the ISL usage based on how the surface will be bound. */
   isl_surf_usage_flags_t usage = 0;
   if (tmpl->writable)
      usage = ISL_SURF_USAGE_STORAGE_BIT;
   else if (util_format_is_depth_or_stencil(tmpl->format))
      usage = ISL_SURF_USAGE_DEPTH_BIT;
   else
      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
      /* Framebuffer validation will reject this invalid case, but it
       * hasn't had the opportunity yet.  In the meantime, we need to
       * avoid hitting ISL asserts about unsupported formats below.
       */
      return NULL;
   }

   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
   struct pipe_surface *psurf = &surf->base;
   struct crocus_resource *res = (struct crocus_resource *) tex;

   if (!surf)
      return NULL;

   /* Initialize the gallium base surface. */
   pipe_reference_init(&psurf->reference, 1);
   pipe_resource_reference(&psurf->texture, tex);
   psurf->context = ctx;
   psurf->format = tmpl->format;
   psurf->width = tex->width0;
   psurf->height = tex->height0;
   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
   psurf->u.tex.level = tmpl->u.tex.level;

   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;

   struct isl_view *view = &surf->view;
   *view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = usage,
   };

#if GFX_VER >= 6
   /* A second view for sampling from the surface (e.g. for blits). */
   struct isl_view *read_view = &surf->read_view;
   *read_view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
   };
#endif

   surf->clear_color = res->aux.clear_color;

   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
                          ISL_SURF_USAGE_STENCIL_BIT))
      return psurf;

   if (!isl_format_is_compressed(res->surf.format)) {
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
      uint64_t temp_offset;
      uint32_t temp_x, temp_y;

      /* Find the intra-tile offsets of the selected level/layer; for 3D
       * textures the "layer" is a Z slice. */
      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
                                          res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
                                          res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
                                          &temp_offset, &temp_x, &temp_y);
      if (!devinfo->has_surface_tile_offset &&
          (temp_x || temp_y)) {
         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
          * destination.
          */
         /* move to temp */
         struct pipe_resource wa_templ = (struct pipe_resource) {
            .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
            .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
            .depth0 = 1,
            .array_size = 1,
            .format = res->base.b.format,
            .target = PIPE_TEXTURE_2D,
            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
         };
         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
         view->base_level = 0;
         view->base_array_layer = 0;
         view->array_len = 1;
         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
      }
      return psurf;
   }

   /* The resource has a compressed format, which is not renderable, but we
    * have a renderable view format.  We must be attempting to upload blocks
    * of compressed data via an uncompressed view.
    *
    * In this case, we can assume there are no auxiliary buffers, a single
    * miplevel, and that the resource is single-sampled.  Gallium may try
    * and create an uncompressed view with multiple layers, however.
    */
   assert(!isl_format_is_compressed(fmt.fmt));
   assert(res->surf.samples == 1);
   assert(view->levels == 1);

   /* TODO: compressed pbo uploads aren't working here */
   /* NOTE: this early return deliberately disables the code below until
    * the TODO above is resolved; everything after it is unreachable. */
   pipe_surface_reference(&psurf, NULL);
   return NULL;

   uint64_t offset_B = 0;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (view->base_level > 0) {
      /* We can't rely on the hardware's miplevel selection with such
       * a substantial lie about the format, so we select a single image
       * using the Tile X/Y Offset fields.  In this case, we can't handle
       * multiple array slices.
       *
       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
       * hard-coded to align to exactly the block size of the compressed
       * texture.  This means that, when reinterpreted as a non-compressed
       * texture, the tile offsets may be anything and we can't rely on
       * X/Y Offset.
       *
       * Return NULL to force the state tracker to take fallback paths.
       */
      // TODO: check if the gen7 check is right, originally gen8
      if (view->array_len > 1 || GFX_VER == 7) {
         pipe_surface_reference(&psurf, NULL);
         return NULL;
      }

      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
                              view->base_level,
                              is_3d ? 0 : view->base_array_layer,
                              is_3d ? view->base_array_layer : 0,
                              &surf->surf,
                              &offset_B, &tile_x_sa, &tile_y_sa);

      /* We use address and tile offsets to access a single level/layer
       * as a subimage, so reset level/layer so it doesn't offset again.
       */
      view->base_array_layer = 0;
      view->base_level = 0;
   } else {
      /* Level 0 doesn't require tile offsets, and the hardware can find
       * array slices using QPitch even with the format override, so we
       * can allow layers in this case.  Copy the original ISL surface.
       */
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
   }

   /* Scale down the image dimensions by the block size. */
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(res->surf.format);
   surf->surf.format = fmt.fmt;
   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
   tile_x_sa /= fmtl->bw;
   tile_y_sa /= fmtl->bh;

   psurf->width = surf->surf.logical_level0_px.width;
   psurf->height = surf->surf.logical_level0_px.height;

   return psurf;
}
3008 
3009 #if GFX_VER >= 7
3010 static void
fill_default_image_param(struct isl_image_param * param)3011 fill_default_image_param(struct isl_image_param *param)
3012 {
3013    memset(param, 0, sizeof(*param));
3014    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3015     * See emit_address_calculation() in elk_fs_surface_builder.cpp for a more
3016     * detailed explanation of these parameters.
3017     */
3018    param->swizzling[0] = 0xff;
3019    param->swizzling[1] = 0xff;
3020 }
3021 
3022 static void
fill_buffer_image_param(struct isl_image_param * param,enum pipe_format pfmt,unsigned size)3023 fill_buffer_image_param(struct isl_image_param *param,
3024                         enum pipe_format pfmt,
3025                         unsigned size)
3026 {
3027    const unsigned cpp = util_format_get_blocksize(pfmt);
3028 
3029    fill_default_image_param(param);
3030    param->size[0] = size / cpp;
3031    param->stride[0] = cpp;
3032 }
3033 
3034 #endif
3035 
/**
 * The pipe->set_shader_images() driver hook.
 *
 * Binds storage image views for slots [start_slot, start_slot + count) of
 * the given shader stage, filling out both the Gallium-side view state and
 * the isl_image_param blocks that are later uploaded as shader system
 * values (see upload_sysvals).  The body is compiled out entirely before
 * Gen7.
 *
 * NOTE(review): unbind_num_trailing_slots is not used here -- confirm
 * whether trailing slots need to be explicitly unbound.
 */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct isl_image_param *image_params = genx->shaders[stage].image_param;

   /* Clear the bound bit for every slot we're about to (re)write. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         /* Track where this resource is bound so later edits to it can
          * flag the right dirty bits.
          */
         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* On Gen8, try to use typed surfaces reads (which support a
             * limited number of formats), and if not possible, fall back
             * to untyped reads.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: build an isl_view for the bound level/layers
             * and derive the image params from the surface layout.
             */
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no levels/layers; mark the bound byte range
             * of the buffer as containing valid data.
             */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind: drop our reference and reset the param block. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Broadwell also needs isl_image_params re-uploaded */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}
3132 
3133 
/**
 * The pipe->set_sampler_views() driver hook.
 *
 * Binds sampler views for slots [start, start + count) of the given stage.
 * A NULL entry (or a NULL views array) unbinds a slot.  When take_ownership
 * is set, the caller's reference on each view is transferred to us rather
 * than a new one being taken.
 *
 * NOTE(review): unbind_num_trailing_slots is not used here -- confirm
 * whether trailing slots need to be explicitly unbound.
 */
static void
crocus_set_sampler_views(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         bool take_ownership,
                         struct pipe_sampler_view **views)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   /* Clear the bound bit for every slot we're about to (re)write. */
   shs->bound_sampler_views &= ~u_bit_consecutive(start, count);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_sampler_view *pview = views ? views[i] : NULL;

      if (take_ownership) {
         /* Release our old reference, then adopt the caller's reference
          * directly instead of incrementing the refcount.
          */
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], NULL);
         shs->textures[start + i] = (struct crocus_sampler_view *)pview;
      } else {
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], pview);
      }

      struct crocus_sampler_view *view = (void *) pview;
      if (view) {
         /* Track where this resource is bound for later dirty tracking. */
         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
         view->res->bind_stages |= 1 << stage;

         shs->bound_sampler_views |= 1 << (start + i);
      }
   }
#if GFX_VER == 6
   /* first level parameters to crocus_upload_sampler_state is gfx6 only */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
#endif
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}
3181 
3182 /**
3183  * The pipe->set_tess_state() driver hook.
3184  */
3185 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3186 crocus_set_tess_state(struct pipe_context *ctx,
3187                       const float default_outer_level[4],
3188                       const float default_inner_level[2])
3189 {
3190    struct crocus_context *ice = (struct crocus_context *) ctx;
3191    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3192 
3193    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3194    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3195 
3196    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3197    shs->sysvals_need_upload = true;
3198 }
3199 
3200 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3201 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3202 {
3203    struct crocus_context *ice = (struct crocus_context *) ctx;
3204 
3205    ice->state.patch_vertices = patch_vertices;
3206 }
3207 
3208 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3209 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3210 {
3211    struct crocus_surface *surf = (void *) p_surf;
3212    pipe_resource_reference(&p_surf->texture, NULL);
3213 
3214    pipe_resource_reference(&surf->align_res, NULL);
3215    free(surf);
3216 }
3217 
3218 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3219 crocus_set_clip_state(struct pipe_context *ctx,
3220                       const struct pipe_clip_state *state)
3221 {
3222    struct crocus_context *ice = (struct crocus_context *) ctx;
3223    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3224    struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3225    struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3226 
3227    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3228 
3229 #if GFX_VER <= 5
3230    ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3231 #endif
3232    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3233                              CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3234    shs->sysvals_need_upload = true;
3235    gshs->sysvals_need_upload = true;
3236    tshs->sysvals_need_upload = true;
3237 }
3238 
3239 /**
3240  * The pipe->set_polygon_stipple() driver hook.
3241  */
3242 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3243 crocus_set_polygon_stipple(struct pipe_context *ctx,
3244                            const struct pipe_poly_stipple *state)
3245 {
3246    struct crocus_context *ice = (struct crocus_context *) ctx;
3247    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3248    ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3249 }
3250 
3251 /**
3252  * The pipe->set_sample_mask() driver hook.
3253  */
3254 static void
crocus_set_sample_mask(struct pipe_context * ctx,unsigned sample_mask)3255 crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3256 {
3257    struct crocus_context *ice = (struct crocus_context *) ctx;
3258 
3259    /* We only support 16x MSAA, so we have 16 bits of sample maks.
3260     * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3261     */
3262    ice->state.sample_mask = sample_mask & 0xff;
3263    ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3264 }
3265 
/**
 * Compute the effective scissor rectangle for viewport @idx.
 *
 * Starts from the viewport's screen-space extent clamped to the
 * framebuffer, then intersects with the user scissor rectangle when
 * rasterizer scissoring is enabled.  The result uses inclusive min/max
 * coordinates.
 */
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
                         int idx,
                         struct pipe_scissor_state *ss)
{
   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
   /* |scale| is the viewport half-extent and translate its center, so
    * center -/+ |scale| gives the viewport edges; clamp to [0, fb size]
    * and subtract 1 from the maximums to make them inclusive.
    */
   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
   };
   if (cso_state->scissor) {
      /* Intersect with the user-specified scissor rectangle. */
      struct pipe_scissor_state *s = &ice->state.scissors[idx];
      scissor.minx = MAX2(scissor.minx, s->minx);
      scissor.miny = MAX2(scissor.miny, s->miny);
      scissor.maxx = MIN2(scissor.maxx, s->maxx);
      scissor.maxy = MIN2(scissor.maxy, s->maxy);
   }
   *ss = scissor;
}
3289 
3290 /**
3291  * The pipe->set_scissor_states() driver hook.
3292  *
3293  * This corresponds to our SCISSOR_RECT state structures.  It's an
3294  * exact match, so we just store them, and memcpy them out later.
3295  */
3296 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3297 crocus_set_scissor_states(struct pipe_context *ctx,
3298                           unsigned start_slot,
3299                           unsigned num_scissors,
3300                           const struct pipe_scissor_state *rects)
3301 {
3302    struct crocus_context *ice = (struct crocus_context *) ctx;
3303 
3304    for (unsigned i = 0; i < num_scissors; i++) {
3305       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3306          /* If the scissor was out of bounds and got clamped to 0 width/height
3307           * at the bounds, the subtraction of 1 from maximums could produce a
3308           * negative number and thus not clip anything.  Instead, just provide
3309           * a min > max scissor inside the bounds, which produces the expected
3310           * no rendering.
3311           */
3312          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3313             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3314          };
3315       } else {
3316          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3317             .minx = rects[i].minx,     .miny = rects[i].miny,
3318             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3319          };
3320       }
3321    }
3322 
3323 #if GFX_VER < 6
3324    ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3325 #else
3326    ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3327 #endif
3328    ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3329 
3330 }
3331 
3332 /**
3333  * The pipe->set_stencil_ref() driver hook.
3334  *
3335  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3336  */
3337 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3338 crocus_set_stencil_ref(struct pipe_context *ctx,
3339                        const struct pipe_stencil_ref ref)
3340 {
3341    struct crocus_context *ice = (struct crocus_context *) ctx;
3342    ice->state.stencil_ref = ref;
3343    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3344 }
3345 
#if GFX_VER == 8
/**
 * Return one edge of the viewport along @axis: the viewport center
 * (translate) plus the half-extent (|scale|) applied in the direction
 * of @sign.
 */
static float
viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
{
   const float signed_half_extent = copysignf(state->scale[axis], sign);
   return signed_half_extent + state->translate[axis];
}
#endif
3353 
3354 /**
3355  * The pipe->set_viewport_states() driver hook.
3356  *
3357  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3358  * the guardband yet, as we need the framebuffer dimensions, but we can
3359  * at least fill out the rest.
3360  */
3361 static void
crocus_set_viewport_states(struct pipe_context * ctx,unsigned start_slot,unsigned count,const struct pipe_viewport_state * states)3362 crocus_set_viewport_states(struct pipe_context *ctx,
3363                            unsigned start_slot,
3364                            unsigned count,
3365                            const struct pipe_viewport_state *states)
3366 {
3367    struct crocus_context *ice = (struct crocus_context *) ctx;
3368    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3369 
3370    memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3371 
3372    /* Fix depth test misrenderings by lowering translated depth range */
3373    if (screen->driconf.lower_depth_range_rate != 1.0f)
3374       ice->state.viewports[start_slot].translate[2] *=
3375          screen->driconf.lower_depth_range_rate;
3376 
3377    ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3378    ice->state.dirty |= CROCUS_DIRTY_RASTER;
3379 #if GFX_VER >= 6
3380    ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3381 #endif
3382 
3383    if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3384                                !ice->state.cso_rast->cso.depth_clip_far))
3385       ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3386 }
3387 
/**
 * The pipe->set_framebuffer_state() driver hook.
 *
 * Sets the current draw FBO, including color render targets, depth,
 * and stencil buffers.  Flags only the dirty bits that actually depend
 * on what changed (sample count, layering, dimensions, depth buffer).
 */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

#if GFX_VER >= 6
   /* A sample-count change affects multisample/sample-mask/raster state. */
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Toggling layered rendering on/off affects clip state. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   /* A size change invalidates the viewport/scissor-derived state. */
   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   /* Record whether the new depth buffer has HiZ at the bound miplevel. */
   if (cso->zsbuf) {
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
3472 
/**
 * The pipe->set_constant_buffer() driver hook.
 *
 * This uploads any constant data in user buffers, and references
 * any UBO resources containing constant data.
 *
 * With take_ownership, the caller's reference on input->buffer is
 * transferred to us (handled inside util_copy_constant_buffer).
 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants: copy them into a GPU buffer of our own. */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size so we never read past the end of the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      /* Track where this resource is bound for later dirty tracking. */
      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      shs->bound_cbufs &= ~(1u << index);
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3523 
/**
 * Upload the shader system values for a stage (clip planes, image params,
 * tessellation defaults, workgroup size, ...) into its reserved constant
 * buffer slot.
 *
 * Each system value is a single dword identified by an ELK_PARAM_*
 * encoding from the compiled shader; the values are gathered from driver
 * state and written in order to a freshly-allocated upload buffer.
 */
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* System values always occupy the shader's last constant buffer slot. */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Pull one dword out of the stage's isl_image_param block. */
         unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
         unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
         struct isl_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct isl_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == ELK_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (ELK_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         int plane = ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp  = ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == ELK_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            assert(stage == MESA_SHADER_TESS_EVAL);
            /* The TES sees the TCS output vertex count if a TCS is bound;
             * otherwise fall back to the input patch size.
             */
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}
3600 
3601 /**
3602  * The pipe->set_shader_buffers() driver hook.
3603  *
3604  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
3605  * SURFACE_STATE here, as the buffer offset may change each time.
3606  */
3607 static void
crocus_set_shader_buffers(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,const struct pipe_shader_buffer * buffers,unsigned writable_bitmask)3608 crocus_set_shader_buffers(struct pipe_context *ctx,
3609                           enum pipe_shader_type p_stage,
3610                           unsigned start_slot, unsigned count,
3611                           const struct pipe_shader_buffer *buffers,
3612                           unsigned writable_bitmask)
3613 {
3614    struct crocus_context *ice = (struct crocus_context *) ctx;
3615    gl_shader_stage stage = stage_from_pipe(p_stage);
3616    struct crocus_shader_state *shs = &ice->state.shaders[stage];
3617 
3618    unsigned modified_bits = u_bit_consecutive(start_slot, count);
3619 
3620    shs->bound_ssbos &= ~modified_bits;
3621    shs->writable_ssbos &= ~modified_bits;
3622    shs->writable_ssbos |= writable_bitmask << start_slot;
3623 
3624    for (unsigned i = 0; i < count; i++) {
3625       if (buffers && buffers[i].buffer) {
3626          struct crocus_resource *res = (void *) buffers[i].buffer;
3627          struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3628          pipe_resource_reference(&ssbo->buffer, &res->base.b);
3629          ssbo->buffer_offset = buffers[i].buffer_offset;
3630          ssbo->buffer_size =
3631             MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3632 
3633          shs->bound_ssbos |= 1 << (start_slot + i);
3634 
3635          res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3636          res->bind_stages |= 1 << stage;
3637 
3638          util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3639                         ssbo->buffer_offset + ssbo->buffer_size);
3640       } else {
3641          pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3642       }
3643    }
3644 
3645    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3646 }
3647 
/**
 * Generic pipe->delete_*_state() hook for CSOs that hold no GPU
 * resources: the state object is plain heap memory.
 */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3653 
3654 /**
3655  * The pipe->set_vertex_buffers() driver hook.
3656  *
3657  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3658  */
3659 static void
crocus_set_vertex_buffers(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_buffer * buffers)3660 crocus_set_vertex_buffers(struct pipe_context *ctx,
3661                           unsigned count,
3662                           const struct pipe_vertex_buffer *buffers)
3663 {
3664    struct crocus_context *ice = (struct crocus_context *) ctx;
3665    struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
3666    const unsigned padding =
3667       (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
3668 
3669    util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
3670                                 buffers, count, true);
3671 
3672    for (unsigned i = 0; i < count; i++) {
3673       struct pipe_vertex_buffer *state =
3674          &ice->state.vertex_buffers[i];
3675 
3676       if (!state->is_user_buffer && state->buffer.resource) {
3677          struct crocus_resource *res = (void *)state->buffer.resource;
3678          res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
3679       }
3680 
3681       uint32_t end = 0;
3682       if (state->buffer.resource)
3683          end = state->buffer.resource->width0 + padding;
3684       ice->state.vb_end[i] = end;
3685    }
3686    ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
3687 }
3688 
3689 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3690 static uint8_t get_wa_flags(enum isl_format format)
3691 {
3692    uint8_t wa_flags = 0;
3693 
3694    switch (format) {
3695    case ISL_FORMAT_R10G10B10A2_USCALED:
3696       wa_flags = ELK_ATTRIB_WA_SCALE;
3697       break;
3698    case ISL_FORMAT_R10G10B10A2_SSCALED:
3699       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE;
3700       break;
3701    case ISL_FORMAT_R10G10B10A2_UNORM:
3702       wa_flags = ELK_ATTRIB_WA_NORMALIZE;
3703       break;
3704    case ISL_FORMAT_R10G10B10A2_SNORM:
3705       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE;
3706       break;
3707    case ISL_FORMAT_R10G10B10A2_SINT:
3708       wa_flags = ELK_ATTRIB_WA_SIGN;
3709       break;
3710    case ISL_FORMAT_B10G10R10A2_USCALED:
3711       wa_flags = ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3712       break;
3713    case ISL_FORMAT_B10G10R10A2_SSCALED:
3714       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3715       break;
3716    case ISL_FORMAT_B10G10R10A2_UNORM:
3717       wa_flags = ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3718       break;
3719    case ISL_FORMAT_B10G10R10A2_SNORM:
3720       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3721       break;
3722    case ISL_FORMAT_B10G10R10A2_SINT:
3723       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_BGRA;
3724       break;
3725    case ISL_FORMAT_B10G10R10A2_UINT:
3726       wa_flags = ELK_ATTRIB_WA_BGRA;
3727       break;
3728    default:
3729       break;
3730    }
3731    return wa_flags;
3732 }
3733 #endif
3734 
/**
 * Gallium CSO for vertex elements.
 */
struct crocus_vertex_element_state {
   /* Packed 3DSTATE_VERTEX_ELEMENTS: one header DWord + up to 33 elements. */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Packed 3DSTATE_VF_INSTANCING commands, one per element (Gen8 only). */
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Alternative last VERTEX_ELEMENT_STATE, used at draw time when the
    * vertex shader needs EdgeFlag (see crocus_create_vertex_elements).
    */
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Matching 3DSTATE_VF_INSTANCING for the EdgeFlag element (Gen8 only). */
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Per-buffer instance step rates. */
   uint32_t step_rate[16];
   /* Per-element pre-Haswell attribute workaround flags (see get_wa_flags). */
   uint8_t wa_flags[33];
   /* Per-buffer vertex strides. */
   uint16_t strides[16];
   /* Number of vertex elements in this CSO. */
   unsigned count;
};
3752 
3753 /**
3754  * The pipe->create_vertex_elements() driver hook.
3755  *
3756  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3757  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3758  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3759  * needed. In these cases we will need information available at draw time.
3760  * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3761  * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3762  * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3763  */
3764 static void *
crocus_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)3765 crocus_create_vertex_elements(struct pipe_context *ctx,
3766                               unsigned count,
3767                               const struct pipe_vertex_element *state)
3768 {
3769    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3770    const struct intel_device_info *devinfo = &screen->devinfo;
3771    struct crocus_vertex_element_state *cso =
3772       calloc(1, sizeof(struct crocus_vertex_element_state));
3773 
3774    cso->count = count;
3775 
3776    crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3777       ve.DWordLength =
3778          1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3779    }
3780 
3781    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3782 #if GFX_VER == 8
3783    uint32_t *vfi_pack_dest = cso->vf_instancing;
3784 #endif
3785 
3786    if (count == 0) {
3787       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3788          ve.Valid = true;
3789          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3790          ve.Component0Control = VFCOMP_STORE_0;
3791          ve.Component1Control = VFCOMP_STORE_0;
3792          ve.Component2Control = VFCOMP_STORE_0;
3793          ve.Component3Control = VFCOMP_STORE_1_FP;
3794       }
3795 #if GFX_VER == 8
3796       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3797       }
3798 #endif
3799    }
3800 
3801    for (int i = 0; i < count; i++) {
3802       const struct crocus_format_info fmt =
3803          crocus_format_for_usage(devinfo, state[i].src_format, 0);
3804       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3805                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3806       enum isl_format actual_fmt = fmt.fmt;
3807 
3808 #if GFX_VERx10 < 75
3809       cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3810 
3811       if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3812           fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3813           fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3814           fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3815           fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3816           fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3817           fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3818           fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3819           fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3820           fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3821           fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3822          actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3823       if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3824          actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3825       if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3826          actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3827       if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3828          actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3829       if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3830          actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3831 #endif
3832 
3833       cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3834       cso->strides[state[i].vertex_buffer_index] = state[i].src_stride;
3835 
3836       switch (isl_format_get_num_channels(fmt.fmt)) {
3837       case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3838       case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3839       case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3840       case 3:
3841          comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3842             : VFCOMP_STORE_1_FP;
3843          break;
3844       }
3845       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3846 #if GFX_VER >= 6
3847          ve.EdgeFlagEnable = false;
3848 #endif
3849          ve.VertexBufferIndex = state[i].vertex_buffer_index;
3850          ve.Valid = true;
3851          ve.SourceElementOffset = state[i].src_offset;
3852          ve.SourceElementFormat = actual_fmt;
3853          ve.Component0Control = comp[0];
3854          ve.Component1Control = comp[1];
3855          ve.Component2Control = comp[2];
3856          ve.Component3Control = comp[3];
3857 #if GFX_VER < 5
3858          ve.DestinationElementOffset = i * 4;
3859 #endif
3860       }
3861 
3862 #if GFX_VER == 8
3863       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3864          vi.VertexElementIndex = i;
3865          vi.InstancingEnable = state[i].instance_divisor > 0;
3866          vi.InstanceDataStepRate = state[i].instance_divisor;
3867       }
3868 #endif
3869       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3870 #if GFX_VER == 8
3871       vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3872 #endif
3873    }
3874 
3875    /* An alternative version of the last VE and VFI is stored so it
3876     * can be used at draw time in case Vertex Shader uses EdgeFlag
3877     */
3878    if (count) {
3879       const unsigned edgeflag_index = count - 1;
3880       const struct crocus_format_info fmt =
3881          crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3882       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3883 #if GFX_VER >= 6
3884          ve.EdgeFlagEnable = true;
3885 #endif
3886          ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3887          ve.Valid = true;
3888          ve.SourceElementOffset = state[edgeflag_index].src_offset;
3889          ve.SourceElementFormat = fmt.fmt;
3890          ve.Component0Control = VFCOMP_STORE_SRC;
3891          ve.Component1Control = VFCOMP_STORE_0;
3892          ve.Component2Control = VFCOMP_STORE_0;
3893          ve.Component3Control = VFCOMP_STORE_0;
3894       }
3895 #if GFX_VER == 8
3896       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3897          /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3898           * at draw time, as it should change if SGVs are emitted.
3899           */
3900          vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3901          vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3902       }
3903 #endif
3904    }
3905 
3906    return cso;
3907 }
3908 
3909 /**
3910  * The pipe->bind_vertex_elements_state() driver hook.
3911  */
3912 static void
crocus_bind_vertex_elements_state(struct pipe_context * ctx,void * state)3913 crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
3914 {
3915    struct crocus_context *ice = (struct crocus_context *) ctx;
3916 #if GFX_VER == 8
3917    struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
3918    struct crocus_vertex_element_state *new_cso = state;
3919 
3920    if (new_cso && cso_changed(count))
3921       ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
3922 #endif
3923    ice->state.cso_vertex_elements = state;
3924    ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
3925    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
3926 }
3927 
#if GFX_VER >= 6
/**
 * Bookkeeping for Gen6 software streamout primitive counting.
 *
 * Snapshots of GEN6_SO_NUM_PRIMS_WRITTEN are stored pairwise in a scratch
 * buffer; [offset_start, offset_end) is the byte range of pairs not yet
 * folded into 'accum' (see aggregate_stream_counter).
 */
struct crocus_streamout_counter {
   uint32_t offset_start;
   uint32_t offset_end;

   /* Running total of primitives written, from already-processed pairs. */
   uint64_t accum;
};
3935 
3936 /**
3937  * Gallium CSO for stream output (transform feedback) targets.
3938  */
3939 struct crocus_stream_output_target {
3940    struct pipe_stream_output_target base;
3941 
3942    /** Stride (bytes-per-vertex) during this transform feedback operation */
3943    uint16_t stride;
3944 
3945    /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
3946    bool zeroed;
3947 
3948    struct crocus_resource *offset_res;
3949    uint32_t offset_offset;
3950 
3951 #if GFX_VER == 6
3952    void *prim_map;
3953    struct crocus_streamout_counter prev_count;
3954    struct crocus_streamout_counter count;
3955 #endif
3956 #if GFX_VER == 8
3957    /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
3958    bool zero_offset;
3959 #endif
3960 };
3961 
#if GFX_VER >= 7
/**
 * Read back the hardware-maintained streamout write offset for a target
 * and convert it from a byte offset into a vertex count.
 *
 * The offset lives in a tiny GPU buffer (offset_res/offset_offset) that
 * 3DSTATE_SO_BUFFER teardown stores into; map it directly, read the
 * uint32, and divide by the per-vertex stride.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct pipe_transfer *xfer;
   struct pipe_box box;

   u_box_1d(tgt->offset_offset, 4, &box);

   uint32_t *mapped = so->context->buffer_map(so->context,
                                              &tgt->offset_res->base.b,
                                              0, PIPE_MAP_DIRECTLY,
                                              &box, &xfer);
   assert(mapped);

   const uint32_t bytes_written = *mapped;
   so->context->buffer_unmap(so->context, xfer);

   return bytes_written / tgt->stride;
}
#endif
3981 
#if GFX_VER == 6
/* Forward declaration; the Gen6 prim-count machinery is defined below. */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *count,
                                uint64_t *svbi);

/**
 * Gen6 version: there is no hardware SO write offset register, so derive
 * the resume offset (in vertices) from the accumulated primitive counts
 * recorded at the previous pause.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
#endif
4000 
4001 /**
4002  * The pipe->create_stream_output_target() driver hook.
4003  *
4004  * "Target" here refers to a destination buffer.  We translate this into
4005  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4006  * know which buffer this represents, or whether we ought to zero the
4007  * write-offsets, or append.  Those are handled in the set() hook.
4008  */
4009 static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context * ctx,struct pipe_resource * p_res,unsigned buffer_offset,unsigned buffer_size)4010 crocus_create_stream_output_target(struct pipe_context *ctx,
4011                                    struct pipe_resource *p_res,
4012                                    unsigned buffer_offset,
4013                                    unsigned buffer_size)
4014 {
4015    struct crocus_resource *res = (void *) p_res;
4016    struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
4017    if (!cso)
4018       return NULL;
4019 
4020    res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4021 
4022    pipe_reference_init(&cso->base.reference, 1);
4023    pipe_resource_reference(&cso->base.buffer, p_res);
4024    cso->base.buffer_offset = buffer_offset;
4025    cso->base.buffer_size = buffer_size;
4026    cso->base.context = ctx;
4027 
4028    util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4029                   buffer_offset + buffer_size);
4030 #if GFX_VER >= 7
4031    struct crocus_context *ice = (struct crocus_context *) ctx;
4032    void *temp;
4033    u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
4034                   &cso->offset_offset,
4035                   (struct pipe_resource **)&cso->offset_res,
4036                   &temp);
4037 #endif
4038 
4039    return &cso->base;
4040 }
4041 
4042 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4043 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4044                                     struct pipe_stream_output_target *state)
4045 {
4046    struct crocus_stream_output_target *cso = (void *) state;
4047 
4048    pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4049    pipe_resource_reference(&cso->base.buffer, NULL);
4050 
4051    free(cso);
4052 }
4053 
4054 #define GEN6_SO_NUM_PRIMS_WRITTEN       0x2288
4055 #define GEN7_SO_WRITE_OFFSET(n)         (0x5280 + (n) * 4)
4056 
#if GFX_VER == 6
/**
 * Fold pending (start, end) prim-count snapshot pairs into counter->accum.
 *
 * Reads the snapshots through tgt->prim_map on the CPU, so if the batch
 * still references the snapshot buffer we must flush and wait for it
 * first — otherwise we'd read values the GPU hasn't written yet.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      /* Stall: submit the batch and wait for completion so the CPU-visible
       * snapshot values are coherent.
       */
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Each pair is (count at begin, count at end); the delta is the number
    * of primitives written during that interval.
    */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* All pending pairs consumed; reset the live counter's window. */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4076 
/**
 * Record a GEN6_SO_NUM_PRIMS_WRITTEN snapshot into the target's scratch
 * buffer (allocating it on first use, and draining it when nearly full).
 *
 * Snapshots are taken in pairs by the callers; each call appends one
 * 64-bit value at count.offset_end.
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   if (!tgt->offset_res) {
      /* Lazily allocate a 4 KiB snapshot buffer, CPU-mapped via prim_map. */
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* Near the end of the buffer: fold pending snapshots into the
    * accumulators so we can reuse the space from the start.
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush so the register read observes all prior primitives. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4100 
/**
 * Compute the streamed-vertex-buffer index (vertex count written so far)
 * for a target, by aggregating its primitive counter and scaling by the
 * vertices-per-primitive of the last transform feedback draw.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   /* NOTE(review): assumes every recorded interval used the same primitive
    * type as the most recent XFB draw — see the TODO above.
    */
   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
#endif
4113 /**
4114  * The pipe->set_stream_output_targets() driver hook.
4115  *
4116  * At this point, we know which targets are bound to a particular index,
4117  * and also whether we want to append or start over.  We can finish the
4118  * 3DSTATE_SO_BUFFER packets we started earlier.
4119  */
4120 static void
crocus_set_stream_output_targets(struct pipe_context * ctx,unsigned num_targets,struct pipe_stream_output_target ** targets,const unsigned * offsets)4121 crocus_set_stream_output_targets(struct pipe_context *ctx,
4122                                  unsigned num_targets,
4123                                  struct pipe_stream_output_target **targets,
4124                                  const unsigned *offsets)
4125 {
4126    struct crocus_context *ice = (struct crocus_context *) ctx;
4127    struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4128    struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
4129    const bool active = num_targets > 0;
4130    if (ice->state.streamout_active != active) {
4131       ice->state.streamout_active = active;
4132 #if GFX_VER >= 7
4133       ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
4134 #else
4135       ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
4136 #endif
4137 
4138       /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4139        * it's a non-pipelined command.  If we're switching streamout on, we
4140        * may have missed emitting it earlier, so do so now.  (We're already
4141        * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4142        */
4143       if (active) {
4144 #if GFX_VER >= 7
4145          ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
4146 #endif
4147       } else {
4148          uint32_t flush = 0;
4149          for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4150             struct crocus_stream_output_target *tgt =
4151                (void *) ice->state.so_target[i];
4152             if (tgt) {
4153                struct crocus_resource *res = (void *) tgt->base.buffer;
4154 
4155                flush |= crocus_flush_bits_for_history(res);
4156                crocus_dirty_for_history(ice, res);
4157             }
4158          }
4159          crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
4160                                         "make streamout results visible", flush);
4161       }
4162    }
4163 
4164    ice->state.so_targets = num_targets;
4165    for (int i = 0; i < 4; i++) {
4166       pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
4167       pipe_so_target_reference(&ice->state.so_target[i],
4168                                i < num_targets ? targets[i] : NULL);
4169    }
4170 
4171 #if GFX_VER == 6
4172    bool stored_num_prims = false;
4173    for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4174       if (num_targets) {
4175          struct crocus_stream_output_target *tgt =
4176             (void *) ice->state.so_target[i];
4177 
4178          if (!tgt)
4179             continue;
4180          if (offsets[i] == 0) {
4181             // This means that we're supposed to ignore anything written to
4182             // the buffer before. We can do this by just clearing out the
4183             // count of writes to the prim count buffer.
4184             tgt->count.offset_start = tgt->count.offset_end;
4185             tgt->count.accum = 0;
4186             ice->state.svbi = 0;
4187          } else {
4188             if (tgt->offset_res) {
4189                compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
4190                tgt->count.offset_start = tgt->count.offset_end;
4191             }
4192          }
4193 
4194          if (!stored_num_prims) {
4195             crocus_stream_store_prims_written(batch, tgt);
4196             stored_num_prims = true;
4197          }
4198       } else {
4199          struct crocus_stream_output_target *tgt =
4200             (void *) old_tgt[i];
4201          if (tgt) {
4202             if (!stored_num_prims) {
4203                crocus_stream_store_prims_written(batch, tgt);
4204                stored_num_prims = true;
4205             }
4206 
4207             if (tgt->offset_res) {
4208                tgt->prev_count = tgt->count;
4209             }
4210          }
4211       }
4212       pipe_so_target_reference(&old_tgt[i], NULL);
4213    }
4214    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
4215 #else
4216    for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4217       if (num_targets) {
4218          struct crocus_stream_output_target *tgt =
4219             (void *) ice->state.so_target[i];
4220 
4221          if (offsets[i] == 0) {
4222 #if GFX_VER == 8
4223             if (tgt)
4224                tgt->zero_offset = true;
4225 #endif
4226             crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
4227          }
4228          else if (tgt)
4229             crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4230                                        tgt->offset_res->bo,
4231                                        tgt->offset_offset);
4232       } else {
4233          struct crocus_stream_output_target *tgt =
4234             (void *) old_tgt[i];
4235          if (tgt)
4236             crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4237                                         tgt->offset_res->bo,
4238                                         tgt->offset_offset, false);
4239       }
4240       pipe_so_target_reference(&old_tgt[i], NULL);
4241    }
4242 #endif
4243    /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4244    if (!active)
4245       return;
4246 #if GFX_VER >= 7
4247    ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
4248 #elif GFX_VER == 6
4249    ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
4250 #endif
4251 }
4252 
4253 #endif
4254 
#if GFX_VER >= 7
/**
 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
 * 3DSTATE_STREAMOUT packets.
 *
 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
 * hardware to record.  We can create it entirely based on the shader, with
 * no dynamic state dependencies.
 *
 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
 * state-based settings.  We capture the shader-related ones here, and merge
 * the rest in at draw time.
 *
 * Returns a ralloc'd dword array: 3DSTATE_STREAMOUT followed by the
 * 3DSTATE_SO_DECL_LIST packet.  The caller owns (and frees) the memory.
 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct intel_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
   int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < PIPE_MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      /* The real declaration: which VUE slot and which components go to
       * which buffer slot.
       */
      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* 3DSTATE_SO_DECL_LIST is 3 header dwords + 2 dwords per entry. */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Interleave the four streams' decls: each SO_DECL_ENTRY dword pair
    * carries one SO_DECL per stream.
    */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
#endif
4392 
#if GFX_VER == 6
/**
 * Emit 3DSTATE_GS_SVB_INDEX packets programming the streamed vertex
 * buffer indices for Gen6 software streamout.
 *
 * SVB index 0 carries the real resume position (ice->state.svbi) and the
 * maximum index any bound target can hold; indices 1-3 are set to benign
 * defaults so writes never run out of room.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   /* Max index is the smallest capacity (in vertices) of any bound target.
    * NOTE(review): assumes tgt->stride is nonzero for bound targets —
    * confirm against how stride is assigned at begin-transform-feedback.
    */
   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}

#endif
4426 
4427 
#if GFX_VER >= 6
/**
 * Will the current draw end up rasterizing points?
 *
 * True if either face is in point fill mode, or if the last geometry
 * stage (GS, then TES, then the input primitive) produces a point
 * topology.
 */
static bool
crocus_is_drawing_points(const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   /* Point fill mode on either face turns polygons into points. */
   if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT)
      return true;
   if (cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
      return true;

   /* Otherwise, the last enabled pre-rasterization stage decides. */
   if (ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL) {
      const struct elk_gs_prog_data *gs_prog_data =
         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
   }

   if (ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL) {
      const struct elk_tes_prog_data *tes_data =
         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
      return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
   }

   return ice->state.prim_mode == MESA_PRIM_POINTS;
}
#endif
4451 
#if GFX_VER >= 6
/**
 * Fill in one SF_OUTPUT_ATTRIBUTE_DETAIL entry describing where the SF
 * unit should source FS input 'fs_attr' from in the VUE, including the
 * constant overrides GL requires for unwritten viewport/layer and the
 * back-face color swizzle for two-sided lighting.
 *
 * 'max_source_attr' is raised to cover the highest VUE slot read.
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct intel_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4546 
4547 static void
/* Build the SF/SBE per-attribute override table for the current FS inputs.
 *
 * For every varying slot the fragment shader reads, either records it in
 * *point_sprite_enables (when point-sprite coordinate replacement applies)
 * or fills one SF_OUTPUT_ATTRIBUTE_DETAIL entry describing where the SF
 * unit should source it from the VUE.  Also derives the Vertex URB Entry
 * Read Offset/Length values required by 3DSTATE_SF / 3DSTATE_SBE.
 *
 * Only the first 16 override entries are written; the hardware requires
 * input index == source attribute for any attributes beyond that.
 */
calculate_attr_overrides(const struct crocus_context * ice,struct GENX (SF_OUTPUT_ATTRIBUTE_DETAIL)* attr_overrides,uint32_t * point_sprite_enables,uint32_t * urb_entry_read_length,uint32_t * urb_entry_read_offset)4548 calculate_attr_overrides(
4549    const struct crocus_context *ice,
4550    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
4551    uint32_t *point_sprite_enables,
4552    uint32_t *urb_entry_read_length,
4553    uint32_t *urb_entry_read_offset)
4554 {
4555    const struct elk_wm_prog_data *wm_prog_data = (void *)
4556       ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4557    const struct intel_vue_map *vue_map = ice->shaders.last_vue_map;
4558    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4559    uint32_t max_source_attr = 0;
4560    const struct shader_info *fs_info =
4561       crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4562 
4563    int first_slot =
4564       elk_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
4565 
4566    /* Each URB offset packs two varying slots */
4567    assert(first_slot % 2 == 0);
4568    *urb_entry_read_offset = first_slot / 2;
4569    *point_sprite_enables = 0;
4570 
4571    for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
4572       const int input_index = wm_prog_data->urb_setup[fs_attr];
4573 
4574       /* Skip varyings the fragment shader doesn't actually read. */
4575       if (input_index < 0)
4576          continue;
4577 
4578       bool point_sprite = false;
4579       if (crocus_is_drawing_points(ice)) {
4580          /* TEX0..TEX7 become point sprites when the rasterizer enables
4581           * coordinate replacement for that coordinate; gl_PointCoord
4582           * always does.
4583           */
4584          if (fs_attr >= VARYING_SLOT_TEX0 &&
4585              fs_attr <= VARYING_SLOT_TEX7 &&
4586              cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
4587             point_sprite = true;
4588 
4589          if (fs_attr == VARYING_SLOT_PNTC)
4590             point_sprite = true;
4591 
4592          if (point_sprite)
4593             *point_sprite_enables |= 1U << input_index;
4594       }
4595 
4596       /* Point-sprite-replaced attributes keep a zeroed override; the
4597        * hardware ignores the override for them anyway.
4598        */
4599       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
4600       if (!point_sprite) {
4601          get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
4602                            cso_rast->cso.light_twoside, &max_source_attr);
4603       }
4604 
4605       /* The hardware can only do the overrides on 16 overrides at a
4606        * time, and the other up to 16 have to be lined up so that the
4607        * input index = the output index.  We'll need to do some
4608        * tweaking to make sure that's the case.
4609        */
4610       if (input_index < 16)
4611          attr_overrides[input_index] = attribute;
4612       else
4613          assert(attribute.SourceAttribute == input_index);
4614    }
4615 
4616    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4617     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4618     *
4619     * "This field should be set to the minimum length required to read the
4620     *  maximum source attribute.  The maximum source attribute is indicated
4621     *  by the maximum value of the enabled Attribute # Source Attribute if
4622     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4623     *  enable is not set.
4624     *  read_length = ceiling((max_source_attr + 1) / 2)
4625     *
4626     *  [errata] Corruption/Hang possible if length programmed larger than
4627     *  recommended"
4628     *
4629     * Similar text exists for Ivy Bridge.
4630     */
4631    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
4632 }
4625 #endif
4626 
4627 #if GFX_VER >= 7
4628 static void
/* Emit 3DSTATE_SBE (and 3DSTATE_SBE_SWIZ on Gen8) describing how the
 * setup backend routes VUE varyings into fragment shader inputs.
 *
 * On Gen8 the attribute overrides live in the separate SBE_SWIZ packet,
 * so they are gathered in a local array; on Gen7/7.5 they are fields of
 * 3DSTATE_SBE itself, which the #define alias below writes directly.
 */
crocus_emit_sbe(struct crocus_batch * batch,const struct crocus_context * ice)4629 crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
4630 {
4631    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4632    const struct elk_wm_prog_data *wm_prog_data = (void *)
4633       ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4634 #if GFX_VER >= 8
4635    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
4636 #else
4637 #define attr_overrides sbe.Attribute
4638 #endif
4639 
4640    uint32_t urb_entry_read_length;
4641    uint32_t urb_entry_read_offset;
4642    uint32_t point_sprite_enables;
4643 
4644    crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4645       sbe.AttributeSwizzleEnable = true;
4646       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
4647       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
4648 
4649       calculate_attr_overrides(ice,
4650                                attr_overrides,
4651                                &point_sprite_enables,
4652                                &urb_entry_read_length,
4653                                &urb_entry_read_offset);
4654       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
4655       sbe.VertexURBEntryReadLength = urb_entry_read_length;
4656       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
4657       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
4658 #if GFX_VER >= 8
4659       sbe.ForceVertexURBEntryReadLength = true;
4660       sbe.ForceVertexURBEntryReadOffset = true;
4661 #endif
4662    }
4663 #if GFX_VER >= 8
4664    /* Gen8: the overrides computed above go out in their own packet. */
4665    crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4666       for (int i = 0; i < 16; i++)
4667          sbes.Attribute[i] = attr_overrides[i];
4668    }
4669 #endif
4670 }
4671 #endif
4671 
4672 /* ------------------------------------------------------------------- */
4673 
4674 /**
4675  * Populate VS program key fields based on the current state.
4676  */
4677 static void
/* Fill in the VS program key from current rasterizer/vertex-element state.
 * Clip-plane lowering and point-size clamping only apply when the VS is
 * the last pre-rasterization stage.
 */
crocus_populate_vs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_vs_prog_key * key)4678 crocus_populate_vs_key(const struct crocus_context *ice,
4679                        const struct shader_info *info,
4680                        gl_shader_stage last_stage,
4681                        struct elk_vs_prog_key *key)
4682 {
4683    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4684 
4685    /* User clip planes are lowered into the shader only when it writes
4686     * position/clip-vertex itself (no gl_ClipDistance outputs) and is the
4687     * final geometry stage.
4688     */
4689    if (info->clip_distance_array_size == 0 &&
4690        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4691        last_stage == MESA_SHADER_VERTEX)
4692       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4693 
4694    if (last_stage == MESA_SHADER_VERTEX &&
4695        info->outputs_written & (VARYING_BIT_PSIZ))
4696       key->clamp_pointsize = 1;
4697 
4698 #if GFX_VER <= 5
4699    /* Gen4/5 have no hardware edge-flag/point-coord support in the FF
4700     * pipeline, so the VS must emulate them when polygons are unfilled
4701     * or sprite coords are enabled.
4702     */
4703    key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4704                          cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4705    key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4706 #endif
4707 
4708    key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4709 
4710 #if GFX_VERx10 < 75
4711    /* Pre-Haswell needs shader workarounds for some vertex formats; copy
4712     * the per-element workaround flags, indexed by the attribute location
4713     * each enabled vertex element feeds.
4714     */
4715    uint64_t inputs_read = info->inputs_read;
4716    int ve_idx = 0;
4717    while (inputs_read) {
4718       int i = u_bit_scan64(&inputs_read);
4719       key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4720       ve_idx++;
4721    }
4722 #endif
4723 }
4712 
4713 /**
4714  * Populate TCS program key fields based on the current state.
4715  */
4716 static void
/* No dynamic-state-dependent TCS key fields currently; the key stays at
 * its zero-initialized defaults.
 */
crocus_populate_tcs_key(const struct crocus_context * ice,struct elk_tcs_prog_key * key)4717 crocus_populate_tcs_key(const struct crocus_context *ice,
4718                         struct elk_tcs_prog_key *key)
4719 {
4720 }
4721 
4722 /**
4723  * Populate TES program key fields based on the current state.
4724  */
4725 static void
/* Fill in the TES program key; mirrors crocus_populate_vs_key's clip-plane
 * and point-size-clamp logic for the case where TES is the last stage.
 */
crocus_populate_tes_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_tes_prog_key * key)4726 crocus_populate_tes_key(const struct crocus_context *ice,
4727                         const struct shader_info *info,
4728                         gl_shader_stage last_stage,
4729                         struct elk_tes_prog_key *key)
4730 {
4731    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4732 
4733    if (info->clip_distance_array_size == 0 &&
4734        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4735        last_stage == MESA_SHADER_TESS_EVAL)
4736       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4737 
4738    if (last_stage == MESA_SHADER_TESS_EVAL &&
4739        info->outputs_written & (VARYING_BIT_PSIZ))
4740       key->clamp_pointsize = 1;
4741 }
4742 
4743 /**
4744  * Populate GS program key fields based on the current state.
4745  */
4746 static void
/* Fill in the GS program key; mirrors crocus_populate_vs_key's clip-plane
 * and point-size-clamp logic for the case where GS is the last stage.
 */
crocus_populate_gs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_gs_prog_key * key)4747 crocus_populate_gs_key(const struct crocus_context *ice,
4748                        const struct shader_info *info,
4749                        gl_shader_stage last_stage,
4750                        struct elk_gs_prog_key *key)
4751 {
4752    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4753 
4754    if (info->clip_distance_array_size == 0 &&
4755        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4756        last_stage == MESA_SHADER_GEOMETRY)
4757       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4758 
4759    if (last_stage == MESA_SHADER_GEOMETRY &&
4760        info->outputs_written & (VARYING_BIT_PSIZ))
4761       key->clamp_pointsize = 1;
4762 }
4763 
4764 /**
4765  * Populate FS program key fields based on the current state.
4766  */
4767 static void
/* Fill in the FS (WM) program key from framebuffer, ZSA, rasterizer, and
 * blend state.  On Gen4/5 this also computes the iz_lookup bitmask that
 * selects the fixed-function depth/stencil/kill configuration.
 */
crocus_populate_fs_key(const struct crocus_context * ice,const struct shader_info * info,struct elk_wm_prog_key * key)4768 crocus_populate_fs_key(const struct crocus_context *ice,
4769                        const struct shader_info *info,
4770                        struct elk_wm_prog_key *key)
4771 {
4772    struct crocus_screen *screen = (void *) ice->ctx.screen;
4773    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4774    const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4775    const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
4776    const struct crocus_blend_state *blend = ice->state.cso_blend;
4777 
4778 #if GFX_VER < 6
4779    /* Gen4/5: build the IZ lookup bits describing which depth/stencil/
4780     * discard features are active, used to pick the WM thread dispatch
4781     * configuration.
4782     */
4783    uint32_t lookup = 0;
4784 
4785    if (info->fs.uses_discard || zsa->cso.alpha_enabled)
4786       lookup |= ELK_WM_IZ_PS_KILL_ALPHATEST_BIT;
4787 
4788    if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4789       lookup |= ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT;
4790 
4791    if (fb->zsbuf && zsa->cso.depth_enabled) {
4792       lookup |= ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT;
4793 
4794       if (zsa->cso.depth_writemask)
4795          lookup |= ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
4796 
4797    }
4798    if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
4799       lookup |= ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT;
4800       if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
4801          lookup |= ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
4802    }
4803    key->iz_lookup = lookup;
4804    key->stats_wm = ice->state.stats_wm;
4805 #endif
4806 
4807    /* Decide whether line antialiasing is never/sometimes/always needed:
4808     * lines always need it; triangles need it when a visible face is
4808     * rasterized in line (wireframe) mode.
4809     */
4810    uint32_t line_aa = ELK_NEVER;
4811    if (rast->cso.line_smooth) {
4812       int reduced_prim = ice->state.reduced_prim_mode;
4813       if (reduced_prim == MESA_PRIM_LINES)
4814          line_aa = ELK_ALWAYS;
4815       else if (reduced_prim == MESA_PRIM_TRIANGLES) {
4816          if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
4817             line_aa = ELK_SOMETIMES;
4818 
4819             if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
4820                 rast->cso.cull_face == PIPE_FACE_BACK)
4821                line_aa = ELK_ALWAYS;
4822          } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
4823             line_aa = ELK_SOMETIMES;
4824 
4825             if (rast->cso.cull_face == PIPE_FACE_FRONT)
4826                line_aa = ELK_ALWAYS;
4827          }
4828       }
4829    }
4830    key->line_aa = line_aa;
4831 
4832    key->nr_color_regions = fb->nr_cbufs;
4833 
4834    key->clamp_fragment_color = rast->cso.clamp_fragment_color;
4835 
4836    key->alpha_to_coverage = blend->cso.alpha_to_coverage ?
4837       ELK_ALWAYS : ELK_NEVER;
4838 
4839    /* With multiple render targets, the alpha test must use RT0's alpha
4840     * for every target.
4841     */
4842    key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
4843 
4844    key->flat_shade = rast->cso.flatshade &&
4845       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4846 
4847    const bool multisample_fbo = rast->cso.multisample && fb->samples > 1;
4848    key->multisample_fbo = multisample_fbo ? ELK_ALWAYS : ELK_NEVER;
4849    key->persample_interp =
4850       rast->cso.force_persample_interp ? ELK_ALWAYS : ELK_NEVER;
4851 
4852    key->ignore_sample_mask_out = !multisample_fbo;
4853    key->coherent_fb_fetch = false; // TODO: needed?
4854 
4855    key->force_dual_color_blend =
4856       screen->driconf.dual_color_blend_by_location &&
4857       (blend->blend_enables & 1) && blend->dual_color_blending;
4858 
4859 #if GFX_VER <= 5
4860    /* Gen4/5 fixed-function alpha test only works for a single RT; with
4861     * multiple RTs it has to be emulated in the shader.
4862     */
4863    if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
4864       key->emit_alpha_test = true;
4865       key->alpha_test_func = zsa->cso.alpha_func;
4866       key->alpha_test_ref = zsa->cso.alpha_ref_value;
4867    }
4868 #endif
4869 }
4857 
4858 static void
/* No dynamic-state-dependent CS key fields currently; the key stays at
 * its zero-initialized defaults.
 */
crocus_populate_cs_key(const struct crocus_context * ice,struct elk_cs_prog_key * key)4859 crocus_populate_cs_key(const struct crocus_context *ice,
4860                        struct elk_cs_prog_key *key)
4861 {
4862 }
4863 
/* Kernel Start Pointer: on Gen4 it is a relocation into the shader cache
 * BO; on Gen5+ it is a plain offset relative to the instruction base.
 */
4864 #if GFX_VER == 4
4865 #define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
4866 #elif GFX_VER >= 5
4867 static uint64_t
KSP(const struct crocus_context * ice,const struct crocus_compiled_shader * shader)4868 KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
4869 {
4870    return shader->offset;
4871 }
4872 #endif
4873 
4874 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
4875  * prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
4876  * this WA on C0 stepping.
4877  *
4878  * TODO: Fill out SamplerCount for prefetching?
4879  */
4880 
/* Shared boilerplate for the 3DSTATE_VS/HS/DS/GS packets: kernel pointer,
 * binding table size, URB read layout, and scratch-space setup.  Expects
 * `ice`, `shader`, `prog_data`, and `vue_prog_data` in the caller's scope.
 */
4881 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
4882    pkt.KernelStartPointer = KSP(ice, shader);                           \
4883    pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
4884    pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
4885                                                                         \
4886    pkt.DispatchGRFStartRegisterForURBData =                             \
4887       prog_data->dispatch_grf_start_reg;                                \
4888    pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
4889    pkt.prefix##URBEntryReadOffset = 0;                                  \
4890                                                                         \
4891    pkt.StatisticsEnable = true;                                         \
4892    pkt.Enable           = true;                                         \
4893                                                                         \
4894    if (prog_data->total_scratch) {                                      \
4895       struct crocus_bo *bo =                                            \
4896          crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
4897       pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
4898       pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
4899    }
4900 
4901 /* ------------------------------------------------------------------- */
4902 #if GFX_VER >= 6
4903 static const uint32_t push_constant_opcodes[] = {
4904    [MESA_SHADER_VERTEX]    = 21,
4905    [MESA_SHADER_TESS_CTRL] = 25, /* HS */
4906    [MESA_SHADER_TESS_EVAL] = 26, /* DS */
4907    [MESA_SHADER_GEOMETRY]  = 22,
4908    [MESA_SHADER_FRAGMENT]  = 23,
4909    [MESA_SHADER_COMPUTE]   = 0,
4910 };
4911 #endif
4912 
4913 static void
/* Stream out a null SURFACE_STATE with the given dimensions and store its
 * binding-table offset in *out_offset.  Sized null surfaces keep layered
 * rendering / HiZ happy when a real surface is absent.
 */
emit_sized_null_surface(struct crocus_batch * batch,unsigned width,unsigned height,unsigned layers,unsigned levels,unsigned minimum_array_element,uint32_t * out_offset)4914 emit_sized_null_surface(struct crocus_batch *batch,
4915                         unsigned width, unsigned height,
4916                         unsigned layers, unsigned levels,
4917                         unsigned minimum_array_element,
4918                         uint32_t *out_offset)
4919 {
4920    struct isl_device *isl_dev = &batch->screen->isl_dev;
4921    uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4922                                  isl_dev->ss.align,
4923                                  out_offset);
4924    //TODO gen 6 multisample crash
4925    isl_null_fill_state(isl_dev, surf,
4926                        .size = isl_extent3d(width, height, layers),
4927                        .levels = levels,
4928                        .minimum_array_element = minimum_array_element);
4929 }
4930 static void
/* Stream out a minimal 1x1x1 null surface (no mip levels). */
emit_null_surface(struct crocus_batch * batch,uint32_t * out_offset)4931 emit_null_surface(struct crocus_batch *batch,
4932                   uint32_t *out_offset)
4933 {
4934    emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
4935 }
4936 
4937 static void
/* Emit a null surface sized to match the current framebuffer, so that
 * depth-only rendering (no color buffers) still rasterizes the correct
 * extent.  Falls back to 1x1x1 when no framebuffer has been set.
 */
emit_null_fb_surface(struct crocus_batch * batch,struct crocus_context * ice,uint32_t * out_offset)4938 emit_null_fb_surface(struct crocus_batch *batch,
4939                      struct crocus_context *ice,
4940                      uint32_t *out_offset)
4941 {
4942    uint32_t width, height, layers, level, layer;
4943    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4944    if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4945       emit_null_surface(batch, out_offset);
4946       return;
4947    }
4948 
4949    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4950    width = MAX2(cso->width, 1);
4951    height = MAX2(cso->height, 1);
4952    layers = cso->layers ? cso->layers : 1;
4953    level = 0;
4954    layer = 0;
4955 
4956    /* Depth-only FBO: match the null surface to the depth buffer's
4957     * dimensions and subresource.
4958     */
4959    if (cso->nr_cbufs == 0 && cso->zsbuf) {
4960       width = cso->zsbuf->width;
4961       height = cso->zsbuf->height;
4962       level = cso->zsbuf->u.tex.level;
4963       layer = cso->zsbuf->u.tex.first_layer;
4964    }
4965    emit_sized_null_surface(batch, width, height,
4966                            layers, level, layer,
4967                            out_offset);
4968 }
4966 
4967 static void
/* Fill a SURFACE_STATE (already streamed at surf_state/addr_offset) for a
 * texture/render-target resource, emitting the main-address and (if aux is
 * in use) aux-address relocations.
 *
 * adjust_surf requests rewriting the surface to a single-image view for
 * cases old hardware can't express directly (3D-as-2D rendering, Gen4
 * cubes, 1D arrays); this may shift offset_B and intratile x/y offsets.
 */
emit_surface_state(struct crocus_batch * batch,struct crocus_resource * res,const struct isl_surf * in_surf,bool adjust_surf,struct isl_view * in_view,bool writeable,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables,uint32_t * surf_state,uint32_t addr_offset)4968 emit_surface_state(struct crocus_batch *batch,
4969                    struct crocus_resource *res,
4970                    const struct isl_surf *in_surf,
4971                    bool adjust_surf,
4972                    struct isl_view *in_view,
4973                    bool writeable,
4974                    enum isl_aux_usage aux_usage,
4975                    bool blend_enable,
4976                    uint32_t write_disables,
4977                    uint32_t *surf_state,
4978                    uint32_t addr_offset)
4979 {
4980    struct isl_device *isl_dev = &batch->screen->isl_dev;
4981    uint32_t reloc = RELOC_32BIT;
4982    uint64_t offset_B = res->offset;
4983    uint32_t tile_x_sa = 0, tile_y_sa = 0;
4984 
4985    if (writeable)
4986       reloc |= RELOC_WRITE;
4987 
4988    /* Work on local copies so the caller's surf/view are untouched. */
4989    struct isl_surf surf = *in_surf;
4990    struct isl_view view = *in_view;
4991    if (adjust_surf) {
4992       if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
4993          /* Rebase to the single 2D slice being rendered. */
4994          isl_surf_get_image_surf(isl_dev, in_surf,
4995                                  view.base_level, 0,
4996                                  view.base_array_layer,
4997                                  &surf, &offset_B,
4998                                  &tile_x_sa, &tile_y_sa);
4999          view.base_array_layer = 0;
5000          view.base_level = 0;
5001       } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
5002          /* Gen4 can't render to cube faces directly; rebase to the face. */
5003          isl_surf_get_image_surf(isl_dev, in_surf,
5004                                  view.base_level, view.base_array_layer,
5005                                  0,
5006                                  &surf, &offset_B,
5007                                  &tile_x_sa, &tile_y_sa);
5008          view.base_array_layer = 0;
5009          view.base_level = 0;
5010       } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
5011          surf.dim = ISL_SURF_DIM_2D;
5012    }
5013 
5014    union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
5015    struct crocus_bo *aux_bo = NULL;
5016    uint32_t aux_offset = 0;
5017    struct isl_surf *aux_surf = NULL;
5018    if (aux_usage != ISL_AUX_USAGE_NONE) {
5019       aux_surf = &res->aux.surf;
5020       aux_offset = res->aux.offset;
5021       aux_bo = res->aux.bo;
5022 
5023       clear_color = crocus_resource_get_clear_color(res);
5024    }
5025 
5026    isl_surf_fill_state(isl_dev, surf_state,
5027                        .surf = &surf,
5028                        .view = &view,
5029                        .address = crocus_state_reloc(batch,
5030                                                      addr_offset + isl_dev->ss.addr_offset,
5031                                                      res->bo, offset_B, reloc),
5032                        .aux_surf = aux_surf,
5033                        .aux_usage = aux_usage,
5034                        .aux_address = aux_offset,
5035                        .mocs = crocus_mocs(res->bo, isl_dev),
5036                        .clear_color = clear_color,
5037                        .use_clear_address = false,
5038                        .clear_address = 0,
5039                        .x_offset_sa = tile_x_sa,
5040                        .y_offset_sa = tile_y_sa,
5041 #if GFX_VER <= 5
5042                        .blend_enable = blend_enable,
5043                        .write_disables = write_disables,
5044 #endif
5045       );
5046 
5047    if (aux_surf) {
5048       /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
5049        * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
5050        * contain other control information.  Since buffer addresses are always
5051        * on 4k boundaries (and thus have their lower 12 bits zero), we can use
5052        * an ordinary reloc to do the necessary address translation.
5053        *
5054        * FIXME: move to the point of assignment.
5055        */
5056       if (GFX_VER == 8) {
5057          uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
5058          *aux_addr = crocus_state_reloc(batch,
5059                                         addr_offset + isl_dev->ss.aux_addr_offset,
5060                                         aux_bo, *aux_addr,
5061                                         reloc);
5062       } else {
5063          uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
5064          *aux_addr = crocus_state_reloc(batch,
5065                                         addr_offset + isl_dev->ss.aux_addr_offset,
5066                                         aux_bo, *aux_addr,
5067                                         reloc);
5068       }
5069    }
5070 
5071 }
5069 
5070 static uint32_t
/* Stream a writable SURFACE_STATE for a render-target surface and return
 * its binding-table offset.  Gen4 cube targets need surf adjustment; a
 * workaround-aligned resource (align_res) substitutes when present.
 */
emit_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables)5071 emit_surface(struct crocus_batch *batch,
5072              struct crocus_surface *surf,
5073              enum isl_aux_usage aux_usage,
5074              bool blend_enable,
5075              uint32_t write_disables)
5076 {
5077    struct isl_device *isl_dev = &batch->screen->isl_dev;
5078    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5079    struct isl_view *view = &surf->view;
5080    uint32_t offset = 0;
5081    enum pipe_texture_target target = res->base.b.target;
5082    bool adjust_surf = false;
5083 
5084    if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5085       adjust_surf = true;
5086 
5087    if (surf->align_res)
5088       res = (struct crocus_resource *)surf->align_res;
5089 
5090    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5091 
5092    emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5093                       aux_usage, blend_enable,
5094                       write_disables,
5095                       surf_state, offset);
5096    return offset;
5097 }
5098 
5099 static uint32_t
/* Stream a read-only SURFACE_STATE for fetching a render target (FB read),
 * using the surface's read_view, and return its binding-table offset.
 */
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5100 emit_rt_surface(struct crocus_batch *batch,
5101                 struct crocus_surface *surf,
5102                 enum isl_aux_usage aux_usage)
5103 {
5104    struct isl_device *isl_dev = &batch->screen->isl_dev;
5105    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5106    struct isl_view *view = &surf->read_view;
5107    uint32_t offset = 0;
5108    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5109 
5110    emit_surface_state(batch, res, &surf->surf, true, view, false,
5111                       aux_usage, 0, false,
5112                       surf_state, offset);
5113    return offset;
5114 }
5115 
5116 static uint32_t
/* Stream a raw buffer SURFACE_STATE over the 12-byte compute grid-size
 * (gl_NumWorkGroups) buffer and return its binding-table offset.
 */
emit_grid(struct crocus_context * ice,struct crocus_batch * batch)5117 emit_grid(struct crocus_context *ice,
5118           struct crocus_batch *batch)
5119 {
5120    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5121    uint32_t offset = 0;
5122    struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5123    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5124                                        isl_dev->ss.align, &offset);
5125    /* 12 bytes = three uint32 grid dimensions. */
5126    isl_buffer_fill_state(isl_dev, surf_state,
5127                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5128                                                        crocus_resource_bo(grid_ref->res),
5129                                                        grid_ref->offset,
5130                                                        RELOC_32BIT),
5131                          .size_B = 12,
5132                          .format = ISL_FORMAT_RAW,
5133                          .stride_B = 1,
5134                          .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
5135    return offset;
5136 }
5136 
5137 static uint32_t
/* Stream a SURFACE_STATE for a uniform (constant) buffer binding and
 * return its binding-table offset.  Read-only, 32-bit relocation.
 */
emit_ubo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_constant_buffer * buffer)5138 emit_ubo_buffer(struct crocus_context *ice,
5139                 struct crocus_batch *batch,
5140                 struct pipe_constant_buffer *buffer)
5141 {
5142    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5143    uint32_t offset = 0;
5144 
5145    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5146                                        isl_dev->ss.align, &offset);
5147    /* .format = 0 is ISL's first format enum; constant buffers are read
5148     * as four-component 32-bit data — NOTE(review): relies on enum value
5149     * 0 being R32G32B32A32_FLOAT, confirm against isl_format.
5150     */
5151    isl_buffer_fill_state(isl_dev, surf_state,
5152                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5153                                                        crocus_resource_bo(buffer->buffer),
5154                                                        buffer->buffer_offset,
5155                                                        RELOC_32BIT),
5156                          .size_B = buffer->buffer_size,
5157                          .format = 0,
5158                          .swizzle = ISL_SWIZZLE_IDENTITY,
5159                          .stride_B = 1,
5160                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5161 
5162    return offset;
5163 }
5160 
5161 static uint32_t
/* Stream a RAW-format SURFACE_STATE for a shader storage buffer binding
 * and return its binding-table offset.  The relocation is marked writable
 * when the shader may store to the buffer.
 */
emit_ssbo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_shader_buffer * buffer,bool writeable)5162 emit_ssbo_buffer(struct crocus_context *ice,
5163                  struct crocus_batch *batch,
5164                  struct pipe_shader_buffer *buffer, bool writeable)
5165 {
5166    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5167    uint32_t offset = 0;
5168    uint32_t reloc = RELOC_32BIT;
5169 
5170    if (writeable)
5171       reloc |= RELOC_WRITE;
5172    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5173                                        isl_dev->ss.align, &offset);
5174    isl_buffer_fill_state(isl_dev, surf_state,
5175                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5176                                                        crocus_resource_bo(buffer->buffer),
5177                                                        buffer->buffer_offset,
5178                                                        reloc),
5179                          .size_B = buffer->buffer_size,
5180                          .format = ISL_FORMAT_RAW,
5181                          .swizzle = ISL_SWIZZLE_IDENTITY,
5182                          .stride_B = 1,
5183                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5184 
5185    return offset;
5186 }
5187 
5188 static uint32_t
/* Stream a SURFACE_STATE for a sampler view and return its binding-table
 * offset.  Buffer textures get an isl_buffer_fill_state with the size
 * clamped to the BO and the hardware texture-buffer limit; image textures
 * go through emit_surface_state (using the gather view when for_gather).
 */
emit_sampler_view(struct crocus_context * ice,struct crocus_batch * batch,bool for_gather,struct crocus_sampler_view * isv)5189 emit_sampler_view(struct crocus_context *ice,
5190                   struct crocus_batch *batch,
5191                   bool for_gather,
5192                   struct crocus_sampler_view *isv)
5193 {
5194    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5195    uint32_t offset = 0;
5196 
5197    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5198                                        isl_dev->ss.align, &offset);
5199 
5200    if (isv->base.target == PIPE_BUFFER) {
5201       const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
5202       const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
5203       /* Clamp to the smallest of: requested size, bytes remaining in the
5204        * BO, and the hardware's maximum texel-buffer extent.
5205        */
5206       unsigned final_size =
5207          MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
5208               CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5209       isl_buffer_fill_state(isl_dev, surf_state,
5210                             .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5211                                                           isv->res->bo,
5212                                                           isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
5213                             .size_B = final_size,
5214                             .format = isv->view.format,
5215                             .swizzle = isv->view.swizzle,
5216                             .stride_B = cpp,
5217                             .mocs = crocus_mocs(isv->res->bo, isl_dev)
5218          );
5219    } else {
5220       enum isl_aux_usage aux_usage =
5221          crocus_resource_texture_aux_usage(isv->res);
5222 
5223       emit_surface_state(batch, isv->res, &isv->res->surf, false,
5224                          for_gather ? &isv->gather_view : &isv->view,
5225                          false, aux_usage, false,
5226                          0, surf_state, offset);
5227    }
5228    return offset;
5229 }
5227 
/**
 * Emit a SURFACE_STATE for a shader image view and return its offset in
 * the batch's surface-state stream.  The relocation is marked writable
 * when the image was bound with PIPE_IMAGE_ACCESS_WRITE.
 */
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      /* Buffer image: clamp the bound range against the BO size and the
       * maximum texture buffer size before filling a buffer surface.
       */
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* Raw (typeless) view of a non-buffer resource: expose the
          * remaining BO range as a raw, byte-strided buffer surface.
          */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         /* Typed image: regular surface state with no aux usage. */
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5281 
#if GFX_VER == 6
/**
 * Build a SURFACE_STATE through which the Gen6 GS writes transform
 * feedback output @idx into its stream output buffer.  Returns the
 * surface-state offset, or 0 when the output doesn't exist or streamout
 * is inactive.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   /* Only stream 0 is supported here. */
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   //   assert((size_dwords - offset_dwords) / stride_dwords
   //          <= ELK_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer.  We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow.  But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* Pick the R32 float format matching the output's component count. */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* Writable buffer surface covering the SO target range. */
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5361 
/* Iterate @index over every slot of binding-table group @group that maps
 * to a real binding table index (i.e. skip CROCUS_SURFACE_NOT_USED).
 * Relies on a local `bt` pointer being in scope at the use site.
 */
#define foreach_surface_used(index, group)                      \
   for (int index = 0; index < bt->sizes[group]; index++)       \
      if (crocus_group_index_to_bti(bt, group, index) !=        \
          CROCUS_SURFACE_NOT_USED)
5366 
/**
 * Walk the shader's binding table layout and emit a SURFACE_STATE for
 * every used slot, recording each state offset in shader->surf_offset[].
 *
 * Groups are visited in the binding table's fixed order: FS render
 * targets (and RT reads), CS work groups, Gen6 SOL buffers, textures,
 * gather textures (pre-Gen8), images, UBOs, then SSBOs.  Unbound slots
 * in a used group receive a null surface.  @ff_gs selects the
 * fixed-function GS program, which has no crocus_shader_state.
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;   /* running binding-table slot index */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            /* Gen4/5 bake per-RT color write disables and the blend
             * enable into the surface state itself.
             */
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers bound: emit a single null render target. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Stream output info comes from the GS when present, else the VS. */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Pre-Gen8 uses separate surfaces (the gather view) for textureGather. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5499 /* ------------------------------------------------------------------- */
5500 static uint32_t
crocus_upload_binding_table(struct crocus_context * ice,struct crocus_batch * batch,uint32_t * table,uint32_t size)5501 crocus_upload_binding_table(struct crocus_context *ice,
5502                             struct crocus_batch *batch,
5503                             uint32_t *table,
5504                             uint32_t size)
5505 
5506 {
5507    if (size == 0)
5508       return 0;
5509    return emit_state(batch, table, size, 32);
5510 }
5511 
/**
 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
 *
 * Emitted at most once per batch (guarded by state_base_address_emitted);
 * the state offsets streamed into batch->state.bo are interpreted relative
 * to the bases programmed here.
 */

static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
   if (batch->state_base_address_emitted)
      return;

   UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;

   /* SBA changes need surrounding flushes (see the helper functions). */
   flush_before_state_base_change(batch);

   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
      /* Set base addresses */
      sba.GeneralStateBaseAddressModifyEnable = true;

#if GFX_VER >= 6
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
#endif

      sba.SurfaceStateBaseAddressModifyEnable = true;
      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);

      sba.IndirectObjectBaseAddressModifyEnable = true;

#if GFX_VER >= 5
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif

      /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
#if GFX_VER == 8
      sba.GeneralStateBufferSize   = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize    = 0xfffff;
      sba.DynamicStateBufferSize   = MAX_STATE_SIZE;

      sba.GeneralStateBufferSizeModifyEnable    = true;
      sba.DynamicStateBufferSizeModifyEnable    = true;
      sba.IndirectObjectBufferSizeModifyEnable  = true;
      sba.InstructionBuffersizeModifyEnable     = true;
#else
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.IndirectObjectAccessUpperBoundModifyEnable = true;

#if GFX_VER >= 5
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif

#if GFX_VER >= 6
      /* Dynamic state upper bound.  Although the documentation says that
       * programming it to zero will cause it to be ignored, that is a lie.
       * If this isn't programmed to a real bound, the sampler border color
       * pointer is rejected, causing border color to mysteriously fail.
       */
      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
#else
      /* Same idea but using General State Base Address on Gen4-5 */
      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#endif

#if GFX_VER >= 6
      /* The hardware appears to pay attention to the MOCS fields even
       * if you don't set the "Address Modify Enable" bit for the base.
       */
      sba.GeneralStateMOCS            = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
      sba.DynamicStateMOCS            = mocs;
      sba.IndirectObjectMOCS          = mocs;
      sba.InstructionMOCS             = mocs;
      sba.SurfaceStateMOCS            = mocs;
#endif
   }

   flush_after_state_base_change(batch);

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    * 3DSTATE_PIPELINE_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the following packets must be reissued:
    *
    * 3DSTATE_CC_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * 3DSTATE_SAMPLER_STATE_POINTERS
    * 3DSTATE_VIEWPORT_STATE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing.  Having a separate state flag makes the sequence more
    * obvious.
    */
#if GFX_VER <= 5
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
   batch->state_base_address_emitted = true;
}
5621 
5622 static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state * vp,bool halfz,bool window_space_position,float * zmin,float * zmax)5623 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5624                           bool window_space_position, float *zmin, float *zmax)
5625 {
5626    if (window_space_position) {
5627       *zmin = 0.f;
5628       *zmax = 1.f;
5629       return;
5630    }
5631    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5632 }
5633 
/* Gathered push-constant ranges for one shader stage (filled by
 * setup_constant_buffers, consumed by emit_push_constant_packets).
 */
struct push_bos {
   struct {
      struct crocus_address addr;  /* GPU address of the range start */
      uint32_t length;             /* range length; in 32-byte units, matching
                                    * the range->start * 32 scaling in
                                    * setup_constant_buffers */
   } buffers[4];
   int buffer_count;               /* number of valid entries in buffers[] */
   uint32_t max_length;            /* largest single range length seen */
};
5642 
5643 #if GFX_VER >= 6
/**
 * Collect the compiler-selected UBO push ranges for @stage into
 * @push_bos, one address/length pair per non-empty range.
 *
 * A range whose UBO has no backing resource falls back to the context's
 * workaround BO so the hardware still reads from a valid address.
 */
static void
setup_constant_buffers(struct crocus_context *ice,
                       struct crocus_batch *batch,
                       int stage,
                       struct push_bos *push_bos)
{
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;

   uint32_t push_range_sum = 0;

   int n = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];

      if (range->length == 0)
         continue;

      push_range_sum += range->length;

      if (range->length > push_bos->max_length)
         push_bos->max_length = range->length;

      /* Range block is a binding table index, map back to UBO index. */
      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      assert(block_index != CROCUS_SURFACE_NOT_USED);

      struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
      struct crocus_resource *res = (void *) cbuf->buffer;

      /* Constant buffer addresses must be 32-byte aligned. */
      assert(cbuf->buffer_offset % 32 == 0);

      push_bos->buffers[n].length = range->length;
      push_bos->buffers[n].addr =
         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
         : ro_bo(batch->ice->workaround_bo,
                 batch->ice->workaround_offset);
      n++;
   }

   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
    *
    *    "The sum of all four read length fields must be less than or
    *    equal to the size of 64."
    */
   assert(push_range_sum <= 64);

   push_bos->buffer_count = n;
}
5695 
#if GFX_VER == 7
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   /* Gen7 VS workaround: a depth-stalling pipe control with a dummy
    * post-sync immediate write into the context's workaround BO.
    */
   const uint32_t flags =
      PIPE_CONTROL_WRITE_IMMEDIATE | PIPE_CONTROL_DEPTH_STALL;

   crocus_emit_pipe_control_write(batch, "vs workaround", flags,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif
5708 
/**
 * Emit the 3DSTATE_CONSTANT_* packet for @stage, pointing the push
 * constant logic at the ranges gathered in @push_bos.
 *
 * The VS packet layout is reused for every stage; the actual opcode is
 * overridden via _3DCommandSubOpcode from push_constant_opcodes[stage].
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct elk_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   /* Ivybridge needs the VS workaround flush before VS constants. */
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6: at most one constant buffer pointer/length per packet. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5771 
5772 #endif
5773 
/* Per-generation container for depth/stencil state: Gen8 programs it
 * inline in 3DSTATE_WM_DEPTH_STENCIL, Gen6-7 use DEPTH_STENCIL_STATE,
 * and Gen4-5 fold it into COLOR_CALC_STATE.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
5781 
5782 static inline void
set_depth_stencil_bits(struct crocus_context * ice,DEPTH_STENCIL_GENXML * ds)5783 set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5784 {
5785    struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5786    ds->DepthTestEnable = cso->cso.depth_enabled;
5787    ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5788    ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
5789 
5790    ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5791    ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5792    ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5793    ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5794 
5795    ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5796    ds->StencilWriteMask = cso->cso.stencil[0].writemask;
5797 
5798    ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5799    ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5800    ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5801    ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5802 
5803    ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5804    ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5805    ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5806    ds->StencilTestEnable = cso->cso.stencil[0].enabled;
5807    ds->StencilBufferWriteEnable =
5808       cso->cso.stencil[0].writemask != 0 ||
5809       (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
5810 }
5811 
/**
 * Pack one VERTEX_BUFFER_STATE into *@map and advance the pointer past
 * it, handling per-generation field differences.
 *
 * A non-zero @step_rate marks the buffer as per-instance data (pre-Gen8).
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* Pre-Gen8 programs an inclusive end address instead of a size. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5846 
#if GFX_VER >= 6
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   /* Combine the API sample mask with the framebuffer's sample count so
    * only bits for samples that actually exist remain.  Single-sampled
    * (or zero-sample) framebuffers always use mask 0x1.
    */
   const uint32_t num_samples = ice->state.framebuffer.samples;

   if (num_samples > 1) {
      const uint32_t fb_mask = (1 << num_samples) - 1;
      return ice->state.sample_mask & fb_mask;
   }

   return 1;
}
#endif
5860 
5861 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5862 crocus_upload_dirty_render_state(struct crocus_context *ice,
5863                                struct crocus_batch *batch,
5864                                const struct pipe_draw_info *draw)
5865 {
5866    uint64_t dirty = ice->state.dirty;
5867    uint64_t stage_dirty = ice->state.stage_dirty;
5868 
5869    if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5870        !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5871       return;
5872 
5873    if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5874       crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5875          vf.StatisticsEnable = true;
5876       }
5877    }
5878 
5879 #if GFX_VER <= 5
5880    if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5881                       CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5882       bool ret = calculate_curbe_offsets(batch);
5883       if (ret) {
5884          dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5885          stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5886       }
5887    }
5888 
5889    if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5890        stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5891      bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5892                                            elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5893                                            ((struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5894      if (ret) {
5895 	dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5896 	stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5897      }
5898    }
5899 #endif
5900    if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5901       const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5902       uint32_t cc_vp_address;
5903 
5904       /* XXX: could avoid streaming for depth_clip [0,1] case. */
5905       uint32_t *cc_vp_map =
5906          stream_state(batch,
5907                       4 * ice->state.num_viewports *
5908                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5909       for (int i = 0; i < ice->state.num_viewports; i++) {
5910          float zmin, zmax;
5911          crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5912                                  ice->state.window_space_position,
5913                                  &zmin, &zmax);
5914          if (cso_rast->cso.depth_clip_near)
5915             zmin = 0.0;
5916          if (cso_rast->cso.depth_clip_far)
5917             zmax = 1.0;
5918 
5919          crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5920             ccv.MinimumDepth = zmin;
5921             ccv.MaximumDepth = zmax;
5922          }
5923 
5924          cc_vp_map += GENX(CC_VIEWPORT_length);
5925       }
5926 
5927 #if GFX_VER >= 7
5928       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5929          ptr.CCViewportPointer = cc_vp_address;
5930       }
5931 #elif GFX_VER == 6
5932       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5933          vp.CCViewportStateChange = 1;
5934          vp.PointertoCC_VIEWPORT = cc_vp_address;
5935       }
5936 #else
5937       ice->state.cc_vp_address = cc_vp_address;
5938       dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5939 #endif
5940    }
5941 
5942    if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5943       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5944 #if GFX_VER >= 7
5945       uint32_t sf_cl_vp_address;
5946       uint32_t *vp_map =
5947          stream_state(batch,
5948                       4 * ice->state.num_viewports *
5949                       GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5950 #else
5951       uint32_t *vp_map =
5952          stream_state(batch,
5953                       4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5954                       32, &ice->state.sf_vp_address);
5955       uint32_t *clip_map =
5956          stream_state(batch,
5957                       4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5958                       32, &ice->state.clip_vp_address);
5959 #endif
5960 
5961       for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5962          const struct pipe_viewport_state *state = &ice->state.viewports[i];
5963          float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5964 
5965 #if GFX_VER == 8
5966          float vp_xmin = viewport_extent(state, 0, -1.0f);
5967          float vp_xmax = viewport_extent(state, 0,  1.0f);
5968          float vp_ymin = viewport_extent(state, 1, -1.0f);
5969          float vp_ymax = viewport_extent(state, 1,  1.0f);
5970 #endif
5971          intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5972                                         state->scale[0], state->scale[1],
5973                                         state->translate[0], state->translate[1],
5974                                         &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5975 #if GFX_VER >= 7
5976          crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5977 #else
5978          crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5979 #endif
5980          {
5981             vp.ViewportMatrixElementm00 = state->scale[0];
5982             vp.ViewportMatrixElementm11 = state->scale[1];
5983             vp.ViewportMatrixElementm22 = state->scale[2];
5984             vp.ViewportMatrixElementm30 = state->translate[0];
5985             vp.ViewportMatrixElementm31 = state->translate[1];
5986             vp.ViewportMatrixElementm32 = state->translate[2];
5987 #if GFX_VER < 6
5988             struct pipe_scissor_state scissor;
5989             crocus_fill_scissor_rect(ice, 0, &scissor);
5990             vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5991             vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5992             vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5993             vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5994 #endif
5995 
5996 #if GFX_VER >= 7
5997             vp.XMinClipGuardband = gb_xmin;
5998             vp.XMaxClipGuardband = gb_xmax;
5999             vp.YMinClipGuardband = gb_ymin;
6000             vp.YMaxClipGuardband = gb_ymax;
6001 #endif
6002 #if GFX_VER == 8
6003             vp.XMinViewPort = MAX2(vp_xmin, 0);
6004             vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6005             vp.YMinViewPort = MAX2(vp_ymin, 0);
6006             vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6007 #endif
6008          }
6009 #if GFX_VER < 7
6010          crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6011             clip.XMinClipGuardband = gb_xmin;
6012             clip.XMaxClipGuardband = gb_xmax;
6013             clip.YMinClipGuardband = gb_ymin;
6014             clip.YMaxClipGuardband = gb_ymax;
6015          }
6016 #endif
6017 #if GFX_VER >= 7
6018          vp_map += GENX(SF_CLIP_VIEWPORT_length);
6019 #else
6020          vp_map += GENX(SF_VIEWPORT_length);
6021          clip_map += GENX(CLIP_VIEWPORT_length);
6022 #endif
6023       }
6024 #if GFX_VER >= 7
6025       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6026          ptr.SFClipViewportPointer = sf_cl_vp_address;
6027       }
6028 #elif GFX_VER == 6
6029       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6030          vp.SFViewportStateChange = 1;
6031          vp.CLIPViewportStateChange = 1;
6032          vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6033          vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6034       }
6035 #endif
6036    }
6037 
6038 #if GFX_VER >= 6
6039    if (dirty & CROCUS_DIRTY_GEN6_URB) {
6040 #if GFX_VER == 6
6041       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6042          || ice->shaders.ff_gs_prog;
6043 
6044       struct elk_vue_prog_data *vue_prog_data =
6045          (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6046       const unsigned vs_size = vue_prog_data->urb_entry_size;
6047       unsigned gs_size = vs_size;
6048       if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6049          struct elk_vue_prog_data *gs_vue_prog_data =
6050             (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6051          gs_size = gs_vue_prog_data->urb_entry_size;
6052       }
6053 
6054       genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6055 #endif
6056 #if GFX_VER >= 7
6057       const struct intel_device_info *devinfo = &batch->screen->devinfo;
6058       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6059       bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6060       struct intel_urb_config urb_cfg;
6061 
6062       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6063          if (!ice->shaders.prog[i]) {
6064             urb_cfg.size[i] = 1;
6065          } else {
6066             struct elk_vue_prog_data *vue_prog_data =
6067                (void *) ice->shaders.prog[i]->prog_data;
6068             urb_cfg.size[i] = vue_prog_data->urb_entry_size;
6069          }
6070          assert(urb_cfg.size[i] != 0);
6071       }
6072 
6073       /* If we're just switching between programs with the same URB requirements,
6074        * skip the rest of the logic.
6075        */
6076       bool no_change = false;
6077       if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
6078           ice->urb.gs_present == gs_present &&
6079           ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
6080           ice->urb.tess_present == tess_present &&
6081           ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
6082           ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
6083          no_change = true;
6084       }
6085 
6086       if (!no_change) {
6087          ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
6088          ice->urb.gs_present = gs_present;
6089          ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
6090          ice->urb.tess_present = tess_present;
6091          ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
6092          ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
6093 
6094          bool constrained;
6095          intel_get_urb_config(devinfo,
6096                               batch->screen->l3_config_3d,
6097                               tess_present,
6098                               gs_present,
6099                               &urb_cfg, NULL, &constrained);
6100 
6101 #if GFX_VER == 7
6102          if (devinfo->platform == INTEL_PLATFORM_IVB)
6103             gen7_emit_vs_workaround_flush(batch);
6104 #endif
6105          for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6106             crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6107                urb._3DCommandSubOpcode += i;
6108                urb.VSURBStartingAddress     = urb_cfg.start[i];
6109                urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
6110                urb.VSNumberofURBEntries     = urb_cfg.entries[i];
6111             }
6112          }
6113       }
6114 #endif
6115    }
6116 
6117    if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6118       struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6119       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6120       struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6121 
6122       STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6123       int rt_dwords =
6124          MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6125 #if GFX_VER >= 8
6126       rt_dwords += GENX(BLEND_STATE_length);
6127 #endif
6128       uint32_t blend_offset;
6129       uint32_t *blend_map =
6130          stream_state(batch,
6131                       4 * rt_dwords, 64, &blend_offset);
6132 
6133 #if GFX_VER >= 8
6134    struct GENX(BLEND_STATE) be = { 0 };
6135    {
6136 #else
6137    for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6138       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6139 #define be entry
6140 #endif
6141 
6142       be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6143       be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6144       be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6145       be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6146       be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage_dither;
6147       be.ColorDitherEnable = cso_blend->cso.dither;
6148 
6149 #if GFX_VER >= 8
6150       for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6151          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6152 #else
6153       {
6154 #endif
6155          const struct pipe_rt_blend_state *rt =
6156             &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6157 
6158          be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6159             be.IndependentAlphaBlendEnable;
6160 
6161          if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6162             entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6163             entry.LogicOpFunction = cso_blend->cso.logicop_func;
6164          }
6165 
6166          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6167          entry.PreBlendColorClampEnable = true;
6168          entry.PostBlendColorClampEnable = true;
6169 
6170          entry.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
6171          entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6172          entry.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
6173          entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6174 
6175 #if GFX_VER >= 8
6176          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6177 #else
6178          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6179 #endif
6180       }
6181    }
6182 #if GFX_VER >= 8
6183    GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6184 #endif
6185 #if GFX_VER < 7
6186       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6187          ptr.PointertoBLEND_STATE = blend_offset;
6188          ptr.BLEND_STATEChange = true;
6189       }
6190 #else
6191       crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6192          ptr.BlendStatePointer = blend_offset;
6193 #if GFX_VER >= 8
6194          ptr.BlendStatePointerValid = true;
6195 #endif
6196       }
6197 #endif
6198    }
6199 #endif
6200 
6201    if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6202       struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6203       UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6204       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6205       uint32_t cc_offset;
6206       void *cc_map =
6207          stream_state(batch,
6208                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6209                       64, &cc_offset);
6210 #if GFX_VER <= 5
6211       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6212 #endif
6213       _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6214          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6215          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6216 
6217 #if GFX_VER <= 5
6218 
6219          set_depth_stencil_bits(ice, &cc);
6220 
6221          if (cso_blend->cso.logicop_enable) {
6222             if (can_emit_logic_op(ice)) {
6223                cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6224                cc.LogicOpFunction = cso_blend->cso.logicop_func;
6225             }
6226          }
6227          cc.ColorDitherEnable = cso_blend->cso.dither;
6228 
6229          cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6230 
6231          if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6232             cc.AlphaTestEnable = cso->cso.alpha_enabled;
6233             cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6234          }
6235          cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6236          cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6237 #else
6238          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6239          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6240 
6241          cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
6242          cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6243          cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
6244          cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6245 #endif
6246          cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6247          cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6248       }
6249       ice->shaders.cc_offset = cc_offset;
6250 #if GFX_VER >= 6
6251       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6252          ptr.ColorCalcStatePointer = cc_offset;
6253 #if GFX_VER != 7
6254          ptr.ColorCalcStatePointerValid = true;
6255 #endif
6256       }
6257 #endif
6258    }
6259 #if GFX_VER <= 5
6260    if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6261       crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6262          blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6263          blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6264          blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6265          blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6266       }
6267    }
6268 #endif
6269    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6270       if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6271          continue;
6272 
6273       struct crocus_shader_state *shs = &ice->state.shaders[stage];
6274       struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6275 
6276       if (!shader)
6277          continue;
6278 
6279       if (shs->sysvals_need_upload)
6280          upload_sysvals(ice, stage);
6281 
6282 #if GFX_VER <= 5
6283       dirty |= CROCUS_DIRTY_GEN4_CURBE;
6284 #endif
6285 #if GFX_VER >= 7
6286       struct push_bos push_bos = {};
6287       setup_constant_buffers(ice, batch, stage, &push_bos);
6288 
6289       emit_push_constant_packets(ice, batch, stage, &push_bos);
6290 #endif
6291    }
6292 
6293    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6294       if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6295          if (ice->shaders.prog[stage]) {
6296 #if GFX_VER <= 6
6297             dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6298 #endif
6299             crocus_populate_binding_table(ice, batch, stage, false);
6300             ice->shaders.prog[stage]->bind_bo_offset =
6301                crocus_upload_binding_table(ice, batch,
6302                                            ice->shaders.prog[stage]->surf_offset,
6303                                            ice->shaders.prog[stage]->bt.size_bytes);
6304 
6305 #if GFX_VER >= 7
6306             crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6307                ptr._3DCommandSubOpcode = 38 + stage;
6308                ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6309             }
6310 #endif
6311 #if GFX_VER == 6
6312          } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6313             dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6314             crocus_populate_binding_table(ice, batch, stage, true);
6315             ice->shaders.ff_gs_prog->bind_bo_offset =
6316                crocus_upload_binding_table(ice, batch,
6317                                            ice->shaders.ff_gs_prog->surf_offset,
6318                                            ice->shaders.ff_gs_prog->bt.size_bytes);
6319 #endif
6320          }
6321       }
6322    }
6323 #if GFX_VER <= 6
6324    if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6325       struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6326       if (gs == NULL)
6327          gs = ice->shaders.ff_gs_prog;
6328       crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6329          ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6330          ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6331 #if GFX_VER == 6
6332          ptr.VSBindingTableChange = true;
6333          ptr.PSBindingTableChange = true;
6334          ptr.GSBindingTableChange = gs ? true : false;
6335          ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6336 #endif
6337       }
6338    }
6339 #endif
6340 
6341    bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6342    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6343       if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6344           !ice->shaders.prog[stage])
6345          continue;
6346 
6347       crocus_upload_sampler_states(ice, batch, stage);
6348 
6349       sampler_updates = true;
6350 
6351 #if GFX_VER >= 7
6352       struct crocus_shader_state *shs = &ice->state.shaders[stage];
6353 
6354       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6355          ptr._3DCommandSubOpcode = 43 + stage;
6356          ptr.PointertoVSSamplerState = shs->sampler_offset;
6357       }
6358 #endif
6359    }
6360 
6361    if (sampler_updates) {
6362 #if GFX_VER == 6
6363       struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6364       struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6365       struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6366       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6367          if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6368              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6369               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6370             ptr.VSSamplerStateChange = true;
6371             ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6372          }
6373          if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6374              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6375               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6376             ptr.GSSamplerStateChange = true;
6377             ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6378          }
6379          if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6380              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6381               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6382             ptr.PSSamplerStateChange = true;
6383             ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6384          }
6385       }
6386 #endif
6387    }
6388 
6389 #if GFX_VER >= 6
6390    if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6391       crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6392          ms.PixelLocation =
6393             ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
6394          if (ice->state.framebuffer.samples > 0)
6395             ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6396 #if GFX_VER == 6
6397          INTEL_SAMPLE_POS_4X(ms.Sample);
6398 #elif GFX_VER == 7
6399          switch (ice->state.framebuffer.samples) {
6400          case 1:
6401             INTEL_SAMPLE_POS_1X(ms.Sample);
6402             break;
6403          case 2:
6404             INTEL_SAMPLE_POS_2X(ms.Sample);
6405             break;
6406          case 4:
6407             INTEL_SAMPLE_POS_4X(ms.Sample);
6408             break;
6409          case 8:
6410             INTEL_SAMPLE_POS_8X(ms.Sample);
6411             break;
6412          default:
6413             break;
6414          }
6415 #endif
6416       }
6417    }
6418 
6419    if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6420       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6421          ms.SampleMask = determine_sample_mask(ice);
6422       }
6423    }
6424 #endif
6425 
6426 #if GFX_VER >= 7
6427    struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6428    if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6429       struct elk_stage_prog_data *prog_data = shader->prog_data;
6430       struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6431 
6432       crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6433 
6434          /* Initialize the execution mask with VMask.  Otherwise, derivatives are
6435           * incorrect for subspans where some of the pixels are unlit.  We believe
6436           * the bit just didn't take effect in previous generations.
6437           */
6438          ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
6439 
6440          intel_set_ps_dispatch_state(&ps, &batch->screen->devinfo,
6441                                      wm_prog_data,
6442                                      ice->state.framebuffer.samples,
6443                                      0 /* msaa_flags */);
6444 
6445          ps.DispatchGRFStartRegisterForConstantSetupData0 =
6446             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6447          ps.DispatchGRFStartRegisterForConstantSetupData1 =
6448             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6449          ps.DispatchGRFStartRegisterForConstantSetupData2 =
6450             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6451 
6452          ps.KernelStartPointer0 = KSP(ice, shader) +
6453             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6454          ps.KernelStartPointer1 = KSP(ice, shader) +
6455             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6456          ps.KernelStartPointer2 = KSP(ice, shader) +
6457             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6458 
6459 #if GFX_VERx10 == 75
6460          ps.SampleMask = determine_sample_mask(ice);
6461 #endif
6462          // XXX: WABTPPrefetchDisable, see above, drop at C0
6463          ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6464          ps.FloatingPointMode = prog_data->use_alt_mode;
6465 #if GFX_VER >= 8
6466          ps.MaximumNumberofThreadsPerPSD =
6467             batch->screen->devinfo.max_threads_per_psd - 2;
6468 #else
6469          ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6470 #endif
6471 
6472          ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6473 
6474 #if GFX_VER < 8
6475          ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6476          ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6477          ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6478 #endif
6479          /* From the documentation for this packet:
6480           * "If the PS kernel does not need the Position XY Offsets to
6481           *  compute a Position Value, then this field should be programmed
6482           *  to POSOFFSET_NONE."
6483           *
6484           * "SW Recommendation: If the PS kernel needs the Position Offsets
6485           *  to compute a Position XY value, this field should match Position
6486           *  ZW Interpolation Mode to ensure a consistent position.xyzw
6487           *  computation."
6488           *
6489           * We only require XY sample offsets. So, this recommendation doesn't
6490           * look useful at the moment.  We might need this in future.
6491           */
6492          ps.PositionXYOffsetSelect =
6493             wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6494 
6495          if (wm_prog_data->base.total_scratch) {
6496             struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
6497             ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6498             ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6499          }
6500       }
6501 #if GFX_VER == 8
6502       const struct shader_info *fs_info =
6503          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6504       crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6505          psx.PixelShaderValid = true;
6506          psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6507          psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6508          psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6509          psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6510          psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6511          psx.PixelShaderIsPerSample =
6512             elk_wm_prog_data_is_persample(wm_prog_data, 0);
6513 
6514          /* _NEW_MULTISAMPLE | ELK_NEW_CONSERVATIVE_RASTERIZATION */
6515          if (wm_prog_data->uses_sample_mask)
6516             psx.PixelShaderUsesInputCoverageMask = true;
6517 
6518          psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6519 
6520          /* The stricter cross-primitive coherency guarantees that the hardware
6521           * gives us with the "Accesses UAV" bit set for at least one shader stage
6522           * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6523           * are redundant within the current image, atomic counter and SSBO GL
6524           * APIs, which all have very loose ordering and coherency requirements
6525           * and generally rely on the application to insert explicit barriers when
6526           * a shader invocation is expected to see the memory writes performed by
6527           * the invocations of some previous primitive.  Regardless of the value
6528           * of "UAV coherency required", the "Accesses UAV" bits will implicitly
6529           * cause a DC flush (useless in most cases) when the lowermost stage with
6530           * the bit set finishes execution.
6531           *
6532           * It would be nice to disable it, but in some cases we can't because on
6533           * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6534           * signal (which could be set independently from the coherency mechanism
6535           * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6536           * determine whether the hardware skips execution of the fragment shader
6537           * or not via the ThreadDispatchEnable signal.  However if we know that
6538           * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6539           * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
6540           * difference so we may just disable it here.
6541           *
6542           * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6543           * take into account KillPixels when no depth or stencil writes are
6544           * enabled.  In order for occlusion queries to work correctly with no
6545           * attachments, we need to force-enable here.
6546           *
6547           */
6548          if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6549              !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6550             psx.PixelShaderHasUAV = true;
6551       }
6552 #endif
6553    }
6554 #endif
6555 
6556 #if GFX_VER >= 7
6557    if (ice->state.streamout_active) {
6558       if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6559          for (int i = 0; i < 4; i++) {
6560             struct crocus_stream_output_target *tgt =
6561                (void *) ice->state.so_target[i];
6562 
6563             if (!tgt) {
6564                crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6565                   sob.SOBufferIndex = i;
6566                   sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6567                }
6568                continue;
6569             }
6570             struct crocus_resource *res = (void *) tgt->base.buffer;
6571             uint32_t start = tgt->base.buffer_offset;
6572 #if GFX_VER < 8
6573             uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6574 #endif
6575             crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6576                sob.SOBufferIndex = i;
6577 
6578                sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6579                sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6580 #if GFX_VER < 8
6581                sob.SurfacePitch = tgt->stride;
6582                sob.SurfaceEndAddress = rw_bo(res->bo, end);
6583 #else
6584                sob.SOBufferEnable = true;
6585                sob.StreamOffsetWriteEnable = true;
6586                sob.StreamOutputBufferOffsetAddressEnable = true;
6587 
6588                sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6589                sob.StreamOutputBufferOffsetAddress =
6590                   rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6591                if (tgt->zero_offset) {
6592                   sob.StreamOffset = 0;
6593                   tgt->zero_offset = false;
6594                } else
6595                   sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
6596 #endif
6597             }
6598          }
6599       }
6600 
6601       if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6602          uint32_t *decl_list =
6603             ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6604          crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6605       }
6606 
6607       if (dirty & CROCUS_DIRTY_STREAMOUT) {
6608          const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6609 
6610          uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6611          crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6612             sol.SOFunctionEnable = true;
6613             sol.SOStatisticsEnable = true;
6614 
6615             sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6616                                    !ice->state.prims_generated_query_active;
6617             sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6618          }
6619 
6620          assert(ice->state.streamout);
6621 
6622          crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6623                          GENX(3DSTATE_STREAMOUT_length));
6624       }
6625    } else {
6626       if (dirty & CROCUS_DIRTY_STREAMOUT) {
6627          crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6628       }
6629    }
6630 #endif
6631 #if GFX_VER == 6
6632    if (ice->state.streamout_active) {
6633       if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6634          crocus_emit_so_svbi(ice);
6635       }
6636    }
6637 #endif
6638 
6639    if (dirty & CROCUS_DIRTY_CLIP) {
6640 #if GFX_VER < 6
6641       const struct elk_clip_prog_data *clip_prog_data = (struct elk_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6642       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6643 
6644       uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6645       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6646       _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6647          clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6648          clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6649          clip.SingleProgramFlow = true;
6650          clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6651 
6652          clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6653          clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6654 
6655          clip.DispatchGRFStartRegisterForURBData = 1;
6656          clip.VertexURBEntryReadOffset = 0;
6657          clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6658 
6659          clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6660          clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6661 
6662          if (batch->ice->urb.nr_clip_entries >= 10) {
6663             /* Half of the URB entries go to each thread, and it has to be an
6664              * even number.
6665              */
6666             assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6667 
6668             /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6669              * only 2 threads can output VUEs at a time.
6670              */
6671             clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6672          } else {
6673             assert(batch->ice->urb.nr_clip_entries >= 5);
6674             clip.MaximumNumberofThreads = 1 - 1;
6675          }
6676          clip.VertexPositionSpace = VPOS_NDCSPACE;
6677          clip.UserClipFlagsMustClipEnable = true;
6678          clip.GuardbandClipTestEnable = true;
6679 
6680          clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6681          clip.ScreenSpaceViewportXMin = -1.0;
6682          clip.ScreenSpaceViewportXMax = 1.0;
6683          clip.ScreenSpaceViewportYMin = -1.0;
6684          clip.ScreenSpaceViewportYMax = 1.0;
6685          clip.ViewportXYClipTestEnable = true;
6686          clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6687 
6688 #if GFX_VER == 5 || GFX_VERx10 == 45
6689          clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6690 #else
6691          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6692           * workaround.
6693           */
6694          clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6695 #endif
6696 
6697          clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6698          clip.GuardbandClipTestEnable = true;
6699 
6700          clip.ClipMode = clip_prog_data->clip_mode;
6701 #if GFX_VERx10 == 45
6702          clip.NegativeWClipTestEnable = true;
6703 #endif
6704       }
6705 
6706 #else //if GFX_VER >= 6
6707       struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6708       const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6709       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6710       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6711                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6712       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6713          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6714                     : ice->state.prim_is_points_or_lines);
6715       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6716       crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6717          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6718          if (cso_rast->cso.rasterizer_discard)
6719             cl.ClipMode = CLIPMODE_REJECT_ALL;
6720          else if (ice->state.window_space_position)
6721             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6722          else
6723             cl.ClipMode = CLIPMODE_NORMAL;
6724 
6725          cl.PerspectiveDivideDisable = ice->state.window_space_position;
6726          cl.ViewportXYClipTestEnable = !points_or_lines;
6727 
6728          cl.UserClipDistanceCullTestEnableBitmask =
6729             elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6730 
6731          cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;
6732 
6733          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6734          cl.MaximumVPIndex = ice->state.num_viewports - 1;
6735       }
6736       crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6737                       ARRAY_SIZE(cso_rast->clip));
6738 #endif
6739    }
6740 
6741    if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6742       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6743       const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6744       const struct elk_stage_prog_data *prog_data = &vue_prog_data->base;
6745 #if GFX_VER == 7
6746       if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6747          gen7_emit_vs_workaround_flush(batch);
6748 #endif
6749 
6750 
6751 #if GFX_VER == 6
6752       struct push_bos push_bos = {};
6753       setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6754 
6755       emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6756 #endif
6757 #if GFX_VER >= 6
6758       crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6759 #else
6760       uint32_t *vs_ptr = stream_state(batch,
6761                                       GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6762       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6763       _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6764 #endif
6765       {
6766          INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6767 
6768          vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6769 
6770 #if GFX_VER < 6
6771          vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6772          vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6773          vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6774 
6775          vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6776          vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6777 
6778          vs.MaximumNumberofThreads =
6779             CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6780          vs.StatisticsEnable = false;
6781          vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6782 #endif
6783 #if GFX_VER == 5
6784          /* Force single program flow on Ironlake.  We cannot reliably get
6785           * all applications working without it.  See:
6786           * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6787           *
6788           * The most notable and reliably failing application is the Humus
6789           * demo "CelShading"
6790           */
6791          vs.SingleProgramFlow = true;
6792          vs.SamplerCount = 0; /* hardware requirement */
6793 
6794 #endif
6795 #if GFX_VER >= 8
6796          vs.SIMD8DispatchEnable =
6797             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6798 
6799          vs.UserClipDistanceCullTestEnableBitmask =
6800             vue_prog_data->cull_distance_mask;
6801 #endif
6802       }
6803 
6804 #if GFX_VER == 6
6805       crocus_emit_pipe_control_flush(batch,
6806                                      "post VS const",
6807                                      PIPE_CONTROL_DEPTH_STALL |
6808                                      PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6809                                      PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6810 #endif
6811    }
6812 
6813    if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6814       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6815       bool active = GFX_VER >= 6 && shader;
6816 #if GFX_VER == 6
6817       struct push_bos push_bos = {};
6818       if (shader)
6819          setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6820 
6821       emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6822 #endif
6823 #if GFX_VERx10 == 70
6824    /**
6825     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6826     * Geometry > Geometry Shader > State:
6827     *
6828     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
6829     *     whole fixed function pipeline when the GS enable changes value in
6830     *     the 3DSTATE_GS."
6831     *
6832     * The hardware architects have clarified that in this context "flush the
6833     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6834     * Stall" bit set.
6835     */
6836    if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6837       gen7_emit_cs_stall_flush(batch);
6838 #endif
6839 #if GFX_VER >= 6
6840       crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6841 #else
6842       uint32_t *gs_ptr = stream_state(batch,
6843                                       GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6844       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6845       _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6846 #endif
6847      {
6848 #if GFX_VER >= 6
6849          if (active) {
6850             const struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(shader->prog_data);
6851             const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6852             const struct elk_stage_prog_data *prog_data = &gs_prog_data->base.base;
6853 
6854             INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6855 #if GFX_VER >= 7
6856             gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6857             gs.OutputTopology = gs_prog_data->output_topology;
6858             gs.ControlDataHeaderSize =
6859                gs_prog_data->control_data_header_size_hwords;
6860 
6861             gs.InstanceControl = gs_prog_data->invocations - 1;
6862             gs.DispatchMode = vue_prog_data->dispatch_mode;
6863 
6864             gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6865 
6866             gs.ControlDataFormat = gs_prog_data->control_data_format;
6867 #endif
6868 
6869             /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6870              * Ivy Bridge and Haswell.
6871              *
6872              * On Ivy Bridge, setting this bit causes the vertices of a triangle
6873              * strip to be delivered to the geometry shader in an order that does
6874              * not strictly follow the OpenGL spec, but preserves triangle
6875              * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
6876              * the geometry shader sees triangles:
6877              *
6878              * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6879              *
6880              * (Clearing the bit is even worse, because it fails to preserve
6881              * orientation).
6882              *
6883              * Triangle strips with adjacency are always ordered in a way that preserves
6884              * triangle orientation but does not strictly follow the OpenGL spec,
6885              * regardless of the setting of this bit.
6886              *
6887              * On Haswell, both triangle strips and triangle strips with adjacency
6888              * are always ordered in a way that preserves triangle orientation.
6889              * Setting this bit causes the ordering to strictly follow the OpenGL
6890              * spec.
6891              *
6892              * So in either case we want to set the bit.  Unfortunately on Ivy
6893              * Bridge this will get the order close to correct but not perfect.
6894              */
6895             gs.ReorderMode = TRAILING;
6896             gs.MaximumNumberofThreads =
6897                GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6898                (batch->screen->devinfo.max_gs_threads - 1);
6899 #if GFX_VER < 7
6900             gs.SOStatisticsEnable = true;
6901             if (gs_prog_data->num_transform_feedback_bindings)
6902                gs.SVBIPayloadEnable = ice->state.streamout_active;
6903 
6904             /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
6905              * was previously done for gen6.
6906              *
6907              * TODO: test with both disabled to see if the HW is behaving
6908              * as expected, like in gen7.
6909              */
6910             gs.SingleProgramFlow = true;
6911             gs.VectorMaskEnable = true;
6912 #endif
6913 #if GFX_VER >= 8
6914             gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6915 
6916             if (gs_prog_data->static_vertex_count != -1) {
6917                gs.StaticOutput = true;
6918                gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6919             }
6920             gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6921 
6922             gs.UserClipDistanceCullTestEnableBitmask =
6923                vue_prog_data->cull_distance_mask;
6924 
6925             const int urb_entry_write_offset = 1;
6926             const uint32_t urb_entry_output_length =
6927                DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6928                urb_entry_write_offset;
6929 
6930             gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6931             gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6932 #endif
6933          }
6934 #endif
6935 #if GFX_VER <= 6
6936          if (!active && ice->shaders.ff_gs_prog) {
6937             const struct elk_ff_gs_prog_data *gs_prog_data = (struct elk_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6938             /* In gen6, transform feedback for the VS stage is done with an
6939              * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6940              * for this.
6941              */
6942             gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6943             gs.SingleProgramFlow = true;
6944             gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6945             gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6946 
6947 #if GFX_VER <= 5
6948             gs.GRFRegisterCount =
6949                DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6950             /* ELK_NEW_URB_FENCE */
6951             gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6952             gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6953             gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6954             gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6955 #else
6956             gs.Enable = true;
6957             gs.VectorMaskEnable = true;
6958             gs.SVBIPayloadEnable = true;
6959             gs.SVBIPostIncrementEnable = true;
6960             gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6961             gs.SOStatisticsEnable = true;
6962             gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6963 #endif
6964          }
6965 #endif
6966          if (!active && !ice->shaders.ff_gs_prog) {
6967 #if GFX_VER < 8
6968             gs.DispatchGRFStartRegisterForURBData = 1;
6969 #if GFX_VER >= 7
6970             gs.IncludeVertexHandles = true;
6971 #endif
6972 #endif
6973          }
6974 #if GFX_VER >= 6
6975          gs.StatisticsEnable = true;
6976 #endif
6977 #if GFX_VER == 5 || GFX_VER == 6
6978          gs.RenderingEnabled = true;
6979 #endif
6980 #if GFX_VER <= 5
6981          gs.MaximumVPIndex = ice->state.num_viewports - 1;
6982 #endif
6983       }
6984       ice->state.gs_enabled = active;
6985    }
6986 
6987 #if GFX_VER >= 7
6988    if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6989       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6990 
6991       if (shader) {
6992          const struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(shader->prog_data);
6993          const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6994          const struct elk_stage_prog_data *prog_data = &tcs_prog_data->base.base;
6995 
6996          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
6997             INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
6998             hs.InstanceCount = tcs_prog_data->instances - 1;
6999             hs.IncludeVertexHandles = true;
7000             hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7001          }
7002       } else {
7003          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7004       }
7005 
7006    }
7007 
7008    if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7009       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7010       if (shader) {
7011          const struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(shader->prog_data);
7012          const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
7013          const struct elk_stage_prog_data *prog_data = &tes_prog_data->base.base;
7014 
7015          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7016             te.Partitioning = tes_prog_data->partitioning;
7017             te.OutputTopology = tes_prog_data->output_topology;
7018             te.TEDomain = tes_prog_data->domain;
7019             te.TEEnable = true;
7020             te.MaximumTessellationFactorOdd = 63.0;
7021             te.MaximumTessellationFactorNotOdd = 64.0;
7022          };
7023          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7024             INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7025 
7026             ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7027             ds.ComputeWCoordinateEnable =
7028                tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
7029 
7030 #if GFX_VER >= 8
7031             if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7032                ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7033             ds.UserClipDistanceCullTestEnableBitmask =
7034                vue_prog_data->cull_distance_mask;
7035 #endif
7036          };
7037       } else {
7038          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7039          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7040       }
7041    }
7042 #endif
7043    if (dirty & CROCUS_DIRTY_RASTER) {
7044 
7045 #if GFX_VER < 6
7046       const struct elk_sf_prog_data *sf_prog_data = (struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7047       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7048       uint32_t *sf_ptr = stream_state(batch,
7049                                       GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7050       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7051       _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7052          sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7053          sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7054          sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7055          sf.DispatchGRFStartRegisterForURBData = 3;
7056          sf.VertexURBEntryReadOffset = ELK_SF_URB_ENTRY_READ_OFFSET;
7057          sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7058          sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7059          sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7060          sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7061 
7062          sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7063 
7064          sf.MaximumNumberofThreads =
7065             MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7066 
7067          sf.SpritePointEnable = cso_state->point_quad_rasterization;
7068          sf.DestinationOriginHorizontalBias = 0.5;
7069          sf.DestinationOriginVerticalBias = 0.5;
7070 
7071 	 sf.LineEndCapAntialiasingRegionWidth =
7072             cso_state->line_smooth ? _10pixels : _05pixels;
7073          sf.LastPixelEnable = cso_state->line_last_pixel;
7074          sf.AntialiasingEnable = cso_state->line_smooth;
7075 
7076          sf.LineWidth = get_line_width(cso_state);
7077          sf.PointWidth = cso_state->point_size;
7078          sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7079 #if GFX_VERx10 >= 45
7080          sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7081 #endif
7082          sf.ViewportTransformEnable = true;
7083          sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7084          sf.ScissorRectangleEnable = true;
7085          sf.CullMode = translate_cull_mode(cso_state->cull_face);
7086 
7087          if (cso_state->flatshade_first) {
7088             sf.TriangleFanProvokingVertexSelect = 1;
7089          } else {
7090             sf.TriangleStripListProvokingVertexSelect = 2;
7091             sf.TriangleFanProvokingVertexSelect = 2;
7092             sf.LineStripListProvokingVertexSelect = 1;
7093          }
7094       }
7095 #else
7096       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7097       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7098       crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7099          sf.ViewportTransformEnable = !ice->state.window_space_position;
7100 
7101 #if GFX_VER == 6
7102          const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7103          uint32_t urb_entry_read_length;
7104          uint32_t urb_entry_read_offset;
7105          uint32_t point_sprite_enables;
7106          calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7107                                   &urb_entry_read_length,
7108                                   &urb_entry_read_offset);
7109          sf.VertexURBEntryReadLength = urb_entry_read_length;
7110          sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7111          sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7112          sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7113          sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7114 #endif
7115 
7116 #if GFX_VER >= 6 && GFX_VER < 8
7117          if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7118             sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7119 #endif
7120 #if GFX_VER == 7
7121          if (ice->state.framebuffer.zsbuf) {
7122             struct crocus_resource *zres, *sres;
7123                crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7124                                                   ice->state.framebuffer.zsbuf->texture,
7125                                                   &zres, &sres);
7126             /* ANV thinks that the stencil-ness doesn't matter, this is just
7127              * about handling polygon offset scaling.
7128              */
7129             sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7130          }
7131 #endif
7132       }
7133       crocus_emit_merge(batch, cso->sf, dynamic_sf,
7134                       ARRAY_SIZE(dynamic_sf));
7135 #if GFX_VER == 8
7136       crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7137 #endif
7138 #endif
7139    }
7140 
7141    if (dirty & CROCUS_DIRTY_WM) {
7142       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7143       const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7144       UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF;
7145       UNUSED const struct shader_info *fs_info =
7146          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7147 
7148 #if GFX_VER == 6
7149       struct push_bos push_bos = {};
7150       setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7151 
7152       emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7153 #endif
7154 #if GFX_VER >= 6
7155       crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7156 #else
7157       uint32_t *wm_ptr = stream_state(batch,
7158                                       GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7159 
7160       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7161 
7162       _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7163 #endif
7164      {
7165 #if GFX_VER <= 6
7166          wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7167          wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7168          wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7169 #endif
7170 #if GFX_VER == 4
7171       /* On gen4, we only have one shader kernel */
7172          if (elk_wm_state_has_ksp(wm, 0)) {
7173             wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7174             wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7175             wm.DispatchGRFStartRegisterForConstantSetupData0 =
7176                wm_prog_data->base.dispatch_grf_start_reg;
7177          }
7178 #elif GFX_VER == 5
7179          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7180             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7181          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7182             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7183          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7184             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7185 
7186          wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7187          wm.GRFRegisterCount1 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7188          wm.GRFRegisterCount2 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7189 
7190          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7191             wm_prog_data->base.dispatch_grf_start_reg;
7192 #elif GFX_VER == 6
7193          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7194             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7195          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7196             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7197          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7198             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7199 
7200          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7201            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7202          wm.DispatchGRFStartRegisterForConstantSetupData1 =
7203            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7204          wm.DispatchGRFStartRegisterForConstantSetupData2 =
7205            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7206 #endif
7207 #if GFX_VER <= 5
7208          wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7209          wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7210          wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7211          wm.SetupURBEntryReadOffset = 0;
7212          wm.EarlyDepthTestEnable = true;
7213          wm.LineAntialiasingRegionWidth = _05pixels;
7214          wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7215          wm.DepthCoefficientURBReadOffset = 1;
7216 
7217          if (cso->cso.offset_tri) {
7218             wm.GlobalDepthOffsetEnable = true;
7219 
7220          /* Something weird going on with legacy_global_depth_bias,
7221           * offset_constant, scaling and MRD.  This value passes glean
7222           * but gives some odd results elsewhere (e.g. the
7223           * quad-offset-units test).
7224           */
7225             wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7226             wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7227          }
7228          wm.SamplerStatePointer = ro_bo(batch->state.bo,
7229                                         ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7230 #endif
7231 
7232          wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7233             ice->state.statistics_counters_enabled : 0;
7234 
7235 #if GFX_VER >= 6
7236          wm.LineAntialiasingRegionWidth = _10pixels;
7237          wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7238 
7239          wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7240          wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7241 #endif
7242 #if GFX_VER == 6
7243       wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7244          ice->state.cso_blend->dual_color_blending;
7245       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7246       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7247 
7248       /* From the SNB PRM, volume 2 part 1, page 281:
7249        * "If the PS kernel does not need the Position XY Offsets
7250        * to compute a Position XY value, then this field should be
7251        * programmed to POSOFFSET_NONE."
7252        *
7253        * "SW Recommendation: If the PS kernel needs the Position Offsets
7254        * to compute a Position XY value, this field should match Position
7255        * ZW Interpolation Mode to ensure a consistent position.xyzw
7256        * computation."
7257        * We only require XY sample offsets. So, this recommendation doesn't
7258        * look useful at the moment. We might need this in future.
7259        */
7260       if (wm_prog_data->uses_pos_offset)
7261          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7262       else
7263          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7264 #endif
7265          wm.LineStippleEnable = cso->cso.line_stipple_enable;
7266          wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7267 
7268 #if GFX_VER < 7
7269          if (wm_prog_data->base.use_alt_mode)
7270             wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7271          wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7272          wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7273 #endif
7274 
7275 #if GFX_VER < 8
7276 #if GFX_VER >= 6
7277          wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7278 
7279          struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7280          if (fb->samples > 1) {
7281             if (cso->cso.multisample)
7282                wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7283             else
7284                wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7285 
7286             if (elk_wm_prog_data_is_persample(wm_prog_data, 0))
7287                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7288             else
7289                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7290          } else {
7291             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7292             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7293          }
7294 #endif
7295 
7296          wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7297 
7298          if (wm_prog_data->uses_kill ||
7299              ice->state.cso_zsa->cso.alpha_enabled ||
7300              ice->state.cso_blend->cso.alpha_to_coverage ||
7301              (GFX_VER >= 6 && wm_prog_data->uses_omask))
7302             wm.PixelShaderKillsPixel = true;
7303 
7304          if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7305              writes_depth || wm.PixelShaderKillsPixel ||
7306              (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7307             wm.ThreadDispatchEnable = true;
7308 
7309 #if GFX_VER >= 7
7310          wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7311          wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7312 #else
7313          if (wm_prog_data->base.total_scratch) {
7314             struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7315                                                             MESA_SHADER_FRAGMENT);
7316             wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7317             wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7318          }
7319 
7320          wm.PixelShaderComputedDepth = writes_depth;
7321 
7322 #endif
7323          /* The "UAV access enable" bits are unnecessary on HSW because they only
7324           * seem to have an effect on the HW-assisted coherency mechanism which we
7325           * don't need, and the rasterization-related UAV_ONLY flag and the
7326           * DISPATCH_ENABLE bit can be set independently from it.
7327           * C.f. gen8_upload_ps_extra().
7328           *
7329           * ELK_NEW_FRAGMENT_PROGRAM | ELK_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7330           * _NEW_COLOR
7331           */
7332 #if GFX_VERx10 == 75
7333          if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7334              wm_prog_data->has_side_effects)
7335             wm.PSUAVonly = ON;
7336 #endif
7337 #endif
7338 #if GFX_VER >= 7
7339       /* ELK_NEW_FS_PROG_DATA */
7340          if (wm_prog_data->early_fragment_tests)
7341            wm.EarlyDepthStencilControl = EDSC_PREPS;
7342          else if (wm_prog_data->has_side_effects)
7343            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7344 #endif
7345 #if GFX_VER == 8
7346          /* We could skip this bit if color writes are enabled. */
7347          if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7348             wm.ForceThreadDispatchEnable = ForceON;
7349 #endif
7350       };
7351 
7352 #if GFX_VER <= 5
7353       if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7354          crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7355             clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7356          }
7357          ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7358       }
7359 #endif
7360    }
7361 
7362 #if GFX_VER >= 7
7363    if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7364       crocus_emit_sbe(batch, ice);
7365    }
7366 #endif
7367 
7368 #if GFX_VER >= 8
7369    if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7370       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7371       struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7372       struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7373       struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7374       const struct shader_info *fs_info =
7375          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7376       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7377       crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7378          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7379          pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7380          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7381             (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7382       }
7383       crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7384                         ARRAY_SIZE(cso_blend->ps_blend));
7385    }
7386 #endif
7387 
7388 #if GFX_VER >= 6
7389    if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7390 
7391 #if GFX_VER >= 8
7392       crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7393          set_depth_stencil_bits(ice, &wmds);
7394       }
7395 #else
7396       uint32_t ds_offset;
7397       void *ds_map = stream_state(batch,
7398                                   sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7399                                   64, &ds_offset);
7400       _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7401          set_depth_stencil_bits(ice, &ds);
7402       }
7403 
7404 #if GFX_VER == 6
7405       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7406          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7407          ptr.DEPTH_STENCIL_STATEChange = true;
7408       }
7409 #else
7410       crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7411          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7412       }
7413 #endif
7414 #endif
7415    }
7416 
7417    if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7418       /* Align to 64-byte boundary as per anv. */
7419       uint32_t scissor_offset;
7420       struct pipe_scissor_state *scissor_map = (void *)
7421          stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7422                       64, &scissor_offset);
7423       for (int i = 0; i < ice->state.num_viewports; i++) {
7424          struct pipe_scissor_state scissor;
7425          crocus_fill_scissor_rect(ice, i, &scissor);
7426          scissor_map[i] = scissor;
7427       }
7428 
7429       crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7430          ptr.ScissorRectPointer = scissor_offset;
7431       }
7432    }
7433 #endif
7434 
7435    if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7436       struct isl_device *isl_dev = &batch->screen->isl_dev;
7437 #if GFX_VER >= 6
7438       crocus_emit_depth_stall_flushes(batch);
7439 #endif
7440       void *batch_ptr;
7441       struct crocus_resource *zres, *sres;
7442       struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7443       batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7444 
7445       struct isl_view view = {
7446                               .base_level = 0,
7447                               .levels = 1,
7448                               .base_array_layer = 0,
7449                               .array_len = 1,
7450                               .swizzle = ISL_SWIZZLE_IDENTITY,
7451       };
7452       struct isl_depth_stencil_hiz_emit_info info = {
7453          .view = &view,
7454          .mocs = crocus_mocs(NULL, isl_dev),
7455       };
7456 
7457       if (cso->zsbuf) {
7458          crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7459          struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7460          if (zsbuf->align_res) {
7461             zres = (struct crocus_resource *)zsbuf->align_res;
7462          }
7463          view.base_level = cso->zsbuf->u.tex.level;
7464          view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7465          view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7466 
7467          if (zres) {
7468             view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7469 
7470             info.depth_surf = &zres->surf;
7471             info.depth_address = crocus_command_reloc(batch,
7472                                                       (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7473                                                       zres->bo, 0, RELOC_32BIT);
7474 
7475             info.mocs = crocus_mocs(zres->bo, isl_dev);
7476             view.format = zres->surf.format;
7477 
7478             if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7479                info.hiz_usage = zres->aux.usage;
7480                info.hiz_surf = &zres->aux.surf;
7481                uint64_t hiz_offset = 0;
7482 
7483 #if GFX_VER == 6
7484                /* HiZ surfaces on Sandy Bridge technically don't support
7485                 * mip-mapping.  However, we can fake it by offsetting to the
7486                 * first slice of LOD0 in the HiZ surface.
7487                 */
7488                isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7489                                                    view.base_level, 0, 0,
7490                                                    &hiz_offset, NULL, NULL);
7491 #endif
7492                info.hiz_address = crocus_command_reloc(batch,
7493                                                        (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7494                                                        zres->aux.bo, zres->aux.offset + hiz_offset,
7495                                                        RELOC_32BIT);
7496                info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7497             }
7498          }
7499 
7500 #if GFX_VER >= 6
7501          if (sres) {
7502             view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7503             info.stencil_aux_usage = sres->aux.usage;
7504             info.stencil_surf = &sres->surf;
7505 
7506             uint64_t stencil_offset = 0;
7507 #if GFX_VER == 6
7508             /* Stencil surfaces on Sandy Bridge technically don't support
7509              * mip-mapping.  However, we can fake it by offsetting to the
7510              * first slice of LOD0 in the stencil surface.
7511              */
7512             isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7513                                                 view.base_level, 0, 0,
7514                                                 &stencil_offset, NULL, NULL);
7515 #endif
7516 
7517             info.stencil_address = crocus_command_reloc(batch,
7518                                                         (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7519                                                         sres->bo, stencil_offset, RELOC_32BIT);
7520             if (!zres) {
7521                view.format = sres->surf.format;
7522                info.mocs = crocus_mocs(sres->bo, isl_dev);
7523             }
7524          }
7525 #endif
7526       }
7527       isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7528    }
7529 
7530    /* TODO: Disable emitting this until something uses a stipple. */
7531    if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7532       crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7533          for (int i = 0; i < 32; i++) {
7534             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7535          }
7536       }
7537    }
7538 
7539    if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7540       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7541       crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7542    }
7543 
7544 #if GFX_VER >= 8
7545    if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7546       crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7547          topo.PrimitiveTopologyType =
7548             translate_prim_type(draw->mode, ice->state.patch_vertices);
7549       }
7550    }
7551 #endif
7552 
7553 #if GFX_VER <= 5
7554    if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7555       upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7556                                       ice->shaders.vs_offset, ice->shaders.sf_offset,
7557                                       ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7558       crocus_upload_urb_fence(batch);
7559 
7560       crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7561         cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7562         cs.URBEntryAllocationSize = ice->urb.csize - 1;
7563       }
7564       dirty |= CROCUS_DIRTY_GEN4_CURBE;
7565    }
7566 #endif
7567    if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7568       struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7569       if (fb->width && fb->height) {
7570          crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7571             rect.ClippedDrawingRectangleXMax = fb->width - 1;
7572             rect.ClippedDrawingRectangleYMax = fb->height - 1;
7573          }
7574       }
7575    }
7576 
7577    if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7578       const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7579       const uint32_t count = user_count +
7580          ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7581       uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7582 
7583       if (count) {
7584          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7585 
7586          uint32_t *map =
7587             crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7588          _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7589             vb.DWordLength = (vb_dwords * count + 1) - 2;
7590          }
7591          map += 1;
7592 
7593          uint32_t bound = dynamic_bound;
7594          int i;
7595          while (bound) {
7596             i = u_bit_scan(&bound);
7597             struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7598             struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7599             uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7600 
7601             emit_vertex_buffer_state(batch, i, bo,
7602                                      buf->buffer_offset,
7603                                      ice->state.vb_end[i],
7604                                      ice->state.cso_vertex_elements->strides[i],
7605                                      step_rate,
7606                                      &map);
7607          }
7608          i = user_count;
7609          if (ice->state.vs_uses_draw_params) {
7610             struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7611             emit_vertex_buffer_state(batch, i++,
7612                                      res->bo,
7613                                      ice->draw.draw_params.offset,
7614                                      ice->draw.draw_params.res->width0,
7615                                      0, 0, &map);
7616          }
7617          if (ice->state.vs_uses_derived_draw_params) {
7618             struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7619             emit_vertex_buffer_state(batch, i++,
7620                                      res->bo,
7621                                      ice->draw.derived_draw_params.offset,
7622                                      ice->draw.derived_draw_params.res->width0,
7623                                      0, 0, &map);
7624          }
7625       }
7626    }
7627 
7628    if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7629       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7630       const unsigned entries = MAX2(cso->count, 1);
7631       if (!(ice->state.vs_needs_sgvs_element ||
7632             ice->state.vs_uses_derived_draw_params ||
7633             ice->state.vs_needs_edge_flag)) {
7634          crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7635                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7636       } else {
7637          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7638          const unsigned dyn_count = cso->count +
7639             ice->state.vs_needs_sgvs_element +
7640             ice->state.vs_uses_derived_draw_params;
7641 
7642          crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7643                            &dynamic_ves, ve) {
7644             ve.DWordLength =
7645                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7646          }
7647          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7648                 (cso->count - ice->state.vs_needs_edge_flag) *
7649                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7650          uint32_t *ve_pack_dest =
7651             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7652                          GENX(VERTEX_ELEMENT_STATE_length)];
7653 
7654          if (ice->state.vs_needs_sgvs_element) {
7655             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7656                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7657             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7658                ve.Valid = true;
7659                ve.VertexBufferIndex =
7660                   util_bitcount64(ice->state.bound_vertex_buffers);
7661                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7662                ve.Component0Control = base_ctrl;
7663                ve.Component1Control = base_ctrl;
7664 #if GFX_VER < 8
7665                ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7666                ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7667 #else
7668                ve.Component2Control = VFCOMP_STORE_0;
7669                ve.Component3Control = VFCOMP_STORE_0;
7670 #endif
7671 #if GFX_VER < 5
7672                ve.DestinationElementOffset = cso->count * 4;
7673 #endif
7674             }
7675             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7676          }
7677          if (ice->state.vs_uses_derived_draw_params) {
7678             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7679                ve.Valid = true;
7680                ve.VertexBufferIndex =
7681                   util_bitcount64(ice->state.bound_vertex_buffers) +
7682                   ice->state.vs_uses_draw_params;
7683                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7684                ve.Component0Control = VFCOMP_STORE_SRC;
7685                ve.Component1Control = VFCOMP_STORE_SRC;
7686                ve.Component2Control = VFCOMP_STORE_0;
7687                ve.Component3Control = VFCOMP_STORE_0;
7688 #if GFX_VER < 5
7689                ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7690 #endif
7691             }
7692             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7693          }
7694          if (ice->state.vs_needs_edge_flag) {
7695             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
7696                ve_pack_dest[i] = cso->edgeflag_ve[i];
7697          }
7698 
7699          crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7700                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7701       }
7702 
7703 #if GFX_VER == 8
7704       if (!ice->state.vs_needs_edge_flag) {
7705          crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7706                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7707       } else {
7708          assert(cso->count > 0);
7709          const unsigned edgeflag_index = cso->count - 1;
7710          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7711          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7712                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7713 
7714          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7715             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7716          crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7717             vi.VertexElementIndex = edgeflag_index +
7718                ice->state.vs_needs_sgvs_element +
7719                ice->state.vs_uses_derived_draw_params;
7720          }
7721          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
7722             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7723 
7724          crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7725                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7726       }
7727 #endif
7728    }
7729 
7730 #if GFX_VER == 8
7731    if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7732       const struct elk_vs_prog_data *vs_prog_data = (void *)
7733          ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7734       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7735 
7736       crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7737          if (vs_prog_data->uses_vertexid) {
7738             sgv.VertexIDEnable = true;
7739             sgv.VertexIDComponentNumber = 2;
7740             sgv.VertexIDElementOffset =
7741                cso->count - ice->state.vs_needs_edge_flag;
7742          }
7743 
7744          if (vs_prog_data->uses_instanceid) {
7745             sgv.InstanceIDEnable = true;
7746             sgv.InstanceIDComponentNumber = 3;
7747             sgv.InstanceIDElementOffset =
7748                cso->count - ice->state.vs_needs_edge_flag;
7749          }
7750       }
7751    }
7752 #endif
7753 #if GFX_VERx10 >= 75
7754    if (dirty & CROCUS_DIRTY_GEN75_VF) {
7755       crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7756          if (draw->primitive_restart) {
7757             vf.IndexedDrawCutIndexEnable = true;
7758             vf.CutIndex = draw->restart_index;
7759          }
7760       }
7761    }
7762 #endif
7763 
7764 #if GFX_VER == 8
7765    if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7766       bool enable = want_pma_fix(ice);
7767       genX(crocus_update_pma_fix)(ice, batch, enable);
7768    }
7769 #endif
7770 
7771 #if GFX_VER <= 5
7772    if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7773       gen4_upload_curbe(batch);
7774    }
7775 #endif
7776 }
7777 
/**
 * Upload all state for a single draw and emit the 3DPRIMITIVE command.
 *
 * This is the render-draw entry point: it re-emits dirty context state,
 * (re-)emits 3DSTATE_INDEX_BUFFER when the bound index data changed,
 * programs the _3DPRIM_* MMIO registers for indirect draws (either from an
 * indirect parameter buffer or from a stream-output counter), sets up MI
 * predication for GPU-side multi-draw-indirect counts, and finally emits
 * the 3DPRIMITIVE packet itself.
 *
 * \param ice            the crocus context
 * \param batch          render batch to emit commands into
 * \param draw           gallium draw info (mode, index size, restart, ...)
 * \param drawid_offset  index of this draw within a multi-draw sequence
 *                       (used to predicate against an indirect draw count)
 * \param indirect       indirect draw parameters, or NULL for a direct draw
 * \param sc             start/count/index-bias for a direct draw
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   /* Whether conditional rendering has an active GPU-side predicate; may
    * also be forced on below when using an indirect draw-count buffer.
    */
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* Forbid batch wrapping while we emit draw state, and record that this
    * batch contains a draw.
    */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* User-space index data: copy it into a GPU buffer via the stream
          * uploader.  Only the [start, start+count) range is uploaded, so
          * bias `offset` backwards to make index sc->start address the
          * uploaded bytes.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         /* Only re-emit 3DSTATE_INDEX_BUFFER if the bound resource changed. */
         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same buffer, re-emit if the size or index format
       * changed — and on pre-Haswell, if primitive restart toggled, since
       * the cut-index enable lives in 3DSTATE_INDEX_BUFFER there.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
	   )
	  )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* index_size is 1/2/4 bytes; >> 1 maps that to the 0/1/2
             * BYTE/WORD/DWORD hardware encoding.
             */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what we emitted so identical subsequent draws skip it. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* MMIO registers that 3DPRIMITIVE consumes when IndirectParameterEnable
 * is set; indirect draws load these instead of using inline packet fields.
 */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw-indirect with a GPU-side draw count: predicate each
          * emitted draw on (drawid_offset < draw count) so draws beyond the
          * count become no-ops.
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
            /* Conditional rendering is also active: combine its predicate
             * (kept in GPR15) with the draw-count comparison.
             */
#if GFX_VERx10 >= 75
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                         mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate
             *
             * MI_PREDICATE_RESULT isn't directly writable here, so route
             * the combined value through SRC0/SRC1 and an MI_PREDICATE
             * with LOADINV + SRCS_EQUAL (result = !(pred == 0)).
             */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First draw: result = !(draw_count == 0), i.e. draw iff the
                * count is non-zero.
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      /* Load the 3DPRIM_* registers from the indirect parameter buffer.
       * The dword layout matches the GL indirect command structs: indexed
       * draws carry {count, instanceCount, firstIndex, baseVertex,
       * baseInstance}, non-indexed draws carry {count, instanceCount,
       * first, baseInstance} — hence the different offsets below, and
       * BASE_VERTEX forced to 0 for the non-indexed case.
       */
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
      /* Transform-feedback draw (DrawTransformFeedback): derive the vertex
       * count on the GPU from the stream-output write offset.
       */
#if GFX_VERx10 >= 75
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      /* vertex count = (SO write offset - buffer start offset) / stride */
      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   /* Pre-Gen7 has no indirect draw support; frontends must not request it. */
   assert(!indirect);
#endif

   /* Finally, the draw itself.  For indirect draws the counts/starts come
    * from the _3DPRIM_* registers loaded above; otherwise they're inlined
    * from `draw` and `sc`.
    */
   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8038 
8039 #if GFX_VER >= 7
8040 
/**
 * Upload dirty compute-pipeline state and emit a GPGPU_WALKER dispatch for
 * the given grid.  Compiled only for GFX_VER >= 7 (see the enclosing #if).
 *
 * Handles binding tables, sampler states, MEDIA_VFE_STATE, the CURBE push
 * constant buffer, the interface descriptor, and both direct and indirect
 * (GPU-read) dispatch dimensions.
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct elk_stage_prog_data *prog_data = shader->prog_data;
   struct elk_cs_prog_data *cs_prog_data = (void *) prog_data;
   const struct intel_cs_dispatch_info dispatch =
      elk_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);
   /* Re-upload system values if constants changed and the shader uses them. */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   /* (Re)build and upload the CS binding table; the resulting offset is
    * referenced below by the interface descriptor.
    */
   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         /* NOTE(review): BypassGatewayControl is already set unconditionally
          * above, so this gen8-only assignment looks redundant — confirm
          * against genxml whether it can be dropped.
          */
         vfe.BypassGatewayControl = true;
#endif
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE space in 256-bit (2-register) units, covering per-thread
          * push constants for every thread plus cross-thread constants.
          */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* The only push constant expected here is the subgroup ID (one dword
       * per thread, no cross-thread data).
       */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == ELK_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         elk_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* Poison the buffer first so unwritten bytes are recognizable. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* Re-emit the interface descriptor whenever anything it points at
    * (samplers, binding table, constants, or the shader itself) changed.
    */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel entry point for the SIMD width chosen by dispatch setup. */
      const uint64_t ksp = KSP(ice,shader) + elk_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = elk_encode_slm_size(GFX_VER,
                                                         prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      /* Load the X/Y/Z dispatch dimensions from the indirect buffer into
       * the GPGPU dispatch registers; GPGPU_WALKER reads them from there.
       */
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Build an MI_PREDICATE that skips the walker when any indirect
       * dimension is zero (an empty dispatch).
       */
      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE                           1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable    = grid->indirect != NULL;
      ggw.PredicateEnable            = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize                   = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum  = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension    = grid->grid[0];
      ggw.ThreadGroupIDYDimension    = grid->grid[1];
      ggw.ThreadGroupIDZDimension    = grid->grid[2];
      ggw.RightExecutionMask         = dispatch.right_mask;
      ggw.BottomExecutionMask        = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}
8266 
8267 #endif /* GFX_VER >= 7 */
8268 
8269 /**
8270  * State module teardown.
8271  */
8272 static void
8273 crocus_destroy_state(struct crocus_context *ice)
8274 {
8275    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
8276 
8277    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8278    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8279 
8280    free(ice->state.genx);
8281 
8282    for (int i = 0; i < 4; i++) {
8283       pipe_so_target_reference(&ice->state.so_target[i], NULL);
8284    }
8285 
8286    util_unreference_framebuffer_state(cso);
8287 
8288    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8289       struct crocus_shader_state *shs = &ice->state.shaders[stage];
8290       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8291          pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8292       }
8293       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8294          pipe_resource_reference(&shs->image[i].base.resource, NULL);
8295       }
8296       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8297          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8298       }
8299       for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8300          pipe_sampler_view_reference((struct pipe_sampler_view **)
8301                                      &shs->textures[i], NULL);
8302       }
8303    }
8304 
8305    for (int i = 0; i < 16; i++)
8306       pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8307    pipe_resource_reference(&ice->state.grid_size.res, NULL);
8308 
8309    pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8310 }
8311 
8312 /* ------------------------------------------------------------------- */
8313 
/**
 * Scan all context bindings that may reference the given buffer resource
 * and flag the corresponding dirty bits (or rebind it) so the state that
 * embeds its BO address is re-emitted on the next draw/dispatch.
 *
 * Only the binding points recorded in res->bind_history (and, per stage,
 * res->bind_stages) are visited.  Presumably called after the resource's
 * backing BO has been swapped out — TODO confirm against callers.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   /* Dirty the vertex buffer state if any bound VB slot uses this buffer. */
   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   /* Drop the cached index buffer reference if it points at this BO. */
   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            /* Gen6 streams out through the GS, so dirty its bindings. */
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   /* Per-shader-stage bindings: UBOs, SSBOs, sampler views, and images. */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      /* Skip stages that never bound this resource. */
      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-run the set_shader_buffers hook with the same binding
                * (offset/size/writability preserved) to rebind the buffer.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8432 
8433 /* ------------------------------------------------------------------- */
8434 
8435 static unsigned
8436 flags_to_post_sync_op(uint32_t flags)
8437 {
8438    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8439       return WriteImmediateData;
8440 
8441    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8442       return WritePSDepthCount;
8443 
8444    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8445       return WriteTimestamp;
8446 
8447    return 0;
8448 }
8449 
8450 /*
8451  * Do the given flags have a Post Sync or LRI Post Sync operation?
8452  */
8453 static enum pipe_control_flags
8454 get_post_sync_flags(enum pipe_control_flags flags)
8455 {
8456    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8457             PIPE_CONTROL_WRITE_DEPTH_COUNT |
8458             PIPE_CONTROL_WRITE_TIMESTAMP |
8459             PIPE_CONTROL_LRI_POST_SYNC_OP;
8460 
8461    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8462     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
8463     */
8464    assert(util_bitcount(flags) <= 1);
8465 
8466    return flags;
8467 }
8468 
/* True when this batch targets the compute (GPGPU/media) pipeline.
 * The argument is parenthesized so the macro stays correct for any
 * expression, not just a plain identifier.
 */
#define IS_COMPUTE_PIPELINE(batch) ((batch)->name == CROCUS_BATCH_COMPUTE)
8470 
8471 /**
8472  * Emit a series of PIPE_CONTROL commands, taking into account any
8473  * workarounds necessary to actually accomplish the caller's request.
8474  *
8475  * Unless otherwise noted, spec quotations in this function come from:
8476  *
8477  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8478  * Restrictions for PIPE_CONTROL.
8479  *
8480  * You should not use this function directly.  Use the helpers in
8481  * crocus_pipe_control.c instead, which may split the pipe control further.
8482  */
8483 static void
8484 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8485                              const char *reason,
8486                              uint32_t flags,
8487                              struct crocus_bo *bo,
8488                              uint32_t offset,
8489                              uint64_t imm)
8490 {
8491    UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8492    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8493    UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8494       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8495 
8496    /* Recursive PIPE_CONTROL workarounds --------------------------------
8497     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8498     *
8499     * We do these first because we want to look at the original operation,
8500     * rather than any workarounds we set.
8501     */
8502 
8503    /* "Flush Types" workarounds ---------------------------------------------
8504     * We do these now because they may add post-sync operations or CS stalls.
8505     */
8506 
8507    if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8508       /* Hardware workaround: SNB B-Spec says:
8509        *
8510        *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8511        *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8512        *     required."
8513        */
8514       crocus_emit_post_sync_nonzero_flush(batch);
8515    }
8516 
8517 #if GFX_VER == 8
8518    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8519       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8520        *
8521        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8522        *  'Write PS Depth Count' or 'Write Timestamp'."
8523        */
8524       if (!bo) {
8525          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8526          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8527          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8528          bo = batch->ice->workaround_bo;
8529          offset = batch->ice->workaround_offset;
8530       }
8531    }
8532 #endif
8533 
8534 #if GFX_VERx10 < 75
8535    if (flags & PIPE_CONTROL_DEPTH_STALL) {
8536       /* Project: PRE-HSW / Argument: Depth Stall
8537        *
8538        * "The following bits must be clear:
8539        *  - Render Target Cache Flush Enable ([12] of DW1)
8540        *  - Depth Cache Flush Enable ([0] of DW1)"
8541        */
8542       assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8543                         PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8544    }
8545 #endif
8546    if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8547       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8548        *
8549        *    "This bit must be DISABLED for operations other than writing
8550        *     PS_DEPTH_COUNT."
8551        *
8552        * This seems like nonsense.  An Ivybridge workaround requires us to
8553        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8554        * operation.  Gen8+ requires us to emit depth stalls and depth cache
8555        * flushes together.  So, it's hard to imagine this means anything other
8556        * than "we originally intended this to be used for PS_DEPTH_COUNT".
8557        *
8558        * We ignore the supposed restriction and do nothing.
8559        */
8560    }
8561 
8562    if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8563       /* Project: PRE-HSW / Argument: Depth Cache Flush
8564        *
8565        * "Depth Stall must be clear ([13] of DW1)."
8566        */
8567       assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8568    }
8569 
8570    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8571                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8572       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8573        *
8574        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
8575        *     PS_DEPTH_COUNT or TIMESTAMP queries."
8576        *
8577        * TODO: Implement end-of-pipe checking.
8578        */
8579       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8580                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
8581    }
8582 
8583    if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8584       /* From the PIPE_CONTROL instruction table, bit 1:
8585        *
8586        *    "This bit is ignored if Depth Stall Enable is set.
8587        *     Further, the render cache is not flushed even if Write Cache
8588        *     Flush Enable bit is set."
8589        *
8590        * We assert that the caller doesn't do this combination, to try and
8591        * prevent mistakes.  It shouldn't hurt the GPU, though.
8592        *
8593        * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8594        * and "Render Target Flush" combo is explicitly required for BTI
8595        * update workarounds.
8596        */
8597       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8598                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8599    }
8600 
8601    /* PIPE_CONTROL page workarounds ------------------------------------- */
8602 
8603    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8604       /* From the PIPE_CONTROL page itself:
8605        *
8606        *    "IVB, HSW, BDW
8607        *     Restriction: Pipe_control with CS-stall bit set must be issued
8608        *     before a pipe-control command that has the State Cache
8609        *     Invalidate bit set."
8610        */
8611       flags |= PIPE_CONTROL_CS_STALL;
8612    }
8613 
8614    if ((GFX_VERx10 == 75)) {
8615       /* From the PIPE_CONTROL page itself:
8616        *
8617        *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8618        *     Prior to programming a PIPECONTROL command with any of the RO
8619        *     cache invalidation bit set, program a PIPECONTROL flush command
8620        *     with “CS stall” bit and “HDC Flush” bit set."
8621        *
8622        * TODO: Actually implement this.  What's an HDC Flush?
8623        */
8624    }
8625 
8626    if (flags & PIPE_CONTROL_FLUSH_LLC) {
8627       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8628        *
8629        *    "Project: ALL
8630        *     SW must always program Post-Sync Operation to "Write Immediate
8631        *     Data" when Flush LLC is set."
8632        *
8633        * For now, we just require the caller to do it.
8634        */
8635       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8636    }
8637 
8638    /* "Post-Sync Operation" workarounds -------------------------------- */
8639 
8640    /* Project: All / Argument: Global Snapshot Count Reset [19]
8641     *
8642     * "This bit must not be exercised on any product.
8643     *  Requires stall bit ([20] of DW1) set."
8644     *
8645     * We don't use this, so we just assert that it isn't used.  The
8646     * PIPE_CONTROL instruction page indicates that they intended this
8647     * as a debug feature and don't think it is useful in production,
8648     * but it may actually be usable, should we ever want to.
8649     */
8650    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8651 
8652    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8653                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8654       /* Project: All / Arguments:
8655        *
8656        * - Generic Media State Clear [16]
8657        * - Indirect State Pointers Disable [16]
8658        *
8659        *    "Requires stall bit ([20] of DW1) set."
8660        *
8661        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8662        * State Clear) says:
8663        *
8664        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
8665        *     programmed prior to programming a PIPECONTROL command with "Media
8666        *     State Clear" set in GPGPU mode of operation"
8667        *
8668        * This is a subset of the earlier rule, so there's nothing to do.
8669        */
8670       flags |= PIPE_CONTROL_CS_STALL;
8671    }
8672 
8673    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8674       /* Project: All / Argument: Store Data Index
8675        *
8676        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8677        *  than '0'."
8678        *
8679        * For now, we just assert that the caller does this.  We might want to
8680        * automatically add a write to the workaround BO...
8681        */
8682       assert(non_lri_post_sync_flags != 0);
8683    }
8684 
8685    if (flags & PIPE_CONTROL_SYNC_GFDT) {
8686       /* Project: All / Argument: Sync GFDT
8687        *
8688        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8689        *  than '0' or 0x2520[13] must be set."
8690        *
8691        * For now, we just assert that the caller does this.
8692        */
8693       assert(non_lri_post_sync_flags != 0);
8694    }
8695 
8696    if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8697       /* Project: SNB, IVB, HSW / Argument: TLB inv
8698        *
8699        * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8700        *  must be set to something other than '0'."
8701        *
8702        * For now, we just assert that the caller does this.
8703        */
8704       assert(non_lri_post_sync_flags != 0);
8705    }
8706 
8707    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8708       /* Project: IVB+ / Argument: TLB inv
8709        *
8710        *    "Requires stall bit ([20] of DW1) set."
8711        *
8712        * Also, from the PIPE_CONTROL instruction table:
8713        *
8714        *    "Project: SKL+
8715        *     Post Sync Operation or CS stall must be set to ensure a TLB
8716        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
8717        *     cache to invalidate."
8718        *
8719        * This is not a subset of the earlier rule, so there's nothing to do.
8720        */
8721       flags |= PIPE_CONTROL_CS_STALL;
8722    }
8723 #if GFX_VER == 8
8724    if (IS_COMPUTE_PIPELINE(batch)) {
8725       if (post_sync_flags ||
8726           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8727                     PIPE_CONTROL_DEPTH_STALL |
8728                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
8729                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8730                     PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8731          /* Project: BDW / Arguments:
8732           *
8733           * - LRI Post Sync Operation   [23]
8734           * - Post Sync Op              [15:14]
8735           * - Notify En                 [8]
8736           * - Depth Stall               [13]
8737           * - Render Target Cache Flush [12]
8738           * - Depth Cache Flush         [0]
8739           * - DC Flush Enable           [5]
8740           *
8741           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
8742           *     Workloads."
8743           *
8744           * (The docs have separate table rows for each bit, with essentially
8745           * the same workaround text.  We've combined them here.)
8746           */
8747          flags |= PIPE_CONTROL_CS_STALL;
8748 
8749          /* Also, from the PIPE_CONTROL instruction table, bit 20:
8750           *
8751           *    "Project: BDW
8752           *     This bit must be always set when PIPE_CONTROL command is
8753           *     programmed by GPGPU and MEDIA workloads, except for the cases
8754           *     when only Read Only Cache Invalidation bits are set (State
8755           *     Cache Invalidation Enable, Instruction cache Invalidation
8756           *     Enable, Texture Cache Invalidation Enable, Constant Cache
8757           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
8758           *     need not implemented when FF_DOP_CG is disable via "Fixed
8759           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8760           *
8761           * It sounds like we could avoid CS stalls in some cases, but we
8762           * don't currently bother.  This list isn't exactly the list above,
8763           * either...
8764           */
8765       }
8766    }
8767 #endif
8768    /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8769     *
8770     * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8771     *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8772     *
8773     * Note that the kernel does CS stalls between batches, so we only need
8774     * to count them within a batch.  We currently naively count every 4, and
8775     * don't skip the ones with only read-cache-invalidate bits set.  This
8776     * may or may not be a problem...
8777     */
8778    if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8779       if (flags & PIPE_CONTROL_CS_STALL) {
8780          /* If we're doing a CS stall, reset the counter and carry on. */
8781          batch->pipe_controls_since_last_cs_stall = 0;
8782       }
8783 
8784       /* If this is the fourth pipe control without a CS stall, do one now. */
8785       if (++batch->pipe_controls_since_last_cs_stall == 4) {
8786          batch->pipe_controls_since_last_cs_stall = 0;
8787          flags |= PIPE_CONTROL_CS_STALL;
8788       }
8789    }
8790 
8791    /* "Stall" workarounds ----------------------------------------------
8792     * These have to come after the earlier ones because we may have added
8793     * some additional CS stalls above.
8794     */
8795 
8796    if (flags & PIPE_CONTROL_CS_STALL) {
8797       /* Project: PRE-SKL, VLV, CHV
8798        *
8799        * "[All Stepping][All SKUs]:
8800        *
8801        *  One of the following must also be set:
8802        *
8803        *  - Render Target Cache Flush Enable ([12] of DW1)
8804        *  - Depth Cache Flush Enable ([0] of DW1)
8805        *  - Stall at Pixel Scoreboard ([1] of DW1)
8806        *  - Depth Stall ([13] of DW1)
8807        *  - Post-Sync Operation ([13] of DW1)
8808        *  - DC Flush Enable ([5] of DW1)"
8809        *
8810        * If we don't already have one of those bits set, we choose to add
8811        * "Stall at Pixel Scoreboard".  Some of the other bits require a
8812        * CS stall as a workaround (see above), which would send us into
8813        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
8814        * appears to be safe, so we choose that.
8815        */
8816       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8817                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8818                                PIPE_CONTROL_WRITE_IMMEDIATE |
8819                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
8820                                PIPE_CONTROL_WRITE_TIMESTAMP |
8821                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
8822                                PIPE_CONTROL_DEPTH_STALL |
8823                                PIPE_CONTROL_DATA_CACHE_FLUSH;
8824       if (!(flags & wa_bits))
8825          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8826    }
8827 
8828    /* Emit --------------------------------------------------------------- */
8829 
8830    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8831       fprintf(stderr,
8832               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8833               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8834               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8835               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8836               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8837               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8838               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8839               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8840               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8841               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8842               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8843               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8844               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8845               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8846               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8847               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8848               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8849               "SnapRes" : "",
8850               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8851               "ISPDis" : "",
8852               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8853               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8854               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8855               imm, reason);
8856    }
8857 
8858    crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8859 #if GFX_VER >= 7
8860       pc.LRIPostSyncOperation = NoLRIOperation;
8861       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8862       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8863 #endif
8864 #if GFX_VER >= 6
8865       pc.StoreDataIndex = 0;
8866       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8867       pc.GlobalSnapshotCountReset =
8868          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8869       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8870       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8871       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8872       pc.RenderTargetCacheFlushEnable =
8873          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8874       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8875       pc.StateCacheInvalidationEnable =
8876          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8877       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8878       pc.ConstantCacheInvalidationEnable =
8879          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8880 #else
8881       pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8882 #endif
8883       pc.PostSyncOperation = flags_to_post_sync_op(flags);
8884       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8885       pc.InstructionCacheInvalidateEnable =
8886          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8887       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8888 #if GFX_VER >= 5 || GFX_VERx10 == 45
8889       pc.IndirectStatePointersDisable =
8890          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8891 #endif
8892 #if GFX_VER >= 6
8893       pc.TextureCacheInvalidationEnable =
8894          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8895 #elif GFX_VER == 5 || GFX_VERx10 == 45
8896       pc.TextureCacheFlushEnable =
8897          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8898 #endif
8899       pc.Address = ggtt_bo(bo, offset);
8900       if (GFX_VER < 7 && bo)
8901          pc.DestinationAddressType = DAT_GGTT;
8902       pc.ImmediateData = imm;
8903    }
8904 }
8905 
8906 #if GFX_VER == 6
8907 void
8908 genX(crocus_upload_urb)(struct crocus_batch *batch,
8909                         unsigned vs_size,
8910                         bool gs_present,
8911                         unsigned gs_size)
8912 {
8913    struct crocus_context *ice = batch->ice;
8914    int nr_vs_entries, nr_gs_entries;
8915    int total_urb_size = ice->urb.size * 1024; /* in bytes */
8916    const struct intel_device_info *devinfo = &batch->screen->devinfo;
8917 
8918    /* Calculate how many entries fit in each stage's section of the URB */
8919    if (gs_present) {
8920       nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8921       nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8922    } else {
8923       nr_vs_entries = total_urb_size / (vs_size * 128);
8924       nr_gs_entries = 0;
8925    }
8926 
8927    /* Then clamp to the maximum allowed by the hardware */
8928    if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8929       nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8930 
8931    if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8932       nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8933 
8934    /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8935    ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8936    ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8937 
8938    assert(ice->urb.nr_vs_entries >=
8939           devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8940    assert(ice->urb.nr_vs_entries % 4 == 0);
8941    assert(ice->urb.nr_gs_entries % 4 == 0);
8942    assert(vs_size <= 5);
8943    assert(gs_size <= 5);
8944 
8945    crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8946       urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8947       urb.VSURBEntryAllocationSize = vs_size - 1;
8948 
8949       urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8950       urb.GSURBEntryAllocationSize = gs_size - 1;
8951    };
8952    /* From the PRM Volume 2 part 1, section 1.4.7:
8953     *
8954     *   Because of a urb corruption caused by allocating a previous gsunit’s
8955     *   urb entry to vsunit software is required to send a "GS NULL
8956     *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8957     *   a dummy DRAW call before any case where VS will be taking over GS URB
8958     *   space.
8959     *
8960     * It is not clear exactly what this means ("URB fence" is a command that
8961     * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
8962     * a workaround.
8963     */
8964    if (ice->urb.gs_present && !gs_present)
8965       crocus_emit_mi_flush(batch);
8966    ice->urb.gs_present = gs_present;
8967 }
8968 #endif
8969 
/* vtbl.lost_genx_state hook: re-dirty any generation-specific state after
 * hardware context loss.  Intentionally empty — crocus keeps no extra genX
 * state that needs restoring here.
 */
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}
8974 
/**
 * Emit MI_REPORT_PERF_COUNT, writing a performance counter report tagged
 * with \p report_id into \p bo at \p offset_in_bytes.
 *
 * The command is only emitted on Gen7+; on earlier generations this
 * function is a no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8988 
8989 /**
8990  * From the PRM, Volume 2a:
8991  *
8992  *    "Indirect State Pointers Disable
8993  *
8994  *    At the completion of the post-sync operation associated with this pipe
8995  *    control packet, the indirect state pointers in the hardware are
8996  *    considered invalid; the indirect pointers are not saved in the context.
8997  *    If any new indirect state commands are executed in the command stream
8998  *    while the pipe control is pending, the new indirect state commands are
8999  *    preserved.
9000  *
9001  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9002  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9003  *    commands are only considered as Indirect State Pointers. Once ISP is
9004  *    issued in a context, SW must initialize by programming push constant
9005  *    commands for all the shaders (at least to zero length) before attempting
9006  *    any rendering operation for the same context."
9007  *
9008  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9009  * even though they point to a BO that has been already unreferenced at
9010  * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
9012  * a BO should be pointing to the scratch page). But on CNL, it is
9013  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9014  * instruction.
9015  *
9016  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9017  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9018  * context restore, so the mentioned hang doesn't happen. However,
9019  * software must program push constant commands for all stages prior to
9020  * rendering anything, so we flag them as dirty.
9021  *
9022  * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push constants
9024  * so that it doesn't hang a previous 3DPRIMITIVE.
9025  */
9026 #if GFX_VER >= 7
9027 static void
9028 gen7_emit_isp_disable(struct crocus_batch *batch)
9029 {
9030    crocus_emit_raw_pipe_control(batch, "isp disable",
9031                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9032                                 PIPE_CONTROL_CS_STALL,
9033                                 NULL, 0, 0);
9034    crocus_emit_raw_pipe_control(batch, "isp disable",
9035                                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9036                                 PIPE_CONTROL_CS_STALL,
9037                                 NULL, 0, 0);
9038 
9039    struct crocus_context *ice = batch->ice;
9040    ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9041                               CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9042                               CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9043                               CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9044                               CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9045 }
9046 #endif
9047 
9048 #if GFX_VER >= 7
/**
 * Emit the commands that must end every batch.
 *
 * On Haswell render batches, flush and re-emit 3DSTATE_CC_STATE_POINTERS
 * followed by an RT-flush + CS-stall PIPE_CONTROL (hardware workaround),
 * then — on all Gen7+ batches — disable the indirect state pointers so
 * stale 3DSTATE_CONSTANT_* packets are not replayed on context restore
 * (see gen7_emit_isp_disable and the comment above it).
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   if (batch->name == CROCUS_BATCH_RENDER) {
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   gen7_emit_isp_disable(batch);
}
9065 #endif
9066 
9067 static void
9068 crocus_batch_reset_dirty(struct crocus_batch *batch)
9069 {
9070    /* unreference any index buffer so it get reemitted. */
9071    pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9072 
9073    /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9074     * as the old state batch won't still be available.
9075     */
9076    batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9077       CROCUS_DIRTY_COLOR_CALC_STATE;
9078 
9079    batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9080 
9081    batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9082    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9083    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9084    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9085    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9086    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9087    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9088 
9089    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9090    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9091    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9092    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9093    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9094    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9095 
9096    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9097    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9098    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9099    batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9100 
9101 #if GFX_VER >= 6
9102    /* SCISSOR_STATE */
9103    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9104    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9105    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9106 
9107 #endif
9108 #if GFX_VER <= 5
9109    /* dirty the SF state on gen4/5 */
9110    batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9111    batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9112    batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9113    batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9114 #endif
9115 #if GFX_VER >= 7
9116    /* Streamout dirty */
9117    batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9118    batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9119    batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9120 #endif
9121 }
9122 
9123 #if GFX_VERx10 == 75
/* Return the gallium rasterizer state embedded in the currently bound
 * rasterizer CSO.  Only built on Haswell (see the surrounding #if).
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
9128 #endif
9129 
9130 #if GFX_VER >= 6
9131 static void update_so_strides(struct crocus_context *ice,
9132                               uint16_t *strides)
9133 {
9134    for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9135       struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9136       if (so)
9137          so->stride = strides[i] * sizeof(uint32_t);
9138    }
9139 }
9140 #endif
9141 
9142 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9143                                    int s,
9144                                    uint32_t *clamp_mask)
9145 {
9146 #if GFX_VER < 8
9147    if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9148        samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9149       if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9150          clamp_mask[0] |= (1 << s);
9151       if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9152          clamp_mask[1] |= (1 << s);
9153       if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9154          clamp_mask[2] |= (1 << s);
9155    }
9156 #endif
9157 }
9158 
9159 static void
9160 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9161 {
9162    struct crocus_context *ice = (struct crocus_context *) ctx;
9163 
9164    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9165       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9166       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9167    }
9168 
9169    if (ice->batch_count == 1)
9170       return;
9171 
9172    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9173       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9174       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9175    }
9176 }
9177 
/**
 * Populate the screen's per-generation vtable with the implementations
 * compiled for this GFX_VER.  Entries inside #if blocks only exist on
 * generations where the corresponding command or feature is available.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* This file is compiled once per generation — verify that the device we
    * were handed matches the generation this object was built for.
    */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   /* Compute/GPGPU support starts at Gen7. */
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   /* MI register/immediate load-store commands (HSW+). */
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   /* Shader key population, one hook per stage. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   /* Gen4/5 program the URB via fences rather than 3DSTATE_URB. */
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   /* Stream output (transform feedback) hooks, Gen6+. */
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9236 
/**
 * Wire up the generation-specific pipe_context hooks and initialize the
 * context's state-tracking defaults (everything dirty, empty scissors,
 * and so on).
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO create/bind/delete hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Streamed (set_*) state hooks. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output (transform feedback) hooks, Gen6+. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Start with everything dirty so the first batch uploads all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* MESA_PRIM_COUNT acts as "no primitive seen yet". */
   ice->state.prim_mode = MESA_PRIM_COUNT;
   ice->state.reduced_prim_mode = MESA_PRIM_COUNT;
   /* NOTE(review): calloc result is unchecked; a failed allocation would
    * crash on first ice->state.genx access — confirm whether OOM handling
    * is expected here or deliberately omitted.
    */
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9306