1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file iris_state.c
25 *
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below. First, the CSO hooks
68 * create/bind/track state. The second are the draw-time upload functions,
69 * iris_upload_render_state() and iris_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
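/* As a rough illustration of the "| together their DWords" trick described
 * above (a sketch only -- the names here are illustrative, not helpers that
 * exist in this file):
 *
 *    uint32_t partial_a[N], partial_b[N], final[N];
 *    ... at create time, each CSO packs only the fields it knows about ...
 *    for (int i = 0; i < N; i++)
 *       final[i] = partial_a[i] | partial_b[i];
 *
 * This works as long as the two partial packets set disjoint fields, since
 * unspecified fields pack as zero bits.
 */
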
73 #include <stdio.h>
74 #include <errno.h>
75
76 #ifdef HAVE_VALGRIND
77 #include <valgrind.h>
78 #include <memcheck.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83
84 #include "pipe/p_defines.h"
85 #include "pipe/p_state.h"
86 #include "pipe/p_context.h"
87 #include "pipe/p_screen.h"
88 #include "util/u_dual_blend.h"
89 #include "util/u_inlines.h"
90 #include "util/format/u_format.h"
91 #include "util/u_framebuffer.h"
92 #include "util/u_transfer.h"
93 #include "util/u_upload_mgr.h"
94 #include "util/u_viewport.h"
95 #include "util/u_memory.h"
96 #include "util/u_trace_gallium.h"
97 #include "nir.h"
98 #include "intel/common/intel_aux_map.h"
99 #include "intel/common/intel_compute_slm.h"
100 #include "intel/common/intel_l3_config.h"
101 #include "intel/common/intel_sample_positions.h"
102 #include "intel/ds/intel_tracepoints.h"
103 #include "iris_batch.h"
104 #include "iris_context.h"
105 #include "iris_defines.h"
106 #include "iris_pipe.h"
107 #include "iris_resource.h"
108 #include "iris_utrace.h"
109
110 #include "iris_genx_macros.h"
111
112 #if GFX_VER >= 9
113 #include "intel/compiler/brw_compiler.h"
114 #include "intel/common/intel_genX_state_brw.h"
115 #else
116 #include "intel/compiler/elk/elk_compiler.h"
117 #include "intel/common/intel_genX_state_elk.h"
118 #endif
119
120 #include "intel/common/intel_guardband.h"
121 #include "intel/common/intel_pixel_hash.h"
122 #include "intel/common/intel_tiled_render.h"
123
124 /**
125 * Statically assert that PIPE_* enums match the hardware packets.
126 * (As long as they match, we don't need to translate them.)
127 */
128 UNUSED static void pipe_asserts()
129 {
130 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
131
132 /* pipe_logicop happens to match the hardware. */
133 PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
134 PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
135 PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
136 PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
137 PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
138 PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
139 PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
140 PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
141 PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
142 PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
143 PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
144 PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
145 PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
146 PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
147 PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
148 PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
149
150 /* pipe_blendfactor happens to match the hardware. */
151 PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
152 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
153 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
154 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
155 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
156 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
157 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
158 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
159 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
160 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
161 PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
162 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
163 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
164 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
165 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
166 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
167 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
168 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
169 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
170
171 /* pipe_blend_func happens to match the hardware. */
172 PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
173 PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
174 PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
175 PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
176 PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
177
178 /* pipe_stencil_op happens to match the hardware. */
179 PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
180 PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
181 PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
182 PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
183 PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
184 PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
185 PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
186 PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
187
188 /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
189 PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
190 PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
191 #undef PIPE_ASSERT
192 }
193
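/**
 * Translate a Mesa primitive type to the corresponding 3DPRIM_* hardware
 * topology.  Patch topologies encode their vertex count, so the map stores
 * _3DPRIM_PATCHLIST_1 - 1 and verts_per_patch is added to select
 * _3DPRIM_PATCHLIST_n.
 */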
194 static unsigned
195 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
196 {
197 static const unsigned map[] = {
198 [MESA_PRIM_POINTS] = _3DPRIM_POINTLIST,
199 [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
200 [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
201 [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
202 [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
203 [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
204 [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
205 [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
206 [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
207 [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
208 [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
209 [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
210 [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
211 [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
212 [MESA_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
213 };
214
215 return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
216 }
217
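/**
 * Translate a pipe_compare_func to the hardware COMPAREFUNCTION_* encoding.
 */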
218 static unsigned
219 translate_compare_func(enum pipe_compare_func pipe_func)
220 {
221 static const unsigned map[] = {
222 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
223 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
224 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
225 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
226 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
227 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
228 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
229 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
230 };
231 return map[pipe_func];
232 }
233
234 static unsigned
235 translate_shadow_func(enum pipe_compare_func pipe_func)
236 {
237 /* Gallium specifies the result of shadow comparisons as:
238 *
239 * 1 if ref <op> texel,
240 * 0 otherwise.
241 *
242 * The hardware does:
243 *
244 * 0 if texel <op> ref,
245 * 1 otherwise.
246 *
247 * So we need to flip the operator and also negate.
248 */
249 static const unsigned map[] = {
250 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
251 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
252 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
253 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
254 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
255 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
256 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
257 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
258 };
259 return map[pipe_func];
260 }
261
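/**
 * Translate a pipe_face culling mask to the hardware CULLMODE_* encoding.
 */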
262 static unsigned
263 translate_cull_mode(unsigned pipe_face)
264 {
265 static const unsigned map[4] = {
266 [PIPE_FACE_NONE] = CULLMODE_NONE,
267 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
268 [PIPE_FACE_BACK] = CULLMODE_BACK,
269 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
270 };
271 return map[pipe_face];
272 }
273
274 static unsigned
275 translate_fill_mode(unsigned pipe_polymode)
276 {
277 static const unsigned map[4] = {
278 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
279 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
280 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
281 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
282 };
283 return map[pipe_polymode];
284 }
285
286 static unsigned
287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289 static const unsigned map[] = {
290 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
292 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
293 };
294 return map[pipe_mip];
295 }
296
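/**
 * Translate a pipe_tex_wrap mode to the hardware Texture Coordinate Mode
 * (TCM_*).  Note that legacy PIPE_TEX_WRAP_CLAMP maps to TCM_HALF_BORDER,
 * and the MIRROR_CLAMP/MIRROR_CLAMP_TO_BORDER modes are not supported.
 */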
297 static uint32_t
298 translate_wrap(unsigned pipe_wrap)
299 {
300 static const unsigned map[] = {
301 [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
302 [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
303 [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
304 [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
305 [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
306 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
307
308 /* These are unsupported. */
309 [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
310 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
311 };
312 return map[pipe_wrap];
313 }
314
315 /**
316 * Allocate space for some indirect state.
317 *
318 * Return a pointer to the map (to fill it out) and a state ref (for
319 * referring to the state in GPU commands).
320 */
321 static void *
322 upload_state(struct u_upload_mgr *uploader,
323 struct iris_state_ref *ref,
324 unsigned size,
325 unsigned alignment)
326 {
327 void *p = NULL;
328 u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
329 return p;
330 }
331
332 /**
333 * Stream out temporary/short-lived state.
334 *
335 * This allocates space, pins the BO, and includes the BO address in the
336 * returned offset (which works because all state lives in 32-bit memory
337 * zones).
338 */
339 static uint32_t *
340 stream_state(struct iris_batch *batch,
341 struct u_upload_mgr *uploader,
342 struct pipe_resource **out_res,
343 unsigned size,
344 unsigned alignment,
345 uint32_t *out_offset)
346 {
347 void *ptr = NULL;
348
349 u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
350
351 struct iris_bo *bo = iris_resource_bo(*out_res);
352 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
353
354 iris_record_state_size(batch->state_sizes,
355 bo->address + *out_offset, size);
356
357 *out_offset += iris_bo_offset_from_base_address(bo);
358
359 return ptr;
360 }
361
362 /**
363 * stream_state() + memcpy.
364 */
365 static uint32_t
366 emit_state(struct iris_batch *batch,
367 struct u_upload_mgr *uploader,
368 struct pipe_resource **out_res,
369 const void *data,
370 unsigned size,
371 unsigned alignment)
372 {
373 unsigned offset = 0;
374 uint32_t *map =
375 stream_state(batch, uploader, out_res, size, alignment, &offset);
376
377 if (map)
378 memcpy(map, data, size);
379
380 return offset;
381 }
382
383 /**
384 * Did field 'x' change between 'old_cso' and 'new_cso'?
385 *
386 * (If so, we may want to set some dirty flags.)
387 */
388 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
389 #define cso_changed_memcmp(x) \
390 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
391 #define cso_changed_memcmp_elts(x, n) \
392 (!old_cso || memcmp(old_cso->x, new_cso->x, n * sizeof(old_cso->x[0])) != 0)
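/* A typical use in the bind hooks below is something like:
 *
 *    if (cso_changed(some_field))
 *       ice->state.dirty |= IRIS_DIRTY_SOMETHING;
 *
 * (illustrative names -- see the actual bind functions for the real fields
 * and dirty flags).
 */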
393
394 static void
395 flush_before_state_base_change(struct iris_batch *batch)
396 {
397 /* Wa_14014427904 - We need additional invalidate/flush when
398 * emitting NP state commands with ATS-M in compute mode.
399 */
400 bool atsm_compute = intel_device_info_is_atsm(batch->screen->devinfo) &&
401 batch->name == IRIS_BATCH_COMPUTE;
402 uint32_t np_state_wa_bits =
403 PIPE_CONTROL_CS_STALL |
404 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
405 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
406 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
407 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
408 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
409 PIPE_CONTROL_FLUSH_HDC;
410
411 /* Flush before emitting STATE_BASE_ADDRESS.
412 *
413 * This isn't documented anywhere in the PRM. However, it seems to be
414 * necessary prior to changing the surface state base address. We've
415 * seen issues in Vulkan where we get GPU hangs when using multi-level
416 * command buffers which clear depth, reset state base address, and then
417 * go render stuff.
418 *
419 * Normally, in GL, we would trust the kernel to do sufficient stalls
420 * and flushes prior to executing our batch. However, it doesn't seem
421 * as if the kernel's flushing is always sufficient and we don't want to
422 * rely on it.
423 *
424 * We make this an end-of-pipe sync instead of a normal flush because we
425 * do not know the current status of the GPU. On Haswell at least,
426 * having a fast-clear operation in flight at the same time as a normal
427 * rendering operation can cause hangs. Since the kernel's flushing is
428 * insufficient, we need to ensure that any rendering operations from
429 * other processes are definitely complete before we try to do our own
430 * rendering. It's a bit of a big hammer but it appears to work.
431 *
432 * Render target cache flush before SBA is required by Wa_18039438632.
433 */
434 iris_emit_end_of_pipe_sync(batch,
435 "change STATE_BASE_ADDRESS (flushes)",
436 (atsm_compute ? np_state_wa_bits : 0) |
437 PIPE_CONTROL_RENDER_TARGET_FLUSH |
438 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
439 PIPE_CONTROL_DATA_CACHE_FLUSH);
440 }
441
442 static void
443 flush_after_state_base_change(struct iris_batch *batch)
444 {
445 const struct intel_device_info *devinfo = batch->screen->devinfo;
446 /* After re-setting the surface state base address, we have to do some
447 * cache flushing so that the sampler engine will pick up the new
448 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
449 * Shared Function > 3D Sampler > State > State Caching (page 96):
450 *
451 * Coherency with system memory in the state cache, like the texture
452 * cache is handled partially by software. It is expected that the
453 * command stream or shader will issue Cache Flush operation or
454 * Cache_Flush sampler message to ensure that the L1 cache remains
455 * coherent with system memory.
456 *
457 * [...]
458 *
459 * Whenever the value of the Dynamic_State_Base_Addr,
460 * Surface_State_Base_Addr are altered, the L1 state cache must be
461 * invalidated to ensure the new surface or sampler state is fetched
462 * from system memory.
463 *
464 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
465 * which, according to the PIPE_CONTROL instruction documentation in the
466 * Broadwell PRM:
467 *
468 * Setting this bit is independent of any other bit in this packet.
469 * This bit controls the invalidation of the L1 and L2 state caches
470 * at the top of the pipe i.e. at the parsing time.
471 *
472 * Unfortunately, experimentation seems to indicate that state cache
473 * invalidation through a PIPE_CONTROL does nothing whatsoever with
474 * regard to surface state and binding tables. Instead, it seems that
475 * invalidating the texture cache is what is actually needed.
476 *
477 * XXX: As far as we have been able to determine through
478 * experimentation, flushing the texture cache appears to be
479 * sufficient. The theory here is that all of the sampling/rendering
480 * units cache the binding table in the texture cache. However, we have
481 * yet to be able to actually confirm this.
482 *
483 * Wa_16013000631:
484 *
485 * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
486 * or program pipe control with Instruction cache invalidate post
487 * STATE_BASE_ADDRESS command"
488 */
489 iris_emit_end_of_pipe_sync(batch,
490 "change STATE_BASE_ADDRESS (invalidates)",
491 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
492 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
493 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
494 (intel_needs_workaround(devinfo, 16013000631) ?
495 PIPE_CONTROL_INSTRUCTION_INVALIDATE : 0));
496 }
497
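/* The helpers below use the MI builder to move data between MMIO registers,
 * immediate values, and buffer memory -- copying registers, loading and
 * storing registers from/to memory, and writing immediates to memory.
 */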
498 static void
499 iris_load_register_reg32(struct iris_batch *batch, uint32_t dst,
500 uint32_t src)
501 {
502 struct mi_builder b;
503 mi_builder_init(&b, batch->screen->devinfo, batch);
504 mi_store(&b, mi_reg32(dst), mi_reg32(src));
505 }
506
507 static void
508 iris_load_register_reg64(struct iris_batch *batch, uint32_t dst,
509 uint32_t src)
510 {
511 struct mi_builder b;
512 mi_builder_init(&b, batch->screen->devinfo, batch);
513 mi_store(&b, mi_reg64(dst), mi_reg64(src));
514 }
515
516 static void
517 iris_load_register_imm32(struct iris_batch *batch, uint32_t reg,
518 uint32_t val)
519 {
520 struct mi_builder b;
521 mi_builder_init(&b, batch->screen->devinfo, batch);
522 mi_store(&b, mi_reg32(reg), mi_imm(val));
523 }
524
525 static void
526 iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
527 uint64_t val)
528 {
529 struct mi_builder b;
530 mi_builder_init(&b, batch->screen->devinfo, batch);
531 mi_store(&b, mi_reg64(reg), mi_imm(val));
532 }
533
534 /**
535 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
536 */
537 static void
538 iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
539 struct iris_bo *bo, uint32_t offset)
540 {
541 iris_batch_sync_region_start(batch);
542 struct mi_builder b;
543 mi_builder_init(&b, batch->screen->devinfo, batch);
544 struct mi_value src = mi_mem32(ro_bo(bo, offset));
545 mi_store(&b, mi_reg32(reg), src);
546 iris_batch_sync_region_end(batch);
547 }
548
549 /**
550 * Load a 64-bit value from a buffer into a MMIO register via
551 * two MI_LOAD_REGISTER_MEM commands.
552 */
553 static void
554 iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
555 struct iris_bo *bo, uint32_t offset)
556 {
557 iris_batch_sync_region_start(batch);
558 struct mi_builder b;
559 mi_builder_init(&b, batch->screen->devinfo, batch);
560 struct mi_value src = mi_mem64(ro_bo(bo, offset));
561 mi_store(&b, mi_reg64(reg), src);
562 iris_batch_sync_region_end(batch);
563 }
564
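/**
 * Store a 32-bit MMIO register to buffer memory, optionally only when the
 * MI predicate is set.
 */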
565 static void
566 iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
567 struct iris_bo *bo, uint32_t offset,
568 bool predicated)
569 {
570 iris_batch_sync_region_start(batch);
571 struct mi_builder b;
572 mi_builder_init(&b, batch->screen->devinfo, batch);
573 struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
574 struct mi_value src = mi_reg32(reg);
575 if (predicated)
576 mi_store_if(&b, dst, src);
577 else
578 mi_store(&b, dst, src);
579 iris_batch_sync_region_end(batch);
580 }
581
582 static void
583 iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
584 struct iris_bo *bo, uint32_t offset,
585 bool predicated)
586 {
587 iris_batch_sync_region_start(batch);
588 struct mi_builder b;
589 mi_builder_init(&b, batch->screen->devinfo, batch);
590 struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
591 struct mi_value src = mi_reg64(reg);
592 if (predicated)
593 mi_store_if(&b, dst, src);
594 else
595 mi_store(&b, dst, src);
596 iris_batch_sync_region_end(batch);
597 }
598
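/**
 * Write a 32-bit immediate value to buffer memory.
 */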
599 static void
600 iris_store_data_imm32(struct iris_batch *batch,
601 struct iris_bo *bo, uint32_t offset,
602 uint32_t imm)
603 {
604 iris_batch_sync_region_start(batch);
605 struct mi_builder b;
606 mi_builder_init(&b, batch->screen->devinfo, batch);
607 struct mi_value dst = mi_mem32(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
608 struct mi_value src = mi_imm(imm);
609 mi_store(&b, dst, src);
610 iris_batch_sync_region_end(batch);
611 }
612
613 static void
614 iris_store_data_imm64(struct iris_batch *batch,
615 struct iris_bo *bo, uint32_t offset,
616 uint64_t imm)
617 {
618 iris_batch_sync_region_start(batch);
619 struct mi_builder b;
620 mi_builder_init(&b, batch->screen->devinfo, batch);
621 struct mi_value dst = mi_mem64(rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE));
622 struct mi_value src = mi_imm(imm);
623 mi_store(&b, dst, src);
624 iris_batch_sync_region_end(batch);
625 }
626
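/**
 * Copy data between buffers using a series of MI_COPY_MEM_MEM commands,
 * which operate one DWord at a time.
 */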
627 static void
628 iris_copy_mem_mem(struct iris_batch *batch,
629 struct iris_bo *dst_bo, uint32_t dst_offset,
630 struct iris_bo *src_bo, uint32_t src_offset,
631 unsigned bytes)
632 {
633 /* MI_COPY_MEM_MEM operates on DWords. */
634 assert(bytes % 4 == 0);
635 assert(dst_offset % 4 == 0);
636 assert(src_offset % 4 == 0);
637 iris_batch_sync_region_start(batch);
638
639 for (unsigned i = 0; i < bytes; i += 4) {
640 iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
641 cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
642 IRIS_DOMAIN_OTHER_WRITE);
643 cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
644 }
645 }
646
647 iris_batch_sync_region_end(batch);
648 }
649
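/**
 * Patch an already-packed COMPUTE_WALKER so that its post-sync operation
 * writes a timestamp to the given buffer (Gfx12.5+ only).  The new fields
 * are OR'd into the existing DWords, following the partial-packet approach
 * described at the top of this file.
 */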
650 static void
651 iris_rewrite_compute_walker_pc(struct iris_batch *batch,
652 uint32_t *walker,
653 struct iris_bo *bo,
654 uint32_t offset)
655 {
656 #if GFX_VERx10 >= 125
657 struct iris_screen *screen = batch->screen;
658 struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
659
660 uint32_t dwords[GENX(COMPUTE_WALKER_length)];
661
662 _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
663 cw.body.PostSync.Operation = WriteTimestamp;
664 cw.body.PostSync.DestinationAddress = addr;
665 cw.body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
666 }
667
668 for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
669 walker[i] |= dwords[i];
670 #else
671 unreachable("Unsupported");
672 #endif
673 }
674
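/**
 * Switch between the 3D and GPGPU pipelines via PIPELINE_SELECT, emitting
 * the flushes and invalidations the hardware requires around the
 * transition.  No longer needed on Xe2+.
 */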
675 static void
676 emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
677 {
678 /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
679 #if GFX_VER < 20
680
681 #if GFX_VER >= 8 && GFX_VER < 10
682 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
683 *
684 * Software must clear the COLOR_CALC_STATE Valid field in
685 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
686 * with Pipeline Select set to GPGPU.
687 *
688 * The internal hardware docs recommend the same workaround for Gfx9
689 * hardware too.
690 */
691 if (pipeline == GPGPU)
692 iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
693 #endif
694
695 #if GFX_VER >= 12
696 /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
697 *
698 * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
699 * are flushed through a stalling PIPE_CONTROL command prior to
700 * programming of PIPELINE_SELECT command transitioning Pipeline Select
701 * from 3D to GPGPU/Media.
702 * Software must ensure HDC Pipeline flush and Generic Media State Clear
703 * is issued through a stalling PIPE_CONTROL command prior to programming
704 * of PIPELINE_SELECT command transitioning Pipeline Select from
705 * GPGPU/Media to 3D."
706 *
707 * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
708 * because PIPE was not in MEDIA mode?!
709 */
710 enum pipe_control_flags flags = PIPE_CONTROL_CS_STALL |
711 PIPE_CONTROL_FLUSH_HDC;
712
713 if (pipeline == GPGPU && batch->name == IRIS_BATCH_RENDER) {
714 flags |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
715 PIPE_CONTROL_DEPTH_CACHE_FLUSH;
716 } else {
717 flags |= PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH;
718 }
719 /* Wa_16013063087 - State Cache Invalidate must be issued prior to
720 * PIPELINE_SELECT when switching from 3D to Compute.
721 *
722 * SW must do this by programming of PIPECONTROL with “CS Stall” followed
723 * by a PIPECONTROL with State Cache Invalidate bit set.
724 */
725 if (pipeline == GPGPU &&
726 intel_needs_workaround(batch->screen->devinfo, 16013063087))
727 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
728
729 iris_emit_pipe_control_flush(batch, "PIPELINE_SELECT flush", flags);
730 #else
731 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
732 * PIPELINE_SELECT [DevBWR+]":
733 *
734 * "Project: DEVSNB+
735 *
736 * Software must ensure all the write caches are flushed through a
737 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
738 * command to invalidate read only caches prior to programming
739 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
740 */
741 iris_emit_pipe_control_flush(batch,
742 "workaround: PIPELINE_SELECT flushes (1/2)",
743 PIPE_CONTROL_RENDER_TARGET_FLUSH |
744 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
745 PIPE_CONTROL_DATA_CACHE_FLUSH |
746 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
747 PIPE_CONTROL_CS_STALL);
748
749 iris_emit_pipe_control_flush(batch,
750 "workaround: PIPELINE_SELECT flushes (2/2)",
751 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
752 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
753 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
754 PIPE_CONTROL_INSTRUCTION_INVALIDATE);
755 #endif
756
757 iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
758 #if GFX_VER >= 9
759 sel.MaskBits = GFX_VER == 12 ? 0x13 : 0x3;
760 #if GFX_VER == 12
761 sel.MediaSamplerDOPClockGateEnable = true;
762 #endif /* if GFX_VER == 12 */
763 #endif /* if GFX_VER >= 9 */
764 sel.PipelineSelection = pipeline;
765 }
766 #endif /* if GFX_VER < 20 */
767 }
768
769 UNUSED static void
770 init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
771 {
772 #if GFX_VER == 9
773 /* Project: DevGLK
774 *
775 * "This chicken bit works around a hardware issue with barrier
776 * logic encountered when switching between GPGPU and 3D pipelines.
777 * To workaround the issue, this mode bit should be set after a
778 * pipeline is selected."
779 */
780 iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
781 reg.GLKBarrierMode = value;
782 reg.GLKBarrierModeMask = 1;
783 }
784 #endif
785 }
786
787 static void
788 init_state_base_address(struct iris_batch *batch)
789 {
790 struct isl_device *isl_dev = &batch->screen->isl_dev;
791 uint32_t mocs = isl_mocs(isl_dev, 0, false);
792 flush_before_state_base_change(batch);
793
794 /* We program most base addresses once at context initialization time.
795 * Each base address points at a 4GB memory zone, and never needs to
796 * change. See iris_bufmgr.h for a description of the memory zones.
797 *
798 * The one exception is Surface State Base Address, which needs to be
799 * updated occasionally. See iris_binder.c for the details there.
800 */
801 iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
802 sba.GeneralStateMOCS = mocs;
803 sba.StatelessDataPortAccessMOCS = mocs;
804 sba.DynamicStateMOCS = mocs;
805 sba.IndirectObjectMOCS = mocs;
806 sba.InstructionMOCS = mocs;
807 sba.SurfaceStateMOCS = mocs;
808 #if GFX_VER >= 9
809 sba.BindlessSurfaceStateMOCS = mocs;
810 #endif
811
812 sba.GeneralStateBaseAddressModifyEnable = true;
813 sba.DynamicStateBaseAddressModifyEnable = true;
814 sba.IndirectObjectBaseAddressModifyEnable = true;
815 sba.InstructionBaseAddressModifyEnable = true;
816 sba.GeneralStateBufferSizeModifyEnable = true;
817 sba.DynamicStateBufferSizeModifyEnable = true;
818 sba.SurfaceStateBaseAddressModifyEnable = true;
819 #if GFX_VER >= 11
820 sba.BindlessSamplerStateMOCS = mocs;
821 #endif
822 sba.IndirectObjectBufferSizeModifyEnable = true;
823 sba.InstructionBuffersizeModifyEnable = true;
824
825 sba.InstructionBaseAddress = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
826 sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
827 sba.SurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDER_START);
828
829 sba.GeneralStateBufferSize = 0xfffff;
830 sba.IndirectObjectBufferSize = 0xfffff;
831 sba.InstructionBufferSize = 0xfffff;
832 sba.DynamicStateBufferSize = 0xfffff;
833 #if GFX_VERx10 >= 125
834 sba.L1CacheControl = L1CC_WB;
835 #endif
836 }
837
838 flush_after_state_base_change(batch);
839 }
840
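/**
 * Program the L3 cache partitioning from the given intel_l3_config, using
 * L3CNTLREG before Gfx12 and L3ALLOC on Gfx12+.  Skipped on Xe2+.
 */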
841 static void
842 iris_emit_l3_config(struct iris_batch *batch,
843 const struct intel_l3_config *cfg)
844 {
845 #if GFX_VER < 20
846 assert(cfg || GFX_VER >= 12);
847
848 #if GFX_VER >= 12
849 #define L3_ALLOCATION_REG GENX(L3ALLOC)
850 #define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
851 #else
852 #define L3_ALLOCATION_REG GENX(L3CNTLREG)
853 #define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
854 #endif
855
856 iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
857 #if GFX_VER < 11
858 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
859 #endif
860 #if GFX_VER == 11
861 /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
862 * in L3CNTLREG register. The default setting of the bit is not the
863 * desirable behavior.
864 */
865 reg.ErrorDetectionBehaviorControl = true;
866 reg.UseFullWays = true;
867 #endif
868 if (GFX_VER < 12 || (cfg && cfg->n[INTEL_L3P_ALL] <= 126)) {
869 reg.URBAllocation = cfg->n[INTEL_L3P_URB];
870 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
871 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
872 reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
873 } else {
874 assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
875 cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
876 cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
877 cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
878 #if GFX_VER >= 12
879 reg.L3FullWayAllocationEnable = true;
880 #endif
881 }
882 }
883 #endif /* GFX_VER < 20 */
884 }
885
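/**
 * Compute a URB layout for the currently enabled stages and emit the
 * 3DSTATE_URB_VS..GS (or 3DSTATE_URB_ALLOC_* on Gfx12+) packets.  The
 * resulting configuration is stored in ice->shaders.urb for later use.
 */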
886 void
887 genX(emit_urb_config)(struct iris_batch *batch,
888 bool has_tess_eval,
889 bool has_geometry)
890 {
891 struct iris_screen *screen = batch->screen;
892 struct iris_context *ice = batch->ice;
893
894 intel_get_urb_config(screen->devinfo,
895 screen->l3_config_3d,
896 has_tess_eval,
897 has_geometry,
898 &ice->shaders.urb.cfg,
899 &ice->state.urb_deref_block_size,
900 &ice->shaders.urb.constrained);
901
902 genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
903
904 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
905 #if GFX_VER >= 12
906 iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
907 urb._3DCommandSubOpcode += i;
908 urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
909 urb.VSURBStartingAddressSlice0 = ice->shaders.urb.cfg.start[i];
910 urb.VSURBStartingAddressSliceN = ice->shaders.urb.cfg.start[i];
911 urb.VSNumberofURBEntriesSlice0 = ice->shaders.urb.cfg.entries[i];
912 urb.VSNumberofURBEntriesSliceN = ice->shaders.urb.cfg.entries[i];
913 }
914 #else
915 iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
916 urb._3DCommandSubOpcode += i;
917 urb.VSURBStartingAddress = ice->shaders.urb.cfg.start[i];
918 urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
919 urb.VSNumberofURBEntries = ice->shaders.urb.cfg.entries[i];
920 }
921 #endif
922 }
923 }
924
925 #if GFX_VER == 9
926 static void
927 iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
928 {
929 /* A fixed function pipe flush is required before modifying this field */
930 iris_emit_end_of_pipe_sync(batch, enable ? "enable preemption"
931 : "disable preemption",
932 PIPE_CONTROL_RENDER_TARGET_FLUSH);
933
934 /* enable object level preemption */
935 iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
936 reg.ReplayMode = enable;
937 reg.ReplayModeMask = true;
938 }
939 }
940 #endif
941
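/**
 * Upload the pixel pipe hashing tables, so rasterization work is balanced
 * across asymmetrically fused pixel pipes.  Gfx11, Gfx12.0, and Gfx12.5
 * each use a different table format and enable mechanism.
 */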
942 static void
943 upload_pixel_hashing_tables(struct iris_batch *batch)
944 {
945 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
946 UNUSED struct iris_context *ice = batch->ice;
947 assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
948
949 #if GFX_VER == 11
950 /* Gfx11 hardware has two pixel pipes at most. */
951 for (unsigned i = 2; i < ARRAY_SIZE(devinfo->ppipe_subslices); i++)
952 assert(devinfo->ppipe_subslices[i] == 0);
953
954 if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
955 return;
956
957 unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
958 uint32_t hash_address;
959 struct pipe_resource *tmp = NULL;
960 uint32_t *map =
961 stream_state(batch, ice->state.dynamic_uploader, &tmp,
962 size, 64, &hash_address);
963 pipe_resource_reference(&tmp, NULL);
964
965 const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
966 struct GENX(SLICE_HASH_TABLE) table;
967 intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
968
969 GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
970
971 iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
972 ptr.SliceHashStatePointerValid = true;
973 ptr.SliceHashTableStatePointer = hash_address;
974 }
975
976 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
977 mode.SliceHashingTableEnable = true;
978 }
979
980 #elif GFX_VERx10 == 120
981 /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
982 * present with n active dual subslices.
983 */
984 unsigned ppipes_of[3] = {};
985
986 for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
987 for (unsigned p = 0; p < 3; p++)
988 ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
989 }
990
991 /* Gfx12 has three pixel pipes. */
992 for (unsigned p = 3; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
993 assert(devinfo->ppipe_subslices[p] == 0);
994
995 if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
996 /* All three pixel pipes have the maximum number of active dual
997 * subslices, or there is only one active pixel pipe: Nothing to do.
998 */
999 return;
1000 }
1001
1002 iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
1003 p.SliceHashControl[0] = TABLE_0;
1004
1005 if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1006 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
1007 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1008 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
1009
1010 if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
1011 intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
1012 else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
1013 intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
1014 else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
1015 intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
1016 else
1017 unreachable("Illegal fusing.");
1018 }
1019
1020 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1021 p.SubsliceHashingTableEnable = true;
1022 p.SubsliceHashingTableEnableMask = true;
1023 }
1024
1025 #elif GFX_VERx10 == 125
1026 struct pipe_screen *pscreen = &batch->screen->base;
1027 const unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
1028 const struct pipe_resource tmpl = {
1029 .target = PIPE_BUFFER,
1030 .format = PIPE_FORMAT_R8_UNORM,
1031 .bind = PIPE_BIND_CUSTOM,
1032 .usage = PIPE_USAGE_IMMUTABLE,
1033 .flags = IRIS_RESOURCE_FLAG_DYNAMIC_MEMZONE,
1034 .width0 = size,
1035 .height0 = 1,
1036 .depth0 = 1,
1037 .array_size = 1
1038 };
1039
1040 pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
1041 ice->state.pixel_hashing_tables = pscreen->resource_create(pscreen, &tmpl);
1042
1043 struct iris_resource *res = (struct iris_resource *)ice->state.pixel_hashing_tables;
1044 struct pipe_transfer *transfer = NULL;
1045 uint32_t *map = pipe_buffer_map_range(&ice->ctx, ice->state.pixel_hashing_tables,
1046 0, size, PIPE_MAP_WRITE,
1047 &transfer);
1048
1049 /* Calculate the set of present pixel pipes, and another set of
1050 * present pixel pipes with 2 dual subslices enabled; the latter
1051 * will appear on the hashing table with twice the frequency of
1052 * pixel pipes with a single dual subslice present.
1053 */
1054 uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
1055 for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++) {
1056 if (devinfo->ppipe_subslices[p])
1057 ppipe_mask1 |= (1u << p);
1058 if (devinfo->ppipe_subslices[p] > 1)
1059 ppipe_mask2 |= (1u << p);
1060 }
1061 assert(ppipe_mask1);
1062
1063 struct GENX(SLICE_HASH_TABLE) table;
1064
1065 /* Note that the hardware expects an array with 7 tables, each
1066 * table is intended to specify the pixel pipe hashing behavior for
1067 * every possible slice count between 2 and 8; however, that doesn't
1068 * actually work, among other reasons due to hardware bugs that
1069 * will cause the GPU to erroneously access the table at the wrong
1070 * index in some cases, so in practice all 7 tables need to be
1071 * initialized to the same value.
1072 */
1073 for (unsigned i = 0; i < 7; i++)
1074 intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
1075 table.Entry[i][0]);
1076
1077 GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
1078
1079 pipe_buffer_unmap(&ice->ctx, transfer);
1080
1081 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_NONE);
1082 iris_record_state_size(batch->state_sizes, res->bo->address + res->offset, size);
1083
1084 iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
1085 ptr.SliceHashStatePointerValid = true;
1086 ptr.SliceHashTableStatePointer = iris_bo_offset_from_base_address(res->bo) +
1087 res->offset;
1088 }
1089
1090 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
1091 mode.SliceHashingTableEnable = true;
1092 mode.SliceHashingTableEnableMask = true;
1093 mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
1094 hashing32x32 : NormalMode);
1095 mode.CrossSliceHashingModeMask = -1;
1096 }
1097 #endif
1098 }
1099
1100 static void
1101 iris_alloc_push_constants(struct iris_batch *batch)
1102 {
1103 const struct intel_device_info *devinfo = batch->screen->devinfo;
1104
1105 /* For now, we set a static partitioning of the push constant area,
1106 * assuming that all stages could be in use.
1107 *
1108 * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1109 * see if that improves performance by offering more space to
1110 * the VS/FS when those aren't in use. Also, try dynamically
1111 * enabling/disabling it like i965 does. This would be more
1112 * stalls and may not actually help; we don't know yet.
1113 */
1114
1115 /* Divide as equally as possible with any remainder given to FRAGMENT. */
1116 const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
1117 const unsigned stage_size = push_constant_kb / 5;
1118 const unsigned frag_size = push_constant_kb - 4 * stage_size;
1119
1120 for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1121 iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1122 alloc._3DCommandSubOpcode = 18 + i;
1123 alloc.ConstantBufferOffset = stage_size * i;
1124 alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
1125 }
1126 }
1127
1128 #if GFX_VERx10 == 125
1129 /* DG2: Wa_22011440098
1130 * MTL: Wa_18022330953
1131 *
1132 * In 3D mode, after programming push constant alloc command immediately
1133 * program push constant command(ZERO length) without any commit between
1134 * them.
1135 */
1136 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
1137 /* Update empty push constants for all stages (bitmask = 11111b) */
1138 c.ShaderUpdateEnable = 0x1f;
1139 c.MOCS = iris_mocs(NULL, &batch->screen->isl_dev, 0);
1140 }
1141 #endif
1142 }
1143
1144 #if GFX_VER >= 12
1145 static void
1146 init_aux_map_state(struct iris_batch *batch);
1147 #endif
1148
1149 /* This updates a register. Caller should stall the pipeline as needed. */
1150 static void
1151 iris_disable_rhwo_optimization(struct iris_batch *batch, bool disable)
1152 {
1153 assert(batch->screen->devinfo->verx10 == 120);
1154 #if GFX_VERx10 == 120
1155 iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
1156 c1.RCCRHWOOptimizationDisable = disable;
1157 c1.RCCRHWOOptimizationDisableMask = true;
1158 };
1159 #endif
1160 }
1161
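/**
 * On Xe2+, program STATE_SYSTEM_MEM_FENCE_ADDRESS to point at the global
 * memory fence BO provided by the buffer manager.
 */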
1162 static void
1163 state_system_mem_fence_address_emit(struct iris_batch *batch)
1164 {
1165 #if GFX_VERx10 >= 200
1166 struct iris_screen *screen = batch->screen;
1167 struct iris_address addr = { .bo = iris_bufmgr_get_mem_fence_bo(screen->bufmgr) };
1168 iris_emit_cmd(batch, GENX(STATE_SYSTEM_MEM_FENCE_ADDRESS), mem_fence_addr) {
1169 mem_fence_addr.SystemMemoryFenceAddress = addr;
1170 }
1171 #endif
1172 }
1173
1174 /**
1175 * Upload initial GPU state for any kind of context.
1176 *
1177 * These need to happen for both render and compute.
1178 */
1179 static void
1180 iris_init_common_context(struct iris_batch *batch)
1181 {
1182 #if GFX_VER == 11
1183 iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
1184 reg.HeaderlessMessageforPreemptableContexts = 1;
1185 reg.HeaderlessMessageforPreemptableContextsMask = 1;
1186 }
1187
1188 /* Bit 1 must be set in HALF_SLICE_CHICKEN7. */
1189 iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
1190 reg.EnabledTexelOffsetPrecisionFix = 1;
1191 reg.EnabledTexelOffsetPrecisionFixMask = 1;
1192 }
1193 #endif
1194
1195 /* Select 256B-aligned binding table mode on Icelake through Tigerlake,
1196 * which gives us larger binding table pointers, at the cost of higher
1197 * alignment requirements (bits 18:8 are valid instead of 15:5). When
1198 * using this mode, we have to shift binding table pointers by 3 bits,
1199 * as they're still stored in the same bit-location in the field.
1200 */
1201 #if GFX_VER >= 11 && GFX_VERx10 < 125
1202 iris_emit_reg(batch, GENX(GT_MODE), reg) {
1203 reg.BindingTableAlignment = BTP_18_8;
1204 reg.BindingTableAlignmentMask = true;
1205 }
1206 #endif
1207
1208 #if GFX_VERx10 == 125
1209 /* Even though L3 partial write merging is supposed to be enabled
1210 * by default on Gfx12.5 according to the hardware spec, i915
1211 * appears to accidentally clear the enables during context
1212 * initialization, so make sure to enable them here since partial
1213 * write merging has a large impact on rendering performance.
1214 */
1215 iris_emit_reg(batch, GENX(L3SQCREG5), reg) {
1216 reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
1217 reg.CompressiblePartialWriteMergeEnable = true;
1218 reg.CoherentPartialWriteMergeEnable = true;
1219 reg.CrossTilePartialWriteMergeEnable = true;
1220 }
1221 #endif
1222
1223 state_system_mem_fence_address_emit(batch);
1224 }
1225
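/**
 * If this context was created for protected content, emit the flushes and
 * MI_SET_APPID needed to enable protected memory access for this batch.
 */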
1226 static void
1227 toggle_protected(struct iris_batch *batch)
1228 {
1229 struct iris_context *ice;
1230
1231 if (batch->name == IRIS_BATCH_RENDER)
1232 ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_RENDER]);
1233 else if (batch->name == IRIS_BATCH_COMPUTE)
1234 ice = container_of(batch, struct iris_context, batches[IRIS_BATCH_COMPUTE]);
1235 else
1236 unreachable("unhandled batch");
1237
1238 if (!ice->protected)
1239 return;
1240
1241 #if GFX_VER >= 12
1242 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1243 pc.CommandStreamerStallEnable = true;
1244 pc.RenderTargetCacheFlushEnable = true;
1245 pc.ProtectedMemoryDisable = true;
1246 }
1247 iris_emit_cmd(batch, GENX(MI_SET_APPID), appid) {
1248 /* Default value for single session. */
1249 appid.ProtectedMemoryApplicationID = 0xf;
1250 appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
1251 }
1252 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
1253 pc.CommandStreamerStallEnable = true;
1254 pc.RenderTargetCacheFlushEnable = true;
1255 pc.ProtectedMemoryEnable = true;
1256 }
1257 #else
1258 unreachable("Not supported");
1259 #endif
1260 }
1261
1262 #if GFX_VER >= 20
1263 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE_FAST)
1264 #else
1265 #define _3DSTATE_DRAWING_RECTANGLE GENX(3DSTATE_DRAWING_RECTANGLE)
1266 #endif
1267
1268 /**
1269 * Upload the initial GPU state for a render context.
1270 *
1271 * This sets some invariant state that needs to be programmed a particular
1272 * way, but that we never actually change.
1273 */
1274 static void
1275 iris_init_render_context(struct iris_batch *batch)
1276 {
1277 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1278
1279 iris_batch_sync_region_start(batch);
1280
1281 emit_pipeline_select(batch, _3D);
1282
1283 toggle_protected(batch);
1284
1285 iris_emit_l3_config(batch, batch->screen->l3_config_3d);
1286
1287 init_state_base_address(batch);
1288
1289 iris_init_common_context(batch);
1290
1291 #if GFX_VER >= 9
1292 iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1293 reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1294 reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1295 }
1296 #else
1297 iris_emit_reg(batch, GENX(INSTPM), reg) {
1298 reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1299 reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1300 }
1301 #endif
1302
1303 #if GFX_VER == 9
1304 iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1305 reg.FloatBlendOptimizationEnable = true;
1306 reg.FloatBlendOptimizationEnableMask = true;
1307 reg.MSCRAWHazardAvoidanceBit = true;
1308 reg.MSCRAWHazardAvoidanceBitMask = true;
1309 reg.PartialResolveDisableInVC = true;
1310 reg.PartialResolveDisableInVCMask = true;
1311 }
1312
1313 if (devinfo->platform == INTEL_PLATFORM_GLK)
1314 init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1315 #endif
1316
1317 #if GFX_VER == 11
1318 iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1319 reg.L3DataPartialWriteMergingEnable = true;
1320 reg.ColorZPartialWriteMergingEnable = true;
1321 reg.URBPartialWriteMergingEnable = true;
1322 reg.TCDisable = true;
1323 }
1324
1325 /* The hardware specification recommends disabling repacking for
1326 * compatibility with the decompression mechanism in the display controller.
1327 */
1328 if (devinfo->disable_ccs_repack) {
1329 iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1330 reg.DisableRepackingforCompression = true;
1331 reg.DisableRepackingforCompressionMask = true;
1332 }
1333 }
1334 #endif
1335
1336 #if GFX_VER == 12
1337 iris_emit_reg(batch, GENX(FF_MODE2), reg) {
1338 /* On Alchemist, the FF_MODE2 docs for the GS timer say:
1339 *
1340 * "The timer value must be set to 224."
1341 *
1342 * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
1343 * and that this is necessary to avoid hanging the HS/DS units. It
1344 * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
1345 *
1346 * The HS timer docs also have the same quote for Alchemist. I am
1347 * unaware of a reason it needs to be set to 224 on Tigerlake, but
1348 * we do so for consistency if nothing else.
1349 *
1350 * For the TDS timer value, the docs say:
1351 *
1352 * "For best performance, a value of 4 should be programmed."
1353 *
1354 * i915 also sets it this way on Tigerlake due to workarounds.
1355 *
1356 * The default VS timer appears to be 0, so we leave it at that.
1357 */
1358 reg.GSTimerValue = 224;
1359 reg.HSTimerValue = 224;
1360 reg.TDSTimerValue = 4;
1361 reg.VSTimerValue = 0;
1362 }
1363 #endif
1364
1365 #if INTEL_NEEDS_WA_1508744258
1366 /* The suggested workaround is:
1367 *
1368 * Disable RHWO by setting 0x7010[14] by default except during resolve
1369 * pass.
1370 *
1371 * We implement global disabling of the optimization here and we toggle it
1372 * in iris_resolve_color.
1373 *
1374 * iris_init_compute_context is unmodified because we don't expect to
1375 * access the RCC in the compute context. iris_mcs_partial_resolve is
1376 * unmodified because that pass doesn't use a HW bit to perform the
1377 * resolve (related HSDs specifically call out the RenderTargetResolveType
1378 * field in the 3DSTATE_PS instruction).
1379 */
1380 iris_disable_rhwo_optimization(batch, true);
1381 #endif
1382
1383 #if GFX_VERx10 == 120
1384 /* Wa_1806527549 says to disable the following HiZ optimization when the
1385 * depth buffer is D16_UNORM. We've found the WA to help with more depth
1386 * buffer configurations, however, so we always disable it just to be safe.
1387 */
1388 iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
1389 reg.HZDepthTestLEGEOptimizationDisable = true;
1390 reg.HZDepthTestLEGEOptimizationDisableMask = true;
1391 }
1392 #endif
1393
1394 #if GFX_VERx10 == 125
1395 iris_emit_reg(batch, GENX(CHICKEN_RASTER_2), reg) {
1396 reg.TBIMRBatchSizeOverride = true;
1397 reg.TBIMROpenBatchEnable = true;
1398 reg.TBIMRFastClip = true;
1399 reg.TBIMRBatchSizeOverrideMask = true;
1400 reg.TBIMROpenBatchEnableMask = true;
1401 reg.TBIMRFastClipMask = true;
1402 };
1403 #endif
1404
1405 #if GFX_VER >= 20
1406 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1407 p.DX10OGLBorderModeforYCRCB = true;
1408 p.DX10OGLBorderModeforYCRCBMask = true;
1409 }
1410 #endif
1411
1412 upload_pixel_hashing_tables(batch);
1413
1414 /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1415 * changing it dynamically. We set it to the maximum size here, and
1416 * instead include the render target dimensions in the viewport, so
1417 * viewport extents clipping takes care of pruning stray geometry.
1418 */
1419 iris_emit_cmd(batch, _3DSTATE_DRAWING_RECTANGLE, rect) {
1420 rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1421 rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1422 }
1423
1424 /* Set the initial MSAA sample positions. */
1425 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1426 INTEL_SAMPLE_POS_1X(pat._1xSample);
1427 INTEL_SAMPLE_POS_2X(pat._2xSample);
1428 INTEL_SAMPLE_POS_4X(pat._4xSample);
1429 INTEL_SAMPLE_POS_8X(pat._8xSample);
1430 #if GFX_VER >= 9
1431 INTEL_SAMPLE_POS_16X(pat._16xSample);
1432 #endif
1433 }
1434
1435 /* Use the legacy AA line coverage computation. */
1436 iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1437
1438 /* Disable chromakeying (it's for media) */
1439 iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1440
1441 /* We want regular rendering, not special HiZ operations. */
1442 iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1443
1444 /* No polygon stippling offsets are necessary. */
1445 /* TODO: may need to set an offset for origin-UL framebuffers */
1446 iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1447
1448 #if GFX_VERx10 >= 125
1449 iris_emit_cmd(batch, GENX(3DSTATE_MESH_CONTROL), foo);
1450 iris_emit_cmd(batch, GENX(3DSTATE_TASK_CONTROL), foo);
1451 #endif
1452
1453 #if INTEL_NEEDS_WA_14019857787
1454 iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
1455 p.EnableOOOreadsinRCPB = true;
1456 p.EnableOOOreadsinRCPBMask = true;
1457 }
1458 #endif
1459
1460 iris_alloc_push_constants(batch);
1461
1462 #if GFX_VER >= 12
1463 init_aux_map_state(batch);
1464 #endif
1465
1466 iris_batch_sync_region_end(batch);
1467 }
1468
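/**
 * Upload the initial GPU state for a compute context: pipeline selection,
 * L3 configuration, STATE_BASE_ADDRESS, and the compute-specific
 * workarounds and state setup below.
 */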
1469 static void
1470 iris_init_compute_context(struct iris_batch *batch)
1471 {
1472 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
1473
1474 iris_batch_sync_region_start(batch);
1475
1476 /* Wa_1607854226:
1477 *
1478 * Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1479 */
1480 #if GFX_VERx10 == 120
1481 emit_pipeline_select(batch, _3D);
1482 #else
1483 emit_pipeline_select(batch, GPGPU);
1484 #endif
1485
1486 toggle_protected(batch);
1487
1488 iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1489
1490 init_state_base_address(batch);
1491
1492 iris_init_common_context(batch);
1493
1494 #if GFX_VERx10 == 120
1495 emit_pipeline_select(batch, GPGPU);
1496 #endif
1497
1498 #if GFX_VER == 9
1499 if (devinfo->platform == INTEL_PLATFORM_GLK)
1500 init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1501 #endif
1502
1503 #if GFX_VER >= 12
1504 init_aux_map_state(batch);
1505 #endif
1506
1507 #if GFX_VERx10 >= 125
1508 /* Wa_14015782607 - Issue pipe control with HDC_flush and
1509 * untyped cache flush set to 1 when CCS has NP state update with
1510 * STATE_COMPUTE_MODE.
1511 */
1512 if (intel_needs_workaround(devinfo, 14015782607))
1513 iris_emit_pipe_control_flush(batch, "Wa_14015782607",
1514 PIPE_CONTROL_CS_STALL |
1515 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1516 PIPE_CONTROL_FLUSH_HDC);
1517
1518 /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
1519 * emitting NP state commands with ATS-M in compute mode.
1520 */
1521 if (intel_device_info_is_atsm(devinfo))
1522 iris_emit_pipe_control_flush(batch, "Wa_14014427904/22013045878",
1523 PIPE_CONTROL_CS_STALL |
1524 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
1525 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1526 PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
1527 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1528 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1529 PIPE_CONTROL_FLUSH_HDC);
1530
1531 iris_emit_cmd(batch, GENX(STATE_COMPUTE_MODE), cm) {
1532 #if GFX_VER >= 20
1533 cm.AsyncComputeThreadLimit = ACTL_Max8;
1534 cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1535 cm.ZAsyncThrottlesettings = ZATS_DefertoAsyncComputeThreadLimit;
1536 cm.AsyncComputeThreadLimitMask = 0x7;
1537 cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1538 cm.ZAsyncThrottlesettingsMask = 0x3;
1539 #else
1540 cm.PixelAsyncComputeThreadLimit = PACTL_Max24;
1541 cm.ZPassAsyncComputeThreadLimit = ZPACTL_Max60;
1542 cm.PixelAsyncComputeThreadLimitMask = 0x7;
1543 cm.ZPassAsyncComputeThreadLimitMask = 0x7;
1544 if (intel_device_info_is_mtl_or_arl(devinfo)) {
1545 cm.ZAsyncThrottlesettings = ZATS_DefertoPixelAsyncComputeThreadLimit;
1546 cm.ZAsyncThrottlesettingsMask = 0x3;
1547 }
1548 #endif
1549 }
1550 #endif
1551
1552 #if GFX_VERx10 >= 125
1553 iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
1554 cfe.MaximumNumberofThreads =
1555 devinfo->max_cs_threads * devinfo->subslice_total;
1556 }
1557 #endif
1558
1559 iris_batch_sync_region_end(batch);
1560 }
1561
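/**
 * Upload initial GPU state for the copy context, which needs far less
 * setup than the render or compute contexts: just the aux map (on Gfx12+)
 * and the system memory fence address.
 */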
1562 static void
1563 iris_init_copy_context(struct iris_batch *batch)
1564 {
1565 iris_batch_sync_region_start(batch);
1566
1567 #if GFX_VER >= 12
1568 init_aux_map_state(batch);
1569 #endif
1570
1571 state_system_mem_fence_address_emit(batch);
1572
1573 iris_batch_sync_region_end(batch);
1574 }
1575
1576 struct iris_vertex_buffer_state {
1577 /** The VERTEX_BUFFER_STATE hardware structure. */
1578 uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1579
1580 /** The resource to source vertex data from. */
1581 struct pipe_resource *resource;
1582
1583 int offset;
1584 };
1585
1586 struct iris_depth_buffer_state {
1587 /* Depth/HiZ/Stencil related hardware packets. */
1588 #if GFX_VER < 20
1589 uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1590 GENX(3DSTATE_STENCIL_BUFFER_length) +
1591 GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
1592 GENX(3DSTATE_CLEAR_PARAMS_length)];
1593 #else
1594 uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
1595 GENX(3DSTATE_STENCIL_BUFFER_length) +
1596 GENX(3DSTATE_HIER_DEPTH_BUFFER_length)];
1597 #endif
1598 };
1599
1600 #if INTEL_NEEDS_WA_1808121037
1601 enum iris_depth_reg_mode {
1602 IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
1603 IRIS_DEPTH_REG_MODE_D16_1X_MSAA,
1604 IRIS_DEPTH_REG_MODE_UNKNOWN,
1605 };
1606 #endif
1607
1608 /**
1609 * Generation-specific context state (ice->state.genx->...).
1610 *
1611 * Most state can go in iris_context directly, but these encode hardware
1612 * packets which vary by generation.
1613 */
1614 struct iris_genx_state {
1615 struct iris_vertex_buffer_state vertex_buffers[33];
1616 uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1617
1618 struct iris_depth_buffer_state depth_buffer;
1619
1620 uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1621
1622 #if GFX_VER == 8
1623 bool pma_fix_enabled;
1624 #endif
1625
1626 /* Is object level preemption enabled? */
1627 bool object_preemption;
1628
1629 #if INTEL_NEEDS_WA_1808121037
1630 enum iris_depth_reg_mode depth_reg_mode;
1631 #endif
1632
1633 struct {
1634 #if GFX_VER == 8
1635 struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1636 #endif
1637 } shaders[MESA_SHADER_STAGES];
1638 };
1639
1640 /**
1641 * The pipe->set_blend_color() driver hook.
1642 *
1643 * This corresponds to our COLOR_CALC_STATE.
1644 */
1645 static void
1646 iris_set_blend_color(struct pipe_context *ctx,
1647 const struct pipe_blend_color *state)
1648 {
1649 struct iris_context *ice = (struct iris_context *) ctx;
1650
1651 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1652 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1653 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1654 }
1655
1656 /**
1657 * Gallium CSO for blend state (see pipe_blend_state).
1658 */
1659 struct iris_blend_state {
1660 /** Partial 3DSTATE_PS_BLEND */
1661 uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1662
1663 /** Partial BLEND_STATE */
1664 uint32_t blend_state[GENX(BLEND_STATE_length) +
1665 IRIS_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1666
1667 bool alpha_to_coverage; /* for shader key */
1668
1669 /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1670 uint8_t blend_enables;
1671
1672 /** Bitfield of whether color writes are enabled for RT[i] */
1673 uint8_t color_write_enables;
1674
1675 /** Does RT[0] use dual color blending? */
1676 bool dual_color_blending;
1677
1678 int ps_dst_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1679 int ps_dst_alpha_blend_factor[IRIS_MAX_DRAW_BUFFERS];
1680 };
1681
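/**
 * With alpha-to-one enabled, the alpha used for blending is effectively
 * 1.0, so blend factors that reference SRC1 alpha can be collapsed to ONE
 * (or ZERO for the inverse factor).
 */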
1682 static enum pipe_blendfactor
1683 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1684 {
1685 if (alpha_to_one) {
1686 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1687 return PIPE_BLENDFACTOR_ONE;
1688
1689 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1690 return PIPE_BLENDFACTOR_ZERO;
1691 }
1692
1693 return f;
1694 }
1695
1696 /**
1697 * The pipe->create_blend_state() driver hook.
1698 *
1699 * Translates a pipe_blend_state into iris_blend_state.
1700 */
1701 static void *
1702 iris_create_blend_state(struct pipe_context *ctx,
1703 const struct pipe_blend_state *state)
1704 {
1705 struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
1706 uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1707
1708 cso->blend_enables = 0;
1709 cso->color_write_enables = 0;
1710 STATIC_ASSERT(IRIS_MAX_DRAW_BUFFERS <= 8);
1711
1712 cso->alpha_to_coverage = state->alpha_to_coverage;
1713
1714 bool indep_alpha_blend = false;
1715
1716 for (int i = 0; i < IRIS_MAX_DRAW_BUFFERS; i++) {
1717 const struct pipe_rt_blend_state *rt =
1718 &state->rt[state->independent_blend_enable ? i : 0];
1719
1720 enum pipe_blendfactor src_rgb =
1721 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1722 enum pipe_blendfactor src_alpha =
1723 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1724 enum pipe_blendfactor dst_rgb =
1725 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1726 enum pipe_blendfactor dst_alpha =
1727 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1728
1729 /* Stored separately in cso for dynamic emission. */
1730 cso->ps_dst_blend_factor[i] = (int) dst_rgb;
1731 cso->ps_dst_alpha_blend_factor[i] = (int) dst_alpha;
1732
1733 if (rt->rgb_func != rt->alpha_func ||
1734 src_rgb != src_alpha || dst_rgb != dst_alpha)
1735 indep_alpha_blend = true;
1736
1737 if (rt->blend_enable)
1738 cso->blend_enables |= 1u << i;
1739
1740 if (rt->colormask)
1741 cso->color_write_enables |= 1u << i;
1742
1743 iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1744 be.LogicOpEnable = state->logicop_enable;
1745 be.LogicOpFunction = state->logicop_func;
1746
1747 be.PreBlendSourceOnlyClampEnable = false;
1748 be.ColorClampRange = COLORCLAMP_RTFORMAT;
1749 be.PreBlendColorClampEnable = true;
1750 be.PostBlendColorClampEnable = true;
1751
1752 be.ColorBufferBlendEnable = rt->blend_enable;
1753
1754 be.ColorBlendFunction = rt->rgb_func;
1755 be.AlphaBlendFunction = rt->alpha_func;
1756
1757 /* The casts prevent warnings about implicit enum type conversions. */
1758 be.SourceBlendFactor = (int) src_rgb;
1759 be.SourceAlphaBlendFactor = (int) src_alpha;
1760
1761 be.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
1762 be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1763 be.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
1764 be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1765 }
1766 blend_entry += GENX(BLEND_STATE_ENTRY_length);
1767 }
1768
1769 iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1770 /* pb.HasWriteableRT is filled in at draw time.
1771 * pb.AlphaTestEnable is filled in at draw time.
1772 *
1773 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1774 * setting it when dual color blending without an appropriate shader.
1775 */
1776
1777 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1778 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1779
1780 /* The casts prevent warnings about implicit enum type conversions. */
1781 pb.SourceBlendFactor =
1782 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1783 pb.SourceAlphaBlendFactor =
1784 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1785 }
1786
1787 iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1788 bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1789 bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1790 bs.AlphaToOneEnable = state->alpha_to_one;
1791 bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage_dither;
1792 bs.ColorDitherEnable = state->dither;
1793 /* bs.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1794 }
1795
1796 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1797
1798 return cso;
1799 }
1800
1801 /**
1802 * The pipe->bind_blend_state() driver hook.
1803 *
1804 * Bind a blending CSO and flag related dirty bits.
1805 */
1806 static void
1807 iris_bind_blend_state(struct pipe_context *ctx, void *state)
1808 {
1809 struct iris_context *ice = (struct iris_context *) ctx;
1810 struct iris_blend_state *cso = state;
1811
1812 ice->state.cso_blend = cso;
1813
1814 ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1815 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1816 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1817
1818 if (GFX_VER == 8)
1819 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1820 }
1821
1822 /**
1823 * Return true if the FS writes to any color outputs which are not disabled
1824 * via color masking.
1825 */
1826 static bool
1827 has_writeable_rt(const struct iris_blend_state *cso_blend,
1828 const struct shader_info *fs_info)
1829 {
1830 if (!fs_info)
1831 return false;
1832
1833 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1834
1835 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1836 rt_outputs = (1 << IRIS_MAX_DRAW_BUFFERS) - 1;
1837
1838 return cso_blend->color_write_enables & rt_outputs;
1839 }
1840
1841 /**
1842 * Gallium CSO for depth, stencil, and alpha testing state.
1843 */
1844 struct iris_depth_stencil_alpha_state {
1845 /** Partial 3DSTATE_WM_DEPTH_STENCIL. */
1846 uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
1847
1848 #if GFX_VER >= 12
1849 uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
1850 #endif
1851
1852 /** Outbound to BLEND_STATE, 3DSTATE_PS_BLEND, COLOR_CALC_STATE. */
1853 unsigned alpha_enabled:1;
1854 unsigned alpha_func:3; /**< PIPE_FUNC_x */
1855 float alpha_ref_value; /**< reference value */
1856
1857 /** Outbound to resolve and cache set tracking. */
1858 bool depth_writes_enabled;
1859 bool stencil_writes_enabled;
1860
1861 /** Outbound to Gfx8-9 PMA stall equations */
1862 bool depth_test_enabled;
1863
1864 /** Tracking state of DS writes for Wa_18019816803. */
1865 bool ds_write_state;
1866 };
1867
1868 /**
1869 * The pipe->create_depth_stencil_alpha_state() driver hook.
1870 *
1871 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1872 * testing state since we need pieces of it in a variety of places.
1873 */
1874 static void *
1875 iris_create_zsa_state(struct pipe_context *ctx,
1876 const struct pipe_depth_stencil_alpha_state *state)
1877 {
1878 struct iris_depth_stencil_alpha_state *cso =
1879 malloc(sizeof(struct iris_depth_stencil_alpha_state));
1880
1881 bool two_sided_stencil = state->stencil[1].enabled;
1882
1883 bool depth_write_enabled = false;
1884 bool stencil_write_enabled = false;
1885
1886 /* Depth writes enabled? */
1887 if (state->depth_writemask &&
1888 ((!state->depth_enabled) ||
1889 ((state->depth_func != PIPE_FUNC_NEVER) &&
1890 (state->depth_func != PIPE_FUNC_EQUAL))))
1891 depth_write_enabled = true;
1892
1893 bool stencil_all_keep =
1894 state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1895 state->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
1896 state->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
1897 (!two_sided_stencil ||
1898 (state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
1899 state->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
1900 state->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP));
1901
1902 bool stencil_mask_zero =
1903 state->stencil[0].writemask == 0 ||
1904 (!two_sided_stencil || state->stencil[1].writemask == 0);
1905
1906 bool stencil_func_never =
1907 state->stencil[0].func == PIPE_FUNC_NEVER &&
1908 state->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
1909 (!two_sided_stencil ||
1910 (state->stencil[1].func == PIPE_FUNC_NEVER &&
1911 state->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP));
1912
1913 /* Stencil writes enabled? */
1914 if (state->stencil[0].writemask != 0 ||
1915 ((two_sided_stencil && state->stencil[1].writemask != 0) &&
1916 (!stencil_all_keep &&
1917 !stencil_mask_zero &&
1918 !stencil_func_never)))
1919 stencil_write_enabled = true;
1920
1921 cso->ds_write_state = depth_write_enabled || stencil_write_enabled;
1922
1923 cso->alpha_enabled = state->alpha_enabled;
1924 cso->alpha_func = state->alpha_func;
1925 cso->alpha_ref_value = state->alpha_ref_value;
1926 cso->depth_writes_enabled = state->depth_writemask;
1927 cso->depth_test_enabled = state->depth_enabled;
1928 cso->stencil_writes_enabled =
1929 state->stencil[0].writemask != 0 ||
1930 (two_sided_stencil && state->stencil[1].writemask != 0);
1931
1932 /* gallium frontends need to optimize away EQUAL writes for us. */
1933 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1934
1935 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1936 wmds.StencilFailOp = state->stencil[0].fail_op;
1937 wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1938 wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1939 wmds.StencilTestFunction =
1940 translate_compare_func(state->stencil[0].func);
1941 wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1942 wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1943 wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1944 wmds.BackfaceStencilTestFunction =
1945 translate_compare_func(state->stencil[1].func);
1946 wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1947 wmds.DoubleSidedStencilEnable = two_sided_stencil;
1948 wmds.StencilTestEnable = state->stencil[0].enabled;
1949 wmds.StencilBufferWriteEnable =
1950 state->stencil[0].writemask != 0 ||
1951 (two_sided_stencil && state->stencil[1].writemask != 0);
1952 wmds.DepthTestEnable = state->depth_enabled;
1953 wmds.DepthBufferWriteEnable = state->depth_writemask;
1954 wmds.StencilTestMask = state->stencil[0].valuemask;
1955 wmds.StencilWriteMask = state->stencil[0].writemask;
1956 wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1957 wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1958 /* wmds.[Backface]StencilReferenceValue are merged later */
1959 #if GFX_VER >= 12
1960 wmds.StencilReferenceValueModifyDisable = true;
1961 #endif
1962 }
1963
1964 #if GFX_VER >= 12
1965 iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1966 depth_bounds.DepthBoundsTestValueModifyDisable = false;
1967 depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1968 depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1969 depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1970 depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1971 }
1972 #endif
1973
1974 return cso;
1975 }
1976
1977 /**
1978 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1979 *
1980 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1981 */
1982 static void
1983 iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1984 {
1985 struct iris_context *ice = (struct iris_context *) ctx;
1986 struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1987 struct iris_depth_stencil_alpha_state *new_cso = state;
1988
1989 if (new_cso) {
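   /* cso_changed() checks whether a field differs between the old and
    * new CSOs, so we only flag the state that actually changed.
    */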
1990 if (cso_changed(alpha_ref_value))
1991 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1992
1993 if (cso_changed(alpha_enabled))
1994 ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
1995
1996 if (cso_changed(alpha_func))
1997 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1998
1999 if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
2000 ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
2001
2002 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
2003 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
2004
2005 /* If the DS write state changed, we need to flag it dirty. */
2006 if (!old_cso || (ice->state.ds_write_state != new_cso->ds_write_state)) {
2007 ice->state.dirty |= IRIS_DIRTY_DS_WRITE_ENABLE;
2008 ice->state.ds_write_state = new_cso->ds_write_state;
2009 }
2010
2011 #if GFX_VER >= 12
2012 if (cso_changed(depth_bounds))
2013 ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
2014 #endif
2015 }
2016
2017 ice->state.cso_zsa = new_cso;
2018 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2019 ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
2020 ice->state.stage_dirty |=
2021 ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
2022
2023 if (GFX_VER == 8)
2024 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
2025 }
2026
2027 #if GFX_VER == 8
2028 static bool
2029 want_pma_fix(struct iris_context *ice)
2030 {
2031 UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
2032 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2033 const struct iris_fs_data *fs_data =
2034 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
2035 const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
2036 const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
2037 const struct iris_blend_state *cso_blend = ice->state.cso_blend;
2038
2039 /* In very specific combinations of state, we can instruct Gfx8-9 hardware
2040 * to avoid stalling at the pixel mask array. The state equations are
2041 * documented in these places:
2042 *
2043 * - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE
2044 * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
2045 *
2046 * Both equations share some common elements:
2047 *
2048 * no_hiz_op =
2049 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2050 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2051 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2052 * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
2053 *
2054 * killpixels =
2055 * 3DSTATE_WM::ForceKillPix != ForceOff &&
2056 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2057 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2058 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2059 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
2060 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2061 *
2062 * (Technically the stencil PMA treats ForceKillPix differently,
2063 * but I think this is a documentation oversight, and we don't
2064 * ever use it in this way, so it doesn't matter).
2065 *
2066 * common_pma_fix =
2067 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
2068 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
2069 * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2070 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2071 * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
2072 * 3DSTATE_PS_EXTRA::PixelShaderValid &&
2073 * no_hiz_op
2074 *
2075 * These are always true:
2076 *
2077 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
2078 * 3DSTATE_PS_EXTRA::PixelShaderValid
2079 *
2080 * Also, we never use the normal drawing path for HiZ ops; these are true:
2081 *
2082 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
2083 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
2084 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
2085 * 3DSTATE_WM_HZ_OP::StencilBufferClear)
2086 *
2087 * This happens sometimes:
2088 *
2089 * 3DSTATE_WM::ForceThreadDispatch != 1
2090 *
2091 * However, we choose to ignore it as it either agrees with the signal
2092 * (dispatch was already enabled, so nothing out of the ordinary), or
2093 * there are no framebuffer attachments (so no depth or HiZ anyway,
2094 * meaning the PMA signal will already be disabled).
2095 */
2096
2097 if (!cso_fb->zsbuf)
2098 return false;
2099
2100 struct iris_resource *zres, *sres;
2101 iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);
2102
2103 /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
2104 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
2105 */
2106 if (!zres ||
2107 !iris_resource_level_has_hiz(devinfo, zres, cso_fb->zsbuf->u.tex.level))
2108 return false;
2109
2110 /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
2111 if (fs_data->early_fragment_tests)
2112 return false;
2113
2114 /* 3DSTATE_WM::ForceKillPix != ForceOff &&
2115 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2116 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2117 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2118 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
2119 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2120 */
2121 bool killpixels = fs_data->uses_kill || fs_data->uses_omask ||
2122 cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;
2123
2124 /* The Gfx8 depth PMA equation becomes:
2125 *
2126 * depth_writes =
2127 * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
2128 * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
2129 *
2130 * stencil_writes =
2131 * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
2132 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
2133 * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
2134 *
2135 * Z_PMA_OPT =
2136 * common_pma_fix &&
2137 * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
2138 * ((killpixels && (depth_writes || stencil_writes)) ||
2139 * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
2140 *
2141 */
2142 if (!cso_zsa->depth_test_enabled)
2143 return false;
2144
2145 return fs_data->computed_depth_mode != PSCDEPTH_OFF ||
2146 (killpixels && (cso_zsa->depth_writes_enabled ||
2147 (sres && cso_zsa->stencil_writes_enabled)));
2148 }
2149 #endif
2150
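/**
 * Enable or disable the Gfx8 depth PMA stall optimization via CACHE_MODE_1,
 * emitting the pipe control flushes that the register write requires.
 * This is a no-op on other generations.
 */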
2151 void
2152 genX(update_pma_fix)(struct iris_context *ice,
2153 struct iris_batch *batch,
2154 bool enable)
2155 {
2156 #if GFX_VER == 8
2157 struct iris_genx_state *genx = ice->state.genx;
2158
2159 if (genx->pma_fix_enabled == enable)
2160 return;
2161
2162 genx->pma_fix_enabled = enable;
2163
2164 /* According to the Broadwell PIPE_CONTROL documentation, software should
2165 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
2166 * prior to the LRI. If stencil buffer writes are enabled, then a Render
2167 * Cache Flush is also necessary.
2168 *
2169 * The Gfx9 docs say to use a depth stall rather than a command streamer
2170 * stall. However, the hardware seems to violently disagree. A full
2171 * command streamer stall seems to be needed in both cases.
2172 */
2173 iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
2174 PIPE_CONTROL_CS_STALL |
2175 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2176 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2177
2178 iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
2179 reg.NPPMAFixEnable = enable;
2180 reg.NPEarlyZFailsDisable = enable;
2181 reg.NPPMAFixEnableMask = true;
2182 reg.NPEarlyZFailsDisableMask = true;
2183 }
2184
2185 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
2186 * Flush bits is often necessary. We do it regardless because it's easier.
2187 * The render cache flush is also necessary if stencil writes are enabled.
2188 *
2189 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
2190 * flushes seem to work just as well.
2191 */
2192 iris_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
2193 PIPE_CONTROL_DEPTH_STALL |
2194 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
2195 PIPE_CONTROL_RENDER_TARGET_FLUSH);
2196 #endif
2197 }
2198
2199 /**
2200 * Gallium CSO for rasterizer state.
2201 */
2202 struct iris_rasterizer_state {
2203 uint32_t sf[GENX(3DSTATE_SF_length)];
2204 uint32_t clip[GENX(3DSTATE_CLIP_length)];
2205 uint32_t raster[GENX(3DSTATE_RASTER_length)];
2206 uint32_t wm[GENX(3DSTATE_WM_length)];
2207 uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
2208
2209 uint8_t num_clip_plane_consts;
2210 bool clip_halfz; /* for CC_VIEWPORT */
2211 bool depth_clip_near; /* for CC_VIEWPORT */
2212 bool depth_clip_far; /* for CC_VIEWPORT */
2213 bool flatshade; /* for shader state */
2214 bool flatshade_first; /* for stream output */
2215 bool clamp_fragment_color; /* for shader state */
2216 bool light_twoside; /* for shader state */
2217 bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
2218 bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
2219 bool line_smooth;
2220 bool line_stipple_enable;
2221 bool poly_stipple_enable;
2222 bool multisample;
2223 bool force_persample_interp;
2224 bool conservative_rasterization;
2225 bool fill_mode_point;
2226 bool fill_mode_line;
2227 bool fill_mode_point_or_line;
2228 enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
2229 uint16_t sprite_coord_enable;
2230 };
2231
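/**
 * Compute the hardware LineWidth value from the Gallium rasterizer state,
 * applying GL's rounding rule for non-antialiased lines and falling back
 * to zero-width (cosmetic) lines for thin smoothed lines.
 */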
2232 static float
2233 get_line_width(const struct pipe_rasterizer_state *state)
2234 {
2235 float line_width = state->line_width;
2236
2237 /* From the OpenGL 4.4 spec:
2238 *
2239 * "The actual width of non-antialiased lines is determined by rounding
2240 * the supplied width to the nearest integer, then clamping it to the
2241 * implementation-dependent maximum non-antialiased line width."
2242 */
2243 if (!state->multisample && !state->line_smooth)
2244 line_width = roundf(state->line_width);
2245
2246 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
2247 /* For 1 pixel line thickness or less, the general anti-aliasing
2248 * algorithm gives up, and a garbage line is generated. Setting a
2249 * Line Width of 0.0 specifies the rasterization of the "thinnest"
2250 * (one-pixel-wide), non-antialiased lines.
2251 *
2252 * Lines rendered with zero Line Width are rasterized using the
2253 * "Grid Intersection Quantization" rules as specified by the
2254 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
2255 */
2256 line_width = 0.0f;
2257 }
2258
2259 return line_width;
2260 }
2261
2262 /**
2263 * The pipe->create_rasterizer_state() driver hook.
2264 */
2265 static void *
2266 iris_create_rasterizer_state(struct pipe_context *ctx,
2267 const struct pipe_rasterizer_state *state)
2268 {
2269 struct iris_rasterizer_state *cso =
2270 malloc(sizeof(struct iris_rasterizer_state));
2271
2272 cso->multisample = state->multisample;
2273 cso->force_persample_interp = state->force_persample_interp;
2274 cso->clip_halfz = state->clip_halfz;
2275 cso->depth_clip_near = state->depth_clip_near;
2276 cso->depth_clip_far = state->depth_clip_far;
2277 cso->flatshade = state->flatshade;
2278 cso->flatshade_first = state->flatshade_first;
2279 cso->clamp_fragment_color = state->clamp_fragment_color;
2280 cso->light_twoside = state->light_twoside;
2281 cso->rasterizer_discard = state->rasterizer_discard;
2282 cso->half_pixel_center = state->half_pixel_center;
2283 cso->sprite_coord_mode = state->sprite_coord_mode;
2284 cso->sprite_coord_enable = state->sprite_coord_enable;
2285 cso->line_smooth = state->line_smooth;
2286 cso->line_stipple_enable = state->line_stipple_enable;
2287 cso->poly_stipple_enable = state->poly_stipple_enable;
2288 cso->conservative_rasterization =
2289 state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
2290
2291 cso->fill_mode_point =
2292 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2293 state->fill_back == PIPE_POLYGON_MODE_POINT;
2294 cso->fill_mode_line =
2295 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2296 state->fill_back == PIPE_POLYGON_MODE_LINE;
2297 cso->fill_mode_point_or_line =
2298 cso->fill_mode_point ||
2299 cso->fill_mode_line;
2300
2301 if (state->clip_plane_enable != 0)
2302 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2303 else
2304 cso->num_clip_plane_consts = 0;
2305
2306 float line_width = get_line_width(state);
2307
2308 iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2309 sf.StatisticsEnable = true;
2310 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2311 sf.LineEndCapAntialiasingRegionWidth =
2312 state->line_smooth ? _10pixels : _05pixels;
2313 sf.LastPixelEnable = state->line_last_pixel;
2314 sf.LineWidth = line_width;
2315 sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
2316 !state->point_quad_rasterization;
2317 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2318 sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
2319
2320 if (state->flatshade_first) {
2321 sf.TriangleFanProvokingVertexSelect = 1;
2322 } else {
2323 sf.TriangleStripListProvokingVertexSelect = 2;
2324 sf.TriangleFanProvokingVertexSelect = 2;
2325 sf.LineStripListProvokingVertexSelect = 1;
2326 }
2327 }
2328
2329 iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2330 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2331 rr.CullMode = translate_cull_mode(state->cull_face);
2332 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2333 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2334 rr.DXMultisampleRasterizationEnable = state->multisample;
2335 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2336 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2337 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2338 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2339 rr.GlobalDepthOffsetScale = state->offset_scale;
2340 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2341 rr.SmoothPointEnable = state->point_smooth;
2342 rr.ScissorRectangleEnable = state->scissor;
2343 #if GFX_VER >= 9
2344 rr.ViewportZNearClipTestEnable = state->depth_clip_near;
2345 rr.ViewportZFarClipTestEnable = state->depth_clip_far;
2346 rr.ConservativeRasterizationEnable =
2347 cso->conservative_rasterization;
2348 #else
2349 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2350 #endif
2351 }
2352
2353 iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2354 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2355 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2356 */
2357 cl.EarlyCullEnable = true;
2358 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2359 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2360 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2361 cl.GuardbandClipTestEnable = true;
2362 cl.ClipEnable = true;
2363 cl.MinimumPointWidth = 0.125;
2364 cl.MaximumPointWidth = 255.875;
2365
2366 if (state->flatshade_first) {
2367 cl.TriangleFanProvokingVertexSelect = 1;
2368 } else {
2369 cl.TriangleStripListProvokingVertexSelect = 2;
2370 cl.TriangleFanProvokingVertexSelect = 2;
2371 cl.LineStripListProvokingVertexSelect = 1;
2372 }
2373 }
2374
2375 iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
2376 /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
2377 * filled in at draw time from the FS program.
2378 */
2379 wm.LineAntialiasingRegionWidth = _10pixels;
2380 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
2381 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
2382 wm.LineStippleEnable = state->line_stipple_enable;
2383 wm.PolygonStippleEnable = state->poly_stipple_enable;
2384 }
2385
2386 /* Remap from 0..255 back to 1..256 */
2387 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2388
2389 iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2390 if (state->line_stipple_enable) {
2391 line.LineStipplePattern = state->line_stipple_pattern;
2392 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2393 line.LineStippleRepeatCount = line_stipple_factor;
2394 }
2395 }
2396
2397 return cso;
2398 }
2399
2400 /**
2401 * The pipe->bind_rasterizer_state() driver hook.
2402 *
2403 * Bind a rasterizer CSO and flag related dirty bits.
2404 */
2405 static void
2406 iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2407 {
2408 struct iris_context *ice = (struct iris_context *) ctx;
2409 struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
2410 struct iris_rasterizer_state *new_cso = state;
2411
2412 if (new_cso) {
2413 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2414 if (cso_changed_memcmp(line_stipple))
2415 ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
2416
2417 if (cso_changed(half_pixel_center))
2418 ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
2419
2420 if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
2421 ice->state.dirty |= IRIS_DIRTY_WM;
2422
2423 if (cso_changed(rasterizer_discard))
2424 ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
2425
2426 if (cso_changed(flatshade_first))
2427 ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
2428
2429 if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
2430 cso_changed(clip_halfz))
2431 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
2432
2433 if (cso_changed(sprite_coord_enable) ||
2434 cso_changed(sprite_coord_mode) ||
2435 cso_changed(light_twoside))
2436 ice->state.dirty |= IRIS_DIRTY_SBE;
2437
2438 if (cso_changed(conservative_rasterization))
2439 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
2440 }
2441
2442 ice->state.cso_rast = new_cso;
2443 ice->state.dirty |= IRIS_DIRTY_RASTER;
2444 ice->state.dirty |= IRIS_DIRTY_CLIP;
2445 ice->state.stage_dirty |=
2446 ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
2447 }
2448
2449 /**
2450 * Return true if the given wrap mode requires the border color to exist.
2451 *
2452 * (We can skip uploading it if the sampler isn't going to use it.)
2453 */
2454 static bool
2455 wrap_mode_needs_border_color(unsigned wrap_mode)
2456 {
2457 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2458 }
2459
2460 /**
2461 * Gallium CSO for sampler state.
2462 */
2463 struct iris_sampler_state {
2464 union pipe_color_union border_color;
2465 bool needs_border_color;
2466
2467 uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
2468
2469 #if GFX_VERx10 == 125
2470 /* Sampler state structure to use for 3D textures in order to
2471 * implement Wa_14014414195.
2472 */
2473 uint32_t sampler_state_3d[GENX(SAMPLER_STATE_length)];
2474 #endif
2475 };
2476
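/**
 * Encode a pipe_sampler_state into the hardware SAMPLER_STATE dwords.
 *
 * Passing a max_anisotropy below 2 leaves anisotropic filtering disabled;
 * the border color pointer is merged in later, when the sampler table is
 * uploaded.
 */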
2477 static void
2478 fill_sampler_state(uint32_t *sampler_state,
2479 const struct pipe_sampler_state *state,
2480 unsigned max_anisotropy)
2481 {
2482 float min_lod = state->min_lod;
2483 unsigned mag_img_filter = state->mag_img_filter;
2484
2485 // XXX: explain this code ported from ilo...I don't get it at all...
2486 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2487 state->min_lod > 0.0f) {
2488 min_lod = 0.0f;
2489 mag_img_filter = state->min_img_filter;
2490 }
2491
2492 iris_pack_state(GENX(SAMPLER_STATE), sampler_state, samp) {
2493 samp.TCXAddressControlMode = translate_wrap(state->wrap_s);
2494 samp.TCYAddressControlMode = translate_wrap(state->wrap_t);
2495 samp.TCZAddressControlMode = translate_wrap(state->wrap_r);
2496 samp.CubeSurfaceControlMode = state->seamless_cube_map;
2497 samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2498 samp.MinModeFilter = state->min_img_filter;
2499 samp.MagModeFilter = mag_img_filter;
2500 samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2501 samp.MaximumAnisotropy = RATIO21;
2502
2503 if (max_anisotropy >= 2) {
2504 if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2505 #if GFX_VER >= 30
2506 samp.MinModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2507 #else
2508 samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2509 #endif
2510 samp.AnisotropicAlgorithm = EWAApproximation;
2511 }
2512
2513 if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR) {
2514 #if GFX_VER >= 30
2515 samp.MagModeFilter = MAPFILTER_ANISOTROPIC_FAST;
2516 #else
2517 samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2518 #endif
2519 }
2520
2521 samp.MaximumAnisotropy =
2522 MIN2((max_anisotropy - 2) / 2, RATIO161);
2523 }
2524
2525 /* Set address rounding bits if not using nearest filtering. */
2526 if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2527 samp.UAddressMinFilterRoundingEnable = true;
2528 samp.VAddressMinFilterRoundingEnable = true;
2529 samp.RAddressMinFilterRoundingEnable = true;
2530 }
2531
2532 if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2533 samp.UAddressMagFilterRoundingEnable = true;
2534 samp.VAddressMagFilterRoundingEnable = true;
2535 samp.RAddressMagFilterRoundingEnable = true;
2536 }
2537
2538 if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2539 samp.ShadowFunction = translate_shadow_func(state->compare_func);
2540
2541 const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2542
2543 samp.LODPreClampMode = CLAMP_MODE_OGL;
2544 samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2545 samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2546 samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2547
2548 /* .BorderColorPointer is filled in by iris_upload_sampler_states. */
2549 }
2550 }
2551
2552 /**
2553 * The pipe->create_sampler_state() driver hook.
2554 *
2555 * We fill out SAMPLER_STATE (except for the border color pointer), and
2556 * store that on the CPU. It doesn't make sense to upload it to a GPU
2557 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2558 * all bound sampler states to be in contiguous memory.
2559 */
2560 static void *
2561 iris_create_sampler_state(struct pipe_context *ctx,
2562 const struct pipe_sampler_state *state)
2563 {
2564 UNUSED struct iris_screen *screen = (void *)ctx->screen;
2565 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
2566 struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
2567
2568 if (!cso)
2569 return NULL;
2570
2571 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2572 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2573
2574 unsigned wrap_s = translate_wrap(state->wrap_s);
2575 unsigned wrap_t = translate_wrap(state->wrap_t);
2576 unsigned wrap_r = translate_wrap(state->wrap_r);
2577
2578 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2579
2580 cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2581 wrap_mode_needs_border_color(wrap_t) ||
2582 wrap_mode_needs_border_color(wrap_r);
2583
2584 fill_sampler_state(cso->sampler_state, state, state->max_anisotropy);
2585
2586 #if GFX_VERx10 == 125
2587 /* Fill an extra sampler state structure, with anisotropic filtering
2588 * disabled, which is used to implement Wa_14014414195.
2589 */
2590 if (intel_needs_workaround(screen->devinfo, 14014414195))
2591 fill_sampler_state(cso->sampler_state_3d, state, 0);
2592 #endif
2593
2594 return cso;
2595 }
2596
2597 /**
2598 * The pipe->bind_sampler_states() driver hook.
2599 */
2600 static void
2601 iris_bind_sampler_states(struct pipe_context *ctx,
2602 enum pipe_shader_type p_stage,
2603 unsigned start, unsigned count,
2604 void **states)
2605 {
2606 struct iris_context *ice = (struct iris_context *) ctx;
2607 gl_shader_stage stage = stage_from_pipe(p_stage);
2608 struct iris_shader_state *shs = &ice->state.shaders[stage];
2609
2610 assert(start + count <= IRIS_MAX_SAMPLERS);
2611
2612 bool dirty = false;
2613
2614 for (int i = 0; i < count; i++) {
2615 struct iris_sampler_state *state = states ? states[i] : NULL;
2616 if (shs->samplers[start + i] != state) {
2617 shs->samplers[start + i] = state;
2618 dirty = true;
2619 }
2620 }
2621
2622 if (dirty)
2623 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2624 }
2625
2626 /**
2627 * Upload the sampler states into a contiguous area of GPU memory, for
2628 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2629 *
2630 * Also fill out the border color state pointers.
2631 */
2632 static void
2633 iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
2634 {
2635 struct iris_screen *screen = (struct iris_screen *) ice->ctx.screen;
2636 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
2637 struct iris_shader_state *shs = &ice->state.shaders[stage];
2638 struct iris_border_color_pool *border_color_pool =
2639 iris_bufmgr_get_border_color_pool(screen->bufmgr);
2640
2641 /* We assume gallium frontends will call pipe->bind_sampler_states()
2642 * if the program's number of textures changes.
2643 */
2644 unsigned count = util_last_bit64(shader->bt.samplers_used_mask);
2645
2646 if (!count)
2647 return;
2648
2649 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2650 * in the dynamic state memory zone, so we can point to it via the
2651 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2652 */
2653 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2654 uint32_t *map =
2655 upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
2656 if (unlikely(!map))
2657 return;
2658
2659 struct pipe_resource *res = shs->sampler_table.res;
2660 struct iris_bo *bo = iris_resource_bo(res);
2661
2662 iris_record_state_size(ice->state.sizes,
2663 bo->address + shs->sampler_table.offset, size);
2664
2665 shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);
2666
2667 ice->state.need_border_colors &= ~(1 << stage);
2668
2669 for (int i = 0; i < count; i++) {
2670 struct iris_sampler_state *state = shs->samplers[i];
2671 struct iris_sampler_view *tex = shs->textures[i];
2672
2673 if (!state) {
2674 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2675 } else {
2676 const uint32_t *sampler_state = state->sampler_state;
2677
2678 #if GFX_VERx10 == 125
2679 if (intel_needs_workaround(screen->devinfo, 14014414195) &&
2680 tex && tex->res->base.b.target == PIPE_TEXTURE_3D) {
2681 sampler_state = state->sampler_state_3d;
2682 }
2683 #endif
2684
2685 if (!state->needs_border_color) {
2686 memcpy(map, sampler_state, 4 * GENX(SAMPLER_STATE_length));
2687 } else {
2688 ice->state.need_border_colors |= 1 << stage;
2689
2690 /* We may need to swizzle the border color for format faking.
2691 * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2692 * This means we need to move the border color's A channel into
2693 * the R or G channels so that those read swizzles will move it
2694 * back into A.
2695 */
2696 union pipe_color_union *color = &state->border_color;
2697 union pipe_color_union tmp;
2698 if (tex) {
2699 enum pipe_format internal_format = tex->res->internal_format;
2700
2701 if (util_format_is_alpha(internal_format)) {
2702 unsigned char swz[4] = {
2703 PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
2704 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2705 };
2706 util_format_apply_color_swizzle(&tmp, color, swz, true);
2707 color = &tmp;
2708 } else if (util_format_is_luminance_alpha(internal_format) &&
2709 internal_format != PIPE_FORMAT_L8A8_SRGB) {
2710 unsigned char swz[4] = {
2711 PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
2712 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
2713 };
2714 util_format_apply_color_swizzle(&tmp, color, swz, true);
2715 color = &tmp;
2716 }
2717 }
2718
2719 /* Stream out the border color and merge the pointer. */
2720 uint32_t offset = iris_upload_border_color(border_color_pool,
2721 color);
2722
2723 uint32_t dynamic[GENX(SAMPLER_STATE_length)];
2724 iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
2725 dyns.BorderColorPointer = offset;
2726 }
2727
2728 for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
2729 map[j] = sampler_state[j] | dynamic[j];
2730 }
2731 }
2732
2733 map += GENX(SAMPLER_STATE_length);
2734 }
2735 }
2736
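/**
 * Compose the view swizzle requested by the API with the format's own
 * channel swizzle, yielding the ISL channel select to program.
 */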
2737 static enum isl_channel_select
2738 fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2739 {
2740 switch (swz) {
2741 case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2742 case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2743 case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2744 case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2745 case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2746 case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2747 default: unreachable("invalid swizzle");
2748 }
2749 }
2750
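/**
 * Fill out SURFACE_STATE for a buffer resource, clamping the size to both
 * the underlying BO and the maximum texture buffer size for the format.
 */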
2751 static void
2752 fill_buffer_surface_state(struct isl_device *isl_dev,
2753 struct iris_resource *res,
2754 void *map,
2755 enum isl_format format,
2756 struct isl_swizzle swizzle,
2757 unsigned offset,
2758 unsigned size,
2759 isl_surf_usage_flags_t usage)
2760 {
2761 const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2762 const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2763
2764 /* The ARB_texture_buffer_specification says:
2765 *
2766 * "The number of texels in the buffer texture's texel array is given by
2767 *
2768 * floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2769 *
2770 * where <buffer_size> is the size of the buffer object, in basic
2771 * machine units and <components> and <base_type> are the element count
2772 * and base data type for elements, as specified in Table X.1. The
2773 * number of texels in the texel array is then clamped to the
2774 * implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2775 *
2776 * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2777 * so that when ISL divides by stride to obtain the number of texels, that
2778 * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2779 */
2780 unsigned final_size =
2781 MIN3(size, res->bo->size - res->offset - offset,
2782 IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
2783
2784 isl_buffer_fill_state(isl_dev, map,
2785 .address = res->bo->address + res->offset + offset,
2786 .size_B = final_size,
2787 .format = format,
2788 .swizzle = swizzle,
2789 .stride_B = cpp,
2790 .mocs = iris_mocs(res->bo, isl_dev, usage));
2791 }
2792
2793 #define SURFACE_STATE_ALIGNMENT 64
2794
2795 /**
2796 * Allocate several contiguous SURFACE_STATE structures, one for each
2797 * supported auxiliary surface mode.  This only allocates the CPU-side
2798 * copies; they will need to be uploaded later, after they're filled in.
2799 */
2800 static void
2801 alloc_surface_states(struct iris_surface_state *surf_state,
2802 unsigned aux_usages)
2803 {
2804 enum { surf_size = 4 * GENX(RENDER_SURFACE_STATE_length) };
2805
2806 /* If this changes, update this to explicitly align pointers */
2807 STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2808
2809 assert(aux_usages != 0);
2810
2811 /* In case we're re-allocating them... */
2812 free(surf_state->cpu);
2813
2814 surf_state->aux_usages = aux_usages;
2815 surf_state->num_states = util_bitcount(aux_usages);
2816 surf_state->cpu = calloc(surf_state->num_states, surf_size);
2817 surf_state->ref.offset = 0;
2818 pipe_resource_reference(&surf_state->ref.res, NULL);
2819
2820 assert(surf_state->cpu);
2821 }
2822
2823 /**
2824 * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2825 */
2826 static void
2827 upload_surface_states(struct u_upload_mgr *mgr,
2828 struct iris_surface_state *surf_state)
2829 {
2830 const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2831 const unsigned bytes = surf_state->num_states * surf_size;
2832
2833 void *map =
2834 upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2835
2836 surf_state->ref.offset +=
2837 iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2838
2839 if (map)
2840 memcpy(map, surf_state->cpu, bytes);
2841 }
2842
2843 /**
2844 * Update resource addresses in a set of SURFACE_STATE descriptors,
2845 * and re-upload them if necessary.
2846 */
2847 static bool
2848 update_surface_state_addrs(struct u_upload_mgr *mgr,
2849 struct iris_surface_state *surf_state,
2850 struct iris_bo *bo)
2851 {
2852 if (surf_state->bo_address == bo->address)
2853 return false;
2854
2855 STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2856 STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2857
2858 uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2859
2860 /* First, update the CPU copies. We assume no other fields exist in
2861 * the QWord containing Surface Base Address.
2862 */
2863 for (unsigned i = 0; i < surf_state->num_states; i++) {
2864 *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2865 ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2866 }
2867
2868 /* Next, upload the updated copies to a GPU buffer. */
2869 upload_surface_states(mgr, surf_state);
2870
2871 surf_state->bo_address = bo->address;
2872
2873 return true;
2874 }
2875
2876 /* This function should only be used to fill out a surf with information
2877 * provided by the pipe_(image|sampler)_view.  It is only necessary for the
2878 * CL extension cl_khr_image2d_from_buffer, which is why ISL_SURF_DIM_2D
2879 * is hardcoded in the dim field.
2880 */
2881 static void
2882 fill_surf_for_tex2d_from_buffer(struct isl_device *isl_dev,
2883 enum isl_format format,
2884 unsigned width,
2885 unsigned height,
2886 unsigned row_stride,
2887 isl_surf_usage_flags_t usage,
2888 struct isl_surf *surf)
2889 {
2890 const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2891 const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2892
2893 const struct isl_surf_init_info init_info = {
2894 .dim = ISL_SURF_DIM_2D,
2895 .format = format,
2896 .width = width,
2897 .height = height,
2898 .depth = 1,
2899 .levels = 1,
2900 .array_len = 1,
2901 .samples = 1,
2902 .min_alignment_B = 4,
2903 .row_pitch_B = row_stride * cpp,
2904 .usage = usage,
2905 .tiling_flags = ISL_TILING_LINEAR_BIT,
2906 };
2907
2908 const bool isl_surf_created_successfully =
2909 isl_surf_init_s(isl_dev, surf, &init_info);
2910
2911 assert(isl_surf_created_successfully);
2912 }
2913
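/**
 * Fill out a single SURFACE_STATE for an image resource, including the
 * auxiliary surface and clear color information when aux is in use.
 */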
2914 static void
2915 fill_surface_state(struct isl_device *isl_dev,
2916 void *map,
2917 struct iris_resource *res,
2918 struct isl_surf *surf,
2919 struct isl_view *view,
2920 unsigned aux_usage,
2921 uint32_t extra_main_offset,
2922 uint32_t tile_x_sa,
2923 uint32_t tile_y_sa)
2924 {
2925 struct isl_surf_fill_state_info f = {
2926 .surf = surf,
2927 .view = view,
2928 .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2929 .address = res->bo->address + res->offset + extra_main_offset,
2930 .x_offset_sa = tile_x_sa,
2931 .y_offset_sa = tile_y_sa,
2932 };
2933
2934 if (aux_usage != ISL_AUX_USAGE_NONE) {
2935 f.aux_surf = &res->aux.surf;
2936 f.aux_usage = aux_usage;
2937 f.clear_color = res->aux.clear_color;
2938
2939 if (aux_usage == ISL_AUX_USAGE_MC)
2940 f.mc_format = iris_format_for_usage(isl_dev->info,
2941 res->external_format,
2942 surf->usage).fmt;
2943
2944 if (res->aux.bo)
2945 f.aux_address = res->aux.bo->address + res->aux.offset;
2946
2947 if (res->aux.clear_color_bo) {
2948 f.clear_address = res->aux.clear_color_bo->address +
2949 res->aux.clear_color_offset;
2950 f.use_clear_address = isl_dev->info->ver > 9;
2951 }
2952 }
2953
2954 isl_surf_fill_state_s(isl_dev, map, &f);
2955 }
2956
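/**
 * Fill one SURFACE_STATE for each auxiliary usage mode supported by the
 * view, walking the CPU-side storage set up by alloc_surface_states().
 */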
2957 static void
2958 fill_surface_states(struct isl_device *isl_dev,
2959 struct iris_surface_state *surf_state,
2960 struct iris_resource *res,
2961 struct isl_surf *surf,
2962 struct isl_view *view,
2963 uint64_t extra_main_offset,
2964 uint32_t tile_x_sa,
2965 uint32_t tile_y_sa)
2966 {
2967 void *map = surf_state->cpu;
2968 unsigned aux_modes = surf_state->aux_usages;
2969
2970 while (aux_modes) {
2971 enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2972
2973 fill_surface_state(isl_dev, map, res, surf, view, aux_usage,
2974 extra_main_offset, tile_x_sa, tile_y_sa);
2975
2976 map += SURFACE_STATE_ALIGNMENT;
2977 }
2978 }
2979
2980 /**
2981 * The pipe->create_sampler_view() driver hook.
2982 */
2983 static struct pipe_sampler_view *
2984 iris_create_sampler_view(struct pipe_context *ctx,
2985 struct pipe_resource *tex,
2986 const struct pipe_sampler_view *tmpl)
2987 {
2988 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2989 const struct intel_device_info *devinfo = screen->devinfo;
2990 struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
2991
2992 if (!isv)
2993 return NULL;
2994
2995 /* initialize base object */
2996 isv->base = *tmpl;
2997 isv->base.context = ctx;
2998 isv->base.texture = NULL;
2999 pipe_reference_init(&isv->base.reference, 1);
3000 pipe_resource_reference(&isv->base.texture, tex);
3001
3002 if (util_format_is_depth_or_stencil(tmpl->format)) {
3003 struct iris_resource *zres, *sres;
3004 const struct util_format_description *desc =
3005 util_format_description(tmpl->format);
3006
3007 iris_get_depth_stencil_resources(tex, &zres, &sres);
3008
3009 tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
3010 }
3011
3012 isv->res = (struct iris_resource *) tex;
3013
3014 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
3015
3016 if (isv->base.target == PIPE_TEXTURE_CUBE ||
3017 isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
3018 usage |= ISL_SURF_USAGE_CUBE_BIT;
3019
3020 const struct iris_format_info fmt =
3021 iris_format_for_usage(devinfo, tmpl->format, usage);
3022
3023 isv->clear_color = isv->res->aux.clear_color;
3024
3025 isv->view = (struct isl_view) {
3026 .format = fmt.fmt,
3027 .swizzle = (struct isl_swizzle) {
3028 .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
3029 .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
3030 .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
3031 .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
3032 },
3033 .usage = usage,
3034 };
3035
3036 unsigned aux_usages = 0;
3037
3038 if ((isv->res->aux.usage == ISL_AUX_USAGE_CCS_D ||
3039 isv->res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3040 isv->res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3041 !isl_format_supports_ccs_e(devinfo, isv->view.format)) {
3042 aux_usages = 1 << ISL_AUX_USAGE_NONE;
3043 } else if (isl_aux_usage_has_hiz(isv->res->aux.usage) &&
3044 !iris_sample_with_depth_aux(devinfo, isv->res)) {
3045 aux_usages = 1 << ISL_AUX_USAGE_NONE;
3046 } else {
3047 aux_usages = 1 << ISL_AUX_USAGE_NONE |
3048 1 << isv->res->aux.usage;
3049 }
3050
3051 alloc_surface_states(&isv->surface_state, aux_usages);
3052 isv->surface_state.bo_address = isv->res->bo->address;
3053
3054 /* Fill out SURFACE_STATE for this view. */
3055 if (tmpl->target != PIPE_BUFFER) {
3056 isv->view.base_level = tmpl->u.tex.first_level;
3057 isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
3058
3059 if (tmpl->target == PIPE_TEXTURE_3D) {
3060 isv->view.base_array_layer = 0;
3061 isv->view.array_len = 1;
3062 } else {
3063 #if GFX_VER < 9
3064 /* Hardware older than Skylake ignores this value */
3065 assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
3066 #endif
3067 isv->view.base_array_layer = tmpl->u.tex.first_layer;
3068 isv->view.array_len =
3069 tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3070 }
3071
3072 fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3073 &isv->res->surf, &isv->view, 0, 0, 0);
3074 } else if (isv->base.is_tex2d_from_buf) {
3075 /* If this is a 2D image created from a buffer, use the
3076 * fill_surface_states() helper with the image parameters provided
3077 * by the CL application.
3078 */
3079 isv->view.base_array_layer = 0;
3080 isv->view.array_len = 1;
3081
3082 /* Create temp_surf and fill with values provided by CL application */
3083 struct isl_surf temp_surf;
3084 fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt.fmt,
3085 isv->base.u.tex2d_from_buf.width,
3086 isv->base.u.tex2d_from_buf.height,
3087 isv->base.u.tex2d_from_buf.row_stride,
3088 usage,
3089 &temp_surf);
3090
3091 fill_surface_states(&screen->isl_dev, &isv->surface_state, isv->res,
3092 &temp_surf, &isv->view, 0, 0, 0);
3093 } else {
3094 fill_buffer_surface_state(&screen->isl_dev, isv->res,
3095 isv->surface_state.cpu,
3096 isv->view.format, isv->view.swizzle,
3097 tmpl->u.buf.offset, tmpl->u.buf.size,
3098 ISL_SURF_USAGE_TEXTURE_BIT);
3099 }
3100
3101 return &isv->base;
3102 }
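/* Illustrative sketch (hypothetical values): a state tracker calling this
 * hook typically fills a pipe_sampler_view template for a 2D texture with
 * four miplevels like so:
 *
 *    struct pipe_sampler_view tmpl = {
 *       .format = PIPE_FORMAT_R8G8B8A8_UNORM,
 *       .target = PIPE_TEXTURE_2D,
 *       .u.tex.first_level = 0, .u.tex.last_level = 3,
 *       .u.tex.first_layer = 0, .u.tex.last_layer = 0,
 *       .swizzle_r = PIPE_SWIZZLE_X, .swizzle_g = PIPE_SWIZZLE_Y,
 *       .swizzle_b = PIPE_SWIZZLE_Z, .swizzle_a = PIPE_SWIZZLE_W,
 *    };
 *    struct pipe_sampler_view *view =
 *       ctx->create_sampler_view(ctx, tex, &tmpl);
 */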
3103
3104 static void
3105 iris_sampler_view_destroy(struct pipe_context *ctx,
3106 struct pipe_sampler_view *state)
3107 {
3108 struct iris_sampler_view *isv = (void *) state;
3109 pipe_resource_reference(&state->texture, NULL);
3110 pipe_resource_reference(&isv->surface_state.ref.res, NULL);
3111 free(isv->surface_state.cpu);
3112 free(isv);
3113 }
3114
3115 /**
3116 * The pipe->create_surface() driver hook.
3117 *
3118 * In Gallium nomenclature, "surfaces" are a view of a resource that
3119 * can be bound as a render target or depth/stencil buffer.
3120 */
3121 static struct pipe_surface *
3122 iris_create_surface(struct pipe_context *ctx,
3123 struct pipe_resource *tex,
3124 const struct pipe_surface *tmpl)
3125 {
3126 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3127 const struct intel_device_info *devinfo = screen->devinfo;
3128
3129 isl_surf_usage_flags_t usage = 0;
3130 if (tmpl->writable)
3131 usage = ISL_SURF_USAGE_STORAGE_BIT;
3132 else if (util_format_is_depth_or_stencil(tmpl->format))
3133 usage = ISL_SURF_USAGE_DEPTH_BIT;
3134 else
3135 usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
3136
3137 const struct iris_format_info fmt =
3138 iris_format_for_usage(devinfo, tmpl->format, usage);
3139
3140 if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
3141 !isl_format_supports_rendering(devinfo, fmt.fmt)) {
3142 /* Framebuffer validation will reject this invalid case, but it
3143 * hasn't had the opportunity yet. In the meantime, we need to
3144 * avoid hitting ISL asserts about unsupported formats below.
3145 */
3146 return NULL;
3147 }
3148
3149 struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
3150 struct iris_resource *res = (struct iris_resource *) tex;
3151
3152 if (!surf)
3153 return NULL;
3154
3155 uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
3156
3157 struct isl_view *view = &surf->view;
3158 *view = (struct isl_view) {
3159 .format = fmt.fmt,
3160 .base_level = tmpl->u.tex.level,
3161 .levels = 1,
3162 .base_array_layer = tmpl->u.tex.first_layer,
3163 .array_len = array_len,
3164 .swizzle = ISL_SWIZZLE_IDENTITY,
3165 .usage = usage,
3166 };
3167
3168 #if GFX_VER == 8
3169 struct isl_view *read_view = &surf->read_view;
3170 *read_view = (struct isl_view) {
3171 .format = fmt.fmt,
3172 .base_level = tmpl->u.tex.level,
3173 .levels = 1,
3174 .base_array_layer = tmpl->u.tex.first_layer,
3175 .array_len = array_len,
3176 .swizzle = ISL_SWIZZLE_IDENTITY,
3177 .usage = ISL_SURF_USAGE_TEXTURE_BIT,
3178 };
3179
3180 struct isl_surf read_surf = res->surf;
3181 uint64_t read_surf_offset_B = 0;
3182 uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
3183 if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
3184 /* The minimum array element field of the surface state structure is
3185 * ignored by the sampler unit for 3D textures on some hardware. If the
3186 * render buffer is a single slice of a 3D texture, create a 2D texture
3187 * covering that slice.
3188 *
3189 * TODO: This only handles the case where we're rendering to a single
3190 * slice of an array texture. If we have layered rendering combined
3191 * with non-coherent FB fetch and a non-zero base_array_layer, then
3192 * we're going to run into problems.
3193 *
3194 * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
3195 */
3196 isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
3197 read_view->base_level,
3198 0, read_view->base_array_layer,
3199 &read_surf, &read_surf_offset_B,
3200 &read_surf_tile_x_sa, &read_surf_tile_y_sa);
3201 read_view->base_level = 0;
3202 read_view->base_array_layer = 0;
3203 assert(read_view->array_len == 1);
3204 } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
3205 /* Convert 1D array textures to 2D arrays because shaders always provide
3206 * the array index coordinate at the Z component to avoid recompiles
3207 * when changing the texture target of the framebuffer.
3208 */
3209 assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
3210 read_surf.dim = ISL_SURF_DIM_2D;
3211 }
3212 #endif
3213
3214 struct isl_surf isl_surf = res->surf;
3215 uint64_t offset_B = 0;
3216 uint32_t tile_x_el = 0, tile_y_el = 0;
3217 if (isl_format_is_compressed(res->surf.format)) {
3218 /* The resource has a compressed format, which is not renderable, but we
3219 * have a renderable view format. We must be attempting to upload
3220 * blocks of compressed data via an uncompressed view.
3221 *
3222 * In this case, we can assume there are no auxiliary surfaces, a single
3223 * miplevel, and that the resource is single-sampled. Gallium may try
3224 * and create an uncompressed view with multiple layers, however.
3225 */
3226 assert(res->aux.surf.size_B == 0);
3227 assert(res->surf.samples == 1);
3228 assert(view->levels == 1);
3229
3230 bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev,
3231 &res->surf, view,
3232 &isl_surf, view, &offset_B,
3233 &tile_x_el, &tile_y_el);
3234
3235 /* On Broadwell, HALIGN and VALIGN are specified in pixels and are
3236 * hard-coded to align to exactly the block size of the compressed
3237 * texture. This means that, when reinterpreted as a non-compressed
3238 * texture, the tile offsets may be anything.
3239 *
3240 * We need them to be multiples of 4 to be usable in RENDER_SURFACE_STATE,
3241 * so force the state tracker to take fallback paths if they're not.
3242 */
3243 #if GFX_VER == 8
3244 if (tile_x_el % 4 != 0 || tile_y_el % 4 != 0) {
3245 ok = false;
3246 }
3247 #endif
3248
3249 if (!ok) {
3250 free(surf);
3251 return NULL;
3252 }
3253 }
3254
3255 surf->clear_color = res->aux.clear_color;
3256
3257 struct pipe_surface *psurf = &surf->base;
3258 pipe_reference_init(&psurf->reference, 1);
3259 pipe_resource_reference(&psurf->texture, tex);
3260 psurf->context = ctx;
3261 psurf->format = tmpl->format;
3262 psurf->width = isl_surf.logical_level0_px.width;
3263 psurf->height = isl_surf.logical_level0_px.height;
3264 psurf->texture = tex;
3265 psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
3266 psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
3267 psurf->u.tex.level = tmpl->u.tex.level;
3268
3269 /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
3270 if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
3271 ISL_SURF_USAGE_STENCIL_BIT))
3272 return psurf;
3273
3274 /* Fill out a SURFACE_STATE for each possible auxiliary surface mode and
3275 * return the pipe_surface.
3276 */
3277 unsigned aux_usages = 0;
3278
3279 if ((res->aux.usage == ISL_AUX_USAGE_CCS_E ||
3280 res->aux.usage == ISL_AUX_USAGE_FCV_CCS_E) &&
3281 !isl_format_supports_ccs_e(devinfo, view->format)) {
3282 aux_usages = 1 << ISL_AUX_USAGE_NONE;
3283 } else {
3284 aux_usages = 1 << ISL_AUX_USAGE_NONE |
3285 1 << res->aux.usage;
3286 }
3287
3288 alloc_surface_states(&surf->surface_state, aux_usages);
3289 surf->surface_state.bo_address = res->bo->address;
3290 fill_surface_states(&screen->isl_dev, &surf->surface_state, res,
3291 &isl_surf, view, offset_B, tile_x_el, tile_y_el);
3292
3293 #if GFX_VER == 8
3294 alloc_surface_states(&surf->surface_state_read, aux_usages);
3295 surf->surface_state_read.bo_address = res->bo->address;
3296 fill_surface_states(&screen->isl_dev, &surf->surface_state_read, res,
3297 &read_surf, read_view, read_surf_offset_B,
3298 read_surf_tile_x_sa, read_surf_tile_y_sa);
3299 #endif
3300
3301 return psurf;
3302 }
3303
3304 #if GFX_VER < 9
3305 static void
3306 fill_default_image_param(struct isl_image_param *param)
3307 {
3308 memset(param, 0, sizeof(*param));
3309 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3310 * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3311 * detailed explanation of these parameters.
3312 */
3313 param->swizzling[0] = 0xff;
3314 param->swizzling[1] = 0xff;
3315 }
3316
3317 static void
3318 fill_buffer_image_param(struct isl_image_param *param,
3319 enum pipe_format pfmt,
3320 unsigned size)
3321 {
3322 const unsigned cpp = util_format_get_blocksize(pfmt);
3323
3324 fill_default_image_param(param);
3325 param->size[0] = size / cpp;
3326 param->stride[0] = cpp;
3327 }
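/* Worked example (values assumed): for a buffer image with
 * pfmt = PIPE_FORMAT_R32G32B32A32_FLOAT (16 bytes per element) and
 * size = 1024 bytes, the code above yields param->size[0] = 64 texels
 * and param->stride[0] = 16 bytes.
 */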
3328 #else
3329 #define isl_surf_fill_image_param(x, ...)
3330 #define fill_default_image_param(x, ...)
3331 #define fill_buffer_image_param(x, ...)
3332 #endif
3333
3334 /**
3335 * The pipe->set_shader_images() driver hook.
3336 */
3337 static void
3338 iris_set_shader_images(struct pipe_context *ctx,
3339 enum pipe_shader_type p_stage,
3340 unsigned start_slot, unsigned count,
3341 unsigned unbind_num_trailing_slots,
3342 const struct pipe_image_view *p_images)
3343 {
3344 struct iris_context *ice = (struct iris_context *) ctx;
3345 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3346 gl_shader_stage stage = stage_from_pipe(p_stage);
3347 struct iris_shader_state *shs = &ice->state.shaders[stage];
3348 #if GFX_VER == 8
3349 struct iris_genx_state *genx = ice->state.genx;
3350 struct isl_image_param *image_params = genx->shaders[stage].image_param;
3351 #endif
3352
3353 shs->bound_image_views &=
3354 ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3355
3356 for (unsigned i = 0; i < count; i++) {
3357 struct iris_image_view *iv = &shs->image[start_slot + i];
3358
3359 if (p_images && p_images[i].resource) {
3360 const struct pipe_image_view *img = &p_images[i];
3361 struct iris_resource *res = (void *) img->resource;
3362
3363 util_copy_image_view(&iv->base, img);
3364
3365 shs->bound_image_views |= BITFIELD64_BIT(start_slot + i);
3366
3367 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3368 res->bind_stages |= 1 << stage;
3369
3370 enum isl_format isl_fmt = iris_image_view_get_format(ice, img);
3371
3372 unsigned aux_usages = 1 << ISL_AUX_USAGE_NONE;
3373
3374 /* Gfx12+ supports render compression for images */
3375 if (GFX_VER >= 12 && isl_aux_usage_has_ccs_e(res->aux.usage))
3376 aux_usages |= 1 << ISL_AUX_USAGE_CCS_E;
3377
3378 alloc_surface_states(&iv->surface_state, aux_usages);
3379 iv->surface_state.bo_address = res->bo->address;
3380
3381 if (res->base.b.target != PIPE_BUFFER) {
3382 struct isl_view view = {
3383 .format = isl_fmt,
3384 .base_level = img->u.tex.level,
3385 .levels = 1,
3386 .base_array_layer = img->u.tex.first_layer,
3387 .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3388 .swizzle = ISL_SWIZZLE_IDENTITY,
3389 .usage = ISL_SURF_USAGE_STORAGE_BIT,
3390 };
3391
3392 /* If we're using the untyped (RAW format) fallback, use a buffer surface. */
3393 if (isl_fmt == ISL_FORMAT_RAW) {
3394 fill_buffer_surface_state(&screen->isl_dev, res,
3395 iv->surface_state.cpu,
3396 isl_fmt, ISL_SWIZZLE_IDENTITY,
3397 0, res->bo->size,
3398 ISL_SURF_USAGE_STORAGE_BIT);
3399 } else {
3400 fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3401 &res->surf, &view, 0, 0, 0);
3402 }
3403
3404 isl_surf_fill_image_param(&screen->isl_dev,
3405 &image_params[start_slot + i],
3406 &res->surf, &view);
3407 } else if (img->access & PIPE_IMAGE_ACCESS_TEX2D_FROM_BUFFER) {
3408 /* If this is a 2D image created from a buffer, use the
3409 * fill_surface_states() helper with the image parameters provided
3410 * by the CL application.
3411 */
3412 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3413 struct isl_view view = {
3414 .format = isl_fmt,
3415 .base_level = 0,
3416 .levels = 1,
3417 .base_array_layer = 0,
3418 .array_len = 1,
3419 .swizzle = ISL_SWIZZLE_IDENTITY,
3420 .usage = usage,
3421 };
3422
3423 /* Create temp_surf and fill with values provided by CL application */
3424 struct isl_surf temp_surf;
3425 enum isl_format fmt = iris_image_view_get_format(ice, img);
3426 fill_surf_for_tex2d_from_buffer(&screen->isl_dev, fmt,
3427 img->u.tex2d_from_buf.width,
3428 img->u.tex2d_from_buf.height,
3429 img->u.tex2d_from_buf.row_stride,
3430 usage,
3431 &temp_surf);
3432
3433 fill_surface_states(&screen->isl_dev, &iv->surface_state, res,
3434 &temp_surf, &view, 0, 0, 0);
3435 isl_surf_fill_image_param(&screen->isl_dev,
3436 &image_params[start_slot + i],
3437 &temp_surf, &view);
3438 } else {
3439 util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3440 img->u.buf.offset + img->u.buf.size);
3441
3442 fill_buffer_surface_state(&screen->isl_dev, res,
3443 iv->surface_state.cpu,
3444 isl_fmt, ISL_SWIZZLE_IDENTITY,
3445 img->u.buf.offset, img->u.buf.size,
3446 ISL_SURF_USAGE_STORAGE_BIT);
3447 fill_buffer_image_param(&image_params[start_slot + i],
3448 img->format, img->u.buf.size);
3449 }
3450
3451 upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
3452 } else {
3453 pipe_resource_reference(&iv->base.resource, NULL);
3454 pipe_resource_reference(&iv->surface_state.ref.res, NULL);
3455 fill_default_image_param(&image_params[start_slot + i]);
3456 }
3457 }
3458
3459 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3460 ice->state.dirty |=
3461 stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3462 : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3463
3464 /* Broadwell also needs isl_image_params re-uploaded */
3465 if (GFX_VER < 9) {
3466 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3467 shs->sysvals_need_upload = true;
3468 }
3469
3470 if (unbind_num_trailing_slots) {
3471 iris_set_shader_images(ctx, p_stage, start_slot + count,
3472 unbind_num_trailing_slots, 0, NULL);
3473 }
3474 }
3475
3476 UNUSED static bool
3477 is_sampler_view_3d(const struct iris_sampler_view *view)
3478 {
3479 return view && view->res->base.b.target == PIPE_TEXTURE_3D;
3480 }
3481
3482 /**
3483 * The pipe->set_sampler_views() driver hook.
3484 */
3485 static void
3486 iris_set_sampler_views(struct pipe_context *ctx,
3487 enum pipe_shader_type p_stage,
3488 unsigned start, unsigned count,
3489 unsigned unbind_num_trailing_slots,
3490 bool take_ownership,
3491 struct pipe_sampler_view **views)
3492 {
3493 struct iris_context *ice = (struct iris_context *) ctx;
3494 UNUSED struct iris_screen *screen = (void *) ctx->screen;
3495 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
3496 gl_shader_stage stage = stage_from_pipe(p_stage);
3497 struct iris_shader_state *shs = &ice->state.shaders[stage];
3498 unsigned i;
3499
3500 if (count == 0 && unbind_num_trailing_slots == 0)
3501 return;
3502
3503 BITSET_CLEAR_RANGE(shs->bound_sampler_views, start,
3504 start + count + unbind_num_trailing_slots - 1);
3505
3506 for (i = 0; i < count; i++) {
3507 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3508 struct iris_sampler_view *view = (void *) pview;
3509
3510 #if GFX_VERx10 == 125
3511 if (intel_needs_workaround(screen->devinfo, 14014414195)) {
3512 if (is_sampler_view_3d(shs->textures[start + i]) !=
3513 is_sampler_view_3d(view))
3514 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3515 }
3516 #endif
3517
3518 if (take_ownership) {
3519 pipe_sampler_view_reference((struct pipe_sampler_view **)
3520 &shs->textures[start + i], NULL);
3521 shs->textures[start + i] = (struct iris_sampler_view *)pview;
3522 } else {
3523 pipe_sampler_view_reference((struct pipe_sampler_view **)
3524 &shs->textures[start + i], pview);
3525 }
3526 if (view) {
3527 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3528 view->res->bind_stages |= 1 << stage;
3529
3530 BITSET_SET(shs->bound_sampler_views, start + i);
3531
3532 update_surface_state_addrs(ice->state.surface_uploader,
3533 &view->surface_state, view->res->bo);
3534 }
3535 }
3536 for (; i < count + unbind_num_trailing_slots; i++) {
3537 pipe_sampler_view_reference((struct pipe_sampler_view **)
3538 &shs->textures[start + i], NULL);
3539 }
3540
3541 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
3542 ice->state.dirty |=
3543 stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3544 : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3545 }
3546
3547 static void
3548 iris_set_compute_resources(struct pipe_context *ctx,
3549 unsigned start, unsigned count,
3550 struct pipe_surface **resources)
3551 {
3552 assert(count == 0);
3553 }
3554
3555 static void
3556 iris_set_global_binding(struct pipe_context *ctx,
3557 unsigned start_slot, unsigned count,
3558 struct pipe_resource **resources,
3559 uint32_t **handles)
3560 {
3561 struct iris_context *ice = (struct iris_context *) ctx;
3562
3563 assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
3564 for (unsigned i = 0; i < count; i++) {
3565 if (resources && resources[i]) {
3566 pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3567 resources[i]);
3568
3569 struct iris_resource *res = (void *) resources[i];
3570 assert(res->base.b.target == PIPE_BUFFER);
3571 util_range_add(&res->base.b, &res->valid_buffer_range,
3572 0, res->base.b.width0);
3573
3574 uint64_t addr = 0;
3575 memcpy(&addr, handles[i], sizeof(addr));
3576 addr += res->bo->address + res->offset;
3577 memcpy(handles[i], &addr, sizeof(addr));
3578 } else {
3579 pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
3580 NULL);
3581 }
3582 }
3583
3584 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
3585 }
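/* Worked example of the handle patching above (addresses assumed): if the
 * CL runtime stored a relative offset of 0x100 in *handles[i], and the
 * buffer lives at GPU address 0x800000000000 with res->offset = 0, the
 * loop rewrites *handles[i] to 0x800000000100, i.e. an absolute GPU
 * address the kernel can dereference directly.
 */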
3586
3587 /**
3588 * The pipe->set_tess_state() driver hook.
3589 */
3590 static void
3591 iris_set_tess_state(struct pipe_context *ctx,
3592 const float default_outer_level[4],
3593 const float default_inner_level[2])
3594 {
3595 struct iris_context *ice = (struct iris_context *) ctx;
3596 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3597
3598 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3599 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3600
3601 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
3602 shs->sysvals_need_upload = true;
3603 }
3604
3605 static void
3606 iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3607 {
3608 struct iris_context *ice = (struct iris_context *) ctx;
3609
3610 ice->state.patch_vertices = patch_vertices;
3611 }
3612
3613 static void
3614 iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3615 {
3616 struct iris_surface *surf = (void *) p_surf;
3617 pipe_resource_reference(&p_surf->texture, NULL);
3618 pipe_resource_reference(&surf->surface_state.ref.res, NULL);
3619 pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
3620 free(surf->surface_state.cpu);
3621 free(surf->surface_state_read.cpu);
3622 free(surf);
3623 }
3624
3625 static void
3626 iris_set_clip_state(struct pipe_context *ctx,
3627 const struct pipe_clip_state *state)
3628 {
3629 struct iris_context *ice = (struct iris_context *) ctx;
3630 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3631 struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3632 struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3633
3634 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3635
3636 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
3637 IRIS_STAGE_DIRTY_CONSTANTS_GS |
3638 IRIS_STAGE_DIRTY_CONSTANTS_TES;
3639 shs->sysvals_need_upload = true;
3640 gshs->sysvals_need_upload = true;
3641 tshs->sysvals_need_upload = true;
3642 }
3643
3644 /**
3645 * The pipe->set_polygon_stipple() driver hook.
3646 */
3647 static void
3648 iris_set_polygon_stipple(struct pipe_context *ctx,
3649 const struct pipe_poly_stipple *state)
3650 {
3651 struct iris_context *ice = (struct iris_context *) ctx;
3652 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3653 ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
3654 }
3655
3656 /**
3657 * The pipe->set_sample_mask() driver hook.
3658 */
3659 static void
3660 iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3661 {
3662 struct iris_context *ice = (struct iris_context *) ctx;
3663
3664 /* We only support 16x MSAA, so we have 16 bits of sample mask.
3665 * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3666 */
3667 ice->state.sample_mask = sample_mask & 0xffff;
3668 ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
3669 }
3670
3671 /**
3672 * The pipe->set_scissor_states() driver hook.
3673 *
3674 * This corresponds to our SCISSOR_RECT state structures. It's an
3675 * exact match, so we just store them, and memcpy them out later.
3676 */
3677 static void
3678 iris_set_scissor_states(struct pipe_context *ctx,
3679 unsigned start_slot,
3680 unsigned num_scissors,
3681 const struct pipe_scissor_state *rects)
3682 {
3683 struct iris_context *ice = (struct iris_context *) ctx;
3684
3685 for (unsigned i = 0; i < num_scissors; i++) {
3686 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3687 /* If the scissor was out of bounds and got clamped to 0 width/height
3688 * at the bounds, the subtraction of 1 from maximums could produce a
3689 * negative number and thus not clip anything. Instead, just provide
3690 * a min > max scissor inside the bounds, which produces the expected
3691 * no rendering.
3692 */
3693 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3694 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3695 };
3696 } else {
3697 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3698 .minx = rects[i].minx, .miny = rects[i].miny,
3699 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3700 };
3701 }
3702 }
3703
3704 ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3705 }
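/* Worked example (values assumed): a well-formed rect of
 * { minx = 10, miny = 20, maxx = 110, maxy = 220 } is stored as the
 * inclusive rect { 10, 20, 109, 219 }, while a zero-area rect such as
 * { minx = 5, maxx = 5, miny = 5, maxy = 40 } takes the degenerate path
 * and becomes the "draw nothing" rect { 1, 0, 1, 0 }.
 */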
3706
3707 /**
3708 * The pipe->set_stencil_ref() driver hook.
3709 *
3710 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3711 */
3712 static void
3713 iris_set_stencil_ref(struct pipe_context *ctx,
3714 const struct pipe_stencil_ref state)
3715 {
3716 struct iris_context *ice = (struct iris_context *) ctx;
3717 memcpy(&ice->state.stencil_ref, &state, sizeof(state));
3718 if (GFX_VER >= 12)
3719 ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3720 else if (GFX_VER >= 9)
3721 ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3722 else
3723 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3724 }
3725
3726 static float
3727 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3728 {
3729 return copysignf(state->scale[axis], sign) + state->translate[axis];
3730 }
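/* Worked example (values assumed): for a viewport of glViewport(x, y, w, h),
 * the state tracker provides scale[0] = w / 2 and translate[0] = x + w / 2,
 * so viewport_extent(state, 0, -1.0f) = x and
 * viewport_extent(state, 0,  1.0f) = x + w, i.e. the horizontal extent of
 * the viewport, which the draw-time code can use when computing guardbands.
 */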
3731
3732 /**
3733 * The pipe->set_viewport_states() driver hook.
3734 *
3735 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3736 * the guardband yet, as we need the framebuffer dimensions, but we can
3737 * at least fill out the rest.
3738 */
3739 static void
3740 iris_set_viewport_states(struct pipe_context *ctx,
3741 unsigned start_slot,
3742 unsigned count,
3743 const struct pipe_viewport_state *states)
3744 {
3745 struct iris_context *ice = (struct iris_context *) ctx;
3746 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3747
3748 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3749
3750 /* Fix depth test misrenderings by lowering translated depth range */
3751 if (screen->driconf.lower_depth_range_rate != 1.0f)
3752 ice->state.viewports[start_slot].translate[2] *=
3753 screen->driconf.lower_depth_range_rate;
3754
3755 ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3756
3757 if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3758 !ice->state.cso_rast->depth_clip_far))
3759 ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3760 }
3761
3762 /**
3763 * The pipe->set_framebuffer_state() driver hook.
3764 *
3765 * Sets the current draw FBO, including color render targets, depth,
3766 * and stencil buffers.
3767 */
3768 static void
3769 iris_set_framebuffer_state(struct pipe_context *ctx,
3770 const struct pipe_framebuffer_state *state)
3771 {
3772 struct iris_context *ice = (struct iris_context *) ctx;
3773 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3774 const struct intel_device_info *devinfo = screen->devinfo;
3775 struct isl_device *isl_dev = &screen->isl_dev;
3776 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3777 struct iris_resource *zres;
3778 struct iris_resource *stencil_res;
3779 struct iris_resource *new_res = NULL;
3780 struct pipe_box new_render_area;
3781
3782 unsigned samples = util_framebuffer_get_num_samples(state);
3783 unsigned layers = util_framebuffer_get_num_layers(state);
3784
3785 /* multiview not supported */
3786 assert(!state->viewmask);
3787
3788 if (cso->samples != samples) {
3789 ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
3790
3791 /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
3792 if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
3793 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
3794
3795 /* We may need to emit blend state for Wa_14018912822. */
3796 if ((cso->samples > 1) != (samples > 1) &&
3797 intel_needs_workaround(devinfo, 14018912822)) {
3798 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3799 ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
3800 }
3801 }
3802
3803 if (cso->nr_cbufs != state->nr_cbufs) {
3804 ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
3805 }
3806
3807 if ((cso->layers == 0) != (layers == 0)) {
3808 ice->state.dirty |= IRIS_DIRTY_CLIP;
3809 }
3810
3811 if (state->nr_cbufs > 0 && state->cbufs[0])
3812 new_res = (struct iris_resource *)state->cbufs[0]->texture;
3813
3814 if (new_res && new_res->use_damage) {
3815 new_render_area = new_res->damage;
3816 } else {
3817 new_render_area.x = 0;
3818 new_render_area.y = 0;
3819 new_render_area.z = 0;
3820 new_render_area.width = state->width;
3821 new_render_area.height = state->height;
3822 new_render_area.depth = 0;
3823 }
3824
3825 if (memcmp(&ice->state.render_area, &new_render_area, sizeof(new_render_area))) {
3826 ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3827 ice->state.render_area = new_render_area;
3828 }
3829
3830 if (cso->zsbuf || state->zsbuf) {
3831 ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
3832 }
3833
3834 bool has_integer_rt = false;
3835 for (unsigned i = 0; i < state->nr_cbufs; i++) {
3836 if (state->cbufs[i]) {
3837 enum isl_format ifmt =
3838 isl_format_for_pipe_format(state->cbufs[i]->format);
3839 has_integer_rt |= isl_format_has_int_channel(ifmt);
3840 }
3841 }
3842
3843 /* 3DSTATE_RASTER::AntialiasingEnable */
3844 if (has_integer_rt != ice->state.has_integer_rt ||
3845 cso->samples != samples) {
3846 ice->state.dirty |= IRIS_DIRTY_RASTER;
3847 }
3848
3849 util_copy_framebuffer_state(cso, state);
3850 cso->samples = samples;
3851 cso->layers = layers;
3852
3853 ice->state.has_integer_rt = has_integer_rt;
3854
3855 struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
3856
3857 struct isl_view view = {
3858 .base_level = 0,
3859 .levels = 1,
3860 .base_array_layer = 0,
3861 .array_len = 1,
3862 .swizzle = ISL_SWIZZLE_IDENTITY,
3863 };
3864
3865 struct isl_depth_stencil_hiz_emit_info info = {
3866 .view = &view,
3867 .mocs = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_DEPTH_BIT),
3868 };
3869
3870 if (cso->zsbuf) {
3871 iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
3872 &stencil_res);
3873
3874 view.base_level = cso->zsbuf->u.tex.level;
3875 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
3876 view.array_len =
3877 cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
3878
3879 if (zres) {
3880 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
3881
3882 info.depth_surf = &zres->surf;
3883 info.depth_address = zres->bo->address + zres->offset;
3884 info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);
3885
3886 view.format = zres->surf.format;
3887
3888 if (iris_resource_level_has_hiz(devinfo, zres, view.base_level)) {
3889 info.hiz_usage = zres->aux.usage;
3890 info.hiz_surf = &zres->aux.surf;
3891 info.hiz_address = zres->aux.bo->address + zres->aux.offset;
3892 }
3893
3894 ice->state.hiz_usage = info.hiz_usage;
3895 }
3896
3897 if (stencil_res) {
3898 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
3899 info.stencil_aux_usage = stencil_res->aux.usage;
3900 info.stencil_surf = &stencil_res->surf;
3901 info.stencil_address = stencil_res->bo->address + stencil_res->offset;
3902 if (!zres) {
3903 view.format = stencil_res->surf.format;
3904 info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
3905 }
3906 }
3907 }
3908
3909 isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);
3910
3911 /* Make a null surface for unbound buffers */
3912 void *null_surf_map =
3913 upload_state(ice->state.surface_uploader, &ice->state.null_fb,
3914 4 * GENX(RENDER_SURFACE_STATE_length), 64);
3915 isl_null_fill_state(&screen->isl_dev, null_surf_map,
3916 .size = isl_extent3d(MAX2(cso->width, 1),
3917 MAX2(cso->height, 1),
3918 cso->layers ? cso->layers : 1));
3919 ice->state.null_fb.offset +=
3920 iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));
3921
3922 /* Render target change */
3923 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;
3924
3925 ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
3926
3927 ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3928
3929 ice->state.stage_dirty |=
3930 ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];
3931
3932 if (GFX_VER == 8)
3933 ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
3934 }
3935
3936 /**
3937 * The pipe->set_constant_buffer() driver hook.
3938 *
3939 * This uploads any constant data in user buffers, and references
3940 * any UBO resources containing constant data.
3941 */
3942 static void
3943 iris_set_constant_buffer(struct pipe_context *ctx,
3944 enum pipe_shader_type p_stage, unsigned index,
3945 bool take_ownership,
3946 const struct pipe_constant_buffer *input)
3947 {
3948 struct iris_context *ice = (struct iris_context *) ctx;
3949 gl_shader_stage stage = stage_from_pipe(p_stage);
3950 struct iris_shader_state *shs = &ice->state.shaders[stage];
3951 struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3952
3953 /* TODO: Only do this if the buffer changes? */
3954 pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3955
3956 if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3957 shs->bound_cbufs |= 1u << index;
3958
3959 if (input->user_buffer) {
3960 void *map = NULL;
3961 pipe_resource_reference(&cbuf->buffer, NULL);
3962 u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3963 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3964
3965 if (!cbuf->buffer) {
3966 /* Allocation was unsuccessful - just unbind */
3967 iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3968 return;
3969 }
3970
3971 assert(map);
3972 memcpy(map, input->user_buffer, input->buffer_size);
3973 } else if (input->buffer) {
3974 if (cbuf->buffer != input->buffer) {
3975 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3976 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3977 shs->dirty_cbufs |= 1u << index;
3978 }
3979
3980 if (take_ownership) {
3981 pipe_resource_reference(&cbuf->buffer, NULL);
3982 cbuf->buffer = input->buffer;
3983 } else {
3984 pipe_resource_reference(&cbuf->buffer, input->buffer);
3985 }
3986
3987 cbuf->buffer_offset = input->buffer_offset;
3988 }
3989
3990 cbuf->buffer_size =
3991 MIN2(input->buffer_size,
3992 iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3993
3994 struct iris_resource *res = (void *) cbuf->buffer;
3995 res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3996 res->bind_stages |= 1 << stage;
3997 } else {
3998 shs->bound_cbufs &= ~(1u << index);
3999 pipe_resource_reference(&cbuf->buffer, NULL);
4000 }
4001
4002 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
4003 }
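/* Illustrative sketch (hypothetical values): binding a small user-memory
 * UBO to the fragment stage from the state tracker side; because
 * input->user_buffer is set, the hook above copies the data into an
 * upload buffer.
 *
 *    static const float color[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
 *    struct pipe_constant_buffer cb = {
 *       .user_buffer = color,
 *       .buffer_size = sizeof(color),
 *    };
 *    ctx->set_constant_buffer(ctx, PIPE_SHADER_FRAGMENT, 0, false, &cb);
 */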
4004
4005 static void
4006 upload_sysvals(struct iris_context *ice,
4007 gl_shader_stage stage,
4008 const struct pipe_grid_info *grid)
4009 {
4010 UNUSED struct iris_genx_state *genx = ice->state.genx;
4011 struct iris_shader_state *shs = &ice->state.shaders[stage];
4012
4013 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
4014 if (!shader || (shader->num_system_values == 0 &&
4015 shader->kernel_input_size == 0))
4016 return;
4017
4018 assert(shader->num_cbufs > 0);
4019
4020 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
4021 struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
4022 unsigned system_values_start =
4023 ALIGN(shader->kernel_input_size, sizeof(uint32_t));
4024 unsigned upload_size = system_values_start +
4025 shader->num_system_values * sizeof(uint32_t);
4026 void *map = NULL;
4027
4028 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
4029 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
4030 &cbuf->buffer_offset, &cbuf->buffer, &map);
4031
4032 if (shader->kernel_input_size > 0)
4033 memcpy(map, grid->input, shader->kernel_input_size);
4034
4035 uint32_t *sysval_map = map + system_values_start;
4036 for (int i = 0; i < shader->num_system_values; i++) {
4037 uint32_t sysval = shader->system_values[i];
4038 uint32_t value = 0;
4039
4040 #if GFX_VER >= 9
4041 #define COMPILER(x) BRW_##x
4042 #else
4043 #define COMPILER(x) ELK_##x
4044 #endif
4045
4046 if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
4047 #if GFX_VER == 8
4048 unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
4049 unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
4050 struct isl_image_param *param =
4051 &genx->shaders[stage].image_param[img];
4052
4053 assert(offset < sizeof(struct isl_image_param));
4054 value = ((uint32_t *) param)[offset];
4055 #endif
4056 } else if (sysval == COMPILER(PARAM_BUILTIN_ZERO)) {
4057 value = 0;
4058 } else if (COMPILER(PARAM_BUILTIN_IS_CLIP_PLANE(sysval))) {
4059 int plane = COMPILER(PARAM_BUILTIN_CLIP_PLANE_IDX(sysval));
4060 int comp = COMPILER(PARAM_BUILTIN_CLIP_PLANE_COMP(sysval));
4061 value = fui(ice->state.clip_planes.ucp[plane][comp]);
4062 } else if (sysval == COMPILER(PARAM_BUILTIN_PATCH_VERTICES_IN)) {
4063 if (stage == MESA_SHADER_TESS_CTRL) {
4064 value = ice->state.vertices_per_patch;
4065 } else {
4066 assert(stage == MESA_SHADER_TESS_EVAL);
4067 const struct shader_info *tcs_info =
4068 iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
4069 if (tcs_info)
4070 value = tcs_info->tess.tcs_vertices_out;
4071 else
4072 value = ice->state.vertices_per_patch;
4073 }
4074 } else if (sysval >= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X) &&
4075 sysval <= COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_W)) {
4076 unsigned i = sysval - COMPILER(PARAM_BUILTIN_TESS_LEVEL_OUTER_X);
4077 value = fui(ice->state.default_outer_level[i]);
4078 } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_X)) {
4079 value = fui(ice->state.default_inner_level[0]);
4080 } else if (sysval == COMPILER(PARAM_BUILTIN_TESS_LEVEL_INNER_Y)) {
4081 value = fui(ice->state.default_inner_level[1]);
4082 } else if (sysval >= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X) &&
4083 sysval <= COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_Z)) {
4084 unsigned i = sysval - COMPILER(PARAM_BUILTIN_WORK_GROUP_SIZE_X);
4085 value = ice->state.last_block[i];
4086 } else if (sysval == COMPILER(PARAM_BUILTIN_WORK_DIM)) {
4087 value = grid->work_dim;
4088 } else {
4089 assert(!"unhandled system value");
4090 }
4091
4092 *sysval_map++ = value;
4093 }
4094
4095 cbuf->buffer_size = upload_size;
4096 iris_upload_ubo_ssbo_surf_state(ice, cbuf,
4097 &shs->constbuf_surf_state[sysval_cbuf_index],
4098 ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
4099
4100 shs->sysvals_need_upload = false;
4101 }
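/* Layout sketch of the constant buffer built above (sizes assumed for
 * illustration): kernel inputs, if any, come first, then one uint32_t per
 * system value starting at the 4-byte-aligned system_values_start offset:
 *
 *    [0, kernel_input_size)         raw CL kernel inputs (compute only)
 *    [system_values_start, ...)     shader->system_values[] entries, one
 *                                   uint32_t each, in declaration order
 */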
4102
4103 /**
4104 * The pipe->set_shader_buffers() driver hook.
4105 *
4106 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
4107 * SURFACE_STATE here, as the buffer offset may change each time.
4108 */
4109 static void
4110 iris_set_shader_buffers(struct pipe_context *ctx,
4111 enum pipe_shader_type p_stage,
4112 unsigned start_slot, unsigned count,
4113 const struct pipe_shader_buffer *buffers,
4114 unsigned writable_bitmask)
4115 {
4116 struct iris_context *ice = (struct iris_context *) ctx;
4117 gl_shader_stage stage = stage_from_pipe(p_stage);
4118 struct iris_shader_state *shs = &ice->state.shaders[stage];
4119
4120 unsigned modified_bits = u_bit_consecutive(start_slot, count);
4121
4122 shs->bound_ssbos &= ~modified_bits;
4123 shs->writable_ssbos &= ~modified_bits;
4124 shs->writable_ssbos |= writable_bitmask << start_slot;
4125
4126 for (unsigned i = 0; i < count; i++) {
4127 if (buffers && buffers[i].buffer) {
4128 struct iris_resource *res = (void *) buffers[i].buffer;
4129 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
4130 struct iris_state_ref *surf_state =
4131 &shs->ssbo_surf_state[start_slot + i];
4132 pipe_resource_reference(&ssbo->buffer, &res->base.b);
4133 ssbo->buffer_offset = buffers[i].buffer_offset;
4134 ssbo->buffer_size =
4135 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
4136
4137 shs->bound_ssbos |= 1 << (start_slot + i);
4138
4139 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
4140
4141 iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
4142
4143 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
4144 res->bind_stages |= 1 << stage;
4145
4146 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
4147 ssbo->buffer_offset + ssbo->buffer_size);
4148 } else {
4149 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
4150 pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
4151 NULL);
4152 }
4153 }
4154
4155 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
4156 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
4157 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
4158 }
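/* Illustrative sketch (hypothetical values): binding one writable SSBO at
 * slot 0 of the compute stage; bit 0 of writable_bitmask marks it writable.
 *
 *    struct pipe_shader_buffer buf = {
 *       .buffer = res,            // a PIPE_BUFFER resource
 *       .buffer_offset = 0,
 *       .buffer_size = 4096,
 *    };
 *    ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &buf, 0x1);
 */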
4159
4160 static void
4161 iris_delete_state(struct pipe_context *ctx, void *state)
4162 {
4163 free(state);
4164 }
4165
4166 /**
4167 * The pipe->set_vertex_buffers() driver hook.
4168 *
4169 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
4170 */
4171 static void
4172 iris_set_vertex_buffers(struct pipe_context *ctx,
4173 unsigned count,
4174 const struct pipe_vertex_buffer *buffers)
4175 {
4176 struct iris_context *ice = (struct iris_context *) ctx;
4177 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4178 struct iris_genx_state *genx = ice->state.genx;
4179
4180 unsigned last_count = util_last_bit64(ice->state.bound_vertex_buffers);
4181 ice->state.bound_vertex_buffers = 0;
4182
4183 for (unsigned i = 0; i < count; i++) {
4184 const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
4185 struct iris_vertex_buffer_state *state =
4186 &genx->vertex_buffers[i];
4187
4188 if (!buffer) {
4189 pipe_resource_reference(&state->resource, NULL);
4190 continue;
4191 }
4192
4193 /* We may see user buffers that are NULL bindings. */
4194 assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
4195
4196 if (buffer->buffer.resource &&
4197 state->resource != buffer->buffer.resource)
4198 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
4199
4200 pipe_resource_reference(&state->resource, NULL);
4201 state->resource = buffer->buffer.resource;
4202
4203 struct iris_resource *res = (void *) state->resource;
4204
4205 state->offset = (int) buffer->buffer_offset;
4206
4207 if (res) {
4208 ice->state.bound_vertex_buffers |= 1ull << i;
4209 res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4210 }
4211
4212 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
4213 vb.VertexBufferIndex = i;
4214 vb.AddressModifyEnable = true;
4215 /* vb.BufferPitch is merged in dynamically from VE state later */
4216 if (res) {
4217 vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
4218 vb.BufferStartingAddress =
4219 ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
4220 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4221 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4222 #if GFX_VER >= 12
4223 vb.L3BypassDisable = true;
4224 #endif
4225 } else {
4226 vb.NullVertexBuffer = true;
4227 vb.MOCS = iris_mocs(NULL, &screen->isl_dev,
4228 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
4229 }
4230 }
4231 }
4232
4233 for (unsigned i = count; i < last_count; i++) {
4234 struct iris_vertex_buffer_state *state =
4235 &genx->vertex_buffers[i];
4236
4237 pipe_resource_reference(&state->resource, NULL);
4238 }
4239
4240 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4241 }
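/* Illustrative sketch (hypothetical values): binding a single vertex buffer
 * at slot 0; note that in this version of the Gallium interface the stride
 * lives in the vertex elements (src_stride), not in pipe_vertex_buffer.
 *
 *    struct pipe_vertex_buffer vb = {
 *       .buffer_offset = 0,
 *       .buffer.resource = vbo,   // a PIPE_BUFFER resource
 *    };
 *    ctx->set_vertex_buffers(ctx, 1, &vb);
 */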
4242
4243 /**
4244 * Gallium CSO for vertex elements.
4245 */
4246 struct iris_vertex_element_state {
4247 uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
4248 uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
4249 uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
4250 uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
4251 uint32_t stride[PIPE_MAX_ATTRIBS];
4252 unsigned vb_count;
4253 unsigned count;
4254 };
4255
4256 /**
4257 * The pipe->create_vertex_elements_state() driver hook.
4258 *
4259 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
4260 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
4261 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
4262 * needed. Otherwise, we need information that is only available at draw
4263 * time. We set up edgeflag_ve and edgeflag_vfi as alternative last
4264 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING entries that can be
4265 * used at draw time if we detect that EdgeFlag is needed by the Vertex Shader.
4266 */
4267 static void *
4268 iris_create_vertex_elements(struct pipe_context *ctx,
4269 unsigned count,
4270 const struct pipe_vertex_element *state)
4271 {
4272 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4273 const struct intel_device_info *devinfo = screen->devinfo;
4274 struct iris_vertex_element_state *cso =
4275 calloc(1, sizeof(struct iris_vertex_element_state));
4276
4277 cso->count = count;
4278 cso->vb_count = 0;
4279
4280 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
4281 ve.DWordLength =
4282 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
4283 }
4284
4285 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
4286 uint32_t *vfi_pack_dest = cso->vf_instancing;
4287
4288 if (count == 0) {
4289 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4290 ve.Valid = true;
4291 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
4292 ve.Component0Control = VFCOMP_STORE_0;
4293 ve.Component1Control = VFCOMP_STORE_0;
4294 ve.Component2Control = VFCOMP_STORE_0;
4295 ve.Component3Control = VFCOMP_STORE_1_FP;
4296 }
4297
4298 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4299 }
4300 }
4301
4302 for (int i = 0; i < count; i++) {
4303 const struct iris_format_info fmt =
4304 iris_format_for_usage(devinfo, state[i].src_format, 0);
4305 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
4306 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
4307
4308 switch (isl_format_get_num_channels(fmt.fmt)) {
4309 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
4310 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
4311 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
4312 case 3:
4313 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
4314 : VFCOMP_STORE_1_FP;
4315 break;
4316 }
4317 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
4318 ve.EdgeFlagEnable = false;
4319 ve.VertexBufferIndex = state[i].vertex_buffer_index;
4320 ve.Valid = true;
4321 ve.SourceElementOffset = state[i].src_offset;
4322 ve.SourceElementFormat = fmt.fmt;
4323 ve.Component0Control = comp[0];
4324 ve.Component1Control = comp[1];
4325 ve.Component2Control = comp[2];
4326 ve.Component3Control = comp[3];
4327 }
4328
4329 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
4330 vi.VertexElementIndex = i;
4331 vi.InstancingEnable = state[i].instance_divisor > 0;
4332 vi.InstanceDataStepRate = state[i].instance_divisor;
4333 }
4334
4335 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
4336 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
4337 cso->stride[state[i].vertex_buffer_index] = state[i].src_stride;
4338 cso->vb_count = MAX2(state[i].vertex_buffer_index + 1, cso->vb_count);
4339 }
4340
4341 /* An alternative version of the last VE and VFI is stored so it
4342 * can be used at draw time in case the Vertex Shader uses EdgeFlag.
4343 */
4344 if (count) {
4345 const unsigned edgeflag_index = count - 1;
4346 const struct iris_format_info fmt =
4347 iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
4348 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
4349 ve.EdgeFlagEnable = true;
4350 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
4351 ve.Valid = true;
4352 ve.SourceElementOffset = state[edgeflag_index].src_offset;
4353 ve.SourceElementFormat = fmt.fmt;
4354 ve.Component0Control = VFCOMP_STORE_SRC;
4355 ve.Component1Control = VFCOMP_STORE_0;
4356 ve.Component2Control = VFCOMP_STORE_0;
4357 ve.Component3Control = VFCOMP_STORE_0;
4358 }
4359 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
4360 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
4361 * at draw time, as it should change if SGVs are emitted.
4362 */
4363 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
4364 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
4365 }
4366 }
4367
4368 return cso;
4369 }
4370
4371 /**
4372 * The pipe->bind_vertex_elements_state() driver hook.
4373 */
4374 static void
4375 iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
4376 {
4377 struct iris_context *ice = (struct iris_context *) ctx;
4378 struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
4379 struct iris_vertex_element_state *new_cso = state;
4380
4381 /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
4382 * we need to re-emit it to ensure we're overriding the right one.
4383 */
4384 if (new_cso && cso_changed(count))
4385 ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
4386
4387 ice->state.cso_vertex_elements = state;
4388 ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
4389 if (new_cso) {
4390 /* re-emit vertex buffer state if stride changes */
4391 if (cso_changed(vb_count) ||
4392 cso_changed_memcmp_elts(stride, new_cso->vb_count))
4393 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
4394 }
4395 }
4396
4397 /**
4398 * The pipe->create_stream_output_target() driver hook.
4399 *
4400 * "Target" here refers to a destination buffer. We translate this into
4401 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4402 * know which buffer this represents, or whether we ought to zero the
4403 * write-offsets, or append. Those are handled in the set() hook.
4404 */
4405 static struct pipe_stream_output_target *
4406 iris_create_stream_output_target(struct pipe_context *ctx,
4407 struct pipe_resource *p_res,
4408 unsigned buffer_offset,
4409 unsigned buffer_size)
4410 {
4411 struct iris_resource *res = (void *) p_res;
4412 struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
4413 if (!cso)
4414 return NULL;
4415
4416 res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4417
4418 pipe_reference_init(&cso->base.reference, 1);
4419 pipe_resource_reference(&cso->base.buffer, p_res);
4420 cso->base.buffer_offset = buffer_offset;
4421 cso->base.buffer_size = buffer_size;
4422 cso->base.context = ctx;
4423
4424 util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4425 buffer_offset + buffer_size);
4426
4427 return &cso->base;
4428 }
4429
4430 static void
4431 iris_stream_output_target_destroy(struct pipe_context *ctx,
4432 struct pipe_stream_output_target *state)
4433 {
4434 struct iris_stream_output_target *cso = (void *) state;
4435
4436 pipe_resource_reference(&cso->base.buffer, NULL);
4437 pipe_resource_reference(&cso->offset.res, NULL);
4438
4439 free(cso);
4440 }
4441
4442 /**
4443 * The pipe->set_stream_output_targets() driver hook.
4444 *
4445 * At this point, we know which targets are bound to a particular index,
4446 * and also whether we want to append or start over. We can finish the
4447 * 3DSTATE_SO_BUFFER packets we started earlier.
4448 */
4449 static void
4450 iris_set_stream_output_targets(struct pipe_context *ctx,
4451 unsigned num_targets,
4452 struct pipe_stream_output_target **targets,
4453 const unsigned *offsets,
4454 enum mesa_prim output_prim)
4455 {
4456 struct iris_context *ice = (struct iris_context *) ctx;
4457 struct iris_genx_state *genx = ice->state.genx;
4458 uint32_t *so_buffers = genx->so_buffers;
4459 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
4460
4461 const bool active = num_targets > 0;
4462 if (ice->state.streamout_active != active) {
4463 ice->state.streamout_active = active;
4464 ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
4465
4466 /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4467 * it's a non-pipelined command. If we're switching streamout on, we
4468 * may have missed emitting it earlier, so do so now. (We're already
4469 * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4470 */
4471 if (active) {
4472 ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
4473 } else {
4474 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4475 struct iris_stream_output_target *tgt =
4476 (void *) ice->state.so_target[i];
4477
4478 if (tgt)
4479 iris_dirty_for_history(ice, (void *)tgt->base.buffer);
4480 }
4481 }
4482 }
4483
4484 for (int i = 0; i < 4; i++) {
4485 pipe_so_target_reference(&ice->state.so_target[i],
4486 i < num_targets ? targets[i] : NULL);
4487 }
4488
4489 /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4490 if (!active)
4491 return;
4492
4493 for (unsigned i = 0; i < 4; i++,
4494 so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
4495
4496 struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
4497 unsigned offset = offsets[i];
4498
4499 if (!tgt) {
4500 iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4501 #if GFX_VER < 12
4502 sob.SOBufferIndex = i;
4503 #else
4504 sob._3DCommandOpcode = 0;
4505 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4506 #endif
4507 sob.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
4508 }
4509 continue;
4510 }
4511
4512 if (!tgt->offset.res)
4513 upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
4514
4515 struct iris_resource *res = (void *) tgt->base.buffer;
4516
4517 /* Note that offsets[i] will either be 0, causing us to zero
4518 * the value in the buffer, or 0xFFFFFFFF, which happens to mean
4519 * "continue appending at the existing offset."
4520 */
4521 assert(offset == 0 || offset == 0xFFFFFFFF);
4522
4523 /* When we're first called with an offset of 0, we want the next
4524 * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
4525 * Any further times we emit those packets, we want to use 0xFFFFFFFF
4526 * to continue appending from the current offset.
4527 *
4528 * Note that we might be called by Begin (offset = 0), Pause, then
4529 * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
4530 * commands will actually be sent to the GPU). In this case, we
4531 * don't want to append - we still want to do our initial zeroing.
4532 */
4533 if (offset == 0)
4534 tgt->zero_offset = true;
4535
4536 iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
4537 #if GFX_VER < 12
4538 sob.SOBufferIndex = i;
4539 #else
4540 sob._3DCommandOpcode = 0;
4541 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
4542 #endif
4543 sob.SurfaceBaseAddress =
4544 rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
4545 IRIS_DOMAIN_OTHER_WRITE);
4546 sob.SOBufferEnable = true;
4547 sob.StreamOffsetWriteEnable = true;
4548 sob.StreamOutputBufferOffsetAddressEnable = true;
4549 sob.MOCS = iris_mocs(res->bo, &screen->isl_dev,
4550 ISL_SURF_USAGE_STREAM_OUT_BIT);
4551
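/* SurfaceSize is expressed in DWords, minus one: e.g. a 64-byte buffer
 * programs 64 / 4 - 1 = 15.
 */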
4552 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
4553 sob.StreamOutputBufferOffsetAddress =
4554 rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
4555 tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
4556 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
4557 }
4558 }
4559
4560 ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
4561 }
4562
4563 /**
4564 * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4565 * 3DSTATE_STREAMOUT packets.
4566 *
4567 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4568 * hardware to record. We can create it entirely based on the shader, with
4569 * no dynamic state dependencies.
4570 *
4571 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4572 * state-based settings. We capture the shader-related ones here, and merge
4573 * the rest in at draw time.
4574 */
4575 static uint32_t *
4576 iris_create_so_decl_list(const struct pipe_stream_output_info *info,
4577 const struct intel_vue_map *vue_map)
4578 {
4579 struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4580 int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4581 int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4582 int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4583 int max_decls = 0;
4584 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4585
4586 memset(so_decl, 0, sizeof(so_decl));
4587
4588 /* Construct the list of SO_DECLs to be emitted. The formatting of the
4589 * command feels strange -- each dword pair contains a SO_DECL per stream.
4590 */
4591 for (unsigned i = 0; i < info->num_outputs; i++) {
4592 const struct pipe_stream_output *output = &info->output[i];
4593 const int buffer = output->output_buffer;
4594 const int varying = output->register_index;
4595 const unsigned stream_id = output->stream;
4596 assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4597
4598 buffer_mask[stream_id] |= 1 << buffer;
4599
4600 assert(vue_map->varying_to_slot[varying] >= 0);
4601
4602 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4603 * array. Instead, it simply increments DstOffset for the following
4604 * input by the number of components that should be skipped.
4605 *
4606 * Our hardware is unusual in that it requires us to program SO_DECLs
4607 * for fake "hole" components, rather than simply taking the offset
4608 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
4609 * program as many size = 4 holes as we can, then a final hole to
4610 * accommodate the final 1, 2, or 3 remaining.
4611 */
4612 int skip_components = output->dst_offset - next_offset[buffer];
4613
4614 while (skip_components > 0) {
4615 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4616 .HoleFlag = 1,
4617 .OutputBufferSlot = output->output_buffer,
4618 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4619 };
4620 skip_components -= 4;
4621 }
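/* Illustrative example: if a varying's DstOffset is 7 while next_offset for
 * its buffer is still 0, skip_components starts at 7, so the loop above
 * emits one hole with ComponentMask 0xf (4 components) and a second hole
 * with ComponentMask 0x7 for the remaining 3.
 */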
4622
4623 next_offset[buffer] = output->dst_offset + output->num_components;
4624
4625 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4626 .OutputBufferSlot = output->output_buffer,
4627 .RegisterIndex = vue_map->varying_to_slot[varying],
4628 .ComponentMask =
4629 ((1 << output->num_components) - 1) << output->start_component,
4630 };
4631
4632 if (decls[stream_id] > max_decls)
4633 max_decls = decls[stream_id];
4634 }
4635
4636 unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4637 uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4638 uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
4639
4640 iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4641 int urb_entry_read_offset = 0;
4642 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4643 urb_entry_read_offset;
4644
4645 /* We always read the whole vertex. This could be reduced at some
4646 * point by reading less and offsetting the register index in the
4647 * SO_DECLs.
4648 */
4649 sol.Stream0VertexReadOffset = urb_entry_read_offset;
4650 sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4651 sol.Stream1VertexReadOffset = urb_entry_read_offset;
4652 sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4653 sol.Stream2VertexReadOffset = urb_entry_read_offset;
4654 sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4655 sol.Stream3VertexReadOffset = urb_entry_read_offset;
4656 sol.Stream3VertexReadLength = urb_entry_read_length - 1;
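/* For example, a VUE map with 9 slots reads (9 + 1) / 2 = 5 slot pairs,
 * programmed as 4 since the *ReadLength fields hold length minus one.
 */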
4657
4658 /* Set buffer pitches; 0 means unbound. */
4659 sol.Buffer0SurfacePitch = 4 * info->stride[0];
4660 sol.Buffer1SurfacePitch = 4 * info->stride[1];
4661 sol.Buffer2SurfacePitch = 4 * info->stride[2];
4662 sol.Buffer3SurfacePitch = 4 * info->stride[3];
4663 }
4664
4665 iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4666 list.DWordLength = 3 + 2 * max_decls - 2;
4667 list.StreamtoBufferSelects0 = buffer_mask[0];
4668 list.StreamtoBufferSelects1 = buffer_mask[1];
4669 list.StreamtoBufferSelects2 = buffer_mask[2];
4670 list.StreamtoBufferSelects3 = buffer_mask[3];
4671 list.NumEntries0 = decls[0];
4672 list.NumEntries1 = decls[1];
4673 list.NumEntries2 = decls[2];
4674 list.NumEntries3 = decls[3];
4675 }
4676
4677 for (int i = 0; i < max_decls; i++) {
4678 iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4679 entry.Stream0Decl = so_decl[0][i];
4680 entry.Stream1Decl = so_decl[1][i];
4681 entry.Stream2Decl = so_decl[2][i];
4682 entry.Stream3Decl = so_decl[3][i];
4683 }
4684 }
4685
4686 return map;
4687 }
4688
4689 static inline int
4690 iris_compute_first_urb_slot_required(uint64_t inputs_read,
4691 const struct intel_vue_map *prev_stage_vue_map)
4692 {
4693 #if GFX_VER >= 9
4694 return brw_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4695 #else
4696 return elk_compute_first_urb_slot_required(inputs_read, prev_stage_vue_map);
4697 #endif
4698 }
4699
4700 static void
4701 iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
4702 const struct intel_vue_map *last_vue_map,
4703 bool two_sided_color,
4704 unsigned *out_offset,
4705 unsigned *out_length)
4706 {
4707 /* The compiler computes the first URB slot without considering COL/BFC
4708 * swizzling (because it doesn't know whether it's enabled), so we need
4709 * to do that here too. This may result in a smaller offset, which
4710 * should be safe.
4711 */
4712 const unsigned first_slot =
4713 iris_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
4714
4715 /* This becomes the URB read offset (counted in pairs of slots). */
4716 assert(first_slot % 2 == 0);
4717 *out_offset = first_slot / 2;
4718
4719 /* We need to adjust the inputs read to account for front/back color
4720 * swizzling, as it can make the URB length longer.
4721 */
4722 for (int c = 0; c <= 1; c++) {
4723 if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
4724 /* If two sided color is enabled, the fragment shader's gl_Color
4725 * (COL0) input comes from either the gl_FrontColor (COL0) or
4726 * gl_BackColor (BFC0) input varyings. Mark BFC as used, too.
4727 */
4728 if (two_sided_color)
4729 fs_input_slots |= (VARYING_BIT_BFC0 << c);
4730
4731 /* If front color isn't written, we opt to give them back color
4732 * instead of an undefined value. Switch from COL to BFC.
4733 */
4734 if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
4735 fs_input_slots &= ~(VARYING_BIT_COL0 << c);
4736 fs_input_slots |= (VARYING_BIT_BFC0 << c);
4737 }
4738 }
4739 }
4740
4741 /* Compute the minimum URB Read Length necessary for the FS inputs.
4742 *
4743 * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4744 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4745 *
4746 * "This field should be set to the minimum length required to read the
4747 * maximum source attribute. The maximum source attribute is indicated
4748 * by the maximum value of the enabled Attribute # Source Attribute if
4749 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4750 * enable is not set.
4751 * read_length = ceiling((max_source_attr + 1) / 2)
4752 *
4753 * [errata] Corruption/Hang possible if length programmed larger than
4754 * recommended"
4755 *
4756 * Similar text exists for Ivy Bridge.
4757 *
4758 * We find the last URB slot that's actually read by the FS.
4759 */
4760 unsigned last_read_slot = last_vue_map->num_slots - 1;
4761 while (last_read_slot > first_slot && !(fs_input_slots &
4762 (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4763 --last_read_slot;
4764
4765 /* The URB read length is the difference of the two, counted in pairs. */
4766 *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
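/* Worked example: first_slot = 2 and last_read_slot = 7 give an offset of
 * 1 slot pair and a length of DIV_ROUND_UP(7 - 2 + 1, 2) = 3 pairs.
 */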
4767 }
4768
4769 static void
4770 iris_emit_sbe_swiz(struct iris_batch *batch,
4771 const struct iris_context *ice,
4772 const struct intel_vue_map *vue_map,
4773 unsigned urb_read_offset,
4774 unsigned sprite_coord_enables)
4775 {
4776 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4777 const struct iris_fs_data *fs_data =
4778 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4779 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4780
4781 /* XXX: this should be generated when putting programs in place */
4782
4783 for (uint8_t idx = 0; idx < fs_data->urb_setup_attribs_count; idx++) {
4784 const uint8_t fs_attr = fs_data->urb_setup_attribs[idx];
4785 const int input_index = fs_data->urb_setup[fs_attr];
4786 if (input_index < 0 || input_index >= 16)
4787 continue;
4788
4789 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4790 &attr_overrides[input_index];
4791 int slot = vue_map->varying_to_slot[fs_attr];
4792
4793 /* Viewport and Layer are stored in the VUE header. We need to override
4794 * them to zero if earlier stages didn't write them, as GL requires that
4795 * they read back as zero when not explicitly set.
4796 */
4797 switch (fs_attr) {
4798 case VARYING_SLOT_VIEWPORT:
4799 case VARYING_SLOT_LAYER:
4800 attr->ComponentOverrideX = true;
4801 attr->ComponentOverrideW = true;
4802 attr->ConstantSource = CONST_0000;
4803
4804 if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4805 attr->ComponentOverrideY = true;
4806 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4807 attr->ComponentOverrideZ = true;
4808 continue;
4809
4810 default:
4811 break;
4812 }
4813
4814 if (sprite_coord_enables & (1 << input_index))
4815 continue;
4816
4817 /* If there was only a back color written but not front, use back
4818 * as the color instead of undefined.
4819 */
4820 if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4821 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4822 if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4823 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4824
4825 /* Not written by the previous stage - undefined. */
4826 if (slot == -1) {
4827 attr->ComponentOverrideX = true;
4828 attr->ComponentOverrideY = true;
4829 attr->ComponentOverrideZ = true;
4830 attr->ComponentOverrideW = true;
4831 attr->ConstantSource = CONST_0001_FLOAT;
4832 continue;
4833 }
4834
4835 /* Compute the location of the attribute relative to the read offset,
4836 * which is counted in 256-bit increments (two 128-bit VUE slots).
4837 */
4838 const int source_attr = slot - 2 * urb_read_offset;
4839 assert(source_attr >= 0 && source_attr <= 32);
4840 attr->SourceAttribute = source_attr;
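/* For example, with urb_read_offset = 1, a varying in VUE slot 5 becomes
 * SourceAttribute 5 - 2 * 1 = 3.
 */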
4841
4842 /* If we are doing two-sided color, and the VUE slot following this one
4843 * represents a back-facing color, then we need to instruct the SF unit
4844 * to do back-facing swizzling.
4845 */
4846 if (cso_rast->light_twoside &&
4847 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4848 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4849 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4850 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4851 attr->SwizzleSelect = INPUTATTR_FACING;
4852 }
4853
4854 iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4855 for (int i = 0; i < 16; i++)
4856 sbes.Attribute[i] = attr_overrides[i];
4857 }
4858 }
4859
4860 static bool
4861 iris_is_drawing_points(const struct iris_context *ice)
4862 {
4863 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4864
4865 if (cso_rast->fill_mode_point) {
4866 return true;
4867 }
4868
4869 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4870 const struct iris_gs_data *gs_data =
4871 iris_gs_data(ice->shaders.prog[MESA_SHADER_GEOMETRY]);
4872 return gs_data->output_topology == _3DPRIM_POINTLIST;
4873 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4874 const struct iris_tes_data *tes_data =
4875 iris_tes_data(ice->shaders.prog[MESA_SHADER_TESS_EVAL]);
4876 return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4877 } else {
4878 return ice->state.prim_mode == MESA_PRIM_POINTS;
4879 }
4880 }
4881
4882 static unsigned
4883 iris_calculate_point_sprite_overrides(const struct iris_fs_data *fs_data,
4884 const struct iris_rasterizer_state *cso)
4885 {
4886 unsigned overrides = 0;
4887
4888 if (fs_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4889 overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_PNTC];
4890
4891 for (int i = 0; i < 8; i++) {
4892 if ((cso->sprite_coord_enable & (1 << i)) &&
4893 fs_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4894 overrides |= 1 << fs_data->urb_setup[VARYING_SLOT_TEX0 + i];
4895 }
4896
4897 return overrides;
4898 }
4899
4900 static void
4901 iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4902 {
4903 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4904 const struct iris_fs_data *fs_data =
4905 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
4906 const struct intel_vue_map *last_vue_map =
4907 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
4908
4909 unsigned urb_read_offset, urb_read_length;
4910 iris_compute_sbe_urb_read_interval(fs_data->inputs,
4911 last_vue_map,
4912 cso_rast->light_twoside,
4913 &urb_read_offset, &urb_read_length);
4914
4915 unsigned sprite_coord_overrides =
4916 iris_is_drawing_points(ice) ?
4917 iris_calculate_point_sprite_overrides(fs_data, cso_rast) : 0;
4918
4919 iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4920 sbe.AttributeSwizzleEnable = true;
4921 sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
4922 sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4923 sbe.VertexURBEntryReadOffset = urb_read_offset;
4924 sbe.VertexURBEntryReadLength = urb_read_length;
4925 sbe.ForceVertexURBEntryReadOffset = true;
4926 sbe.ForceVertexURBEntryReadLength = true;
4927 sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
4928 sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4929 #if GFX_VER >= 9
4930 for (int i = 0; i < 32; i++) {
4931 sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4932 }
4933 #endif
4934
4935 /* Ask the hardware to supply PrimitiveID if the fragment shader
4936 * reads it but a previous stage didn't write one.
4937 */
4938 if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
4939 last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
4940 sbe.PrimitiveIDOverrideAttributeSelect =
4941 fs_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
4942 sbe.PrimitiveIDOverrideComponentX = true;
4943 sbe.PrimitiveIDOverrideComponentY = true;
4944 sbe.PrimitiveIDOverrideComponentZ = true;
4945 sbe.PrimitiveIDOverrideComponentW = true;
4946 }
4947 }
4948
4949 iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4950 sprite_coord_overrides);
4951 }
4952
4953 /* ------------------------------------------------------------------- */
4954
4955 /**
4956 * Populate VS program key fields based on the current state.
4957 */
4958 static void
4959 iris_populate_vs_key(const struct iris_context *ice,
4960 const struct shader_info *info,
4961 gl_shader_stage last_stage,
4962 struct iris_vs_prog_key *key)
4963 {
4964 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4965
4966 if (info->clip_distance_array_size == 0 &&
4967 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4968 last_stage == MESA_SHADER_VERTEX)
4969 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4970 }
4971
4972 /**
4973 * Populate TCS program key fields based on the current state.
4974 */
4975 static void
4976 iris_populate_tcs_key(const struct iris_context *ice,
4977 struct iris_tcs_prog_key *key)
4978 {
4979 }
4980
4981 /**
4982 * Populate TES program key fields based on the current state.
4983 */
4984 static void
4985 iris_populate_tes_key(const struct iris_context *ice,
4986 const struct shader_info *info,
4987 gl_shader_stage last_stage,
4988 struct iris_tes_prog_key *key)
4989 {
4990 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4991
4992 if (info->clip_distance_array_size == 0 &&
4993 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4994 last_stage == MESA_SHADER_TESS_EVAL)
4995 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4996 }
4997
4998 /**
4999 * Populate GS program key fields based on the current state.
5000 */
5001 static void
5002 iris_populate_gs_key(const struct iris_context *ice,
5003 const struct shader_info *info,
5004 gl_shader_stage last_stage,
5005 struct iris_gs_prog_key *key)
5006 {
5007 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
5008
5009 if (info->clip_distance_array_size == 0 &&
5010 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
5011 last_stage == MESA_SHADER_GEOMETRY)
5012 key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
5013 }
5014
5015 /**
5016 * Populate FS program key fields based on the current state.
5017 */
5018 static void
5019 iris_populate_fs_key(const struct iris_context *ice,
5020 const struct shader_info *info,
5021 struct iris_fs_prog_key *key)
5022 {
5023 struct iris_screen *screen = (void *) ice->ctx.screen;
5024 const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
5025 const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
5026 const struct iris_rasterizer_state *rast = ice->state.cso_rast;
5027 const struct iris_blend_state *blend = ice->state.cso_blend;
5028
5029 key->nr_color_regions = fb->nr_cbufs;
5030
5031 key->clamp_fragment_color = rast->clamp_fragment_color;
5032
5033 key->alpha_to_coverage = blend->alpha_to_coverage;
5034
5035 key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
5036
5037 key->flat_shade = rast->flatshade &&
5038 (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
5039
5040 key->persample_interp = rast->force_persample_interp;
5041 key->multisample_fbo = rast->multisample && fb->samples > 1;
5042
5043 key->coherent_fb_fetch = GFX_VER >= 9 && GFX_VER < 20;
5044
5045 key->force_dual_color_blend =
5046 screen->driconf.dual_color_blend_by_location &&
5047 (blend->blend_enables & 1) && blend->dual_color_blending;
5048 }
5049
5050 static void
5051 iris_populate_cs_key(const struct iris_context *ice,
5052 struct iris_cs_prog_key *key)
5053 {
5054 }
5055
5056 static inline uint32_t
5057 encode_sampler_count(const struct iris_compiled_shader *shader)
5058 {
5059 /* We can potentially have way more than 32 samplers and that's ok.
5060 * However, the 3DSTATE_XS packets only have 3 bits to specify how
5061 * many to pre-fetch and all values above 4 are marked reserved.
5062 */
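/* For example, a samplers_used_mask covering samplers 0-5 gives count = 6,
 * which encodes as DIV_ROUND_UP(6, 4) = 2, i.e. prefetch between 5 and 8
 * samplers.
 */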
5063 uint32_t count = util_last_bit64(shader->bt.samplers_used_mask);
5064 return DIV_ROUND_UP(CLAMP(count, 0, 16), 4);
5065 }
5066
5067 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
5068 pkt.KernelStartPointer = KSP(shader); \
5069 pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
5070 pkt.SamplerCount = encode_sampler_count(shader); \
5071 pkt.FloatingPointMode = shader->use_alt_mode; \
5072 \
5073 pkt.DispatchGRFStartRegisterForURBData = \
5074 shader->dispatch_grf_start_reg; \
5075 pkt.prefix##URBEntryReadLength = vue_data->urb_read_length; \
5076 pkt.prefix##URBEntryReadOffset = 0; \
5077 \
5078 pkt.StatisticsEnable = true; \
5079 pkt.Enable = true; \
5080 \
5081 if (shader->total_scratch) { \
5082 INIT_THREAD_SCRATCH_SIZE(pkt) \
5083 }
5084
5085 /* Note that on Gfx12HP we pass a scratch space surface state offset
5086 * shifted by 2 relative to the value specified on the BSpec, since
5087 * that allows the compiler to save a shift instruction while
5088 * constructing the extended descriptor for SS addressing. That
5089 * worked because we limit the scratch surface state pool to 8 MB and
5090 * because we relied on the legacy (ExBSO=0) encoding of the extended
5091 * descriptor in order to save the shift, which is no longer supported
5092 * for the UGM shared function on Xe2 platforms, so we no longer
5093 * attempt to do that trick.
5094 */
5095 #define SCRATCH_SPACE_BUFFER_SHIFT (GFX_VER >= 20 ? 6 : 4)
5096
5097 #if GFX_VERx10 >= 125
5098 #define INIT_THREAD_SCRATCH_SIZE(pkt)
5099 #define MERGE_SCRATCH_ADDR(name) \
5100 { \
5101 uint32_t pkt2[GENX(name##_length)] = {0}; \
5102 _iris_pack_command(batch, GENX(name), pkt2, p) { \
5103 p.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT; \
5104 } \
5105 iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \
5106 }
5107 #else
5108 #define INIT_THREAD_SCRATCH_SIZE(pkt) \
5109 pkt.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
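/* PerThreadScratchSpace is a power-of-two encoding: e.g. 2048 bytes of
 * per-thread scratch gives ffs(2048) - 11 = 1, meaning 2KB per thread.
 */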
5110 #define MERGE_SCRATCH_ADDR(name) \
5111 { \
5112 uint32_t pkt2[GENX(name##_length)] = {0}; \
5113 _iris_pack_command(batch, GENX(name), pkt2, p) { \
5114 p.ScratchSpaceBasePointer = \
5115 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE); \
5116 } \
5117 iris_emit_merge(batch, pkt, pkt2, GENX(name##_length)); \
5118 }
5119 #endif
5120
5121
5122 /**
5123 * Encode most of 3DSTATE_VS based on the compiled shader.
5124 */
5125 static void
5126 iris_store_vs_state(const struct intel_device_info *devinfo,
5127 struct iris_compiled_shader *shader)
5128 {
5129 struct iris_vue_data *vue_data = iris_vue_data(shader);
5130
5131 iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
5132 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
5133 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
5134 #if GFX_VER < 20
5135 vs.SIMD8DispatchEnable = true;
5136 #endif
5137 vs.UserClipDistanceCullTestEnableBitmask =
5138 vue_data->cull_distance_mask;
5139 }
5140 }
5141
5142 /**
5143 * Encode most of 3DSTATE_HS based on the compiled shader.
5144 */
5145 static void
5146 iris_store_tcs_state(const struct intel_device_info *devinfo,
5147 struct iris_compiled_shader *shader)
5148 {
5149 struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
5150 struct iris_vue_data *vue_data = &tcs_data->base;
5151
5152 iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
5153 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
5154
5155 #if GFX_VER >= 12
5156 /* Wa_1604578095:
5157 *
5158 * A hang occurs when the maximum number of threads is less than twice
5159 * the instance count; the maximum thread count must be more than twice
5160 * the instance count.
5161 */
5162 assert((devinfo->max_tcs_threads / 2) > tcs_data->instances);
5163 hs.DispatchGRFStartRegisterForURBData = shader->dispatch_grf_start_reg & 0x1f;
5164 hs.DispatchGRFStartRegisterForURBData5 = shader->dispatch_grf_start_reg >> 5;
5165 #endif
5166
5167 hs.InstanceCount = tcs_data->instances - 1;
5168 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
5169 hs.IncludeVertexHandles = true;
5170
5171 #if GFX_VER == 12
5172 /* Patch Count threshold specifies the maximum number of patches that
5173 * will be accumulated before a thread dispatch is forced.
5174 */
5175 hs.PatchCountThreshold = tcs_data->patch_count_threshold;
5176 #endif
5177
5178 #if GFX_VER >= 9
5179 #if GFX_VER < 20
5180 hs.DispatchMode = vue_data->dispatch_mode;
5181 #endif
5182 hs.IncludePrimitiveID = tcs_data->include_primitive_id;
5183 #endif
5184 }
5185 }
5186
5187 /**
5188 * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
5189 */
5190 static void
5191 iris_store_tes_state(const struct intel_device_info *devinfo,
5192 struct iris_compiled_shader *shader)
5193 {
5194 struct iris_tes_data *tes_data = iris_tes_data(shader);
5195 struct iris_vue_data *vue_data = &tes_data->base;
5196
5197 uint32_t *ds_state = (void *) shader->derived_data;
5198 uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
5199
5200 iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
5201 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
5202
5203 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
5204 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
5205 ds.ComputeWCoordinateEnable =
5206 tes_data->domain == INTEL_TESS_DOMAIN_TRI;
5207
5208 #if GFX_VER >= 12
5209 ds.PrimitiveIDNotRequired = !tes_data->include_primitive_id;
5210 #endif
5211 ds.UserClipDistanceCullTestEnableBitmask =
5212 vue_data->cull_distance_mask;
5213 }
5214
5215 iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
5216 te.Partitioning = tes_data->partitioning;
5217 #if GFX_VER >= 20
5218 te.NumberOfRegionsPerPatch = 2;
5219 #endif
5220 te.OutputTopology = tes_data->output_topology;
5221 te.TEDomain = tes_data->domain;
5222 te.TEEnable = true;
5223 te.MaximumTessellationFactorOdd = 63.0;
5224 te.MaximumTessellationFactorNotOdd = 64.0;
5225 #if GFX_VERx10 >= 125
5226 STATIC_ASSERT(TEDMODE_OFF == 0);
5227 if (intel_needs_workaround(devinfo, 14015055625)) {
5228 te.TessellationDistributionMode = TEDMODE_OFF;
5229 } else if (intel_needs_workaround(devinfo, 22012699309)) {
5230 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
5231 } else {
5232 te.TessellationDistributionMode = TEDMODE_RR_FREE;
5233 }
5234
5235 #if GFX_VER >= 20
5236 te.TessellationDistributionLevel = TEDLEVEL_REGION;
5237 #else
5238 te.TessellationDistributionLevel = TEDLEVEL_PATCH;
5239 #endif
5240 /* 64_TRIANGLES */
5241 te.SmallPatchThreshold = 3;
5242 /* 1K_TRIANGLES */
5243 te.TargetBlockSize = 8;
5244 /* 1K_TRIANGLES */
5245 te.LocalBOPAccumulatorThreshold = 1;
5246 #endif
5247 }
5248 }
5249
5250 /**
5251 * Encode most of 3DSTATE_GS based on the compiled shader.
5252 */
5253 static void
5254 iris_store_gs_state(const struct intel_device_info *devinfo,
5255 struct iris_compiled_shader *shader)
5256 {
5257 struct iris_gs_data *gs_data = iris_gs_data(shader);
5258 struct iris_vue_data *vue_data = &gs_data->base;
5259
5260 iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
5261 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
5262
5263 gs.OutputVertexSize = gs_data->output_vertex_size_hwords * 2 - 1;
5264 gs.OutputTopology = gs_data->output_topology;
5265 gs.ControlDataHeaderSize = gs_data->control_data_header_size_hwords;
5266 gs.InstanceControl = gs_data->invocations - 1;
5267 #if GFX_VER < 20
5268 gs.DispatchMode = DISPATCH_MODE_SIMD8;
5269 #endif
5270 gs.IncludePrimitiveID = gs_data->include_primitive_id;
5271 gs.ControlDataFormat = gs_data->control_data_format;
5272 gs.ReorderMode = TRAILING;
5273 gs.ExpectedVertexCount = gs_data->vertices_in;
5274 gs.MaximumNumberofThreads =
5275 GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
5276 : (devinfo->max_gs_threads - 1);
5277
5278 if (gs_data->static_vertex_count != -1) {
5279 gs.StaticOutput = true;
5280 gs.StaticOutputVertexCount = gs_data->static_vertex_count;
5281 }
5282 gs.IncludeVertexHandles = vue_data->include_vue_handles;
5283
5284 gs.UserClipDistanceCullTestEnableBitmask = vue_data->cull_distance_mask;
5285
5286 const int urb_entry_write_offset = 1;
5287 const uint32_t urb_entry_output_length =
5288 DIV_ROUND_UP(vue_data->vue_map.num_slots, 2) - urb_entry_write_offset;
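/* The write offset of 1 skips the first 256-bit pair, which holds the VUE
 * header. For example, a 5-slot VUE map gives DIV_ROUND_UP(5, 2) - 1 = 2
 * pairs of actual output data.
 */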
5289
5290 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
5291 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
5292 }
5293 }
5294
5295 /**
5296 * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
5297 */
5298 static void
5299 iris_store_fs_state(const struct intel_device_info *devinfo,
5300 struct iris_compiled_shader *shader)
5301 {
5302 struct iris_fs_data *fs_data = iris_fs_data(shader);
5303
5304 uint32_t *ps_state = (void *) shader->derived_data;
5305 uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
5306
5307 iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
5308 ps.VectorMaskEnable = fs_data->uses_vmask;
5309 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
5310 ps.SamplerCount = encode_sampler_count(shader);
5311 ps.FloatingPointMode = shader->use_alt_mode;
5312 ps.MaximumNumberofThreadsPerPSD =
5313 devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
5314
5315 #if GFX_VER < 20
5316 ps.PushConstantEnable = devinfo->needs_null_push_constant_tbimr_workaround ||
5317 shader->ubo_ranges[0].length > 0;
5318 #endif
5319
5320 /* From the documentation for this packet:
5321 * "If the PS kernel does not need the Position XY Offsets to
5322 * compute a Position Value, then this field should be programmed
5323 * to POSOFFSET_NONE."
5324 *
5325 * "SW Recommendation: If the PS kernel needs the Position Offsets
5326 * to compute a Position XY value, this field should match Position
5327 * ZW Interpolation Mode to ensure a consistent position.xyzw
5328 * computation."
5329 *
5330 * We only require XY sample offsets, so this recommendation doesn't
5331 * look useful at the moment. We might need it in the future.
5332 */
5333 ps.PositionXYOffsetSelect =
5334 fs_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
5335
5336 if (shader->total_scratch) {
5337 INIT_THREAD_SCRATCH_SIZE(ps);
5338 }
5339 }
5340
5341 iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
5342 psx.PixelShaderValid = true;
5343 psx.PixelShaderComputedDepthMode = fs_data->computed_depth_mode;
5344 psx.PixelShaderKillsPixel = fs_data->uses_kill;
5345 #if GFX_VER < 20
5346 psx.AttributeEnable = fs_data->num_varying_inputs != 0;
5347 #endif
5348 psx.PixelShaderUsesSourceDepth = fs_data->uses_src_depth;
5349 psx.PixelShaderUsesSourceW = fs_data->uses_src_w;
5350 psx.PixelShaderIsPerSample = fs_data->is_per_sample;
5351 psx.oMaskPresenttoRenderTarget = fs_data->uses_omask;
5352
5353 #if GFX_VER >= 9
5354 #if GFX_VER >= 20
5355 assert(!fs_data->pulls_bary);
5356 #else
5357 psx.PixelShaderPullsBary = fs_data->pulls_bary;
5358 #endif
5359 psx.PixelShaderComputesStencil = fs_data->computed_stencil;
5360 #endif
5361
5362 #if GFX_VER >= 11
5363 psx.PixelShaderRequiresSubpixelSampleOffsets =
5364 fs_data->uses_sample_offsets;
5365 psx.PixelShaderRequiresNonPerspectiveBaryPlaneCoefficients =
5366 fs_data->uses_npc_bary_coefficients;
5367 psx.PixelShaderRequiresPerspectiveBaryPlaneCoefficients =
5368 fs_data->uses_pc_bary_coefficients;
5369 psx.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
5370 fs_data->uses_depth_w_coefficients;
5371 #endif
5372 }
5373 }
5374
5375 /**
5376 * Encode most of INTERFACE_DESCRIPTOR_DATA based on the compiled shader.
5377 */
5380 static void
5381 iris_store_cs_state(const struct intel_device_info *devinfo,
5382 struct iris_compiled_shader *shader)
5383 {
5384 struct iris_cs_data *cs_data = iris_cs_data(shader);
5385 void *map = shader->derived_data;
5386
5387 iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
5388 #if GFX_VERx10 < 125
5389 desc.ConstantURBEntryReadLength = cs_data->push.per_thread.regs;
5390 desc.CrossThreadConstantDataReadLength =
5391 cs_data->push.cross_thread.regs;
5392 #else
5393 assert(cs_data->push.per_thread.regs == 0);
5394 assert(cs_data->push.cross_thread.regs == 0);
5395 #endif
5396 #if GFX_VERx10 <= 125
5397 desc.BarrierEnable = cs_data->uses_barrier;
5398 #endif
5399 /* Typically set to 0 to avoid prefetching on every thread dispatch. */
5400 desc.BindingTableEntryCount = devinfo->verx10 == 125 ?
5401 0 : MIN2(shader->bt.size_bytes / 4, 31);
5402 desc.SamplerCount = encode_sampler_count(shader);
5403 /* TODO: Check if we are missing workarounds and enable mid-thread
5404 * preemption.
5405 *
5406 * We still have issues with mid-thread preemption (it was already
5407 * disabled by the kernel on gfx11, due to missing workarounds). It's
5408 * possible that we are just missing some workarounds, and could enable
5409 * it later, but for now let's disable it to fix a GPU hang in compute
5410 * in Car Chase (and possibly more).
5411 */
5412 #if GFX_VER >= 20
5413 desc.ThreadPreemption = false;
5414 #elif GFX_VER >= 12
5415 desc.ThreadPreemptionDisable = true;
5416 #endif
5417 }
5418 }
5419
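/**
 * Compute the size of the derived data (shader command packets).
 *
 * This must match the data written by the iris_store_xs_state() functions.
 */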
5420 static unsigned
5421 iris_derived_program_state_size(enum iris_program_cache_id cache_id)
5422 {
5423 assert(cache_id <= IRIS_CACHE_BLORP);
5424
5425 static const unsigned dwords[] = {
5426 [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
5427 [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
5428 [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
5429 [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
5430 [IRIS_CACHE_FS] =
5431 GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
5432 [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
5433 [IRIS_CACHE_BLORP] = 0,
5434 };
5435
5436 return sizeof(uint32_t) * dwords[cache_id];
5437 }
5438
5439 /**
5440 * Create any state packets corresponding to the given shader stage
5441 * (i.e. 3DSTATE_VS) and save them as "derived data" in the shader variant.
5442 * This means that we can look up a program in the in-memory cache and
5443 * get most of the state packet without having to reconstruct it.
5444 */
5445 static void
5446 iris_store_derived_program_state(const struct intel_device_info *devinfo,
5447 enum iris_program_cache_id cache_id,
5448 struct iris_compiled_shader *shader)
5449 {
5450 switch (cache_id) {
5451 case IRIS_CACHE_VS:
5452 iris_store_vs_state(devinfo, shader);
5453 break;
5454 case IRIS_CACHE_TCS:
5455 iris_store_tcs_state(devinfo, shader);
5456 break;
5457 case IRIS_CACHE_TES:
5458 iris_store_tes_state(devinfo, shader);
5459 break;
5460 case IRIS_CACHE_GS:
5461 iris_store_gs_state(devinfo, shader);
5462 break;
5463 case IRIS_CACHE_FS:
5464 iris_store_fs_state(devinfo, shader);
5465 break;
5466 case IRIS_CACHE_CS:
5467 iris_store_cs_state(devinfo, shader);
5468 break;
5469 case IRIS_CACHE_BLORP:
5470 break;
5471 }
5472 }
5473
5474 /* ------------------------------------------------------------------- */
5475
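/* _3DCommandSubOpcode values for the 3DSTATE_CONSTANT_* packets, indexed by
 * shader stage. Compute has no 3DSTATE_CONSTANT packet, hence the 0.
 */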
5476 static const uint32_t push_constant_opcodes[] = {
5477 [MESA_SHADER_VERTEX] = 21,
5478 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
5479 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
5480 [MESA_SHADER_GEOMETRY] = 22,
5481 [MESA_SHADER_FRAGMENT] = 23,
5482 [MESA_SHADER_COMPUTE] = 0,
5483 };
5484
5485 static uint32_t
5486 use_null_surface(struct iris_batch *batch, struct iris_context *ice)
5487 {
5488 struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
5489
5490 iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5491
5492 return ice->state.unbound_tex.offset;
5493 }
5494
5495 static uint32_t
5496 use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
5497 {
5498 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
5499 if (!ice->state.null_fb.res)
5500 return use_null_surface(batch, ice);
5501
5502 struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
5503
5504 iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
5505
5506 return ice->state.null_fb.offset;
5507 }
5508
5509 static uint32_t
5510 surf_state_offset_for_aux(unsigned aux_modes,
5511 enum isl_aux_usage aux_usage)
5512 {
5513 assert(aux_modes & (1 << aux_usage));
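/* For example, with aux_modes = (1 << ISL_AUX_USAGE_NONE) |
 * (1 << ISL_AUX_USAGE_CCS_E), looking up CCS_E counts the single
 * lower-numbered mode and returns 1 * SURFACE_STATE_ALIGNMENT.
 */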
5514 return SURFACE_STATE_ALIGNMENT *
5515 util_bitcount(aux_modes & ((1 << aux_usage) - 1));
5516 }
5517
5518 #if GFX_VER == 9
5519 static void
5520 surf_state_update_clear_value(struct iris_batch *batch,
5521 struct iris_resource *res,
5522 struct iris_surface_state *surf_state,
5523 enum isl_aux_usage aux_usage)
5524 {
5525 struct isl_device *isl_dev = &batch->screen->isl_dev;
5526 struct iris_bo *state_bo = iris_resource_bo(surf_state->ref.res);
5527 uint64_t real_offset = surf_state->ref.offset + IRIS_MEMZONE_BINDER_START;
5528 uint32_t offset_into_bo = real_offset - state_bo->address;
5529 uint32_t clear_offset = offset_into_bo +
5530 isl_dev->ss.clear_value_offset +
5531 surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5532 uint32_t *color = res->aux.clear_color.u32;
5533
5534 assert(isl_dev->ss.clear_value_size == 16);
5535
5536 if (aux_usage == ISL_AUX_USAGE_HIZ) {
5537 iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
5538 PIPE_CONTROL_WRITE_IMMEDIATE,
5539 state_bo, clear_offset, color[0]);
5540 } else {
5541 iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
5542 PIPE_CONTROL_WRITE_IMMEDIATE,
5543 state_bo, clear_offset,
5544 (uint64_t) color[0] |
5545 (uint64_t) color[1] << 32);
5546 iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
5547 PIPE_CONTROL_WRITE_IMMEDIATE,
5548 state_bo, clear_offset + 8,
5549 (uint64_t) color[2] |
5550 (uint64_t) color[3] << 32);
5551 }
5552
5553 iris_emit_pipe_control_flush(batch,
5554 "update fast clear: state cache invalidate",
5555 PIPE_CONTROL_FLUSH_ENABLE |
5556 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
5557 }
5558 #endif
5559
5560 static void
5561 update_clear_value(struct iris_context *ice,
5562 struct iris_batch *batch,
5563 struct iris_resource *res,
5564 struct iris_surface_state *surf_state,
5565 struct isl_view *view)
5566 {
5567 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5568 UNUSED unsigned aux_modes = surf_state->aux_usages;
5569
5570 /* We only need to update the clear color in the surface state for gfx8 and
5571 * gfx9. Newer gens can read it directly from the clear color state buffer.
5572 */
5573 #if GFX_VER == 9
5574 /* Skip updating the ISL_AUX_USAGE_NONE surface state */
5575 aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
5576
5577 while (aux_modes) {
5578 enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
5579
5580 surf_state_update_clear_value(batch, res, surf_state, aux_usage);
5581 }
5582 #elif GFX_VER == 8
5583 /* TODO: Could update rather than re-filling */
5584 alloc_surface_states(surf_state, surf_state->aux_usages);
5585
5586 fill_surface_states(isl_dev, surf_state, res, &res->surf, view, 0, 0, 0);
5587
5588 upload_surface_states(ice->state.surface_uploader, surf_state);
5589 #endif
5590 }
5591
5592 static uint32_t
5593 use_surface_state(struct iris_batch *batch,
5594 struct iris_surface_state *surf_state,
5595 enum isl_aux_usage aux_usage)
5596 {
5597 iris_use_pinned_bo(batch, iris_resource_bo(surf_state->ref.res), false,
5598 IRIS_DOMAIN_NONE);
5599
5600 return surf_state->ref.offset +
5601 surf_state_offset_for_aux(surf_state->aux_usages, aux_usage);
5602 }
5603
5604 /**
5605 * Add a surface to the validation list, as well as the buffer containing
5606 * the corresponding SURFACE_STATE.
5607 *
5608 * Returns the binding table entry (offset to SURFACE_STATE).
5609 */
5610 static uint32_t
5611 use_surface(struct iris_context *ice,
5612 struct iris_batch *batch,
5613 struct pipe_surface *p_surf,
5614 bool writeable,
5615 enum isl_aux_usage aux_usage,
5616 bool is_read_surface,
5617 enum iris_domain access)
5618 {
5619 struct iris_surface *surf = (void *) p_surf;
5620 struct iris_resource *res = (void *) p_surf->texture;
5621
5622 if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
5623 upload_surface_states(ice->state.surface_uploader,
5624 &surf->surface_state_read);
5625 }
5626
5627 if (!surf->surface_state.ref.res) {
5628 upload_surface_states(ice->state.surface_uploader,
5629 &surf->surface_state);
5630 }
5631
5632 if (memcmp(&res->aux.clear_color, &surf->clear_color,
5633 sizeof(surf->clear_color)) != 0) {
5634 update_clear_value(ice, batch, res, &surf->surface_state, &surf->view);
5635 if (GFX_VER == 8) {
5636 update_clear_value(ice, batch, res, &surf->surface_state_read,
5637 &surf->read_view);
5638 }
5639 surf->clear_color = res->aux.clear_color;
5640 }
5641
5642 if (res->aux.clear_color_bo)
5643 iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
5644
5645 if (res->aux.bo)
5646 iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
5647
5648 iris_use_pinned_bo(batch, res->bo, writeable, access);
5649
5650 if (GFX_VER == 8 && is_read_surface) {
5651 return use_surface_state(batch, &surf->surface_state_read, aux_usage);
5652 } else {
5653 return use_surface_state(batch, &surf->surface_state, aux_usage);
5654 }
5655 }
5656
5657 static uint32_t
5658 use_sampler_view(struct iris_context *ice,
5659 struct iris_batch *batch,
5660 struct iris_sampler_view *isv)
5661 {
5662 enum isl_aux_usage aux_usage =
5663 iris_resource_texture_aux_usage(ice, isv->res, isv->view.format,
5664 isv->view.base_level, isv->view.levels);
5665
5666 if (!isv->surface_state.ref.res)
5667 upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
5668
5669 if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
5670 sizeof(isv->clear_color)) != 0) {
5671 update_clear_value(ice, batch, isv->res, &isv->surface_state,
5672 &isv->view);
5673 isv->clear_color = isv->res->aux.clear_color;
5674 }
5675
5676 if (isv->res->aux.clear_color_bo) {
5677 iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
5678 false, IRIS_DOMAIN_SAMPLER_READ);
5679 }
5680
5681 if (isv->res->aux.bo) {
5682 iris_use_pinned_bo(batch, isv->res->aux.bo,
5683 false, IRIS_DOMAIN_SAMPLER_READ);
5684 }
5685
5686 iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_SAMPLER_READ);
5687
5688 return use_surface_state(batch, &isv->surface_state, aux_usage);
5689 }
5690
5691 static uint32_t
5692 use_ubo_ssbo(struct iris_batch *batch,
5693 struct iris_context *ice,
5694 struct pipe_shader_buffer *buf,
5695 struct iris_state_ref *surf_state,
5696 bool writable, enum iris_domain access)
5697 {
5698 if (!buf->buffer || !surf_state->res)
5699 return use_null_surface(batch, ice);
5700
5701 iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
5702 iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
5703 IRIS_DOMAIN_NONE);
5704
5705 return surf_state->offset;
5706 }
5707
5708 static uint32_t
5709 use_image(struct iris_batch *batch, struct iris_context *ice,
5710 struct iris_shader_state *shs, const struct shader_info *info,
5711 int i)
5712 {
5713 struct iris_image_view *iv = &shs->image[i];
5714 struct iris_resource *res = (void *) iv->base.resource;
5715
5716 if (!res)
5717 return use_null_surface(batch, ice);
5718
5719 bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5720
5721 iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
5722
5723 if (res->aux.bo)
5724 iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
5725
5726 if (res->aux.clear_color_bo) {
5727 iris_use_pinned_bo(batch, res->aux.clear_color_bo, false,
5728 IRIS_DOMAIN_NONE);
5729 }
5730
5731 enum isl_aux_usage aux_usage = shs->image_aux_usage[i];
5732
5733 return use_surface_state(batch, &iv->surface_state, aux_usage);
5734 }
5735
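/* push_bt_entry() stores the next binding table entry: a SURFACE_STATE
 * offset relative to surf_base_offset (the binder BO address on Gfx8-10,
 * zero on Gfx11+). bt_assert() checks that the running index matches the
 * offset the compiler assigned to the given surface group.
 */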
5736 #define push_bt_entry(addr) \
5737 assert(addr >= surf_base_offset); \
5738 assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
5739 if (!pin_only) bt_map[s++] = (addr) - surf_base_offset;
5740
5741 #define bt_assert(section) \
5742 if (!pin_only && shader->bt.used_mask[section] != 0) \
5743 assert(shader->bt.offsets[section] == s);
5744
5745 /**
5746 * Populate the binding table for a given shader stage.
5747 *
5748 * This fills out the table of pointers to surfaces required by the shader,
5749 * and also adds those buffers to the validation list so the kernel can make
5750 * resident before running our batch.
5751 */
5752 static void
5753 iris_populate_binding_table(struct iris_context *ice,
5754 struct iris_batch *batch,
5755 gl_shader_stage stage,
5756 bool pin_only)
5757 {
5758 const struct iris_binder *binder = &ice->state.binder;
5759 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5760 if (!shader)
5761 return;
5762
5763 struct iris_binding_table *bt = &shader->bt;
5764 struct iris_shader_state *shs = &ice->state.shaders[stage];
5765 uint32_t surf_base_offset = GFX_VER < 11 ? binder->bo->address : 0;
5766
5767 uint32_t *bt_map = binder->map + binder->bt_offset[stage];
5768 int s = 0;
5769
5770 const struct shader_info *info = iris_get_shader_info(ice, stage);
5771 if (!info) {
5772 /* TCS passthrough doesn't need a binding table. */
5773 assert(stage == MESA_SHADER_TESS_CTRL);
5774 return;
5775 }
5776
5777 if (stage == MESA_SHADER_COMPUTE &&
5778 shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
5779 /* surface for gl_NumWorkGroups */
5780 struct iris_state_ref *grid_data = &ice->state.grid_size;
5781 struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
5782 iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
5783 IRIS_DOMAIN_PULL_CONSTANT_READ);
5784 iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
5785 IRIS_DOMAIN_NONE);
5786 push_bt_entry(grid_state->offset);
5787 }
5788
5789 if (stage == MESA_SHADER_FRAGMENT) {
5790 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5791 /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5792 if (cso_fb->nr_cbufs) {
5793 for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5794 uint32_t addr;
5795 if (cso_fb->cbufs[i]) {
5796 addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
5797 ice->state.draw_aux_usage[i], false,
5798 IRIS_DOMAIN_RENDER_WRITE);
5799 } else {
5800 addr = use_null_fb_surface(batch, ice);
5801 }
5802 push_bt_entry(addr);
5803 }
5804 } else if (bt->use_null_rt) {
5805 uint32_t addr = use_null_fb_surface(batch, ice);
5806 push_bt_entry(addr);
5807 }
5808 }
5809
5810 #define foreach_surface_used(index, group) \
5811 bt_assert(group); \
5812 for (int index = 0; index < bt->sizes[group]; index++) \
5813 if (iris_group_index_to_bti(bt, group, index) != \
5814 IRIS_SURFACE_NOT_USED)
5815
5816 foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5817 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5818 uint32_t addr;
5819 if (cso_fb->cbufs[i]) {
5820 addr = use_surface(ice, batch, cso_fb->cbufs[i],
5821 false, ice->state.draw_aux_usage[i], true,
5822 IRIS_DOMAIN_SAMPLER_READ);
5823 push_bt_entry(addr);
5824 }
5825 }
5826
5827 foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_LOW64) {
5828 struct iris_sampler_view *view = shs->textures[i];
5829 uint32_t addr = view ? use_sampler_view(ice, batch, view)
5830 : use_null_surface(batch, ice);
5831 push_bt_entry(addr);
5832 }
5833
5834 foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE_HIGH64) {
5835 struct iris_sampler_view *view = shs->textures[64 + i];
5836 uint32_t addr = view ? use_sampler_view(ice, batch, view)
5837 : use_null_surface(batch, ice);
5838 push_bt_entry(addr);
5839 }
5840
5841 foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5842 uint32_t addr = use_image(batch, ice, shs, info, i);
5843 push_bt_entry(addr);
5844 }
5845
5846 foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5847 uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5848 &shs->constbuf_surf_state[i], false,
5849 IRIS_DOMAIN_PULL_CONSTANT_READ);
5850 push_bt_entry(addr);
5851 }
5852
5853 foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5854 uint32_t addr =
5855 use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5856 shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5857 push_bt_entry(addr);
5858 }
5859
5860 #if 0
5861 /* XXX: YUV surfaces not implemented yet */
5862 bt_assert(plane_start[1], ...);
5863 bt_assert(plane_start[2], ...);
5864 #endif
5865 }
5866
5867 static void
5868 iris_use_optional_res(struct iris_batch *batch,
5869 struct pipe_resource *res,
5870 bool writeable,
5871 enum iris_domain access)
5872 {
5873 if (res) {
5874 struct iris_bo *bo = iris_resource_bo(res);
5875 iris_use_pinned_bo(batch, bo, writeable, access);
5876 }
5877 }
5878
5879 static void
5880 pin_depth_and_stencil_buffers(struct iris_batch *batch,
5881 struct pipe_surface *zsbuf,
5882 struct iris_depth_stencil_alpha_state *cso_zsa)
5883 {
5884 if (!zsbuf)
5885 return;
5886
5887 struct iris_resource *zres, *sres;
5888 iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5889
5890 if (zres) {
5891 iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5892 IRIS_DOMAIN_DEPTH_WRITE);
5893 if (zres->aux.bo) {
5894 iris_use_pinned_bo(batch, zres->aux.bo,
5895 cso_zsa->depth_writes_enabled,
5896 IRIS_DOMAIN_DEPTH_WRITE);
5897 }
5898 }
5899
5900 if (sres) {
5901 iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5902 IRIS_DOMAIN_DEPTH_WRITE);
5903 }
5904 }
5905
5906 static uint32_t
5907 pin_scratch_space(struct iris_context *ice,
5908 struct iris_batch *batch,
5909 const struct iris_compiled_shader *shader,
5910 gl_shader_stage stage)
5911 {
5912 uint32_t scratch_addr = 0;
5913
5914 if (shader->total_scratch > 0) {
5915 struct iris_bo *scratch_bo =
5916 iris_get_scratch_space(ice, shader->total_scratch, stage);
5917 iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5918
5919 #if GFX_VERx10 >= 125
5920 const struct iris_state_ref *ref =
5921 iris_get_scratch_surf(ice, shader->total_scratch);
5922 iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5923 false, IRIS_DOMAIN_NONE);
5924 scratch_addr = ref->offset +
5925 iris_resource_bo(ref->res)->address -
5926 IRIS_MEMZONE_SCRATCH_START;
5927 assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
5928 #else
5929 scratch_addr = scratch_bo->address;
5930 #endif
5931 }
5932
5933 return scratch_addr;
5934 }
5935
5936 /* ------------------------------------------------------------------- */
5937
5938 /**
5939 * Pin any BOs which were installed by a previous batch, and restored
5940 * via the hardware logical context mechanism.
5941 *
5942 * We don't need to re-emit all state every batch - the hardware context
5943 * mechanism will save and restore it for us. This includes pointers to
5944 * various BOs...which won't exist unless we ask the kernel to pin them
5945 * by adding them to the validation list.
5946 *
5947 * We can skip buffers if we've re-emitted those packets, as we're
5948 * overwriting those stale pointers with new ones, and don't actually
5949 * refer to the old BOs.
5950 */
5951 static void
5952 iris_restore_render_saved_bos(struct iris_context *ice,
5953 struct iris_batch *batch,
5954 const struct pipe_draw_info *draw)
5955 {
5956 struct iris_genx_state *genx = ice->state.genx;
5957
5958 const uint64_t clean = ~ice->state.dirty;
5959 const uint64_t stage_clean = ~ice->state.stage_dirty;
5960
5961 if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5962 iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5963 IRIS_DOMAIN_NONE);
5964 }
5965
5966 if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
5967 iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
5968 IRIS_DOMAIN_NONE);
5969 }
5970
5971 if (clean & IRIS_DIRTY_BLEND_STATE) {
5972 iris_use_optional_res(batch, ice->state.last_res.blend, false,
5973 IRIS_DOMAIN_NONE);
5974 }
5975
5976 if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
5977 iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
5978 IRIS_DOMAIN_NONE);
5979 }
5980
5981 if (clean & IRIS_DIRTY_SCISSOR_RECT) {
5982 iris_use_optional_res(batch, ice->state.last_res.scissor, false,
5983 IRIS_DOMAIN_NONE);
5984 }
5985
5986 if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
5987 for (int i = 0; i < 4; i++) {
5988 struct iris_stream_output_target *tgt =
5989 (void *) ice->state.so_target[i];
5990 if (tgt) {
5991 iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
5992 true, IRIS_DOMAIN_OTHER_WRITE);
5993 iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
5994 true, IRIS_DOMAIN_OTHER_WRITE);
5995 }
5996 }
5997 }
5998
5999 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6000 if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6001 continue;
6002
6003 struct iris_shader_state *shs = &ice->state.shaders[stage];
6004 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6005
6006 if (!shader)
6007 continue;
6008
6009 for (int i = 0; i < 4; i++) {
6010 const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6011
6012 if (range->length == 0)
6013 continue;
6014
6015 /* Range block is a binding table index, map back to UBO index. */
6016 unsigned block_index = iris_bti_to_group_index(
6017 &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6018 assert(block_index != IRIS_SURFACE_NOT_USED);
6019
6020 struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6021 struct iris_resource *res = (void *) cbuf->buffer;
6022
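      /* If no buffer is bound in this slot, the previously emitted push
       * constant packet points at the workaround BO, so pin that instead to
       * keep the stale pointer valid.
       */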
6023 if (res)
6024 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
6025 else
6026 iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
6027 IRIS_DOMAIN_OTHER_READ);
6028 }
6029 }
6030
6031 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6032 if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6033 /* Re-pin any buffers referred to by the binding table. */
6034 iris_populate_binding_table(ice, batch, stage, true);
6035 }
6036 }
6037
6038 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6039 struct iris_shader_state *shs = &ice->state.shaders[stage];
6040 struct pipe_resource *res = shs->sampler_table.res;
6041 if (res)
6042 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6043 IRIS_DOMAIN_NONE);
6044 }
6045
6046 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6047 if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
6048 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6049
6050 if (shader) {
6051 struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6052 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6053
6054 pin_scratch_space(ice, batch, shader, stage);
6055 }
6056 }
6057 }
6058
6059 if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
6060 (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
6061 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6062 pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
6063 }
6064
6065 iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
6066 IRIS_DOMAIN_VF_READ);
6067
6068 if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
6069 uint64_t bound = ice->state.bound_vertex_buffers;
6070 while (bound) {
6071 const int i = u_bit_scan64(&bound);
6072 struct pipe_resource *res = genx->vertex_buffers[i].resource;
6073 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
6074 IRIS_DOMAIN_VF_READ);
6075 }
6076 }
6077 }
6078
6079 static void
6080 iris_restore_compute_saved_bos(struct iris_context *ice,
6081 struct iris_batch *batch,
6082 const struct pipe_grid_info *grid)
6083 {
6084 const uint64_t stage_clean = ~ice->state.stage_dirty;
6085
6086 const int stage = MESA_SHADER_COMPUTE;
6087 struct iris_shader_state *shs = &ice->state.shaders[stage];
6088
6089 if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
6090 /* Re-pin any buffers referred to by the binding table. */
6091 iris_populate_binding_table(ice, batch, stage, true);
6092 }
6093
6094 struct pipe_resource *sampler_res = shs->sampler_table.res;
6095 if (sampler_res)
6096 iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
6097 IRIS_DOMAIN_NONE);
6098
6099 if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
6100 (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
6101 (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
6102 (stage_clean & IRIS_STAGE_DIRTY_CS)) {
6103 iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
6104 IRIS_DOMAIN_NONE);
6105 }
6106
6107 if (stage_clean & IRIS_STAGE_DIRTY_CS) {
6108 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6109
6110 if (shader) {
6111 struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
6112 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
6113
6114 if (GFX_VERx10 < 125) {
6115 struct iris_bo *curbe_bo =
6116 iris_resource_bo(ice->state.last_res.cs_thread_ids);
6117 iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
6118 }
6119
6120 pin_scratch_space(ice, batch, shader, stage);
6121 }
6122 }
6123 }
6124
6125 /**
6126 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
6127 */
6128 static void
6129 iris_update_binder_address(struct iris_batch *batch,
6130 struct iris_binder *binder)
6131 {
6132 if (batch->last_binder_address == binder->bo->address)
6133 return;
6134
6135 struct isl_device *isl_dev = &batch->screen->isl_dev;
6136 uint32_t mocs = isl_mocs(isl_dev, 0, false);
6137
6138 iris_batch_sync_region_start(batch);
6139
6140 #if GFX_VER >= 11
6141 /* Use 3DSTATE_BINDING_TABLE_POOL_ALLOC on Icelake and later */
6142
6143 #if GFX_VERx10 == 120
6144 /* Wa_1607854226:
6145 *
6146 * Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
6147 * mode by putting the pipeline temporarily in 3D mode.
6148 */
6149 if (batch->name == IRIS_BATCH_COMPUTE)
6150 emit_pipeline_select(batch, _3D);
6151 #endif
6152
6153 iris_emit_pipe_control_flush(batch, "Stall for binder realloc",
6154 PIPE_CONTROL_CS_STALL);
6155
6156 iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
6157 btpa.BindingTablePoolBaseAddress = ro_bo(binder->bo, 0);
6158 btpa.BindingTablePoolBufferSize = binder->size / 4096;
6159 #if GFX_VERx10 < 125
6160 btpa.BindingTablePoolEnable = true;
6161 #endif
6162 btpa.MOCS = mocs;
6163 }
6164
6165 #if GFX_VERx10 == 120
6166 /* Wa_1607854226:
6167 *
6168 * Put the pipeline back into compute mode.
6169 */
6170 if (batch->name == IRIS_BATCH_COMPUTE)
6171 emit_pipeline_select(batch, GPGPU);
6172 #endif
6173 #else
6174 /* Use STATE_BASE_ADDRESS on older platforms */
6175 flush_before_state_base_change(batch);
6176
6177 iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
6178 sba.SurfaceStateBaseAddressModifyEnable = true;
6179 sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
6180
6181 /* The hardware appears to pay attention to the MOCS fields even
6182 * if you don't set the "Address Modify Enable" bit for the base.
6183 */
6184 sba.GeneralStateMOCS = mocs;
6185 sba.StatelessDataPortAccessMOCS = mocs;
6186 sba.DynamicStateMOCS = mocs;
6187 sba.IndirectObjectMOCS = mocs;
6188 sba.InstructionMOCS = mocs;
6189 sba.SurfaceStateMOCS = mocs;
6190 #if GFX_VER >= 9
6191 sba.BindlessSurfaceStateMOCS = mocs;
6192 #endif
6193 #if GFX_VERx10 >= 125
6194 sba.L1CacheControl = L1CC_WB;
6195 #endif
6196 }
6197 #endif
6198
6199 flush_after_state_base_change(batch);
6200 iris_batch_sync_region_end(batch);
6201
6202 batch->last_binder_address = binder->bo->address;
6203 }
6204
6205 static inline void
6206 iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
6207 bool window_space_position, float *zmin, float *zmax)
6208 {
6209 if (window_space_position) {
6210 *zmin = 0.f;
6211 *zmax = 1.f;
6212 return;
6213 }
6214 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
6215 }
6216
6217 /* Wa_16018063123 */
6218 static inline void
6219 batch_emit_fast_color_dummy_blit(struct iris_batch *batch)
6220 {
6221 #if GFX_VERx10 >= 125
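   /* Emit a tiny 1x4 linear fast-color blit aimed at the workaround address;
    * only the presence of the command matters, not what it writes.
    */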
6222 iris_emit_cmd(batch, GENX(XY_FAST_COLOR_BLT), blt) {
6223 blt.DestinationBaseAddress = batch->screen->workaround_address;
6224 blt.DestinationMOCS = iris_mocs(batch->screen->workaround_address.bo,
6225 &batch->screen->isl_dev,
6226 ISL_SURF_USAGE_BLITTER_DST_BIT);
6227 blt.DestinationPitch = 63;
6228 blt.DestinationX2 = 1;
6229 blt.DestinationY2 = 4;
6230 blt.DestinationSurfaceWidth = 1;
6231 blt.DestinationSurfaceHeight = 4;
6232 blt.DestinationSurfaceType = XY_SURFTYPE_2D;
6233 blt.DestinationSurfaceQPitch = 4;
6234 blt.DestinationTiling = XY_TILE_LINEAR;
6235 }
6236 #endif
6237 }
6238
6239 #if GFX_VER >= 12
6240 static void
6241 invalidate_aux_map_state_per_engine(struct iris_batch *batch)
6242 {
6243 uint64_t register_addr = 0;
6244
6245 switch (batch->name) {
6246 case IRIS_BATCH_RENDER: {
6247 /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6248 * RCS engine idle sequence:
6249 *
6250 * Gfx12+:
6251 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6252 * Target Cache Flush + Depth Cache
6253 *
6254 * Gfx125+:
6255 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + Render
6256 * Target Cache Flush + Depth Cache + CCS flush
6257 */
6258 iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6259 PIPE_CONTROL_DATA_CACHE_FLUSH |
6260 PIPE_CONTROL_L3_FABRIC_FLUSH |
6261 PIPE_CONTROL_CS_STALL |
6262 PIPE_CONTROL_RENDER_TARGET_FLUSH |
6263 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
6264 (GFX_VERx10 == 125 ?
6265 PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6266
6267 register_addr = GENX(GFX_CCS_AUX_INV_num);
6268 break;
6269 }
6270 case IRIS_BATCH_COMPUTE: {
6271 /* From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6272 * Compute engine idle sequence:
6273 *
6274 * Gfx12+:
6275 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall
6276 *
6277 * Gfx125+:
6278 * PIPE_CONTROL:- DC Flush + L3 Fabric Flush + CS Stall + CCS flush
6279 */
6280 iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
6281 PIPE_CONTROL_DATA_CACHE_FLUSH |
6282 PIPE_CONTROL_L3_FABRIC_FLUSH |
6283 PIPE_CONTROL_CS_STALL |
6284 (GFX_VERx10 == 125 ?
6285 PIPE_CONTROL_CCS_CACHE_FLUSH : 0));
6286
6287 register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
6288 break;
6289 }
6290 case IRIS_BATCH_BLITTER: {
6291 #if GFX_VERx10 >= 125
6292 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
6293 if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
6294 batch_emit_fast_color_dummy_blit(batch);
6295
6296 /*
6297 * Notice we don't set the L3 Fabric Flush here, because we have
6298 * PIPE_CONTROL_CS_STALL. The PIPE_CONTROL::L3 Fabric Flush
6299 * documentation says:
6300 *
6301 * "L3 Fabric Flush will ensure all the pending transactions in the
6302 * L3 Fabric are flushed to global observation point. HW does
6303 * implicit L3 Fabric Flush on all stalling flushes (both explicit
6304 * and implicit) and on PIPECONTROL having Post Sync Operation
6305 * enabled."
6306 *
6307 * Therefore setting L3 Fabric Flush here would be redundant.
6308 *
6309 * From Bspec 43904 (Register_CCSAuxiliaryTableInvalidate):
6310 * Blitter engine idle sequence:
6311 *
6312 * Gfx125+:
6313 * MI_FLUSH_DW (dw0;b16 – flush CCS)
6314 */
6315 iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
6316 fd.FlushCCS = true;
6317 }
6318 register_addr = GENX(BCS_CCS_AUX_INV_num);
6319 #endif
6320 break;
6321 }
6322 default:
6323 unreachable("Invalid batch for aux map invalidation");
6324 break;
6325 }
6326
6327 if (register_addr != 0) {
6328 /* If the aux-map state number increased, then we need to rewrite the
6329 * register. Rewriting the register is used to both set the aux-map
6330 * translation table address, and also to invalidate any previously
6331 * cached translations.
6332 */
6333 iris_load_register_imm32(batch, register_addr, 1);
6334
6335 /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
6336 *
6337 * "Poll Aux Invalidation bit once the invalidation is set (Register
6338 * 4208 bit 0)"
6339 */
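      /* Wait for the invalidation bit we just armed to read back as zero,
       * i.e. for the hardware to finish invalidating the aux table.
       */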
6340 iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
6341 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
6342 sem.WaitMode = PollingMode;
6343 sem.RegisterPollMode = true;
6344 sem.SemaphoreDataDword = 0x0;
6345 sem.SemaphoreAddress = ro_bo(NULL, register_addr);
6346 }
6347 }
6348 }
6349
6350 void
6351 genX(invalidate_aux_map_state)(struct iris_batch *batch)
6352 {
6353 struct iris_screen *screen = batch->screen;
6354 void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6355 if (!aux_map_ctx)
6356 return;
6357 uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
6358 if (batch->last_aux_map_state != aux_map_state_num) {
6359 invalidate_aux_map_state_per_engine(batch);
6360 batch->last_aux_map_state = aux_map_state_num;
6361 }
6362 }
6363
6364 static void
6365 init_aux_map_state(struct iris_batch *batch)
6366 {
6367 struct iris_screen *screen = batch->screen;
6368 void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
6369 if (!aux_map_ctx)
6370 return;
6371
6372 uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
6373 assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
6374
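   /* Pick the engine-specific AUX_TABLE_BASE_ADDR register; the 32KB-aligned
    * table base asserted above is written to it via LRI below.
    */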
6375 uint32_t reg = 0;
6376 switch (batch->name) {
6377 case IRIS_BATCH_COMPUTE:
6378 if (iris_bufmgr_compute_engine_supported(screen->bufmgr)) {
6379 reg = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
6380 break;
6381 }
6383 FALLTHROUGH;
6384 case IRIS_BATCH_RENDER:
6385 reg = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
6386 break;
6387 case IRIS_BATCH_BLITTER:
6388 #if GFX_VERx10 >= 125
6389 reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
6390 #endif
6391 break;
6392 default:
6393 unreachable("Invalid batch for aux map init.");
6394 }
6395
6396 if (reg)
6397 iris_load_register_imm64(batch, reg, base_addr);
6398 }
6399 #endif
6400
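/* Push constant buffer ranges gathered for one shader stage, used to fill
 * out 3DSTATE_CONSTANT_XS or 3DSTATE_CONSTANT_ALL packets at draw time.
 */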
6401 struct push_bos {
6402 struct {
6403 struct iris_address addr;
6404 uint32_t length;
6405 } buffers[4];
6406 int buffer_count;
6407 uint32_t max_length;
6408 };
6409
6410 static void
6411 setup_constant_buffers(struct iris_context *ice,
6412 struct iris_batch *batch,
6413 int stage,
6414 struct push_bos *push_bos)
6415 {
6416 struct iris_shader_state *shs = &ice->state.shaders[stage];
6417 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
6418
6419 uint32_t push_range_sum = 0;
6420
6421 int n = 0;
6422 for (int i = 0; i < 4; i++) {
6423 const struct iris_ubo_range *range = &shader->ubo_ranges[i];
6424
6425 if (range->length == 0)
6426 continue;
6427
6428 push_range_sum += range->length;
6429
6430 if (range->length > push_bos->max_length)
6431 push_bos->max_length = range->length;
6432
6433 /* Range block is a binding table index, map back to UBO index. */
6434 unsigned block_index = iris_bti_to_group_index(
6435 &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
6436 assert(block_index != IRIS_SURFACE_NOT_USED);
6437
6438 struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
6439 struct iris_resource *res = (void *) cbuf->buffer;
6440
6441 assert(cbuf->buffer_offset % 32 == 0);
6442
6443 if (res)
6444 iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
6445
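      /* Push ranges are measured in 32-byte units, so range->start is scaled
       * by 32 when forming the buffer address below.
       */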
6446 push_bos->buffers[n].length = range->length;
6447 push_bos->buffers[n].addr =
6448 res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
6449 : batch->screen->workaround_address;
6450 n++;
6451 }
6452
6453 /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
6454 *
6455 * "The sum of all four read length fields must be less than or
6456 * equal to the size of 64."
6457 */
6458 assert(push_range_sum <= 64);
6459
6460 push_bos->buffer_count = n;
6461 }
6462
6463 static void
6464 emit_push_constant_packets(struct iris_context *ice,
6465 struct iris_batch *batch,
6466 int stage,
6467 const struct push_bos *push_bos)
6468 {
6469 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
6470
6471 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
6472 pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
6473
6474 #if GFX_VER >= 9
6475 pkt.MOCS = isl_mocs(isl_dev, 0, false);
6476 #endif
6477
6478 /* The Skylake PRM contains the following restriction:
6479 *
6480 * "The driver must ensure The following case does not occur
6481 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
6482 * buffer 3 read length equal to zero committed followed by a
6483 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
6484 * zero committed."
6485 *
6486 * To avoid this, we program the buffers in the highest slots.
6487 * This way, slot 0 is only used if slot 3 is also used.
6488 */
6489 const int n = push_bos->buffer_count;
6490 assert(n <= 4);
6491 const unsigned shift = 4 - n;
6492 for (int i = 0; i < n; i++) {
6493 pkt.ConstantBody.ReadLength[i + shift] =
6494 push_bos->buffers[i].length;
6495 pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
6496 }
6497 }
6498 }
6499
6500 #if GFX_VER >= 12
6501 static void
6502 emit_null_push_constant_tbimr_workaround(struct iris_batch *batch)
6503 {
6504 struct isl_device *isl_dev = &batch->screen->isl_dev;
6505 /* Pass a single-register push constant payload for the PS
6506 * stage even if empty, since PS invocations with zero push
6507 * constant cycles have been found to cause hangs with TBIMR
6508 * enabled. See HSDES #22020184996.
6509 *
6510 * XXX - Use workaround infrastructure and final workaround
6511 * when provided by hardware team.
6512 */
6513 const struct iris_address null_addr = {
6514 .bo = batch->screen->workaround_bo,
6515 .offset = 1024,
6516 };
6517 const uint32_t num_dwords = 2 + 2 * 1;
6518 uint32_t const_all[num_dwords];
6519 uint32_t *dw = &const_all[0];
6520
6521 iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6522 all.DWordLength = num_dwords - 2;
6523 all.MOCS = isl_mocs(isl_dev, 0, false);
6524 all.ShaderUpdateEnable = (1 << MESA_SHADER_FRAGMENT);
6525 all.PointerBufferMask = 1;
6526 }
6527 dw += 2;
6528
6529 _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), dw, data) {
6530 data.PointerToConstantBuffer = null_addr;
6531 data.ConstantBufferReadLength = 1;
6532 }
6533
6534 iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6535 }
6536
6537 static void
6538 emit_push_constant_packet_all(struct iris_context *ice,
6539 struct iris_batch *batch,
6540 uint32_t shader_mask,
6541 const struct push_bos *push_bos)
6542 {
6543 struct isl_device *isl_dev = &batch->screen->isl_dev;
6544
6545 if (!push_bos) {
6546 if (batch->screen->devinfo->needs_null_push_constant_tbimr_workaround &&
6547 (shader_mask & (1 << MESA_SHADER_FRAGMENT))) {
6548 emit_null_push_constant_tbimr_workaround(batch);
6549 shader_mask &= ~(1 << MESA_SHADER_FRAGMENT);
6550 }
6551
6552 if (shader_mask) {
6553 iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
6554 pc.ShaderUpdateEnable = shader_mask;
6555 pc.MOCS = iris_mocs(NULL, isl_dev, 0);
6556 }
6557 }
6558 return;
6559 }
6560
6561 const uint32_t n = push_bos->buffer_count;
6562 const uint32_t max_pointers = 4;
6563 const uint32_t num_dwords = 2 + 2 * n;
6564 uint32_t const_all[2 + 2 * max_pointers];
6565 uint32_t *dw = &const_all[0];
6566
6567 assert(n <= max_pointers);
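   /* 3DSTATE_CONSTANT_ALL is a two-DWord header followed by one two-DWord
    * CONSTANT_ALL_DATA entry per buffer; PointerBufferMask marks which of
    * the four buffer slots have data following.
    */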
6568 iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
6569 all.DWordLength = num_dwords - 2;
6570 all.MOCS = isl_mocs(isl_dev, 0, false);
6571 all.ShaderUpdateEnable = shader_mask;
6572 all.PointerBufferMask = (1 << n) - 1;
6573 }
6574 dw += 2;
6575
6576 for (int i = 0; i < n; i++) {
6577 _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
6578 dw + i * 2, data) {
6579 data.PointerToConstantBuffer = push_bos->buffers[i].addr;
6580 data.ConstantBufferReadLength = push_bos->buffers[i].length;
6581 }
6582 }
6583 iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
6584 }
6585 #endif
6586
6587 void
6588 genX(emit_depth_state_workarounds)(struct iris_context *ice,
6589 struct iris_batch *batch,
6590 const struct isl_surf *surf)
6591 {
6592 #if INTEL_NEEDS_WA_1808121037
6593 const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
6594 surf->samples == 1;
6595
6596 switch (ice->state.genx->depth_reg_mode) {
6597 case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
6598 if (!is_d16_1x_msaa)
6599 return;
6600 break;
6601 case IRIS_DEPTH_REG_MODE_D16_1X_MSAA:
6602 if (is_d16_1x_msaa)
6603 return;
6604 break;
6605 case IRIS_DEPTH_REG_MODE_UNKNOWN:
6606 break;
6607 }
6608
6609 /* We'll change some CHICKEN registers depending on the depth surface
6610 * format. Do a depth flush and stall so the pipeline is not using these
6611 * settings while we change the registers.
6612 */
6613 iris_emit_end_of_pipe_sync(batch,
6614 "Workaround: Stop pipeline for Wa_1808121037",
6615 PIPE_CONTROL_DEPTH_STALL |
6616 PIPE_CONTROL_DEPTH_CACHE_FLUSH);
6617
6618 /* Wa_1808121037
6619 *
6620 * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
6621 * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
6622 */
6623 iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
6624 reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
6625 reg.HIZPlaneOptimizationdisablebitMask = true;
6626 }
6627
6628 ice->state.genx->depth_reg_mode =
6629 is_d16_1x_msaa ? IRIS_DEPTH_REG_MODE_D16_1X_MSAA :
6630 IRIS_DEPTH_REG_MODE_HW_DEFAULT;
6631 #endif
6632 }
6633
6634 /* Calculate TBIMR tiling parameters adequate for the current pipeline
6635 * setup. Return true if TBIMR should be enabled.
6636 */
6637 UNUSED static bool
6638 calculate_tile_dimensions(struct iris_context *ice,
6639 unsigned *tile_width, unsigned *tile_height)
6640 {
6641 struct iris_screen *screen = (void *)ice->ctx.screen;
6642 const struct intel_device_info *devinfo = screen->devinfo;
6643
6644 assert(GFX_VER == 12);
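   /* On Xe, CCS data occupies a fixed 1/aux_scale fraction of the main
    * surface size; it is added to the footprint below for any color or
    * depth surface using CCS.
    */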
6645 const unsigned aux_scale = ISL_MAIN_TO_CCS_SIZE_RATIO_XE;
6646
6647 /* Perform a rough calculation of the tile cache footprint of the
6648 * pixel pipeline, approximating it as the sum of the amount of
6649 * memory used per pixel by every render target, depth, stencil and
6650 * auxiliary surfaces bound to the pipeline.
6651 */
6652 unsigned pixel_size = 0;
6653
6654 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
6655
6656 if (cso->width == 0 || cso->height == 0)
6657 return false;
6658
6659 for (unsigned i = 0; i < cso->nr_cbufs; i++) {
6660 const struct iris_surface *surf = (void *)cso->cbufs[i];
6661
6662 if (surf) {
6663 const struct iris_resource *res = (void *)surf->base.texture;
6664
6665 pixel_size += intel_calculate_surface_pixel_size(&res->surf);
6666
6667 /* XXX - Pessimistic, in some cases it might be helpful to neglect
6668 * aux surface traffic.
6669 */
6670 if (ice->state.draw_aux_usage[i]) {
6671 pixel_size += intel_calculate_surface_pixel_size(&res->aux.surf);
6672
6673 if (isl_aux_usage_has_ccs(res->aux.usage)) {
6674 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6675 &res->surf), aux_scale);
6676 }
6677 }
6678 }
6679 }
6680
6681 if (cso->zsbuf) {
6682 struct iris_resource *zres;
6683 struct iris_resource *sres;
6684 iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres, &sres);
6685
6686 if (zres) {
6687 pixel_size += intel_calculate_surface_pixel_size(&zres->surf);
6688
6689 /* XXX - Pessimistic, in some cases it might be helpful to neglect
6690 * aux surface traffic.
6691 */
6692 if (iris_resource_level_has_hiz(devinfo, zres, cso->zsbuf->u.tex.level)) {
6693 pixel_size += intel_calculate_surface_pixel_size(&zres->aux.surf);
6694
6695 if (isl_aux_usage_has_ccs(zres->aux.usage)) {
6696 pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
6697 &zres->surf), aux_scale);
6698 }
6699 }
6700 }
6701
6702 if (sres) {
6703 pixel_size += intel_calculate_surface_pixel_size(&sres->surf);
6704 }
6705 }
6706
6707 /* Compute a tile layout that allows reasonable utilization of the
6708 * tile cache based on the per-pixel cache footprint estimated
6709 * above.
6710 */
6711 intel_calculate_tile_dimensions(devinfo, screen->l3_config_3d,
6712 32, 32, cso->width, cso->height, pixel_size,
6713 tile_width, tile_height);
6714
6715 /* Perform TBIMR tile passes only if the framebuffer covers more
6716 * than a single tile.
6717 */
6718 return *tile_width < cso->width || *tile_height < cso->height;
6719 }
6720
6721 static void
6722 iris_preemption_streamout_wa(struct iris_context *ice,
6723 struct iris_batch *batch,
6724 bool enable)
6725 {
6726 #if GFX_VERx10 >= 120
6727 if (!intel_needs_workaround(batch->screen->devinfo, 16013994831))
6728 return;
6729
6730 iris_emit_reg(batch, GENX(CS_CHICKEN1), reg) {
6731 reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !enable;
6732 reg.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
6733 }
6734
6735 /* Emit CS_STALL and 250 noops. */
6736 iris_emit_pipe_control_flush(batch, "workaround: Wa_16013994831",
6737 PIPE_CONTROL_CS_STALL);
6738 for (unsigned i = 0; i < 250; i++)
6739 iris_emit_cmd(batch, GENX(MI_NOOP), noop);
6740
6741 ice->state.genx->object_preemption = enable;
6742 #endif
6743 }
6744
6745 static void
6746 shader_program_uses_primitive_id(struct iris_context *ice,
6747 struct iris_batch *batch,
6748 struct iris_compiled_shader *shader,
6749 gl_shader_stage stage,
6750 bool *uses_primitive_id)
6751 {
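   /* Accumulate whether the TCS/TES stage being emitted, or the currently
    * bound GS, reads the hardware-generated primitive ID.
    */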
6752 switch (stage) {
6753 case MESA_SHADER_TESS_CTRL: {
6754 struct iris_tcs_data *tcs_data = iris_tcs_data(shader);
6755 *uses_primitive_id |= tcs_data->include_primitive_id;
6756 break;
6757 }
6758 case MESA_SHADER_TESS_EVAL: {
6759 struct iris_tes_data *tes_data = iris_tes_data(shader);
6760 *uses_primitive_id |= tes_data->include_primitive_id;
6761 break;
6762 }
6763 default:
6764 break;
6765 }
6766
6767 struct iris_compiled_shader *gs_shader =
6768 ice->shaders.prog[MESA_SHADER_GEOMETRY];
6769 const struct iris_gs_data *gs_data =
6770 gs_shader ? iris_gs_data(gs_shader) : NULL;
6771
6772 *uses_primitive_id |= gs_data && gs_data->include_primitive_id;
6773 }
6774
6775 static void
6776 emit_wa_18020335297_dummy_draw(struct iris_batch *batch)
6777 {
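   /* Program a minimal pipeline with clipping set to reject everything and
    * two dummy vertex elements, then issue one small trilist draw per slice
    * so the workaround touches every slice without producing pixels.
    */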
6778 #if GFX_VERx10 >= 125
6779 iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
6780 vfg.DistributionMode = RR_STRICT;
6781 }
6782 iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6783 vf.GeometryDistributionEnable = true;
6784 }
6785 #endif
6786
6787 #if GFX_VER >= 12
6788 iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
6789 pr.ReplicaMask = 1;
6790 }
6791 #endif
6792
6793 iris_emit_cmd(batch, GENX(3DSTATE_RASTER), rr) {
6794 rr.CullMode = CULLMODE_NONE;
6795 rr.FrontFaceFillMode = FILL_MODE_SOLID;
6796 rr.BackFaceFillMode = FILL_MODE_SOLID;
6797 }
6798
6799 iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { }
6800 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { }
6801
6802 #if GFX_VER >= 11
6803 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs2) { }
6804 #endif
6805
6806 iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) {
6807 clip.ClipEnable = true;
6808 clip.ClipMode = CLIPMODE_REJECT_ALL;
6809 }
6810
6811 iris_emit_cmd(batch, GENX(3DSTATE_VS), vs) { }
6812 iris_emit_cmd(batch, GENX(3DSTATE_GS), gs) { }
6813 iris_emit_cmd(batch, GENX(3DSTATE_HS), hs) { }
6814 iris_emit_cmd(batch, GENX(3DSTATE_TE), te) { }
6815 iris_emit_cmd(batch, GENX(3DSTATE_DS), ds) { }
6816 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so) { }
6817
6818 uint32_t vertex_elements[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)];
6819 uint32_t *ve_pack_dest = &vertex_elements[1];
6820
6821 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), vertex_elements, ve) {
6822 ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 -
6823 GENX(3DSTATE_VERTEX_ELEMENTS_length_bias);
6824 }
6825
6826 for (int i = 0; i < 2; i++) {
6827 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6828 ve.Valid = true;
6829 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
6830 ve.Component0Control = VFCOMP_STORE_0;
6831 ve.Component1Control = VFCOMP_STORE_0;
6832 ve.Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6833 ve.Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP;
6834 }
6835 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6836 }
6837
6838 iris_batch_emit(batch, vertex_elements, sizeof(uint32_t) *
6839 (1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)));
6840
6841 iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6842 topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
6843 }
6844
6845 /* Emit dummy draw per slice. */
6846 for (unsigned i = 0; i < batch->screen->devinfo->num_slices; i++) {
6847 iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6848 prim.VertexCountPerInstance = 3;
6849 prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
6850 prim.InstanceCount = 1;
6851 prim.VertexAccessType = SEQUENTIAL;
6852 }
6853 }
6854 }
6855
6856 static void
6857 iris_upload_dirty_render_state(struct iris_context *ice,
6858 struct iris_batch *batch,
6859 const struct pipe_draw_info *draw,
6860 bool skip_vb_params)
6861 {
6862 struct iris_screen *screen = batch->screen;
6863 struct iris_border_color_pool *border_color_pool =
6864 iris_bufmgr_get_border_color_pool(screen->bufmgr);
6865
6866 /* Re-emit 3DSTATE_DS before any 3DPRIMITIVE when tessellation is on */
6867 if (intel_needs_workaround(batch->screen->devinfo, 22018402687) &&
6868 ice->shaders.prog[MESA_SHADER_TESS_EVAL])
6869 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TES;
6870
6871 uint64_t dirty = ice->state.dirty;
6872 uint64_t stage_dirty = ice->state.stage_dirty;
6873
6874 if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
6875 !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
6876 return;
6877
6878 struct iris_genx_state *genx = ice->state.genx;
6879 struct iris_binder *binder = &ice->state.binder;
6880 struct iris_fs_data *fs_data =
6881 iris_fs_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]);
6882
6883 /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
6884 * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
6885 */
6886 bool needs_wa_14018912822 =
6887 screen->driconf.intel_enable_wa_14018912822 &&
6888 intel_needs_workaround(batch->screen->devinfo, 14018912822) &&
6889 util_framebuffer_get_num_samples(&ice->state.framebuffer) > 1;
6890
6891 if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
6892 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6893 uint32_t cc_vp_address;
6894 bool wa_18020335297_applied = false;
6895
6896 /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
6897 if (intel_needs_workaround(screen->devinfo, 18020335297) &&
6898 batch->name == IRIS_BATCH_RENDER &&
6899 ice->state.viewport_ptr_set) {
6900 emit_wa_18020335297_dummy_draw(batch);
6901 wa_18020335297_applied = true;
6902 }
6903
6904 /* XXX: could avoid streaming for depth_clip [0,1] case. */
6905 uint32_t *cc_vp_map =
6906 stream_state(batch, ice->state.dynamic_uploader,
6907 &ice->state.last_res.cc_vp,
6908 4 * ice->state.num_viewports *
6909 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
6910 for (int i = 0; i < ice->state.num_viewports; i++) {
6911 float zmin, zmax;
6912 iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
6913 ice->state.window_space_position,
6914 &zmin, &zmax);
6915 if (cso_rast->depth_clip_near)
6916 zmin = 0.0;
6917 if (cso_rast->depth_clip_far)
6918 zmax = 1.0;
6919
6920 iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
6921 ccv.MinimumDepth = zmin;
6922 ccv.MaximumDepth = zmax;
6923 }
6924
6925 cc_vp_map += GENX(CC_VIEWPORT_length);
6926 }
6927
6928 iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
6929 ptr.CCViewportPointer = cc_vp_address;
6930 }
6931
6932 if (wa_18020335297_applied) {
6933 #if GFX_VER >= 12
6934 iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) { }
6935 #endif
6936 /* Dirty all emitted WA state to make sure that current real
6937 * state is restored.
6938 */
6939 dirty |= IRIS_DIRTY_VFG |
6940 IRIS_DIRTY_VF |
6941 IRIS_DIRTY_RASTER |
6942 IRIS_DIRTY_VF_STATISTICS |
6943 IRIS_DIRTY_VF_SGVS |
6944 IRIS_DIRTY_CLIP |
6945 IRIS_DIRTY_STREAMOUT |
6946 IRIS_DIRTY_VERTEX_ELEMENTS |
6947 IRIS_DIRTY_VF_TOPOLOGY;
6948
6949 for (int stage = 0; stage < MESA_SHADER_FRAGMENT; stage++) {
6950 if (ice->shaders.prog[stage])
6951 stage_dirty |= (IRIS_STAGE_DIRTY_VS << stage);
6952 }
6953 }
6954 ice->state.viewport_ptr_set = true;
6955 }
6956
6957 if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
6958 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6959 int32_t x_min, y_min, x_max, y_max;
6960 uint32_t sf_cl_vp_address;
6961 uint32_t *vp_map =
6962 stream_state(batch, ice->state.dynamic_uploader,
6963 &ice->state.last_res.sf_cl_vp,
6964 4 * ice->state.num_viewports *
6965 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
6966
6967 x_min = ice->state.render_area.x;
6968 y_min = ice->state.render_area.y;
6969 x_max = ice->state.render_area.width;
6970 y_max = ice->state.render_area.height;
6971
6972 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
6973 const struct pipe_viewport_state *state = &ice->state.viewports[i];
6974 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
6975
6976 float vp_xmin = viewport_extent(state, 0, -1.0f);
6977 float vp_xmax = viewport_extent(state, 0, 1.0f);
6978 float vp_ymin = viewport_extent(state, 1, -1.0f);
6979 float vp_ymax = viewport_extent(state, 1, 1.0f);
6980
6981 intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
6982 state->scale[0], state->scale[1],
6983 state->translate[0], state->translate[1],
6984 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
6985
6986 iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
6987 vp.ViewportMatrixElementm00 = state->scale[0];
6988 vp.ViewportMatrixElementm11 = state->scale[1];
6989 vp.ViewportMatrixElementm22 = state->scale[2];
6990 vp.ViewportMatrixElementm30 = state->translate[0];
6991 vp.ViewportMatrixElementm31 = state->translate[1];
6992 vp.ViewportMatrixElementm32 = state->translate[2];
6993 vp.XMinClipGuardband = gb_xmin;
6994 vp.XMaxClipGuardband = gb_xmax;
6995 vp.YMinClipGuardband = gb_ymin;
6996 vp.YMaxClipGuardband = gb_ymax;
6997 vp.XMinViewPort = MAX2(vp_xmin, 0);
6998 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6999 vp.YMinViewPort = MAX2(vp_ymin, 0);
7000 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
7001 }
7002
7003 vp_map += GENX(SF_CLIP_VIEWPORT_length);
7004 }
7005
7006 iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
7007 ptr.SFClipViewportPointer = sf_cl_vp_address;
7008 }
7009 }
7010
7011 if (dirty & IRIS_DIRTY_URB) {
7012 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
7013 if (!ice->shaders.prog[i]) {
7014 ice->shaders.urb.cfg.size[i] = 1;
7015 } else {
7016 struct iris_vue_data *vue_data =
7017 iris_vue_data(ice->shaders.prog[i]);
7018 ice->shaders.urb.cfg.size[i] = vue_data->urb_entry_size;
7019 }
7020 assert(ice->shaders.urb.cfg.size[i] != 0);
7021 }
7022
7023 genX(emit_urb_config)(batch,
7024 ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
7025 ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL);
7026 }
7027
7028 if (dirty & IRIS_DIRTY_BLEND_STATE) {
7029 struct iris_blend_state *cso_blend = ice->state.cso_blend;
7030 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7031 struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7032
7033 bool color_blend_zero = false;
7034 bool alpha_blend_zero = false;
7035
7036 /* Always write at least one BLEND_STATE - the final RT message will
7037 * reference BLEND_STATE[0] even if there aren't color writes. There
7038 * may still be alpha testing, computed depth, and so on.
7039 */
7040 const int rt_dwords =
7041 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
7042
7043 uint32_t blend_offset;
7044 uint32_t *blend_map =
7045 stream_state(batch, ice->state.dynamic_uploader,
7046 &ice->state.last_res.blend,
7047 96, 64, &blend_offset);
7048
7049 /* Copy of blend entries for merging dynamic changes. */
7050 uint32_t blend_entries[4 * rt_dwords];
7051 memcpy(blend_entries, &cso_blend->blend_state[1], sizeof(blend_entries));
7052
7053 unsigned cbufs = MAX2(cso_fb->nr_cbufs, 1);
7054
7055 uint32_t *blend_entry = blend_entries;
7056 for (unsigned i = 0; i < cbufs; i++) {
7057 int dst_blend_factor = cso_blend->ps_dst_blend_factor[i];
7058 int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[i];
7059 uint32_t entry[GENX(BLEND_STATE_ENTRY_length)];
7060 iris_pack_state(GENX(BLEND_STATE_ENTRY), entry, be) {
7061 if (needs_wa_14018912822) {
7062 if (dst_blend_factor == BLENDFACTOR_ZERO) {
7063 dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7064 color_blend_zero = true;
7065 }
7066 if (dst_alpha_blend_factor == BLENDFACTOR_ZERO) {
7067 dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7068 alpha_blend_zero = true;
7069 }
7070 }
7071 be.DestinationBlendFactor = dst_blend_factor;
7072 be.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7073 }
7074
7075 /* Merge entry. */
7076 uint32_t *dst = blend_entry;
7077 uint32_t *src = entry;
7078 for (unsigned j = 0; j < GENX(BLEND_STATE_ENTRY_length); j++)
7079 *dst++ |= *src++;
7080
7081 blend_entry += GENX(BLEND_STATE_ENTRY_length);
7082 }
7083
7084 /* Blend constants modified for Wa_14018912822. */
7085 if (ice->state.color_blend_zero != color_blend_zero) {
7086 ice->state.color_blend_zero = color_blend_zero;
7087 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7088 }
7089 if (ice->state.alpha_blend_zero != alpha_blend_zero) {
7090 ice->state.alpha_blend_zero = alpha_blend_zero;
7091 ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
7092 }
7093
7094 uint32_t blend_state_header;
7095 iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
7096 bs.AlphaTestEnable = cso_zsa->alpha_enabled;
7097 bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
7098 }
7099
7100 blend_map[0] = blend_state_header | cso_blend->blend_state[0];
7101 memcpy(&blend_map[1], blend_entries, 4 * rt_dwords);
7102
7103 iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
7104 ptr.BlendStatePointer = blend_offset;
7105 ptr.BlendStatePointerValid = true;
7106 }
7107 }
7108
7109 if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
7110 struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7111 #if GFX_VER == 8
7112 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7113 #endif
7114 uint32_t cc_offset;
7115 void *cc_map =
7116 stream_state(batch, ice->state.dynamic_uploader,
7117 &ice->state.last_res.color_calc,
7118 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
7119 64, &cc_offset);
7120 iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
7121 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
7122 cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
7123 cc.BlendConstantColorRed = ice->state.color_blend_zero ?
7124 0.0 : ice->state.blend_color.color[0];
7125 cc.BlendConstantColorGreen = ice->state.color_blend_zero ?
7126 0.0 : ice->state.blend_color.color[1];
7127 cc.BlendConstantColorBlue = ice->state.color_blend_zero ?
7128 0.0 : ice->state.blend_color.color[2];
7129 cc.BlendConstantColorAlpha = ice->state.alpha_blend_zero ?
7130 0.0 : ice->state.blend_color.color[3];
7131 #if GFX_VER == 8
7132 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
7133 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7134 #endif
7135 }
7136 iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7137 ptr.ColorCalcStatePointer = cc_offset;
7138 ptr.ColorCalcStatePointerValid = true;
7139 }
7140 }
7141
7142 #if GFX_VERx10 == 125
7143 if (dirty & (IRIS_DIRTY_RENDER_BUFFER | IRIS_DIRTY_DEPTH_BUFFER)) {
7144 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7145 unsigned tile_width, tile_height;
7146
7147 ice->state.use_tbimr = batch->screen->driconf.enable_tbimr &&
7148 calculate_tile_dimensions(ice, &tile_width, &tile_height);
7149
7150 if (ice->state.use_tbimr) {
7151 /* Use a batch size of 128 polygons per slice as recommended
7152 * by BSpec 68436 "TBIMR Programming".
7153 */
7154 const unsigned num_slices = screen->devinfo->num_slices;
7155 const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
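         /* TBIMRBatchSize takes log2(batch size in polygons) - 5, so e.g. a
          * 256-polygon batch is programmed as 3.
          */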
7156
7157 iris_emit_cmd(batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO), tbimr) {
7158 tbimr.TileRectangleHeight = tile_height;
7159 tbimr.TileRectangleWidth = tile_width;
7160 tbimr.VerticalTileCount = DIV_ROUND_UP(cso_fb->height, tile_height);
7161 tbimr.HorizontalTileCount = DIV_ROUND_UP(cso_fb->width, tile_width);
7162 tbimr.TBIMRBatchSize = util_logbase2(batch_size) - 5;
7163 tbimr.TileBoxCheck = true;
7164 }
7165 }
7166 }
7167 #endif
7168
7169 /* Wa_1604061319
7170 *
7171 * 3DSTATE_CONSTANT_* needs to be programmed before BTP_*
7172 *
7173 * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
7174 * any stage has a dirty binding table.
7175 */
7176 const bool emit_const_wa = GFX_VER >= 11 &&
7177 ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
7178 (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
7179
7180 #if GFX_VER >= 12
7181 uint32_t nobuffer_stages = 0;
7182 #endif
7183
7184 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7185 if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
7186 !emit_const_wa)
7187 continue;
7188
7189 struct iris_shader_state *shs = &ice->state.shaders[stage];
7190 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7191
7192 if (!shader)
7193 continue;
7194
7195 if (shs->sysvals_need_upload)
7196 upload_sysvals(ice, stage, NULL);
7197
7198 struct push_bos push_bos = {};
7199 setup_constant_buffers(ice, batch, stage, &push_bos);
7200
7201 #if GFX_VER >= 12
7202 /* If this stage doesn't have any push constants, emit it later in a
7203 * single CONSTANT_ALL packet with all the other stages.
7204 */
7205 if (push_bos.buffer_count == 0) {
7206 nobuffer_stages |= 1 << stage;
7207 continue;
7208 }
7209
7210 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
7211 * contains only 5 bits, so we can only use it for buffers smaller than
7212 * 32.
7213 *
7214 * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
7215 * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
7216 * for disabling stages, where all address bits are zero. However, we
7217 * can't safely use it for general buffers with arbitrary addresses.
7218 * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
7219 * case.
7220 */
7221 if (push_bos.max_length < 32 && GFX_VERx10 > 120) {
7222 emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
7223 continue;
7224 }
7225 #endif
7226 emit_push_constant_packets(ice, batch, stage, &push_bos);
7227 }
7228
7229 #if GFX_VER >= 12
7230 if (nobuffer_stages)
7231 /* Wa_16011448509: all address bits are zero */
7232 emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
7233 #endif
7234
7235 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7236 /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
7237 * in order to commit constants. TODO: Investigate "Disable Gather
7238 * at Set Shader" to go back to legacy mode...
7239 */
7240 if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
7241 (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
7242 << stage)) {
7243 iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
7244 ptr._3DCommandSubOpcode = 38 + stage;
7245 ptr.PointertoVSBindingTable =
7246 binder->bt_offset[stage] >> IRIS_BT_OFFSET_SHIFT;
7247 }
7248 }
7249 }
7250
7251 if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
7252 // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
7253 // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
7254
7255 /* The PIPE_CONTROL command description says:
7256 *
7257 * "Whenever a Binding Table Index (BTI) used by a Render Target
7258 * Message points to a different RENDER_SURFACE_STATE, SW must issue a
7259 * Render Target Cache Flush by enabling this bit. When render target
7260 * flush is set due to new association of BTI, PS Scoreboard Stall bit
7261 * must be set in this packet."
7262 */
7263 // XXX: does this need to happen at 3DSTATE_BTP_PS time?
7264 iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
7265 PIPE_CONTROL_RENDER_TARGET_FLUSH |
7266 PIPE_CONTROL_STALL_AT_SCOREBOARD);
7267 }
7268
7269 if (dirty & IRIS_DIRTY_RENDER_BUFFER)
7270 trace_framebuffer_state(&batch->trace, NULL, &ice->state.framebuffer);
7271
7272 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7273 if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
7274 iris_populate_binding_table(ice, batch, stage, false);
7275 }
7276 }
7277
7278 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7279 if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
7280 !ice->shaders.prog[stage])
7281 continue;
7282
7283 iris_upload_sampler_states(ice, stage);
7284
7285 struct iris_shader_state *shs = &ice->state.shaders[stage];
7286 struct pipe_resource *res = shs->sampler_table.res;
7287 if (res)
7288 iris_use_pinned_bo(batch, iris_resource_bo(res), false,
7289 IRIS_DOMAIN_NONE);
7290
7291 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
7292 ptr._3DCommandSubOpcode = 43 + stage;
7293 ptr.PointertoVSSamplerState = shs->sampler_table.offset;
7294 }
7295 }
7296
7297 if (ice->state.need_border_colors)
7298 iris_use_pinned_bo(batch, border_color_pool->bo, false, IRIS_DOMAIN_NONE);
7299
7300 if (dirty & IRIS_DIRTY_MULTISAMPLE) {
7301 iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
7302 ms.PixelLocation =
7303 ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
7304 if (ice->state.framebuffer.samples > 0)
7305 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
7306 }
7307 }
7308
7309 if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
7310 iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
7311 ms.SampleMask = ice->state.sample_mask;
7312 }
7313 }
7314
7315 #if GFX_VERx10 >= 125
7316 /* This is only used on >= gfx125 for dynamic 3DSTATE_TE and
7317 * 3DSTATE_VFG emission related workarounds.
7318 */
7319 bool program_uses_primitive_id = false;
7320
7321 /* Check if FS stage will use primitive ID overrides. */
7322 const struct intel_vue_map *last_vue_map =
7323 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7324 if ((fs_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
7325 last_vue_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1) {
7326 program_uses_primitive_id = true;
7327 }
7328 #endif
7329
7330 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
7331 if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
7332 continue;
7333
7334 struct iris_compiled_shader *shader = ice->shaders.prog[stage];
7335
7336 if (shader) {
7337 struct iris_resource *cache = (void *) shader->assembly.res;
7338 iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
7339
7340 uint32_t scratch_addr =
7341 pin_scratch_space(ice, batch, shader, stage);
7342
7343 #if GFX_VERx10 >= 125
7344 shader_program_uses_primitive_id(ice, batch, shader, stage,
7345 &program_uses_primitive_id);
7346 #endif
7347
7348 if (stage == MESA_SHADER_FRAGMENT) {
7349 UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
7350 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7351
7352 uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
7353 _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
7354 #if GFX_VER >= 9
7355 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(shader->brw_prog_data);
7356 #else
7357 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(shader->elk_prog_data);
7358 #endif
7359 intel_set_ps_dispatch_state(&ps, batch->screen->devinfo,
7360 wm_prog_data, util_framebuffer_get_num_samples(cso_fb),
7361 0 /* msaa_flags */);
7362
7363 #if GFX_VER == 12
7364 assert(fs_data->dispatch_multi == 0 ||
7365 (fs_data->dispatch_multi == 16 && fs_data->max_polygons == 2));
7366 ps.DualSIMD8DispatchEnable = fs_data->dispatch_multi;
7367 /* XXX - No major improvement observed from enabling
7368 * overlapping subspans, but it could be helpful
7369 * in theory when the requirements listed on the
7370 * BSpec page for 3DSTATE_PS_BODY are met.
7371 */
7372 ps.OverlappingSubspansEnable = false;
7373 #endif
7374
7375 #if GFX_VER >= 9
7376 ps.DispatchGRFStartRegisterForConstantSetupData0 =
7377 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7378 ps.DispatchGRFStartRegisterForConstantSetupData1 =
7379 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7380 #if GFX_VER < 20
7381 ps.DispatchGRFStartRegisterForConstantSetupData2 =
7382 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7383 #endif
7384
7385 ps.KernelStartPointer0 = KSP(shader) +
7386 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7387 ps.KernelStartPointer1 = KSP(shader) +
7388 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7389 #if GFX_VER < 20
7390 ps.KernelStartPointer2 = KSP(shader) +
7391 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7392 #endif
7393 #else
7394 ps.DispatchGRFStartRegisterForConstantSetupData0 =
7395 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
7396 ps.DispatchGRFStartRegisterForConstantSetupData1 =
7397 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
7398 ps.DispatchGRFStartRegisterForConstantSetupData2 =
7399 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
7400
7401 ps.KernelStartPointer0 = KSP(shader) +
7402 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
7403 ps.KernelStartPointer1 = KSP(shader) +
7404 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
7405 ps.KernelStartPointer2 = KSP(shader) +
7406 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
7407 #endif
7408
7409 #if GFX_VERx10 >= 125
7410 ps.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7411 #else
7412 ps.ScratchSpaceBasePointer =
7413 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7414 #endif
7415 }
7416
7417 uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
7418 iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
7419 #if GFX_VER >= 9
7420 if (!fs_data->uses_sample_mask)
7421 psx.InputCoverageMaskState = ICMS_NONE;
7422 else if (fs_data->post_depth_coverage)
7423 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
7424 else if (fs_data->inner_coverage &&
7425 cso->conservative_rasterization)
7426 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
7427 else
7428 psx.InputCoverageMaskState = ICMS_NORMAL;
7429 #else
7430 psx.PixelShaderUsesInputCoverageMask =
7431 fs_data->uses_sample_mask;
7432 #endif
7433 }
7434
7435 uint32_t *shader_ps = (uint32_t *) shader->derived_data;
7436 uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
7437 iris_emit_merge(batch, shader_ps, ps_state,
7438 GENX(3DSTATE_PS_length));
7439 iris_emit_merge(batch, shader_psx, psx_state,
7440 GENX(3DSTATE_PS_EXTRA_length));
7441 #if GFX_VERx10 >= 125
7442 } else if (stage == MESA_SHADER_TESS_EVAL) {
7443 uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
7444 iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
7445 if (intel_needs_workaround(screen->devinfo, 14015055625) &&
7446 program_uses_primitive_id)
7447 te.TessellationDistributionMode = TEDMODE_OFF;
7448 else if (intel_needs_workaround(screen->devinfo, 22012699309))
7449 te.TessellationDistributionMode = TEDMODE_RR_STRICT;
7450 else
7451 te.TessellationDistributionMode = TEDMODE_RR_FREE;
7452 }
7453
7454 uint32_t ds_state[GENX(3DSTATE_DS_length)] = { 0 };
7455 iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
7456 if (scratch_addr)
7457 ds.ScratchSpaceBuffer =
7458 scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
7459 }
7460
7461 uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7462 uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7463
7464 iris_emit_merge(batch, shader_ds, ds_state,
7465 GENX(3DSTATE_DS_length));
7466 iris_emit_merge(batch, shader_te, te_state,
7467 GENX(3DSTATE_TE_length));
7468 #endif
7469 } else if (scratch_addr) {
7470 uint32_t *pkt = (uint32_t *) shader->derived_data;
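            /* Merge the scratch pointer into the prepacked 3DSTATE_XS command
             * stored in the shader's derived data and emit the result.
             */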
7471 switch (stage) {
7472 case MESA_SHADER_VERTEX: MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
7473 case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
7474 case MESA_SHADER_TESS_EVAL: {
7475 uint32_t *shader_ds = (uint32_t *) shader->derived_data;
7476 uint32_t *shader_te = shader_ds + GENX(3DSTATE_DS_length);
7477 iris_batch_emit(batch, shader_te, 4 * GENX(3DSTATE_TE_length));
7478 MERGE_SCRATCH_ADDR(3DSTATE_DS);
7479 break;
7480 }
7481 case MESA_SHADER_GEOMETRY: MERGE_SCRATCH_ADDR(3DSTATE_GS); break;
7482 }
7483 } else {
7484 iris_batch_emit(batch, shader->derived_data,
7485 iris_derived_program_state_size(stage));
7486 }
7487 } else {
7488 if (stage == MESA_SHADER_TESS_EVAL) {
7489 iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7490 iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
7491 iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7492 } else if (stage == MESA_SHADER_GEOMETRY) {
7493 iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
7494 }
7495 }
7496 }
7497
7498 #if GFX_VERx10 >= 125
7499 /* Inspect program_uses_primitive_id state and dirty VFG if required. */
7500 if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
7501 program_uses_primitive_id != ice->state.uses_primitive_id) {
7502 dirty |= IRIS_DIRTY_VFG;
7503 ice->state.uses_primitive_id = program_uses_primitive_id;
7504 }
7505 #endif
7506
7507 if (ice->state.streamout_active) {
7508 if (dirty & IRIS_DIRTY_SO_BUFFERS) {
7509 /* Wa_16011411144
7510 * SW must insert a PIPE_CONTROL cmd before and after the
7511 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_* state is
7512 * not combined with other state changes.
7513 */
7514 if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7515 iris_emit_pipe_control_flush(batch,
7516 "SO pre change stall WA",
7517 PIPE_CONTROL_CS_STALL);
7518 }
7519
7520 for (int i = 0; i < 4; i++) {
7521 struct iris_stream_output_target *tgt =
7522 (void *) ice->state.so_target[i];
7523 enum { dwords = GENX(3DSTATE_SO_BUFFER_length) };
7524 uint32_t *so_buffers = genx->so_buffers + i * dwords;
7525 bool zero_offset = false;
7526
7527 if (tgt) {
7528 zero_offset = tgt->zero_offset;
7529 iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
7530 true, IRIS_DOMAIN_OTHER_WRITE);
7531 iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
7532 true, IRIS_DOMAIN_OTHER_WRITE);
7533 }
7534
7535 if (zero_offset) {
7536 /* Skip the last DWord which contains "Stream Offset" of
7537 * 0xFFFFFFFF and instead emit a dword of zero directly.
7538 */
7539 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
7540 32 * (dwords - 1));
7541 const uint32_t zero = 0;
7542 iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
7543 iris_batch_emit(batch, &zero, sizeof(zero));
7544 tgt->zero_offset = false;
7545 } else {
7546 iris_batch_emit(batch, so_buffers, 4 * dwords);
7547 }
7548 }
7549
7550 /* Wa_16011411144 */
7551 if (intel_device_info_is_dg2(batch->screen->devinfo)) {
7552 iris_emit_pipe_control_flush(batch,
7553 "SO post change stall WA",
7554 PIPE_CONTROL_CS_STALL);
7555 }
7556 }
7557
7558 if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
7559 /* Wa_16011773973:
7560 * If SOL is enabled and SO_DECL state has to be programmed,
7561 * 1. Send 3D State SOL state with SOL disabled
7562 * 2. Send SO_DECL NP state
7563 * 3. Send 3D State SOL with SOL Enabled
7564 */
7565 if (intel_device_info_is_dg2(batch->screen->devinfo))
7566 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7567
7568 uint32_t *decl_list =
7569 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
7570 iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
7571
7572 #if GFX_VER >= 11 && GFX_VER < 20
7573 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7574 * 3DSTATE_SO_DECL_LIST:
7575 *
7576 * "Workaround: This command must be followed by a PIPE_CONTROL
7577 * with CS Stall bit set."
7578 *
7579 * On DG2+ also known as Wa_1509820217.
7580 */
7581 iris_emit_pipe_control_flush(batch,
7582 "workaround: cs stall after so_decl",
7583 PIPE_CONTROL_CS_STALL);
7584 #endif
7585 }
7586
7587 if (dirty & IRIS_DIRTY_STREAMOUT) {
7588 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7589
7590 #if GFX_VERx10 >= 120
7591 /* Wa_16013994831 - Disable preemption. */
7592 if (intel_needs_workaround(batch->screen->devinfo, 16013994831))
7593 iris_preemption_streamout_wa(ice, batch, false);
7594 #endif
7595
7596 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
7597 iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
7598 sol.SOFunctionEnable = true;
7599 sol.SOStatisticsEnable = true;
7600
7601 sol.RenderingDisable = cso_rast->rasterizer_discard &&
7602 !ice->state.prims_generated_query_active;
7603 sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
7604
7605
7606 #if INTEL_NEEDS_WA_18022508906
7607 /* Wa_14017076903 :
7608 *
7609 * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
7610 *
7611 * SOL_INT::Render_Enable =
7612 * (3DSTATE_STREAMOUT::Force_Rending == Force_On) ||
7613 * (
7614 * (3DSTATE_STREAMOUT::Force_Rending != Force_Off) &&
7615 * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
7616 * !3DSTATE_STREAMOUT::API_Render_Disable &&
7617 * (
7618 * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
7619 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
7620 * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
7621 * 3DSTATE_PS_EXTRA::PS_Valid ||
7622 * 3DSTATE_WM::Legacy Depth_Buffer_Clear ||
7623 * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
7624 * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
7625 * )
7626 * )
7627 *
7628 * If SOL_INT::Render_Enable is false, the SO stage will not forward any
7629 * topologies down the pipeline, which is not what we want for occlusion
7630 * queries.
7631 *
7632 * Here we force rendering to get SOL_INT::Render_Enable when occlusion
7633 * queries are active.
7634 */
7635 const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7636 if (!cso_rast->rasterizer_discard && ice->state.occlusion_query_active)
7637 sol.ForceRendering = Force_on;
7638 #endif
7639 }
7640
7641 assert(ice->state.streamout);
7642
7643 iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
7644 GENX(3DSTATE_STREAMOUT_length));
7645 }
7646 } else {
7647 if (dirty & IRIS_DIRTY_STREAMOUT) {
7648
7649 #if GFX_VERx10 >= 120
7650 /* Wa_16013994831 - Enable preemption. */
7651 if (!ice->state.genx->object_preemption)
7652 iris_preemption_streamout_wa(ice, batch, true);
7653 #endif
7654
7655 iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
7656 }
7657 }
7658
7659 if (dirty & IRIS_DIRTY_CLIP) {
7660 struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
7661 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7662
7663 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
7664 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7665 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
7666 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
7667 : ice->state.prim_is_points_or_lines);
7668 const struct intel_vue_map *last =
7669 &iris_vue_data(ice->shaders.last_vue_shader)->vue_map;
7670
7671 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
7672 iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
7673 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
7674 if (cso_rast->rasterizer_discard)
7675 cl.ClipMode = CLIPMODE_REJECT_ALL;
7676 else if (ice->state.window_space_position)
7677 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
7678 else
7679 cl.ClipMode = CLIPMODE_NORMAL;
7680
7681 cl.PerspectiveDivideDisable = ice->state.window_space_position;
7682 cl.ViewportXYClipTestEnable = !points_or_lines;
7683
7684 cl.NonPerspectiveBarycentricEnable = fs_data->uses_nonperspective_interp_modes;
7685
7686 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1 ||
7687 !(last->slots_valid & VARYING_BIT_LAYER);
7688 cl.MaximumVPIndex = ice->state.num_viewports - 1;
7689 }
7690 iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
7691 ARRAY_SIZE(cso_rast->clip));
7692 }
7693
7694 if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
7695 /* From the Broadwell PRM, Volume 2, documentation for
7696 * 3DSTATE_RASTER, "Antialiasing Enable":
7697 *
7698 * "This field must be disabled if any of the render targets
7699 * have integer (UINT or SINT) surface format."
7700 *
7701 * Additionally internal documentation for Gfx12+ states:
7702 *
7703 * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
7704 * FORCED_SAMPLE_COUNT > 1."
7705 */
7706 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7707 unsigned samples = util_framebuffer_get_num_samples(cso_fb);
7708 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7709
7710 bool aa_enable = cso->line_smooth &&
7711 !ice->state.has_integer_rt &&
7712 !(batch->screen->devinfo->ver >= 12 && samples > 1);
7713
7714 uint32_t dynamic_raster[GENX(3DSTATE_RASTER_length)];
7715 iris_pack_command(GENX(3DSTATE_RASTER), &dynamic_raster, raster) {
7716 raster.AntialiasingEnable = aa_enable;
7717 }
7718 iris_emit_merge(batch, cso->raster, dynamic_raster,
7719 ARRAY_SIZE(cso->raster));
7720
7721 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7722 iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7723 sf.ViewportTransformEnable = !ice->state.window_space_position;
7724
7725 #if GFX_VER >= 12
7726 sf.DerefBlockSize = ice->state.urb_deref_block_size;
7727 #endif
7728 }
7729 iris_emit_merge(batch, cso->sf, dynamic_sf,
7730 ARRAY_SIZE(dynamic_sf));
7731 }
7732
7733 if (dirty & IRIS_DIRTY_WM) {
7734 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7735 uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
7736
7737 iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
7738 wm.StatisticsEnable = ice->state.statistics_counters_enabled;
7739
7740 wm.BarycentricInterpolationMode =
7741 iris_fs_barycentric_modes(ice->shaders.prog[MESA_SHADER_FRAGMENT], 0);
7742
7743 if (fs_data->early_fragment_tests)
7744 wm.EarlyDepthStencilControl = EDSC_PREPS;
7745 else if (fs_data->has_side_effects)
7746 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7747 else
7748 wm.EarlyDepthStencilControl = EDSC_NORMAL;
7749
7750 /* We could skip this bit if color writes are enabled. */
7751 if (fs_data->has_side_effects || fs_data->uses_kill)
7752 wm.ForceThreadDispatchEnable = ForceON;
7753 }
7754 iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
7755 }
7756
7757 if (dirty & IRIS_DIRTY_SBE) {
7758 iris_emit_sbe(batch, ice);
7759 }
7760
7761 if (dirty & IRIS_DIRTY_PS_BLEND) {
7762 struct iris_blend_state *cso_blend = ice->state.cso_blend;
7763 struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7764 const struct shader_info *fs_info =
7765 iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7766
7767 int dst_blend_factor = cso_blend->ps_dst_blend_factor[0];
7768 int dst_alpha_blend_factor = cso_blend->ps_dst_alpha_blend_factor[0];
7769
7770 /* When MSAA is enabled, instead of using BLENDFACTOR_ZERO use
7771 * CONST_COLOR, CONST_ALPHA and supply zero by using blend constants.
7772 */
7773 if (needs_wa_14018912822) {
7774 if (ice->state.color_blend_zero)
7775 dst_blend_factor = BLENDFACTOR_CONST_COLOR;
7776 if (ice->state.alpha_blend_zero)
7777 dst_alpha_blend_factor = BLENDFACTOR_CONST_ALPHA;
7778 }
7779
7780 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7781 iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7782 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7783 pb.AlphaTestEnable = cso_zsa->alpha_enabled;
7784
7785 pb.DestinationBlendFactor = dst_blend_factor;
7786 pb.DestinationAlphaBlendFactor = dst_alpha_blend_factor;
7787
7788 /* The dual source blending docs caution against using SRC1 factors
7789 * when the shader doesn't use a dual source render target write.
7790 * Empirically, this can lead to GPU hangs, and the results are
7791 * undefined anyway, so simply disable blending to avoid the hang.
7792 */
7793 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7794 (!cso_blend->dual_color_blending || fs_data->dual_src_blend);
7795 }
7796
7797 iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7798 ARRAY_SIZE(cso_blend->ps_blend));
7799 }
7800
7801 if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
7802 struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
7803 #if GFX_VER >= 9 && GFX_VER < 12
7804 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7805 uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7806 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7807 wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7808 wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7809 }
7810 iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
7811 #else
7812 /* Use modify disable fields which allow us to emit packets
7813 * directly instead of merging them later.
7814 */
7815 iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
7816 #endif
7817
7818 /* Depth or stencil write changed in cso. */
7819 if (intel_needs_workaround(batch->screen->devinfo, 18019816803) &&
7820 (dirty & IRIS_DIRTY_DS_WRITE_ENABLE)) {
7821 iris_emit_pipe_control_flush(
7822 batch, "workaround: PSS stall after DS write enable change",
7823 PIPE_CONTROL_PSS_STALL_SYNC);
7824 }
7825
7826 #if GFX_VER >= 12
7827 iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
7828 #endif
7829 }
7830
7831 if (dirty & IRIS_DIRTY_STENCIL_REF) {
7832 #if GFX_VER >= 12
7833 /* Use modify disable fields which allow us to emit packets
7834 * directly instead of merging them later.
7835 */
7836 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
7837 uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
7838 iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
7839 wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
7840 wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
7841 wmds.StencilTestMaskModifyDisable = true;
7842 wmds.StencilWriteMaskModifyDisable = true;
7843 wmds.StencilStateModifyDisable = true;
7844 wmds.DepthStateModifyDisable = true;
7845 }
7846 iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
7847 #endif
7848 }
7849
7850 if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
7851 /* Wa_1409725701:
7852 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
7853 * stored as an array of up to 16 elements. The location of first
7854 * element of the array, as specified by Pointer to SCISSOR_RECT,
7855 * should be aligned to a 64-byte boundary."
7856 */
7857 uint32_t alignment = 64;
7858 uint32_t scissor_offset =
7859 emit_state(batch, ice->state.dynamic_uploader,
7860 &ice->state.last_res.scissor,
7861 ice->state.scissors,
7862 sizeof(struct pipe_scissor_state) *
7863 ice->state.num_viewports, alignment);
7864
7865 iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7866 ptr.ScissorRectPointer = scissor_offset;
7867 }
7868 }
7869
7870 if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
7871 struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
7872
7873 /* Do not emit the cso yet. We may need to update clear params first. */
7874 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7875 struct iris_resource *zres = NULL, *sres = NULL;
7876 if (cso_fb->zsbuf) {
7877 iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
7878 &zres, &sres);
7879 }
7880
7881 if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
7882 #if GFX_VER < 20
7883 uint32_t *clear_params =
7884 cso_z->packets + ARRAY_SIZE(cso_z->packets) -
7885 GENX(3DSTATE_CLEAR_PARAMS_length);
7886
7887 iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
7888 clear.DepthClearValueValid = true;
7889 clear.DepthClearValue = zres->aux.clear_color.f32[0];
7890 }
7891 #endif
7892 }
7893
7894 iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
7895
7896 if (intel_needs_workaround(batch->screen->devinfo, 1408224581) ||
7897 intel_needs_workaround(batch->screen->devinfo, 14014097488) ||
7898 intel_needs_workaround(batch->screen->devinfo, 14016712196)) {
7899 /* Wa_1408224581
7900 *
7901 * Workaround: Gfx12LP Astep only An additional pipe control with
7902 * post-sync = store dword operation would be required.( w/a is to
7903 * have an additional pipe control after the stencil state whenever
7904 * the surface state bits of this state is changing).
7905 *
7906 * This also seems sufficient to handle Wa_14014097488 and
7907 * Wa_14016712196.
7908 */
7909 iris_emit_pipe_control_write(batch, "WA for depth/stencil state",
7910 PIPE_CONTROL_WRITE_IMMEDIATE,
7911 screen->workaround_address.bo,
7912 screen->workaround_address.offset, 0);
7913 }
7914
7915 if (zres)
7916 genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
7917 }
7918
7919 if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
7920 /* Listen for buffer changes, and also write enable changes. */
7921 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
7922 pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
7923 }
7924
7925 if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
7926 iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7927 for (int i = 0; i < 32; i++) {
7928 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7929 }
7930 }
7931 }
7932
7933 if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
7934 struct iris_rasterizer_state *cso = ice->state.cso_rast;
7935 iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7936 #if GFX_VER >= 11
7937 /* ICL PRMs, Volume 2a - Command Reference: Instructions,
7938 * 3DSTATE_LINE_STIPPLE:
7939 *
7940 * "Workaround: This command must be followed by a PIPE_CONTROL with
7941 * CS Stall bit set."
7942 */
7943 iris_emit_pipe_control_flush(batch,
7944 "workaround: post 3DSTATE_LINE_STIPPLE",
7945 PIPE_CONTROL_CS_STALL);
7946 #endif
7947 }
7948
7949 if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
7950 iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7951 topo.PrimitiveTopologyType =
7952 translate_prim_type(draw->mode, ice->state.vertices_per_patch);
7953 }
7954 }
7955
7956 if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
7957 int count = util_bitcount64(ice->state.bound_vertex_buffers);
7958 uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
7959
7960 if (ice->state.vs_uses_draw_params && !skip_vb_params) {
7961 assert(ice->draw.draw_params.res);
7962
7963 struct iris_vertex_buffer_state *state =
7964 &(ice->state.genx->vertex_buffers[count]);
7965 pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
7966 struct iris_resource *res = (void *) state->resource;
7967
7968 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7969 vb.VertexBufferIndex = count;
7970 vb.AddressModifyEnable = true;
7971 vb.BufferPitch = 0;
7972 vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
7973 vb.BufferStartingAddress =
7974 ro_bo(NULL, res->bo->address +
7975 (int) ice->draw.draw_params.offset);
7976 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
7977 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
7978 #if GFX_VER >= 12
7979 vb.L3BypassDisable = true;
7980 #endif
7981 }
7982 dynamic_bound |= 1ull << count;
7983 count++;
7984 }
7985
7986 if (ice->state.vs_uses_derived_draw_params && !skip_vb_params) {
7987 struct iris_vertex_buffer_state *state =
7988 &(ice->state.genx->vertex_buffers[count]);
7989 pipe_resource_reference(&state->resource,
7990 ice->draw.derived_draw_params.res);
7991 struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
7992
7993 iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
7994 vb.VertexBufferIndex = count;
7995 vb.AddressModifyEnable = true;
7996 vb.BufferPitch = 0;
7997 vb.BufferSize =
7998 res->bo->size - ice->draw.derived_draw_params.offset;
7999 vb.BufferStartingAddress =
8000 ro_bo(NULL, res->bo->address +
8001 (int) ice->draw.derived_draw_params.offset);
8002 vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
8003 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8004 #if GFX_VER >= 12
8005 vb.L3BypassDisable = true;
8006 #endif
8007 }
8008 dynamic_bound |= 1ull << count;
8009 count++;
8010 }
8011
8012 if (count) {
8013 #if GFX_VER >= 11
8014 /* Gfx11+ doesn't need the cache workaround below */
8015 uint64_t bound = dynamic_bound;
8016 while (bound) {
8017 const int i = u_bit_scan64(&bound);
8018 iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
8019 false, IRIS_DOMAIN_VF_READ);
8020 }
8021 #else
8022 /* The VF cache designers cut corners, and made the cache key's
8023 * <VertexBufferIndex, Memory Address> tuple only consider the bottom
8024 * 32 bits of the address. If you have two vertex buffers which get
8025 * placed exactly 4 GiB apart and use them in back-to-back draw calls,
8026 * you can get collisions (even within a single batch).
8027 *
8028 * So, we need to do a VF cache invalidate if the buffer for a VB
8029 * slot changes [48:32] address bits from the previous time.
8030 */
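/* As a concrete (hypothetical) illustration: vertex buffers placed at
 * 0x1_0000_0000 and 0x2_0000_0000 differ only in their upper address bits,
 * so their VF cache keys would collide. Tracking the per-slot high bits
 * lets us invalidate only when those bits actually change.
 */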
8031 unsigned flush_flags = 0;
8032
8033 uint64_t bound = dynamic_bound;
8034 while (bound) {
8035 const int i = u_bit_scan64(&bound);
8036 uint16_t high_bits = 0;
8037
8038 struct iris_resource *res =
8039 (void *) genx->vertex_buffers[i].resource;
8040 if (res) {
8041 iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
8042
8043 high_bits = res->bo->address >> 32ull;
8044 if (high_bits != ice->state.last_vbo_high_bits[i]) {
8045 flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
8046 PIPE_CONTROL_CS_STALL;
8047 ice->state.last_vbo_high_bits[i] = high_bits;
8048 }
8049 }
8050 }
8051
8052 if (flush_flags) {
8053 iris_emit_pipe_control_flush(batch,
8054 "workaround: VF cache 32-bit key [VB]",
8055 flush_flags);
8056 }
8057 #endif
8058
8059 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
8060
8061 uint32_t *map =
8062 iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
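/* The packet is one header DWord followed by vb_dwords of
 * VERTEX_BUFFER_STATE per buffer. As with other 3DSTATE commands,
 * DWordLength excludes the first two DWords of the packet, hence the
 * "- 2" below.
 */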
8063 _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
8064 vb.DWordLength = (vb_dwords * count + 1) - 2;
8065 }
8066 map += 1;
8067
8068 const struct iris_vertex_element_state *cso_ve =
8069 ice->state.cso_vertex_elements;
8070
8071 bound = dynamic_bound;
8072 while (bound) {
8073 const int i = u_bit_scan64(&bound);
8074
8075 uint32_t vb_stride[GENX(VERTEX_BUFFER_STATE_length)];
8076 struct iris_bo *bo =
8077 iris_resource_bo(genx->vertex_buffers[i].resource);
8078 iris_pack_state(GENX(VERTEX_BUFFER_STATE), &vb_stride, vbs) {
8079 vbs.BufferPitch = cso_ve->stride[i];
8080 /* Unnecessary except to defeat the genxml nonzero checker */
8081 vbs.MOCS = iris_mocs(bo, &screen->isl_dev,
8082 ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
8083 }
8084 for (unsigned d = 0; d < vb_dwords; d++)
8085 map[d] = genx->vertex_buffers[i].state[d] | vb_stride[d];
8086
8087 map += vb_dwords;
8088 }
8089 }
8090 }
8091
8092 if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
8093 struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8094 const unsigned entries = MAX2(cso->count, 1);
8095 if (!(ice->state.vs_needs_sgvs_element ||
8096 ice->state.vs_uses_derived_draw_params ||
8097 ice->state.vs_needs_edge_flag)) {
8098 iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
8099 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
8100 } else {
8101 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
8102 const unsigned dyn_count = cso->count +
8103 ice->state.vs_needs_sgvs_element +
8104 ice->state.vs_uses_derived_draw_params;
8105
8106 iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
8107 &dynamic_ves, ve) {
8108 ve.DWordLength =
8109 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
8110 }
8111 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
8112 (cso->count - ice->state.vs_needs_edge_flag) *
8113 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
8114 uint32_t *ve_pack_dest =
8115 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
8116 GENX(VERTEX_ELEMENT_STATE_length)];
8117
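/* The extra element appended here sources the draw parameters (base
 * vertex / base instance) from the internal vertex buffer added above when
 * the shader uses them, and stores zero otherwise. Components 2 and 3 are
 * deliberately left as STORE_0 so that 3DSTATE_VF_SGVS can overwrite them
 * with VertexID and InstanceID (see the VF_SGVS emission further below).
 */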
8118 if (ice->state.vs_needs_sgvs_element) {
8119 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
8120 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
8121 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8122 ve.Valid = true;
8123 ve.VertexBufferIndex =
8124 util_bitcount64(ice->state.bound_vertex_buffers);
8125 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8126 ve.Component0Control = base_ctrl;
8127 ve.Component1Control = base_ctrl;
8128 ve.Component2Control = VFCOMP_STORE_0;
8129 ve.Component3Control = VFCOMP_STORE_0;
8130 }
8131 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8132 }
8133 if (ice->state.vs_uses_derived_draw_params) {
8134 iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
8135 ve.Valid = true;
8136 ve.VertexBufferIndex =
8137 util_bitcount64(ice->state.bound_vertex_buffers) +
8138 ice->state.vs_uses_draw_params;
8139 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
8140 ve.Component0Control = VFCOMP_STORE_SRC;
8141 ve.Component1Control = VFCOMP_STORE_SRC;
8142 ve.Component2Control = VFCOMP_STORE_0;
8143 ve.Component3Control = VFCOMP_STORE_0;
8144 }
8145 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
8146 }
8147 if (ice->state.vs_needs_edge_flag) {
8148 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
8149 ve_pack_dest[i] = cso->edgeflag_ve[i];
8150 }
8151
8152 iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
8153 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
8154 }
8155
8156 if (!ice->state.vs_needs_edge_flag) {
8157 iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
8158 entries * GENX(3DSTATE_VF_INSTANCING_length));
8159 } else {
8160 assert(cso->count > 0);
8161 const unsigned edgeflag_index = cso->count - 1;
8162 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
8163 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
8164 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
8165
8166 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
8167 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
8168 iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
8169 vi.VertexElementIndex = edgeflag_index +
8170 ice->state.vs_needs_sgvs_element +
8171 ice->state.vs_uses_derived_draw_params;
8172 }
8173 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
8174 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
8175
8176 iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
8177 entries * GENX(3DSTATE_VF_INSTANCING_length));
8178 }
8179 }
8180
8181 if (dirty & IRIS_DIRTY_VF_SGVS) {
8182 const struct iris_vs_data *vs_data =
8183 iris_vs_data(ice->shaders.prog[MESA_SHADER_VERTEX]);
8184 struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
8185
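/* 3DSTATE_VF_SGVS injects the system-generated values into an existing
 * vertex element: the .z and .w channels (components 2/3) of the SGVS
 * element appended after the application's last element, matching the
 * STORE_0 components reserved for it in the VERTEX_ELEMENTS packet above.
 */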
8186 iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
8187 if (vs_data->uses_vertexid) {
8188 sgv.VertexIDEnable = true;
8189 sgv.VertexIDComponentNumber = 2;
8190 sgv.VertexIDElementOffset =
8191 cso->count - ice->state.vs_needs_edge_flag;
8192 }
8193
8194 if (vs_data->uses_instanceid) {
8195 sgv.InstanceIDEnable = true;
8196 sgv.InstanceIDComponentNumber = 3;
8197 sgv.InstanceIDElementOffset =
8198 cso->count - ice->state.vs_needs_edge_flag;
8199 }
8200 }
8201 }
8202
8203 if (dirty & IRIS_DIRTY_VF_STATISTICS) {
8204 iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
8205 vf.StatisticsEnable = true;
8206 }
8207 }
8208
8209 if (dirty & IRIS_DIRTY_VF) {
8210 #if INTEL_WA_16012775297_GFX_VER
8211 /* Emit dummy VF statistics before each 3DSTATE_VF. */
8212 if (intel_needs_workaround(batch->screen->devinfo, 16012775297) &&
8213 (dirty & IRIS_DIRTY_VF_STATISTICS) == 0) {
8214 iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
8215 vfs.StatisticsEnable = true;
8216 }
8217 }
8218 #endif
8219
8220 iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
8221 #if GFX_VERx10 >= 125
8222 vf.GeometryDistributionEnable = true;
8223 #endif
8224 if (draw->primitive_restart) {
8225 vf.IndexedDrawCutIndexEnable = true;
8226 vf.CutIndex = draw->restart_index;
8227 }
8228 }
8229 }
8230
8231 #if GFX_VERx10 >= 125
8232 if (dirty & IRIS_DIRTY_VFG) {
8233 iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) {
8234 /* Gfx12.5: If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
8235 vfg.DistributionMode =
8236 #if GFX_VER < 20
8237 ice->shaders.prog[MESA_SHADER_TESS_EVAL] == NULL ? RR_FREE :
8238 #endif
8239 RR_STRICT;
8240 if (intel_needs_workaround(batch->screen->devinfo, 14019166699) &&
8241 program_uses_primitive_id)
8242 vfg.DistributionGranularity = InstanceLevelGranularity;
8243 else
8244 vfg.DistributionGranularity = BatchLevelGranularity;
8245 #if INTEL_WA_14014851047_GFX_VER
8246 vfg.GranularityThresholdDisable =
8247 intel_needs_workaround(batch->screen->devinfo, 14014851047);
8248 #endif
8249 vfg.ListCutIndexEnable = draw->primitive_restart;
8250 /* 192 vertices for TRILIST_ADJ */
8251 vfg.ListNBatchSizeScale = 0;
8252 /* Batch size of 384 vertices */
8253 vfg.List3BatchSizeScale = 2;
8254 /* Batch size of 128 vertices */
8255 vfg.List2BatchSizeScale = 1;
8256 /* Batch size of 128 vertices */
8257 vfg.List1BatchSizeScale = 2;
8258 /* Batch size of 256 vertices for STRIP topologies */
8259 vfg.StripBatchSizeScale = 3;
8260 /* 192 control points for PATCHLIST_3 */
8261 vfg.PatchBatchSizeScale = 1;
8262 /* 192 control points for PATCHLIST_3 */
8263 vfg.PatchBatchSizeMultiplier = 31;
8264 }
8265 }
8266 #endif
8267
8268 #if GFX_VER == 8
8269 if (dirty & IRIS_DIRTY_PMA_FIX) {
8270 bool enable = want_pma_fix(ice);
8271 genX(update_pma_fix)(ice, batch, enable);
8272 }
8273 #endif
8274
8275 if (ice->state.current_hash_scale != 1)
8276 genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
8277
8278 #if GFX_VER >= 12
8279 genX(invalidate_aux_map_state)(batch);
8280 #endif
8281 }
8282
8283 static void
8284 flush_vbos(struct iris_context *ice, struct iris_batch *batch)
8285 {
8286 struct iris_genx_state *genx = ice->state.genx;
8287 uint64_t bound = ice->state.bound_vertex_buffers;
8288 while (bound) {
8289 const int i = u_bit_scan64(&bound);
8290 struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
8291 iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
8292 }
8293 }
8294
8295 static bool
8296 point_or_line_list(enum mesa_prim prim_type)
8297 {
8298 switch (prim_type) {
8299 case MESA_PRIM_POINTS:
8300 case MESA_PRIM_LINES:
8301 case MESA_PRIM_LINE_STRIP:
8302 case MESA_PRIM_LINES_ADJACENCY:
8303 case MESA_PRIM_LINE_STRIP_ADJACENCY:
8304 case MESA_PRIM_LINE_LOOP:
8305 return true;
8306 default:
8307 return false;
8308 }
8309 return false;
8310 }
8311
8312 void
8313 genX(emit_breakpoint)(struct iris_batch *batch, bool emit_before_draw)
8314 {
8315 struct iris_context *ice = batch->ice;
8316 uint32_t draw_count = emit_before_draw ?
8317 p_atomic_inc_return(&ice->draw_call_count) :
8318 p_atomic_read(&ice->draw_call_count);
8319
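/* If this draw matches the requested breakpoint, emit an MI_SEMAPHORE_WAIT
 * that polls breakpoint_bo until it contains 0x1, stalling the GPU at this
 * draw so it can be inspected by a debugging tool.
 */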
8320 if (((draw_count == intel_debug_bkp_before_draw_count &&
8321 emit_before_draw) ||
8322 (draw_count == intel_debug_bkp_after_draw_count &&
8323 !emit_before_draw))) {
8324 iris_emit_cmd(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
8325 sem.WaitMode = PollingMode;
8326 sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
8327 sem.SemaphoreDataDword = 0x1;
8328 sem.SemaphoreAddress = rw_bo(batch->screen->breakpoint_bo, 0,
8329 IRIS_DOMAIN_OTHER_WRITE);
8330 };
8331 }
8332 }
8333
8334 void
8335 genX(emit_3dprimitive_was)(struct iris_batch *batch,
8336 const struct pipe_draw_indirect_info *indirect,
8337 uint32_t primitive_type,
8338 uint32_t vertex_count)
8339 {
8340 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8341 UNUSED const struct iris_context *ice = batch->ice;
8342
8343 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
8344 if (intel_needs_workaround(devinfo, 22014412737) &&
8345 (point_or_line_list(primitive_type) || indirect ||
8346 (vertex_count == 1 || vertex_count == 2))) {
8347 iris_emit_pipe_control_write(batch, "Wa_22014412737",
8348 PIPE_CONTROL_WRITE_IMMEDIATE,
8349 batch->screen->workaround_bo,
8350 batch->screen->workaround_address.offset,
8351 0ull);
8352 batch->num_3d_primitives_emitted = 0;
8353 } else if (intel_needs_workaround(devinfo, 16014538804)) {
8354 batch->num_3d_primitives_emitted++;
8355
8356 /* Wa_16014538804 - Send empty/dummy pipe control after 3 3DPRIMITIVE. */
8357 if (batch->num_3d_primitives_emitted == 3) {
8358 iris_emit_pipe_control_flush(batch, "Wa_16014538804", 0);
8359 batch->num_3d_primitives_emitted = 0;
8360 }
8361 }
8362 #endif
8363 }
8364
8365 void
8366 genX(urb_workaround)(struct iris_batch *batch,
8367 const struct intel_urb_config *urb_cfg)
8368 {
8369 #if INTEL_NEEDS_WA_16014912113
8370 if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
8371 MESA_SHADER_TESS_EVAL) &&
8372 batch->ice->shaders.last_urb.size[0] != 0) {
8373 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
8374 #if GFX_VER >= 12
8375 iris_emit_cmd(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
8376 urb._3DCommandSubOpcode += i;
8377 urb.VSURBEntryAllocationSize =
8378 batch->ice->shaders.last_urb.size[i] - 1;
8379 urb.VSURBStartingAddressSlice0 =
8380 batch->ice->shaders.last_urb.start[i];
8381 urb.VSURBStartingAddressSliceN =
8382 batch->ice->shaders.last_urb.start[i];
8383 urb.VSNumberofURBEntriesSlice0 = i == 0 ? 256 : 0;
8384 urb.VSNumberofURBEntriesSliceN = i == 0 ? 256 : 0;
8385 }
8386 #else
8387 iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
8388 urb._3DCommandSubOpcode += i;
8389 urb.VSURBStartingAddress =
8390 batch->ice->shaders.last_urb.start[i];
8391 urb.VSURBEntryAllocationSize =
8392 batch->ice->shaders.last_urb.size[i] - 1;
8393 urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
8394 }
8395 #endif
8396 }
8397 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8398 pc.HDCPipelineFlushEnable = true;
8399 }
8400 }
8401 #endif
8402
8403 /* Update current urb config. */
8404 memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
8405 sizeof(struct intel_urb_config));
8406 }
8407
8408 static void
8409 iris_emit_index_buffer(struct iris_context *ice,
8410 struct iris_batch *batch,
8411 const struct pipe_draw_info *draw,
8412 const struct pipe_draw_start_count_bias *sc)
8413 {
8414 unsigned offset;
8415
8416 if (draw->has_user_indices) {
8417 unsigned start_offset = draw->index_size * sc->start;
8418
8419 u_upload_data(ice->ctx.const_uploader, start_offset,
8420 sc->count * draw->index_size, 4,
8421 (char*)draw->index.user + start_offset,
8422 &offset, &ice->state.last_res.index_buffer);
8423 offset -= start_offset;
8424 } else {
8425 struct iris_resource *res = (void *) draw->index.resource;
8426 res->bind_history |= PIPE_BIND_INDEX_BUFFER;
8427
8428 pipe_resource_reference(&ice->state.last_res.index_buffer,
8429 draw->index.resource);
8430 offset = 0;
8431
8432 iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
8433 }
8434
8435 struct iris_genx_state *genx = ice->state.genx;
8436 struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
8437
8438 uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
8439 iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
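/* draw->index_size is 1, 2, or 4 bytes; shifting right by one maps these
 * to the hardware's 0/1/2 index format encodings (byte/word/dword).
 */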
8440 ib.IndexFormat = draw->index_size >> 1;
8441 ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
8442 ISL_SURF_USAGE_INDEX_BUFFER_BIT);
8443 ib.BufferSize = bo->size - offset;
8444 ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
8445 #if GFX_VER >= 12
8446 ib.L3BypassDisable = true;
8447 #endif
8448 }
8449
8450 if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
8451 memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
8452 iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
8453 iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
8454 }
8455
8456 #if GFX_VER < 11
8457 /* The VF cache key only uses 32-bits, see vertex buffer comment above */
8458 uint16_t high_bits = bo->address >> 32ull;
8459 if (high_bits != ice->state.last_index_bo_high_bits) {
8460 iris_emit_pipe_control_flush(batch,
8461 "workaround: VF cache 32-bit key [IB]",
8462 PIPE_CONTROL_VF_CACHE_INVALIDATE |
8463 PIPE_CONTROL_CS_STALL);
8464 ice->state.last_index_bo_high_bits = high_bits;
8465 }
8466 #endif
8467 }
8468
8469
8470 static void
8471 iris_upload_render_state(struct iris_context *ice,
8472 struct iris_batch *batch,
8473 const struct pipe_draw_info *draw,
8474 unsigned drawid_offset,
8475 const struct pipe_draw_indirect_info *indirect,
8476 const struct pipe_draw_start_count_bias *sc)
8477 {
8478 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
8479 bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8480
8481 trace_intel_begin_draw(&batch->trace);
8482
8483 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8484 flush_vbos(ice, batch);
8485
8486 iris_batch_sync_region_start(batch);
8487
8488 /* Always pin the binder. If we're emitting new binding table pointers,
8489 * we need it. If not, we're probably inheriting old tables via the
8490 * context, and need it anyway. Since true zero-bindings cases are
8491 * practically non-existent, just pin it and avoid last_res tracking.
8492 */
8493 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8494 IRIS_DOMAIN_NONE);
8495
8496 if (!batch->contains_draw) {
8497 if (GFX_VER == 12) {
8498 /* Re-emit constants when starting a new batch buffer in order to
8499 * work around push constant corruption on context switch.
8500 *
8501 * XXX - Provide hardware spec quotation when available.
8502 */
8503 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8504 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8505 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8506 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8507 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8508 }
8509 batch->contains_draw = true;
8510 }
8511
8512 if (!batch->contains_draw_with_next_seqno) {
8513 iris_restore_render_saved_bos(ice, batch, draw);
8514 batch->contains_draw_with_next_seqno = true;
8515 }
8516
8517 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8518 * Wa_16011107343 (same for gfx12)
8519 * We implement this by setting TCS dirty on each draw.
8520 */
8521 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8522 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8523 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8524 }
8525
8526 iris_upload_dirty_render_state(ice, batch, draw, false);
8527
8528 if (draw->index_size > 0)
8529 iris_emit_index_buffer(ice, batch, draw, sc);
8530
8531 if (indirect) {
8532 struct mi_builder b;
8533 uint32_t mocs;
8534 mi_builder_init(&b, batch->screen->devinfo, batch);
8535
8536 #define _3DPRIM_END_OFFSET 0x2420
8537 #define _3DPRIM_START_VERTEX 0x2430
8538 #define _3DPRIM_VERTEX_COUNT 0x2434
8539 #define _3DPRIM_INSTANCE_COUNT 0x2438
8540 #define _3DPRIM_START_INSTANCE 0x243C
8541 #define _3DPRIM_BASE_VERTEX 0x2440
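/* These are the MMIO offsets of the registers that an indirect 3DPRIMITIVE
 * (IndirectParameterEnable) reads its arguments from; the MI stores below
 * fill them from either the indirect buffer or the stream output counter.
 */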
8542
8543 if (!indirect->count_from_stream_output) {
8544 if (indirect->indirect_draw_count) {
8545 use_predicate = true;
8546
8547 struct iris_bo *draw_count_bo =
8548 iris_resource_bo(indirect->indirect_draw_count);
8549 unsigned draw_count_offset =
8550 indirect->indirect_draw_count_offset;
8551 mocs = iris_mocs(draw_count_bo, &batch->screen->isl_dev, 0);
8552 mi_builder_set_mocs(&b, mocs);
8553
8554 if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
8555 /* comparison = draw id < draw count */
8556 struct mi_value comparison =
8557 mi_ult(&b, mi_imm(drawid_offset),
8558 mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8559
8560 /* predicate = comparison & conditional rendering predicate */
8561 mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
8562 mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
8563 } else {
8564 uint32_t mi_predicate;
8565
8566 /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
8567 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(drawid_offset));
8568 /* Upload the current draw count from the draw parameters buffer
8569 * to MI_PREDICATE_SRC0. Zero the top 32-bits of
8570 * MI_PREDICATE_SRC0.
8571 */
8572 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
8573 mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
8574
8575 if (drawid_offset == 0) {
8576 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
8577 MI_PREDICATE_COMBINEOP_SET |
8578 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8579 } else {
8580 /* While draw_index < draw_count the predicate's result will be
8581 * (draw_index == draw_count) ^ TRUE = TRUE
8582 * When draw_index == draw_count the result is
8583 * (TRUE) ^ TRUE = FALSE
8584 * After this all results will be:
8585 * (FALSE) ^ FALSE = FALSE
8586 */
8587 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
8588 MI_PREDICATE_COMBINEOP_XOR |
8589 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
8590 }
8591 iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
8592 }
8593 }
8594 struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8595 assert(bo);
8596
8597 mocs = iris_mocs(bo, &batch->screen->isl_dev, 0);
8598 mi_builder_set_mocs(&b, mocs);
8599
8600 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8601 mi_mem32(ro_bo(bo, indirect->offset + 0)));
8602 mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8603 mi_mem32(ro_bo(bo, indirect->offset + 4)));
8604 mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX),
8605 mi_mem32(ro_bo(bo, indirect->offset + 8)));
8606 if (draw->index_size) {
8607 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX),
8608 mi_mem32(ro_bo(bo, indirect->offset + 12)));
8609 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8610 mi_mem32(ro_bo(bo, indirect->offset + 16)));
8611 } else {
8612 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE),
8613 mi_mem32(ro_bo(bo, indirect->offset + 12)));
8614 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8615 }
8616 } else if (indirect->count_from_stream_output) {
8617 struct iris_stream_output_target *so =
8618 (void *) indirect->count_from_stream_output;
8619 struct iris_bo *so_bo = iris_resource_bo(so->offset.res);
8620
8621 mocs = iris_mocs(so_bo, &batch->screen->isl_dev, 0);
8622 mi_builder_set_mocs(&b, mocs);
8623
8624 iris_emit_buffer_barrier_for(batch, so_bo, IRIS_DOMAIN_OTHER_READ);
8625
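/* For draw-auto, the vertex count is derived from the stream output write
 * offset: subtract the target's starting byte offset and divide by the
 * vertex stride to get the number of vertices that were captured.
 */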
8626 struct iris_address addr = ro_bo(so_bo, so->offset.offset);
8627 struct mi_value offset =
8628 mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
8629 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8630 mi_udiv32_imm(&b, offset, so->stride));
8631 mi_store(&b, mi_reg32(_3DPRIM_START_VERTEX), mi_imm(0));
8632 mi_store(&b, mi_reg32(_3DPRIM_BASE_VERTEX), mi_imm(0));
8633 mi_store(&b, mi_reg32(_3DPRIM_START_INSTANCE), mi_imm(0));
8634 mi_store(&b, mi_reg32(_3DPRIM_INSTANCE_COUNT),
8635 mi_imm(draw->instance_count));
8636 }
8637 }
8638
8639 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8640
8641 genX(maybe_emit_breakpoint)(batch, true);
8642
8643 iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8644 prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8645 prim.PredicateEnable = use_predicate;
8646 #if GFX_VERx10 >= 125
8647 prim.TBIMREnable = ice->state.use_tbimr;
8648 #endif
8649 if (indirect) {
8650 prim.IndirectParameterEnable = true;
8651 } else {
8652 prim.StartInstanceLocation = draw->start_instance;
8653 prim.InstanceCount = draw->instance_count;
8654 prim.VertexCountPerInstance = sc->count;
8655
8656 prim.StartVertexLocation = sc->start;
8657
8658 if (draw->index_size) {
8659 prim.BaseVertexLocation += sc->index_bias;
8660 }
8661 }
8662 }
8663
8664 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8665 genX(maybe_emit_breakpoint)(batch, false);
8666
8667 iris_batch_sync_region_end(batch);
8668
8669 uint32_t count = (sc) ? sc->count : 0;
8670 count *= draw->instance_count ? draw->instance_count : 1;
8671 trace_intel_end_draw(&batch->trace, count, 0, 0);
8672 }
8673
8674 static void
8675 iris_upload_indirect_render_state(struct iris_context *ice,
8676 const struct pipe_draw_info *draw,
8677 const struct pipe_draw_indirect_info *indirect,
8678 const struct pipe_draw_start_count_bias *sc)
8679 {
8680 #if GFX_VERx10 >= 125
8681 assert(indirect);
8682
8683 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8684 UNUSED struct iris_screen *screen = batch->screen;
8685 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8686 const bool use_predicate =
8687 ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
8688
8689 trace_intel_begin_draw(&batch->trace);
8690
8691 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8692 flush_vbos(ice, batch);
8693
8694 iris_batch_sync_region_start(batch);
8695
8696 /* Always pin the binder. If we're emitting new binding table pointers,
8697 * we need it. If not, we're probably inheriting old tables via the
8698 * context, and need it anyway. Since true zero-bindings cases are
8699 * practically non-existent, just pin it and avoid last_res tracking.
8700 */
8701 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8702 IRIS_DOMAIN_NONE);
8703
8704 if (!batch->contains_draw) {
8705 /* Re-emit constants when starting a new batch buffer in order to
8706 * work around push constant corruption on context switch.
8707 *
8708 * XXX - Provide hardware spec quotation when available.
8709 */
8710 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8711 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8712 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8713 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8714 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8715 batch->contains_draw = true;
8716 }
8717
8718 if (!batch->contains_draw_with_next_seqno) {
8719 iris_restore_render_saved_bos(ice, batch, draw);
8720 batch->contains_draw_with_next_seqno = true;
8721 }
8722
8723 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8724 * Wa_16011107343 (same for gfx12)
8725 * We implement this by setting TCS dirty on each draw.
8726 */
8727 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8728 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8729 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8730 }
8731
8732 iris_upload_dirty_render_state(ice, batch, draw, false);
8733
8734 if (draw->index_size > 0)
8735 iris_emit_index_buffer(ice, batch, draw, sc);
8736
8737 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8738
8739 genX(maybe_emit_breakpoint)(batch, true);
8740
8741 iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
8742 ind.ArgumentFormat =
8743 draw->index_size > 0 ? XI_DRAWINDEXED : XI_DRAW;
8744 ind.PredicateEnable = use_predicate;
8745 ind.TBIMREnabled = ice->state.use_tbimr;
8746 ind.MaxCount = indirect->draw_count;
8747
8748 if (indirect->buffer) {
8749 struct iris_bo *bo = iris_resource_bo(indirect->buffer);
8750 ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset);
8751 ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0);
8752 } else {
8753 ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
8754 }
8755
8756 if (indirect->indirect_draw_count) {
8757 struct iris_bo *draw_count_bo =
8758 iris_resource_bo(indirect->indirect_draw_count);
8759 ind.CountBufferIndirectEnable = true;
8760 ind.CountBufferAddress =
8761 ro_bo(draw_count_bo, indirect->indirect_draw_count_offset);
8762 }
8763 }
8764
8765 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8766 genX(maybe_emit_breakpoint)(batch, false);
8767
8768 iris_batch_sync_region_end(batch);
8769
8770 uint32_t count = (sc) ? sc->count : 0;
8771 count *= draw->instance_count ? draw->instance_count : 1;
8772 trace_intel_end_draw(&batch->trace, count, 0, 0);
8773 #else
8774 unreachable("Unsupported path");
8775 #endif /* GFX_VERx10 >= 125 */
8776 }
8777
8778 static void
8779 iris_upload_indirect_shader_render_state(struct iris_context *ice,
8780 const struct pipe_draw_info *draw,
8781 const struct pipe_draw_indirect_info *indirect,
8782 const struct pipe_draw_start_count_bias *sc)
8783 {
8784 assert(indirect);
8785
8786 struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
8787 UNUSED struct iris_screen *screen = batch->screen;
8788 UNUSED const struct intel_device_info *devinfo = screen->devinfo;
8789
8790 if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
8791 flush_vbos(ice, batch);
8792
8793 iris_batch_sync_region_start(batch);
8794
8795 /* Always pin the binder. If we're emitting new binding table pointers,
8796 * we need it. If not, we're probably inheriting old tables via the
8797 * context, and need it anyway. Since true zero-bindings cases are
8798 * practically non-existent, just pin it and avoid last_res tracking.
8799 */
8800 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8801 IRIS_DOMAIN_NONE);
8802
8803 if (!batch->contains_draw) {
8804 if (GFX_VER == 12) {
8805 /* Re-emit constants when starting a new batch buffer in order to
8806 * work around push constant corruption on context switch.
8807 *
8808 * XXX - Provide hardware spec quotation when available.
8809 */
8810 ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS |
8811 IRIS_STAGE_DIRTY_CONSTANTS_TCS |
8812 IRIS_STAGE_DIRTY_CONSTANTS_TES |
8813 IRIS_STAGE_DIRTY_CONSTANTS_GS |
8814 IRIS_STAGE_DIRTY_CONSTANTS_FS);
8815 }
8816 batch->contains_draw = true;
8817 }
8818
8819 if (!batch->contains_draw_with_next_seqno) {
8820 iris_restore_render_saved_bos(ice, batch, draw);
8821 batch->contains_draw_with_next_seqno = true;
8822 }
8823
8824 if (draw->index_size > 0)
8825 iris_emit_index_buffer(ice, batch, draw, sc);
8826
8827 /* Make sure we have enough space to keep all the commands in a single BO
8828 * (because of the jumps)
8829 */
8830 iris_require_command_space(batch, 2000);
8831
8832 #ifndef NDEBUG
8833 struct iris_bo *command_bo = batch->bo;
8834 #endif
8835
8836 /* Jump point to generate more draws if we run out of space in the ring
8837 * buffer.
8838 */
8839 uint64_t gen_addr = iris_batch_current_address_u64(batch);
8840
8841 iris_handle_always_flush_cache(batch);
8842
8843 #if GFX_VER == 9
8844 iris_emit_pipe_control_flush(batch, "before generation",
8845 PIPE_CONTROL_VF_CACHE_INVALIDATE);
8846 #endif
8847
8848 struct iris_address params_addr;
8849 struct iris_gen_indirect_params *params =
8850 genX(emit_indirect_generate)(batch, draw, indirect, sc,
8851 &params_addr);
8852
8853 iris_emit_pipe_control_flush(batch, "after generation flush",
8854 ((ice->state.vs_uses_draw_params ||
8855 ice->state.vs_uses_derived_draw_params) ?
8856 PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) |
8857 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8858 PIPE_CONTROL_DATA_CACHE_FLUSH |
8859 PIPE_CONTROL_CS_STALL);
8860
8861 trace_intel_begin_draw(&batch->trace);
8862
8863 /* Always pin the binder. If we're emitting new binding table pointers,
8864 * we need it. If not, we're probably inheriting old tables via the
8865 * context, and need it anyway. Since true zero-bindings cases are
8866 * practically non-existent, just pin it and avoid last_res tracking.
8867 */
8868 iris_use_pinned_bo(batch, ice->state.binder.bo, false,
8869 IRIS_DOMAIN_NONE);
8870
8871 /* Wa_1306463417 - Send HS state for every primitive on gfx11.
8872 * Wa_16011107343 (same for gfx12)
8873 * We implement this by setting TCS dirty on each draw.
8874 */
8875 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
8876 ice->shaders.prog[MESA_SHADER_TESS_CTRL]) {
8877 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS;
8878 }
8879
8880 iris_upload_dirty_render_state(ice, batch, draw, true);
8881
8882 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
8883
8884 genX(maybe_emit_breakpoint)(batch, true);
8885
8886 #if GFX_VER >= 12
8887 iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) {
8888 arb.PreParserDisableMask = true;
8889 arb.PreParserDisable = true;
8890 }
8891 #endif
8892
8893 iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8894 bbs.AddressSpaceIndicator = ASI_PPGTT;
8895 bbs.BatchBufferStartAddress = (struct iris_address) {
8896 .bo = ice->draw.generation.ring_bo,
8897 };
8898 }
8899
8900 /* Run the ring buffer one more time with the next set of commands */
8901 uint64_t inc_addr = iris_batch_current_address_u64(batch);
8902 {
8903 iris_emit_pipe_control_flush(batch,
8904 "post generated draws wait",
8905 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8906 PIPE_CONTROL_CS_STALL);
8907
8908 struct mi_builder b;
8909 mi_builder_init(&b, batch->screen->devinfo, batch);
8910
8911 struct iris_address draw_base_addr = iris_address_add(
8912 params_addr,
8913 offsetof(struct iris_gen_indirect_params, draw_base));
8914
8915 const uint32_t mocs =
8916 iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0);
8917 mi_builder_set_mocs(&b, mocs);
8918
8919 mi_store(&b, mi_mem32(draw_base_addr),
8920 mi_iadd(&b, mi_mem32(draw_base_addr),
8921 mi_imm(params->ring_count)));
8922
8923 iris_emit_pipe_control_flush(batch,
8924 "post generation base increment",
8925 PIPE_CONTROL_CS_STALL |
8926 PIPE_CONTROL_CONST_CACHE_INVALIDATE);
8927
8928 iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
8929 bbs.AddressSpaceIndicator = ASI_PPGTT;
8930 bbs.BatchBufferStartAddress = (struct iris_address) {
8931 .offset = gen_addr,
8932 };
8933 }
8934 }
8935
8936 /* Exit of the ring buffer */
8937 uint64_t end_addr = iris_batch_current_address_u64(batch);
8938
8939 #ifndef NDEBUG
8940 assert(command_bo == batch->bo);
8941 #endif
8942
8943 genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count);
8944 genX(maybe_emit_breakpoint)(batch, false);
8945
8946 iris_emit_pipe_control_flush(batch,
8947 "post generated draws wait",
8948 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8949 PIPE_CONTROL_CS_STALL);
8950
8951 params->gen_addr = inc_addr;
8952 params->end_addr = end_addr;
8953
8954 iris_batch_sync_region_end(batch);
8955
8956 uint32_t count = (sc) ? sc->count : 0;
8957 count *= draw->instance_count ? draw->instance_count : 1;
8958 trace_intel_end_draw(&batch->trace, count, 0, 0);
8959 }
8960
8961 static void
8962 iris_load_indirect_location(struct iris_context *ice,
8963 struct iris_batch *batch,
8964 const struct pipe_grid_info *grid)
8965 {
8966 #define GPGPU_DISPATCHDIMX 0x2500
8967 #define GPGPU_DISPATCHDIMY 0x2504
8968 #define GPGPU_DISPATCHDIMZ 0x2508
8969
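/* Like the 3DPRIM registers used for indirect draws, these MMIO offsets
 * hold the indirect dispatch dimensions; the MI stores below copy the
 * X/Y/Z group counts from the application's indirect buffer so the walker
 * can be launched with its indirect parameter mode enabled.
 */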
8970 assert(grid->indirect);
8971
8972 struct iris_state_ref *grid_size = &ice->state.grid_size;
8973 struct iris_bo *bo = iris_resource_bo(grid_size->res);
8974 struct mi_builder b;
8975 mi_builder_init(&b, batch->screen->devinfo, batch);
8976 struct mi_value size_x = mi_mem32(ro_bo(bo, grid_size->offset + 0));
8977 struct mi_value size_y = mi_mem32(ro_bo(bo, grid_size->offset + 4));
8978 struct mi_value size_z = mi_mem32(ro_bo(bo, grid_size->offset + 8));
8979 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
8980 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
8981 mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
8982 }
8983
8984 static bool iris_emit_indirect_dispatch_supported(const struct intel_device_info *devinfo)
8985 {
8986 // TODO: Swizzling X and Y workgroup sizes is not supported in execute indirect dispatch
8987 return devinfo->has_indirect_unroll;
8988 }
8989
8990 #if GFX_VERx10 >= 125
8991
8992 static void iris_emit_execute_indirect_dispatch(struct iris_context *ice,
8993 struct iris_batch *batch,
8994 const struct pipe_grid_info *grid,
8995 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd)
8996 {
8997 const struct iris_screen *screen = batch->screen;
8998 struct iris_compiled_shader *shader =
8999 ice->shaders.prog[MESA_SHADER_COMPUTE];
9000 const struct iris_cs_data *cs_data = iris_cs_data(shader);
9001 const struct intel_cs_dispatch_info dispatch =
9002 iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9003 struct iris_bo *indirect = iris_resource_bo(grid->indirect);
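/* dispatch.simd_size is the dispatch width in channels; dividing by 16
 * gives the encoding the walker expects (e.g. SIMD16 -> 1, SIMD32 -> 2).
 */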
9004 const int dispatch_size = dispatch.simd_size / 16;
9005
9006 struct GENX(COMPUTE_WALKER_BODY) body = {};
9007 body.SIMDSize = dispatch_size;
9008 body.MessageSIMD = dispatch_size;
9009 body.GenerateLocalID = cs_data->generate_local_id != 0;
9010 body.EmitLocal = cs_data->generate_local_id;
9011 body.WalkOrder = cs_data->walk_order;
9012 body.TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9013 TileY32bpe : Linear;
9014 body.LocalXMaximum = grid->block[0] - 1;
9015 body.LocalYMaximum = grid->block[1] - 1;
9016 body.LocalZMaximum = grid->block[2] - 1;
9017 body.ExecutionMask = dispatch.right_mask;
9018 body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0);
9019 body.InterfaceDescriptor = idd;
9020 /* HSD 14016252163: Use of Morton walk order (and batching using a batch
9021 * size of 4) is expected to increase sampler cache hit rates by
9022 * increasing sample address locality within a subslice.
9023 */
9024 #if GFX_VER >= 30
9025 body.DispatchWalkOrder =
9026 cs_data->uses_sampler ? MortonWalk : LinearWalk;
9027 body.ThreadGroupBatchSize =
9028 cs_data->uses_sampler ? TG_BATCH_4 : TG_BATCH_1;
9029 #endif
9030
9031 struct iris_address indirect_bo = ro_bo(indirect, grid->indirect_offset);
9032 iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DISPATCH), ind) {
9033 ind.PredicateEnable =
9034 ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
9035 ind.MaxCount = 1;
9036 ind.COMPUTE_WALKER_BODY = body;
9037 ind.ArgumentBufferStartAddress = indirect_bo;
9038 ind.MOCS =
9039 iris_mocs(indirect_bo.bo, &screen->isl_dev, 0);
9040 }
9041 }
9042
9043 static void
9044 iris_upload_compute_walker(struct iris_context *ice,
9045 struct iris_batch *batch,
9046 const struct pipe_grid_info *grid)
9047 {
9048 const uint64_t stage_dirty = ice->state.stage_dirty;
9049 struct iris_screen *screen = batch->screen;
9050 const struct intel_device_info *devinfo = screen->devinfo;
9051 struct iris_binder *binder = &ice->state.binder;
9052 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9053 struct iris_compiled_shader *shader =
9054 ice->shaders.prog[MESA_SHADER_COMPUTE];
9055 const struct iris_cs_data *cs_data = iris_cs_data(shader);
9056 const struct intel_cs_dispatch_info dispatch =
9057 iris_get_cs_dispatch_info(devinfo, shader, grid->block);
9058
9059 trace_intel_begin_compute(&batch->trace);
9060
9061 if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
9062 iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
9063 cfe.MaximumNumberofThreads =
9064 devinfo->max_cs_threads * devinfo->subslice_total;
9065 uint32_t scratch_addr = pin_scratch_space(ice, batch, shader,
9066 MESA_SHADER_COMPUTE);
9067 cfe.ScratchSpaceBuffer = scratch_addr >> SCRATCH_SPACE_BUFFER_SHIFT;
9068 }
9069 }
9070
9071 struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {};
9072 idd.KernelStartPointer = KSP(shader);
9073 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9074 idd.SharedLocalMemorySize =
9075 intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
9076 idd.PreferredSLMAllocationSize =
9077 intel_compute_preferred_slm_calc_encode_size(devinfo,
9078 shader->total_shared,
9079 dispatch.group_size,
9080 dispatch.simd_size);
9081 idd.SamplerStatePointer = shs->sampler_table.offset;
9082 idd.SamplerCount = encode_sampler_count(shader);
9083 idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
9084 /* Typically set to 0 to avoid prefetching on every thread dispatch. */
9085 idd.BindingTableEntryCount = devinfo->verx10 == 125 ?
9086 0 : MIN2(shader->bt.size_bytes / 4, 31);
9087 idd.NumberOfBarriers = cs_data->uses_barrier;
9088
9089 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9090
9091 if (iris_emit_indirect_dispatch_supported(devinfo) && grid->indirect) {
9092 iris_emit_execute_indirect_dispatch(ice, batch, grid, idd);
9093 } else {
9094 if (grid->indirect)
9095 iris_load_indirect_location(ice, batch, grid);
9096
9097 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9098
9099 ice->utrace.last_compute_walker =
9100 iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
9101
9102 struct GENX(COMPUTE_WALKER_BODY) body = {
9103 .SIMDSize = dispatch.simd_size / 16,
9104 .MessageSIMD = dispatch.simd_size / 16,
9105 .LocalXMaximum = grid->block[0] - 1,
9106 .LocalYMaximum = grid->block[1] - 1,
9107 .LocalZMaximum = grid->block[2] - 1,
9108 .ThreadGroupIDXDimension = grid->grid[0],
9109 .ThreadGroupIDYDimension = grid->grid[1],
9110 .ThreadGroupIDZDimension = grid->grid[2],
9111 .ExecutionMask = dispatch.right_mask,
9112 .PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0),
9113 .InterfaceDescriptor = idd,
9114
9115 #if GFX_VERx10 >= 125
9116 .GenerateLocalID = cs_data->generate_local_id != 0,
9117 .EmitLocal = cs_data->generate_local_id,
9118 .WalkOrder = cs_data->walk_order,
9119 .TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ?
9120 TileY32bpe : Linear,
9121 #endif
9122 };
9123
9124 _iris_pack_command(batch, GENX(COMPUTE_WALKER),
9125 ice->utrace.last_compute_walker, cw) {
9126 cw.IndirectParameterEnable = grid->indirect;
9127 cw.body = body;
9128 assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0);
9129 }
9130 }
9131
9132 trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9133 }
9134
9135 #else /* #if GFX_VERx10 >= 125 */
9136
9137 static void
9138 iris_upload_gpgpu_walker(struct iris_context *ice,
9139 struct iris_batch *batch,
9140 const struct pipe_grid_info *grid)
9141 {
9142 const uint64_t stage_dirty = ice->state.stage_dirty;
9143 struct iris_screen *screen = batch->screen;
9144 const struct intel_device_info *devinfo = screen->devinfo;
9145 struct iris_binder *binder = &ice->state.binder;
9146 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9147 struct iris_uncompiled_shader *ish =
9148 ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
9149 struct iris_compiled_shader *shader =
9150 ice->shaders.prog[MESA_SHADER_COMPUTE];
9151 struct iris_cs_data *cs_data = iris_cs_data(shader);
9152 const struct intel_cs_dispatch_info dispatch =
9153 iris_get_cs_dispatch_info(screen->devinfo, shader, grid->block);
9154
9155 trace_intel_begin_compute(&batch->trace);
9156
9157 if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9158 cs_data->local_size[0] == 0 /* Variable local group size */) {
9159 /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
9160 *
9161 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
9162 * the only bits that are changed are scoreboard related: Scoreboard
9163 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
9164 * these scoreboard related states, a MEDIA_STATE_FLUSH is
9165 * sufficient."
9166 */
9167 iris_emit_pipe_control_flush(batch,
9168 "workaround: stall before MEDIA_VFE_STATE",
9169 PIPE_CONTROL_CS_STALL);
9170
9171 iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
9172 if (shader->total_scratch) {
9173 uint32_t scratch_addr =
9174 pin_scratch_space(ice, batch, shader, MESA_SHADER_COMPUTE);
9175
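            /* total_scratch should be a power of two of at least 1KB here,
             * so ffs(x) - 11 is log2(x / 1024): 1KB encodes as 0, 2KB as 1,
             * and so on.
             */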
9176 vfe.PerThreadScratchSpace = ffs(shader->total_scratch) - 11;
9177 vfe.ScratchSpaceBasePointer =
9178 rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
9179 }
9180
9181 vfe.MaximumNumberofThreads =
9182 devinfo->max_cs_threads * devinfo->subslice_total - 1;
9183 #if GFX_VER < 11
9184 vfe.ResetGatewayTimer =
9185 Resettingrelativetimerandlatchingtheglobaltimestamp;
9186 #endif
9187 #if GFX_VER == 8
9188 vfe.BypassGatewayControl = true;
9189 #endif
9190 vfe.NumberofURBEntries = 2;
9191 vfe.URBEntryAllocationSize = 2;
9192
9193 vfe.CURBEAllocationSize =
9194 ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
9195 cs_data->push.cross_thread.regs, 2);
9196 }
9197 }
9198
9199 /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
9200 if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
9201 cs_data->local_size[0] == 0 /* Variable local group size */) {
9202 uint32_t curbe_data_offset = 0;
9203 assert(cs_data->push.cross_thread.dwords == 0 &&
9204 cs_data->push.per_thread.dwords == 1 &&
9205 cs_data->first_param_is_builtin_subgroup_id);
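      /* With no cross-thread data and a single per-thread DWord (the
       * subgroup ID), the CURBE effectively holds one ID per thread;
       * iris_fill_cs_push_const_buffer writes those below, and the 0x5a
       * memset merely poisons any padding.
       */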
9206 const unsigned push_const_size =
9207 iris_cs_push_const_total_size(shader, dispatch.threads);
9208 uint32_t *curbe_data_map =
9209 stream_state(batch, ice->state.dynamic_uploader,
9210 &ice->state.last_res.cs_thread_ids,
9211 ALIGN(push_const_size, 64), 64,
9212 &curbe_data_offset);
9213 assert(curbe_data_map);
9214 memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
9215 iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
9216 curbe_data_map);
9217
9218 iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
9219 curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
9220 curbe.CURBEDataStartAddress = curbe_data_offset;
9221 }
9222 }
9223
9224 for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
9225 struct pipe_resource *res = ice->state.global_bindings[i];
9226 if (!res)
9227 break;
9228
9229 iris_use_pinned_bo(batch, iris_resource_bo(res),
9230 true, IRIS_DOMAIN_NONE);
9231 }
9232
9233 if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
9234 IRIS_STAGE_DIRTY_BINDINGS_CS |
9235 IRIS_STAGE_DIRTY_CONSTANTS_CS |
9236 IRIS_STAGE_DIRTY_CS)) {
9237 uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
9238
9239 iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
9240 idd.SharedLocalMemorySize =
9241 intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
9242 idd.KernelStartPointer =
9243 KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
9244 idd.SamplerStatePointer = shs->sampler_table.offset;
9245 idd.BindingTablePointer =
9246 binder->bt_offset[MESA_SHADER_COMPUTE] >> IRIS_BT_OFFSET_SHIFT;
9247 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
9248 }
9249
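      /* OR in the copy of INTERFACE_DESCRIPTOR_DATA that was packed into
       * shader->derived_data at shader create time, merging the two sets
       * of fields DWord by DWord.
       */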
9250 for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
9251 desc[i] |= ((uint32_t *) shader->derived_data)[i];
9252
9253 iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
9254 load.InterfaceDescriptorTotalLength =
9255 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
9256 load.InterfaceDescriptorDataStartAddress =
9257 emit_state(batch, ice->state.dynamic_uploader,
9258 &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
9259 }
9260 }
9261
9262 if (grid->indirect)
9263 iris_load_indirect_location(ice, batch, grid);
9264
9265 iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
9266
9267 iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
9268 ggw.IndirectParameterEnable = grid->indirect != NULL;
9269 ggw.SIMDSize = dispatch.simd_size / 16;
9270 ggw.ThreadDepthCounterMaximum = 0;
9271 ggw.ThreadHeightCounterMaximum = 0;
9272 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
9273 ggw.ThreadGroupIDXDimension = grid->grid[0];
9274 ggw.ThreadGroupIDYDimension = grid->grid[1];
9275 ggw.ThreadGroupIDZDimension = grid->grid[2];
9276 ggw.RightExecutionMask = dispatch.right_mask;
9277 ggw.BottomExecutionMask = 0xffffffff;
9278 }
9279
9280 iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
9281
9282 trace_intel_end_compute(&batch->trace, grid->grid[0], grid->grid[1], grid->grid[2], 0);
9283 }
9284
9285 #endif /* #if GFX_VERx10 >= 125 */
9286
9287 static void
9288 iris_upload_compute_state(struct iris_context *ice,
9289 struct iris_batch *batch,
9290 const struct pipe_grid_info *grid)
9291 {
9292 struct iris_screen *screen = batch->screen;
9293 const uint64_t stage_dirty = ice->state.stage_dirty;
9294 struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
9295 struct iris_compiled_shader *shader =
9296 ice->shaders.prog[MESA_SHADER_COMPUTE];
9297 struct iris_border_color_pool *border_color_pool =
9298 iris_bufmgr_get_border_color_pool(screen->bufmgr);
9299
9300 iris_batch_sync_region_start(batch);
9301
9302 /* Always pin the binder. If we're emitting new binding table pointers,
9303 * we need it. If not, we're probably inheriting old tables via the
9304 * context, and need it anyway. Since true zero-bindings cases are
9305 * practically non-existent, just pin it and avoid last_res tracking.
9306 */
9307 iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
9308
9309 if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
9310 shs->sysvals_need_upload) ||
9311 shader->kernel_input_size > 0)
9312 upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
9313
9314 if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
9315 iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
9316
9317 if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
9318 iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
9319
9320 iris_use_optional_res(batch, shs->sampler_table.res, false,
9321 IRIS_DOMAIN_NONE);
9322 iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
9323 IRIS_DOMAIN_NONE);
9324
9325 if (ice->state.need_border_colors)
9326 iris_use_pinned_bo(batch, border_color_pool->bo, false,
9327 IRIS_DOMAIN_NONE);
9328
9329 #if GFX_VER >= 12
9330 genX(invalidate_aux_map_state)(batch);
9331 #endif
9332
9333 #if GFX_VERx10 >= 125
9334 iris_upload_compute_walker(ice, batch, grid);
9335 #else
9336 iris_upload_gpgpu_walker(ice, batch, grid);
9337 #endif
9338
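   /* If this is the first dispatch contributing to the batch's next seqno,
    * re-pin the BOs referenced by state we inherited rather than re-emitted,
    * so they stay resident for this submission.
    */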
9339 if (!batch->contains_draw_with_next_seqno) {
9340 iris_restore_compute_saved_bos(ice, batch, grid);
9341 batch->contains_draw_with_next_seqno = batch->contains_draw = true;
9342 }
9343
9344 iris_batch_sync_region_end(batch);
9345 }
9346
9347 /**
9348 * State module teardown.
9349 */
9350 static void
9351 iris_destroy_state(struct iris_context *ice)
9352 {
9353 struct iris_genx_state *genx = ice->state.genx;
9354
9355 pipe_resource_reference(&ice->state.pixel_hashing_tables, NULL);
9356
9357 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
9358 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
9359 pipe_resource_reference(&ice->draw.generation.params.res, NULL);
9360 pipe_resource_reference(&ice->draw.generation.vertices.res, NULL);
9361
9362 /* Loop over all VBOs, including ones for draw parameters */
9363 for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
9364 pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
9365 }
9366
9367 free(ice->state.genx);
9368
9369 for (int i = 0; i < 4; i++) {
9370 pipe_so_target_reference(&ice->state.so_target[i], NULL);
9371 }
9372
9373 util_unreference_framebuffer_state(&ice->state.framebuffer);
9374
9375 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
9376 struct iris_shader_state *shs = &ice->state.shaders[stage];
9377 pipe_resource_reference(&shs->sampler_table.res, NULL);
9378 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
9379 pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
9380 pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
9381 }
9382 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
9383 pipe_resource_reference(&shs->image[i].base.resource, NULL);
9384 pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
9385 free(shs->image[i].surface_state.cpu);
9386 }
9387 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
9388 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
9389 pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
9390 }
9391 for (int i = 0; i < IRIS_MAX_TEXTURES; i++) {
9392 pipe_sampler_view_reference((struct pipe_sampler_view **)
9393 &shs->textures[i], NULL);
9394 }
9395 }
9396
9397 pipe_resource_reference(&ice->state.grid_size.res, NULL);
9398 pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
9399
9400 pipe_resource_reference(&ice->state.null_fb.res, NULL);
9401 pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
9402
9403 pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
9404 pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
9405 pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
9406 pipe_resource_reference(&ice->state.last_res.scissor, NULL);
9407 pipe_resource_reference(&ice->state.last_res.blend, NULL);
9408 pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
9409 pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
9410 pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
9411 }
9412
9413 /* ------------------------------------------------------------------- */
9414
9415 static void
9416 iris_rebind_buffer(struct iris_context *ice,
9417 struct iris_resource *res)
9418 {
9419 struct pipe_context *ctx = &ice->ctx;
9420 struct iris_genx_state *genx = ice->state.genx;
9421
9422 assert(res->base.b.target == PIPE_BUFFER);
9423
9424 /* Buffers can't be framebuffer attachments, nor display related,
9425 * and we don't have upstream Clover support.
9426 */
9427 assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
9428 PIPE_BIND_RENDER_TARGET |
9429 PIPE_BIND_BLENDABLE |
9430 PIPE_BIND_DISPLAY_TARGET |
9431 PIPE_BIND_CURSOR |
9432 PIPE_BIND_COMPUTE_RESOURCE |
9433 PIPE_BIND_GLOBAL)));
9434
9435 if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
9436 uint64_t bound_vbs = ice->state.bound_vertex_buffers;
9437 while (bound_vbs) {
9438 const int i = u_bit_scan64(&bound_vbs);
9439 struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
9440
9441 /* Update the CPU struct */
9442 STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
9443 STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
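         /* Per the asserts above, the 64-bit buffer address occupies bits
          * 95:32 of the packed VERTEX_BUFFER_STATE (DWords 1-2), so it can
          * be aliased as a uint64_t and patched in place.
          */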
9444 uint64_t *addr = (uint64_t *) &state->state[1];
9445 struct iris_bo *bo = iris_resource_bo(state->resource);
9446
9447 if (*addr != bo->address + state->offset) {
9448 *addr = bo->address + state->offset;
9449 ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
9450 IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
9451 }
9452 }
9453 }
9454
9455 /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
9456 * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
9457 *
9458 * There is also no need to handle these:
9459 * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
9460 * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
9461 */
9462
9463 if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
9464 uint32_t *so_buffers = genx->so_buffers;
9465 for (unsigned i = 0; i < 4; i++,
9466 so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
9467
9468 /* There are no other fields in bits 127:64 */
9469 uint64_t *addr = (uint64_t *) &so_buffers[2];
9470 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
9471 STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
9472
9473 struct pipe_stream_output_target *tgt = ice->state.so_target[i];
9474 if (tgt) {
9475 struct iris_bo *bo = iris_resource_bo(tgt->buffer);
9476 if (*addr != bo->address + tgt->buffer_offset) {
9477 *addr = bo->address + tgt->buffer_offset;
9478 ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
9479 }
9480 }
9481 }
9482 }
9483
9484 for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
9485 struct iris_shader_state *shs = &ice->state.shaders[s];
9486 enum pipe_shader_type p_stage = stage_to_pipe(s);
9487
9488 if (!(res->bind_stages & (1 << s)))
9489 continue;
9490
9491 if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
9492 /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
9493 uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
9494 while (bound_cbufs) {
9495 const int i = u_bit_scan(&bound_cbufs);
9496 struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
9497 struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
9498
9499 if (res->bo == iris_resource_bo(cbuf->buffer)) {
9500 pipe_resource_reference(&surf_state->res, NULL);
9501 shs->dirty_cbufs |= 1u << i;
9502 ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
9503 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
9504 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
9505 }
9506 }
9507 }
9508
9509 if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
9510 uint32_t bound_ssbos = shs->bound_ssbos;
9511 while (bound_ssbos) {
9512 const int i = u_bit_scan(&bound_ssbos);
9513 struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
9514
9515 if (res->bo == iris_resource_bo(ssbo->buffer)) {
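               /* Rebind through the normal set_shader_buffers path so the
                * SSBO surface state is recreated at the buffer's new
                * address, preserving the writable flag.
                */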
9516 struct pipe_shader_buffer buf = {
9517 .buffer = &res->base.b,
9518 .buffer_offset = ssbo->buffer_offset,
9519 .buffer_size = ssbo->buffer_size,
9520 };
9521 iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
9522 (shs->writable_ssbos >> i) & 1);
9523 }
9524 }
9525 }
9526
9527 if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
9528 int i;
9529 BITSET_FOREACH_SET(i, shs->bound_sampler_views, IRIS_MAX_TEXTURES) {
9530 struct iris_sampler_view *isv = shs->textures[i];
9531 struct iris_bo *bo = isv->res->bo;
9532
9533 if (update_surface_state_addrs(ice->state.surface_uploader,
9534 &isv->surface_state, bo)) {
9535 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9536 }
9537 }
9538 }
9539
9540 if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
9541 uint64_t bound_image_views = shs->bound_image_views;
9542 while (bound_image_views) {
9543 const int i = u_bit_scan64(&bound_image_views);
9544 struct iris_image_view *iv = &shs->image[i];
9545 struct iris_bo *bo = iris_resource_bo(iv->base.resource);
9546
9547 if (update_surface_state_addrs(ice->state.surface_uploader,
9548 &iv->surface_state, bo)) {
9549 ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
9550 }
9551 }
9552 }
9553 }
9554 }
9555
9556 /* ------------------------------------------------------------------- */
9557
9558 /**
9559 * Introduce a batch synchronization boundary, and update its cache coherency
9560 * status to reflect the execution of a PIPE_CONTROL command with the
9561 * specified flags.
9562 */
9563 static void
9564 batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
9565 {
9566 const struct intel_device_info *devinfo = batch->screen->devinfo;
9567
9568 iris_batch_sync_boundary(batch);
9569
9570 if ((flags & PIPE_CONTROL_CS_STALL)) {
9571 if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9572 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9573
9574 if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9575 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9576
9577 if ((flags & PIPE_CONTROL_TILE_CACHE_FLUSH)) {
9578 /* A tile cache flush makes any C/Z data in L3 visible to memory. */
9579 const unsigned c = IRIS_DOMAIN_RENDER_WRITE;
9580 const unsigned z = IRIS_DOMAIN_DEPTH_WRITE;
9581 batch->coherent_seqnos[c][c] = batch->l3_coherent_seqnos[c];
9582 batch->coherent_seqnos[z][z] = batch->l3_coherent_seqnos[z];
9583 }
9584
9585 if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9586 /* HDC and DC flushes both flush the data cache out to L3 */
9587 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9588 }
9589
9590 if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) {
9591 /* A DC flush also flushes L3 data cache lines out to memory. */
9592 const unsigned i = IRIS_DOMAIN_DATA_WRITE;
9593 batch->coherent_seqnos[i][i] = batch->l3_coherent_seqnos[i];
9594 }
9595
9596 if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9597 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9598
9599 if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
9600 PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
9601 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
9602 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9603 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9604 iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
9605 }
9606 }
9607
9608 if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
9609 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
9610
9611 if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
9612 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
9613
9614 if (flags & (PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_DATA_CACHE_FLUSH))
9615 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
9616
9617 if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
9618 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
9619
9620 if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
9621 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
9622
9623 if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE))
9624 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_SAMPLER_READ);
9625
9626 /* Technically, to invalidate IRIS_DOMAIN_PULL_CONSTANT_READ, we need
9627 * both "Constant Cache Invalidate" and either "Texture Cache Invalidate"
9628 * or "Data Cache Flush" set, depending on the setting of
9629 * iris_indirect_ubos_use_sampler().
9630 *
9631 * However, "Data Cache Flush" and "Constant Cache Invalidate" will never
9632 * appear in the same PIPE_CONTROL command, because one is bottom-of-pipe
9633 * while the other is top-of-pipe. Because we only look at one flush at
9634 * a time, we won't see both together.
9635 *
9636 * To deal with this, we mark it as invalidated when the constant cache
9637 * is invalidated, and trust the callers to also flush the other related
9638 * cache correctly at the same time.
9639 */
9640 if ((flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
9641 iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_PULL_CONSTANT_READ);
9642
9643 /* IRIS_DOMAIN_OTHER_READ no longer uses any caches. */
9644
9645 if ((flags & PIPE_CONTROL_L3_RO_INVALIDATE_BITS) == PIPE_CONTROL_L3_RO_INVALIDATE_BITS) {
9646 /* If we just invalidated the read-only lines of L3, then writes from non-L3-coherent
9647 * domains will now be visible to those L3 clients.
9648 */
9649 for (unsigned i = 0; i < NUM_IRIS_DOMAINS; i++) {
9650 if (!iris_domain_is_l3_coherent(devinfo, i))
9651 batch->l3_coherent_seqnos[i] = batch->coherent_seqnos[i][i];
9652 }
9653 }
9654 }
9655
9656 static unsigned
9657 flags_to_post_sync_op(uint32_t flags)
9658 {
9659 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
9660 return WriteImmediateData;
9661
9662 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
9663 return WritePSDepthCount;
9664
9665 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
9666 return WriteTimestamp;
9667
9668 return 0;
9669 }
9670
9671 /**
9672 * Do the given flags have a Post Sync or LRI Post Sync operation?
9673 */
9674 static enum pipe_control_flags
9675 get_post_sync_flags(enum pipe_control_flags flags)
9676 {
9677 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
9678 PIPE_CONTROL_WRITE_DEPTH_COUNT |
9679 PIPE_CONTROL_WRITE_TIMESTAMP |
9680 PIPE_CONTROL_LRI_POST_SYNC_OP;
9681
9682 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
9683 * "LRI Post Sync Operation". So more than one bit set would be illegal.
9684 */
9685 assert(util_bitcount(flags) <= 1);
9686
9687 return flags;
9688 }
9689
9690 #define IS_COMPUTE_PIPELINE(batch) (batch->name == IRIS_BATCH_COMPUTE)
9691
9692 /**
9693 * Emit a series of PIPE_CONTROL commands, taking into account any
9694 * workarounds necessary to actually accomplish the caller's request.
9695 *
9696 * Unless otherwise noted, spec quotations in this function come from:
9697 *
9698 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
9699 * Restrictions for PIPE_CONTROL.
9700 *
9701 * You should not use this function directly. Use the helpers in
9702 * iris_pipe_control.c instead, which may split the pipe control further.
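 *
 * A typical caller goes through something like (hypothetical flag choice):
 *
 *    iris_emit_pipe_control_flush(batch, "reason for the flush",
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                 PIPE_CONTROL_CS_STALL);
 *
 * which applies the workarounds below and may emit more than one packet.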
9703 */
9704 static void
9705 iris_emit_raw_pipe_control(struct iris_batch *batch,
9706 const char *reason,
9707 uint32_t flags,
9708 struct iris_bo *bo,
9709 uint32_t offset,
9710 uint64_t imm)
9711 {
9712 UNUSED const struct intel_device_info *devinfo = batch->screen->devinfo;
9713 enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
9714 enum pipe_control_flags non_lri_post_sync_flags =
9715 post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
9716
9717 #if GFX_VER >= 12
9718 if (batch->name == IRIS_BATCH_BLITTER) {
9719 batch_mark_sync_for_pipe_control(batch, flags);
9720 iris_batch_sync_region_start(batch);
9721
9722 assert(!(flags & PIPE_CONTROL_WRITE_DEPTH_COUNT));
9723
9724 /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
9725 if (intel_needs_workaround(batch->screen->devinfo, 16018063123))
9726 batch_emit_fast_color_dummy_blit(batch);
9727
9728 /* The blitter doesn't actually use PIPE_CONTROL; rather it uses the
9729 * MI_FLUSH_DW command. However, all of our code is set up to flush
9730 * via emitting a pipe control, so we just translate it at this point,
9731 * even if it is a bit hacky.
9732 */
9733 iris_emit_cmd(batch, GENX(MI_FLUSH_DW), fd) {
9734 fd.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
9735 fd.ImmediateData = imm;
9736 fd.PostSyncOperation = flags_to_post_sync_op(flags);
9737 #if GFX_VERx10 >= 125
9738 /* TODO: This may not always be necessary */
9739 fd.FlushCCS = true;
9740 #endif
9741 }
9742 iris_batch_sync_region_end(batch);
9743 return;
9744 }
9745 #endif
9746
9747 /* The "L3 Read Only Cache Invalidation Bit" docs say it "controls the
9748 * invalidation of the Geometry streams cached in L3 cache at the top
9749 * of the pipe". In other words, index & vertex data that gets cached
9750 * in L3 when VERTEX_BUFFER_STATE::L3BypassDisable is set.
9751 *
9752 * Normally, invalidating L1/L2 read-only caches also invalidate their
9753 * related L3 cachelines, but this isn't the case for the VF cache.
9754 * Emulate it by setting the L3 Read Only bit when doing a VF invalidate.
9755 */
9756 if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)
9757 flags |= PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
9758
9759 /* Recursive PIPE_CONTROL workarounds --------------------------------
9760 * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
9761 *
9762 * We do these first because we want to look at the original operation,
9763 * rather than any workarounds we set.
9764 */
9765 if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
9766 /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
9767 * lists several workarounds:
9768 *
9769 * "Project: SKL, KBL, BXT
9770 *
9771 * If the VF Cache Invalidation Enable is set to a 1 in a
9772 * PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
9773 * sets to 0, with the VF Cache Invalidation Enable set to 0
9774 * needs to be sent prior to the PIPE_CONTROL with VF Cache
9775 * Invalidation Enable set to a 1."
9776 */
9777 iris_emit_raw_pipe_control(batch,
9778 "workaround: recursive VF cache invalidate",
9779 0, NULL, 0, 0);
9780 }
9781
9782 if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
9783 /* Project: SKL / Argument: LRI Post Sync Operation [23]
9784 *
9785 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9786 * programmed prior to programming a PIPECONTROL command with "LRI
9787 * Post Sync Operation" in GPGPU mode of operation (i.e when
9788 * PIPELINE_SELECT command is set to GPGPU mode of operation)."
9789 *
9790 * The same text exists a few rows below for Post Sync Op.
9791 */
9792 iris_emit_raw_pipe_control(batch,
9793 "workaround: CS stall before gpgpu post-sync",
9794 PIPE_CONTROL_CS_STALL, bo, offset, imm);
9795 }
9796
9797 /* "Flush Types" workarounds ---------------------------------------------
9798 * We do these now because they may add post-sync operations or CS stalls.
9799 */
9800
9801 if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
9802 /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
9803 *
9804 * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
9805 * 'Write PS Depth Count' or 'Write Timestamp'."
9806 */
9807 if (!bo) {
9808 flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9809 post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9810 non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
9811 bo = batch->screen->workaround_address.bo;
9812 offset = batch->screen->workaround_address.offset;
9813 }
9814 }
9815
9816 if (flags & PIPE_CONTROL_DEPTH_STALL) {
9817 /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
9818 *
9819 * "This bit must be DISABLED for operations other than writing
9820 * PS_DEPTH_COUNT."
9821 *
9822 * This seems like nonsense. An Ivybridge workaround requires us to
9823 * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
9824 * operation. Gfx8+ requires us to emit depth stalls and depth cache
9825 * flushes together. So, it's hard to imagine this means anything other
9826 * than "we originally intended this to be used for PS_DEPTH_COUNT".
9827 *
9828 * We ignore the supposed restriction and do nothing.
9829 */
9830 }
9831
9832 if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
9833 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9834 /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
9835 *
9836 * "This bit must be DISABLED for End-of-pipe (Read) fences,
9837 * PS_DEPTH_COUNT or TIMESTAMP queries."
9838 *
9839 * TODO: Implement end-of-pipe checking.
9840 */
9841 assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
9842 PIPE_CONTROL_WRITE_TIMESTAMP)));
9843 }
9844
9845 if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
9846 /* From the PIPE_CONTROL instruction table, bit 1:
9847 *
9848 * "This bit is ignored if Depth Stall Enable is set.
9849 * Further, the render cache is not flushed even if Write Cache
9850 * Flush Enable bit is set."
9851 *
9852 * We assert that the caller doesn't do this combination, to try and
9853 * prevent mistakes. It shouldn't hurt the GPU, though.
9854 *
9855 * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
9856 * and "Render Target Flush" combo is explicitly required for BTI
9857 * update workarounds.
9858 */
9859 assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
9860 PIPE_CONTROL_RENDER_TARGET_FLUSH)));
9861 }
9862
9863 /* PIPE_CONTROL page workarounds ------------------------------------- */
9864
9865 if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
9866 /* From the PIPE_CONTROL page itself:
9867 *
9868 * "IVB, HSW, BDW
9869 * Restriction: Pipe_control with CS-stall bit set must be issued
9870 * before a pipe-control command that has the State Cache
9871 * Invalidate bit set."
9872 */
9873 flags |= PIPE_CONTROL_CS_STALL;
9874 }
9875
9876 if (flags & PIPE_CONTROL_FLUSH_LLC) {
9877 /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
9878 *
9879 * "Project: ALL
9880 * SW must always program Post-Sync Operation to "Write Immediate
9881 * Data" when Flush LLC is set."
9882 *
9883 * For now, we just require the caller to do it.
9884 */
9885 assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
9886 }
9887
9888 /* Emulate a HDC flush with a full Data Cache Flush on older hardware which
9889 * doesn't support the new lightweight flush.
9890 */
9891 #if GFX_VER < 12
9892 if (flags & PIPE_CONTROL_FLUSH_HDC)
9893 flags |= PIPE_CONTROL_DATA_CACHE_FLUSH;
9894 #endif
9895
9896 /* "Post-Sync Operation" workarounds -------------------------------- */
9897
9898 /* Project: All / Argument: Global Snapshot Count Reset [19]
9899 *
9900 * "This bit must not be exercised on any product.
9901 * Requires stall bit ([20] of DW1) set."
9902 *
9903 * We don't use this, so we just assert that it isn't used. The
9904 * PIPE_CONTROL instruction page indicates that they intended this
9905 * as a debug feature and don't think it is useful in production,
9906 * but it may actually be usable, should we ever want to.
9907 */
9908 assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
9909
9910 if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
9911 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
9912 /* Project: All / Arguments:
9913 *
9914 * - Generic Media State Clear [16]
9915 * - Indirect State Pointers Disable [16]
9916 *
9917 * "Requires stall bit ([20] of DW1) set."
9918 *
9919 * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
9920 * State Clear) says:
9921 *
9922 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
9923 * programmed prior to programming a PIPECONTROL command with "Media
9924 * State Clear" set in GPGPU mode of operation"
9925 *
9926 * This is a subset of the earlier rule, so there's nothing to do.
9927 */
9928 flags |= PIPE_CONTROL_CS_STALL;
9929 }
9930
9931 if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
9932 /* Project: All / Argument: Store Data Index
9933 *
9934 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9935 * than '0'."
9936 *
9937 * For now, we just assert that the caller does this. We might want to
9938 * automatically add a write to the workaround BO...
9939 */
9940 assert(non_lri_post_sync_flags != 0);
9941 }
9942
9943 if (flags & PIPE_CONTROL_SYNC_GFDT) {
9944 /* Project: All / Argument: Sync GFDT
9945 *
9946 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
9947 * than '0' or 0x2520[13] must be set."
9948 *
9949 * For now, we just assert that the caller does this.
9950 */
9951 assert(non_lri_post_sync_flags != 0);
9952 }
9953
9954 if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
9955 /* Project: IVB+ / Argument: TLB inv
9956 *
9957 * "Requires stall bit ([20] of DW1) set."
9958 *
9959 * Also, from the PIPE_CONTROL instruction table:
9960 *
9961 * "Project: SKL+
9962 * Post Sync Operation or CS stall must be set to ensure a TLB
9963 * invalidation occurs. Otherwise no cycle will occur to the TLB
9964 * cache to invalidate."
9965 *
9966        * This isn't a subset of the earlier rule, but the CS stall we set
       * below satisfies it as well, so there's nothing further to do.
9967 */
9968 flags |= PIPE_CONTROL_CS_STALL;
9969 }
9970
9971 if (GFX_VER == 9 && devinfo->gt == 4) {
9972 /* TODO: The big Skylake GT4 post sync op workaround */
9973 }
9974
9975 /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
9976
9977 if (IS_COMPUTE_PIPELINE(batch)) {
9978 if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
9979 /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
9980 * PIPE_CONTROL, Flush Types:
9981 * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
9982 * For newer platforms this is documented in the PIPE_CONTROL
9983 * instruction page.
9984 */
9985 flags |= PIPE_CONTROL_CS_STALL;
9986 }
9987
9988 if (GFX_VER == 8 && (post_sync_flags ||
9989 (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
9990 PIPE_CONTROL_DEPTH_STALL |
9991 PIPE_CONTROL_RENDER_TARGET_FLUSH |
9992 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
9993 PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
9994 /* Project: BDW / Arguments:
9995 *
9996 * - LRI Post Sync Operation [23]
9997 * - Post Sync Op [15:14]
9998 * - Notify En [8]
9999 * - Depth Stall [13]
10000 * - Render Target Cache Flush [12]
10001 * - Depth Cache Flush [0]
10002 * - DC Flush Enable [5]
10003 *
10004 * "Requires stall bit ([20] of DW) set for all GPGPU and Media
10005 * Workloads."
10006 */
10007 flags |= PIPE_CONTROL_CS_STALL;
10008
10009 /* Also, from the PIPE_CONTROL instruction table, bit 20:
10010 *
10011 * "Project: BDW
10012 * This bit must be always set when PIPE_CONTROL command is
10013 * programmed by GPGPU and MEDIA workloads, except for the cases
10014 * when only Read Only Cache Invalidation bits are set (State
10015 * Cache Invalidation Enable, Instruction cache Invalidation
10016 * Enable, Texture Cache Invalidation Enable, Constant Cache
10017 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
10018 * need not implemented when FF_DOP_CG is disable via "Fixed
10019 * Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
10020 *
10021 * It sounds like we could avoid CS stalls in some cases, but we
10022 * don't currently bother. This list isn't exactly the list above,
10023 * either...
10024 */
10025 }
10026 }
10027
10028 /* "Stall" workarounds ----------------------------------------------
10029 * These have to come after the earlier ones because we may have added
10030 * some additional CS stalls above.
10031 */
10032
10033 if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
10034 /* Project: PRE-SKL, VLV, CHV
10035 *
10036 * "[All Stepping][All SKUs]:
10037 *
10038 * One of the following must also be set:
10039 *
10040 * - Render Target Cache Flush Enable ([12] of DW1)
10041 * - Depth Cache Flush Enable ([0] of DW1)
10042 * - Stall at Pixel Scoreboard ([1] of DW1)
10043 * - Depth Stall ([13] of DW1)
10044 * - Post-Sync Operation ([13] of DW1)
10045 * - DC Flush Enable ([5] of DW1)"
10046 *
10047 * If we don't already have one of those bits set, we choose to add
10048 * "Stall at Pixel Scoreboard". Some of the other bits require a
10049 * CS stall as a workaround (see above), which would send us into
10050 * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
10051 * appears to be safe, so we choose that.
10052 */
10053 const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
10054 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
10055 PIPE_CONTROL_WRITE_IMMEDIATE |
10056 PIPE_CONTROL_WRITE_DEPTH_COUNT |
10057 PIPE_CONTROL_WRITE_TIMESTAMP |
10058 PIPE_CONTROL_STALL_AT_SCOREBOARD |
10059 PIPE_CONTROL_DEPTH_STALL |
10060 PIPE_CONTROL_DATA_CACHE_FLUSH;
10061 if (!(flags & wa_bits))
10062 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
10063 }
10064
10065 if (INTEL_NEEDS_WA_1409600907 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
10066 /* Wa_1409600907:
10067 *
10068 * "PIPE_CONTROL with Depth Stall Enable bit must be set
10069 * with any PIPE_CONTROL with Depth Flush Enable bit set.
10070     * with any PIPE_CONTROL with Depth Flush Enable bit set."
10071 flags |= PIPE_CONTROL_DEPTH_STALL;
10072 }
10073
10074 /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
10075 * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
10076 * with CS_STALL Bit set (with No POST_SYNC ENABLED)
10077 */
10078 if (intel_device_info_is_adln(devinfo) &&
10079 IS_COMPUTE_PIPELINE(batch) &&
10080 flags_to_post_sync_op(flags) != NoWrite) {
10081 iris_emit_raw_pipe_control(batch, "Wa_14014966230",
10082 PIPE_CONTROL_CS_STALL, NULL, 0, 0);
10083 }
10084
10085 batch_mark_sync_for_pipe_control(batch, flags);
10086
10087 #if INTEL_NEEDS_WA_14010840176
10088 /* "If the intention of “constant cache invalidate” is
10089 * to invalidate the L1 cache (which can cache constants), use “HDC
10090 * pipeline flush” instead of Constant Cache invalidate command."
10091 *
10092 * "If L3 invalidate is needed, the w/a should be to set state invalidate
10093 * in the pipe control command, in addition to the HDC pipeline flush."
10094 */
10095 if (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) {
10096 flags &= ~PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10097 flags |= PIPE_CONTROL_FLUSH_HDC | PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10098 }
10099 #endif
10100
10101 /* Emit --------------------------------------------------------------- */
10102
10103 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
10104 fprintf(stderr,
10105 " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
10106 (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
10107 (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
10108 (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
10109 (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
10110 (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
10111 (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
10112 (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
10113 (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
10114 (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
10115 (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
10116 (flags & PIPE_CONTROL_L3_FABRIC_FLUSH) ? "L3Fabric " : "",
10117 (flags & PIPE_CONTROL_CCS_CACHE_FLUSH) ? "CCS " : "",
10118 (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
10119 (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
10120 (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
10121 (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
10122 (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
10123 (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
10124 (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
10125 "SnapRes" : "",
10126 (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
10127 "ISPDis" : "",
10128 (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
10129 (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
10130 (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
10131 (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
10132 (flags & PIPE_CONTROL_PSS_STALL_SYNC) ? "PSS " : "",
10133 (flags & PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH) ? "UntypedDataPortCache " : "",
10134 imm, reason);
10135 }
10136
10137 iris_batch_sync_region_start(batch);
10138
10139 const bool trace_pc =
10140 (flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CACHE_INVALIDATE_BITS)) != 0;
10141
10142 if (trace_pc)
10143 trace_intel_begin_stall(&batch->trace);
10144
10145 iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
10146 #if GFX_VERx10 >= 125
10147 pc.PSSStallSyncEnable = flags & PIPE_CONTROL_PSS_STALL_SYNC;
10148 #endif
10149 #if GFX_VER == 12
10150 pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
10151 pc.L3FabricFlush = flags & PIPE_CONTROL_L3_FABRIC_FLUSH;
10152 #endif
10153 #if GFX_VER > 11
10154 pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
10155 #endif
10156 #if GFX_VERx10 >= 125
10157 pc.UntypedDataPortCacheFlushEnable =
10158 (flags & (PIPE_CONTROL_UNTYPED_DATAPORT_CACHE_FLUSH |
10159 PIPE_CONTROL_FLUSH_HDC |
10160 PIPE_CONTROL_DATA_CACHE_FLUSH)) &&
10161 IS_COMPUTE_PIPELINE(batch);
10162 pc.HDCPipelineFlushEnable |= pc.UntypedDataPortCacheFlushEnable;
10163 pc.CCSFlushEnable |= flags & PIPE_CONTROL_CCS_CACHE_FLUSH;
10164 #endif
10165 pc.LRIPostSyncOperation = NoLRIOperation;
10166 pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
10167 pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
10168 pc.StoreDataIndex = 0;
10169 pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
10170 #if GFX_VERx10 < 125
10171 pc.GlobalSnapshotCountReset =
10172 flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
10173 #endif
10174 pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
10175 #if GFX_VERx10 < 200
10176 pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
10177 #endif
10178 pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
10179 pc.RenderTargetCacheFlushEnable =
10180 flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
10181 pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
10182 pc.StateCacheInvalidationEnable =
10183 flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
10184 #if GFX_VER >= 12
10185 pc.L3ReadOnlyCacheInvalidationEnable =
10186 flags & PIPE_CONTROL_L3_READ_ONLY_CACHE_INVALIDATE;
10187 #endif
10188 pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
10189 pc.ConstantCacheInvalidationEnable =
10190 flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
10191 pc.PostSyncOperation = flags_to_post_sync_op(flags);
10192 pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
10193 pc.InstructionCacheInvalidateEnable =
10194 flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
10195 pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
10196 pc.IndirectStatePointersDisable =
10197 flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
10198 pc.TextureCacheInvalidationEnable =
10199 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
10200 pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
10201 pc.ImmediateData = imm;
10202 }
10203
10204 if (trace_pc) {
10205 trace_intel_end_stall(&batch->trace, flags,
10206 iris_utrace_pipe_flush_bit_to_ds_stall_flag,
10207                           reason, 0, 0, 0);
10208 }
10209
10210 iris_batch_sync_region_end(batch);
10211 }
10212
10213 #if GFX_VER == 9
10214 /**
10215 * Preemption on Gfx9 has to be enabled or disabled in various cases.
10216 *
10217 * See these workarounds for preemption:
10218 * - WaDisableMidObjectPreemptionForGSLineStripAdj
10219 * - WaDisableMidObjectPreemptionForTrifanOrPolygon
10220 * - WaDisableMidObjectPreemptionForLineLoop
10221 * - WA#0798
10222 *
10223 * We don't put this in the vtable because it's only used on Gfx9.
10224 */
10225 void
10226 gfx9_toggle_preemption(struct iris_context *ice,
10227 struct iris_batch *batch,
10228 const struct pipe_draw_info *draw)
10229 {
10230 struct iris_genx_state *genx = ice->state.genx;
10231 bool object_preemption = true;
10232
10233 /* WaDisableMidObjectPreemptionForGSLineStripAdj
10234 *
10235 * "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
10236 * and GS is enabled."
10237 */
10238 if (draw->mode == MESA_PRIM_LINE_STRIP_ADJACENCY &&
10239 ice->shaders.prog[MESA_SHADER_GEOMETRY])
10240 object_preemption = false;
10241
10242 /* WaDisableMidObjectPreemptionForTrifanOrPolygon
10243 *
10244 * "TriFan miscompare in Execlist Preemption test. Cut index that is
10245 * on a previous context. End the previous, the resume another context
10246 * with a tri-fan or polygon, and the vertex count is corrupted. If we
10247 * prempt again we will cause corruption.
10248 *
10249 * WA: Disable mid-draw preemption when draw-call has a tri-fan."
10250 */
10251 if (draw->mode == MESA_PRIM_TRIANGLE_FAN)
10252 object_preemption = false;
10253
10254 /* WaDisableMidObjectPreemptionForLineLoop
10255 *
10256 * "VF Stats Counters Missing a vertex when preemption enabled.
10257 *
10258 * WA: Disable mid-draw preemption when the draw uses a lineloop
10259 * topology."
10260 */
10261 if (draw->mode == MESA_PRIM_LINE_LOOP)
10262 object_preemption = false;
10263
10264 /* WA#0798
10265 *
10266 * "VF is corrupting GAFS data when preempted on an instance boundary
10267 * and replayed with instancing enabled.
10268 *
10269 * WA: Disable preemption when using instanceing."
10270 */
10271 if (draw->instance_count > 1)
10272 object_preemption = false;
10273
10274 if (genx->object_preemption != object_preemption) {
10275 iris_enable_obj_preemption(batch, object_preemption);
10276 genx->object_preemption = object_preemption;
10277 }
10278 }
10279 #endif
10280
10281 static void
10282 iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
10283 {
10284 struct iris_genx_state *genx = ice->state.genx;
10285
10286 #if INTEL_NEEDS_WA_1808121037
10287 genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
10288 #endif
10289
10290 memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
10291 }
10292
10293 static void
10294 iris_emit_mi_report_perf_count(struct iris_batch *batch,
10295 struct iris_bo *bo,
10296 uint32_t offset_in_bytes,
10297 uint32_t report_id)
10298 {
10299 iris_batch_sync_region_start(batch);
10300 iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
10301 mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
10302 IRIS_DOMAIN_OTHER_WRITE);
10303 mi_rpc.ReportID = report_id;
10304 }
10305 iris_batch_sync_region_end(batch);
10306 }
10307
10308 /**
10309 * Update the pixel hashing modes that determine the balancing of PS threads
10310 * across subslices and slices.
10311 *
10312 * \param width Width bound of the rendering area (already scaled down if \p
10313 * scale is greater than 1).
10314 * \param height Height bound of the rendering area (already scaled down if \p
10315 * scale is greater than 1).
10316 * \param scale The number of framebuffer samples that could potentially be
10317 * affected by an individual channel of the PS thread. This is
10318 * typically one for single-sampled rendering, but for operations
10319 * like CCS resolves and fast clears a single PS invocation may
10320 * update a huge number of pixels, in which case a finer
10321 * balancing is desirable in order to maximally utilize the
10322 * bandwidth available. UINT_MAX can be used as shorthand for
10323 * "finest hashing mode available".
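 *
 * A hypothetical fast-clear path might, for instance, call
 * genX(emit_hashing_mode)(ice, batch, width, height, UINT_MAX) to request
 * the finest balancing available, while ordinary draws would pass
 * scale == 1.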
10324 */
10325 void
10326 genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
10327 unsigned width, unsigned height, unsigned scale)
10328 {
10329 #if GFX_VER == 9
10330 const struct intel_device_info *devinfo = batch->screen->devinfo;
10331 const unsigned slice_hashing[] = {
10332 /* Because all Gfx9 platforms with more than one slice require
10333 * three-way subslice hashing, a single "normal" 16x16 slice hashing
10334 * block is guaranteed to suffer from substantial imbalance, with one
10335 * subslice receiving twice as much work as the other two in the
10336 * slice.
10337 *
10338 * The performance impact of that would be particularly severe when
10339 * three-way hashing is also in use for slice balancing (which is the
10340 * case for all Gfx9 GT4 platforms), because one of the slices
10341 * receives one every three 16x16 blocks in either direction, which
10342 * is roughly the periodicity of the underlying subslice imbalance
10343 * pattern ("roughly" because in reality the hardware's
10344 * implementation of three-way hashing doesn't do exact modulo 3
10345 * arithmetic, which somewhat decreases the magnitude of this effect
10346 * in practice). This leads to a systematic subslice imbalance
10347 * within that slice regardless of the size of the primitive. The
10348 * 32x32 hashing mode guarantees that the subslice imbalance within a
10349 * single slice hashing block is minimal, largely eliminating this
10350 * effect.
10351 */
10352 _32x32,
10353 /* Finest slice hashing mode available. */
10354 NORMAL
10355 };
10356 const unsigned subslice_hashing[] = {
10357 /* 16x16 would provide a slight cache locality benefit especially
10358 * visible in the sampler L1 cache efficiency of low-bandwidth
10359 * non-LLC platforms, but it comes at the cost of greater subslice
10360 * imbalance for primitives of dimensions approximately intermediate
10361 * between 16x4 and 16x16.
10362 */
10363 _16x4,
10364 /* Finest subslice hashing mode available. */
10365 _8x4
10366 };
10367 /* Dimensions of the smallest hashing block of a given hashing mode. If
10368 * the rendering area is smaller than this there can't possibly be any
10369 * benefit from switching to this mode, so we optimize out the
10370 * transition.
10371 */
10372 const unsigned min_size[][2] = {
10373 { 16, 4 },
10374 { 8, 4 }
10375 };
10376 const unsigned idx = scale > 1;
10377
10378 if (width > min_size[idx][0] || height > min_size[idx][1]) {
10379 iris_emit_raw_pipe_control(batch,
10380 "workaround: CS stall before GT_MODE LRI",
10381 PIPE_CONTROL_STALL_AT_SCOREBOARD |
10382 PIPE_CONTROL_CS_STALL,
10383 NULL, 0, 0);
10384
10385 iris_emit_reg(batch, GENX(GT_MODE), reg) {
10386 reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
10387 reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
10388 reg.SubsliceHashing = subslice_hashing[idx];
10389 reg.SubsliceHashingMask = -1;
10390 };
10391
10392 ice->state.current_hash_scale = scale;
10393 }
10394 #endif
10395 }
10396
10397 static void
10398 iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
10399 {
10400 struct iris_context *ice = (struct iris_context *) ctx;
10401
10402 if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
10403 ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
10404 ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
10405 }
10406
10407 if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
10408 ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
10409 ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
10410 }
10411 }
10412
10413 void
10414 genX(init_screen_state)(struct iris_screen *screen)
10415 {
10416 assert(screen->devinfo->verx10 == GFX_VERx10);
10417 screen->vtbl.destroy_state = iris_destroy_state;
10418 screen->vtbl.init_render_context = iris_init_render_context;
10419 screen->vtbl.init_compute_context = iris_init_compute_context;
10420 screen->vtbl.init_copy_context = iris_init_copy_context;
10421 screen->vtbl.upload_render_state = iris_upload_render_state;
10422 screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state;
10423 screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state;
10424 screen->vtbl.update_binder_address = iris_update_binder_address;
10425 screen->vtbl.upload_compute_state = iris_upload_compute_state;
10426 screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
10427 screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
10428 screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
10429 screen->vtbl.rebind_buffer = iris_rebind_buffer;
10430 screen->vtbl.load_register_reg32 = iris_load_register_reg32;
10431 screen->vtbl.load_register_reg64 = iris_load_register_reg64;
10432 screen->vtbl.load_register_imm32 = iris_load_register_imm32;
10433 screen->vtbl.load_register_imm64 = iris_load_register_imm64;
10434 screen->vtbl.load_register_mem32 = iris_load_register_mem32;
10435 screen->vtbl.load_register_mem64 = iris_load_register_mem64;
10436 screen->vtbl.store_register_mem32 = iris_store_register_mem32;
10437 screen->vtbl.store_register_mem64 = iris_store_register_mem64;
10438 screen->vtbl.store_data_imm32 = iris_store_data_imm32;
10439 screen->vtbl.store_data_imm64 = iris_store_data_imm64;
10440 screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
10441 screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
10442 screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
10443 screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
10444 screen->vtbl.populate_vs_key = iris_populate_vs_key;
10445 screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
10446 screen->vtbl.populate_tes_key = iris_populate_tes_key;
10447 screen->vtbl.populate_gs_key = iris_populate_gs_key;
10448 screen->vtbl.populate_fs_key = iris_populate_fs_key;
10449 screen->vtbl.populate_cs_key = iris_populate_cs_key;
10450 screen->vtbl.lost_genx_state = iris_lost_genx_state;
10451 screen->vtbl.disable_rhwo_optimization = iris_disable_rhwo_optimization;
10452 }
10453
10454 void
10455 genX(init_state)(struct iris_context *ice)
10456 {
10457 struct pipe_context *ctx = &ice->ctx;
10458 struct iris_screen *screen = (struct iris_screen *)ctx->screen;
10459
10460 ctx->create_blend_state = iris_create_blend_state;
10461 ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
10462 ctx->create_rasterizer_state = iris_create_rasterizer_state;
10463 ctx->create_sampler_state = iris_create_sampler_state;
10464 ctx->create_sampler_view = iris_create_sampler_view;
10465 ctx->create_surface = iris_create_surface;
10466 ctx->create_vertex_elements_state = iris_create_vertex_elements;
10467 ctx->bind_blend_state = iris_bind_blend_state;
10468 ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
10469 ctx->bind_sampler_states = iris_bind_sampler_states;
10470 ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
10471 ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
10472 ctx->delete_blend_state = iris_delete_state;
10473 ctx->delete_depth_stencil_alpha_state = iris_delete_state;
10474 ctx->delete_rasterizer_state = iris_delete_state;
10475 ctx->delete_sampler_state = iris_delete_state;
10476 ctx->delete_vertex_elements_state = iris_delete_state;
10477 ctx->set_blend_color = iris_set_blend_color;
10478 ctx->set_clip_state = iris_set_clip_state;
10479 ctx->set_constant_buffer = iris_set_constant_buffer;
10480 ctx->set_shader_buffers = iris_set_shader_buffers;
10481 ctx->set_shader_images = iris_set_shader_images;
10482 ctx->set_sampler_views = iris_set_sampler_views;
10483 ctx->set_compute_resources = iris_set_compute_resources;
10484 ctx->set_global_binding = iris_set_global_binding;
10485 ctx->set_tess_state = iris_set_tess_state;
10486 ctx->set_patch_vertices = iris_set_patch_vertices;
10487 ctx->set_framebuffer_state = iris_set_framebuffer_state;
10488 ctx->set_polygon_stipple = iris_set_polygon_stipple;
10489 ctx->set_sample_mask = iris_set_sample_mask;
10490 ctx->set_scissor_states = iris_set_scissor_states;
10491 ctx->set_stencil_ref = iris_set_stencil_ref;
10492 ctx->set_vertex_buffers = iris_set_vertex_buffers;
10493 ctx->set_viewport_states = iris_set_viewport_states;
10494 ctx->sampler_view_destroy = iris_sampler_view_destroy;
10495 ctx->surface_destroy = iris_surface_destroy;
10496 ctx->draw_vbo = iris_draw_vbo;
10497 ctx->launch_grid = iris_launch_grid;
10498 ctx->create_stream_output_target = iris_create_stream_output_target;
10499 ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
10500 ctx->set_stream_output_targets = iris_set_stream_output_targets;
10501 ctx->set_frontend_noop = iris_set_frontend_noop;
10502
10503 ice->state.dirty = ~0ull;
10504 ice->state.stage_dirty = ~0ull;
10505
10506 ice->state.statistics_counters_enabled = true;
10507
10508 ice->state.sample_mask = 0xffff;
10509 ice->state.num_viewports = 1;
10510 ice->state.prim_mode = MESA_PRIM_COUNT;
10511 ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
10512 ice->draw.derived_params.drawid = -1;
10513
10514 #if GFX_VERx10 >= 120
10515 ice->state.genx->object_preemption = true;
10516 #endif
10517
10518 /* Make a 1x1x1 null surface for unbound textures */
10519 void *null_surf_map =
10520 upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
10521 4 * GENX(RENDER_SURFACE_STATE_length), 64);
10522 isl_null_fill_state(&screen->isl_dev, null_surf_map,
10523 .size = isl_extent3d(1, 1, 1));
10524 ice->state.unbound_tex.offset +=
10525 iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));
10526
10527 /* Default all scissor rectangles to be empty regions. */
10528 for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
10529 ice->state.scissors[i] = (struct pipe_scissor_state) {
10530 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
10531 };
10532 }
10533 }
10534