/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * @file crocus_state.c
 *
 * ============================= GENXML CODE =============================
 *              [This file is compiled once per generation.]
 * =======================================================================
 *
 * This is the main state upload code.
 *
 * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
 * complex, or highly reusable state can be created once, and bound and
 * rebound multiple times.  This is modeled with the pipe->create_*_state()
 * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
 * streamed out on the fly, via pipe->set_*_state() hooks.
 *
 * OpenGL involves frequently mutating context state, which is mirrored in
 * core Mesa by highly mutable data structures.  However, most applications
 * typically draw the same things over and over - from frame to frame, most
 * of the same objects are still visible and need to be redrawn.  So, rather
 * than inventing new state all the time, applications usually mutate to swap
 * between known states that we've seen before.
 *
 * Gallium isolates us from this mutation by tracking API state, and
 * distilling it into a set of Constant State Objects, or CSOs.  Large,
 * complex, or typically reusable state can be created once, then reused
 * multiple times.  Drivers can create and store their own associated data.
 * This create/bind model corresponds to the pipe->create_*_state() and
 * pipe->bind_*_state() driver hooks.
 *
 * Some state is cheap to create, or expected to be highly dynamic.  Rather
 * than creating and caching piles of CSOs for these, Gallium simply streams
 * them out, via the pipe->set_*_state() driver hooks.
 *
 * To reduce draw time overhead, we try to compute as much state at create
 * time as possible.  Wherever possible, we translate the Gallium pipe state
 * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
 * we can simply memcpy them into a batch buffer.
 *
 * No hardware matches the abstraction perfectly, so some commands require
 * information from multiple CSOs.  In this case, we can store two copies
 * of the packet (one in each CSO), and simply | together their DWords at
 * draw time.  Sometimes the second set is trivial (one or two fields), so
 * we simply pack it at draw time.
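 *
 * For illustration only (a sketch, not code from this file), merging a
 * rasterizer CSO's pre-packed 3DSTATE_SF DWords with dynamically packed
 * ones might look like:
 *
 *    uint32_t dyn_sf[GENX(3DSTATE_SF_length)];
 *    _crocus_pack_command(batch, GENX(3DSTATE_SF), dyn_sf, sf) {
 *       sf.LineWidth = ...;
 *    }
 *    for (int i = 0; i < GENX(3DSTATE_SF_length); i++)
 *       out[i] = cso_rast->sf[i] | dyn_sf[i];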
 *
 * There are two main components in the file below.  First, the CSO hooks
 * create/bind/track state.  The second are the draw-time upload functions,
 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
 * the context state and emit the commands into the actual batch.
 */

#include <errno.h>
#include <stdio.h>

#if HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x)
#endif

#include "drm-uapi/i915_drm.h"
#include "intel/common/intel_l3_config.h"
#include "intel/common/intel_sample_positions.h"
#include "intel/compiler/elk/elk_compiler.h"
#include "compiler/shader_info.h"
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/half_float.h"
#include "util/u_dual_blend.h"
#include "util/u_framebuffer.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "util/u_transfer.h"
#include "util/u_upload_mgr.h"
#include "util/u_viewport.h"
#include "crocus_batch.h"
#include "crocus_context.h"
#include "crocus_defines.h"
#include "crocus_pipe.h"
#include "crocus_resource.h"

#include "crocus_genx_macros.h"
#include "intel/common/intel_genX_state_elk.h"
#include "intel/common/intel_guardband.h"
#include "main/macros.h" /* UNCLAMPED_* */

/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blend_factor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware. */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}

static unsigned
translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [MESA_PRIM_POINTS] = _3DPRIM_POINTLIST,
      [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
      [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
      [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
      [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
      [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
      [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
      [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
      [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
      [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
#if GFX_VER >= 6
      [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
      [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
      [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
      [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
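      /* The _3DPRIM_PATCHLIST_n topologies are numbered consecutively, so
       * _3DPRIM_PATCHLIST_1 - 1 plus verts_per_patch (added below) selects
       * _3DPRIM_PATCHLIST_<verts_per_patch>.
       */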
      [MESA_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
#endif
   };

   return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
}

static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)
{
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
      [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
      [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
      [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
      [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
      [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
      [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
   };
   return map[pipe_func];
}

static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)
{
   /* Gallium specifies the result of shadow comparisons as:
    *
    *    1 if ref <op> texel,
    *    0 otherwise.
    *
    * The hardware does:
    *
    *    0 if texel <op> ref,
    *    1 otherwise.
    *
    * So we need to flip the operator and also negate.
    */
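   /* For example, PIPE_FUNC_LESS ("pass when ref < texel") maps to
    * PREFILTEROP_LEQUAL: the hardware yields 0 when texel <= ref, which
    * is 1 exactly when ref < texel.
    */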
   static const unsigned map[] = {
      [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
      [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
      [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
      [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
      [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
      [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
      [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
   };
   return map[pipe_func];
}

static unsigned
translate_cull_mode(unsigned pipe_face)
{
   static const unsigned map[4] = {
      [PIPE_FACE_NONE] = CULLMODE_NONE,
      [PIPE_FACE_FRONT] = CULLMODE_FRONT,
      [PIPE_FACE_BACK] = CULLMODE_BACK,
      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
   };
   return map[pipe_face];
}

#if GFX_VER >= 6
static unsigned
translate_fill_mode(unsigned pipe_polymode)
{
   static const unsigned map[4] = {
      [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
      [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
      [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
   };
   return map[pipe_polymode];
}
#endif

static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
{
   static const unsigned map[] = {
      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
      [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
      [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
   };
   return map[pipe_mip];
}

static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
#if GFX_VER == 8
      [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
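   /* If both the MIN and MAG filters are nearest, GL_CLAMP never actually
    * samples the border color, so it behaves the same as CLAMP_TO_EDGE;
    * pre-Gen8 hardware has no native half-border mode, so we rely on that
    * equivalence here.
    */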
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}

/**
 * Equivalent of elk_state_batch().
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   return (uint32_t *)batch->state.map + (offset >> 2);
}

/**
 * stream_state() + memcpy.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   unsigned offset = 0;
   uint32_t *map = stream_state(batch, size, alignment, &offset);

   if (map)
      memcpy(map, data, size);

   return offset;
}

#if GFX_VER <= 5
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}

#endif
/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
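
/* Illustrative usage (a sketch, not code from this file): in a
 * bind_*_state() hook with 'old_cso' and 'new_cso' in scope, one might
 * write:
 *
 *    if (cso_changed(cso.line_width))
 *       ice->state.dirty |= <the appropriate CROCUS_DIRTY_* bits>;
 */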

static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}

static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according to the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}

#if GFX_VER >= 6
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}

static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
   crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
}
#endif

#if GFX_VER >= 7
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = val;
   }
}
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
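/* For example, crocus_emit_lri(batch, L3SQCREG1, value) expands to
 * _crocus_emit_lri(batch, GENX(L3SQCREG1_num), value), picking up the
 * per-generation register offset from genxml.
 */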

#if GFX_VERx10 >= 75
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}

static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
   _crocus_emit_lrr(batch, dst + 4, src + 4);
}
#endif

static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}

static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
   _crocus_emit_lri(batch, reg + 4, val >> 32);
}

/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}

/**
 * Load a 64-bit value from a buffer into a MMIO register via
 * two MI_LOAD_REGISTER_MEM commands.
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
   crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
}

#if GFX_VERx10 >= 75
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}

static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
#endif

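/**
 * Copy 'bytes' (a multiple of 4) between two buffers by bouncing each
 * DWord through a scratch MMIO register, using MI_LOAD_REGISTER_MEM +
 * MI_STORE_REGISTER_MEM pairs.
 */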
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
#endif

/**
 * Gallium CSO for rasterizer state.
 */
struct crocus_rasterizer_state {
   struct pipe_rasterizer_state cso;
#if GFX_VER >= 6
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   uint8_t num_clip_plane_consts;
   bool fill_mode_point_or_line;
};

#if GFX_VER <= 5
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },    /* vs */
   { 4, 8, 1, 5 },      /* gs */
   { 5, 10, 1, 5 },     /* clp */
   { 1, 8, 1, 12 },     /* sf */
   { 1, 4, 1, 32 }      /* cs */
};

static bool check_urb_layout(struct crocus_context *ice)
{
   ice->urb.vs_start = 0;
   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;

   return ice->urb.cs_start + ice->urb.nr_cs_entries *
      ice->urb.csize <= ice->urb.size;
}


static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      if (GFX_VER == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}

static void
crocus_upload_urb_fence(struct crocus_batch *batch)
{
   uint32_t urb_fence[3];
   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
      urb.VSUnitURBReallocationRequest = 1;
      urb.GSUnitURBReallocationRequest = 1;
      urb.CLIPUnitURBReallocationRequest = 1;
      urb.SFUnitURBReallocationRequest = 1;
      urb.VFEUnitURBReallocationRequest = 1;
      urb.CSUnitURBReallocationRequest = 1;

      urb.VSFence = batch->ice->urb.gs_start;
      urb.GSFence = batch->ice->urb.clip_start;
      urb.CLIPFence = batch->ice->urb.sf_start;
      urb.SFFence = batch->ice->urb.cs_start;
      urb.CSFence = batch->ice->urb.size;
   }

   /* Erratum: the URB_FENCE packet must not cross a 64-byte cacheline
    * boundary, so pad with zero (MI_NOOP) DWords if it would.
    */
   if ((crocus_batch_bytes_used(batch) & 15) > 12) {
      int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
      do {
         *(uint32_t *)batch->command.map_next = 0;
         batch->command.map_next += sizeof(uint32_t);
      } while (--pad);
   }

   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
}

static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range =
         &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   if (ice->state.cso_rast->cso.clip_plane_enable) {
      unsigned nr_planes =
         6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range =
         &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in elk_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * elk_vec4.cpp we're loading up to 32 registers of push constants.  An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize:
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout:
       */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}

static void
upload_shader_consts(struct crocus_context *ice,
                     gl_shader_stage stage,
                     uint32_t *map,
                     unsigned start)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
   uint32_t *cmap;
   bool found = false;
   unsigned offset = start * 16;
   int total = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];

      if (range->length == 0)
         continue;

      unsigned block_index = crocus_bti_to_group_index(
         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
      unsigned len = range->length * 8 * sizeof(float);
      unsigned start = range->start * 8 * sizeof(float);
      struct pipe_transfer *transfer;

      cmap = pipe_buffer_map_range(&ice->ctx,
                                   ice->state.shaders[stage].constbufs[block_index].buffer,
                                   ice->state.shaders[stage].constbufs[block_index].buffer_offset + start,
                                   len, PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED,
                                   &transfer);
      if (cmap)
         memcpy(&map[offset + (total * 8)], cmap, len);
      pipe_buffer_unmap(&ice->ctx, transfer);
      total += range->length;
      found = true;
   }

   if (stage == MESA_SHADER_VERTEX && !found) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      unsigned len = 16;
      memset(&map[offset], 0, len);
   }
}

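/* Plane equations (A, B, C, D) for the six fixed frustum clip planes in
 * clip space (-z, +z, -y, +y, -x, +x, each with D = 1).
 * gen4_upload_curbe() writes these ahead of any user-defined clip planes.
 */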
static const float fixed_plane[6][4] = {
   { 0, 0, -1, 1 },
   { 0, 0, 1, 1 },
   { 0, -1, 0, 1 },
   { 0, 1, 0, 1 },
   {-1, 0, 0, 1 },
   { 1, 0, 0, 1 }
};

static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;
   const unsigned buf_sz = sz * 16 * sizeof(float);

   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset,
                  (struct pipe_resource **)&ice->curbe.curbe_res,
                  (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo,
                                          ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug.  The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP (as it's small),
    * and always emit it when "PS Use Source Depth" is set.  We could be more
    * precise, but the additional complexity is probably not worth it.
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
#endif

#if GFX_VER >= 7

#define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
#define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
#define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000

static void
setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
{
#if GFX_VER == 7
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
                       cfg->n[INTEL_L3P_ALL];
   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_slm = cfg->n[INTEL_L3P_SLM];
#endif

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    */
   crocus_emit_pipe_control_flush(batch, "l3_config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches.  Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes.  This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

#if GFX_VER == 8
   assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
   crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB];
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
      reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
   }
#else
   assert(!cfg->n[INTEL_L3P_ALL]);

   /* When enabled SLM only uses a portion of the L3 on half of the banks,
    * the matching space on the remaining banks has to be allocated to a
    * client (URB for all validated configurations) set to the
    * lower-bandwidth 2-bank address hashing mode.
    */
   const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);

   /* Minimum number of ways that can be allocated to the URB. */
   const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);

   uint32_t l3sqcr1, l3cr2, l3cr3;

   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
      reg.ConvertDC_UC = !has_dc;
      reg.ConvertIS_UC = !has_is;
      reg.ConvertC_UC = !has_c;
      reg.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
      reg.L3SQGeneralPriorityCreditInitialization =
         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
   };

   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
      reg.SLMEnable = has_slm;
      reg.URBLowBandwidth = urb_low_bw;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if !(GFX_VERx10 == 75)
      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
   };

   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
      reg.ISLowBandwidth = 0;
      reg.CAllocation = cfg->n[INTEL_L3P_C];
      reg.CLowBandwidth = 0;
      reg.TAllocation = cfg->n[INTEL_L3P_T];
      reg.TLowBandwidth = 0;
   };

   /* Set up the L3 partitioning. */
   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);

#if GFX_VERx10 == 75
   /* TODO: Fail screen creation if command parser version < 4 */
   uint32_t scratch1, chicken3;
   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
      reg.L3AtomicDisableMask = true;
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_emit_lri(batch, SCRATCH1, scratch1);
   crocus_emit_lri(batch, CHICKEN3, chicken3);
#endif
#endif
}

static void
emit_l3_state(struct crocus_batch *batch, bool compute)
{
   const struct intel_l3_config *const cfg =
      compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;

   setup_l3_config(batch, cfg);
   if (INTEL_DEBUG(DEBUG_L3)) {
      intel_dump_l3_config(cfg, stderr);
   }
}

/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 */
static void
gen7_emit_cs_stall_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "workaround",
                                  PIPE_CONTROL_CS_STALL
                                  | PIPE_CONTROL_WRITE_IMMEDIATE,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif

static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}

/**
 * The following diagram shows how we partition the URB:
 *
 *        16kB or 32kB               Rest of the URB space
 *   __________-__________   _________________-_________________
 *  /                     \ /                                   \
 * +-------------------------------------------------------------+
 * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
 * |       Constants       |               Entries               |
 * +-------------------------------------------------------------+
 *
 * Notably, push constants must be stored at the beginning of the URB
 * space, while entries can be stored anywhere.  Ivybridge and Haswell
 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
 * doubles this (32kB).
 *
 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
 * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
 * sized in increments of 2kB.
 *
 * Currently we split the constant buffer space evenly among whatever stages
 * are active.  This is probably not ideal, but simple.
 *
 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
 * Haswell GT3 has 512kB of URB space.
 *
 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
 */
#if GFX_VER >= 7
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    *       see if that improves performance by offering more space to
    *       the VS/FS when those aren't in use.  Also, try dynamically
    *       enabling/disabling it like i965 does.  This would be more
    *       stalls and may not actually help; we don't know yet.
    */
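   /* For example, with a 16kB limit, size_per_stage is 3kB for VS/HS/DS/GS
    * and the fragment stage gets the remaining 4kB, so the five allocations
    * add up to the full 16kB.
    */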
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ?
            (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    *    A PIPE_CONTROL command with the CS Stall bit set must be programmed
    *    in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
      gen7_emit_cs_stall_flush(batch);
}
#endif

/**
 * Upload the initial GPU state for a render context.
 *
 * This sets some invariant state that needs to be programmed a particular
 * way, but we never actually change.
 */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}

#if GFX_VER >= 7
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, GPGPU);

#if GFX_VER >= 7
   emit_l3_state(batch, true);
#endif
}
#endif

/**
 * Generation-specific context state (ice->state.genx->...).
 *
 * Most state can go in crocus_context directly, but these encode hardware
 * packets which vary by generation.
 */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   bool pma_fix_enabled;
#endif
};

/**
 * The pipe->set_blend_color() driver hook.
 *
 * This corresponds to our COLOR_CALC_STATE.
 */
static void
crocus_set_blend_color(struct pipe_context *ctx,
                       const struct pipe_blend_color *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
   memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
#else
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
}

/**
 * Gallium CSO for blend state (see pipe_blend_state).
 */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** copy of BLEND_STATE */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};

static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
{
   if (alpha_to_one) {
      if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
         return PIPE_BLENDFACTOR_ONE;

      if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
         return PIPE_BLENDFACTOR_ZERO;
   }

   return f;
}

#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
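
/* Pre-Gen6 hardware keeps its blend controls in COLOR_CALC_STATE, while
 * Gen6+ has a dedicated BLEND_STATE table; BLEND_ENTRY_GENXML gives
 * set_blend_entry_bits() a single name for whichever structure this
 * generation uses.
 */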

static bool
can_emit_logic_op(struct crocus_context *ice)
{
   /* All pre-Gen8 hardware restricts logic ops to UNORM render targets. */
   enum pipe_format pformat = PIPE_FORMAT_NONE;
   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
      if (ice->state.framebuffer.cbufs[i]) {
         pformat = ice->state.framebuffer.cbufs[i]->format;
         break;
      }
   }
   return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
}
1517
1518 static bool
set_blend_entry_bits(struct crocus_batch * batch,BLEND_ENTRY_GENXML * entry,struct crocus_blend_state * cso_blend,int idx)1519 set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
1520 struct crocus_blend_state *cso_blend,
1521 int idx)
1522 {
1523 struct crocus_context *ice = batch->ice;
1524 bool independent_alpha_blend = false;
1525 const struct pipe_rt_blend_state *rt =
1526 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
1527 const unsigned blend_enabled = rt->blend_enable;
1528
1529 enum pipe_blendfactor src_rgb =
1530 fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
1531 enum pipe_blendfactor src_alpha =
1532 fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
1533 enum pipe_blendfactor dst_rgb =
1534 fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
1535 enum pipe_blendfactor dst_alpha =
1536 fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);
1537
1538 if (rt->rgb_func != rt->alpha_func ||
1539 src_rgb != src_alpha || dst_rgb != dst_alpha)
1540 independent_alpha_blend = true;
1541 if (cso_blend->cso.logicop_enable) {
1542 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
1543 entry->LogicOpEnable = cso_blend->cso.logicop_enable;
1544 entry->LogicOpFunction = cso_blend->cso.logicop_func;
1545 }
1546 } else if (blend_enabled) {
1547 if (idx == 0) {
1548 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
1549 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
1550 entry->ColorBufferBlendEnable =
1551 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
1552 } else
1553 entry->ColorBufferBlendEnable = 1;
1554
1555 entry->ColorBlendFunction = rt->rgb_func;
1556 entry->AlphaBlendFunction = rt->alpha_func;
1557 entry->SourceBlendFactor = (int) src_rgb;
1558 entry->SourceAlphaBlendFactor = (int) src_alpha;
1559 entry->DestinationBlendFactor = (int) dst_rgb;
1560 entry->DestinationAlphaBlendFactor = (int) dst_alpha;
1561 }
1562 #if GFX_VER <= 5
1563 /*
1564 * Gen4/GM45/ILK can't handle ColorBufferBlendEnable == 0 when a
1565 * dual source blend shader is in use. Set up dummy blending.
1566 */
1567 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
1568 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
1569 if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
1570 entry->ColorBufferBlendEnable = 1;
1571 entry->ColorBlendFunction = PIPE_BLEND_ADD;
1572 entry->AlphaBlendFunction = PIPE_BLEND_ADD;
1573 entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
1574 entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
1575 entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
1576 entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
1577 }
1578 #endif
1579 return independent_alpha_blend;
1580 }
1581
1582 /**
1583 * The pipe->create_blend_state() driver hook.
1584 *
1585 * Translates a pipe_blend_state into crocus_blend_state.
1586 */
1587 static void *
1588 crocus_create_blend_state(struct pipe_context *ctx,
1589 const struct pipe_blend_state *state)
1590 {
1591 struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1592
1593 cso->blend_enables = 0;
1594 cso->color_write_enables = 0;
1595 STATIC_ASSERT(ELK_MAX_DRAW_BUFFERS <= 8);
1596
1597 cso->cso = *state;
1598 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1599
1600 #if GFX_VER == 8
1601 bool indep_alpha_blend = false;
1602 #endif
1603 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
1604 const struct pipe_rt_blend_state *rt =
1605 &state->rt[state->independent_blend_enable ? i : 0];
1606 if (rt->blend_enable)
1607 cso->blend_enables |= 1u << i;
1608 if (rt->colormask)
1609 cso->color_write_enables |= 1u << i;
1610 #if GFX_VER == 8
1611 enum pipe_blendfactor src_rgb =
1612 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1613 enum pipe_blendfactor src_alpha =
1614 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1615 enum pipe_blendfactor dst_rgb =
1616 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1617 enum pipe_blendfactor dst_alpha =
1618 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1619
1620 if (rt->rgb_func != rt->alpha_func ||
1621 src_rgb != src_alpha || dst_rgb != dst_alpha)
1622 indep_alpha_blend = true;
1623 #endif
1624 }
1625
1626 #if GFX_VER == 8
1627 crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1628 /* pb.HasWriteableRT is filled in at draw time.
1629 * pb.AlphaTestEnable is filled in at draw time.
1630 *
1631 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1632 * setting it when dual color blending without an appropriate shader.
1633 */
1634
1635 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1636 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1637
1638 /* The casts prevent warnings about implicit enum type conversions. */
1639 pb.SourceBlendFactor =
1640 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1641 pb.SourceAlphaBlendFactor =
1642 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1643 pb.DestinationBlendFactor =
1644 (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1645 pb.DestinationAlphaBlendFactor =
1646 (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1647 }
1648 #endif
1649 return cso;
1650 }
1651
1652 /**
1653 * The pipe->bind_blend_state() driver hook.
1654 *
1655 * Bind a blending CSO and flag related dirty bits.
1656 */
1657 static void
1658 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1659 {
1660 struct crocus_context *ice = (struct crocus_context *) ctx;
1661 struct crocus_blend_state *cso = state;
1662
1663 ice->state.cso_blend = cso;
1664 ice->state.blend_enables = cso ? cso->blend_enables : 0;
1665
1666 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1667 ice->state.dirty |= CROCUS_DIRTY_WM;
1668 #if GFX_VER >= 6
1669 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1670 #endif
1671 #if GFX_VER >= 7
1672 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1673 #endif
1674 #if GFX_VER == 8
1675 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1676 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1677 #endif
1678 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1679 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1680 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1681 }
1682
1683 /**
1684 * Return true if the FS writes to any color outputs which are not disabled
1685 * via color masking.
1686 */
1687 static bool
1688 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1689 const struct shader_info *fs_info)
1690 {
1691 if (!fs_info)
1692 return false;
1693
1694 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1695
1696 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1697 rt_outputs = (1 << ELK_MAX_DRAW_BUFFERS) - 1;
1698
1699 return cso_blend->color_write_enables & rt_outputs;
1700 }
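
/* Worked example (illustrative only): a fragment shader writing
 * FRAG_RESULT_DATA0 and FRAG_RESULT_DATA2 yields rt_outputs == 0b101; if
 * the blend CSO only enables color writes for RT[1]
 * (color_write_enables == 0b010), the intersection is empty and this
 * returns false.
 */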
1701
1702 /**
1703 * Gallium CSO for depth, stencil, and alpha testing state.
1704 */
1705 struct crocus_depth_stencil_alpha_state {
1706 struct pipe_depth_stencil_alpha_state cso;
1707
1708 bool depth_writes_enabled;
1709 bool stencil_writes_enabled;
1710 };
1711
1712 /**
1713 * The pipe->create_depth_stencil_alpha_state() driver hook.
1714 *
1715 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1716 * testing state since we need pieces of it in a variety of places.
1717 */
1718 static void *
1719 crocus_create_zsa_state(struct pipe_context *ctx,
1720 const struct pipe_depth_stencil_alpha_state *state)
1721 {
1722 struct crocus_depth_stencil_alpha_state *cso =
1723 malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1724
1725 bool two_sided_stencil = state->stencil[1].enabled;
1726 cso->cso = *state;
1727
1728 cso->depth_writes_enabled = state->depth_writemask;
1729 cso->stencil_writes_enabled =
1730 state->stencil[0].writemask != 0 ||
1731 (two_sided_stencil && state->stencil[1].writemask != 0);
1732
1733 /* The state tracker needs to optimize away EQUAL writes for us. */
1734 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1735
1736 return cso;
1737 }
1738
1739 /**
1740 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1741 *
1742 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1743 */
1744 static void
1745 crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
1746 {
1747 struct crocus_context *ice = (struct crocus_context *) ctx;
1748 struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1749 struct crocus_depth_stencil_alpha_state *new_cso = state;
1750
1751 if (new_cso) {
1752 if (cso_changed(cso.alpha_ref_value))
1753 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1754
1755 if (cso_changed(cso.alpha_enabled))
1756 ice->state.dirty |= CROCUS_DIRTY_WM;
1757 #if GFX_VER >= 6
1758 if (cso_changed(cso.alpha_enabled))
1759 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1760
1761 if (cso_changed(cso.alpha_func))
1762 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1763 #endif
1764 #if GFX_VER == 8
1765 if (cso_changed(cso.alpha_enabled))
1766 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1767 #endif
1768
1769 if (cso_changed(depth_writes_enabled))
1770 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1771
1772 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1773 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1774
1775 #if GFX_VER <= 5
1776 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1777 #endif
1778 }
1779
1780 ice->state.cso_zsa = new_cso;
1781 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
1782 #if GFX_VER >= 6
1783 ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
1784 #endif
1785 #if GFX_VER == 8
1786 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1787 #endif
1788 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
1789 }
1790
1791 #if GFX_VER == 8
1792 static bool
1793 want_pma_fix(struct crocus_context *ice)
1794 {
1795 UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
1796 UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
1797 const struct elk_wm_prog_data *wm_prog_data = (void *)
1798 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
1799 const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
1800 const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
1801 const struct crocus_blend_state *cso_blend = ice->state.cso_blend;
1802
1803 /* In very specific combinations of state, we can instruct Gfx8-9 hardware
1804 * to avoid stalling at the pixel mask array. The state equations are
1805 * documented in these places:
1806 *
1807 * - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE
1808 * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
1809 *
1810 * Both equations share some common elements:
1811 *
1812 * no_hiz_op =
1813 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1814 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1815 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1816 * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
1817 *
1818 * killpixels =
1819 * 3DSTATE_WM::ForceKillPix != ForceOff &&
1820 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1821 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1822 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1823 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1824 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1825 *
1826 * (Technically the stencil PMA treats ForceKillPix differently,
1827 * but I think this is a documentation oversight, and we don't
1828 * ever use it in this way, so it doesn't matter).
1829 *
1830 * common_pma_fix =
1831 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
1832 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
1833 * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1834 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1835 * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
1836 * 3DSTATE_PS_EXTRA::PixelShaderValid &&
1837 * no_hiz_op
1838 *
1839 * These are always true:
1840 *
1841 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
1842 * 3DSTATE_PS_EXTRA::PixelShaderValid
1843 *
1844 * Also, we never use the normal drawing path for HiZ ops; these are true:
1845 *
1846 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1847 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1848 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1849 * 3DSTATE_WM_HZ_OP::StencilBufferClear)
1850 *
1851 * This happens sometimes:
1852 *
1853 * 3DSTATE_WM::ForceThreadDispatch != 1
1854 *
1855 * However, we choose to ignore it as it either agrees with the signal
1856 * (dispatch was already enabled, so nothing out of the ordinary), or
1857 * there are no framebuffer attachments (so no depth or HiZ anyway,
1858 * meaning the PMA signal will already be disabled).
1859 */
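
   /* Substituting the always-true and always-false terms above, the checks
    * that remain below amount to (an illustrative restatement, not an extra
    * hardware requirement):
    *
    *    want_pma_fix = depth buffer bound with HiZ &&
    *                   !early_fragment_tests &&
    *                   depth test enabled &&
    *                   (PS computes depth ||
    *                    (killpixels && (depth writes || stencil writes)))
    */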
1860
1861 if (!cso_fb->zsbuf)
1862 return false;
1863
1864 struct crocus_resource *zres, *sres;
1865 crocus_get_depth_stencil_resources(devinfo,
1866 cso_fb->zsbuf->texture, &zres, &sres);
1867
1868 /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1869 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1870 */
1871 if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
1872 return false;
1873
1874 /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
1875 if (wm_prog_data->early_fragment_tests)
1876 return false;
1877
1878 /* 3DSTATE_WM::ForceKillPix != ForceOff &&
1879 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1880 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1881 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1882 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1883 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1884 */
1885 bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
1886 cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;
1887
1888 /* The Gfx8 depth PMA equation becomes:
1889 *
1890 * depth_writes =
1891 * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
1892 * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
1893 *
1894 * stencil_writes =
1895 * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
1896 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
1897 * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
1898 *
1899 * Z_PMA_OPT =
1900 * common_pma_fix &&
1901 * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
1902 * ((killpixels && (depth_writes || stencil_writes)) ||
1903 * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
1904 *
1905 */
1906 if (!cso_zsa->cso.depth_enabled)
1907 return false;
1908
1909 return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
1910 (killpixels && (cso_zsa->depth_writes_enabled ||
1911 (sres && cso_zsa->stencil_writes_enabled)));
1912 }
1913 #endif
1914 void
1915 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1916 struct crocus_batch *batch,
1917 bool enable)
1918 {
1919 #if GFX_VER == 8
1920 struct crocus_genx_state *genx = ice->state.genx;
1921
1922 if (genx->pma_fix_enabled == enable)
1923 return;
1924
1925 genx->pma_fix_enabled = enable;
1926
1927 /* According to the Broadwell PIPE_CONTROL documentation, software should
1928 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1929 * prior to the LRI. If stencil buffer writes are enabled, then a Render Cache Flush is also necessary.
1930 *
1931 * The Gfx9 docs say to use a depth stall rather than a command streamer
1932 * stall. However, the hardware seems to violently disagree. A full
1933 * command streamer stall seems to be needed in both cases.
1934 */
1935 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1936 PIPE_CONTROL_CS_STALL |
1937 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1938 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1939
1940 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1941 reg.NPPMAFixEnable = enable;
1942 reg.NPEarlyZFailsDisable = enable;
1943 reg.NPPMAFixEnableMask = true;
1944 reg.NPEarlyZFailsDisableMask = true;
1945 }
1946
1947 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1948 * Flush bits is often necessary. We do it regardless because it's easier.
1949 * The render cache flush is also necessary if stencil writes are enabled.
1950 *
1951 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1952 * flushes seem to work just as well.
1953 */
1954 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1955 PIPE_CONTROL_DEPTH_STALL |
1956 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1957 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1958 #endif
1959 }
1960
1961 static float
1962 get_line_width(const struct pipe_rasterizer_state *state)
1963 {
1964 float line_width = state->line_width;
1965
1966 /* From the OpenGL 4.4 spec:
1967 *
1968 * "The actual width of non-antialiased lines is determined by rounding
1969 * the supplied width to the nearest integer, then clamping it to the
1970 * implementation-dependent maximum non-antialiased line width."
1971 */
1972 if (!state->multisample && !state->line_smooth)
1973 line_width = roundf(state->line_width);
1974
1975 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1976 /* For 1 pixel line thickness or less, the general anti-aliasing
1977 * algorithm gives up, and a garbage line is generated. Setting a
1978 * Line Width of 0.0 specifies the rasterization of the "thinnest"
1979 * (one-pixel-wide), non-antialiased lines.
1980 *
1981 * Lines rendered with zero Line Width are rasterized using the
1982 * "Grid Intersection Quantization" rules as specified by the
1983 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1984 */
1985 /* Hack around this for the gfx4/5 FPS counters in the HUD. */
1986 line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1987 }
1988 return line_width;
1989 }
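
/* Example values (illustrative only):
 *
 *    non-AA line, width 1.4                -> 1.0f (rounded per the GL spec)
 *    smooth line, width 1.0, GFX_VER >= 6  -> 0.0f (zero-width "cosmetic" lines)
 *    smooth line, width 1.0, GFX_VER 4/5   -> 1.5f (HUD workaround above)
 *    multisampled line, width 1.4          -> 1.4f (passed through unmodified)
 */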
1990
1991 /**
1992 * The pipe->create_rasterizer_state() driver hook.
1993 */
1994 static void *
1995 crocus_create_rasterizer_state(struct pipe_context *ctx,
1996 const struct pipe_rasterizer_state *state)
1997 {
1998 struct crocus_rasterizer_state *cso =
1999 malloc(sizeof(struct crocus_rasterizer_state));
2000
2001 cso->fill_mode_point_or_line =
2002 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2003 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2004 state->fill_back == PIPE_POLYGON_MODE_LINE ||
2005 state->fill_back == PIPE_POLYGON_MODE_POINT;
2006
2007 if (state->clip_plane_enable != 0)
2008 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2009 else
2010 cso->num_clip_plane_consts = 0;
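
   /* e.g. clip_plane_enable == 0x5 (planes 0 and 2) gives
    * util_logbase2(5) + 1 == 3, so constants are uploaded for planes 0..2
    * even though plane 1 is unused.
    */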
2011
2012 cso->cso = *state;
2013
2014 #if GFX_VER >= 6
2015 float line_width = get_line_width(state);
2016
2017 crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2018 sf.StatisticsEnable = true;
2019 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2020 sf.LineEndCapAntialiasingRegionWidth =
2021 state->line_smooth ? _10pixels : _05pixels;
2022 sf.LastPixelEnable = state->line_last_pixel;
2023 #if GFX_VER <= 7
2024 sf.AntialiasingEnable = state->line_smooth;
2025 #endif
2026 #if GFX_VER == 8
2027 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2028 if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2029 sf.CHVLineWidth = line_width;
2030 else
2031 sf.LineWidth = line_width;
2032 #else
2033 sf.LineWidth = line_width;
2034 #endif
2035 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2036 sf.PointWidth = state->point_size;
2037
2038 if (state->flatshade_first) {
2039 sf.TriangleFanProvokingVertexSelect = 1;
2040 } else {
2041 sf.TriangleStripListProvokingVertexSelect = 2;
2042 sf.TriangleFanProvokingVertexSelect = 2;
2043 sf.LineStripListProvokingVertexSelect = 1;
2044 }
2045
2046 #if GFX_VER == 6
2047 sf.AttributeSwizzleEnable = true;
2048 if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2049 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2050 else
2051 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2052 #endif
2053
2054 #if GFX_VER <= 7
2055 sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2056
2057 #if GFX_VER >= 6
2058 sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2059 sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2060 sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2061 sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2062 sf.GlobalDepthOffsetScale = state->offset_scale;
2063 sf.GlobalDepthOffsetClamp = state->offset_clamp;
2064
2065 sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2066 sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2067 #endif
2068
2069 sf.CullMode = translate_cull_mode(state->cull_face);
2070 sf.ScissorRectangleEnable = true;
2071
2072 #if GFX_VERx10 == 75
2073 sf.LineStippleEnable = state->line_stipple_enable;
2074 #endif
2075 #endif
2076 }
2077 #endif
2078
2079 #if GFX_VER == 8
2080 crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2081 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2082 rr.CullMode = translate_cull_mode(state->cull_face);
2083 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2084 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2085 rr.DXMultisampleRasterizationEnable = state->multisample;
2086 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2087 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2088 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2089 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2090 rr.GlobalDepthOffsetScale = state->offset_scale;
2091 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2092 rr.SmoothPointEnable = state->point_smooth;
2093 rr.AntialiasingEnable = state->line_smooth;
2094 rr.ScissorRectangleEnable = state->scissor;
2095 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2096 }
2097 #endif
2098
2099 #if GFX_VER >= 6
2100 crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2101 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2102 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2103 */
2104 #if GFX_VER >= 7
2105 cl.EarlyCullEnable = true;
2106 #endif
2107
2108 #if GFX_VER == 7
2109 cl.FrontWinding = state->front_ccw ? 1 : 0;
2110 cl.CullMode = translate_cull_mode(state->cull_face);
2111 #endif
2112 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2113 #if GFX_VER < 8
2114 cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2115 #endif
2116 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2117 cl.GuardbandClipTestEnable = true;
2118 cl.ClipEnable = true;
2119 cl.MinimumPointWidth = 0.125;
2120 cl.MaximumPointWidth = 255.875;
2121
2122 #if GFX_VER == 8
2123 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2124 #endif
2125
2126 if (state->flatshade_first) {
2127 cl.TriangleFanProvokingVertexSelect = 1;
2128 } else {
2129 cl.TriangleStripListProvokingVertexSelect = 2;
2130 cl.TriangleFanProvokingVertexSelect = 2;
2131 cl.LineStripListProvokingVertexSelect = 1;
2132 }
2133 }
2134 #endif
2135
2136 /* Remap from 0..255 back to 1..256 */
2137 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
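
   /* e.g. a GL stipple factor of 4 is stored by Gallium as
    * state->line_stipple_factor == 3, so the remapped value here is 4;
    * each pattern bit then repeats 4 times and
    * LineStippleInverseRepeatCount below is 0.25f.
    */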
2138
2139 crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2140 if (state->line_stipple_enable) {
2141 line.LineStipplePattern = state->line_stipple_pattern;
2142 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2143 line.LineStippleRepeatCount = line_stipple_factor;
2144 }
2145 }
2146
2147 return cso;
2148 }
2149
2150 /**
2151 * The pipe->bind_rasterizer_state() driver hook.
2152 *
2153 * Bind a rasterizer CSO and flag related dirty bits.
2154 */
2155 static void
2156 crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2157 {
2158 struct crocus_context *ice = (struct crocus_context *) ctx;
2159 struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
2160 struct crocus_rasterizer_state *new_cso = state;
2161
2162 if (new_cso) {
2163 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2164 if (cso_changed_memcmp(line_stipple))
2165 ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
2166 #if GFX_VER >= 6
2167 if (cso_changed(cso.half_pixel_center))
2168 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
2169 if (cso_changed(cso.scissor))
2170 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
2171 if (cso_changed(cso.multisample))
2172 ice->state.dirty |= CROCUS_DIRTY_WM;
2173 #else
2174 if (cso_changed(cso.scissor))
2175 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
2176 #endif
2177
2178 if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
2179 ice->state.dirty |= CROCUS_DIRTY_WM;
2180
2181 #if GFX_VER >= 6
2182 if (cso_changed(cso.rasterizer_discard))
2183 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
2184
2185 if (cso_changed(cso.flatshade_first))
2186 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
2187 #endif
2188
2189 if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
2190 cso_changed(cso.clip_halfz))
2191 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
2192
2193 #if GFX_VER >= 7
2194 if (cso_changed(cso.sprite_coord_enable) ||
2195 cso_changed(cso.sprite_coord_mode) ||
2196 cso_changed(cso.light_twoside))
2197 ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
2198 #endif
2199 #if GFX_VER <= 5
2200 if (cso_changed(cso.clip_plane_enable))
2201 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
2202 #endif
2203 }
2204
2205 ice->state.cso_rast = new_cso;
2206 ice->state.dirty |= CROCUS_DIRTY_RASTER;
2207 ice->state.dirty |= CROCUS_DIRTY_CLIP;
2208 #if GFX_VER <= 5
2209 ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
2210 ice->state.dirty |= CROCUS_DIRTY_WM;
2211 #endif
2212 #if GFX_VER <= 6
2213 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
2214 #endif
2215 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
2216 }
2217
2218 /**
2219 * Return true if the given wrap mode requires the border color to exist.
2220 *
2221 * (We can skip uploading it if the sampler isn't going to use it.)
2222 */
2223 static bool
2224 wrap_mode_needs_border_color(unsigned wrap_mode)
2225 {
2226 #if GFX_VER == 8
2227 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2228 #else
2229 return wrap_mode == TCM_CLAMP_BORDER;
2230 #endif
2231 }
2232
2233 /**
2234 * Gallium CSO for sampler state.
2235 */
2236 struct crocus_sampler_state {
2237 struct pipe_sampler_state pstate;
2238 union pipe_color_union border_color;
2239 bool needs_border_color;
2240 unsigned wrap_s;
2241 unsigned wrap_t;
2242 unsigned wrap_r;
2243 unsigned mag_img_filter;
2244 float min_lod;
2245 };
2246
2247 /**
2248 * The pipe->create_sampler_state() driver hook.
2249 *
2250 * We fill out SAMPLER_STATE (except for the border color pointer), and
2251 * store that on the CPU. It doesn't make sense to upload it to a GPU
2252 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2253 * all bound sampler states to be in contiguous memory.
2254 */
2255 static void *
2256 crocus_create_sampler_state(struct pipe_context *ctx,
2257 const struct pipe_sampler_state *state)
2258 {
2259 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2260
2261 if (!cso)
2262 return NULL;
2263
2264 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2265 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2266
2267 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2268 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2269 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2270 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2271 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2272
2273 cso->pstate = *state;
2274
2275 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2276
2277 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2278 wrap_mode_needs_border_color(cso->wrap_t) ||
2279 wrap_mode_needs_border_color(cso->wrap_r);
2280
2281 cso->min_lod = state->min_lod;
2282 cso->mag_img_filter = state->mag_img_filter;
2283
2284 // XXX: explain this code ported from ilo...I don't get it at all...
2285 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2286 state->min_lod > 0.0f) {
2287 cso->min_lod = 0.0f;
2288 cso->mag_img_filter = state->min_img_filter;
2289 }
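
   /* A plausible reading (an assumption, not verified against ilo): with
    * MIPFILTER_NONE and min_lod > 0, the clamped LOD is always positive,
    * so GL requires the minification filter for every lookup. Copying the
    * min filter into the mag filter and zeroing MinLOD makes both filter
    * choices identical, so the result is correct regardless of which one
    * the hardware selects.
    */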
2290
2291 return cso;
2292 }
2293
2294 /**
2295 * The pipe->bind_sampler_states() driver hook.
2296 */
2297 static void
2298 crocus_bind_sampler_states(struct pipe_context *ctx,
2299 enum pipe_shader_type p_stage,
2300 unsigned start, unsigned count,
2301 void **states)
2302 {
2303 struct crocus_context *ice = (struct crocus_context *) ctx;
2304 gl_shader_stage stage = stage_from_pipe(p_stage);
2305 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2306
2307 assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2308
2309 bool dirty = false;
2310
2311 for (int i = 0; i < count; i++) {
2312 if (shs->samplers[start + i] != states[i]) {
2313 shs->samplers[start + i] = states[i];
2314 dirty = true;
2315 }
2316 }
2317
2318 if (dirty) {
2319 #if GFX_VER <= 5
2320 if (p_stage == PIPE_SHADER_FRAGMENT)
2321 ice->state.dirty |= CROCUS_DIRTY_WM;
2322 else if (p_stage == PIPE_SHADER_VERTEX)
2323 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2324 #endif
2325 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2326 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2327 }
2328 }
2329
2330 enum samp_workaround {
2331 SAMP_NORMAL,
2332 SAMP_CUBE_CLAMP,
2333 SAMP_CUBE_CUBE,
2334 SAMP_T_WRAP,
2335 };
2336
2337 static void
2338 crocus_upload_sampler_state(struct crocus_batch *batch,
2339 struct crocus_sampler_state *cso,
2340 uint32_t border_color_offset,
2341 enum samp_workaround samp_workaround,
2342 uint32_t first_level,
2343 void *map)
2344 {
2345 struct pipe_sampler_state *state = &cso->pstate;
2346 uint32_t wrap_s, wrap_t, wrap_r;
2347
2348 wrap_s = cso->wrap_s;
2349 wrap_t = cso->wrap_t;
2350 wrap_r = cso->wrap_r;
2351
2352 switch (samp_workaround) {
2353 case SAMP_CUBE_CLAMP:
2354 wrap_s = TCM_CLAMP;
2355 wrap_t = TCM_CLAMP;
2356 wrap_r = TCM_CLAMP;
2357 break;
2358 case SAMP_CUBE_CUBE:
2359 wrap_s = TCM_CUBE;
2360 wrap_t = TCM_CUBE;
2361 wrap_r = TCM_CUBE;
2362 break;
2363 case SAMP_T_WRAP:
2364 wrap_t = TCM_WRAP;
2365 break;
2366 default:
2367 break;
2368 }
2369
2370 _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
2371 samp.TCXAddressControlMode = wrap_s;
2372 samp.TCYAddressControlMode = wrap_t;
2373 samp.TCZAddressControlMode = wrap_r;
2374
2375 #if GFX_VER >= 6
2376 samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2377 #endif
2378 samp.MinModeFilter = state->min_img_filter;
2379 samp.MagModeFilter = cso->mag_img_filter;
2380 samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2381 samp.MaximumAnisotropy = RATIO21;
2382
2383 if (state->max_anisotropy >= 2) {
2384 if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2385 samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2386 #if GFX_VER >= 7
2387 samp.AnisotropicAlgorithm = EWAApproximation;
2388 #endif
2389 }
2390
2391 if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
2392 samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2393
2394 samp.MaximumAnisotropy =
2395 MIN2((state->max_anisotropy - 2) / 2, RATIO161);
2396 }
2397
2398 /* Set address rounding bits if not using nearest filtering. */
2399 if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2400 samp.UAddressMinFilterRoundingEnable = true;
2401 samp.VAddressMinFilterRoundingEnable = true;
2402 samp.RAddressMinFilterRoundingEnable = true;
2403 }
2404
2405 if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2406 samp.UAddressMagFilterRoundingEnable = true;
2407 samp.VAddressMagFilterRoundingEnable = true;
2408 samp.RAddressMagFilterRoundingEnable = true;
2409 }
2410
2411 if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2412 samp.ShadowFunction = translate_shadow_func(state->compare_func);
2413
2414 const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2415
2416 #if GFX_VER == 8
2417 samp.LODPreClampMode = CLAMP_MODE_OGL;
2418 #else
2419 samp.LODPreClampEnable = true;
2420 #endif
2421 samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
2422 samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2423 samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2424
2425 #if GFX_VER == 6
2426 samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
2427 samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
2428 #endif
2429
2430 #if GFX_VER < 6
2431 samp.BorderColorPointer =
2432 ro_bo(batch->state.bo, border_color_offset);
2433 #else
2434 samp.BorderColorPointer = border_color_offset;
2435 #endif
2436 }
2437 }
2438
2439 static void
2440 crocus_upload_border_color(struct crocus_batch *batch,
2441 struct crocus_sampler_state *cso,
2442 struct crocus_sampler_view *tex,
2443 uint32_t *bc_offset)
2444 {
2445 /* We may need to swizzle the border color for format faking.
2446 * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2447 * This means we need to move the border color's A channel into
2448 * the R or G channels so that those read swizzles will move it
2449 * back into A.
2450 */
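   /* e.g. PIPE_FORMAT_A8_UNORM is faked as R8 with a 000R read swizzle,
    * so the border alpha is relocated below to survive that round trip
    * (an illustrative instance of the note above).
    */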
2451 enum pipe_format internal_format = PIPE_FORMAT_NONE;
2452 union pipe_color_union *color = &cso->border_color;
2453 union pipe_color_union tmp;
2454 if (tex) {
2455 internal_format = tex->res->internal_format;
2456
2457 if (util_format_is_alpha(internal_format)) {
2458 unsigned char swz[4] = {
2459 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
2460 PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
2461 };
2462 util_format_apply_color_swizzle(&tmp, color, swz, true);
2463 color = &tmp;
2464 } else if (util_format_is_luminance_alpha(internal_format) &&
2465 internal_format != PIPE_FORMAT_L8A8_SRGB) {
2466 unsigned char swz[4] = {
2467 PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
2468 PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
2469 };
2470 util_format_apply_color_swizzle(&tmp, color, swz, true);
2471 color = &tmp;
2472 }
2473 }
2474 bool is_integer_format = util_format_is_pure_integer(internal_format);
2475 unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
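   /* Alignment requirements differ: 64B on gen8, 512B for integer border
    * colors on Haswell, and 32B everywhere else (see the expression below).
    */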
2476 const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
2477 uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);
2478
2479 struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
2480
2481 #define ASSIGN(dst, src) \
2482 do { \
2483 dst = src; \
2484 } while (0)
2485
2486 #define ASSIGNu16(dst, src) \
2487 do { \
2488 dst = (uint16_t)src; \
2489 } while (0)
2490
2491 #define ASSIGNu8(dst, src) \
2492 do { \
2493 dst = (uint8_t)src; \
2494 } while (0)
2495
2496 #define BORDER_COLOR_ATTR(macro, _color_type, src) \
2497 macro(state.BorderColor ## _color_type ## Red, src[0]); \
2498 macro(state.BorderColor ## _color_type ## Green, src[1]); \
2499 macro(state.BorderColor ## _color_type ## Blue, src[2]); \
2500 macro(state.BorderColor ## _color_type ## Alpha, src[3]);
2501
2502 #if GFX_VER >= 8
2503 /* On Broadwell, the border color is represented as four 32-bit floats,
2504 * integers, or unsigned values, interpreted according to the surface
2505 * format. This matches the sampler->BorderColor union exactly; just
2506 * memcpy the values.
2507 */
2508 BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
2509 #elif GFX_VERx10 == 75
2510 if (is_integer_format) {
2511 const struct util_format_description *format_desc =
2512 util_format_description(internal_format);
2513
2514 /* From the Haswell PRM, "Command Reference: Structures", Page 36:
2515 * "If any color channel is missing from the surface format,
2516 * corresponding border color should be programmed as zero and if
2517 * alpha channel is missing, corresponding Alpha border color should
2518 * be programmed as 1."
2519 */
2520 unsigned c[4] = { 0, 0, 0, 1 };
2521 for (int i = 0; i < 4; i++) {
2522 if (format_desc->channel[i].size)
2523 c[i] = color->ui[i];
2524 }
2525
2526 switch (format_desc->channel[0].size) {
2527 case 8:
2528 /* Copy RGBA in order. */
2529 BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
2530 break;
2531 case 10:
2532 /* R10G10B10A2_UINT is treated like a 16-bit format. */
2533 case 16:
2534 BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
2535 break;
2536 case 32:
2537 if (format_desc->channel[1].size && !format_desc->channel[2].size) {
2538 /* Careful inspection of the tables reveals that for RG32 formats,
2539 * the green channel needs to go where blue normally belongs.
2540 */
2541 state.BorderColor32bitRed = c[0];
2542 state.BorderColor32bitBlue = c[1];
2543 state.BorderColor32bitAlpha = 1;
2544 } else {
2545 /* Copy RGBA in order. */
2546 BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
2547 }
2548 break;
2549 default:
2550 assert(!"Invalid number of bits per channel in integer format.");
2551 break;
2552 }
2553 } else {
2554 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2555 }
2556 #elif GFX_VER == 5 || GFX_VER == 6
2557 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
2558 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
2559 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);
2560
2561 #define MESA_FLOAT_TO_HALF(dst, src) \
2562 dst = _mesa_float_to_half(src);
2563
2564 BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);
2565
2566 #undef MESA_FLOAT_TO_HALF
2567
2568 state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
2569 state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
2570 state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
2571 state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
2572
2573 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2574
2575 #elif GFX_VER == 4
2576 BORDER_COLOR_ATTR(ASSIGN, , color->f);
2577 #else
2578 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2579 #endif
2580
2581 #undef ASSIGN
2582 #undef BORDER_COLOR_ATTR
2583
2584 GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
2585 }
2586
2587 /**
2588 * Upload the sampler states into a contiguous area of GPU memory, for
2589 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2590 *
2591 * Also fill out the border color state pointers.
2592 */
2593 static void
2594 crocus_upload_sampler_states(struct crocus_context *ice,
2595 struct crocus_batch *batch, gl_shader_stage stage)
2596 {
2597 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2598 const struct shader_info *info = crocus_get_shader_info(ice, stage);
2599
2600 /* We assume the state tracker will call pipe->bind_sampler_states()
2601 * if the program's number of textures changes.
2602 */
2603 unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2604
2605 if (!count)
2606 return;
2607
2608 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2609 * in the dynamic state memory zone, so we can point to it via the
2610 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2611 */
2612 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2613 uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2614
2615 if (unlikely(!map))
2616 return;
2617
2618 for (int i = 0; i < count; i++) {
2619 struct crocus_sampler_state *state = shs->samplers[i];
2620 struct crocus_sampler_view *tex = shs->textures[i];
2621
2622 if (!state || !tex) {
2623 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2624 } else {
2625 unsigned border_color_offset = 0;
2626 if (state->needs_border_color) {
2627 crocus_upload_border_color(batch, state, tex, &border_color_offset);
2628 }
2629
2630 enum samp_workaround wa = SAMP_NORMAL;
2631 /* There's a bug in 1D texture sampling - it actually pays
2632 * attention to the wrap_t value, though it should not.
2633 * Override the wrap_t value here to GL_REPEAT to keep
2634 * any nonexistent border pixels from floating in.
2635 */
2636 if (tex->base.target == PIPE_TEXTURE_1D)
2637 wa = SAMP_T_WRAP;
2638 else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2639 tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2640 /* Cube maps must use the same wrap mode for all three coordinate
2641 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
2642 *
2643 * Ivybridge and Baytrail seem to have problems with CUBE mode and
2644 * integer formats. Fall back to CLAMP for now.
2645 */
2646 if (state->pstate.seamless_cube_map &&
2647 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2648 wa = SAMP_CUBE_CUBE;
2649 else
2650 wa = SAMP_CUBE_CLAMP;
2651 }
2652
2653 uint32_t first_level = 0;
2654 if (tex->base.target != PIPE_BUFFER)
2655 first_level = tex->base.u.tex.first_level;
2656
2657 crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2658 }
2659
2660 map += GENX(SAMPLER_STATE_length);
2661 }
2662 }
2663
2664 /**
2665 * The pipe->create_sampler_view() driver hook.
2666 */
2667 static struct pipe_sampler_view *
2668 crocus_create_sampler_view(struct pipe_context *ctx,
2669 struct pipe_resource *tex,
2670 const struct pipe_sampler_view *tmpl)
2671 {
2672 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2673 const struct intel_device_info *devinfo = &screen->devinfo;
2674 struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
2675
2676 if (!isv)
2677 return NULL;
2678
2679 /* initialize base object */
2680 isv->base = *tmpl;
2681 isv->base.context = ctx;
2682 isv->base.texture = NULL;
2683 pipe_reference_init(&isv->base.reference, 1);
2684 pipe_resource_reference(&isv->base.texture, tex);
2685
2686 if (util_format_is_depth_or_stencil(tmpl->format)) {
2687 struct crocus_resource *zres, *sres;
2688 const struct util_format_description *desc =
2689 util_format_description(tmpl->format);
2690
2691 crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
2692
2693 tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2694
2695 if (tex->format == PIPE_FORMAT_S8_UINT)
2696 if (GFX_VER == 7 && sres->shadow)
2697 tex = &sres->shadow->base.b;
2698 }
2699
2700 isv->res = (struct crocus_resource *) tex;
2701
2702 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2703
2704 if (isv->base.target == PIPE_TEXTURE_CUBE ||
2705 isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2706 usage |= ISL_SURF_USAGE_CUBE_BIT;
2707
2708 const struct crocus_format_info fmt =
2709 crocus_format_for_usage(devinfo, tmpl->format, usage);
2710
2711 enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
2712 crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
2713
2714 /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
2715 if (GFX_VER < 6 &&
2716 (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
2717 tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
2718 isv->swizzle[0] = tmpl->swizzle_g;
2719 isv->swizzle[1] = tmpl->swizzle_g;
2720 isv->swizzle[2] = tmpl->swizzle_g;
2721 isv->swizzle[3] = tmpl->swizzle_g;
2722 }
2723
2724 isv->clear_color = isv->res->aux.clear_color;
2725
2726 isv->view = (struct isl_view) {
2727 .format = fmt.fmt,
2728 #if GFX_VERx10 >= 75
2729 .swizzle = (struct isl_swizzle) {
2730 .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
2731 .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
2732 .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
2733 .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
2734 },
2735 #else
2736 /* swizzling handled in shader code */
2737 .swizzle = ISL_SWIZZLE_IDENTITY,
2738 #endif
2739 .usage = usage,
2740 };
2741
2742 /* Fill out SURFACE_STATE for this view. */
2743 if (tmpl->target != PIPE_BUFFER) {
2744 isv->view.base_level = tmpl->u.tex.first_level;
2745 isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2746
2747 /* Hardware older than Skylake ignores this value. */
2748 assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
2749
2750 // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
2751 isv->view.base_array_layer = tmpl->u.tex.first_layer;
2752 isv->view.array_len =
2753 tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2754 }
2755 #if GFX_VER >= 6
2756 /* Create a second view struct for texture gather, just in case. */
2757 isv->gather_view = isv->view;
2758
2759 #if GFX_VER == 7
2760 if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
2761 fmt.fmt == ISL_FORMAT_R32G32_SINT ||
2762 fmt.fmt == ISL_FORMAT_R32G32_UINT) {
2763 isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
2764 #if GFX_VERx10 >= 75
2765 isv->gather_view.swizzle = (struct isl_swizzle) {
2766 .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
2767 .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
2768 .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
2769 .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
2770 };
2771 #endif
2772 }
2773 #endif
2774 #if GFX_VER == 6
2775 /* Sandybridge's gather4 message is broken for integer formats.
2776 * To work around this, we pretend the surface is UNORM for
2777 * 8 or 16-bit formats, and emit shader instructions to recover
2778 * the real INT/UINT value. For 32-bit formats, we pretend
2779 * the surface is FLOAT, and simply reinterpret the resulting
2780 * bits.
2781 */
2782 switch (fmt.fmt) {
2783 case ISL_FORMAT_R8_SINT:
2784 case ISL_FORMAT_R8_UINT:
2785 isv->gather_view.format = ISL_FORMAT_R8_UNORM;
2786 break;
2787
2788 case ISL_FORMAT_R16_SINT:
2789 case ISL_FORMAT_R16_UINT:
2790 isv->gather_view.format = ISL_FORMAT_R16_UNORM;
2791 break;
2792
2793 case ISL_FORMAT_R32_SINT:
2794 case ISL_FORMAT_R32_UINT:
2795 isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
2796 break;
2797
2798 default:
2799 break;
2800 }
2801 #endif
2802 #endif
2803
2804 return &isv->base;
2805 }
2806
2807 static void
2808 crocus_sampler_view_destroy(struct pipe_context *ctx,
2809 struct pipe_sampler_view *state)
2810 {
2811 struct crocus_sampler_view *isv = (void *) state;
2812 pipe_resource_reference(&state->texture, NULL);
2813 free(isv);
2814 }
2815
2816 /**
2817 * The pipe->create_surface() driver hook.
2818 *
2819 * In Gallium nomenclature, "surfaces" are a view of a resource that
2820 * can be bound as a render target or depth/stencil buffer.
2821 */
2822 static struct pipe_surface *
2823 crocus_create_surface(struct pipe_context *ctx,
2824 struct pipe_resource *tex,
2825 const struct pipe_surface *tmpl)
2826 {
2827 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2828 const struct intel_device_info *devinfo = &screen->devinfo;
2829
2830 isl_surf_usage_flags_t usage = 0;
2831 if (tmpl->writable)
2832 usage = ISL_SURF_USAGE_STORAGE_BIT;
2833 else if (util_format_is_depth_or_stencil(tmpl->format))
2834 usage = ISL_SURF_USAGE_DEPTH_BIT;
2835 else
2836 usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2837
2838 const struct crocus_format_info fmt =
2839 crocus_format_for_usage(devinfo, tmpl->format, usage);
2840
2841 if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2842 !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2843 /* Framebuffer validation will reject this invalid case, but it
2844 * hasn't had the opportunity yet. In the meantime, we need to
2845 * avoid hitting ISL asserts about unsupported formats below.
2846 */
2847 return NULL;
2848 }
2849
2850 struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2851 struct pipe_surface *psurf = &surf->base;
2852 struct crocus_resource *res = (struct crocus_resource *) tex;
2853
2854 if (!surf)
2855 return NULL;
2856
2857 pipe_reference_init(&psurf->reference, 1);
2858 pipe_resource_reference(&psurf->texture, tex);
2859 psurf->context = ctx;
2860 psurf->format = tmpl->format;
2861 psurf->width = tex->width0;
2862 psurf->height = tex->height0;
2863 psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2864 psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2865 psurf->u.tex.level = tmpl->u.tex.level;
2866
2867 uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2868
2869 struct isl_view *view = &surf->view;
2870 *view = (struct isl_view) {
2871 .format = fmt.fmt,
2872 .base_level = tmpl->u.tex.level,
2873 .levels = 1,
2874 .base_array_layer = tmpl->u.tex.first_layer,
2875 .array_len = array_len,
2876 .swizzle = ISL_SWIZZLE_IDENTITY,
2877 .usage = usage,
2878 };
2879
2880 #if GFX_VER >= 6
2881 struct isl_view *read_view = &surf->read_view;
2882 *read_view = (struct isl_view) {
2883 .format = fmt.fmt,
2884 .base_level = tmpl->u.tex.level,
2885 .levels = 1,
2886 .base_array_layer = tmpl->u.tex.first_layer,
2887 .array_len = array_len,
2888 .swizzle = ISL_SWIZZLE_IDENTITY,
2889 .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2890 };
2891 #endif
2892
2893 surf->clear_color = res->aux.clear_color;
2894
2895 /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2896 if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2897 ISL_SURF_USAGE_STENCIL_BIT))
2898 return psurf;
2899
2900 if (!isl_format_is_compressed(res->surf.format)) {
2901 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2902 uint64_t temp_offset;
2903 uint32_t temp_x, temp_y;
2904
2905 isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2906 res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2907 res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2908 &temp_offset, &temp_x, &temp_y);
2909 if (!devinfo->has_surface_tile_offset &&
2910 (temp_x || temp_y)) {
2911 /* Original gfx4 hardware couldn't draw to a non-tile-aligned
2912 * destination.
2913 */
2914 /* move to temp */
2915 struct pipe_resource wa_templ = (struct pipe_resource) {
2916 .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2917 .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2918 .depth0 = 1,
2919 .array_size = 1,
2920 .format = res->base.b.format,
2921 .target = PIPE_TEXTURE_2D,
2922 .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2923 };
2924 surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2925 view->base_level = 0;
2926 view->base_array_layer = 0;
2927 view->array_len = 1;
2928 struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2929 memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2930 }
2931 return psurf;
2932 }
2933
2934 /* The resource has a compressed format, which is not renderable, but we
2935 * have a renderable view format. We must be attempting to upload blocks
2936 * of compressed data via an uncompressed view.
2937 *
2938 * In this case, we can assume there are no auxiliary buffers, a single
2939 * miplevel, and that the resource is single-sampled. Gallium may try
2940 * to create an uncompressed view with multiple layers, however.
2941 */
2942 assert(!isl_format_is_compressed(fmt.fmt));
2943 assert(res->surf.samples == 1);
2944 assert(view->levels == 1);
2945
2946 /* TODO: compressed pbo uploads aren't working here */
2947 pipe_surface_reference(&psurf, NULL);
2948 return NULL;
2949
2950 uint64_t offset_B = 0;
2951 uint32_t tile_x_sa = 0, tile_y_sa = 0;
2952
2953 if (view->base_level > 0) {
2954 /* We can't rely on the hardware's miplevel selection with such
2955 * a substantial lie about the format, so we select a single image
2956 * using the Tile X/Y Offset fields. In this case, we can't handle
2957 * multiple array slices.
2958 *
2959 * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2960 * hard-coded to align to exactly the block size of the compressed
2961 * texture. This means that, when reinterpreted as a non-compressed
2962 * texture, the tile offsets may be anything and we can't rely on
2963 * X/Y Offset.
2964 *
2965 * Return NULL to force the state tracker to take fallback paths.
2966 */
2967 // TODO: check if the gen7 check is right, originally gen8
2968 if (view->array_len > 1 || GFX_VER == 7) {
2969 pipe_surface_reference(&psurf, NULL);
2970 return NULL;
2971 }
2972
2973 const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2974 isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2975 view->base_level,
2976 is_3d ? 0 : view->base_array_layer,
2977 is_3d ? view->base_array_layer : 0,
2978 &surf->surf,
2979 &offset_B, &tile_x_sa, &tile_y_sa);
2980
2981 /* We use address and tile offsets to access a single level/layer
2982 * as a subimage, so reset level/layer so it doesn't offset again.
2983 */
2984 view->base_array_layer = 0;
2985 view->base_level = 0;
2986 } else {
2987 /* Level 0 doesn't require tile offsets, and the hardware can find
2988 * array slices using QPitch even with the format override, so we
2989 * can allow layers in this case. Copy the original ISL surface.
2990 */
2991 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2992 }
2993
2994 /* Scale down the image dimensions by the block size. */
2995 const struct isl_format_layout *fmtl =
2996 isl_format_get_layout(res->surf.format);
2997 surf->surf.format = fmt.fmt;
2998 surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
2999 surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3000 tile_x_sa /= fmtl->bw;
3001 tile_y_sa /= fmtl->bh;
3002
3003 psurf->width = surf->surf.logical_level0_px.width;
3004 psurf->height = surf->surf.logical_level0_px.height;
3005
3006 return psurf;
3007 }
3008
3009 #if GFX_VER >= 7
3010 static void
3011 fill_default_image_param(struct isl_image_param *param)
3012 {
3013 memset(param, 0, sizeof(*param));
3014 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3015 * See emit_address_calculation() in elk_fs_surface_builder.cpp for a more
3016 * detailed explanation of these parameters.
3017 */
3018 param->swizzling[0] = 0xff;
3019 param->swizzling[1] = 0xff;
3020 }
3021
3022 static void
3023 fill_buffer_image_param(struct isl_image_param *param,
3024 enum pipe_format pfmt,
3025 unsigned size)
3026 {
3027 const unsigned cpp = util_format_get_blocksize(pfmt);
3028
3029 fill_default_image_param(param);
3030 param->size[0] = size / cpp;
3031 param->stride[0] = cpp;
3032 }
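
/* Worked example (illustrative only): a 256-byte buffer image of
 * PIPE_FORMAT_R32G32B32A32_FLOAT has cpp == 16, so size[0] == 16 texels
 * and stride[0] == 16 bytes.
 */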
3033
3034 #endif
3035
3036 /**
3037 * The pipe->set_shader_images() driver hook.
3038 */
3039 static void
3040 crocus_set_shader_images(struct pipe_context *ctx,
3041 enum pipe_shader_type p_stage,
3042 unsigned start_slot, unsigned count,
3043 unsigned unbind_num_trailing_slots,
3044 const struct pipe_image_view *p_images)
3045 {
3046 #if GFX_VER >= 7
3047 struct crocus_context *ice = (struct crocus_context *) ctx;
3048 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3049 const struct intel_device_info *devinfo = &screen->devinfo;
3050 gl_shader_stage stage = stage_from_pipe(p_stage);
3051 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3052 struct crocus_genx_state *genx = ice->state.genx;
3053 struct isl_image_param *image_params = genx->shaders[stage].image_param;
3054
3055 shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
3056
3057 for (unsigned i = 0; i < count; i++) {
3058 struct crocus_image_view *iv = &shs->image[start_slot + i];
3059
3060 if (p_images && p_images[i].resource) {
3061 const struct pipe_image_view *img = &p_images[i];
3062 struct crocus_resource *res = (void *) img->resource;
3063
3064 util_copy_image_view(&iv->base, img);
3065
3066 shs->bound_image_views |= 1 << (start_slot + i);
3067
3068 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3069 res->bind_stages |= 1 << stage;
3070
3071 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3072 struct crocus_format_info fmt =
3073 crocus_format_for_usage(devinfo, img->format, usage);
3074
3075 struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
3076 if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
3077 /* On Gen8, try to use typed surface reads (which support a
3078 * limited number of formats), and if that's not possible, fall
3079 * back to untyped reads.
3080 */
3081 if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
3082 fmt.fmt = ISL_FORMAT_RAW;
3083 else
3084 fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
3085 }
3086
3087 if (res->base.b.target != PIPE_BUFFER) {
3088 struct isl_view view = {
3089 .format = fmt.fmt,
3090 .base_level = img->u.tex.level,
3091 .levels = 1,
3092 .base_array_layer = img->u.tex.first_layer,
3093 .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3094 .swizzle = swiz,
3095 .usage = usage,
3096 };
3097
3098 iv->view = view;
3099
3100 isl_surf_fill_image_param(&screen->isl_dev,
3101 &image_params[start_slot + i],
3102 &res->surf, &view);
3103 } else {
3104 struct isl_view view = {
3105 .format = fmt.fmt,
3106 .swizzle = swiz,
3107 .usage = usage,
3108 };
3109 iv->view = view;
3110
3111 util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3112 img->u.buf.offset + img->u.buf.size);
3113 fill_buffer_image_param(&image_params[start_slot + i],
3114 img->format, img->u.buf.size);
3115 }
3116 } else {
3117 pipe_resource_reference(&iv->base.resource, NULL);
3118 fill_default_image_param(&image_params[start_slot + i]);
3119 }
3120 }
3121
3122 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3123 ice->state.dirty |=
3124 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3125 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3126
3127 /* Broadwell also needs isl_image_params re-uploaded */
3128 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3129 shs->sysvals_need_upload = true;
3130 #endif
3131 }
3132
3133
3134 /**
3135 * The pipe->set_sampler_views() driver hook.
3136 */
3137 static void
3138 crocus_set_sampler_views(struct pipe_context *ctx,
3139 enum pipe_shader_type p_stage,
3140 unsigned start, unsigned count,
3141 unsigned unbind_num_trailing_slots,
3142 bool take_ownership,
3143 struct pipe_sampler_view **views)
3144 {
3145 struct crocus_context *ice = (struct crocus_context *) ctx;
3146 gl_shader_stage stage = stage_from_pipe(p_stage);
3147 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3148
3149 shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3150
3151 for (unsigned i = 0; i < count; i++) {
3152 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3153
3154 if (take_ownership) {
3155 pipe_sampler_view_reference((struct pipe_sampler_view **)
3156 &shs->textures[start + i], NULL);
3157 shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3158 } else {
3159 pipe_sampler_view_reference((struct pipe_sampler_view **)
3160 &shs->textures[start + i], pview);
3161 }
3162
3163 struct crocus_sampler_view *view = (void *) pview;
3164 if (view) {
3165 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3166 view->res->bind_stages |= 1 << stage;
3167
3168 shs->bound_sampler_views |= 1 << (start + i);
3169 }
3170 }
3171 #if GFX_VER == 6
3172 /* The first-level parameters to crocus_upload_sampler_state are gfx6-only. */
3173 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3174 #endif
3175 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3176 ice->state.dirty |=
3177 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3178 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3179 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3180 }
3181
3182 /**
3183 * The pipe->set_tess_state() driver hook.
3184 */
3185 static void
3186 crocus_set_tess_state(struct pipe_context *ctx,
3187 const float default_outer_level[4],
3188 const float default_inner_level[2])
3189 {
3190 struct crocus_context *ice = (struct crocus_context *) ctx;
3191 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3192
3193 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3194 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3195
3196 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3197 shs->sysvals_need_upload = true;
3198 }
3199
3200 static void
3201 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3202 {
3203 struct crocus_context *ice = (struct crocus_context *) ctx;
3204
3205 ice->state.patch_vertices = patch_vertices;
3206 }
3207
3208 static void
3209 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3210 {
3211 struct crocus_surface *surf = (void *) p_surf;
3212 pipe_resource_reference(&p_surf->texture, NULL);
3213
3214 pipe_resource_reference(&surf->align_res, NULL);
3215 free(surf);
3216 }
3217
3218 static void
3219 crocus_set_clip_state(struct pipe_context *ctx,
3220 const struct pipe_clip_state *state)
3221 {
3222 struct crocus_context *ice = (struct crocus_context *) ctx;
3223 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3224 struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3225 struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3226
3227 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3228
3229 #if GFX_VER <= 5
3230 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3231 #endif
3232 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3233 CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3234 shs->sysvals_need_upload = true;
3235 gshs->sysvals_need_upload = true;
3236 tshs->sysvals_need_upload = true;
3237 }
3238
3239 /**
3240 * The pipe->set_polygon_stipple() driver hook.
3241 */
3242 static void
3243 crocus_set_polygon_stipple(struct pipe_context *ctx,
3244 const struct pipe_poly_stipple *state)
3245 {
3246 struct crocus_context *ice = (struct crocus_context *) ctx;
3247 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3248 ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3249 }
3250
3251 /**
3252 * The pipe->set_sample_mask() driver hook.
3253 */
3254 static void
3255 crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3256 {
3257 struct crocus_context *ice = (struct crocus_context *) ctx;
3258
3259 /* We only support 8x MSAA, so we have 8 bits of sample mask.
3260 * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
3261 */
3262 ice->state.sample_mask = sample_mask & 0xff;
3263 ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3264 }
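/* For example, when st/mesa passes sample_mask = 0xffffffff ("enable all
 * samples"), we store 0xff; with a 4x MSAA framebuffer, presumably only
 * the low 4 bits of that stored mask end up being meaningful.
 */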
3265
3266 static void
3267 crocus_fill_scissor_rect(struct crocus_context *ice,
3268 int idx,
3269 struct pipe_scissor_state *ss)
3270 {
3271 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
3272 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
3273 const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
3274 struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
3275 .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
3276 .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
3277 .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
3278 .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
3279 };
3280 if (cso_state->scissor) {
3281 struct pipe_scissor_state *s = &ice->state.scissors[idx];
3282 scissor.minx = MAX2(scissor.minx, s->minx);
3283 scissor.miny = MAX2(scissor.miny, s->miny);
3284 scissor.maxx = MIN2(scissor.maxx, s->maxx);
3285 scissor.maxy = MIN2(scissor.maxy, s->maxy);
3286 }
3287 *ss = scissor;
3288 }
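/* A quick sanity check of the math above (illustrative numbers): a
 * viewport with scale = (320, -240) and translate = (320, 240) on a
 * 640x480 framebuffer gives minx = MAX2(-320 + 320, 0) = 0,
 * maxx = MIN2(320 + 320, 640) - 1 = 639, miny = 0, maxy = 479 --
 * i.e. the full framebuffer when no rasterizer scissor is enabled.
 */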
3289
3290 /**
3291 * The pipe->set_scissor_states() driver hook.
3292 *
3293 * This corresponds to our SCISSOR_RECT state structures. It's an
3294 * exact match, so we just store them, and memcpy them out later.
3295 */
3296 static void
3297 crocus_set_scissor_states(struct pipe_context *ctx,
3298 unsigned start_slot,
3299 unsigned num_scissors,
3300 const struct pipe_scissor_state *rects)
3301 {
3302 struct crocus_context *ice = (struct crocus_context *) ctx;
3303
3304 for (unsigned i = 0; i < num_scissors; i++) {
3305 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3306 /* If the scissor was out of bounds and got clamped to 0 width/height
3307 * at the bounds, the subtraction of 1 from maximums could produce a
3308 * negative number and thus not clip anything. Instead, just provide
3309 * a min > max scissor inside the bounds, which produces the expected
3310 * no rendering.
3311 */
3312 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3313 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3314 };
3315 } else {
3316 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3317 .minx = rects[i].minx, .miny = rects[i].miny,
3318 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3319 };
3320 }
3321 }
3322
3323 #if GFX_VER < 6
3324 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3325 #else
3326 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3327 #endif
3328 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3329
3330 }
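/* Example of the degenerate-scissor handling above: a clamped-to-empty
 * rect such as { .minx = 8, .maxx = 8 } is stored as
 * { .minx = 1, .maxx = 0, .miny = 1, .maxy = 0 } (min > max, so nothing
 * is rendered), while { 0, 0, 640, 480 } is stored as { 0, 0, 639, 479 }.
 */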
3331
3332 /**
3333 * The pipe->set_stencil_ref() driver hook.
3334 *
3335 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3336 */
3337 static void
3338 crocus_set_stencil_ref(struct pipe_context *ctx,
3339 const struct pipe_stencil_ref ref)
3340 {
3341 struct crocus_context *ice = (struct crocus_context *) ctx;
3342 ice->state.stencil_ref = ref;
3343 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3344 }
3345
3346 #if GFX_VER == 8
3347 static float
3348 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3349 {
3350 return copysignf(state->scale[axis], sign) + state->translate[axis];
3351 }
3352 #endif
3353
3354 /**
3355 * The pipe->set_viewport_states() driver hook.
3356 *
3357 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3358 * the guardband yet, as we need the framebuffer dimensions, but we can
3359 * at least fill out the rest.
3360 */
3361 static void
3362 crocus_set_viewport_states(struct pipe_context *ctx,
3363 unsigned start_slot,
3364 unsigned count,
3365 const struct pipe_viewport_state *states)
3366 {
3367 struct crocus_context *ice = (struct crocus_context *) ctx;
3368 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3369
3370 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3371
3372 /* Fix depth test misrenderings by lowering translated depth range */
3373 if (screen->driconf.lower_depth_range_rate != 1.0f)
3374 ice->state.viewports[start_slot].translate[2] *=
3375 screen->driconf.lower_depth_range_rate;
3376
3377 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3378 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3379 #if GFX_VER >= 6
3380 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3381 #endif
3382
3383 if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3384 !ice->state.cso_rast->cso.depth_clip_far))
3385 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3386 }
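/* For reference (standard Gallium convention, not specific to this file):
 * a viewport maps NDC to window coordinates as
 * x_w = scale[0] * x_ndc + translate[0] (and likewise for y/z), so the
 * lower_depth_range_rate workaround above simply compresses the depth
 * translation, e.g. a rate of 0.9 turns translate[2] = 0.5 into 0.45.
 */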
3387
3388 /**
3389 * The pipe->set_framebuffer_state() driver hook.
3390 *
3391 * Sets the current draw FBO, including color render targets, depth,
3392 * and stencil buffers.
3393 */
3394 static void
3395 crocus_set_framebuffer_state(struct pipe_context *ctx,
3396 const struct pipe_framebuffer_state *state)
3397 {
3398 struct crocus_context *ice = (struct crocus_context *) ctx;
3399 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3400 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3401 const struct intel_device_info *devinfo = &screen->devinfo;
3402 #if 0
3403 struct isl_device *isl_dev = &screen->isl_dev;
3404 struct crocus_resource *zres;
3405 struct crocus_resource *stencil_res;
3406 #endif
3407
3408 unsigned samples = util_framebuffer_get_num_samples(state);
3409 unsigned layers = util_framebuffer_get_num_layers(state);
3410
3411 #if GFX_VER >= 6
3412 if (cso->samples != samples) {
3413 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
3414 ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3415 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3416 #if GFX_VERx10 == 75
3417 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
3418 #endif
3419 }
3420 #endif
3421
3422 #if GFX_VER >= 6 && GFX_VER < 8
3423 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
3424 #endif
3425
3426 if ((cso->layers == 0) != (layers == 0)) {
3427 ice->state.dirty |= CROCUS_DIRTY_CLIP;
3428 }
3429
3430 if (cso->width != state->width || cso->height != state->height) {
3431 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3432 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3433 ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
3434 #if GFX_VER >= 6
3435 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3436 #endif
3437 }
3438
3439 if (cso->zsbuf || state->zsbuf) {
3440 ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
3441
3442 /* update SF's depth buffer format */
3443 if (GFX_VER == 7 && cso->zsbuf)
3444 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3445 }
3446
3447 /* wm thread dispatch enable */
3448 ice->state.dirty |= CROCUS_DIRTY_WM;
3449 util_copy_framebuffer_state(cso, state);
3450 cso->samples = samples;
3451 cso->layers = layers;
3452
3453 if (cso->zsbuf) {
3454 struct crocus_resource *zres;
3455 struct crocus_resource *stencil_res;
3456 enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
3457 crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
3458 &stencil_res);
3459 if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
3460 aux_usage = zres->aux.usage;
3461 }
3462 ice->state.hiz_usage = aux_usage;
3463 }
3464
3465 /* Render target change */
3466 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
3467
3468 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3469
3470 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
3471 }
3472
3473 /**
3474 * The pipe->set_constant_buffer() driver hook.
3475 *
3476 * This uploads any constant data in user buffers, and references
3477 * any UBO resources containing constant data.
3478 */
3479 static void
3480 crocus_set_constant_buffer(struct pipe_context *ctx,
3481 enum pipe_shader_type p_stage, unsigned index,
3482 bool take_ownership,
3483 const struct pipe_constant_buffer *input)
3484 {
3485 struct crocus_context *ice = (struct crocus_context *) ctx;
3486 gl_shader_stage stage = stage_from_pipe(p_stage);
3487 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3488 struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
3489
3490 util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
3491
3492 if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3493 shs->bound_cbufs |= 1u << index;
3494
3495 if (input->user_buffer) {
3496 void *map = NULL;
3497 pipe_resource_reference(&cbuf->buffer, NULL);
3498 u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3499 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3500
3501 if (!cbuf->buffer) {
3502 /* Allocation was unsuccessful - just unbind */
3503 crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
3504 return;
3505 }
3506
3507 assert(map);
3508 memcpy(map, input->user_buffer, input->buffer_size);
3509 }
3510 cbuf->buffer_size =
3511 MIN2(input->buffer_size,
3512 crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3513
3514 struct crocus_resource *res = (void *) cbuf->buffer;
3515 res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3516 res->bind_stages |= 1 << stage;
3517 } else {
3518 shs->bound_cbufs &= ~(1u << index);
3519 }
3520
3521 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3522 }
3523
3524 static void
3525 upload_sysvals(struct crocus_context *ice,
3526 gl_shader_stage stage)
3527 {
3528 UNUSED struct crocus_genx_state *genx = ice->state.genx;
3529 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3530
3531 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
3532 if (!shader || shader->num_system_values == 0)
3533 return;
3534
3535 assert(shader->num_cbufs > 0);
3536
3537 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3538 struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
3539 unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
3540 uint32_t *map = NULL;
3541
3542 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3543 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3544 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3545
3546 for (int i = 0; i < shader->num_system_values; i++) {
3547 uint32_t sysval = shader->system_values[i];
3548 uint32_t value = 0;
3549
3550 if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
3551 #if GFX_VER >= 7
3552 unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
3553 unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
3554 struct isl_image_param *param =
3555 &genx->shaders[stage].image_param[img];
3556
3557 assert(offset < sizeof(struct isl_image_param));
3558 value = ((uint32_t *) param)[offset];
3559 #endif
3560 } else if (sysval == ELK_PARAM_BUILTIN_ZERO) {
3561 value = 0;
3562 } else if (ELK_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
3563 int plane = ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
3564 int comp = ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
3565 value = fui(ice->state.clip_planes.ucp[plane][comp]);
3566 } else if (sysval == ELK_PARAM_BUILTIN_PATCH_VERTICES_IN) {
3567 if (stage == MESA_SHADER_TESS_CTRL) {
3568 value = ice->state.vertices_per_patch;
3569 } else {
3570 assert(stage == MESA_SHADER_TESS_EVAL);
3571 const struct shader_info *tcs_info =
3572 crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3573 if (tcs_info)
3574 value = tcs_info->tess.tcs_vertices_out;
3575 else
3576 value = ice->state.vertices_per_patch;
3577 }
3578 } else if (sysval >= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
3579 sysval <= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
3580 unsigned i = sysval - ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
3581 value = fui(ice->state.default_outer_level[i]);
3582 } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
3583 value = fui(ice->state.default_inner_level[0]);
3584 } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
3585 value = fui(ice->state.default_inner_level[1]);
3586 } else if (sysval >= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
3587 sysval <= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
3588 unsigned i = sysval - ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
3589 value = ice->state.last_block[i];
3590 } else {
3591 assert(!"unhandled system value");
3592 }
3593
3594 *map++ = value;
3595 }
3596
3597 cbuf->buffer_size = upload_size;
3598 shs->sysvals_need_upload = false;
3599 }
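/* Layout sketch of the upload above: for a shader with
 * num_system_values == 3, the last constant buffer slot receives a
 * 12-byte upload in which dword i holds the value for
 * shader->system_values[i] -- e.g. a clip plane component as fui(), or
 * PATCH_VERTICES_IN as a plain integer.
 */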
3600
3601 /**
3602 * The pipe->set_shader_buffers() driver hook.
3603 *
3604 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
3605 * SURFACE_STATE here, as the buffer offset may change each time.
3606 */
3607 static void
3608 crocus_set_shader_buffers(struct pipe_context *ctx,
3609 enum pipe_shader_type p_stage,
3610 unsigned start_slot, unsigned count,
3611 const struct pipe_shader_buffer *buffers,
3612 unsigned writable_bitmask)
3613 {
3614 struct crocus_context *ice = (struct crocus_context *) ctx;
3615 gl_shader_stage stage = stage_from_pipe(p_stage);
3616 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3617
3618 unsigned modified_bits = u_bit_consecutive(start_slot, count);
3619
3620 shs->bound_ssbos &= ~modified_bits;
3621 shs->writable_ssbos &= ~modified_bits;
3622 shs->writable_ssbos |= writable_bitmask << start_slot;
3623
3624 for (unsigned i = 0; i < count; i++) {
3625 if (buffers && buffers[i].buffer) {
3626 struct crocus_resource *res = (void *) buffers[i].buffer;
3627 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3628 pipe_resource_reference(&ssbo->buffer, &res->base.b);
3629 ssbo->buffer_offset = buffers[i].buffer_offset;
3630 ssbo->buffer_size =
3631 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3632
3633 shs->bound_ssbos |= 1 << (start_slot + i);
3634
3635 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3636 res->bind_stages |= 1 << stage;
3637
3638 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3639 ssbo->buffer_offset + ssbo->buffer_size);
3640 } else {
3641 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3642 }
3643 }
3644
3645 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3646 }
3647
3648 static void
3649 crocus_delete_state(struct pipe_context *ctx, void *state)
3650 {
3651 free(state);
3652 }
3653
3654 /**
3655 * The pipe->set_vertex_buffers() driver hook.
3656 *
3657 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3658 */
3659 static void
3660 crocus_set_vertex_buffers(struct pipe_context *ctx,
3661 unsigned count,
3662 const struct pipe_vertex_buffer *buffers)
3663 {
3664 struct crocus_context *ice = (struct crocus_context *) ctx;
3665 struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
3666 const unsigned padding =
3667 (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
3668
3669 util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
3670 buffers, count, true);
3671
3672 for (unsigned i = 0; i < count; i++) {
3673 struct pipe_vertex_buffer *state =
3674 &ice->state.vertex_buffers[i];
3675
3676 if (!state->is_user_buffer && state->buffer.resource) {
3677 struct crocus_resource *res = (void *)state->buffer.resource;
3678 res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
3679 }
3680
3681 uint32_t end = 0;
3682 if (state->buffer.resource)
3683 end = state->buffer.resource->width0 + padding;
3684 ice->state.vb_end[i] = end;
3685 }
3686 ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
3687 }
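/* Example of the padding logic above: on Ivybridge (GFX_VERx10 == 70 and
 * not Baytrail) the padding is 2, so a 1024-byte vertex buffer records
 * vb_end = 1026; on Haswell and later it records vb_end = 1024.
 */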
3688
3689 #if GFX_VERx10 < 75
3690 static uint8_t get_wa_flags(enum isl_format format)
3691 {
3692 uint8_t wa_flags = 0;
3693
3694 switch (format) {
3695 case ISL_FORMAT_R10G10B10A2_USCALED:
3696 wa_flags = ELK_ATTRIB_WA_SCALE;
3697 break;
3698 case ISL_FORMAT_R10G10B10A2_SSCALED:
3699 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE;
3700 break;
3701 case ISL_FORMAT_R10G10B10A2_UNORM:
3702 wa_flags = ELK_ATTRIB_WA_NORMALIZE;
3703 break;
3704 case ISL_FORMAT_R10G10B10A2_SNORM:
3705 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE;
3706 break;
3707 case ISL_FORMAT_R10G10B10A2_SINT:
3708 wa_flags = ELK_ATTRIB_WA_SIGN;
3709 break;
3710 case ISL_FORMAT_B10G10R10A2_USCALED:
3711 wa_flags = ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3712 break;
3713 case ISL_FORMAT_B10G10R10A2_SSCALED:
3714 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3715 break;
3716 case ISL_FORMAT_B10G10R10A2_UNORM:
3717 wa_flags = ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3718 break;
3719 case ISL_FORMAT_B10G10R10A2_SNORM:
3720 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3721 break;
3722 case ISL_FORMAT_B10G10R10A2_SINT:
3723 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_BGRA;
3724 break;
3725 case ISL_FORMAT_B10G10R10A2_UINT:
3726 wa_flags = ELK_ATTRIB_WA_BGRA;
3727 break;
3728 default:
3729 break;
3730 }
3731 return wa_flags;
3732 }
3733 #endif
3734
3735 /**
3736 * Gallium CSO for vertex elements.
3737 */
3738 struct crocus_vertex_element_state {
3739 uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
3740 #if GFX_VER == 8
3741 uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
3742 #endif
3743 uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
3744 #if GFX_VER == 8
3745 uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
3746 #endif
3747 uint32_t step_rate[16];
3748 uint8_t wa_flags[33];
3749 uint16_t strides[16];
3750 unsigned count;
3751 };
3752
3753 /**
3754 * The pipe->create_vertex_elements() driver hook.
3755 *
3756 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3757 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3758 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3759 * needed. Otherwise, we need information that is only available at draw
3760 * time. We set up edgeflag_ve and edgeflag_vfi as alternative versions of
3761 * the last 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING, to be used
3762 * at draw time if we detect that the vertex shader needs EdgeFlag.
3763 */
3764 static void *
3765 crocus_create_vertex_elements(struct pipe_context *ctx,
3766 unsigned count,
3767 const struct pipe_vertex_element *state)
3768 {
3769 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3770 const struct intel_device_info *devinfo = &screen->devinfo;
3771 struct crocus_vertex_element_state *cso =
3772 calloc(1, sizeof(struct crocus_vertex_element_state));
3773
3774 cso->count = count;
3775
3776 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3777 ve.DWordLength =
3778 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3779 }
3780
3781 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3782 #if GFX_VER == 8
3783 uint32_t *vfi_pack_dest = cso->vf_instancing;
3784 #endif
3785
3786 if (count == 0) {
3787 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3788 ve.Valid = true;
3789 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3790 ve.Component0Control = VFCOMP_STORE_0;
3791 ve.Component1Control = VFCOMP_STORE_0;
3792 ve.Component2Control = VFCOMP_STORE_0;
3793 ve.Component3Control = VFCOMP_STORE_1_FP;
3794 }
3795 #if GFX_VER == 8
3796 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3797 }
3798 #endif
3799 }
3800
3801 for (int i = 0; i < count; i++) {
3802 const struct crocus_format_info fmt =
3803 crocus_format_for_usage(devinfo, state[i].src_format, 0);
3804 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3805 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3806 enum isl_format actual_fmt = fmt.fmt;
3807
3808 #if GFX_VERx10 < 75
3809 cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3810
3811 if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3812 fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3813 fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3814 fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3815 fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3816 fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3817 fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3818 fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3819 fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3820 fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3821 fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3822 actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3823 if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3824 actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3825 if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3826 actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3827 if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3828 actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3829 if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3830 actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3831 #endif
3832
3833 cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3834 cso->strides[state[i].vertex_buffer_index] = state[i].src_stride;
3835
3836 switch (isl_format_get_num_channels(fmt.fmt)) {
3837 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3838 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3839 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3840 case 3:
3841 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3842 : VFCOMP_STORE_1_FP;
3843 break;
3844 }
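/* e.g. a two-channel format such as ISL_FORMAT_R32G32_FLOAT takes the
 * "case 2" path above, yielding comp = { STORE_SRC, STORE_SRC,
 * STORE_0, STORE_1_FP } -- the usual (x, y, 0, 1.0) expansion.
 */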
3845 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3846 #if GFX_VER >= 6
3847 ve.EdgeFlagEnable = false;
3848 #endif
3849 ve.VertexBufferIndex = state[i].vertex_buffer_index;
3850 ve.Valid = true;
3851 ve.SourceElementOffset = state[i].src_offset;
3852 ve.SourceElementFormat = actual_fmt;
3853 ve.Component0Control = comp[0];
3854 ve.Component1Control = comp[1];
3855 ve.Component2Control = comp[2];
3856 ve.Component3Control = comp[3];
3857 #if GFX_VER < 5
3858 ve.DestinationElementOffset = i * 4;
3859 #endif
3860 }
3861
3862 #if GFX_VER == 8
3863 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3864 vi.VertexElementIndex = i;
3865 vi.InstancingEnable = state[i].instance_divisor > 0;
3866 vi.InstanceDataStepRate = state[i].instance_divisor;
3867 }
3868 #endif
3869 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3870 #if GFX_VER == 8
3871 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3872 #endif
3873 }
3874
3875 /* An alternative version of the last VE and VFI is stored so it
3876 * can be used at draw time in case the vertex shader uses EdgeFlag.
3877 */
3878 if (count) {
3879 const unsigned edgeflag_index = count - 1;
3880 const struct crocus_format_info fmt =
3881 crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3882 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3883 #if GFX_VER >= 6
3884 ve.EdgeFlagEnable = true;
3885 #endif
3886 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3887 ve.Valid = true;
3888 ve.SourceElementOffset = state[edgeflag_index].src_offset;
3889 ve.SourceElementFormat = fmt.fmt;
3890 ve.Component0Control = VFCOMP_STORE_SRC;
3891 ve.Component1Control = VFCOMP_STORE_0;
3892 ve.Component2Control = VFCOMP_STORE_0;
3893 ve.Component3Control = VFCOMP_STORE_0;
3894 }
3895 #if GFX_VER == 8
3896 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3897 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3898 * at draw time, as it should change if SGVs are emitted.
3899 */
3900 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3901 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3902 }
3903 #endif
3904 }
3905
3906 return cso;
3907 }
3908
3909 /**
3910 * The pipe->bind_vertex_elements_state() driver hook.
3911 */
3912 static void
3913 crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
3914 {
3915 struct crocus_context *ice = (struct crocus_context *) ctx;
3916 #if GFX_VER == 8
3917 struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
3918 struct crocus_vertex_element_state *new_cso = state;
3919
3920 if (new_cso && cso_changed(count))
3921 ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
3922 #endif
3923 ice->state.cso_vertex_elements = state;
3924 ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
3925 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
3926 }
3927
3928 #if GFX_VER >= 6
3929 struct crocus_streamout_counter {
3930 uint32_t offset_start;
3931 uint32_t offset_end;
3932
3933 uint64_t accum;
3934 };
3935
3936 /**
3937 * Gallium CSO for stream output (transform feedback) targets.
3938 */
3939 struct crocus_stream_output_target {
3940 struct pipe_stream_output_target base;
3941
3942 /** Stride (bytes-per-vertex) during this transform feedback operation */
3943 uint16_t stride;
3944
3945 /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
3946 bool zeroed;
3947
3948 struct crocus_resource *offset_res;
3949 uint32_t offset_offset;
3950
3951 #if GFX_VER == 6
3952 void *prim_map;
3953 struct crocus_streamout_counter prev_count;
3954 struct crocus_streamout_counter count;
3955 #endif
3956 #if GFX_VER == 8
3957 /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
3958 bool zero_offset;
3959 #endif
3960 };
3961
3962 #if GFX_VER >= 7
3963 static uint32_t
3964 crocus_get_so_offset(struct pipe_stream_output_target *so)
3965 {
3966 struct crocus_stream_output_target *tgt = (void *)so;
3967 struct pipe_transfer *transfer;
3968 struct pipe_box box;
3969 uint32_t result;
3970 u_box_1d(tgt->offset_offset, 4, &box);
3971 void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3972 0, PIPE_MAP_DIRECTLY,
3973 &box, &transfer);
3974 assert(val);
3975 result = *(uint32_t *)val;
3976 so->context->buffer_unmap(so->context, transfer);
3977
3978 return result / tgt->stride;
3979 }
3980 #endif
3981
3982 #if GFX_VER == 6
3983 static void
3984 compute_vertices_written_so_far(struct crocus_context *ice,
3985 struct crocus_stream_output_target *tgt,
3986 struct crocus_streamout_counter *count,
3987 uint64_t *svbi);
3988
3989 static uint32_t
3990 crocus_get_so_offset(struct pipe_stream_output_target *so)
3991 {
3992 struct crocus_stream_output_target *tgt = (void *)so;
3993 struct crocus_context *ice = (void *)so->context;
3994
3995 uint64_t vert_written;
3996 compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
3997 return vert_written;
3998 }
3999 #endif
4000
4001 /**
4002 * The pipe->create_stream_output_target() driver hook.
4003 *
4004 * "Target" here refers to a destination buffer. We translate this into
4005 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4006 * know which buffer this represents, or whether we ought to zero the
4007 * write-offsets, or append. Those are handled in the set() hook.
4008 */
4009 static struct pipe_stream_output_target *
4010 crocus_create_stream_output_target(struct pipe_context *ctx,
4011 struct pipe_resource *p_res,
4012 unsigned buffer_offset,
4013 unsigned buffer_size)
4014 {
4015 struct crocus_resource *res = (void *) p_res;
4016 struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
4017 if (!cso)
4018 return NULL;
4019
4020 res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4021
4022 pipe_reference_init(&cso->base.reference, 1);
4023 pipe_resource_reference(&cso->base.buffer, p_res);
4024 cso->base.buffer_offset = buffer_offset;
4025 cso->base.buffer_size = buffer_size;
4026 cso->base.context = ctx;
4027
4028 util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4029 buffer_offset + buffer_size);
4030 #if GFX_VER >= 7
4031 struct crocus_context *ice = (struct crocus_context *) ctx;
4032 void *temp;
4033 u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
4034 &cso->offset_offset,
4035 (struct pipe_resource **)&cso->offset_res,
4036 &temp);
4037 #endif
4038
4039 return &cso->base;
4040 }
4041
4042 static void
4043 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4044 struct pipe_stream_output_target *state)
4045 {
4046 struct crocus_stream_output_target *cso = (void *) state;
4047
4048 pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4049 pipe_resource_reference(&cso->base.buffer, NULL);
4050
4051 free(cso);
4052 }
4053
4054 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
4055 #define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
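/* These are MMIO register offsets: GEN6_SO_NUM_PRIMS_WRITTEN is the gen6
 * primitives-written statistics register snapshotted below, and
 * GEN7_SO_WRITE_OFFSET(n) is the per-SO-buffer write offset register,
 * e.g. GEN7_SO_WRITE_OFFSET(2) == 0x5280 + 2 * 4 == 0x5288.
 */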
4056
4057 #if GFX_VER == 6
4058 static void
4059 aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
4060 struct crocus_streamout_counter *counter)
4061 {
4062 uint64_t *prim_counts = tgt->prim_map;
4063
4064 if (crocus_batch_references(batch, tgt->offset_res->bo)) {
4065 struct pipe_fence_handle *out_fence = NULL;
4066 batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
4067 batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
4068 batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
4069 }
4070
4071 for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
4072 counter->accum += prim_counts[i + 1] - prim_counts[i];
4073 }
4074 tgt->count.offset_start = tgt->count.offset_end = 0;
4075 }
4076
4077 static void
4078 crocus_stream_store_prims_written(struct crocus_batch *batch,
4079 struct crocus_stream_output_target *tgt)
4080 {
4081 if (!tgt->offset_res) {
4082 u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
4083 &tgt->offset_offset,
4084 (struct pipe_resource **)&tgt->offset_res,
4085 &tgt->prim_map);
4086 tgt->count.offset_start = tgt->count.offset_end = 0;
4087 }
4088
4089 if (tgt->count.offset_end + 16 >= 4096) {
4090 aggregate_stream_counter(batch, tgt, &tgt->prev_count);
4091 aggregate_stream_counter(batch, tgt, &tgt->count);
4092 }
4093
4094 crocus_emit_mi_flush(batch);
4095 crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
4096 tgt->offset_res->bo,
4097 tgt->count.offset_end + tgt->offset_offset, false);
4098 tgt->count.offset_end += 8;
4099 }
4100
4101 static void
4102 compute_vertices_written_so_far(struct crocus_context *ice,
4103 struct crocus_stream_output_target *tgt,
4104 struct crocus_streamout_counter *counter,
4105 uint64_t *svbi)
4106 {
4107 //TODO vertices per prim
4108 aggregate_stream_counter(&ice->batches[0], tgt, counter);
4109
4110 *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
4111 }
4112 #endif
4113 /**
4114 * The pipe->set_stream_output_targets() driver hook.
4115 *
4116 * At this point, we know which targets are bound to a particular index,
4117 * and also whether we want to append or start over. We can finish the
4118 * 3DSTATE_SO_BUFFER packets we started earlier.
4119 */
4120 static void
4121 crocus_set_stream_output_targets(struct pipe_context *ctx,
4122 unsigned num_targets,
4123 struct pipe_stream_output_target **targets,
4124 const unsigned *offsets)
4125 {
4126 struct crocus_context *ice = (struct crocus_context *) ctx;
4127 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4128 struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
4129 const bool active = num_targets > 0;
4130 if (ice->state.streamout_active != active) {
4131 ice->state.streamout_active = active;
4132 #if GFX_VER >= 7
4133 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
4134 #else
4135 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
4136 #endif
4137
4138 /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4139 * it's a non-pipelined command. If we're switching streamout on, we
4140 * may have missed emitting it earlier, so do so now. (We're already
4141 * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4142 */
4143 if (active) {
4144 #if GFX_VER >= 7
4145 ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
4146 #endif
4147 } else {
4148 uint32_t flush = 0;
4149 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4150 struct crocus_stream_output_target *tgt =
4151 (void *) ice->state.so_target[i];
4152 if (tgt) {
4153 struct crocus_resource *res = (void *) tgt->base.buffer;
4154
4155 flush |= crocus_flush_bits_for_history(res);
4156 crocus_dirty_for_history(ice, res);
4157 }
4158 }
4159 crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
4160 "make streamout results visible", flush);
4161 }
4162 }
4163
4164 ice->state.so_targets = num_targets;
4165 for (int i = 0; i < 4; i++) {
4166 pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
4167 pipe_so_target_reference(&ice->state.so_target[i],
4168 i < num_targets ? targets[i] : NULL);
4169 }
4170
4171 #if GFX_VER == 6
4172 bool stored_num_prims = false;
4173 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4174 if (num_targets) {
4175 struct crocus_stream_output_target *tgt =
4176 (void *) ice->state.so_target[i];
4177
4178 if (!tgt)
4179 continue;
4180 if (offsets[i] == 0) {
4181 // This means that we're supposed to ignore anything written to
4182 // the buffer before. We can do this by just clearing out the
4183 // count of writes to the prim count buffer.
4184 tgt->count.offset_start = tgt->count.offset_end;
4185 tgt->count.accum = 0;
4186 ice->state.svbi = 0;
4187 } else {
4188 if (tgt->offset_res) {
4189 compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
4190 tgt->count.offset_start = tgt->count.offset_end;
4191 }
4192 }
4193
4194 if (!stored_num_prims) {
4195 crocus_stream_store_prims_written(batch, tgt);
4196 stored_num_prims = true;
4197 }
4198 } else {
4199 struct crocus_stream_output_target *tgt =
4200 (void *) old_tgt[i];
4201 if (tgt) {
4202 if (!stored_num_prims) {
4203 crocus_stream_store_prims_written(batch, tgt);
4204 stored_num_prims = true;
4205 }
4206
4207 if (tgt->offset_res) {
4208 tgt->prev_count = tgt->count;
4209 }
4210 }
4211 }
4212 pipe_so_target_reference(&old_tgt[i], NULL);
4213 }
4214 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
4215 #else
4216 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4217 if (num_targets) {
4218 struct crocus_stream_output_target *tgt =
4219 (void *) ice->state.so_target[i];
4220
4221 if (offsets[i] == 0) {
4222 #if GFX_VER == 8
4223 if (tgt)
4224 tgt->zero_offset = true;
4225 #endif
4226 crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
4227 }
4228 else if (tgt)
4229 crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4230 tgt->offset_res->bo,
4231 tgt->offset_offset);
4232 } else {
4233 struct crocus_stream_output_target *tgt =
4234 (void *) old_tgt[i];
4235 if (tgt)
4236 crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4237 tgt->offset_res->bo,
4238 tgt->offset_offset, false);
4239 }
4240 pipe_so_target_reference(&old_tgt[i], NULL);
4241 }
4242 #endif
4243 /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4244 if (!active)
4245 return;
4246 #if GFX_VER >= 7
4247 ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
4248 #elif GFX_VER == 6
4249 ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
4250 #endif
4251 }
4252
4253 #endif
4254
4255 #if GFX_VER >= 7
4256 /**
4257 * A crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4258 * 3DSTATE_STREAMOUT packets.
4259 *
4260 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4261 * hardware to record. We can create it entirely based on the shader, with
4262 * no dynamic state dependencies.
4263 *
4264 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4265 * state-based settings. We capture the shader-related ones here, and merge
4266 * the rest in at draw time.
4267 */
4268 static uint32_t *
4269 crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
4270 const struct intel_vue_map *vue_map)
4271 {
4272 struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4273 int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4274 int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4275 int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4276 int max_decls = 0;
4277 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4278
4279 memset(so_decl, 0, sizeof(so_decl));
4280
4281 /* Construct the list of SO_DECLs to be emitted. The formatting of the
4282 * command feels strange -- each dword pair contains a SO_DECL per stream.
4283 */
4284 for (unsigned i = 0; i < info->num_outputs; i++) {
4285 const struct pipe_stream_output *output = &info->output[i];
4286 const int buffer = output->output_buffer;
4287 const int varying = output->register_index;
4288 const unsigned stream_id = output->stream;
4289 assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4290
4291 buffer_mask[stream_id] |= 1 << buffer;
4292
4293 assert(vue_map->varying_to_slot[varying] >= 0);
4294
4295 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4296 * array. Instead, it simply increments DstOffset for the following
4297 * input by the number of components that should be skipped.
4298 *
4299 * Our hardware is unusual in that it requires us to program SO_DECLs
4300 * for fake "hole" components, rather than simply taking the offset
4301 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
4302 * program as many size = 4 holes as we can, then a final hole to
4303 * accommodate the final 1, 2, or 3 remaining.
4304 */
4305 int skip_components = output->dst_offset - next_offset[buffer];
4306
4307 while (skip_components > 0) {
4308 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4309 .HoleFlag = 1,
4310 .OutputBufferSlot = output->output_buffer,
4311 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4312 };
4313 skip_components -= 4;
4314 }
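/* For instance, a gap of skip_components == 7 is emitted as two holes:
 * one with ComponentMask (1 << 4) - 1 == 0xf and one with
 * (1 << 3) - 1 == 0x7, matching the "as many size-4 holes as we can"
 * rule described above.
 */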
4315
4316 next_offset[buffer] = output->dst_offset + output->num_components;
4317
4318 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4319 .OutputBufferSlot = output->output_buffer,
4320 .RegisterIndex = vue_map->varying_to_slot[varying],
4321 .ComponentMask =
4322 ((1 << output->num_components) - 1) << output->start_component,
4323 };
4324
4325 if (decls[stream_id] > max_decls)
4326 max_decls = decls[stream_id];
4327 }
4328
4329 unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4330 uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4331 uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
4332
4333 crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4334 int urb_entry_read_offset = 0;
4335 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4336 urb_entry_read_offset;
4337
4338 /* We always read the whole vertex. This could be reduced at some
4339 * point by reading less and offsetting the register index in the
4340 * SO_DECLs.
4341 */
4342 sol.Stream0VertexReadOffset = urb_entry_read_offset;
4343 sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4344 sol.Stream1VertexReadOffset = urb_entry_read_offset;
4345 sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4346 sol.Stream2VertexReadOffset = urb_entry_read_offset;
4347 sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4348 sol.Stream3VertexReadOffset = urb_entry_read_offset;
4349 sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4350
4351 // TODO: Double-check that stride == 0 means no buffer. Probably this
4352 // needs to go elsewhere, where the buffer enable stuff is actually
4353 // known.
4354 #if GFX_VER < 8
4355 sol.SOBufferEnable0 = !!info->stride[0];
4356 sol.SOBufferEnable1 = !!info->stride[1];
4357 sol.SOBufferEnable2 = !!info->stride[2];
4358 sol.SOBufferEnable3 = !!info->stride[3];
4359 #else
4360 /* Set buffer pitches; 0 means unbound. */
4361 sol.Buffer0SurfacePitch = 4 * info->stride[0];
4362 sol.Buffer1SurfacePitch = 4 * info->stride[1];
4363 sol.Buffer2SurfacePitch = 4 * info->stride[2];
4364 sol.Buffer3SurfacePitch = 4 * info->stride[3];
4365 #endif
4366 }
4367
4368 crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4369 list.DWordLength = 3 + 2 * max_decls - 2;
4370 list.StreamtoBufferSelects0 = buffer_mask[0];
4371 list.StreamtoBufferSelects1 = buffer_mask[1];
4372 list.StreamtoBufferSelects2 = buffer_mask[2];
4373 list.StreamtoBufferSelects3 = buffer_mask[3];
4374 list.NumEntries0 = decls[0];
4375 list.NumEntries1 = decls[1];
4376 list.NumEntries2 = decls[2];
4377 list.NumEntries3 = decls[3];
4378 }
4379
4380 for (int i = 0; i < max_decls; i++) {
4381 crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4382 entry.Stream0Decl = so_decl[0][i];
4383 entry.Stream1Decl = so_decl[1][i];
4384 entry.Stream2Decl = so_decl[2][i];
4385 entry.Stream3Decl = so_decl[3][i];
4386 }
4387 }
4388
4389 return map;
4390 }
4391 #endif
4392
4393 #if GFX_VER == 6
4394 static void
4395 crocus_emit_so_svbi(struct crocus_context *ice)
4396 {
4397 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4398
4399 unsigned max_vertex = 0xffffffff;
4400 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4401 struct crocus_stream_output_target *tgt =
4402 (void *) ice->state.so_target[i];
4403 if (tgt)
4404 max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
4405 }
4406
4407 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4408 svbi.IndexNumber = 0;
4409 svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
4410 svbi.MaximumIndex = max_vertex;
4411 }
4412
4413 /* Initialize the rest of the SVBIs to reasonable values so that we don't
4414 * run out of room writing the regular data.
4415 */
4416 for (int i = 1; i < 4; i++) {
4417 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4418 svbi.IndexNumber = i;
4419 svbi.StreamedVertexBufferIndex = 0;
4420 svbi.MaximumIndex = 0xffffffff;
4421 }
4422 }
4423 }
4424
4425 #endif
4426
4427
4428 #if GFX_VER >= 6
4429 static bool
4430 crocus_is_drawing_points(const struct crocus_context *ice)
4431 {
4432 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4433
4434 if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4435 cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4436 return true;
4437
4438 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4439 const struct elk_gs_prog_data *gs_prog_data =
4440 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4441 return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4442 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4443 const struct elk_tes_prog_data *tes_data =
4444 (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4445 return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4446 } else {
4447 return ice->state.prim_mode == MESA_PRIM_POINTS;
4448 }
4449 }
4450 #endif
4451
4452 #if GFX_VER >= 6
4453 static void
4454 get_attr_override(
4455 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
4456 const struct intel_vue_map *vue_map,
4457 int urb_entry_read_offset, int fs_attr,
4458 bool two_side_color, uint32_t *max_source_attr)
4459 {
4460 /* Find the VUE slot for this attribute. */
4461 int slot = vue_map->varying_to_slot[fs_attr];
4462
4463 /* Viewport and Layer are stored in the VUE header. We need to override
4464 * them to zero if earlier stages didn't write them, as GL requires that
4465 * they read back as zero when not explicitly set.
4466 */
4467 if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
4468 attr->ComponentOverrideX = true;
4469 attr->ComponentOverrideW = true;
4470 attr->ConstantSource = CONST_0000;
4471
4472 if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4473 attr->ComponentOverrideY = true;
4474 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4475 attr->ComponentOverrideZ = true;
4476
4477 return;
4478 }
4479
4480 /* If only a back color was written, and no front color, use the
4481 * back color instead of leaving it undefined.
4482 */
4483 if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4484 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4485 if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4486 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4487
4488 if (slot == -1) {
4489 /* This attribute does not exist in the VUE--that means that the vertex
4490 * shader did not write to it. This means that either:
4491 *
4492 * (a) This attribute is a texture coordinate, and it is going to be
4493 * replaced with point coordinates (as a consequence of a call to
4494 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
4495 * hardware will ignore whatever attribute override we supply.
4496 *
4497 * (b) This attribute is read by the fragment shader but not written by
4498 * the vertex shader, so its value is undefined. Therefore the
4499 * attribute override we supply doesn't matter.
4500 *
4501 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
4502 * previous shader stage.
4503 *
4504 * Note that we don't have to worry about the cases where the attribute
4505 * is gl_PointCoord or is undergoing point sprite coordinate
4506 * replacement, because in those cases, this function isn't called.
4507 *
4508 * In case (c), we need to program the attribute overrides so that the
4509 * primitive ID will be stored in this slot. In every other case, the
4510 * attribute override we supply doesn't matter. So just go ahead and
4511 * program primitive ID in every case.
4512 */
4513 attr->ComponentOverrideW = true;
4514 attr->ComponentOverrideX = true;
4515 attr->ComponentOverrideY = true;
4516 attr->ComponentOverrideZ = true;
4517 attr->ConstantSource = PRIM_ID;
4518 return;
4519 }
4520
4521 /* Compute the location of the attribute relative to urb_entry_read_offset.
4522 * Each increment of urb_entry_read_offset represents a 256-bit value, so
4523 * it counts for two 128-bit VUE slots.
4524 */
4525 int source_attr = slot - 2 * urb_entry_read_offset;
4526 assert(source_attr >= 0 && source_attr < 32);
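/* e.g. with urb_entry_read_offset == 1, the SF skips the first two
 * 128-bit VUE slots, so an attribute living in slot 5 is read as
 * source_attr == 3.
 */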
4527
4528 /* If we are doing two-sided color, and the VUE slot following this one
4529 * represents a back-facing color, then we need to instruct the SF unit to
4530 * do back-facing swizzling.
4531 */
4532 bool swizzling = two_side_color &&
4533 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4534 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4535 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4536 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
4537
4538 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
4539 if (*max_source_attr < source_attr + swizzling)
4540 *max_source_attr = source_attr + swizzling;
4541
4542 attr->SourceAttribute = source_attr;
4543 if (swizzling)
4544 attr->SwizzleSelect = INPUTATTR_FACING;
4545 }
4546
4547 static void
4548 calculate_attr_overrides(
4549 const struct crocus_context *ice,
4550 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
4551 uint32_t *point_sprite_enables,
4552 uint32_t *urb_entry_read_length,
4553 uint32_t *urb_entry_read_offset)
4554 {
4555 const struct elk_wm_prog_data *wm_prog_data = (void *)
4556 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4557 const struct intel_vue_map *vue_map = ice->shaders.last_vue_map;
4558 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4559 uint32_t max_source_attr = 0;
4560 const struct shader_info *fs_info =
4561 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4562
4563 int first_slot =
4564 elk_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
4565
4566 /* Each URB offset packs two varying slots */
4567 assert(first_slot % 2 == 0);
4568 *urb_entry_read_offset = first_slot / 2;
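   /* For example, if the first slot the FS reads is VUE slot 6, the read
    * offset becomes 3: three 256-bit units are skipped before reading.
    */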
4569 *point_sprite_enables = 0;
4570
4571 for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
4572 const int input_index = wm_prog_data->urb_setup[fs_attr];
4573
4574 if (input_index < 0)
4575 continue;
4576
4577 bool point_sprite = false;
4578 if (crocus_is_drawing_points(ice)) {
4579 if (fs_attr >= VARYING_SLOT_TEX0 &&
4580 fs_attr <= VARYING_SLOT_TEX7 &&
4581 cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
4582 point_sprite = true;
4583
4584 if (fs_attr == VARYING_SLOT_PNTC)
4585 point_sprite = true;
4586
4587 if (point_sprite)
4588 *point_sprite_enables |= 1U << input_index;
4589 }
4590
4591 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
4592 if (!point_sprite) {
4593 get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
4594 cso_rast->cso.light_twoside, &max_source_attr);
4595 }
4596
4597       /* The hardware can only apply overrides to the first 16 attributes;
4598        * the remaining (up to 16) attributes have to be lined up so that the
4599        * input index equals the output index. We'll need to do some
4600        * tweaking to make sure that's the case.
4601        */
4602 if (input_index < 16)
4603 attr_overrides[input_index] = attribute;
4604 else
4605 assert(attribute.SourceAttribute == input_index);
4606 }
4607
4608 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4609 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4610 *
4611 * "This field should be set to the minimum length required to read the
4612 * maximum source attribute. The maximum source attribute is indicated
4613 * by the maximum value of the enabled Attribute # Source Attribute if
4614 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4615 * enable is not set.
4616 * read_length = ceiling((max_source_attr + 1) / 2)
4617 *
4618 * [errata] Corruption/Hang possible if length programmed larger than
4619 * recommended"
4620 *
4621 * Similar text exists for Ivy Bridge.
4622 */
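   /* e.g. max_source_attr == 4 yields DIV_ROUND_UP(5, 2) == 3, i.e. three
    * 256-bit units read per vertex.
    */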
4623 *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
4624 }
4625 #endif
4626
4627 #if GFX_VER >= 7
4628 static void
4629 crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
4630 {
4631 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4632 const struct elk_wm_prog_data *wm_prog_data = (void *)
4633 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4634 #if GFX_VER >= 8
4635 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
4636 #else
4637 #define attr_overrides sbe.Attribute
4638 #endif
4639
4640 uint32_t urb_entry_read_length;
4641 uint32_t urb_entry_read_offset;
4642 uint32_t point_sprite_enables;
4643
4644 crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4645 sbe.AttributeSwizzleEnable = true;
4646 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
4647 sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
4648
4649 calculate_attr_overrides(ice,
4650 attr_overrides,
4651 &point_sprite_enables,
4652 &urb_entry_read_length,
4653 &urb_entry_read_offset);
4654 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
4655 sbe.VertexURBEntryReadLength = urb_entry_read_length;
4656 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
4657 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
4658 #if GFX_VER >= 8
4659 sbe.ForceVertexURBEntryReadLength = true;
4660 sbe.ForceVertexURBEntryReadOffset = true;
4661 #endif
4662 }
4663 #if GFX_VER >= 8
4664 crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4665 for (int i = 0; i < 16; i++)
4666 sbes.Attribute[i] = attr_overrides[i];
4667 }
4668 #endif
4669 }
4670 #endif
4671
4672 /* ------------------------------------------------------------------- */
4673
4674 /**
4675 * Populate VS program key fields based on the current state.
4676 */
4677 static void
4678 crocus_populate_vs_key(const struct crocus_context *ice,
4679 const struct shader_info *info,
4680 gl_shader_stage last_stage,
4681 struct elk_vs_prog_key *key)
4682 {
4683 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4684
4685 if (info->clip_distance_array_size == 0 &&
4686 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4687 last_stage == MESA_SHADER_VERTEX)
4688 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4689
4690 if (last_stage == MESA_SHADER_VERTEX &&
4691 info->outputs_written & (VARYING_BIT_PSIZ))
4692 key->clamp_pointsize = 1;
4693
4694 #if GFX_VER <= 5
4695 key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4696 cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4697 key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4698 #endif
4699
4700 key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4701
4702 #if GFX_VERx10 < 75
4703 uint64_t inputs_read = info->inputs_read;
4704 int ve_idx = 0;
4705 while (inputs_read) {
4706 int i = u_bit_scan64(&inputs_read);
4707 key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4708 ve_idx++;
4709 }
4710 #endif
4711 }
4712
4713 /**
4714 * Populate TCS program key fields based on the current state.
4715 */
4716 static void
4717 crocus_populate_tcs_key(const struct crocus_context *ice,
4718 struct elk_tcs_prog_key *key)
4719 {
4720 }
4721
4722 /**
4723 * Populate TES program key fields based on the current state.
4724 */
4725 static void
4726 crocus_populate_tes_key(const struct crocus_context *ice,
4727 const struct shader_info *info,
4728 gl_shader_stage last_stage,
4729 struct elk_tes_prog_key *key)
4730 {
4731 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4732
4733 if (info->clip_distance_array_size == 0 &&
4734 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4735 last_stage == MESA_SHADER_TESS_EVAL)
4736 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4737
4738 if (last_stage == MESA_SHADER_TESS_EVAL &&
4739 info->outputs_written & (VARYING_BIT_PSIZ))
4740 key->clamp_pointsize = 1;
4741 }
4742
4743 /**
4744 * Populate GS program key fields based on the current state.
4745 */
4746 static void
4747 crocus_populate_gs_key(const struct crocus_context *ice,
4748 const struct shader_info *info,
4749 gl_shader_stage last_stage,
4750 struct elk_gs_prog_key *key)
4751 {
4752 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4753
4754 if (info->clip_distance_array_size == 0 &&
4755 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4756 last_stage == MESA_SHADER_GEOMETRY)
4757 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4758
4759 if (last_stage == MESA_SHADER_GEOMETRY &&
4760 info->outputs_written & (VARYING_BIT_PSIZ))
4761 key->clamp_pointsize = 1;
4762 }
4763
4764 /**
4765 * Populate FS program key fields based on the current state.
4766 */
4767 static void
4768 crocus_populate_fs_key(const struct crocus_context *ice,
4769 const struct shader_info *info,
4770 struct elk_wm_prog_key *key)
4771 {
4772 struct crocus_screen *screen = (void *) ice->ctx.screen;
4773 const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4774 const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4775 const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
4776 const struct crocus_blend_state *blend = ice->state.cso_blend;
4777
4778 #if GFX_VER < 6
4779 uint32_t lookup = 0;
4780
4781 if (info->fs.uses_discard || zsa->cso.alpha_enabled)
4782 lookup |= ELK_WM_IZ_PS_KILL_ALPHATEST_BIT;
4783
4784 if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4785 lookup |= ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT;
4786
4787 if (fb->zsbuf && zsa->cso.depth_enabled) {
4788 lookup |= ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT;
4789
4790 if (zsa->cso.depth_writemask)
4791 lookup |= ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
4792
4793 }
4794 if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
4795 lookup |= ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT;
4796 if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
4797 lookup |= ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
4798 }
4799 key->iz_lookup = lookup;
4800 key->stats_wm = ice->state.stats_wm;
4801 #endif
4802
4803 uint32_t line_aa = ELK_NEVER;
4804 if (rast->cso.line_smooth) {
4805 int reduced_prim = ice->state.reduced_prim_mode;
4806 if (reduced_prim == MESA_PRIM_LINES)
4807 line_aa = ELK_ALWAYS;
4808 else if (reduced_prim == MESA_PRIM_TRIANGLES) {
4809 if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
4810 line_aa = ELK_SOMETIMES;
4811
4812 if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
4813 rast->cso.cull_face == PIPE_FACE_BACK)
4814 line_aa = ELK_ALWAYS;
4815 } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
4816 line_aa = ELK_SOMETIMES;
4817
4818 if (rast->cso.cull_face == PIPE_FACE_FRONT)
4819 line_aa = ELK_ALWAYS;
4820 }
4821 }
4822 }
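   /* In summary: smoothed line primitives always need AA; triangles drawn
    * in line mode need it "sometimes" when only one face uses line fill,
    * and "always" once the other face is culled or also line-filled.
    */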
4823 key->line_aa = line_aa;
4824
4825 key->nr_color_regions = fb->nr_cbufs;
4826
4827 key->clamp_fragment_color = rast->cso.clamp_fragment_color;
4828
4829 key->alpha_to_coverage = blend->cso.alpha_to_coverage ?
4830 ELK_ALWAYS : ELK_NEVER;
4831
4832 key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
4833
4834 key->flat_shade = rast->cso.flatshade &&
4835 (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4836
4837 const bool multisample_fbo = rast->cso.multisample && fb->samples > 1;
4838 key->multisample_fbo = multisample_fbo ? ELK_ALWAYS : ELK_NEVER;
4839 key->persample_interp =
4840 rast->cso.force_persample_interp ? ELK_ALWAYS : ELK_NEVER;
4841
4842 key->ignore_sample_mask_out = !multisample_fbo;
4843 key->coherent_fb_fetch = false; // TODO: needed?
4844
4845 key->force_dual_color_blend =
4846 screen->driconf.dual_color_blend_by_location &&
4847 (blend->blend_enables & 1) && blend->dual_color_blending;
4848
4849 #if GFX_VER <= 5
4850 if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
4851 key->emit_alpha_test = true;
4852 key->alpha_test_func = zsa->cso.alpha_func;
4853 key->alpha_test_ref = zsa->cso.alpha_ref_value;
4854 }
4855 #endif
4856 }
4857
4858 static void
4859 crocus_populate_cs_key(const struct crocus_context *ice,
4860 struct elk_cs_prog_key *key)
4861 {
4862 }
4863
4864 #if GFX_VER == 4
4865 #define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
4866 #elif GFX_VER >= 5
4867 static uint64_t
4868 KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
4869 {
4870 return shader->offset;
4871 }
4872 #endif
4873
4874 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests disabling
4875  * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
4876  * this WA on C0 stepping.
4877 *
4878 * TODO: Fill out SamplerCount for prefetching?
4879 */
4880
4881 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
4882 pkt.KernelStartPointer = KSP(ice, shader); \
4883 pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
4884 pkt.FloatingPointMode = prog_data->use_alt_mode; \
4885 \
4886 pkt.DispatchGRFStartRegisterForURBData = \
4887 prog_data->dispatch_grf_start_reg; \
4888 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
4889 pkt.prefix##URBEntryReadOffset = 0; \
4890 \
4891 pkt.StatisticsEnable = true; \
4892 pkt.Enable = true; \
4893 \
4894 if (prog_data->total_scratch) { \
4895 struct crocus_bo *bo = \
4896 crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
4897 pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
4898 pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \
4899 }
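/* A note on the PerThreadScratchSpace encoding above: the field is a
 * power-of-two exponent relative to 1KB, so for power-of-two allocations
 * total_scratch == 1024 encodes as ffs(1024) - 11 == 0, 4096 as 2,
 * 8192 as 3, and so on.
 */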
4900
4901 /* ------------------------------------------------------------------- */
4902 #if GFX_VER >= 6
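/* _3DCommandSubOpcode values for the per-stage 3DSTATE_CONSTANT_* packets. */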
4903 static const uint32_t push_constant_opcodes[] = {
4904 [MESA_SHADER_VERTEX] = 21,
4905 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
4906 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
4907 [MESA_SHADER_GEOMETRY] = 22,
4908 [MESA_SHADER_FRAGMENT] = 23,
4909 [MESA_SHADER_COMPUTE] = 0,
4910 };
4911 #endif
4912
4913 static void
4914 emit_sized_null_surface(struct crocus_batch *batch,
4915 unsigned width, unsigned height,
4916 unsigned layers, unsigned levels,
4917 unsigned minimum_array_element,
4918 uint32_t *out_offset)
4919 {
4920 struct isl_device *isl_dev = &batch->screen->isl_dev;
4921 uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4922 isl_dev->ss.align,
4923 out_offset);
4924    // TODO: gen6 multisample crash
4925 isl_null_fill_state(isl_dev, surf,
4926 .size = isl_extent3d(width, height, layers),
4927 .levels = levels,
4928 .minimum_array_element = minimum_array_element);
4929 }
4930 static void
4931 emit_null_surface(struct crocus_batch *batch,
4932 uint32_t *out_offset)
4933 {
4934 emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
4935 }
4936
4937 static void
4938 emit_null_fb_surface(struct crocus_batch *batch,
4939 struct crocus_context *ice,
4940 uint32_t *out_offset)
4941 {
4942 uint32_t width, height, layers, level, layer;
4943 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4944 if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4945 emit_null_surface(batch, out_offset);
4946 return;
4947 }
4948
4949 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4950 width = MAX2(cso->width, 1);
4951 height = MAX2(cso->height, 1);
4952 layers = cso->layers ? cso->layers : 1;
4953 level = 0;
4954 layer = 0;
4955
4956 if (cso->nr_cbufs == 0 && cso->zsbuf) {
4957 width = cso->zsbuf->width;
4958 height = cso->zsbuf->height;
4959 level = cso->zsbuf->u.tex.level;
4960 layer = cso->zsbuf->u.tex.first_layer;
4961 }
4962 emit_sized_null_surface(batch, width, height,
4963 layers, level, layer,
4964 out_offset);
4965 }
4966
4967 static void
4968 emit_surface_state(struct crocus_batch *batch,
4969 struct crocus_resource *res,
4970 const struct isl_surf *in_surf,
4971 bool adjust_surf,
4972 struct isl_view *in_view,
4973 bool writeable,
4974 enum isl_aux_usage aux_usage,
4975 bool blend_enable,
4976 uint32_t write_disables,
4977 uint32_t *surf_state,
4978 uint32_t addr_offset)
4979 {
4980 struct isl_device *isl_dev = &batch->screen->isl_dev;
4981 uint32_t reloc = RELOC_32BIT;
4982 uint64_t offset_B = res->offset;
4983 uint32_t tile_x_sa = 0, tile_y_sa = 0;
4984
4985 if (writeable)
4986 reloc |= RELOC_WRITE;
4987
4988 struct isl_surf surf = *in_surf;
4989 struct isl_view view = *in_view;
4990 if (adjust_surf) {
4991 if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
4992 isl_surf_get_image_surf(isl_dev, in_surf,
4993 view.base_level, 0,
4994 view.base_array_layer,
4995 &surf, &offset_B,
4996 &tile_x_sa, &tile_y_sa);
4997 view.base_array_layer = 0;
4998 view.base_level = 0;
4999 } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
5000 isl_surf_get_image_surf(isl_dev, in_surf,
5001 view.base_level, view.base_array_layer,
5002 0,
5003 &surf, &offset_B,
5004 &tile_x_sa, &tile_y_sa);
5005 view.base_array_layer = 0;
5006 view.base_level = 0;
5007 } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
5008 surf.dim = ISL_SURF_DIM_2D;
5009 }
5010
5011 union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
5012 struct crocus_bo *aux_bo = NULL;
5013 uint32_t aux_offset = 0;
5014 struct isl_surf *aux_surf = NULL;
5015 if (aux_usage != ISL_AUX_USAGE_NONE) {
5016 aux_surf = &res->aux.surf;
5017 aux_offset = res->aux.offset;
5018 aux_bo = res->aux.bo;
5019
5020 clear_color = crocus_resource_get_clear_color(res);
5021 }
5022
5023 isl_surf_fill_state(isl_dev, surf_state,
5024 .surf = &surf,
5025 .view = &view,
5026 .address = crocus_state_reloc(batch,
5027 addr_offset + isl_dev->ss.addr_offset,
5028 res->bo, offset_B, reloc),
5029 .aux_surf = aux_surf,
5030 .aux_usage = aux_usage,
5031 .aux_address = aux_offset,
5032 .mocs = crocus_mocs(res->bo, isl_dev),
5033 .clear_color = clear_color,
5034 .use_clear_address = false,
5035 .clear_address = 0,
5036 .x_offset_sa = tile_x_sa,
5037 .y_offset_sa = tile_y_sa,
5038 #if GFX_VER <= 5
5039 .blend_enable = blend_enable,
5040 .write_disables = write_disables,
5041 #endif
5042 );
5043
5044 if (aux_surf) {
5045 /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
5046 * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
5047 * contain other control information. Since buffer addresses are always
5048 * on 4k boundaries (and thus have their lower 12 bits zero), we can use
5049 * an ordinary reloc to do the necessary address translation.
5050 *
5051 * FIXME: move to the point of assignment.
5052 */
5053 if (GFX_VER == 8) {
5054 uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
5055 *aux_addr = crocus_state_reloc(batch,
5056 addr_offset + isl_dev->ss.aux_addr_offset,
5057 aux_bo, *aux_addr,
5058 reloc);
5059 } else {
5060 uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
5061 *aux_addr = crocus_state_reloc(batch,
5062 addr_offset + isl_dev->ss.aux_addr_offset,
5063 aux_bo, *aux_addr,
5064 reloc);
5065 }
5066 }
5067
5068 }
5069
5070 static uint32_t
5071 emit_surface(struct crocus_batch *batch,
5072 struct crocus_surface *surf,
5073 enum isl_aux_usage aux_usage,
5074 bool blend_enable,
5075 uint32_t write_disables)
5076 {
5077 struct isl_device *isl_dev = &batch->screen->isl_dev;
5078 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5079 struct isl_view *view = &surf->view;
5080 uint32_t offset = 0;
5081 enum pipe_texture_target target = res->base.b.target;
5082 bool adjust_surf = false;
5083
5084 if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5085 adjust_surf = true;
5086
5087 if (surf->align_res)
5088 res = (struct crocus_resource *)surf->align_res;
5089
5090 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5091
5092 emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5093 aux_usage, blend_enable,
5094 write_disables,
5095 surf_state, offset);
5096 return offset;
5097 }
5098
5099 static uint32_t
5100 emit_rt_surface(struct crocus_batch *batch,
5101 struct crocus_surface *surf,
5102 enum isl_aux_usage aux_usage)
5103 {
5104 struct isl_device *isl_dev = &batch->screen->isl_dev;
5105 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5106 struct isl_view *view = &surf->read_view;
5107 uint32_t offset = 0;
5108 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5109
5110 emit_surface_state(batch, res, &surf->surf, true, view, false,
5111 aux_usage, 0, false,
5112 surf_state, offset);
5113 return offset;
5114 }
5115
5116 static uint32_t
5117 emit_grid(struct crocus_context *ice,
5118 struct crocus_batch *batch)
5119 {
5120 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5121 uint32_t offset = 0;
5122 struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5123 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5124 isl_dev->ss.align, &offset);
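   /* The grid size is three 32-bit dimensions, hence size_B = 12 below. */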
5125 isl_buffer_fill_state(isl_dev, surf_state,
5126 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5127 crocus_resource_bo(grid_ref->res),
5128 grid_ref->offset,
5129 RELOC_32BIT),
5130 .size_B = 12,
5131 .format = ISL_FORMAT_RAW,
5132 .stride_B = 1,
5133 .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
5134 return offset;
5135 }
5136
5137 static uint32_t
5138 emit_ubo_buffer(struct crocus_context *ice,
5139 struct crocus_batch *batch,
5140 struct pipe_constant_buffer *buffer)
5141 {
5142 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5143 uint32_t offset = 0;
5144
5145 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5146 isl_dev->ss.align, &offset);
5147 isl_buffer_fill_state(isl_dev, surf_state,
5148 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5149 crocus_resource_bo(buffer->buffer),
5150 buffer->buffer_offset,
5151 RELOC_32BIT),
5152 .size_B = buffer->buffer_size,
5153 .format = 0,
5154 .swizzle = ISL_SWIZZLE_IDENTITY,
5155 .stride_B = 1,
5156 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5157
5158 return offset;
5159 }
5160
5161 static uint32_t
5162 emit_ssbo_buffer(struct crocus_context *ice,
5163 struct crocus_batch *batch,
5164 struct pipe_shader_buffer *buffer, bool writeable)
5165 {
5166 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5167 uint32_t offset = 0;
5168 uint32_t reloc = RELOC_32BIT;
5169
5170 if (writeable)
5171 reloc |= RELOC_WRITE;
5172 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5173 isl_dev->ss.align, &offset);
5174 isl_buffer_fill_state(isl_dev, surf_state,
5175 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5176 crocus_resource_bo(buffer->buffer),
5177 buffer->buffer_offset,
5178 reloc),
5179 .size_B = buffer->buffer_size,
5180 .format = ISL_FORMAT_RAW,
5181 .swizzle = ISL_SWIZZLE_IDENTITY,
5182 .stride_B = 1,
5183 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5184
5185 return offset;
5186 }
5187
5188 static uint32_t
5189 emit_sampler_view(struct crocus_context *ice,
5190 struct crocus_batch *batch,
5191 bool for_gather,
5192 struct crocus_sampler_view *isv)
5193 {
5194 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5195 uint32_t offset = 0;
5196
5197 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5198 isl_dev->ss.align, &offset);
5199
5200 if (isv->base.target == PIPE_BUFFER) {
5201 const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
5202 const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
5203 unsigned final_size =
5204 MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
5205 CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5206 isl_buffer_fill_state(isl_dev, surf_state,
5207 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5208 isv->res->bo,
5209 isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
5210 .size_B = final_size,
5211 .format = isv->view.format,
5212 .swizzle = isv->view.swizzle,
5213 .stride_B = cpp,
5214 .mocs = crocus_mocs(isv->res->bo, isl_dev)
5215 );
5216 } else {
5217 enum isl_aux_usage aux_usage =
5218 crocus_resource_texture_aux_usage(isv->res);
5219
5220 emit_surface_state(batch, isv->res, &isv->res->surf, false,
5221 for_gather ? &isv->gather_view : &isv->view,
5222 false, aux_usage, false,
5223 0, surf_state, offset);
5224 }
5225 return offset;
5226 }
5227
5228 static uint32_t
5229 emit_image_view(struct crocus_context *ice,
5230 struct crocus_batch *batch,
5231 struct crocus_image_view *iv)
5232 {
5233 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5234 uint32_t offset = 0;
5235
5236 struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
5237 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5238 isl_dev->ss.align, &offset);
5239 bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5240 uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
5241 if (res->base.b.target == PIPE_BUFFER) {
5242 const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
5243 const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
5244 unsigned final_size =
5245 MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
5246 CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5247 isl_buffer_fill_state(isl_dev, surf_state,
5248 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5249 res->bo,
5250 res->offset + iv->base.u.buf.offset, reloc),
5251 .size_B = final_size,
5252 .format = iv->view.format,
5253 .swizzle = iv->view.swizzle,
5254 .stride_B = cpp,
5255 .mocs = crocus_mocs(res->bo, isl_dev)
5256 );
5257 } else {
5258 if (iv->view.format == ISL_FORMAT_RAW) {
5259 isl_buffer_fill_state(isl_dev, surf_state,
5260 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5261 res->bo,
5262 res->offset, reloc),
5263 .size_B = res->bo->size - res->offset,
5264 .format = iv->view.format,
5265 .swizzle = iv->view.swizzle,
5266 .stride_B = 1,
5267 .mocs = crocus_mocs(res->bo, isl_dev),
5268 );
5269
5270
5271 } else {
5272 emit_surface_state(batch, res,
5273 &res->surf, false, &iv->view,
5274 write, 0, false,
5275 0, surf_state, offset);
5276 }
5277 }
5278
5279 return offset;
5280 }
5281
5282 #if GFX_VER == 6
5283 static uint32_t
5284 emit_sol_surface(struct crocus_batch *batch,
5285 struct pipe_stream_output_info *so_info,
5286 uint32_t idx)
5287 {
5288 struct crocus_context *ice = batch->ice;
5289
5290 if (idx >= so_info->num_outputs || !ice->state.streamout_active)
5291 return 0;
5292 const struct pipe_stream_output *output = &so_info->output[idx];
5293 const int buffer = output->output_buffer;
5294 assert(output->stream == 0);
5295
5296 struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
5297 unsigned stride_dwords = so_info->stride[buffer];
5298 unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;
5299
5300 size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
5301 unsigned num_vector_components = output->num_components;
5302 unsigned num_elements;
5303 /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
5304 * too big to map using a single binding table entry?
5305 */
5306 // assert((size_dwords - offset_dwords) / stride_dwords
5307 // <= ELK_MAX_NUM_BUFFER_ENTRIES);
5308
5309 if (size_dwords > offset_dwords + num_vector_components) {
5310 /* There is room for at least 1 transform feedback output in the buffer.
5311 * Compute the number of additional transform feedback outputs the
5312 * buffer has room for.
5313 */
5314 num_elements =
5315 (size_dwords - offset_dwords - num_vector_components);
5316 } else {
5317 /* There isn't even room for a single transform feedback output in the
5318 * buffer. We can't configure the binding table entry to prevent output
5319 * entirely; we'll have to rely on the geometry shader to detect
5320 * overflow. But to minimize the damage in case of a bug, set up the
5321 * binding table entry to just allow a single output.
5322 */
5323 num_elements = 0;
5324 }
5325 num_elements += stride_dwords;
5326
5327 uint32_t surface_format;
5328 switch (num_vector_components) {
5329 case 1:
5330 surface_format = ISL_FORMAT_R32_FLOAT;
5331 break;
5332 case 2:
5333 surface_format = ISL_FORMAT_R32G32_FLOAT;
5334 break;
5335 case 3:
5336 surface_format = ISL_FORMAT_R32G32B32_FLOAT;
5337 break;
5338 case 4:
5339 surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
5340 break;
5341 default:
5342 unreachable("Invalid vector size for transform feedback output");
5343 }
5344
5345 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5346 uint32_t offset = 0;
5347
5348 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5349 isl_dev->ss.align, &offset);
5350 isl_buffer_fill_state(isl_dev, surf_state,
5351 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5352 crocus_resource_bo(&buf->base.b),
5353 offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
5354 .size_B = num_elements * 4,
5355 .stride_B = stride_dwords * 4,
5356 .swizzle = ISL_SWIZZLE_IDENTITY,
5357 .format = surface_format);
5358 return offset;
5359 }
5360 #endif
5361
5362 #define foreach_surface_used(index, group) \
5363 for (int index = 0; index < bt->sizes[group]; index++) \
5364 if (crocus_group_index_to_bti(bt, group, index) != \
5365 CROCUS_SURFACE_NOT_USED)
5366
5367 static void
5368 crocus_populate_binding_table(struct crocus_context *ice,
5369 struct crocus_batch *batch,
5370 gl_shader_stage stage, bool ff_gs)
5371 {
5372 struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
5373 struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
5374 if (!shader)
5375 return;
5376
5377 struct crocus_binding_table *bt = &shader->bt;
5378 int s = 0;
5379 uint32_t *surf_offsets = shader->surf_offset;
5380
5381 #if GFX_VER < 8
5382 const struct shader_info *info = crocus_get_shader_info(ice, stage);
5383 #endif
5384
5385 if (stage == MESA_SHADER_FRAGMENT) {
5386 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5387 /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5388 if (cso_fb->nr_cbufs) {
5389 for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5390 uint32_t write_disables = 0;
5391 bool blend_enable = false;
5392 #if GFX_VER <= 5
5393 const struct pipe_rt_blend_state *rt =
5394 &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
5395 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
5396 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
5397 write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
5398 write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
5399 write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
5400 write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
5401 /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
5402 blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
5403 #endif
5404 if (cso_fb->cbufs[i]) {
5405 surf_offsets[s] = emit_surface(batch,
5406 (struct crocus_surface *)cso_fb->cbufs[i],
5407 ice->state.draw_aux_usage[i],
5408 blend_enable,
5409 write_disables);
5410 } else {
5411 emit_null_fb_surface(batch, ice, &surf_offsets[s]);
5412 }
5413 s++;
5414 }
5415 } else {
5416 emit_null_fb_surface(batch, ice, &surf_offsets[s]);
5417 s++;
5418 }
5419
5420 foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
5421 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5422 if (cso_fb->cbufs[i]) {
5423 surf_offsets[s++] = emit_rt_surface(batch,
5424 (struct crocus_surface *)cso_fb->cbufs[i],
5425 ice->state.draw_aux_usage[i]);
5426 }
5427 }
5428 }
5429
5430 if (stage == MESA_SHADER_COMPUTE) {
5431 foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
5432 surf_offsets[s] = emit_grid(ice, batch);
5433 s++;
5434 }
5435 }
5436
5437 #if GFX_VER == 6
5438 if (stage == MESA_SHADER_GEOMETRY) {
5439 struct pipe_stream_output_info *so_info;
5440 if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
5441 so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
5442 else
5443 so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;
5444
5445 foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
5446 surf_offsets[s] = emit_sol_surface(batch, so_info, i);
5447 s++;
5448 }
5449 }
5450 #endif
5451
5452 foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
5453 struct crocus_sampler_view *view = shs->textures[i];
5454 if (view)
5455 surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
5456 else
5457 emit_null_surface(batch, &surf_offsets[s]);
5458 s++;
5459 }
5460
5461 #if GFX_VER < 8
5462 if (info && info->uses_texture_gather) {
5463 foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
5464 struct crocus_sampler_view *view = shs->textures[i];
5465 if (view)
5466 surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
5467 else
5468 emit_null_surface(batch, &surf_offsets[s]);
5469 s++;
5470 }
5471 }
5472 #endif
5473
5474 foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
5475 struct crocus_image_view *view = &shs->image[i];
5476 if (view->base.resource)
5477 surf_offsets[s] = emit_image_view(ice, batch, view);
5478 else
5479 emit_null_surface(batch, &surf_offsets[s]);
5480 s++;
5481 }
5482 foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
5483 if (shs->constbufs[i].buffer)
5484 surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
5485 else
5486 emit_null_surface(batch, &surf_offsets[s]);
5487 s++;
5488 }
5489 foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
5490 if (shs->ssbo[i].buffer)
5491 surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
5492 !!(shs->writable_ssbos & (1 << i)));
5493 else
5494 emit_null_surface(batch, &surf_offsets[s]);
5495 s++;
5496 }
5497
5498 }
5499 /* ------------------------------------------------------------------- */
5500 static uint32_t
5501 crocus_upload_binding_table(struct crocus_context *ice,
5502 struct crocus_batch *batch,
5503 uint32_t *table,
5504 uint32_t size)
5505
5506 {
5507 if (size == 0)
5508 return 0;
5509 return emit_state(batch, table, size, 32);
5510 }
5511
5512 /**
5513 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5514 */
5515
5516 static void
5517 crocus_update_surface_base_address(struct crocus_batch *batch)
5518 {
5519 if (batch->state_base_address_emitted)
5520 return;
5521
5522 UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;
5523
5524 flush_before_state_base_change(batch);
5525
5526 crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
5527 /* Set base addresses */
5528 sba.GeneralStateBaseAddressModifyEnable = true;
5529
5530 #if GFX_VER >= 6
5531 sba.DynamicStateBaseAddressModifyEnable = true;
5532 sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
5533 #endif
5534
5535 sba.SurfaceStateBaseAddressModifyEnable = true;
5536 sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
5537
5538 sba.IndirectObjectBaseAddressModifyEnable = true;
5539
5540 #if GFX_VER >= 5
5541 sba.InstructionBaseAddressModifyEnable = true;
5542 sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
5543 #endif
5544
5545 /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
5546 #if GFX_VER == 8
5547 sba.GeneralStateBufferSize = 0xfffff;
5548 sba.IndirectObjectBufferSize = 0xfffff;
5549 sba.InstructionBufferSize = 0xfffff;
5550 sba.DynamicStateBufferSize = MAX_STATE_SIZE;
5551
5552 sba.GeneralStateBufferSizeModifyEnable = true;
5553 sba.DynamicStateBufferSizeModifyEnable = true;
5554 sba.IndirectObjectBufferSizeModifyEnable = true;
5555 sba.InstructionBuffersizeModifyEnable = true;
5556 #else
5557 sba.GeneralStateAccessUpperBoundModifyEnable = true;
5558 sba.IndirectObjectAccessUpperBoundModifyEnable = true;
5559
5560 #if GFX_VER >= 5
5561 sba.InstructionAccessUpperBoundModifyEnable = true;
5562 #endif
5563
5564 #if GFX_VER >= 6
5565 /* Dynamic state upper bound. Although the documentation says that
5566 * programming it to zero will cause it to be ignored, that is a lie.
5567 * If this isn't programmed to a real bound, the sampler border color
5568 * pointer is rejected, causing border color to mysteriously fail.
5569 */
5570 sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5571 sba.DynamicStateAccessUpperBoundModifyEnable = true;
5572 #else
5573 /* Same idea but using General State Base Address on Gen4-5 */
5574 sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5575 #endif
5576 #endif
5577
5578 #if GFX_VER >= 6
5579 /* The hardware appears to pay attention to the MOCS fields even
5580 * if you don't set the "Address Modify Enable" bit for the base.
5581 */
5582 sba.GeneralStateMOCS = mocs;
5583 sba.StatelessDataPortAccessMOCS = mocs;
5584 sba.DynamicStateMOCS = mocs;
5585 sba.IndirectObjectMOCS = mocs;
5586 sba.InstructionMOCS = mocs;
5587 sba.SurfaceStateMOCS = mocs;
5588 #endif
5589 }
5590
5591 flush_after_state_base_change(batch);
5592
5593 /* According to section 3.6.1 of VOL1 of the 965 PRM,
5594 * STATE_BASE_ADDRESS updates require a reissue of:
5595 *
5596 * 3DSTATE_PIPELINE_POINTERS
5597 * 3DSTATE_BINDING_TABLE_POINTERS
5598 * MEDIA_STATE_POINTERS
5599 *
5600 * and this continues through Ironlake. The Sandy Bridge PRM, vol
5601  * 1 part 1 says that the following packets must be reissued:
5602 *
5603 * 3DSTATE_CC_POINTERS
5604 * 3DSTATE_BINDING_TABLE_POINTERS
5605 * 3DSTATE_SAMPLER_STATE_POINTERS
5606 * 3DSTATE_VIEWPORT_STATE_POINTERS
5607 * MEDIA_STATE_POINTERS
5608 *
5609 * Those are always reissued following SBA updates anyway (new
5610 * batch time), except in the case of the program cache BO
5611 * changing. Having a separate state flag makes the sequence more
5612 * obvious.
5613 */
5614 #if GFX_VER <= 5
5615 batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
5616 #elif GFX_VER == 6
5617 batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
5618 #endif
5619 batch->state_base_address_emitted = true;
5620 }
5621
5622 static inline void
5623 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5624 bool window_space_position, float *zmin, float *zmax)
5625 {
5626 if (window_space_position) {
5627 *zmin = 0.f;
5628 *zmax = 1.f;
5629 return;
5630 }
5631 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5632 }
5633
5634 struct push_bos {
5635 struct {
5636 struct crocus_address addr;
5637 uint32_t length;
5638 } buffers[4];
5639 int buffer_count;
5640 uint32_t max_length;
5641 };
5642
5643 #if GFX_VER >= 6
5644 static void
5645 setup_constant_buffers(struct crocus_context *ice,
5646 struct crocus_batch *batch,
5647 int stage,
5648 struct push_bos *push_bos)
5649 {
5650 struct crocus_shader_state *shs = &ice->state.shaders[stage];
5651 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5652 struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
5653
5654 uint32_t push_range_sum = 0;
5655
5656 int n = 0;
5657 for (int i = 0; i < 4; i++) {
5658 const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
5659
5660 if (range->length == 0)
5661 continue;
5662
5663 push_range_sum += range->length;
5664
5665 if (range->length > push_bos->max_length)
5666 push_bos->max_length = range->length;
5667
5668 /* Range block is a binding table index, map back to UBO index. */
5669 unsigned block_index = crocus_bti_to_group_index(
5670 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
5671 assert(block_index != CROCUS_SURFACE_NOT_USED);
5672
5673 struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
5674 struct crocus_resource *res = (void *) cbuf->buffer;
5675
5676 assert(cbuf->buffer_offset % 32 == 0);
5677
5678 push_bos->buffers[n].length = range->length;
5679 push_bos->buffers[n].addr =
5680 res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5681 : ro_bo(batch->ice->workaround_bo,
5682 batch->ice->workaround_offset);
5683 n++;
5684 }
5685
5686 /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5687 *
5688 * "The sum of all four read length fields must be less than or
5689 * equal to the size of 64."
5690 */
5691 assert(push_range_sum <= 64);
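   /* These read lengths are in 256-bit (32-byte) units -- note the
    * range->start * 32 above -- so the 64-unit limit is 2KB of push
    * constants.
    */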
5692
5693 push_bos->buffer_count = n;
5694 }
5695
5696 #if GFX_VER == 7
5697 static void
5698 gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
5699 {
5700 crocus_emit_pipe_control_write(batch,
5701 "vs workaround",
5702 PIPE_CONTROL_WRITE_IMMEDIATE
5703 | PIPE_CONTROL_DEPTH_STALL,
5704 batch->ice->workaround_bo,
5705 batch->ice->workaround_offset, 0);
5706 }
5707 #endif
5708
5709 static void
5710 emit_push_constant_packets(struct crocus_context *ice,
5711 struct crocus_batch *batch,
5712 int stage,
5713 const struct push_bos *push_bos)
5714 {
5715 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5716 struct elk_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
5717 UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);
5718
5719 #if GFX_VER == 7
5720 if (stage == MESA_SHADER_VERTEX) {
5721 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
5722 gen7_emit_vs_workaround_flush(batch);
5723 }
5724 #endif
5725 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
5726 pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
5727 #if GFX_VER >= 7
5728 #if GFX_VER != 8
5729 /* MOCS is MBZ on Gen8 so we skip it there */
5730 pkt.ConstantBody.MOCS = mocs;
5731 #endif
5732
5733 if (prog_data) {
5734 /* The Skylake PRM contains the following restriction:
5735 *
5736 * "The driver must ensure The following case does not occur
5737 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
5738 * buffer 3 read length equal to zero committed followed by a
5739 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
5740 * zero committed."
5741 *
5742 * To avoid this, we program the buffers in the highest slots.
5743 * This way, slot 0 is only used if slot 3 is also used.
5744 */
5745 int n = push_bos->buffer_count;
5746 assert(n <= 4);
5747 #if GFX_VERx10 >= 75
5748 const unsigned shift = 4 - n;
5749 #else
5750 const unsigned shift = 0;
5751 #endif
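         /* e.g. with n == 2 on Haswell-class and newer parts, shift == 2 and
          * the buffers land in slots 2 and 3; slot 0 is only ever used when
          * all four slots are.
          */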
5752 for (int i = 0; i < n; i++) {
5753 pkt.ConstantBody.ReadLength[i + shift] =
5754 push_bos->buffers[i].length;
5755 pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
5756 }
5757 }
5758 #else
5759 if (prog_data) {
5760 int n = push_bos->buffer_count;
5761 assert (n <= 1);
5762 if (n == 1) {
5763 pkt.Buffer0Valid = true;
5764 pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
5765 pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
5766 }
5767 }
5768 #endif
5769 }
5770 }
5771
5772 #endif
5773
5774 #if GFX_VER == 8
5775 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
5776 #elif GFX_VER >= 6
5777 typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
5778 #else
5779 typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
5780 #endif
5781
5782 static inline void
5783 set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5784 {
5785 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5786 ds->DepthTestEnable = cso->cso.depth_enabled;
5787 ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5788 ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
5789
5790 ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5791 ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5792 ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5793 ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5794
5795 ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5796 ds->StencilWriteMask = cso->cso.stencil[0].writemask;
5797
5798 ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5799 ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5800 ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5801 ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5802
5803 ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5804 ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5805 ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5806 ds->StencilTestEnable = cso->cso.stencil[0].enabled;
5807 ds->StencilBufferWriteEnable =
5808 cso->cso.stencil[0].writemask != 0 ||
5809 (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
5810 }
5811
5812 static void
5813 emit_vertex_buffer_state(struct crocus_batch *batch,
5814 unsigned buffer_id,
5815 struct crocus_bo *bo,
5816 unsigned start_offset,
5817 unsigned end_offset,
5818 unsigned stride,
5819 unsigned step_rate,
5820 uint32_t **map)
5821 {
5822 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
5823 _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
5824 vb.BufferStartingAddress = ro_bo(bo, start_offset);
5825 #if GFX_VER >= 8
5826 vb.BufferSize = end_offset - start_offset;
5827 #endif
5828 vb.VertexBufferIndex = buffer_id;
5829 vb.BufferPitch = stride;
5830 #if GFX_VER >= 7
5831 vb.AddressModifyEnable = true;
5832 #endif
5833 #if GFX_VER >= 6
5834 vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
5835 #endif
5836 #if GFX_VER < 8
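      /* Pre-Gen8 bounds the buffer with an inclusive end address (hence
       * end_offset - 1 below); Gen8 programs an explicit BufferSize instead.
       */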
5837 vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
5838 vb.InstanceDataStepRate = step_rate;
5839 #if GFX_VER >= 5
5840 vb.EndAddress = ro_bo(bo, end_offset - 1);
5841 #endif
5842 #endif
5843 }
5844 *map += vb_dwords;
5845 }
5846
5847 #if GFX_VER >= 6
5848 static uint32_t
5849 determine_sample_mask(struct crocus_context *ice)
5850 {
5851 uint32_t num_samples = ice->state.framebuffer.samples;
5852
5853 if (num_samples <= 1)
5854 return 1;
5855
5856 uint32_t fb_mask = (1 << num_samples) - 1;
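   /* e.g. a 4x framebuffer gives fb_mask == 0xf, clamping the API-supplied
    * sample mask to the samples that actually exist.
    */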
5857 return ice->state.sample_mask & fb_mask;
5858 }
5859 #endif
5860
5861 static void
5862 crocus_upload_dirty_render_state(struct crocus_context *ice,
5863 struct crocus_batch *batch,
5864 const struct pipe_draw_info *draw)
5865 {
5866 uint64_t dirty = ice->state.dirty;
5867 uint64_t stage_dirty = ice->state.stage_dirty;
5868
5869 if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5870 !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5871 return;
5872
5873 if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5874 crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5875 vf.StatisticsEnable = true;
5876 }
5877 }
5878
5879 #if GFX_VER <= 5
5880 if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5881 CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5882 bool ret = calculate_curbe_offsets(batch);
5883 if (ret) {
5884 dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5885 stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5886 }
5887 }
5888
5889 if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5890 stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5891 bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5892 elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5893 ((struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5894 if (ret) {
5895 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5896 stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5897 }
5898 }
5899 #endif
5900 if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5901 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5902 uint32_t cc_vp_address;
5903
5904 /* XXX: could avoid streaming for depth_clip [0,1] case. */
5905 uint32_t *cc_vp_map =
5906 stream_state(batch,
5907 4 * ice->state.num_viewports *
5908 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5909 for (int i = 0; i < ice->state.num_viewports; i++) {
5910 float zmin, zmax;
5911 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5912 ice->state.window_space_position,
5913 &zmin, &zmax);
5914 if (cso_rast->cso.depth_clip_near)
5915 zmin = 0.0;
5916 if (cso_rast->cso.depth_clip_far)
5917 zmax = 1.0;
5918
5919 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5920 ccv.MinimumDepth = zmin;
5921 ccv.MaximumDepth = zmax;
5922 }
5923
5924 cc_vp_map += GENX(CC_VIEWPORT_length);
5925 }
5926
5927 #if GFX_VER >= 7
5928 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5929 ptr.CCViewportPointer = cc_vp_address;
5930 }
5931 #elif GFX_VER == 6
5932 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5933 vp.CCViewportStateChange = 1;
5934 vp.PointertoCC_VIEWPORT = cc_vp_address;
5935 }
5936 #else
5937 ice->state.cc_vp_address = cc_vp_address;
5938 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5939 #endif
5940 }
5941
5942 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5943 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5944 #if GFX_VER >= 7
5945 uint32_t sf_cl_vp_address;
5946 uint32_t *vp_map =
5947 stream_state(batch,
5948 4 * ice->state.num_viewports *
5949 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5950 #else
5951 uint32_t *vp_map =
5952 stream_state(batch,
5953 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5954 32, &ice->state.sf_vp_address);
5955 uint32_t *clip_map =
5956 stream_state(batch,
5957 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5958 32, &ice->state.clip_vp_address);
5959 #endif
5960
5961 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5962 const struct pipe_viewport_state *state = &ice->state.viewports[i];
5963 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5964
5965 #if GFX_VER == 8
5966 float vp_xmin = viewport_extent(state, 0, -1.0f);
5967 float vp_xmax = viewport_extent(state, 0, 1.0f);
5968 float vp_ymin = viewport_extent(state, 1, -1.0f);
5969 float vp_ymax = viewport_extent(state, 1, 1.0f);
5970 #endif
5971 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5972 state->scale[0], state->scale[1],
5973 state->translate[0], state->translate[1],
5974 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5975 #if GFX_VER >= 7
5976 crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5977 #else
5978 crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5979 #endif
5980 {
5981 vp.ViewportMatrixElementm00 = state->scale[0];
5982 vp.ViewportMatrixElementm11 = state->scale[1];
5983 vp.ViewportMatrixElementm22 = state->scale[2];
5984 vp.ViewportMatrixElementm30 = state->translate[0];
5985 vp.ViewportMatrixElementm31 = state->translate[1];
5986 vp.ViewportMatrixElementm32 = state->translate[2];
5987 #if GFX_VER < 6
5988 struct pipe_scissor_state scissor;
5989 crocus_fill_scissor_rect(ice, 0, &scissor);
5990 vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5991 vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5992 vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5993 vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5994 #endif
5995
5996 #if GFX_VER >= 7
5997 vp.XMinClipGuardband = gb_xmin;
5998 vp.XMaxClipGuardband = gb_xmax;
5999 vp.YMinClipGuardband = gb_ymin;
6000 vp.YMaxClipGuardband = gb_ymax;
6001 #endif
6002 #if GFX_VER == 8
6003 vp.XMinViewPort = MAX2(vp_xmin, 0);
6004 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6005 vp.YMinViewPort = MAX2(vp_ymin, 0);
6006 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6007 #endif
6008 }
6009 #if GFX_VER < 7
6010 crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6011 clip.XMinClipGuardband = gb_xmin;
6012 clip.XMaxClipGuardband = gb_xmax;
6013 clip.YMinClipGuardband = gb_ymin;
6014 clip.YMaxClipGuardband = gb_ymax;
6015 }
6016 #endif
6017 #if GFX_VER >= 7
6018 vp_map += GENX(SF_CLIP_VIEWPORT_length);
6019 #else
6020 vp_map += GENX(SF_VIEWPORT_length);
6021 clip_map += GENX(CLIP_VIEWPORT_length);
6022 #endif
6023 }
6024 #if GFX_VER >= 7
6025 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6026 ptr.SFClipViewportPointer = sf_cl_vp_address;
6027 }
6028 #elif GFX_VER == 6
6029 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6030 vp.SFViewportStateChange = 1;
6031 vp.CLIPViewportStateChange = 1;
6032 vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6033 vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6034 }
6035 #endif
6036 }
6037
6038 #if GFX_VER >= 6
6039 if (dirty & CROCUS_DIRTY_GEN6_URB) {
6040 #if GFX_VER == 6
6041 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6042 || ice->shaders.ff_gs_prog;
6043
6044 struct elk_vue_prog_data *vue_prog_data =
6045 (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6046 const unsigned vs_size = vue_prog_data->urb_entry_size;
6047 unsigned gs_size = vs_size;
6048 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6049 struct elk_vue_prog_data *gs_vue_prog_data =
6050 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6051 gs_size = gs_vue_prog_data->urb_entry_size;
6052 }
6053
6054 genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6055 #endif
6056 #if GFX_VER >= 7
6057 const struct intel_device_info *devinfo = &batch->screen->devinfo;
6058 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6059 bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6060 struct intel_urb_config urb_cfg;
6061
6062 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6063 if (!ice->shaders.prog[i]) {
6064 urb_cfg.size[i] = 1;
6065 } else {
6066 struct elk_vue_prog_data *vue_prog_data =
6067 (void *) ice->shaders.prog[i]->prog_data;
6068 urb_cfg.size[i] = vue_prog_data->urb_entry_size;
6069 }
6070 assert(urb_cfg.size[i] != 0);
6071 }
6072
6073 /* If we're just switching between programs with the same URB requirements,
6074 * skip the rest of the logic.
6075 */
6076 bool no_change = false;
6077 if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
6078 ice->urb.gs_present == gs_present &&
6079 ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
6080 ice->urb.tess_present == tess_present &&
6081 ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
6082 ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
6083 no_change = true;
6084 }
6085
6086 if (!no_change) {
6087 ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
6088 ice->urb.gs_present = gs_present;
6089 ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
6090 ice->urb.tess_present = tess_present;
6091 ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
6092 ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
6093
6094 bool constrained;
6095 intel_get_urb_config(devinfo,
6096 batch->screen->l3_config_3d,
6097 tess_present,
6098 gs_present,
6099 &urb_cfg, NULL, &constrained);
6100
6101 #if GFX_VER == 7
6102 if (devinfo->platform == INTEL_PLATFORM_IVB)
6103 gen7_emit_vs_workaround_flush(batch);
6104 #endif
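/* 3DSTATE_URB_VS, _HS, _DS, and _GS have consecutive subopcodes, so a
 * single loop can emit all four packets by bumping the subopcode.
 */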
6105 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6106 crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6107 urb._3DCommandSubOpcode += i;
6108 urb.VSURBStartingAddress = urb_cfg.start[i];
6109 urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
6110 urb.VSNumberofURBEntries = urb_cfg.entries[i];
6111 }
6112 }
6113 }
6114 #endif
6115 }
6116
6117 if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6118 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6119 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6120 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6121
6122 STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6123 int rt_dwords =
6124 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6125 #if GFX_VER >= 8
6126 rt_dwords += GENX(BLEND_STATE_length);
6127 #endif
6128 uint32_t blend_offset;
6129 uint32_t *blend_map =
6130 stream_state(batch,
6131 4 * rt_dwords, 64, &blend_offset);
6132
6133 #if GFX_VER >= 8
6134 struct GENX(BLEND_STATE) be = { 0 };
6135 {
6136 #else
6137 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6138 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6139 #define be entry
6140 #endif
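/* Pre-Gen8 has no BLEND_STATE container packet; the "global" alpha test
 * and dither fields live in every BLEND_STATE_ENTRY instead. Aliasing
 * 'be' to the per-RT entry lets the assignments below serve both layouts.
 */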
6141
6142 be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6143 be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6144 be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6145 be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6146 be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage_dither;
6147 be.ColorDitherEnable = cso_blend->cso.dither;
6148
6149 #if GFX_VER >= 8
6150 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6151 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6152 #else
6153 {
6154 #endif
6155 const struct pipe_rt_blend_state *rt =
6156 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6157
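/* IndependentAlphaBlendEnable is a single bit shared by all RTs, so
 * accumulate set_blend_entry_bits()'s per-RT result across the loop.
 */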
6158 be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6159 be.IndependentAlphaBlendEnable;
6160
6161 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6162 entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6163 entry.LogicOpFunction = cso_blend->cso.logicop_func;
6164 }
6165
6166 entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6167 entry.PreBlendColorClampEnable = true;
6168 entry.PostBlendColorClampEnable = true;
6169
6170 entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
6171 entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6172 entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
6173 entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6174
6175 #if GFX_VER >= 8
6176 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6177 #else
6178 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6179 #endif
6180 }
6181 }
6182 #if GFX_VER >= 8
6183 GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6184 #endif
6185 #if GFX_VER < 7
6186 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6187 ptr.PointertoBLEND_STATE = blend_offset;
6188 ptr.BLEND_STATEChange = true;
6189 }
6190 #else
6191 crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6192 ptr.BlendStatePointer = blend_offset;
6193 #if GFX_VER >= 8
6194 ptr.BlendStatePointerValid = true;
6195 #endif
6196 }
6197 #endif
6198 }
6199 #endif
6200
6201 if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6202 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6203 UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6204 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6205 uint32_t cc_offset;
6206 void *cc_map =
6207 stream_state(batch,
6208 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6209 64, &cc_offset);
6210 #if GFX_VER <= 5
6211 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6212 #endif
6213 _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6214 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6215 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6216
6217 #if GFX_VER <= 5
6218
6219 set_depth_stencil_bits(ice, &cc);
6220
6221 if (cso_blend->cso.logicop_enable) {
6222 if (can_emit_logic_op(ice)) {
6223 cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6224 cc.LogicOpFunction = cso_blend->cso.logicop_func;
6225 }
6226 }
6227 cc.ColorDitherEnable = cso_blend->cso.dither;
6228
6229 cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6230
6231 if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6232 cc.AlphaTestEnable = cso->cso.alpha_enabled;
6233 cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6234 }
6235 cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6236 cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6237 #else
6238 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6239 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6240
6241 cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6242 cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6243 cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6244 cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6245 #endif
6246 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6247 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6248 }
6249 ice->shaders.cc_offset = cc_offset;
6250 #if GFX_VER >= 6
6251 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6252 ptr.ColorCalcStatePointer = cc_offset;
6253 #if GFX_VER != 7
6254 ptr.ColorCalcStatePointerValid = true;
6255 #endif
6256 }
6257 #endif
6258 }
6259 #if GFX_VER <= 5
6260 if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6261 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6262 blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6263 blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6264 blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6265 blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6266 }
6267 }
6268 #endif
6269 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6270 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6271 continue;
6272
6273 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6274 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6275
6276 if (!shader)
6277 continue;
6278
6279 if (shs->sysvals_need_upload)
6280 upload_sysvals(ice, stage);
6281
6282 #if GFX_VER <= 5
6283 dirty |= CROCUS_DIRTY_GEN4_CURBE;
6284 #endif
6285 #if GFX_VER >= 7
6286 struct push_bos push_bos = {};
6287 setup_constant_buffers(ice, batch, stage, &push_bos);
6288
6289 emit_push_constant_packets(ice, batch, stage, &push_bos);
6290 #endif
6291 }
6292
6293 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6294 if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6295 if (ice->shaders.prog[stage]) {
6296 #if GFX_VER <= 6
6297 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6298 #endif
6299 crocus_populate_binding_table(ice, batch, stage, false);
6300 ice->shaders.prog[stage]->bind_bo_offset =
6301 crocus_upload_binding_table(ice, batch,
6302 ice->shaders.prog[stage]->surf_offset,
6303 ice->shaders.prog[stage]->bt.size_bytes);
6304
6305 #if GFX_VER >= 7
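/* 3DSTATE_BINDING_TABLE_POINTERS_VS.._PS use consecutive subopcodes
 * starting at 38, so offset by the stage index.
 */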
6306 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6307 ptr._3DCommandSubOpcode = 38 + stage;
6308 ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6309 }
6310 #endif
6311 #if GFX_VER == 6
6312 } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6313 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6314 crocus_populate_binding_table(ice, batch, stage, true);
6315 ice->shaders.ff_gs_prog->bind_bo_offset =
6316 crocus_upload_binding_table(ice, batch,
6317 ice->shaders.ff_gs_prog->surf_offset,
6318 ice->shaders.ff_gs_prog->bt.size_bytes);
6319 #endif
6320 }
6321 }
6322 }
6323 #if GFX_VER <= 6
6324 if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6325 struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6326 if (gs == NULL)
6327 gs = ice->shaders.ff_gs_prog;
6328 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6329 ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6330 ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6331 #if GFX_VER == 6
6332 ptr.VSBindingTableChange = true;
6333 ptr.PSBindingTableChange = true;
6334 ptr.GSBindingTableChange = gs ? true : false;
6335 ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6336 #endif
6337 }
6338 }
6339 #endif
6340
6341 bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6342 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6343 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6344 !ice->shaders.prog[stage])
6345 continue;
6346
6347 crocus_upload_sampler_states(ice, batch, stage);
6348
6349 sampler_updates = true;
6350
6351 #if GFX_VER >= 7
6352 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6353
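/* As with the binding tables, 3DSTATE_SAMPLER_STATE_POINTERS_VS.._PS
 * use consecutive subopcodes (starting at 43), so offset by the stage.
 */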
6354 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6355 ptr._3DCommandSubOpcode = 43 + stage;
6356 ptr.PointertoVSSamplerState = shs->sampler_offset;
6357 }
6358 #endif
6359 }
6360
6361 if (sampler_updates) {
6362 #if GFX_VER == 6
6363 struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6364 struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6365 struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6366 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6367 if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6368 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6369 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6370 ptr.VSSamplerStateChange = true;
6371 ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6372 }
6373 if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6374 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6375 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6376 ptr.GSSamplerStateChange = true;
6377 ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6378 }
6379 if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6380 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6381 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6382 ptr.PSSamplerStateChange = true;
6383 ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6384 }
6385 }
6386 #endif
6387 }
6388
6389 #if GFX_VER >= 6
6390 if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6391 crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6392 ms.PixelLocation =
6393 ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
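/* NumberofMultisamples wants log2(samples); ffs() - 1 computes that for
 * the power-of-two sample counts we can see here.
 */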
6394 if (ice->state.framebuffer.samples > 0)
6395 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6396 #if GFX_VER == 6
6397 INTEL_SAMPLE_POS_4X(ms.Sample);
6398 #elif GFX_VER == 7
6399 switch (ice->state.framebuffer.samples) {
6400 case 1:
6401 INTEL_SAMPLE_POS_1X(ms.Sample);
6402 break;
6403 case 2:
6404 INTEL_SAMPLE_POS_2X(ms.Sample);
6405 break;
6406 case 4:
6407 INTEL_SAMPLE_POS_4X(ms.Sample);
6408 break;
6409 case 8:
6410 INTEL_SAMPLE_POS_8X(ms.Sample);
6411 break;
6412 default:
6413 break;
6414 }
6415 #endif
6416 }
6417 }
6418
6419 if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6420 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6421 ms.SampleMask = determine_sample_mask(ice);
6422 }
6423 }
6424 #endif
6425
6426 #if GFX_VER >= 7
6427 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6428 if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6429 struct elk_stage_prog_data *prog_data = shader->prog_data;
6430 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6431
6432 crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6433
6434 /* Initialize the execution mask with VMask. Otherwise, derivatives are
6435 * incorrect for subspans where some of the pixels are unlit. We believe
6436 * the bit just didn't take effect in previous generations.
6437 */
6438 ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
6439
6440 intel_set_ps_dispatch_state(&ps, &batch->screen->devinfo,
6441 wm_prog_data,
6442 ice->state.framebuffer.samples,
6443 0 /* msaa_flags */);
6444
6445 ps.DispatchGRFStartRegisterForConstantSetupData0 =
6446 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6447 ps.DispatchGRFStartRegisterForConstantSetupData1 =
6448 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6449 ps.DispatchGRFStartRegisterForConstantSetupData2 =
6450 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6451
6452 ps.KernelStartPointer0 = KSP(ice, shader) +
6453 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6454 ps.KernelStartPointer1 = KSP(ice, shader) +
6455 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6456 ps.KernelStartPointer2 = KSP(ice, shader) +
6457 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6458
6459 #if GFX_VERx10 == 75
6460 ps.SampleMask = determine_sample_mask(ice);
6461 #endif
6462 // XXX: WABTPPrefetchDisable, see above, drop at C0
6463 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6464 ps.FloatingPointMode = prog_data->use_alt_mode;
6465 #if GFX_VER >= 8
6466 ps.MaximumNumberofThreadsPerPSD =
6467 batch->screen->devinfo.max_threads_per_psd - 2;
6468 #else
6469 ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6470 #endif
6471
6472 ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6473
6474 #if GFX_VER < 8
6475 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6476 ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6477 ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6478 #endif
6479 /* From the documentation for this packet:
6480 * "If the PS kernel does not need the Position XY Offsets to
6481 * compute a Position Value, then this field should be programmed
6482 * to POSOFFSET_NONE."
6483 *
6484 * "SW Recommendation: If the PS kernel needs the Position Offsets
6485 * to compute a Position XY value, this field should match Position
6486 * ZW Interpolation Mode to ensure a consistent position.xyzw
6487 * computation."
6488 *
6489 * We only require XY sample offsets. So, this recommendation doesn't
6490 * look useful at the moment. We might need this in future.
6491 */
6492 ps.PositionXYOffsetSelect =
6493 wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6494
6495 if (wm_prog_data->base.total_scratch) {
6496 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
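/* PerThreadScratchSpace encodes a power-of-two size with 1KB granularity
 * as log2(bytes) - 10, hence ffs() - 11 for the power-of-two
 * total_scratch.
 */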
6497 ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6498 ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6499 }
6500 }
6501 #if GFX_VER == 8
6502 const struct shader_info *fs_info =
6503 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6504 crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6505 psx.PixelShaderValid = true;
6506 psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6507 psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6508 psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6509 psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6510 psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6511 psx.PixelShaderIsPerSample =
6512 elk_wm_prog_data_is_persample(wm_prog_data, 0);
6513
6514 /* _NEW_MULTISAMPLE | ELK_NEW_CONSERVATIVE_RASTERIZATION */
6515 if (wm_prog_data->uses_sample_mask)
6516 psx.PixelShaderUsesInputCoverageMask = true;
6517
6518 psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6519
6520 /* The stricter cross-primitive coherency guarantees that the hardware
6521 * gives us with the "Accesses UAV" bit set for at least one shader stage
6522 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6523 * are redundant within the current image, atomic counter and SSBO GL
6524 * APIs, which all have very loose ordering and coherency requirements
6525 * and generally rely on the application to insert explicit barriers when
6526 * a shader invocation is expected to see the memory writes performed by
6527 * the invocations of some previous primitive. Regardless of the value
6528 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
6529 * cause a DC flush (useless in most cases) when the lowermost stage with
6530 * the bit set finishes execution.
6531 *
6532 * It would be nice to disable it, but in some cases we can't because on
6533 * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6534 * signal (which could be set independently from the coherency mechanism
6535 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6536 * determine whether the hardware skips execution of the fragment shader
6537 * or not via the ThreadDispatchEnable signal. However if we know that
6538 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6539 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set, it shouldn't make any
6540 * difference, so we may just disable it here.
6541 *
6542 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6543 * take into account KillPixels when no depth or stencil writes are
6544 * enabled. In order for occlusion queries to work correctly with no
6545 * attachments, we need to force-enable here.
6546 *
6547 */
6548 if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6549 !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6550 psx.PixelShaderHasUAV = true;
6551 }
6552 #endif
6553 }
6554 #endif
6555
6556 #if GFX_VER >= 7
6557 if (ice->state.streamout_active) {
6558 if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6559 for (int i = 0; i < 4; i++) {
6560 struct crocus_stream_output_target *tgt =
6561 (void *) ice->state.so_target[i];
6562
6563 if (!tgt) {
6564 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6565 sob.SOBufferIndex = i;
6566 sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6567 }
6568 continue;
6569 }
6570 struct crocus_resource *res = (void *) tgt->base.buffer;
6571 uint32_t start = tgt->base.buffer_offset;
6572 #if GFX_VER < 8
6573 uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6574 #endif
6575 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6576 sob.SOBufferIndex = i;
6577
6578 sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6579 sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6580 #if GFX_VER < 8
6581 sob.SurfacePitch = tgt->stride;
6582 sob.SurfaceEndAddress = rw_bo(res->bo, end);
6583 #else
6584 sob.SOBufferEnable = true;
6585 sob.StreamOffsetWriteEnable = true;
6586 sob.StreamOutputBufferOffsetAddressEnable = true;
6587
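/* SurfaceSize is expressed in DWords, minus one; clamp to at least one
 * DWord for zero-sized buffers.
 */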
6588 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6589 sob.StreamOutputBufferOffsetAddress =
6590 rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6591 if (tgt->zero_offset) {
6592 sob.StreamOffset = 0;
6593 tgt->zero_offset = false;
6594 } else
6595 sob.StreamOffset = 0xFFFFFFFF; /* don't reset the offset; see above */
6596 #endif
6597 }
6598 }
6599 }
6600
6601 if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6602 uint32_t *decl_list =
6603 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
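/* The low byte of decl_list[0] is the packet's DWordLength (total DWords
 * minus the standard bias of 2); convert it back to a byte count.
 */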
6604 crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6605 }
6606
6607 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6608 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6609
6610 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6611 crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6612 sol.SOFunctionEnable = true;
6613 sol.SOStatisticsEnable = true;
6614
6615 sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6616 !ice->state.prims_generated_query_active;
6617 sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6618 }
6619
6620 assert(ice->state.streamout);
6621
6622 crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6623 GENX(3DSTATE_STREAMOUT_length));
6624 }
6625 } else {
6626 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6627 crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6628 }
6629 }
6630 #endif
6631 #if GFX_VER == 6
6632 if (ice->state.streamout_active) {
6633 if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6634 crocus_emit_so_svbi(ice);
6635 }
6636 }
6637 #endif
6638
6639 if (dirty & CROCUS_DIRTY_CLIP) {
6640 #if GFX_VER < 6
6641 const struct elk_clip_prog_data *clip_prog_data = (struct elk_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6642 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6643
6644 uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6645 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6646 _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6647 clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6648 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6649 clip.SingleProgramFlow = true;
6650 clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6651
6652 clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6653 clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6654
6655 clip.DispatchGRFStartRegisterForURBData = 1;
6656 clip.VertexURBEntryReadOffset = 0;
6657 clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6658
6659 clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6660 clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6661
6662 if (batch->ice->urb.nr_clip_entries >= 10) {
6663 /* Half of the URB entries go to each thread, and it has to be an
6664 * even number.
6665 */
6666 assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6667
6668 /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6669 * only 2 threads can output VUEs at a time.
6670 */
6671 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6672 } else {
6673 assert(batch->ice->urb.nr_clip_entries >= 5);
6674 clip.MaximumNumberofThreads = 1 - 1;
6675 }
6676 clip.VertexPositionSpace = VPOS_NDCSPACE;
6677 clip.UserClipFlagsMustClipEnable = true;
6678 clip.GuardbandClipTestEnable = true;
6679
6680 clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6681 clip.ScreenSpaceViewportXMin = -1.0;
6682 clip.ScreenSpaceViewportXMax = 1.0;
6683 clip.ScreenSpaceViewportYMin = -1.0;
6684 clip.ScreenSpaceViewportYMax = 1.0;
6685 clip.ViewportXYClipTestEnable = true;
6686 clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6687
6688 #if GFX_VER == 5 || GFX_VERx10 == 45
6689 clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6690 #else
6691 /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6692 * workaround.
6693 */
6694 clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6695 #endif
6696
6697 clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6698 clip.GuardbandClipTestEnable = true;
6699
6700 clip.ClipMode = clip_prog_data->clip_mode;
6701 #if GFX_VERx10 == 45
6702 clip.NegativeWClipTestEnable = true;
6703 #endif
6704 }
6705
6706 #else /* GFX_VER >= 6 */
6707 struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6708 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
6709 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6710 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6711 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6712 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6713 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6714 : ice->state.prim_is_points_or_lines);
6715 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6716 crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6717 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6718 if (cso_rast->cso.rasterizer_discard)
6719 cl.ClipMode = CLIPMODE_REJECT_ALL;
6720 else if (ice->state.window_space_position)
6721 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6722 else
6723 cl.ClipMode = CLIPMODE_NORMAL;
6724
6725 cl.PerspectiveDivideDisable = ice->state.window_space_position;
6726 cl.ViewportXYClipTestEnable = !points_or_lines;
6727
6728 cl.UserClipDistanceCullTestEnableBitmask =
6729 elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6730
6731 cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;
6732
6733 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6734 cl.MaximumVPIndex = ice->state.num_viewports - 1;
6735 }
6736 crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6737 ARRAY_SIZE(cso_rast->clip));
6738 #endif
6739 }
6740
6741 if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6742 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6743 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6744 const struct elk_stage_prog_data *prog_data = &vue_prog_data->base;
6745 #if GFX_VER == 7
6746 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6747 gen7_emit_vs_workaround_flush(batch);
6748 #endif
6749
6751 #if GFX_VER == 6
6752 struct push_bos push_bos = {};
6753 setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6754
6755 emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6756 #endif
6757 #if GFX_VER >= 6
6758 crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6759 #else
6760 uint32_t *vs_ptr = stream_state(batch,
6761 GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6762 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6763 _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6764 #endif
6765 {
6766 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6767
6768 vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6769
6770 #if GFX_VER < 6
6771 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6772 vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6773 vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6774
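/* Ironlake appears to count NumberofURBEntries in units of four, hence
 * the extra shift on Gen5.
 */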
6775 vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6776 vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6777
6778 vs.MaximumNumberofThreads =
6779 CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6780 vs.StatisticsEnable = false;
6781 vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6782 #endif
6783 #if GFX_VER == 5
6784 /* Force single program flow on Ironlake. We cannot reliably get
6785 * all applications working without it. See:
6786 * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6787 *
6788 * The most notable and reliably failing application is the Humus
6789 * demo "CelShading".
6790 */
6791 vs.SingleProgramFlow = true;
6792 vs.SamplerCount = 0; /* hardware requirement */
6793
6794 #endif
6795 #if GFX_VER >= 8
6796 vs.SIMD8DispatchEnable =
6797 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6798
6799 vs.UserClipDistanceCullTestEnableBitmask =
6800 vue_prog_data->cull_distance_mask;
6801 #endif
6802 }
6803
6804 #if GFX_VER == 6
6805 crocus_emit_pipe_control_flush(batch,
6806 "post VS const",
6807 PIPE_CONTROL_DEPTH_STALL |
6808 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6809 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6810 #endif
6811 }
6812
6813 if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6814 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6815 bool active = GFX_VER >= 6 && shader;
6816 #if GFX_VER == 6
6817 struct push_bos push_bos = {};
6818 if (shader)
6819 setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6820
6821 emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6822 #endif
6823 #if GFX_VERx10 == 70
6824 /**
6825 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6826 * Geometry > Geometry Shader > State:
6827 *
6828 * "Note: Because of corruption in IVB:GT2, software needs to flush the
6829 * whole fixed function pipeline when the GS enable changes value in
6830 * the 3DSTATE_GS."
6831 *
6832 * The hardware architects have clarified that in this context "flush the
6833 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6834 * Stall" bit set.
6835 */
6836 if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6837 gen7_emit_cs_stall_flush(batch);
6838 #endif
6839 #if GFX_VER >= 6
6840 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6841 #else
6842 uint32_t *gs_ptr = stream_state(batch,
6843 GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6844 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6845 _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6846 #endif
6847 {
6848 #if GFX_VER >= 6
6849 if (active) {
6850 const struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(shader->prog_data);
6851 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6852 const struct elk_stage_prog_data *prog_data = &gs_prog_data->base.base;
6853
6854 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6855 #if GFX_VER >= 7
6856 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6857 gs.OutputTopology = gs_prog_data->output_topology;
6858 gs.ControlDataHeaderSize =
6859 gs_prog_data->control_data_header_size_hwords;
6860
6861 gs.InstanceControl = gs_prog_data->invocations - 1;
6862 gs.DispatchMode = vue_prog_data->dispatch_mode;
6863
6864 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6865
6866 gs.ControlDataFormat = gs_prog_data->control_data_format;
6867 #endif
6868
6869 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6870 * Ivy Bridge and Haswell.
6871 *
6872 * On Ivy Bridge, setting this bit causes the vertices of a triangle
6873 * strip to be delivered to the geometry shader in an order that does
6874 * not strictly follow the OpenGL spec, but preserves triangle
6875 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
6876 * the geometry shader sees triangles:
6877 *
6878 * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6879 *
6880 * (Clearing the bit is even worse, because it fails to preserve
6881 * orientation).
6882 *
6883 * Triangle strips with adjacency are always ordered in a way that preserves
6884 * triangle orientation but does not strictly follow the OpenGL spec,
6885 * regardless of the setting of this bit.
6886 *
6887 * On Haswell, both triangle strips and triangle strips with adjacency
6888 * are always ordered in a way that preserves triangle orientation.
6889 * Setting this bit causes the ordering to strictly follow the OpenGL
6890 * spec.
6891 *
6892 * So in either case we want to set the bit. Unfortunately on Ivy
6893 * Bridge this will get the order close to correct but not perfect.
6894 */
6895 gs.ReorderMode = TRAILING;
6896 gs.MaximumNumberofThreads =
6897 GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6898 (batch->screen->devinfo.max_gs_threads - 1);
6899 #if GFX_VER < 7
6900 gs.SOStatisticsEnable = true;
6901 if (gs_prog_data->num_transform_feedback_bindings)
6902 gs.SVBIPayloadEnable = ice->state.streamout_active;
6903
6904 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled, as
6905 * was previously done for gen6.
6906 *
6907 * TODO: test with both disabled to see if the HW is behaving
6908 * as expected, like in gen7.
6909 */
6910 gs.SingleProgramFlow = true;
6911 gs.VectorMaskEnable = true;
6912 #endif
6913 #if GFX_VER >= 8
6914 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6915
6916 if (gs_prog_data->static_vertex_count != -1) {
6917 gs.StaticOutput = true;
6918 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6919 }
6920 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6921
6922 gs.UserClipDistanceCullTestEnableBitmask =
6923 vue_prog_data->cull_distance_mask;
6924
6925 const int urb_entry_write_offset = 1;
6926 const uint32_t urb_entry_output_length =
6927 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6928 urb_entry_write_offset;
6929
6930 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6931 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6932 #endif
6933 }
6934 #endif
6935 #if GFX_VER <= 6
6936 if (!active && ice->shaders.ff_gs_prog) {
6937 const struct elk_ff_gs_prog_data *gs_prog_data = (struct elk_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6938 /* In gen6, transform feedback for the VS stage is done with an
6939 * ad-hoc GS program. This block provides the 3DSTATE_GS setup
6940 * needed for it.
6941 */
6942 gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6943 gs.SingleProgramFlow = true;
6944 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6945 gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6946
6947 #if GFX_VER <= 5
6948 gs.GRFRegisterCount =
6949 DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6950 /* ELK_NEW_URB_FENCE */
6951 gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6952 gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6953 gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6954 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6955 #else
6956 gs.Enable = true;
6957 gs.VectorMaskEnable = true;
6958 gs.SVBIPayloadEnable = true;
6959 gs.SVBIPostIncrementEnable = true;
6960 gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6961 gs.SOStatisticsEnable = true;
6962 gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6963 #endif
6964 }
6965 #endif
6966 if (!active && !ice->shaders.ff_gs_prog) {
6967 #if GFX_VER < 8
6968 gs.DispatchGRFStartRegisterForURBData = 1;
6969 #if GFX_VER >= 7
6970 gs.IncludeVertexHandles = true;
6971 #endif
6972 #endif
6973 }
6974 #if GFX_VER >= 6
6975 gs.StatisticsEnable = true;
6976 #endif
6977 #if GFX_VER == 5 || GFX_VER == 6
6978 gs.RenderingEnabled = true;
6979 #endif
6980 #if GFX_VER <= 5
6981 gs.MaximumVPIndex = ice->state.num_viewports - 1;
6982 #endif
6983 }
6984 ice->state.gs_enabled = active;
6985 }
6986
6987 #if GFX_VER >= 7
6988 if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6989 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6990
6991 if (shader) {
6992 const struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(shader->prog_data);
6993 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6994 const struct elk_stage_prog_data *prog_data = &tcs_prog_data->base.base;
6995
6996 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
6997 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
6998 hs.InstanceCount = tcs_prog_data->instances - 1;
6999 hs.IncludeVertexHandles = true;
7000 hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7001 }
7002 } else {
7003 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7004 }
7005
7006 }
7007
7008 if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7009 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7010 if (shader) {
7011 const struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(shader->prog_data);
7012 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
7013 const struct elk_stage_prog_data *prog_data = &tes_prog_data->base.base;
7014
7015 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7016 te.Partitioning = tes_prog_data->partitioning;
7017 te.OutputTopology = tes_prog_data->output_topology;
7018 te.TEDomain = tes_prog_data->domain;
7019 te.TEEnable = true;
7020 te.MaximumTessellationFactorOdd = 63.0;
7021 te.MaximumTessellationFactorNotOdd = 64.0;
7022 };
7023 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7024 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7025
7026 ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7027 ds.ComputeWCoordinateEnable =
7028 tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
7029
7030 #if GFX_VER >= 8
7031 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7032 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7033 ds.UserClipDistanceCullTestEnableBitmask =
7034 vue_prog_data->cull_distance_mask;
7035 #endif
7036 };
7037 } else {
7038 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7039 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7040 }
7041 }
7042 #endif
7043 if (dirty & CROCUS_DIRTY_RASTER) {
7044
7045 #if GFX_VER < 6
7046 const struct elk_sf_prog_data *sf_prog_data = (struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7047 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7048 uint32_t *sf_ptr = stream_state(batch,
7049 GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7050 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7051 _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7052 sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7053 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7054 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7055 sf.DispatchGRFStartRegisterForURBData = 3;
7056 sf.VertexURBEntryReadOffset = ELK_SF_URB_ENTRY_READ_OFFSET;
7057 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7058 sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7059 sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7060 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7061
7062 sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7063
7064 sf.MaximumNumberofThreads =
7065 MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7066
7067 sf.SpritePointEnable = cso_state->point_quad_rasterization;
7068 sf.DestinationOriginHorizontalBias = 0.5;
7069 sf.DestinationOriginVerticalBias = 0.5;
7070
7071 sf.LineEndCapAntialiasingRegionWidth =
7072 cso_state->line_smooth ? _10pixels : _05pixels;
7073 sf.LastPixelEnable = cso_state->line_last_pixel;
7074 sf.AntialiasingEnable = cso_state->line_smooth;
7075
7076 sf.LineWidth = get_line_width(cso_state);
7077 sf.PointWidth = cso_state->point_size;
7078 sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7079 #if GFX_VERx10 >= 45
7080 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7081 #endif
7082 sf.ViewportTransformEnable = true;
7083 sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7084 sf.ScissorRectangleEnable = true;
7085 sf.CullMode = translate_cull_mode(cso_state->cull_face);
7086
7087 if (cso_state->flatshade_first) {
7088 sf.TriangleFanProvokingVertexSelect = 1;
7089 } else {
7090 sf.TriangleStripListProvokingVertexSelect = 2;
7091 sf.TriangleFanProvokingVertexSelect = 2;
7092 sf.LineStripListProvokingVertexSelect = 1;
7093 }
7094 }
7095 #else
7096 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7097 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7098 crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7099 sf.ViewportTransformEnable = !ice->state.window_space_position;
7100
7101 #if GFX_VER == 6
7102 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7103 uint32_t urb_entry_read_length;
7104 uint32_t urb_entry_read_offset;
7105 uint32_t point_sprite_enables;
7106 calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7107 &urb_entry_read_length,
7108 &urb_entry_read_offset);
7109 sf.VertexURBEntryReadLength = urb_entry_read_length;
7110 sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7111 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7112 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7113 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7114 #endif
7115
7116 #if GFX_VER >= 6 && GFX_VER < 8
7117 if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7118 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7119 #endif
7120 #if GFX_VER == 7
7121 if (ice->state.framebuffer.zsbuf) {
7122 struct crocus_resource *zres, *sres;
7123 crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7124 ice->state.framebuffer.zsbuf->texture,
7125 &zres, &sres);
7126 /* ANV thinks that the stencil-ness doesn't matter; this is just
7127 * about handling polygon offset scaling.
7128 */
7129 sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7130 }
7131 #endif
7132 }
7133 crocus_emit_merge(batch, cso->sf, dynamic_sf,
7134 ARRAY_SIZE(dynamic_sf));
7135 #if GFX_VER == 8
7136 crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7137 #endif
7138 #endif
7139 }
7140
7141 if (dirty & CROCUS_DIRTY_WM) {
7142 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7143 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7144 UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF;
7145 UNUSED const struct shader_info *fs_info =
7146 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7147
7148 #if GFX_VER == 6
7149 struct push_bos push_bos = {};
7150 setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7151
7152 emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7153 #endif
7154 #if GFX_VER >= 6
7155 crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7156 #else
7157 uint32_t *wm_ptr = stream_state(batch,
7158 GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7159
7160 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7161
7162 _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7163 #endif
7164 {
7165 #if GFX_VER <= 6
7166 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7167 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7168 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7169 #endif
7170 #if GFX_VER == 4
7171 /* On gen4, we only have one shader kernel */
7172 if (elk_wm_state_has_ksp(wm, 0)) {
7173 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7174 wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7175 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7176 wm_prog_data->base.dispatch_grf_start_reg;
7177 }
7178 #elif GFX_VER == 5
7179 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7180 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7181 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7182 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7183 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7184 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7185
7186 wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7187 wm.GRFRegisterCount1 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7188 wm.GRFRegisterCount2 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7189
7190 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7191 wm_prog_data->base.dispatch_grf_start_reg;
7192 #elif GFX_VER == 6
7193 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7194 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7195 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7196 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7197 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7198 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7199
7200 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7201 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7202 wm.DispatchGRFStartRegisterForConstantSetupData1 =
7203 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7204 wm.DispatchGRFStartRegisterForConstantSetupData2 =
7205 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7206 #endif
7207 #if GFX_VER <= 5
7208 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7209 wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7210 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7211 wm.SetupURBEntryReadOffset = 0;
7212 wm.EarlyDepthTestEnable = true;
7213 wm.LineAntialiasingRegionWidth = _05pixels;
7214 wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7215 wm.DepthCoefficientURBReadOffset = 1;
7216
7217 if (cso->cso.offset_tri) {
7218 wm.GlobalDepthOffsetEnable = true;
7219
7220 /* Something weird is going on with legacy_global_depth_bias,
7221 * offset_constant, scaling, and MRD. This value passes glean
7222 * but gives some odd results elsewhere (e.g. the
7223 * quad-offset-units test).
7224 */
7225 wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7226 wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7227 }
7228 wm.SamplerStatePointer = ro_bo(batch->state.bo,
7229 ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7230 #endif
7231
7232 wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7233 ice->state.statistics_counters_enabled : 0;
7234
7235 #if GFX_VER >= 6
7236 wm.LineAntialiasingRegionWidth = _10pixels;
7237 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7238
7239 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7240 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7241 #endif
7242 #if GFX_VER == 6
7243 wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7244 ice->state.cso_blend->dual_color_blending;
7245 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7246 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7247
7248 /* From the SNB PRM, volume 2 part 1, page 281:
7249 * "If the PS kernel does not need the Position XY Offsets
7250 * to compute a Position XY value, then this field should be
7251 * programmed to POSOFFSET_NONE."
7252 *
7253 * "SW Recommendation: If the PS kernel needs the Position Offsets
7254 * to compute a Position XY value, this field should match Position
7255 * ZW Interpolation Mode to ensure a consistent position.xyzw
7256 * computation."
7257 * We only require XY sample offsets, so this recommendation doesn't
7258 * look useful at the moment. We might need it in the future.
7259 */
7260 if (wm_prog_data->uses_pos_offset)
7261 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7262 else
7263 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7264 #endif
7265 wm.LineStippleEnable = cso->cso.line_stipple_enable;
7266 wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7267
7268 #if GFX_VER < 7
7269 if (wm_prog_data->base.use_alt_mode)
7270 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7271 wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7272 wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7273 #endif
7274
7275 #if GFX_VER < 8
7276 #if GFX_VER >= 6
7277 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7278
7279 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7280 if (fb->samples > 1) {
7281 if (cso->cso.multisample)
7282 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7283 else
7284 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7285
7286 if (elk_wm_prog_data_is_persample(wm_prog_data, 0))
7287 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7288 else
7289 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7290 } else {
7291 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7292 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7293 }
7294 #endif
7295
7296 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7297
7298 if (wm_prog_data->uses_kill ||
7299 ice->state.cso_zsa->cso.alpha_enabled ||
7300 ice->state.cso_blend->cso.alpha_to_coverage ||
7301 (GFX_VER >= 6 && wm_prog_data->uses_omask))
7302 wm.PixelShaderKillsPixel = true;
7303
7304 if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7305 writes_depth || wm.PixelShaderKillsPixel ||
7306 (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7307 wm.ThreadDispatchEnable = true;
7308
7309 #if GFX_VER >= 7
7310 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7311 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7312 #else
7313 if (wm_prog_data->base.total_scratch) {
7314 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7315 MESA_SHADER_FRAGMENT);
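/* Same power-of-two encoding as the Gen7+ 3DSTATE_PS scratch field:
 * log2(bytes) - 10 with 1KB granularity, hence ffs() - 11.
 */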
7316 wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7317 wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7318 }
7319
7320 wm.PixelShaderComputedDepth = writes_depth;
7321
7322 #endif
7323 /* The "UAV access enable" bits are unnecessary on HSW because they only
7324 * seem to have an effect on the HW-assisted coherency mechanism which we
7325 * don't need, and the rasterization-related UAV_ONLY flag and the
7326 * DISPATCH_ENABLE bit can be set independently from it.
7327 * Cf. gen8_upload_ps_extra().
7328 *
7329 * ELK_NEW_FRAGMENT_PROGRAM | ELK_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7330 * _NEW_COLOR
7331 */
7332 #if GFX_VERx10 == 75
7333 if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7334 wm_prog_data->has_side_effects)
7335 wm.PSUAVonly = ON;
7336 #endif
7337 #endif
7338 #if GFX_VER >= 7
7339 /* ELK_NEW_FS_PROG_DATA */
7340 if (wm_prog_data->early_fragment_tests)
7341 wm.EarlyDepthStencilControl = EDSC_PREPS;
7342 else if (wm_prog_data->has_side_effects)
7343 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7344 #endif
7345 #if GFX_VER == 8
7346 /* We could skip this bit if color writes are enabled. */
7347 if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7348 wm.ForceThreadDispatchEnable = ForceON;
7349 #endif
7350 };
7351
7352 #if GFX_VER <= 5
7353 if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7354 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7355 clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7356 }
7357 ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7358 }
7359 #endif
7360 }
7361
7362 #if GFX_VER >= 7
7363 if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7364 crocus_emit_sbe(batch, ice);
7365 }
7366 #endif
7367
7368 #if GFX_VER >= 8
7369 if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7370 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7371 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7372 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7373 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7374 const struct shader_info *fs_info =
7375 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7376 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7377 crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7378 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7379 pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7380 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7381 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7382 }
7383 crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7384 ARRAY_SIZE(cso_blend->ps_blend));
7385 }
7386 #endif
7387
7388 #if GFX_VER >= 6
7389 if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7390
7391 #if GFX_VER >= 8
7392 crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7393 set_depth_stencil_bits(ice, &wmds);
7394 }
7395 #else
7396 uint32_t ds_offset;
7397 void *ds_map = stream_state(batch,
7398 sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7399 64, &ds_offset);
7400 _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7401 set_depth_stencil_bits(ice, &ds);
7402 }
7403
7404 #if GFX_VER == 6
7405 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7406 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7407 ptr.DEPTH_STENCIL_STATEChange = true;
7408 }
7409 #else
7410 crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7411 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7412 }
7413 #endif
7414 #endif
7415 }
7416
7417 if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7418 /* Align to 64-byte boundary as per anv. */
7419 uint32_t scissor_offset;
7420 struct pipe_scissor_state *scissor_map = (void *)
7421 stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7422 64, &scissor_offset);
7423 for (int i = 0; i < ice->state.num_viewports; i++) {
7424 struct pipe_scissor_state scissor;
7425 crocus_fill_scissor_rect(ice, i, &scissor);
7426 scissor_map[i] = scissor;
7427 }
7428
7429 crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7430 ptr.ScissorRectPointer = scissor_offset;
7431 }
7432 }
7433 #endif
7434
7435 if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7436 struct isl_device *isl_dev = &batch->screen->isl_dev;
7437 #if GFX_VER >= 6
7438 crocus_emit_depth_stall_flushes(batch);
7439 #endif
7440 void *batch_ptr;
7441 struct crocus_resource *zres, *sres;
7442 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7443 batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7444
7445 struct isl_view view = {
7446 .base_level = 0,
7447 .levels = 1,
7448 .base_array_layer = 0,
7449 .array_len = 1,
7450 .swizzle = ISL_SWIZZLE_IDENTITY,
7451 };
7452 struct isl_depth_stencil_hiz_emit_info info = {
7453 .view = &view,
7454 .mocs = crocus_mocs(NULL, isl_dev),
7455 };
7456
7457 if (cso->zsbuf) {
7458 crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7459 struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7460 if (zsbuf->align_res) {
7461 zres = (struct crocus_resource *)zsbuf->align_res;
7462 }
7463 view.base_level = cso->zsbuf->u.tex.level;
7464 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7465 view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7466
7467 if (zres) {
7468 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7469
7470 info.depth_surf = &zres->surf;
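/* The relocation target is the depth address field inside the packet
 * being built: the packet's offset in the batch plus ISL's recorded
 * field offset.
 */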
7471 info.depth_address = crocus_command_reloc(batch,
7472 (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7473 zres->bo, 0, RELOC_32BIT);
7474
7475 info.mocs = crocus_mocs(zres->bo, isl_dev);
7476 view.format = zres->surf.format;
7477
7478 if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7479 info.hiz_usage = zres->aux.usage;
7480 info.hiz_surf = &zres->aux.surf;
7481 uint64_t hiz_offset = 0;
7482
7483 #if GFX_VER == 6
7484 /* HiZ surfaces on Sandy Bridge technically don't support
7485 * mip-mapping. However, we can fake it by offsetting to the
7486 * first slice of LOD0 in the HiZ surface.
7487 */
7488 isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7489 view.base_level, 0, 0,
7490 &hiz_offset, NULL, NULL);
7491 #endif
7492 info.hiz_address = crocus_command_reloc(batch,
7493 (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7494 zres->aux.bo, zres->aux.offset + hiz_offset,
7495 RELOC_32BIT);
7496 info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7497 }
7498 }
7499
7500 #if GFX_VER >= 6
7501 if (sres) {
7502 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7503 info.stencil_aux_usage = sres->aux.usage;
7504 info.stencil_surf = &sres->surf;
7505
7506 uint64_t stencil_offset = 0;
7507 #if GFX_VER == 6
7508 /* Stencil surfaces on Sandy Bridge technically don't support
7509 * mip-mapping. However, we can fake it by offsetting to the
7510 * first slice of LOD0 in the stencil surface.
7511 */
7512 isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7513 view.base_level, 0, 0,
7514 &stencil_offset, NULL, NULL);
7515 #endif
7516
7517 info.stencil_address = crocus_command_reloc(batch,
7518 (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7519 sres->bo, stencil_offset, RELOC_32BIT);
7520 if (!zres) {
7521 view.format = sres->surf.format;
7522 info.mocs = crocus_mocs(sres->bo, isl_dev);
7523 }
7524 }
7525 #endif
7526 }
7527 isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7528 }
7529
7530 /* TODO: Disable emitting this until something uses a stipple. */
7531 if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7532 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7533 for (int i = 0; i < 32; i++) {
7534 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7535 }
7536 }
7537 }
7538
7539 if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7540 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7541 crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7542 }
7543
7544 #if GFX_VER >= 8
7545 if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7546 crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7547 topo.PrimitiveTopologyType =
7548 translate_prim_type(draw->mode, ice->state.patch_vertices);
7549 }
7550 }
7551 #endif
7552
7553 #if GFX_VER <= 5
7554 if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
      upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog != NULL,
                                      ice->shaders.gs_offset,
                                      ice->shaders.vs_offset, ice->shaders.sf_offset,
                                      ice->shaders.clip_offset, ice->shaders.wm_offset,
                                      ice->shaders.cc_offset);
7558 crocus_upload_urb_fence(batch);
7559
7560 crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7561 cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7562 cs.URBEntryAllocationSize = ice->urb.csize - 1;
7563 }
7564 dirty |= CROCUS_DIRTY_GEN4_CURBE;
7565 }
7566 #endif
7567 if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7568 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7569 if (fb->width && fb->height) {
7570 crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7571 rect.ClippedDrawingRectangleXMax = fb->width - 1;
7572 rect.ClippedDrawingRectangleYMax = fb->height - 1;
7573 }
7574 }
7575 }
7576
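   /* Vertex buffers: the application-bound buffers come first; if the VS
    * reads draw parameters (gl_BaseVertex/gl_BaseInstance) or derived draw
    * parameters (e.g. gl_DrawID), one or two internal buffers are appended
    * after them.
    */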
7577 if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7578 const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7579 const uint32_t count = user_count +
7580 ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7581 uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7582
7583 if (count) {
7584 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7585
7586 uint32_t *map =
7587 crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7588 _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7589 vb.DWordLength = (vb_dwords * count + 1) - 2;
7590 }
7591 map += 1;
7592
7593 uint32_t bound = dynamic_bound;
7594 int i;
7595 while (bound) {
7596 i = u_bit_scan(&bound);
7597 struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7598 struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7599 uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7600
7601 emit_vertex_buffer_state(batch, i, bo,
7602 buf->buffer_offset,
7603 ice->state.vb_end[i],
7604 ice->state.cso_vertex_elements->strides[i],
7605 step_rate,
7606 &map);
7607 }
7608 i = user_count;
7609 if (ice->state.vs_uses_draw_params) {
7610 struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7611 emit_vertex_buffer_state(batch, i++,
7612 res->bo,
7613 ice->draw.draw_params.offset,
7614 ice->draw.draw_params.res->width0,
7615 0, 0, &map);
7616 }
7617 if (ice->state.vs_uses_derived_draw_params) {
7618 struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7619 emit_vertex_buffer_state(batch, i++,
7620 res->bo,
7621 ice->draw.derived_draw_params.offset,
7622 ice->draw.derived_draw_params.res->width0,
7623 0, 0, &map);
7624 }
7625 }
7626 }
7627
7628 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7629 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7630 const unsigned entries = MAX2(cso->count, 1);
7631 if (!(ice->state.vs_needs_sgvs_element ||
7632 ice->state.vs_uses_derived_draw_params ||
7633 ice->state.vs_needs_edge_flag)) {
7634 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7635 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7636 } else {
7637 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7638 const unsigned dyn_count = cso->count +
7639 ice->state.vs_needs_sgvs_element +
7640 ice->state.vs_uses_derived_draw_params;
7641
7642 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7643 &dynamic_ves, ve) {
7644 ve.DWordLength =
7645 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7646 }
7647 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7648 (cso->count - ice->state.vs_needs_edge_flag) *
7649 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7650 uint32_t *ve_pack_dest =
7651 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7652 GENX(VERTEX_ELEMENT_STATE_length)];
7653
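         /* Append an internal element for system-generated values: its
          * first two components hold the draw parameters when used, and on
          * pre-Gen8 hardware (which lacks 3DSTATE_VF_SGVS) VertexID and
          * InstanceID are stored directly into components 2/3.
          */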
7654 if (ice->state.vs_needs_sgvs_element) {
7655 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7656 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7657 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7658 ve.Valid = true;
7659 ve.VertexBufferIndex =
7660 util_bitcount64(ice->state.bound_vertex_buffers);
7661 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7662 ve.Component0Control = base_ctrl;
7663 ve.Component1Control = base_ctrl;
7664 #if GFX_VER < 8
7665 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7666 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7667 #else
7668 ve.Component2Control = VFCOMP_STORE_0;
7669 ve.Component3Control = VFCOMP_STORE_0;
7670 #endif
7671 #if GFX_VER < 5
7672 ve.DestinationElementOffset = cso->count * 4;
7673 #endif
7674 }
7675 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7676 }
7677 if (ice->state.vs_uses_derived_draw_params) {
7678 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7679 ve.Valid = true;
7680 ve.VertexBufferIndex =
7681 util_bitcount64(ice->state.bound_vertex_buffers) +
7682 ice->state.vs_uses_draw_params;
7683 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7684 ve.Component0Control = VFCOMP_STORE_SRC;
7685 ve.Component1Control = VFCOMP_STORE_SRC;
7686 ve.Component2Control = VFCOMP_STORE_0;
7687 ve.Component3Control = VFCOMP_STORE_0;
7688 #if GFX_VER < 5
7689 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7690 #endif
7691 }
7692 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7693 }
7694 if (ice->state.vs_needs_edge_flag) {
7695 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
7696 ve_pack_dest[i] = cso->edgeflag_ve[i];
7697 }
7698
7699 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7700 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7701 }
7702
7703 #if GFX_VER == 8
7704 if (!ice->state.vs_needs_edge_flag) {
7705 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7706 entries * GENX(3DSTATE_VF_INSTANCING_length));
7707 } else {
7708 assert(cso->count > 0);
7709 const unsigned edgeflag_index = cso->count - 1;
7710 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7711 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7712 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7713
7714 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7715 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7716 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7717 vi.VertexElementIndex = edgeflag_index +
7718 ice->state.vs_needs_sgvs_element +
7719 ice->state.vs_uses_derived_draw_params;
7720 }
7721 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
7722 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7723
7724 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7725 entries * GENX(3DSTATE_VF_INSTANCING_length));
7726 }
7727 #endif
7728 }
7729
7730 #if GFX_VER == 8
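   /* On Gen8, VertexID/InstanceID aren't stored by a vertex element;
    * 3DSTATE_VF_SGVS instead injects them into components 2/3 of the
    * element at the given offset (the internal element appended above).
    */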
7731 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7732 const struct elk_vs_prog_data *vs_prog_data = (void *)
7733 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7734 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7735
7736 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7737 if (vs_prog_data->uses_vertexid) {
7738 sgv.VertexIDEnable = true;
7739 sgv.VertexIDComponentNumber = 2;
7740 sgv.VertexIDElementOffset =
7741 cso->count - ice->state.vs_needs_edge_flag;
7742 }
7743
7744 if (vs_prog_data->uses_instanceid) {
7745 sgv.InstanceIDEnable = true;
7746 sgv.InstanceIDComponentNumber = 3;
7747 sgv.InstanceIDElementOffset =
7748 cso->count - ice->state.vs_needs_edge_flag;
7749 }
7750 }
7751 }
7752 #endif
7753 #if GFX_VERx10 >= 75
7754 if (dirty & CROCUS_DIRTY_GEN75_VF) {
7755 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7756 if (draw->primitive_restart) {
7757 vf.IndexedDrawCutIndexEnable = true;
7758 vf.CutIndex = draw->restart_index;
7759 }
7760 }
7761 }
7762 #endif
7763
7764 #if GFX_VER == 8
7765 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7766 bool enable = want_pma_fix(ice);
7767 genX(crocus_update_pma_fix)(ice, batch, enable);
7768 }
7769 #endif
7770
7771 #if GFX_VER <= 5
7772 if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7773 gen4_upload_curbe(batch);
7774 }
7775 #endif
7776 }
7777
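/**
 * Upload any dirty render state and emit the 3DPRIMITIVE for a draw,
 * handling index buffer uploads, indirect draw parameters, and
 * predication along the way.
 */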
7778 static void
7779 crocus_upload_render_state(struct crocus_context *ice,
7780 struct crocus_batch *batch,
7781 const struct pipe_draw_info *draw,
7782 unsigned drawid_offset,
7783 const struct pipe_draw_indirect_info *indirect,
7784 const struct pipe_draw_start_count_bias *sc)
7785 {
7786 #if GFX_VER >= 7
7787 bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
7788 #endif
7789
7790 batch->no_wrap = true;
7791 batch->contains_draw = true;
7792
7793 crocus_update_surface_base_address(batch);
7794
7795 crocus_upload_dirty_render_state(ice, batch, draw);
7796
7797 batch->no_wrap = false;
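   /* Emit 3DSTATE_INDEX_BUFFER only when the buffer, size, or index format
    * changes; user-pointer index data is first streamed into an upload
    * buffer.  Pre-Haswell, primitive restart also lives in this packet, so
    * changes to it force a re-emit as well.
    */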
7798 if (draw->index_size > 0) {
7799 unsigned offset;
7800 unsigned size;
7801 bool emit_index = false;
7802
7803 if (draw->has_user_indices) {
7804 unsigned start_offset = draw->index_size * sc->start;
7805 u_upload_data(ice->ctx.stream_uploader, 0,
7806 sc->count * draw->index_size, 4,
7807 (char *)draw->index.user + start_offset,
7808 &offset, &ice->state.index_buffer.res);
7809 offset -= start_offset;
7810 size = start_offset + sc->count * draw->index_size;
7811 emit_index = true;
7812 } else {
7813 struct crocus_resource *res = (void *) draw->index.resource;
7814
7815 if (ice->state.index_buffer.res != draw->index.resource) {
7816 res->bind_history |= PIPE_BIND_INDEX_BUFFER;
7817 pipe_resource_reference(&ice->state.index_buffer.res,
7818 draw->index.resource);
7819 emit_index = true;
7820 }
7821 offset = 0;
7822 size = draw->index.resource->width0;
7823 }
7824
7825 if (!emit_index &&
7826 (ice->state.index_buffer.size != size ||
7827 ice->state.index_buffer.index_size != draw->index_size
7828 #if GFX_VERx10 < 75
7829 || ice->state.index_buffer.prim_restart != draw->primitive_restart
7830 #endif
7831 )
7832 )
7833 emit_index = true;
7834
7835 if (emit_index) {
7836 struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);
7837
7838 crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
7839 #if GFX_VERx10 < 75
7840 ib.CutIndexEnable = draw->primitive_restart;
7841 #endif
7842 ib.IndexFormat = draw->index_size >> 1;
7843 ib.BufferStartingAddress = ro_bo(bo, offset);
7844 #if GFX_VER >= 8
7845 ib.BufferSize = bo->size - offset;
7846 #else
7847 ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
7848 #endif
7849 #if GFX_VER >= 6
7850 ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
7851 #endif
7852 }
7853 ice->state.index_buffer.size = size;
7854 ice->state.index_buffer.offset = offset;
7855 ice->state.index_buffer.index_size = draw->index_size;
7856 #if GFX_VERx10 < 75
7857 ice->state.index_buffer.prim_restart = draw->primitive_restart;
7858 #endif
7859 }
7860 }
7861
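   /* MMIO offsets of the draw-parameter registers that 3DPRIMITIVE reads
    * when Indirect Parameter Enable is set; indirect draws fill them with
    * MI_LOAD_REGISTER_MEM/IMM below.
    */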
7862 #define _3DPRIM_END_OFFSET 0x2420
7863 #define _3DPRIM_START_VERTEX 0x2430
7864 #define _3DPRIM_VERTEX_COUNT 0x2434
7865 #define _3DPRIM_INSTANCE_COUNT 0x2438
7866 #define _3DPRIM_START_INSTANCE 0x243C
7867 #define _3DPRIM_BASE_VERTEX 0x2440
7868
7869 #if GFX_VER >= 7
7870 if (indirect && !indirect->count_from_stream_output) {
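      /* ARB_indirect_parameters-style draws: predicate each 3DPRIMITIVE on
       * draw_index < draw_count, so draws beyond the GPU-provided count are
       * discarded.
       */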
7871 if (indirect->indirect_draw_count) {
7872 use_predicate = true;
7873
7874 struct crocus_bo *draw_count_bo =
7875 crocus_resource_bo(indirect->indirect_draw_count);
7876 unsigned draw_count_offset =
7877 indirect->indirect_draw_count_offset;
7878
7879 crocus_emit_pipe_control_flush(batch,
7880 "ensure indirect draw buffer is flushed",
7881 PIPE_CONTROL_FLUSH_ENABLE);
7882 if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
7883 #if GFX_VERx10 >= 75
7884 struct mi_builder b;
7885 mi_builder_init(&b, &batch->screen->devinfo, batch);
7886
7887 /* comparison = draw id < draw count */
7888 struct mi_value comparison =
7889 mi_ult(&b, mi_imm(drawid_offset),
7890 mi_mem32(ro_bo(draw_count_bo,
7891 draw_count_offset)));
7892 #if GFX_VER == 8
7893 /* predicate = comparison & conditional rendering predicate */
7894 mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
7895 mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
7896 #else
7897 /* predicate = comparison & conditional rendering predicate */
7898 struct mi_value pred = mi_iand(&b, comparison,
7899 mi_reg32(CS_GPR(15)));
7900
7901 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
7902 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7903
7904 unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
7905 MI_PREDICATE_COMBINEOP_SET |
7906 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7907
7908 crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
7909 #endif
7910 #endif
7911 } else {
7912 uint32_t mi_predicate;
7913
7914 /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
7915 crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
7916 /* Upload the current draw count from the draw parameters buffer
7917 * to MI_PREDICATE_SRC0.
7918 */
7919 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
7920 draw_count_bo, draw_count_offset);
7921 /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
7922 crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
7923
7924 if (drawid_offset == 0) {
7925 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
7926 MI_PREDICATE_COMBINEOP_SET |
7927 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7928 } else {
7929 /* While draw_index < draw_count the predicate's result will be
7930 * (draw_index == draw_count) ^ TRUE = TRUE
7931 * When draw_index == draw_count the result is
7932 * (TRUE) ^ TRUE = FALSE
7933 * After this all results will be:
7934 * (FALSE) ^ FALSE = FALSE
7935 */
7936 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
7937 MI_PREDICATE_COMBINEOP_XOR |
7938 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7939 }
7940 crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
7941 }
7942 }
7943
7944 #if GFX_VER >= 7
7945 struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
7946 assert(bo);
7947
7948 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7949 lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
7950 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
7951 }
7952 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7953 lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
7954 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
7955 }
7956 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7957 lrm.RegisterAddress = _3DPRIM_START_VERTEX;
7958 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
7959 }
7960 if (draw->index_size) {
7961 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7962 lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
7963 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
7964 }
7965 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7966 lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
7967 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
7968 }
7969 } else {
7970 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7971 lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
7972 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
7973 }
7974 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
7975 lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
7976 lri.DataDWord = 0;
7977 }
7978 }
7979 #endif
7980 } else if (indirect && indirect->count_from_stream_output) {
7981 #if GFX_VERx10 >= 75
7982 struct crocus_stream_output_target *so =
7983 (void *) indirect->count_from_stream_output;
7984
7985 /* XXX: Replace with actual cache tracking */
7986 crocus_emit_pipe_control_flush(batch,
7987 "draw count from stream output stall",
7988 PIPE_CONTROL_CS_STALL);
7989
7990 struct mi_builder b;
7991 mi_builder_init(&b, &batch->screen->devinfo, batch);
7992
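      /* Compute the vertex count on the GPU:
       * (current SO write offset - starting buffer offset) / stride.
       */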
7993 struct crocus_address addr =
7994 ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
7995 struct mi_value offset =
7996 mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
7997
7998 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
7999 mi_udiv32_imm(&b, offset, so->stride));
8000
8001 _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
8002 _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
8003 _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
8004 _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
8005 #endif
8006 }
8007 #else
8008 assert(!indirect);
8009 #endif
8010
8011 crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8012 prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8013 #if GFX_VER >= 7
8014 prim.PredicateEnable = use_predicate;
8015 #endif
8016
8017 prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
8018 if (indirect) {
         /* XXX: Probably have to do something for gen6 here? */
8020 #if GFX_VER >= 7
8021 prim.IndirectParameterEnable = true;
8022 #endif
8023 } else {
8024 #if GFX_VER >= 5
8025 prim.StartInstanceLocation = draw->start_instance;
8026 #endif
8027 prim.InstanceCount = draw->instance_count;
8028 prim.VertexCountPerInstance = sc->count;
8029
8030 prim.StartVertexLocation = sc->start;
8031
8032 if (draw->index_size) {
8033 prim.BaseVertexLocation += sc->index_bias;
8034 }
8035 }
8036 }
8037 }
8038
8039 #if GFX_VER >= 7
8040
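/**
 * Upload any dirty compute state and emit a GPGPU_WALKER to launch the
 * grid: MEDIA_VFE_STATE, the CURBE push constants, the interface
 * descriptor, and MI_PREDICATE setup for indirect dispatches.
 */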
8041 static void
8042 crocus_upload_compute_state(struct crocus_context *ice,
8043 struct crocus_batch *batch,
8044 const struct pipe_grid_info *grid)
8045 {
8046 const uint64_t stage_dirty = ice->state.stage_dirty;
8047 struct crocus_screen *screen = batch->screen;
8048 const struct intel_device_info *devinfo = &screen->devinfo;
8049 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
8050 struct crocus_compiled_shader *shader =
8051 ice->shaders.prog[MESA_SHADER_COMPUTE];
8052 struct elk_stage_prog_data *prog_data = shader->prog_data;
8053 struct elk_cs_prog_data *cs_prog_data = (void *) prog_data;
8054 const struct intel_cs_dispatch_info dispatch =
8055 elk_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
8056
8057 crocus_update_surface_base_address(batch);
8058 if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
8059 upload_sysvals(ice, MESA_SHADER_COMPUTE);
8060
8061 if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
8062 crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
8063 ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
8064 crocus_upload_binding_table(ice, batch,
8065 ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
8066 ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
8067 }
8068
8069 if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
8070 crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
8071
8072 if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8073 cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8074 /* The MEDIA_VFE_STATE documentation for Gen8+ says:
8075 *
8076 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
8077 * the only bits that are changed are scoreboard related: Scoreboard
8078 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
8079 * these scoreboard related states, a MEDIA_STATE_FLUSH is
8080 * sufficient."
8081 */
8082 crocus_emit_pipe_control_flush(batch,
8083 "workaround: stall before MEDIA_VFE_STATE",
8084 PIPE_CONTROL_CS_STALL);
8085
8086 crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
8087 if (prog_data->total_scratch) {
8088 struct crocus_bo *bo =
8089 crocus_get_scratch_space(ice, prog_data->total_scratch,
8090 MESA_SHADER_COMPUTE);
8091 #if GFX_VER == 8
8092 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
8093 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
8094 */
8095 vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
8096 #elif GFX_VERx10 == 75
8097 /* Haswell's Per Thread Scratch Space is in the range [0, 10]
8098 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
8099 */
8100 vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
8101 #else
8102 /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
8103 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
8104 */
8105 vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
8106 #endif
8107 vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
8108 }
8109
8110 vfe.MaximumNumberofThreads =
8111 devinfo->max_cs_threads * devinfo->subslice_total - 1;
8112 vfe.ResetGatewayTimer =
8113 Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
8121 vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
8122 vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;
8123
8124 vfe.CURBEAllocationSize =
8125 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
8126 cs_prog_data->push.cross_thread.regs, 2);
8127 }
8128 }
8129
8130 /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
8131 if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8132 cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8133 uint32_t curbe_data_offset = 0;
8134 assert(cs_prog_data->push.cross_thread.dwords == 0 &&
8135 cs_prog_data->push.per_thread.dwords == 1 &&
8136 cs_prog_data->base.param[0] == ELK_PARAM_BUILTIN_SUBGROUP_ID);
8137 const unsigned push_const_size =
8138 elk_cs_push_const_total_size(cs_prog_data, dispatch.threads);
8139 uint32_t *curbe_data_map =
8140 stream_state(batch,
8141 ALIGN(push_const_size, 64), 64,
8142 &curbe_data_offset);
8143 assert(curbe_data_map);
8144 memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
8145 crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
8146 curbe_data_map);
8147
8148 crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
8149 curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
8150 curbe.CURBEDataStartAddress = curbe_data_offset;
8151 }
8152 }
8153
8154 if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
8155 CROCUS_STAGE_DIRTY_BINDINGS_CS |
8156 CROCUS_STAGE_DIRTY_CONSTANTS_CS |
8157 CROCUS_STAGE_DIRTY_CS)) {
8158 uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      const uint64_t ksp = KSP(ice, shader) +
         elk_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
8160 crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
8161 idd.KernelStartPointer = ksp;
8162 idd.SamplerStatePointer = shs->sampler_offset;
8163 idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
8164 idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
8165 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
8166 idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
8167 idd.BarrierEnable = cs_prog_data->uses_barrier;
8168 idd.SharedLocalMemorySize = elk_encode_slm_size(GFX_VER,
8169 prog_data->total_shared);
8170 #if GFX_VERx10 >= 75
8171 idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
8172 #endif
8173 }
8174
8175 crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
8176 load.InterfaceDescriptorTotalLength =
8177 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
8178 load.InterfaceDescriptorDataStartAddress =
8179 emit_state(batch, desc, sizeof(desc), 64);
8180 }
8181 }
8182
8183 #define GPGPU_DISPATCHDIMX 0x2500
8184 #define GPGPU_DISPATCHDIMY 0x2504
8185 #define GPGPU_DISPATCHDIMZ 0x2508
8186
8187 if (grid->indirect) {
8188 struct crocus_state_ref *grid_size = &ice->state.grid_size;
8189 struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
8190 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8191 lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
8192 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
8193 }
8194 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8195 lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
8196 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
8197 }
8198 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8199 lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
8200 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
8201 }
8202
8203 #if GFX_VER == 7
8204 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
8205 _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
8206 crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
8207
8208 /* Load compute_dispatch_indirect_x_size into SRC0 */
8209 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
8210
8211 /* predicate = (compute_dispatch_indirect_x_size == 0); */
8212 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8213 mip.LoadOperation = LOAD_LOAD;
8214 mip.CombineOperation = COMBINE_SET;
8215 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8216 };
8217
8218 /* Load compute_dispatch_indirect_y_size into SRC0 */
8219 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
8220
8221 /* predicate = (compute_dispatch_indirect_y_size == 0); */
8222 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8223 mip.LoadOperation = LOAD_LOAD;
8224 mip.CombineOperation = COMBINE_OR;
8225 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8226 };
8227
8228 /* Load compute_dispatch_indirect_z_size into SRC0 */
8229 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
8230
8231 /* predicate = (compute_dispatch_indirect_z_size == 0); */
8232 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8233 mip.LoadOperation = LOAD_LOAD;
8234 mip.CombineOperation = COMBINE_OR;
8235 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8236 };
8237
8238 /* predicate = !predicate; */
8239 #define COMPARE_FALSE 1
8240 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8241 mip.LoadOperation = LOAD_LOADINV;
8242 mip.CombineOperation = COMBINE_OR;
8243 mip.CompareOperation = COMPARE_FALSE;
8244 }
8245 #endif
8246 }
8247
8248 crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
8249 ggw.IndirectParameterEnable = grid->indirect != NULL;
8250 ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
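      /* SIMDSize is encoded as 0 = SIMD8, 1 = SIMD16, 2 = SIMD32. */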
8251 ggw.SIMDSize = dispatch.simd_size / 16;
8252 ggw.ThreadDepthCounterMaximum = 0;
8253 ggw.ThreadHeightCounterMaximum = 0;
8254 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
8255 ggw.ThreadGroupIDXDimension = grid->grid[0];
8256 ggw.ThreadGroupIDYDimension = grid->grid[1];
8257 ggw.ThreadGroupIDZDimension = grid->grid[2];
8258 ggw.RightExecutionMask = dispatch.right_mask;
8259 ggw.BottomExecutionMask = 0xffffffff;
8260 }
8261
8262 crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
8263
8264 batch->contains_draw = true;
8265 }
8266
8267 #endif /* GFX_VER >= 7 */
8268
8269 /**
8270 * State module teardown.
8271 */
8272 static void
8273 crocus_destroy_state(struct crocus_context *ice)
8274 {
8275 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
8276
8277 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8278 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8279
8280 free(ice->state.genx);
8281
8282 for (int i = 0; i < 4; i++) {
8283 pipe_so_target_reference(&ice->state.so_target[i], NULL);
8284 }
8285
8286 util_unreference_framebuffer_state(cso);
8287
8288 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8289 struct crocus_shader_state *shs = &ice->state.shaders[stage];
8290 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8291 pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8292 }
8293 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8294 pipe_resource_reference(&shs->image[i].base.resource, NULL);
8295 }
8296 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8297 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8298 }
8299 for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8300 pipe_sampler_view_reference((struct pipe_sampler_view **)
8301 &shs->textures[i], NULL);
8302 }
8303 }
8304
8305 for (int i = 0; i < 16; i++)
8306 pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8307 pipe_resource_reference(&ice->state.grid_size.res, NULL);
8308
8309 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8310 }
8311
8312 /* ------------------------------------------------------------------- */
8313
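/**
 * Re-flag any state that refers to a PIPE_BUFFER resource whose backing
 * storage has been replaced, so the next draw re-emits it with the new
 * address (and, for SSBOs, rebinds the buffer outright).
 */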
8314 static void
8315 crocus_rebind_buffer(struct crocus_context *ice,
8316 struct crocus_resource *res)
8317 {
8318 struct pipe_context *ctx = &ice->ctx;
8319
8320 assert(res->base.b.target == PIPE_BUFFER);
8321
8322 /* Buffers can't be framebuffer attachments, nor display related,
8323 * and we don't have upstream Clover support.
8324 */
8325 assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
8326 PIPE_BIND_RENDER_TARGET |
8327 PIPE_BIND_BLENDABLE |
8328 PIPE_BIND_DISPLAY_TARGET |
8329 PIPE_BIND_CURSOR |
8330 PIPE_BIND_COMPUTE_RESOURCE |
8331 PIPE_BIND_GLOBAL)));
8332
8333 if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
8334 uint64_t bound_vbs = ice->state.bound_vertex_buffers;
8335 while (bound_vbs) {
8336 const int i = u_bit_scan64(&bound_vbs);
8337 struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];
8338
8339 if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
8340 ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
8341 }
8342 }
8343
8344 if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
8345 ice->state.index_buffer.res) {
8346 if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
8347 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8348 }
8349 /* There is no need to handle these:
8350 * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
8351 * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
8352 */
8353
8354 if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
8355 /* XXX: be careful about resetting vs appending... */
8356 for (int i = 0; i < 4; i++) {
8357 if (ice->state.so_target[i] &&
8358 (ice->state.so_target[i]->buffer == &res->base.b)) {
8359 #if GFX_VER == 6
8360 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
8361 #else
8362 ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
8363 #endif
8364 }
8365 }
8366 }
8367
8368 for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
8369 struct crocus_shader_state *shs = &ice->state.shaders[s];
8370 enum pipe_shader_type p_stage = stage_to_pipe(s);
8371
8372 if (!(res->bind_stages & (1 << s)))
8373 continue;
8374
8375 if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
8376 /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
8377 uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
8378 while (bound_cbufs) {
8379 const int i = u_bit_scan(&bound_cbufs);
8380 struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
8381
8382 if (res->bo == crocus_resource_bo(cbuf->buffer)) {
8383 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
8384 }
8385 }
8386 }
8387
8388 if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
8389 uint32_t bound_ssbos = shs->bound_ssbos;
8390 while (bound_ssbos) {
8391 const int i = u_bit_scan(&bound_ssbos);
8392 struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
8393
8394 if (res->bo == crocus_resource_bo(ssbo->buffer)) {
8395 struct pipe_shader_buffer buf = {
8396 .buffer = &res->base.b,
8397 .buffer_offset = ssbo->buffer_offset,
8398 .buffer_size = ssbo->buffer_size,
8399 };
8400 crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
8401 (shs->writable_ssbos >> i) & 1);
8402 }
8403 }
8404 }
8405
8406 if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
8407 uint32_t bound_sampler_views = shs->bound_sampler_views;
8408 while (bound_sampler_views) {
8409 const int i = u_bit_scan(&bound_sampler_views);
8410 struct crocus_sampler_view *isv = shs->textures[i];
8411 struct crocus_bo *bo = isv->res->bo;
8412
8413 if (res->bo == bo) {
8414 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
8415 }
8416 }
8417 }
8418
8419 if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
8420 uint32_t bound_image_views = shs->bound_image_views;
8421 while (bound_image_views) {
8422 const int i = u_bit_scan(&bound_image_views);
8423 struct crocus_image_view *iv = &shs->image[i];
8424 struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);
8425
8426 if (res->bo == bo)
8427 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
8428 }
8429 }
8430 }
8431 }
8432
8433 /* ------------------------------------------------------------------- */
8434
8435 static unsigned
8436 flags_to_post_sync_op(uint32_t flags)
8437 {
8438 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8439 return WriteImmediateData;
8440
8441 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8442 return WritePSDepthCount;
8443
8444 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8445 return WriteTimestamp;
8446
8447 return 0;
8448 }
8449
/**
 * Do the given flags have a Post Sync or LRI Post Sync operation?
 */
8453 static enum pipe_control_flags
8454 get_post_sync_flags(enum pipe_control_flags flags)
8455 {
8456 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8457 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8458 PIPE_CONTROL_WRITE_TIMESTAMP |
8459 PIPE_CONTROL_LRI_POST_SYNC_OP;
8460
8461 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8462 * "LRI Post Sync Operation". So more than one bit set would be illegal.
8463 */
8464 assert(util_bitcount(flags) <= 1);
8465
8466 return flags;
8467 }
8468
#define IS_COMPUTE_PIPELINE(batch) ((batch)->name == CROCUS_BATCH_COMPUTE)
8470
8471 /**
8472 * Emit a series of PIPE_CONTROL commands, taking into account any
8473 * workarounds necessary to actually accomplish the caller's request.
8474 *
8475 * Unless otherwise noted, spec quotations in this function come from:
8476 *
8477 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8478 * Restrictions for PIPE_CONTROL.
8479 *
8480 * You should not use this function directly. Use the helpers in
8481 * crocus_pipe_control.c instead, which may split the pipe control further.
8482 */
8483 static void
8484 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8485 const char *reason,
8486 uint32_t flags,
8487 struct crocus_bo *bo,
8488 uint32_t offset,
8489 uint64_t imm)
8490 {
8491 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8492 enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8493 UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8494 post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8495
8496 /* Recursive PIPE_CONTROL workarounds --------------------------------
8497 * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8498 *
8499 * We do these first because we want to look at the original operation,
8500 * rather than any workarounds we set.
8501 */
8502
8503 /* "Flush Types" workarounds ---------------------------------------------
8504 * We do these now because they may add post-sync operations or CS stalls.
8505 */
8506
8507 if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8508 /* Hardware workaround: SNB B-Spec says:
8509 *
8510 * "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8511 * Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8512 * required."
8513 */
8514 crocus_emit_post_sync_nonzero_flush(batch);
8515 }
8516
8517 #if GFX_VER == 8
8518 if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8519 /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8520 *
8521 * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8522 * 'Write PS Depth Count' or 'Write Timestamp'."
8523 */
8524 if (!bo) {
8525 flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8526 post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8527 non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8528 bo = batch->ice->workaround_bo;
8529 offset = batch->ice->workaround_offset;
8530 }
8531 }
8532 #endif
8533
8534 #if GFX_VERx10 < 75
8535 if (flags & PIPE_CONTROL_DEPTH_STALL) {
8536 /* Project: PRE-HSW / Argument: Depth Stall
8537 *
8538 * "The following bits must be clear:
8539 * - Render Target Cache Flush Enable ([12] of DW1)
8540 * - Depth Cache Flush Enable ([0] of DW1)"
8541 */
8542 assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8543 PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8544 }
8545 #endif
8546 if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8547 /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8548 *
8549 * "This bit must be DISABLED for operations other than writing
8550 * PS_DEPTH_COUNT."
8551 *
8552 * This seems like nonsense. An Ivybridge workaround requires us to
8553 * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8554 * operation. Gen8+ requires us to emit depth stalls and depth cache
8555 * flushes together. So, it's hard to imagine this means anything other
8556 * than "we originally intended this to be used for PS_DEPTH_COUNT".
8557 *
8558 * We ignore the supposed restriction and do nothing.
8559 */
8560 }
8561
8562 if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8563 /* Project: PRE-HSW / Argument: Depth Cache Flush
8564 *
8565 * "Depth Stall must be clear ([13] of DW1)."
8566 */
8567 assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8568 }
8569
8570 if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8571 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8572 /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8573 *
8574 * "This bit must be DISABLED for End-of-pipe (Read) fences,
8575 * PS_DEPTH_COUNT or TIMESTAMP queries."
8576 *
8577 * TODO: Implement end-of-pipe checking.
8578 */
8579 assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8580 PIPE_CONTROL_WRITE_TIMESTAMP)));
8581 }
8582
8583 if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8584 /* From the PIPE_CONTROL instruction table, bit 1:
8585 *
8586 * "This bit is ignored if Depth Stall Enable is set.
8587 * Further, the render cache is not flushed even if Write Cache
8588 * Flush Enable bit is set."
8589 *
8590 * We assert that the caller doesn't do this combination, to try and
8591 * prevent mistakes. It shouldn't hurt the GPU, though.
8592 *
       * (On Gen11+ the "Stall at Pixel Scoreboard" and "Render Target
       * Flush" combo is actually required for BTI update workarounds, but
       * those platforms are handled by iris, not this driver.)
8596 */
8597 assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8598 PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8599 }
8600
8601 /* PIPE_CONTROL page workarounds ------------------------------------- */
8602
8603 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8604 /* From the PIPE_CONTROL page itself:
8605 *
8606 * "IVB, HSW, BDW
8607 * Restriction: Pipe_control with CS-stall bit set must be issued
8608 * before a pipe-control command that has the State Cache
8609 * Invalidate bit set."
8610 */
8611 flags |= PIPE_CONTROL_CS_STALL;
8612 }
8613
   if (GFX_VERx10 == 75) {
8615 /* From the PIPE_CONTROL page itself:
8616 *
8617 * "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8618 * Prior to programming a PIPECONTROL command with any of the RO
8619 * cache invalidation bit set, program a PIPECONTROL flush command
8620 * with “CS stall” bit and “HDC Flush” bit set."
8621 *
8622 * TODO: Actually implement this. What's an HDC Flush?
8623 */
8624 }
8625
8626 if (flags & PIPE_CONTROL_FLUSH_LLC) {
8627 /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8628 *
8629 * "Project: ALL
8630 * SW must always program Post-Sync Operation to "Write Immediate
8631 * Data" when Flush LLC is set."
8632 *
8633 * For now, we just require the caller to do it.
8634 */
8635 assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8636 }
8637
8638 /* "Post-Sync Operation" workarounds -------------------------------- */
8639
8640 /* Project: All / Argument: Global Snapshot Count Reset [19]
8641 *
8642 * "This bit must not be exercised on any product.
8643 * Requires stall bit ([20] of DW1) set."
8644 *
8645 * We don't use this, so we just assert that it isn't used. The
8646 * PIPE_CONTROL instruction page indicates that they intended this
8647 * as a debug feature and don't think it is useful in production,
8648 * but it may actually be usable, should we ever want to.
8649 */
8650 assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8651
8652 if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8653 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8654 /* Project: All / Arguments:
8655 *
8656 * - Generic Media State Clear [16]
8657 * - Indirect State Pointers Disable [16]
8658 *
8659 * "Requires stall bit ([20] of DW1) set."
8660 *
8661 * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8662 * State Clear) says:
8663 *
8664 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
8665 * programmed prior to programming a PIPECONTROL command with "Media
8666 * State Clear" set in GPGPU mode of operation"
8667 *
8668 * This is a subset of the earlier rule, so there's nothing to do.
8669 */
8670 flags |= PIPE_CONTROL_CS_STALL;
8671 }
8672
8673 if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8674 /* Project: All / Argument: Store Data Index
8675 *
8676 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8677 * than '0'."
8678 *
8679 * For now, we just assert that the caller does this. We might want to
8680 * automatically add a write to the workaround BO...
8681 */
8682 assert(non_lri_post_sync_flags != 0);
8683 }
8684
8685 if (flags & PIPE_CONTROL_SYNC_GFDT) {
8686 /* Project: All / Argument: Sync GFDT
8687 *
8688 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8689 * than '0' or 0x2520[13] must be set."
8690 *
8691 * For now, we just assert that the caller does this.
8692 */
8693 assert(non_lri_post_sync_flags != 0);
8694 }
8695
8696 if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8697 /* Project: SNB, IVB, HSW / Argument: TLB inv
8698 *
8699 * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8700 * must be set to something other than '0'."
8701 *
8702 * For now, we just assert that the caller does this.
8703 */
8704 assert(non_lri_post_sync_flags != 0);
8705 }
8706
8707 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8708 /* Project: IVB+ / Argument: TLB inv
8709 *
8710 * "Requires stall bit ([20] of DW1) set."
8711 *
8712 * Also, from the PIPE_CONTROL instruction table:
8713 *
8714 * "Project: SKL+
8715 * Post Sync Operation or CS stall must be set to ensure a TLB
8716 * invalidation occurs. Otherwise no cycle will occur to the TLB
8717 * cache to invalidate."
8718 *
8719 * This is not a subset of the earlier rule, so there's nothing to do.
8720 */
8721 flags |= PIPE_CONTROL_CS_STALL;
8722 }
8723 #if GFX_VER == 8
8724 if (IS_COMPUTE_PIPELINE(batch)) {
8725 if (post_sync_flags ||
8726 (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8727 PIPE_CONTROL_DEPTH_STALL |
8728 PIPE_CONTROL_RENDER_TARGET_FLUSH |
8729 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8730 PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8731 /* Project: BDW / Arguments:
8732 *
8733 * - LRI Post Sync Operation [23]
8734 * - Post Sync Op [15:14]
8735 * - Notify En [8]
8736 * - Depth Stall [13]
8737 * - Render Target Cache Flush [12]
8738 * - Depth Cache Flush [0]
8739 * - DC Flush Enable [5]
8740 *
8741 * "Requires stall bit ([20] of DW) set for all GPGPU and Media
8742 * Workloads."
8743 *
8744 * (The docs have separate table rows for each bit, with essentially
8745 * the same workaround text. We've combined them here.)
8746 */
8747 flags |= PIPE_CONTROL_CS_STALL;
8748
8749 /* Also, from the PIPE_CONTROL instruction table, bit 20:
8750 *
8751 * "Project: BDW
8752 * This bit must be always set when PIPE_CONTROL command is
8753 * programmed by GPGPU and MEDIA workloads, except for the cases
8754 * when only Read Only Cache Invalidation bits are set (State
8755 * Cache Invalidation Enable, Instruction cache Invalidation
8756 * Enable, Texture Cache Invalidation Enable, Constant Cache
8757 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
8758 * need not implemented when FF_DOP_CG is disable via "Fixed
8759 * Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8760 *
8761 * It sounds like we could avoid CS stalls in some cases, but we
8762 * don't currently bother. This list isn't exactly the list above,
8763 * either...
8764 */
8765 }
8766 }
8767 #endif
8768 /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8769 *
8770 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8771 * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8772 *
8773 * Note that the kernel does CS stalls between batches, so we only need
8774 * to count them within a batch. We currently naively count every 4, and
8775 * don't skip the ones with only read-cache-invalidate bits set. This
8776 * may or may not be a problem...
8777 */
8778 if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8779 if (flags & PIPE_CONTROL_CS_STALL) {
8780 /* If we're doing a CS stall, reset the counter and carry on. */
8781 batch->pipe_controls_since_last_cs_stall = 0;
8782 }
8783
8784 /* If this is the fourth pipe control without a CS stall, do one now. */
8785 if (++batch->pipe_controls_since_last_cs_stall == 4) {
8786 batch->pipe_controls_since_last_cs_stall = 0;
8787 flags |= PIPE_CONTROL_CS_STALL;
8788 }
8789 }
8790
8791 /* "Stall" workarounds ----------------------------------------------
8792 * These have to come after the earlier ones because we may have added
8793 * some additional CS stalls above.
8794 */
8795
8796 if (flags & PIPE_CONTROL_CS_STALL) {
8797 /* Project: PRE-SKL, VLV, CHV
8798 *
8799 * "[All Stepping][All SKUs]:
8800 *
8801 * One of the following must also be set:
8802 *
8803 * - Render Target Cache Flush Enable ([12] of DW1)
8804 * - Depth Cache Flush Enable ([0] of DW1)
8805 * - Stall at Pixel Scoreboard ([1] of DW1)
8806 * - Depth Stall ([13] of DW1)
8807 * - Post-Sync Operation ([13] of DW1)
8808 * - DC Flush Enable ([5] of DW1)"
8809 *
8810 * If we don't already have one of those bits set, we choose to add
8811 * "Stall at Pixel Scoreboard". Some of the other bits require a
8812 * CS stall as a workaround (see above), which would send us into
8813 * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
8814 * appears to be safe, so we choose that.
8815 */
8816 const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8817 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8818 PIPE_CONTROL_WRITE_IMMEDIATE |
8819 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8820 PIPE_CONTROL_WRITE_TIMESTAMP |
8821 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8822 PIPE_CONTROL_DEPTH_STALL |
8823 PIPE_CONTROL_DATA_CACHE_FLUSH;
8824 if (!(flags & wa_bits))
8825 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8826 }
8827
8828 /* Emit --------------------------------------------------------------- */
8829
8830 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8831 fprintf(stderr,
8832 " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8833 (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8834 (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8835 (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8836 (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8837 (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8838 (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8839 (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8840 (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8841 (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8842 (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8843 (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8844 (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8845 (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8846 (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8847 (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8848 (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8849 "SnapRes" : "",
8850 (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8851 "ISPDis" : "",
8852 (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8853 (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8854 (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8855 imm, reason);
8856 }
8857
8858 crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8859 #if GFX_VER >= 7
8860 pc.LRIPostSyncOperation = NoLRIOperation;
8861 pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8862 pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8863 #endif
8864 #if GFX_VER >= 6
8865 pc.StoreDataIndex = 0;
8866 pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8867 pc.GlobalSnapshotCountReset =
8868 flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8869 pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8870 pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8871 pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8872 pc.RenderTargetCacheFlushEnable =
8873 flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8874 pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8875 pc.StateCacheInvalidationEnable =
8876 flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8877 pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8878 pc.ConstantCacheInvalidationEnable =
8879 flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8880 #else
8881 pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8882 #endif
8883 pc.PostSyncOperation = flags_to_post_sync_op(flags);
8884 pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8885 pc.InstructionCacheInvalidateEnable =
8886 flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8887 pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8888 #if GFX_VER >= 5 || GFX_VERx10 == 45
8889 pc.IndirectStatePointersDisable =
8890 flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8891 #endif
8892 #if GFX_VER >= 6
8893 pc.TextureCacheInvalidationEnable =
8894 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8895 #elif GFX_VER == 5 || GFX_VERx10 == 45
8896 pc.TextureCacheFlushEnable =
8897 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8898 #endif
8899 pc.Address = ggtt_bo(bo, offset);
8900 if (GFX_VER < 7 && bo)
8901 pc.DestinationAddressType = DAT_GGTT;
8902 pc.ImmediateData = imm;
8903 }
8904 }
8905
8906 #if GFX_VER == 6
8907 void
8908 genX(crocus_upload_urb)(struct crocus_batch *batch,
8909 unsigned vs_size,
8910 bool gs_present,
8911 unsigned gs_size)
8912 {
8913 struct crocus_context *ice = batch->ice;
8914 int nr_vs_entries, nr_gs_entries;
8915 int total_urb_size = ice->urb.size * 1024; /* in bytes */
8916 const struct intel_device_info *devinfo = &batch->screen->devinfo;
8917
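   /* Gen6 URB space is allocated in 1024-bit (128-byte) rows; vs_size and
    * gs_size are entry sizes in those units, hence the "* 128" below.
    */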
8918 /* Calculate how many entries fit in each stage's section of the URB */
8919 if (gs_present) {
8920 nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8921 nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8922 } else {
8923 nr_vs_entries = total_urb_size / (vs_size * 128);
8924 nr_gs_entries = 0;
8925 }
8926
8927 /* Then clamp to the maximum allowed by the hardware */
8928 if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8929 nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8930
8931 if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8932 nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8933
8934 /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8935 ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8936 ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8937
8938 assert(ice->urb.nr_vs_entries >=
8939 devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8940 assert(ice->urb.nr_vs_entries % 4 == 0);
8941 assert(ice->urb.nr_gs_entries % 4 == 0);
8942 assert(vs_size <= 5);
8943 assert(gs_size <= 5);
8944
8945 crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8946 urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8947 urb.VSURBEntryAllocationSize = vs_size - 1;
8948
8949 urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8950 urb.GSURBEntryAllocationSize = gs_size - 1;
8951 };
8952 /* From the PRM Volume 2 part 1, section 1.4.7:
8953 *
8954 * Because of a urb corruption caused by allocating a previous gsunit’s
8955 * urb entry to vsunit software is required to send a "GS NULL
8956 * Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8957 * a dummy DRAW call before any case where VS will be taking over GS URB
8958 * space.
8959 *
8960 * It is not clear exactly what this means ("URB fence" is a command that
8961 * doesn't exist on Gen6). So for now we just do a full pipeline flush as
8962 * a workaround.
8963 */
8964 if (ice->urb.gs_present && !gs_present)
8965 crocus_emit_mi_flush(batch);
8966 ice->urb.gs_present = gs_present;
8967 }
8968 #endif
8969
8970 static void
8971 crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
8972 {
8973 }
8974
8975 static void
8976 crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
8977 struct crocus_bo *bo,
8978 uint32_t offset_in_bytes,
8979 uint32_t report_id)
8980 {
8981 #if GFX_VER >= 7
8982 crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
8983 mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
8984 mi_rpc.ReportID = report_id;
8985 }
8986 #endif
8987 }
8988
8989 /**
8990 * From the PRM, Volume 2a:
8991 *
8992 * "Indirect State Pointers Disable
8993 *
8994 * At the completion of the post-sync operation associated with this pipe
8995 * control packet, the indirect state pointers in the hardware are
8996 * considered invalid; the indirect pointers are not saved in the context.
8997 * If any new indirect state commands are executed in the command stream
8998 * while the pipe control is pending, the new indirect state commands are
8999 * preserved.
9000 *
9001 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9002 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9003 * commands are only considered as Indirect State Pointers. Once ISP is
9004 * issued in a context, SW must initialize by programming push constant
9005 * commands for all the shaders (at least to zero length) before attempting
9006 * any rendering operation for the same context."
9007 *
9008 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9009 * even though they point to a BO that has been already unreferenced at
9010 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
9012 * a BO should be pointing to the scratch page). But on CNL, it is
9013 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9014 * instruction.
9015 *
9016 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9017 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9018 * context restore, so the mentioned hang doesn't happen. However,
9019 * software must program push constant commands for all stages prior to
9020 * rendering anything, so we flag them as dirty.
9021 *
9022 * Finally, we also make sure to stall at pixel scoreboard to make sure the
9023 * constants have been loaded into the EUs prior to disable the push constants
9024 * so that it doesn't hang a previous 3DPRIMITIVE.
9025 */
9026 #if GFX_VER >= 7
9027 static void
9028 gen7_emit_isp_disable(struct crocus_batch *batch)
9029 {
9030 crocus_emit_raw_pipe_control(batch, "isp disable",
9031 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9032 PIPE_CONTROL_CS_STALL,
9033 NULL, 0, 0);
9034 crocus_emit_raw_pipe_control(batch, "isp disable",
9035 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9036 PIPE_CONTROL_CS_STALL,
9037 NULL, 0, 0);
9038
9039 struct crocus_context *ice = batch->ice;
9040 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9041 CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9042 CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9043 CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9044 CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9045 }
9046 #endif
9047
9048 #if GFX_VER >= 7
9049 static void
9050 crocus_state_finish_batch(struct crocus_batch *batch)
9051 {
9052 #if GFX_VERx10 == 75
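   /* Haswell workaround: flush, then re-emit the color calc state pointer
    * at the end of every render batch (presumably so the hardware doesn't
    * carry a stale pointer across the batch boundary).
    */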
9053 if (batch->name == CROCUS_BATCH_RENDER) {
9054 crocus_emit_mi_flush(batch);
9055 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
9056 ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
9057 }
9058
9059 crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
9060 PIPE_CONTROL_CS_STALL);
9061 }
9062 #endif
9063 gen7_emit_isp_disable(batch);
9064 }
9065 #endif
9066
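/**
 * Mark enough state dirty that the next draw in a fresh batch re-emits
 * everything which lives in, or points into, the old batch's state space.
 */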
9067 static void
9068 crocus_batch_reset_dirty(struct crocus_batch *batch)
9069 {
   /* Unreference any index buffer so it gets re-emitted. */
9071 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9072
   /* For GEN4/5, we need to re-emit anything in the state batch that points
    * to anything else in the state batch, as the old state batch will no
    * longer be available.
    */
9076 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9077 CROCUS_DIRTY_COLOR_CALC_STATE;
9078
9079 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9080
9081 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9082 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9083 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9084 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9085 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9086 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9087 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9088
9089 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9090 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9091 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9093 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9095
9096 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9097 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9098 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9099 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9100
9101 #if GFX_VER >= 6
9102 /* SCISSOR_STATE */
9103 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9104 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9105 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9106
9107 #endif
9108 #if GFX_VER <= 5
9109 /* dirty the SF state on gen4/5 */
9110 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9111 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9112 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9113 batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9114 #endif
9115 #if GFX_VER >= 7
9116 /* Streamout dirty */
9117 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9118 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9119 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9120 #endif
9121 }
9122
9123 #if GFX_VERx10 == 75
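/* Haswell-only helper: expose the current rasterizer CSO's gallium state. */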
struct pipe_rasterizer_state *
crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
#endif

#if GFX_VER >= 6
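/**
 * Record the vertex stride for each stream output buffer. Gallium
 * supplies stream-output strides in DWords, so convert them to bytes.
 */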
static void
update_so_strides(struct crocus_context *ice, uint16_t *strides)
{
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
      if (so)
         so->stride = strides[i] * sizeof(uint32_t);
   }
}
#endif

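/**
 * Accumulate per-coordinate (s/t/r) masks of samplers that use
 * PIPE_TEX_WRAP_CLAMP with non-nearest filtering. Pre-Gen8 hardware
 * can't do GL_CLAMP natively in that case, so the affected texture
 * coordinates are clamped in the shader instead, driven by these masks.
 */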
static void
crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
                       int s,
                       uint32_t *clamp_mask)
{
#if GFX_VER < 8
   if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
       samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
      if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
         clamp_mask[0] |= (1 << s);
      if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
         clamp_mask[1] |= (1 << s);
      if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
         clamp_mask[2] |= (1 << s);
   }
#endif
}

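/**
 * Gallium's frontend no-op hook. crocus_batch_prepare_noop() returns
 * true when the no-op setting actually changed, in which case all
 * render (and, if present, compute) state is flagged dirty so the next
 * batch is fully re-emitted.
 */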
static void
crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
   }

   if (ice->batch_count == 1)
      return;

   if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
   }
}

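/**
 * Fill in the per-generation vtable. The asserts guarantee that this
 * GENX variant is only installed on the generation it was built for.
 */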
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}

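/**
 * Install the pipe_context state hooks and set up initial context state.
 */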
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

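   /* Start with everything dirty so the first draw re-emits all state. */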
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   ice->state.prim_mode = MESA_PRIM_COUNT;
   ice->state.reduced_prim_mode = MESA_PRIM_COUNT;
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions (min > max
    * denotes an empty scissor).
    */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}