1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #ifndef BLORP_GENX_EXEC_H
25 #define BLORP_GENX_EXEC_H
26
27 #include "blorp_priv.h"
28 #include "dev/intel_device_info.h"
29 #include "common/intel_sample_positions.h"
30 #include "common/intel_l3_config.h"
31 #include "genxml/gen_macros.h"
32
33 /**
34 * This file provides the blorp pipeline setup and execution functionality.
35 * It defines the following function:
36 *
37 * static void
38 * blorp_exec(struct blorp_context *blorp, void *batch_data,
39 * const struct blorp_params *params);
40 *
41 * It is the job of whoever includes this header to wrap this in something
42 * to get an externally visible symbol.
43 *
44 * In order for the blorp_exec function to work, the driver must provide
45 * implementations of the following static helper functions.
46 */
47
/* Driver hook: make room for n dwords of commands and return where they
 * should be written.  The emit macros in this file treat a NULL return as
 * "skip this command".
 */
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

/* Driver hook called by _blorp_combine_address() for every address that has
 * a non-NULL buffer.  Its return value is handed back to the genxml pack
 * code as the combined address.
 */
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta);

/* Driver hooks; not called in the part of the file reviewed here.
 * NOTE(review): presumably they bracket a blorp operation for driver-side
 * measurement -- confirm against the callers.
 */
static void
blorp_measure_start(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_measure_end(struct blorp_batch *batch,
                  const struct blorp_params *params);

/* Driver hook used by blorp_emit_dynamic(): allocate `size` bytes with the
 * requested alignment and return a CPU pointer that the state is packed
 * into (NULL makes the macro skip the state).  Callers read *offset
 * afterwards and program it into state-pointer packets.
 */
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Same parameter shape as blorp_alloc_dynamic_state(); not called in the
 * part of the file reviewed here.
 * NOTE(review): presumably allocates from a general-state area instead --
 * confirm against the driver implementations.
 */
UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Driver hook: allocate `size` bytes of vertex-buffer storage.  The returned
 * CPU pointer is written with memcpy by the vertex-data helpers below, and
 * *addr is what gets programmed into VERTEX_BUFFER_STATE.
 */
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr);
/* Driver hook called by blorp_emit_vertex_buffers() with the addresses and
 * sizes of all vertex buffers, just before 3DSTATE_VERTEX_BUFFERS is
 * emitted.
 */
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           uint32_t *sizes,
                                           unsigned num_vbs);

/* Driver hook: address used as the post-sync write target of the
 * workaround PIPE_CONTROL emitted in emit_urb_config() when
 * GFX_VERx10 == 70.
 */
UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch);

/* Driver hook; not called in the part of the file reviewed here.
 * NOTE(review): judging by the parameters it allocates a binding table with
 * num_entries surface states and reports their offsets/mappings -- confirm.
 */
static void
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps);

/* Driver hook; not called in the part of the file reviewed here. */
static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset);

/* Driver hook called after the CPU has finished writing `size` bytes at
 * `start` (vertex data and packed dynamic state in this file).
 */
static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

/* Driver hook; not called in the part of the file reviewed here. */
static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta);

/* Driver hook; not called in the part of the file reviewed here. */
static uint64_t
blorp_get_surface_address(struct blorp_batch *batch,
                          struct blorp_address address);

/* Only required from the driver on gfx7 through gfx9; not called in the part
 * of the file reviewed here.
 */
#if GFX_VER >= 7 && GFX_VER < 10
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

/* On gfx7+ the driver supplies the L3 configuration that emit_urb_config()
 * forwards to intel_get_urb_config(); on older hardware the driver programs
 * the URB itself through blorp_emit_urb_config().
 */
#if GFX_VER >= 7
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch);
# else
static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      unsigned vs_entry_size, unsigned sf_entry_size);
#endif

/* Forward declaration; the definition is not part of the lines reviewed
 * here.
 */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);
125
126 /***** BEGIN blorp_exec implementation ******/
127
128 static uint64_t
_blorp_combine_address(struct blorp_batch * batch,void * location,struct blorp_address address,uint32_t delta)129 _blorp_combine_address(struct blorp_batch *batch, void *location,
130 struct blorp_address address, uint32_t delta)
131 {
132 if (address.buffer == NULL) {
133 return address.offset + delta;
134 } else {
135 return blorp_emit_reloc(batch, location, address, delta);
136 }
137 }
138
139 #define __gen_address_type struct blorp_address
140 #define __gen_user_data struct blorp_batch
141 #define __gen_combine_address _blorp_combine_address
142
143 #include "genxml/genX_pack.h"
144
/* Token-pasting helpers that map a genxml command or state name to its
 * generated _length, _length_bias, _header and _pack companions.
 */
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command.  Usage:
 *
 *    blorp_emit(batch, GENX(SOME_COMMAND), name) {
 *       name.Field = value;
 *    }
 *
 * The for-loop declares `name`, initialised with the command header, and
 * reserves _blorp_cmd_length(cmd) dwords through blorp_emit_dwords().  The
 * braced body runs exactly once to fill in fields; the loop's increment
 * expression then packs `name` into the reserved space and clears _dst so
 * the loop terminates.  When blorp_emit_dwords() returns NULL neither the
 * body nor the pack function runs.
 */
#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)
156
/* Emit a variable-length command occupying `n` dwords in total.
 *
 * The command header (with DWordLength derived from `n` and the command's
 * length bias, plus any designated initializers passed as extra arguments)
 * is packed at the start of the reserved space.  The expression evaluates to
 * a pointer to the dword right after the first one, where the caller packs
 * the array payload, or to NULL when blorp_emit_dwords() could not provide
 * space.
 *
 * `n` is evaluated exactly once and used through a local, so arguments with
 * side effects or low-precedence operators (e.g. `cond ? a : b`) are safe.
 */
#define blorp_emitn(batch, cmd, n, ...) ({                       \
      const unsigned _ndw = (n);                                 \
      uint32_t *_dw = blorp_emit_dwords(batch, _ndw);            \
      if (_dw) {                                                 \
         struct cmd template = {                                 \
            _blorp_cmd_header(cmd),                              \
            .DWordLength = _ndw - _blorp_cmd_length_bias(cmd),   \
            __VA_ARGS__                                          \
         };                                                      \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);            \
      }                                                          \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */          \
   })
169
/* Evaluates to a `struct S` value that has been zero-filled with memset. */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Dynamic-state counterpart of blorp_emit().  Usage:
 *
 *    blorp_emit_dynamic(batch, GENX(SOME_STATE), name, align, &offset) {
 *       name.Field = value;
 *    }
 *
 * `name` starts out zeroed (there is no header for state structures) and
 * the storage comes from blorp_alloc_dynamic_state(), sized in bytes as
 * _blorp_cmd_length(state) * 4.  After the body has run once, the state is
 * packed into that storage and the written range is handed to
 * blorp_flush_range().  When the allocation returns NULL the body, the pack
 * and the flush are all skipped.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset)           \
   for (struct state name = STRUCT_ZERO(state),                         \
        *_dst = blorp_alloc_dynamic_state(batch,                        \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset);               \
        __builtin_expect(_dst != NULL, 1);                              \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
        _dst = NULL)
181
182 /* 3DSTATE_URB
183 * 3DSTATE_URB_VS
184 * 3DSTATE_URB_HS
185 * 3DSTATE_URB_DS
186 * 3DSTATE_URB_GS
187 *
 * Assign the entire URB to the VS. Even though the VS is disabled, URB space
189 * is still needed because the clipper loads the VUE's from the URB. From
190 * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
191 * Dword 1.15:0 "VS Number of URB Entries":
192 * This field is always used (even if VS Function Enable is DISABLED).
193 *
194 * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
195 * safely ignore it because this batch contains only one draw call.
196 * Because of URB corruption caused by allocating a previous GS unit
197 * URB entry to the VS unit, software is required to send a “GS NULL
198 * Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
199 * plus a dummy DRAW call before any case where VS will be taking over
200 * GS URB space.
201 *
 * If the 3DSTATE_URB_VS is emitted, then the others must be also.
203 * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
204 *
205 * 3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
206 * programmed in order for the programming of this state to be
207 * valid.
208 */
209 static void
emit_urb_config(struct blorp_batch * batch,const struct blorp_params * params,UNUSED enum intel_urb_deref_block_size * deref_block_size)210 emit_urb_config(struct blorp_batch *batch,
211 const struct blorp_params *params,
212 UNUSED enum intel_urb_deref_block_size *deref_block_size)
213 {
214 /* Once vertex fetcher has written full VUE entries with complete
215 * header the space requirement is as follows per vertex (in bytes):
216 *
217 * Header Position Program constants
218 * +--------+------------+-------------------+
219 * | 16 | 16 | n x 16 |
220 * +--------+------------+-------------------+
221 *
222 * where 'n' stands for number of varying inputs expressed as vec4s.
223 */
224 const unsigned num_varyings =
225 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
226 const unsigned total_needed = 16 + 16 + num_varyings * 16;
227
228 /* The URB size is expressed in units of 64 bytes (512 bits) */
229 const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);
230
231 ASSERTED const unsigned sf_entry_size =
232 params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
233
234 #if GFX_VER >= 7
235 assert(sf_entry_size == 0);
236 const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 };
237
238 unsigned entries[4], start[4];
239 bool constrained;
240 intel_get_urb_config(batch->blorp->compiler->devinfo,
241 blorp_get_l3_config(batch),
242 false, false, entry_size,
243 entries, start, deref_block_size, &constrained);
244
245 #if GFX_VERx10 == 70
246 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
247 *
248 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
249 * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
250 * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
251 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL
252 * needs to be sent before any combination of VS associated 3DSTATE."
253 */
254 blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
255 pc.DepthStallEnable = true;
256 pc.PostSyncOperation = WriteImmediateData;
257 pc.Address = blorp_get_workaround_address(batch);
258 }
259 #endif
260
261 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
262 blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
263 urb._3DCommandSubOpcode += i;
264 urb.VSURBStartingAddress = start[i];
265 urb.VSURBEntryAllocationSize = entry_size[i] - 1;
266 urb.VSNumberofURBEntries = entries[i];
267 }
268 }
269 #else /* GFX_VER < 7 */
270 blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
271 #endif
272 }
273
#if GFX_VER >= 7
/* Forward declaration; the definition is not part of the lines reviewed
 * here.  blorp_emit_input_varying_data() uses it to have the GPU copy `size`
 * bytes from `src` to `dst` (the indirect clear color into the vertex
 * buffer) before the draw.
 */
static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size);
#endif
281
282 static void
blorp_emit_vertex_data(struct blorp_batch * batch,const struct blorp_params * params,struct blorp_address * addr,uint32_t * size)283 blorp_emit_vertex_data(struct blorp_batch *batch,
284 const struct blorp_params *params,
285 struct blorp_address *addr,
286 uint32_t *size)
287 {
288 const float vertices[] = {
289 /* v0 */ (float)params->x1, (float)params->y1, params->z,
290 /* v1 */ (float)params->x0, (float)params->y1, params->z,
291 /* v2 */ (float)params->x0, (float)params->y0, params->z,
292 };
293
294 void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
295 memcpy(data, vertices, sizeof(vertices));
296 *size = sizeof(vertices);
297 blorp_flush_range(batch, data, *size);
298 }
299
300 static void
blorp_emit_input_varying_data(struct blorp_batch * batch,const struct blorp_params * params,struct blorp_address * addr,uint32_t * size)301 blorp_emit_input_varying_data(struct blorp_batch *batch,
302 const struct blorp_params *params,
303 struct blorp_address *addr,
304 uint32_t *size)
305 {
306 const unsigned vec4_size_in_bytes = 4 * sizeof(float);
307 const unsigned max_num_varyings =
308 DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
309 const unsigned num_varyings =
310 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
311
312 *size = 16 + num_varyings * vec4_size_in_bytes;
313
314 const uint32_t *const inputs_src = (const uint32_t *)¶ms->wm_inputs;
315 void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
316 uint32_t *inputs = data;
317
318 /* Copy in the VS inputs */
319 assert(sizeof(params->vs_inputs) == 16);
320 memcpy(inputs, ¶ms->vs_inputs, sizeof(params->vs_inputs));
321 inputs += 4;
322
323 if (params->wm_prog_data) {
324 /* Walk over the attribute slots, determine if the attribute is used by
325 * the program and when necessary copy the values from the input storage
326 * to the vertex data buffer.
327 */
328 for (unsigned i = 0; i < max_num_varyings; i++) {
329 const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
330
331 const int input_index = params->wm_prog_data->urb_setup[attr];
332 if (input_index < 0)
333 continue;
334
335 memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
336
337 inputs += 4;
338 }
339 }
340
341 blorp_flush_range(batch, data, *size);
342
343 if (params->dst_clear_color_as_input) {
344 #if GFX_VER >= 7
345 /* In this case, the clear color isn't known statically and instead
346 * comes in through an indirect which we have to copy into the vertex
347 * buffer before we execute the 3DPRIMITIVE. We already copied the
348 * value of params->wm_inputs.clear_color into the vertex buffer in the
349 * loop above. Now we emit code to stomp it from the GPU with the
350 * actual clear color value.
351 */
352 assert(num_varyings == 1);
353
354 /* The clear color is the first thing after the header */
355 struct blorp_address clear_color_input_addr = *addr;
356 clear_color_input_addr.offset += 16;
357
358 const unsigned clear_color_size =
359 GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
360 blorp_emit_memcpy(batch, clear_color_input_addr,
361 params->dst.clear_color_addr,
362 clear_color_size);
363 #else
364 unreachable("MCS partial resolve is not a thing on SNB and earlier");
365 #endif
366 }
367 }
368
369 static void
blorp_fill_vertex_buffer_state(struct GENX (VERTEX_BUFFER_STATE)* vb,unsigned idx,struct blorp_address addr,uint32_t size,uint32_t stride)370 blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
371 unsigned idx,
372 struct blorp_address addr, uint32_t size,
373 uint32_t stride)
374 {
375 vb[idx].VertexBufferIndex = idx;
376 vb[idx].BufferStartingAddress = addr;
377 vb[idx].BufferPitch = stride;
378
379 #if GFX_VER >= 6
380 vb[idx].MOCS = addr.mocs;
381 #endif
382
383 #if GFX_VER >= 7
384 vb[idx].AddressModifyEnable = true;
385 #endif
386
387 #if GFX_VER >= 8
388 vb[idx].BufferSize = size;
389 #elif GFX_VER >= 5
390 vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
391 vb[idx].EndAddress = vb[idx].BufferStartingAddress;
392 vb[idx].EndAddress.offset += size - 1;
393 #elif GFX_VER == 4
394 vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
395 vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
396 #endif
397
398 #if GFX_VER >= 12
399 vb[idx].L3BypassDisable = true;
400 #endif
401 }
402
403 static void
blorp_emit_vertex_buffers(struct blorp_batch * batch,const struct blorp_params * params)404 blorp_emit_vertex_buffers(struct blorp_batch *batch,
405 const struct blorp_params *params)
406 {
407 struct GENX(VERTEX_BUFFER_STATE) vb[3];
408 uint32_t num_vbs = 2;
409 memset(vb, 0, sizeof(vb));
410
411 struct blorp_address addrs[2] = {};
412 uint32_t sizes[2];
413 blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
414 blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
415 3 * sizeof(float));
416
417 blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
418 blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);
419
420 blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
421
422 const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
423 uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
424 if (!dw)
425 return;
426
427 for (unsigned i = 0; i < num_vbs; i++) {
428 GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
429 dw += GENX(VERTEX_BUFFER_STATE_length);
430 }
431 }
432
433 static void
blorp_emit_vertex_elements(struct blorp_batch * batch,const struct blorp_params * params)434 blorp_emit_vertex_elements(struct blorp_batch *batch,
435 const struct blorp_params *params)
436 {
437 const unsigned num_varyings =
438 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
439 bool need_ndc = batch->blorp->compiler->devinfo->ver <= 5;
440 const unsigned num_elements = 2 + need_ndc + num_varyings;
441
442 struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
443 memset(ve, 0, num_elements * sizeof(*ve));
444
445 /* Setup VBO for the rectangle primitive..
446 *
447 * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
448 * vertices. The vertices reside in screen space with DirectX
449 * coordinates (that is, (0, 0) is the upper left corner).
450 *
451 * v2 ------ implied
452 * | |
453 * | |
454 * v1 ----- v0
455 *
456 * Since the VS is disabled, the clipper loads each VUE directly from
457 * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
458 * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
459 * dw0: Reserved, MBZ.
460 * dw1: Render Target Array Index. Below vertex fetcher gets programmed
461 * to assign this with primitive instance identifier which will be
462 * used for layered clears. All other renders have only one instance
463 * and therefore the value will be effectively zero.
464 * dw2: Viewport Index. The HiZ op disables viewport mapping and
465 * scissoring, so set the dword to 0.
466 * dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
467 * so set the dword to 0.
468 * dw4: Vertex Position X.
469 * dw5: Vertex Position Y.
470 * dw6: Vertex Position Z.
471 * dw7: Vertex Position W.
472 *
473 * dw8: Flat vertex input 0
474 * dw9: Flat vertex input 1
475 * ...
476 * dwn: Flat vertex input n - 8
477 *
478 * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
479 * "Vertex URB Entry (VUE) Formats".
480 *
481 * Only vertex position X and Y are going to be variable, Z is fixed to
482 * zero and W to one. Header words dw0,2,3 are zero. There is no need to
483 * include the fixed values in the vertex buffer. Vertex fetcher can be
484 * instructed to fill vertex elements with constant values of one and zero
485 * instead of reading them from the buffer.
486 * Flat inputs are program constants that are not interpolated. Moreover
487 * their values will be the same between vertices.
488 *
489 * See the vertex element setup below.
490 */
491 unsigned slot = 0;
492
493 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
494 .VertexBufferIndex = 1,
495 .Valid = true,
496 .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
497 .SourceElementOffset = 0,
498 .Component0Control = VFCOMP_STORE_SRC,
499
500 /* From Gfx8 onwards hardware is no more instructed to overwrite
501 * components using an element specifier. Instead one has separate
502 * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
503 */
504 #if GFX_VER >= 8
505 .Component1Control = VFCOMP_STORE_0,
506 #elif GFX_VER >= 5
507 .Component1Control = VFCOMP_STORE_IID,
508 #else
509 .Component1Control = VFCOMP_STORE_0,
510 #endif
511 .Component2Control = VFCOMP_STORE_0,
512 .Component3Control = VFCOMP_STORE_0,
513 #if GFX_VER <= 5
514 .DestinationElementOffset = slot * 4,
515 #endif
516 };
517 slot++;
518
519 #if GFX_VER <= 5
520 /* On Iron Lake and earlier, a native device coordinates version of the
521 * position goes right after the normal VUE header and before position.
522 * Since w == 1 for all of our coordinates, this is just a copy of the
523 * position.
524 */
525 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
526 .VertexBufferIndex = 0,
527 .Valid = true,
528 .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
529 .SourceElementOffset = 0,
530 .Component0Control = VFCOMP_STORE_SRC,
531 .Component1Control = VFCOMP_STORE_SRC,
532 .Component2Control = VFCOMP_STORE_SRC,
533 .Component3Control = VFCOMP_STORE_1_FP,
534 .DestinationElementOffset = slot * 4,
535 };
536 slot++;
537 #endif
538
539 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
540 .VertexBufferIndex = 0,
541 .Valid = true,
542 .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
543 .SourceElementOffset = 0,
544 .Component0Control = VFCOMP_STORE_SRC,
545 .Component1Control = VFCOMP_STORE_SRC,
546 .Component2Control = VFCOMP_STORE_SRC,
547 .Component3Control = VFCOMP_STORE_1_FP,
548 #if GFX_VER <= 5
549 .DestinationElementOffset = slot * 4,
550 #endif
551 };
552 slot++;
553
554 for (unsigned i = 0; i < num_varyings; ++i) {
555 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
556 .VertexBufferIndex = 1,
557 .Valid = true,
558 .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
559 .SourceElementOffset = 16 + i * 4 * sizeof(float),
560 .Component0Control = VFCOMP_STORE_SRC,
561 .Component1Control = VFCOMP_STORE_SRC,
562 .Component2Control = VFCOMP_STORE_SRC,
563 .Component3Control = VFCOMP_STORE_SRC,
564 #if GFX_VER <= 5
565 .DestinationElementOffset = slot * 4,
566 #endif
567 };
568 slot++;
569 }
570
571 const unsigned num_dwords =
572 1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
573 uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
574 if (!dw)
575 return;
576
577 for (unsigned i = 0; i < num_elements; i++) {
578 GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
579 dw += GENX(VERTEX_ELEMENT_STATE_length);
580 }
581
582 blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
583 vf.StatisticsEnable = false;
584 }
585
586 #if GFX_VER >= 8
587 /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
588 * primitive instance identifier. This is used for layered clears.
589 */
590 blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
591 sgvs.InstanceIDEnable = true;
592 sgvs.InstanceIDComponentNumber = COMP_1;
593 sgvs.InstanceIDElementOffset = 0;
594 }
595
596 for (unsigned i = 0; i < num_elements; i++) {
597 blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
598 vf.VertexElementIndex = i;
599 vf.InstancingEnable = false;
600 }
601 }
602
603 blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
604 topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
605 }
606 #endif
607 }
608
609 /* 3DSTATE_VIEWPORT_STATE_POINTERS */
610 static uint32_t
blorp_emit_cc_viewport(struct blorp_batch * batch)611 blorp_emit_cc_viewport(struct blorp_batch *batch)
612 {
613 uint32_t cc_vp_offset;
614 blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
615 vp.MinimumDepth = 0.0;
616 vp.MaximumDepth = 1.0;
617 }
618
619 #if GFX_VER >= 7
620 blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
621 vsp.CCViewportPointer = cc_vp_offset;
622 }
623 #elif GFX_VER == 6
624 blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
625 vsp.CCViewportStateChange = true;
626 vsp.PointertoCC_VIEWPORT = cc_vp_offset;
627 }
628 #endif
629
630 return cc_vp_offset;
631 }
632
633 static uint32_t
blorp_emit_sampler_state(struct blorp_batch * batch)634 blorp_emit_sampler_state(struct blorp_batch *batch)
635 {
636 uint32_t offset;
637 blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
638 sampler.MipModeFilter = MIPFILTER_NONE;
639 sampler.MagModeFilter = MAPFILTER_LINEAR;
640 sampler.MinModeFilter = MAPFILTER_LINEAR;
641 sampler.MinLOD = 0;
642 sampler.MaxLOD = 0;
643 sampler.TCXAddressControlMode = TCM_CLAMP;
644 sampler.TCYAddressControlMode = TCM_CLAMP;
645 sampler.TCZAddressControlMode = TCM_CLAMP;
646 sampler.MaximumAnisotropy = RATIO21;
647 sampler.RAddressMinFilterRoundingEnable = true;
648 sampler.RAddressMagFilterRoundingEnable = true;
649 sampler.VAddressMinFilterRoundingEnable = true;
650 sampler.VAddressMagFilterRoundingEnable = true;
651 sampler.UAddressMinFilterRoundingEnable = true;
652 sampler.UAddressMagFilterRoundingEnable = true;
653 #if GFX_VER > 6
654 sampler.NonnormalizedCoordinateEnable = true;
655 #endif
656 }
657
658 return offset;
659 }
660
661 UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch * batch)662 blorp_emit_sampler_state_ps(struct blorp_batch *batch)
663 {
664 uint32_t offset = blorp_emit_sampler_state(batch);
665
666 #if GFX_VER >= 7
667 blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
668 ssp.PointertoPSSamplerState = offset;
669 }
670 #elif GFX_VER == 6
671 blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
672 ssp.VSSamplerStateChange = true;
673 ssp.GSSamplerStateChange = true;
674 ssp.PSSamplerStateChange = true;
675 ssp.PointertoPSSamplerState = offset;
676 }
677 #endif
678
679 return offset;
680 }
681
682 /* What follows is the code for setting up a "pipeline" on Sandy Bridge and
683 * later hardware. This file will be included by i965 for gfx4-5 as well, so
684 * this code is guarded by GFX_VER >= 6.
685 */
686 #if GFX_VER >= 6
687
688 static void
blorp_emit_vs_config(struct blorp_batch * batch,const struct blorp_params * params)689 blorp_emit_vs_config(struct blorp_batch *batch,
690 const struct blorp_params *params)
691 {
692 struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
693 assert(!vs_prog_data || GFX_VER < 11 ||
694 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
695
696 blorp_emit(batch, GENX(3DSTATE_VS), vs) {
697 if (vs_prog_data) {
698 vs.Enable = true;
699
700 vs.KernelStartPointer = params->vs_prog_kernel;
701
702 vs.DispatchGRFStartRegisterForURBData =
703 vs_prog_data->base.base.dispatch_grf_start_reg;
704 vs.VertexURBEntryReadLength =
705 vs_prog_data->base.urb_read_length;
706 vs.VertexURBEntryReadOffset = 0;
707
708 vs.MaximumNumberofThreads =
709 batch->blorp->isl_dev->info->max_vs_threads - 1;
710
711 #if GFX_VER >= 8
712 vs.SIMD8DispatchEnable =
713 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
714 #endif
715 }
716 }
717 }
718
719 static void
blorp_emit_sf_config(struct blorp_batch * batch,const struct blorp_params * params,UNUSED enum intel_urb_deref_block_size urb_deref_block_size)720 blorp_emit_sf_config(struct blorp_batch *batch,
721 const struct blorp_params *params,
722 UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
723 {
724 const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
725
726 /* 3DSTATE_SF
727 *
728 * Disable ViewportTransformEnable (dw2.1)
729 *
730 * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
731 * Primitives Overview":
732 * RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
733 * use of screen- space coordinates).
734 *
735 * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
736 * and BackFaceFillMode (dw2.5:6) to SOLID(0).
737 *
738 * From the Sandy Bridge PRM, Volume 2, Part 1, Section
739 * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
740 * SOLID: Any triangle or rectangle object found to be front-facing
741 * is rendered as a solid object. This setting is required when
742 * (rendering rectangle (RECTLIST) objects.
743 */
744
745 #if GFX_VER >= 8
746
747 blorp_emit(batch, GENX(3DSTATE_SF), sf) {
748 #if GFX_VER >= 12
749 sf.DerefBlockSize = urb_deref_block_size;
750 #endif
751 }
752
753 blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
754 raster.CullMode = CULLMODE_NONE;
755 }
756
757 blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
758 sbe.VertexURBEntryReadOffset = 1;
759 if (prog_data) {
760 sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
761 sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
762 sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
763 } else {
764 sbe.NumberofSFOutputAttributes = 0;
765 sbe.VertexURBEntryReadLength = 1;
766 }
767 sbe.ForceVertexURBEntryReadLength = true;
768 sbe.ForceVertexURBEntryReadOffset = true;
769
770 #if GFX_VER >= 9
771 for (unsigned i = 0; i < 32; i++)
772 sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
773 #endif
774 }
775
776 #elif GFX_VER >= 7
777
778 blorp_emit(batch, GENX(3DSTATE_SF), sf) {
779 sf.FrontFaceFillMode = FILL_MODE_SOLID;
780 sf.BackFaceFillMode = FILL_MODE_SOLID;
781
782 sf.MultisampleRasterizationMode = params->num_samples > 1 ?
783 MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
784
785 #if GFX_VER == 7
786 sf.DepthBufferSurfaceFormat = params->depth_format;
787 #endif
788 }
789
790 blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
791 sbe.VertexURBEntryReadOffset = 1;
792 if (prog_data) {
793 sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
794 sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
795 sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
796 } else {
797 sbe.NumberofSFOutputAttributes = 0;
798 sbe.VertexURBEntryReadLength = 1;
799 }
800 }
801
802 #else /* GFX_VER <= 6 */
803
804 blorp_emit(batch, GENX(3DSTATE_SF), sf) {
805 sf.FrontFaceFillMode = FILL_MODE_SOLID;
806 sf.BackFaceFillMode = FILL_MODE_SOLID;
807
808 sf.MultisampleRasterizationMode = params->num_samples > 1 ?
809 MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
810
811 sf.VertexURBEntryReadOffset = 1;
812 if (prog_data) {
813 sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
814 sf.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
815 sf.ConstantInterpolationEnable = prog_data->flat_inputs;
816 } else {
817 sf.NumberofSFOutputAttributes = 0;
818 sf.VertexURBEntryReadLength = 1;
819 }
820 }
821
822 #endif /* GFX_VER */
823 }
824
825 static void
blorp_emit_ps_config(struct blorp_batch * batch,const struct blorp_params * params)826 blorp_emit_ps_config(struct blorp_batch *batch,
827 const struct blorp_params *params)
828 {
829 const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
830
831 /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
832 * nonzero to prevent the GPU from hanging. While the documentation doesn't
833 * mention this explicitly, it notes that the valid range for the field is
834 * [1,39] = [2,40] threads, which excludes zero.
835 *
836 * To be safe (and to minimize extraneous code) we go ahead and fully
837 * configure the WM state whether or not there is a WM program.
838 */
839
840 #if GFX_VER >= 8
841
842 blorp_emit(batch, GENX(3DSTATE_WM), wm);
843
844 blorp_emit(batch, GENX(3DSTATE_PS), ps) {
845 if (params->src.enabled) {
846 ps.SamplerCount = 1; /* Up to 4 samplers */
847 ps.BindingTableEntryCount = 2;
848 } else {
849 ps.BindingTableEntryCount = 1;
850 }
851
852 /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
853 if (GFX_VER == 11)
854 ps.SamplerCount = 0;
855
856 if (prog_data) {
857 ps._8PixelDispatchEnable = prog_data->dispatch_8;
858 ps._16PixelDispatchEnable = prog_data->dispatch_16;
859 ps._32PixelDispatchEnable = prog_data->dispatch_32;
860
861 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
862 *
863 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
864 * Dispatch must not be enabled for PER_PIXEL dispatch mode."
865 *
866 * Since 16x MSAA is first introduced on SKL, we don't need to apply
867 * the workaround on any older hardware.
868 */
869 if (GFX_VER >= 9 && !prog_data->persample_dispatch &&
870 params->num_samples == 16) {
871 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
872 ps._32PixelDispatchEnable = false;
873 }
874
875 ps.DispatchGRFStartRegisterForConstantSetupData0 =
876 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
877 ps.DispatchGRFStartRegisterForConstantSetupData1 =
878 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
879 ps.DispatchGRFStartRegisterForConstantSetupData2 =
880 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
881
882 ps.KernelStartPointer0 = params->wm_prog_kernel +
883 brw_wm_prog_data_prog_offset(prog_data, ps, 0);
884 ps.KernelStartPointer1 = params->wm_prog_kernel +
885 brw_wm_prog_data_prog_offset(prog_data, ps, 1);
886 ps.KernelStartPointer2 = params->wm_prog_kernel +
887 brw_wm_prog_data_prog_offset(prog_data, ps, 2);
888 }
889
890 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
891 * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
892 * k, it implies 2(k+1) threads. It implicitly scales for different GT
893 * levels (which have some # of PSDs).
894 *
895 * In Gfx8 the format is U8-2 whereas in Gfx9+ it is U9-1.
896 */
897 const struct intel_device_info *devinfo = batch->blorp->compiler->devinfo;
898 ps.MaximumNumberofThreadsPerPSD =
899 devinfo->max_threads_per_psd - (GFX_VER == 8 ? 2 : 1);
900
901 switch (params->fast_clear_op) {
902 case ISL_AUX_OP_NONE:
903 break;
904 #if GFX_VER >= 10
905 case ISL_AUX_OP_AMBIGUATE:
906 ps.RenderTargetFastClearEnable = true;
907 ps.RenderTargetResolveType = FAST_CLEAR_0;
908 break;
909 #endif
910 #if GFX_VER >= 9
911 case ISL_AUX_OP_PARTIAL_RESOLVE:
912 ps.RenderTargetResolveType = RESOLVE_PARTIAL;
913 break;
914 case ISL_AUX_OP_FULL_RESOLVE:
915 ps.RenderTargetResolveType = RESOLVE_FULL;
916 break;
917 #else
918 case ISL_AUX_OP_FULL_RESOLVE:
919 ps.RenderTargetResolveEnable = true;
920 break;
921 #endif
922 case ISL_AUX_OP_FAST_CLEAR:
923 ps.RenderTargetFastClearEnable = true;
924 break;
925 default:
926 unreachable("Invalid fast clear op");
927 }
928 }
929
930 blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
931 if (prog_data) {
932 psx.PixelShaderValid = true;
933 psx.AttributeEnable = prog_data->num_varying_inputs > 0;
934 psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
935 psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
936 #if GFX_VER >= 9
937 psx.PixelShaderComputesStencil = prog_data->computed_stencil;
938 #endif
939 }
940
941 if (params->src.enabled)
942 psx.PixelShaderKillsPixel = true;
943 }
944
945 #elif GFX_VER >= 7
946
947 blorp_emit(batch, GENX(3DSTATE_WM), wm) {
948 switch (params->hiz_op) {
949 case ISL_AUX_OP_FAST_CLEAR:
950 wm.DepthBufferClear = true;
951 break;
952 case ISL_AUX_OP_FULL_RESOLVE:
953 wm.DepthBufferResolveEnable = true;
954 break;
955 case ISL_AUX_OP_AMBIGUATE:
956 wm.HierarchicalDepthBufferResolveEnable = true;
957 break;
958 case ISL_AUX_OP_NONE:
959 break;
960 default:
961 unreachable("not reached");
962 }
963
964 if (prog_data) {
965 wm.ThreadDispatchEnable = true;
966 wm.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
967 }
968
969 if (params->src.enabled)
970 wm.PixelShaderKillsPixel = true;
971
972 if (params->num_samples > 1) {
973 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
974 wm.MultisampleDispatchMode =
975 (prog_data && prog_data->persample_dispatch) ?
976 MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
977 } else {
978 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
979 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
980 }
981 }
982
983 blorp_emit(batch, GENX(3DSTATE_PS), ps) {
984 ps.MaximumNumberofThreads =
985 batch->blorp->isl_dev->info->max_wm_threads - 1;
986
987 #if GFX_VERx10 == 75
988 ps.SampleMask = 1;
989 #endif
990
991 if (prog_data) {
992 ps._8PixelDispatchEnable = prog_data->dispatch_8;
993 ps._16PixelDispatchEnable = prog_data->dispatch_16;
994 ps._32PixelDispatchEnable = prog_data->dispatch_32;
995
996 ps.DispatchGRFStartRegisterForConstantSetupData0 =
997 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
998 ps.DispatchGRFStartRegisterForConstantSetupData1 =
999 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
1000 ps.DispatchGRFStartRegisterForConstantSetupData2 =
1001 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
1002
1003 ps.KernelStartPointer0 = params->wm_prog_kernel +
1004 brw_wm_prog_data_prog_offset(prog_data, ps, 0);
1005 ps.KernelStartPointer1 = params->wm_prog_kernel +
1006 brw_wm_prog_data_prog_offset(prog_data, ps, 1);
1007 ps.KernelStartPointer2 = params->wm_prog_kernel +
1008 brw_wm_prog_data_prog_offset(prog_data, ps, 2);
1009
1010 ps.AttributeEnable = prog_data->num_varying_inputs > 0;
1011 } else {
1012 /* Gfx7 hardware gets angry if we don't enable at least one dispatch
1013 * mode, so just enable 16-pixel dispatch if we don't have a program.
1014 */
1015 ps._16PixelDispatchEnable = true;
1016 }
1017
1018 if (params->src.enabled)
1019 ps.SamplerCount = 1; /* Up to 4 samplers */
1020
1021 switch (params->fast_clear_op) {
1022 case ISL_AUX_OP_NONE:
1023 break;
1024 case ISL_AUX_OP_FULL_RESOLVE:
1025 ps.RenderTargetResolveEnable = true;
1026 break;
1027 case ISL_AUX_OP_FAST_CLEAR:
1028 ps.RenderTargetFastClearEnable = true;
1029 break;
1030 default:
1031 unreachable("Invalid fast clear op");
1032 }
1033 }
1034
1035 #else /* GFX_VER <= 6 */
1036
1037 blorp_emit(batch, GENX(3DSTATE_WM), wm) {
1038 wm.MaximumNumberofThreads =
1039 batch->blorp->isl_dev->info->max_wm_threads - 1;
1040
1041 switch (params->hiz_op) {
1042 case ISL_AUX_OP_FAST_CLEAR:
1043 wm.DepthBufferClear = true;
1044 break;
1045 case ISL_AUX_OP_FULL_RESOLVE:
1046 wm.DepthBufferResolveEnable = true;
1047 break;
1048 case ISL_AUX_OP_AMBIGUATE:
1049 wm.HierarchicalDepthBufferResolveEnable = true;
1050 break;
1051 case ISL_AUX_OP_NONE:
1052 break;
1053 default:
1054 unreachable("not reached");
1055 }
1056
1057 if (prog_data) {
1058 wm.ThreadDispatchEnable = true;
1059
1060 wm._8PixelDispatchEnable = prog_data->dispatch_8;
1061 wm._16PixelDispatchEnable = prog_data->dispatch_16;
1062 wm._32PixelDispatchEnable = prog_data->dispatch_32;
1063
1064 wm.DispatchGRFStartRegisterForConstantSetupData0 =
1065 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
1066 wm.DispatchGRFStartRegisterForConstantSetupData1 =
1067 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
1068 wm.DispatchGRFStartRegisterForConstantSetupData2 =
1069 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
1070
1071 wm.KernelStartPointer0 = params->wm_prog_kernel +
1072 brw_wm_prog_data_prog_offset(prog_data, wm, 0);
1073 wm.KernelStartPointer1 = params->wm_prog_kernel +
1074 brw_wm_prog_data_prog_offset(prog_data, wm, 1);
1075 wm.KernelStartPointer2 = params->wm_prog_kernel +
1076 brw_wm_prog_data_prog_offset(prog_data, wm, 2);
1077
1078 wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
1079 }
1080
1081 if (params->src.enabled) {
1082 wm.SamplerCount = 1; /* Up to 4 samplers */
1083 wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
1084 }
1085
1086 if (params->num_samples > 1) {
1087 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1088 wm.MultisampleDispatchMode =
1089 (prog_data && prog_data->persample_dispatch) ?
1090 MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
1091 } else {
1092 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1093 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1094 }
1095 }
1096
1097 #endif /* GFX_VER */
1098 }
1099
1100 static uint32_t
blorp_emit_blend_state(struct blorp_batch * batch,const struct blorp_params * params)1101 blorp_emit_blend_state(struct blorp_batch *batch,
1102 const struct blorp_params *params)
1103 {
1104 struct GENX(BLEND_STATE) blend = { };
1105
1106 uint32_t offset;
1107 int size = GENX(BLEND_STATE_length) * 4;
1108 size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
1109 uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
1110 uint32_t *pos = state;
1111
1112 GENX(BLEND_STATE_pack)(NULL, pos, &blend);
1113 pos += GENX(BLEND_STATE_length);
1114
1115 for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
1116 struct GENX(BLEND_STATE_ENTRY) entry = {
1117 .PreBlendColorClampEnable = true,
1118 .PostBlendColorClampEnable = true,
1119 .ColorClampRange = COLORCLAMP_RTFORMAT,
1120
1121 .WriteDisableRed = params->color_write_disable & 1,
1122 .WriteDisableGreen = params->color_write_disable & 2,
1123 .WriteDisableBlue = params->color_write_disable & 4,
1124 .WriteDisableAlpha = params->color_write_disable & 8,
1125 };
1126 GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
1127 pos += GENX(BLEND_STATE_ENTRY_length);
1128 }
1129
1130 blorp_flush_range(batch, state, size);
1131
1132 #if GFX_VER >= 7
1133 blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
1134 sp.BlendStatePointer = offset;
1135 #if GFX_VER >= 8
1136 sp.BlendStatePointerValid = true;
1137 #endif
1138 }
1139 #endif
1140
1141 #if GFX_VER >= 8
1142 blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
1143 ps_blend.HasWriteableRT = true;
1144 }
1145 #endif
1146
1147 return offset;
1148 }
1149
1150 static uint32_t
blorp_emit_color_calc_state(struct blorp_batch * batch,UNUSED const struct blorp_params * params)1151 blorp_emit_color_calc_state(struct blorp_batch *batch,
1152 UNUSED const struct blorp_params *params)
1153 {
1154 uint32_t offset;
1155 blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
1156 #if GFX_VER <= 8
1157 cc.StencilReferenceValue = params->stencil_ref;
1158 #endif
1159 }
1160
1161 #if GFX_VER >= 7
1162 blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
1163 sp.ColorCalcStatePointer = offset;
1164 #if GFX_VER >= 8
1165 sp.ColorCalcStatePointerValid = true;
1166 #endif
1167 }
1168 #endif
1169
1170 return offset;
1171 }
1172
1173 static uint32_t
blorp_emit_depth_stencil_state(struct blorp_batch * batch,const struct blorp_params * params)1174 blorp_emit_depth_stencil_state(struct blorp_batch *batch,
1175 const struct blorp_params *params)
1176 {
1177 #if GFX_VER >= 8
1178 struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
1179 GENX(3DSTATE_WM_DEPTH_STENCIL_header),
1180 };
1181 #else
1182 struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
1183 #endif
1184
1185 if (params->depth.enabled) {
1186 ds.DepthBufferWriteEnable = true;
1187
1188 switch (params->hiz_op) {
1189 /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
1190 * - 7.5.3.1 Depth Buffer Clear
1191 * - 7.5.3.2 Depth Buffer Resolve
1192 * - 7.5.3.3 Hierarchical Depth Buffer Resolve
1193 */
1194 case ISL_AUX_OP_FULL_RESOLVE:
1195 ds.DepthTestEnable = true;
1196 ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
1197 break;
1198
1199 case ISL_AUX_OP_NONE:
1200 case ISL_AUX_OP_FAST_CLEAR:
1201 case ISL_AUX_OP_AMBIGUATE:
1202 ds.DepthTestEnable = false;
1203 break;
1204 case ISL_AUX_OP_PARTIAL_RESOLVE:
1205 unreachable("Invalid HIZ op");
1206 }
1207 }
1208
1209 if (params->stencil.enabled) {
1210 ds.StencilBufferWriteEnable = true;
1211 ds.StencilTestEnable = true;
1212 ds.DoubleSidedStencilEnable = false;
1213
1214 ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
1215 ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
1216
1217 ds.StencilWriteMask = params->stencil_mask;
1218 #if GFX_VER >= 9
1219 ds.StencilReferenceValue = params->stencil_ref;
1220 #endif
1221 }
1222
1223 #if GFX_VER >= 8
1224 uint32_t offset = 0;
1225 uint32_t *dw = blorp_emit_dwords(batch,
1226 GENX(3DSTATE_WM_DEPTH_STENCIL_length));
1227 if (!dw)
1228 return 0;
1229
1230 GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
1231 #else
1232 uint32_t offset;
1233 void *state = blorp_alloc_dynamic_state(batch,
1234 GENX(DEPTH_STENCIL_STATE_length) * 4,
1235 64, &offset);
1236 GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
1237 blorp_flush_range(batch, state, GENX(DEPTH_STENCIL_STATE_length) * 4);
1238 #endif
1239
1240 #if GFX_VER == 7
1241 blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
1242 sp.PointertoDEPTH_STENCIL_STATE = offset;
1243 }
1244 #endif
1245
1246 #if GFX_VER >= 12
1247 blorp_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
1248 db.DepthBoundsTestEnable = false;
1249 db.DepthBoundsTestMinValue = 0.0;
1250 db.DepthBoundsTestMaxValue = 1.0;
1251 }
1252 #endif
1253
1254 return offset;
1255 }
1256
1257 static void
blorp_emit_3dstate_multisample(struct blorp_batch * batch,const struct blorp_params * params)1258 blorp_emit_3dstate_multisample(struct blorp_batch *batch,
1259 const struct blorp_params *params)
1260 {
1261 blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
1262 ms.NumberofMultisamples = __builtin_ffs(params->num_samples) - 1;
1263
1264 #if GFX_VER >= 8
1265 /* The PRM says that this bit is valid only for DX9:
1266 *
1267 * SW can choose to set this bit only for DX9 API. DX10/OGL API's
1268 * should not have any effect by setting or not setting this bit.
1269 */
1270 ms.PixelPositionOffsetEnable = false;
1271 #elif GFX_VER >= 7
1272
1273 switch (params->num_samples) {
1274 case 1:
1275 INTEL_SAMPLE_POS_1X(ms.Sample);
1276 break;
1277 case 2:
1278 INTEL_SAMPLE_POS_2X(ms.Sample);
1279 break;
1280 case 4:
1281 INTEL_SAMPLE_POS_4X(ms.Sample);
1282 break;
1283 case 8:
1284 INTEL_SAMPLE_POS_8X(ms.Sample);
1285 break;
1286 default:
1287 break;
1288 }
1289 #else
1290 INTEL_SAMPLE_POS_4X(ms.Sample);
1291 #endif
1292 ms.PixelLocation = CENTER;
1293 }
1294 }
1295
1296 static void
blorp_emit_pipeline(struct blorp_batch * batch,const struct blorp_params * params)1297 blorp_emit_pipeline(struct blorp_batch *batch,
1298 const struct blorp_params *params)
1299 {
1300 uint32_t blend_state_offset = 0;
1301 uint32_t color_calc_state_offset;
1302 uint32_t depth_stencil_state_offset;
1303
1304 enum intel_urb_deref_block_size urb_deref_block_size;
1305 emit_urb_config(batch, params, &urb_deref_block_size);
1306
1307 if (params->wm_prog_data) {
1308 blend_state_offset = blorp_emit_blend_state(batch, params);
1309 }
1310 color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
1311 depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
1312
1313 #if GFX_VER == 6
1314 /* 3DSTATE_CC_STATE_POINTERS
1315 *
1316 * The pointer offsets are relative to
1317 * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
1318 *
1319 * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
1320 *
1321 * The dynamic state emit helpers emit their own STATE_POINTERS packets on
1322 * gfx7+. However, on gfx6 and earlier, they're all lumpped together in
1323 * one CC_STATE_POINTERS packet so we have to emit that here.
1324 */
1325 blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
1326 cc.BLEND_STATEChange = params->wm_prog_data ? true : false;
1327 cc.ColorCalcStatePointerValid = true;
1328 cc.DEPTH_STENCIL_STATEChange = true;
1329 cc.PointertoBLEND_STATE = blend_state_offset;
1330 cc.ColorCalcStatePointer = color_calc_state_offset;
1331 cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
1332 }
1333 #else
1334 (void)blend_state_offset;
1335 (void)color_calc_state_offset;
1336 (void)depth_stencil_state_offset;
1337 #endif
1338
1339 UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);
1340
1341 #if GFX_VER >= 12
1342 blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
1343 /* Update empty push constants for all stages (bitmask = 11111b) */
1344 pc.ShaderUpdateEnable = 0x1f;
1345 pc.MOCS = mocs;
1346 }
1347 #else
1348 #if GFX_VER >= 9
1349 #define CONSTANT_MOCS xs.MOCS = mocs
1350 #elif GFX_VER == 7
1351 #define CONSTANT_MOCS xs.ConstantBody.MOCS = mocs
1352 #else
1353 #define CONSTANT_MOCS
1354 #endif
1355 blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { CONSTANT_MOCS; }
1356 #if GFX_VER >= 7
1357 blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { CONSTANT_MOCS; }
1358 blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { CONSTANT_MOCS; }
1359 #endif
1360 blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { CONSTANT_MOCS; }
1361 blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { CONSTANT_MOCS; }
1362 #endif
1363 #undef CONSTANT_MOCS
1364
1365 if (params->src.enabled)
1366 blorp_emit_sampler_state_ps(batch);
1367
1368 blorp_emit_3dstate_multisample(batch, params);
1369
1370 blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
1371 mask.SampleMask = (1 << params->num_samples) - 1;
1372 }
1373
1374 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
1375 * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
1376 *
1377 * [DevSNB] A pipeline flush must be programmed prior to a
1378 * 3DSTATE_VS command that causes the VS Function Enable to
1379 * toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
1380 * command with CS stall bit set and a post sync operation.
1381 *
1382 * We've already done one at the start of the BLORP operation.
1383 */
1384 blorp_emit_vs_config(batch, params);
1385 #if GFX_VER >= 7
1386 blorp_emit(batch, GENX(3DSTATE_HS), hs);
1387 blorp_emit(batch, GENX(3DSTATE_TE), te);
1388 blorp_emit(batch, GENX(3DSTATE_DS), DS);
1389 blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
1390 #endif
1391 blorp_emit(batch, GENX(3DSTATE_GS), gs);
1392
1393 blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
1394 clip.PerspectiveDivideDisable = true;
1395 }
1396
1397 blorp_emit_sf_config(batch, params, urb_deref_block_size);
1398 blorp_emit_ps_config(batch, params);
1399
1400 blorp_emit_cc_viewport(batch);
1401
1402 #if GFX_VER >= 12
1403 /* Disable Primitive Replication. */
1404 blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
1405 #endif
1406
1407 if (batch->blorp->config.use_mesh_shading) {
1408 #if GFX_VERx10 >= 125
1409 blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
1410 blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
1411
1412 blorp_emit(batch, GENX(3DSTATE_MESH_SHADER), zero);
1413 blorp_emit(batch, GENX(3DSTATE_TASK_SHADER), zero);
1414
1415 blorp_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
1416 blorp_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
1417 #endif
1418 }
1419 }
1420
1421 /******** This is the end of the pipeline setup code ********/
1422
1423 #endif /* GFX_VER >= 6 */
1424
1425 #if GFX_VER >= 7
1426 static void
blorp_emit_memcpy(struct blorp_batch * batch,struct blorp_address dst,struct blorp_address src,uint32_t size)1427 blorp_emit_memcpy(struct blorp_batch *batch,
1428 struct blorp_address dst,
1429 struct blorp_address src,
1430 uint32_t size)
1431 {
1432 assert(size % 4 == 0);
1433
1434 for (unsigned dw = 0; dw < size; dw += 4) {
1435 #if GFX_VER >= 8
1436 blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
1437 cp.DestinationMemoryAddress = dst;
1438 cp.SourceMemoryAddress = src;
1439 }
1440 #else
1441 /* IVB does not have a general purpose register for command streamer
1442 * commands. Therefore, we use an alternate temporary register.
1443 */
1444 #define BLORP_TEMP_REG 0x2440 /* GFX7_3DPRIM_BASE_VERTEX */
1445 blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), load) {
1446 load.RegisterAddress = BLORP_TEMP_REG;
1447 load.MemoryAddress = src;
1448 }
1449 blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), store) {
1450 store.RegisterAddress = BLORP_TEMP_REG;
1451 store.MemoryAddress = dst;
1452 }
1453 #undef BLORP_TEMP_REG
1454 #endif
1455 dst.offset += 4;
1456 src.offset += 4;
1457 }
1458 }
1459 #endif
1460
1461 static void
blorp_emit_surface_state(struct blorp_batch * batch,const struct brw_blorp_surface_info * surface,UNUSED enum isl_aux_op aux_op,void * state,uint32_t state_offset,uint8_t color_write_disable,bool is_render_target)1462 blorp_emit_surface_state(struct blorp_batch *batch,
1463 const struct brw_blorp_surface_info *surface,
1464 UNUSED enum isl_aux_op aux_op,
1465 void *state, uint32_t state_offset,
1466 uint8_t color_write_disable,
1467 bool is_render_target)
1468 {
1469 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1470 struct isl_surf surf = surface->surf;
1471
1472 if (surf.dim == ISL_SURF_DIM_1D &&
1473 surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
1474 assert(surf.logical_level0_px.height == 1);
1475 surf.dim = ISL_SURF_DIM_2D;
1476 }
1477
1478 if (isl_aux_usage_has_hiz(surface->aux_usage)) {
1479 /* BLORP doesn't render with depth so we can't use HiZ */
1480 assert(!is_render_target);
1481 /* We can't reinterpret HiZ */
1482 assert(surface->surf.format == surface->view.format);
1483 }
1484
1485 enum isl_aux_usage aux_usage = surface->aux_usage;
1486
1487 /* On gfx12, implicit CCS has no aux buffer */
1488 bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
1489 (surface->aux_addr.buffer != NULL);
1490
1491 isl_channel_mask_t write_disable_mask = 0;
1492 if (is_render_target && GFX_VER <= 5) {
1493 if (color_write_disable & BITFIELD_BIT(0))
1494 write_disable_mask |= ISL_CHANNEL_RED_BIT;
1495 if (color_write_disable & BITFIELD_BIT(1))
1496 write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
1497 if (color_write_disable & BITFIELD_BIT(2))
1498 write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
1499 if (color_write_disable & BITFIELD_BIT(3))
1500 write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
1501 }
1502
1503 const bool use_clear_address =
1504 GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);
1505
1506 isl_surf_fill_state(batch->blorp->isl_dev, state,
1507 .surf = &surf, .view = &surface->view,
1508 .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
1509 .address =
1510 blorp_get_surface_address(batch, surface->addr),
1511 .aux_address = !use_aux_address ? 0 :
1512 blorp_get_surface_address(batch, surface->aux_addr),
1513 .clear_address = !use_clear_address ? 0 :
1514 blorp_get_surface_address(batch,
1515 surface->clear_color_addr),
1516 .mocs = surface->addr.mocs,
1517 .clear_color = surface->clear_color,
1518 .use_clear_address = use_clear_address,
1519 .write_disables = write_disable_mask);
1520
1521 blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
1522 surface->addr, 0);
1523
1524 if (use_aux_address) {
1525 /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
1526 * used to store other information. This should be ok, however, because
1527 * surface buffer addresses are always 4K page alinged.
1528 */
1529 assert((surface->aux_addr.offset & 0xfff) == 0);
1530 uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
1531 blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
1532 surface->aux_addr, *aux_addr);
1533 }
1534
1535 if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
1536 #if GFX_VER >= 10
1537 assert((surface->clear_color_addr.offset & 0x3f) == 0);
1538 uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
1539 blorp_surface_reloc(batch, state_offset +
1540 isl_dev->ss.clear_color_state_offset,
1541 surface->clear_color_addr, *clear_addr);
1542 #elif GFX_VER >= 7
1543 /* Fast clears just whack the AUX surface and don't actually use the
1544 * clear color for anything. We can avoid the MI memcpy on that case.
1545 */
1546 if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
1547 struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
1548 dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
1549 blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
1550 isl_dev->ss.clear_value_size);
1551 }
1552 #else
1553 unreachable("Fast clears are only supported on gfx7+");
1554 #endif
1555 }
1556
1557 blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
1558 }
1559
1560 static void
blorp_emit_null_surface_state(struct blorp_batch * batch,const struct brw_blorp_surface_info * surface,uint32_t * state)1561 blorp_emit_null_surface_state(struct blorp_batch *batch,
1562 const struct brw_blorp_surface_info *surface,
1563 uint32_t *state)
1564 {
1565 struct GENX(RENDER_SURFACE_STATE) ss = {
1566 .SurfaceType = SURFTYPE_NULL,
1567 .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
1568 .Width = surface->surf.logical_level0_px.width - 1,
1569 .Height = surface->surf.logical_level0_px.height - 1,
1570 .MIPCountLOD = surface->view.base_level,
1571 .MinimumArrayElement = surface->view.base_array_layer,
1572 .Depth = surface->view.array_len - 1,
1573 .RenderTargetViewExtent = surface->view.array_len - 1,
1574 #if GFX_VER >= 6
1575 .NumberofMultisamples = ffs(surface->surf.samples) - 1,
1576 .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),
1577 #endif
1578
1579 #if GFX_VER >= 7
1580 .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
1581 #endif
1582
1583 #if GFX_VERx10 >= 125
1584 .TileMode = TILE4,
1585 #elif GFX_VER >= 8
1586 .TileMode = YMAJOR,
1587 #else
1588 .TiledSurface = true,
1589 #endif
1590 };
1591
1592 GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);
1593
1594 blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
1595 }
1596
1597 static uint32_t
blorp_setup_binding_table(struct blorp_batch * batch,const struct blorp_params * params)1598 blorp_setup_binding_table(struct blorp_batch *batch,
1599 const struct blorp_params *params)
1600 {
1601 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1602 uint32_t surface_offsets[2], bind_offset = 0;
1603 void *surface_maps[2];
1604
1605 UNUSED bool has_indirect_clear_color = false;
1606 if (params->use_pre_baked_binding_table) {
1607 bind_offset = params->pre_baked_binding_table_offset;
1608 } else {
1609 unsigned num_surfaces = 1 + params->src.enabled;
1610 blorp_alloc_binding_table(batch, num_surfaces,
1611 isl_dev->ss.size, isl_dev->ss.align,
1612 &bind_offset, surface_offsets, surface_maps);
1613
1614 if (params->dst.enabled) {
1615 blorp_emit_surface_state(batch, ¶ms->dst,
1616 params->fast_clear_op,
1617 surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
1618 surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
1619 params->color_write_disable, true);
1620 if (params->dst.clear_color_addr.buffer != NULL)
1621 has_indirect_clear_color = true;
1622 } else {
1623 assert(params->depth.enabled || params->stencil.enabled);
1624 const struct brw_blorp_surface_info *surface =
1625 params->depth.enabled ? ¶ms->depth : ¶ms->stencil;
1626 blorp_emit_null_surface_state(batch, surface,
1627 surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
1628 }
1629
1630 if (params->src.enabled) {
1631 blorp_emit_surface_state(batch, ¶ms->src,
1632 params->fast_clear_op,
1633 surface_maps[BLORP_TEXTURE_BT_INDEX],
1634 surface_offsets[BLORP_TEXTURE_BT_INDEX],
1635 0, false);
1636 if (params->src.clear_color_addr.buffer != NULL)
1637 has_indirect_clear_color = true;
1638 }
1639 }
1640
1641 #if GFX_VER >= 7 && GFX_VER < 12
1642 if (has_indirect_clear_color) {
1643 /* Updating a surface state object may require that the state cache be
1644 * invalidated. From the SKL PRM, Shared Functions -> State -> State
1645 * Caching:
1646 *
1647 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1648 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1649 * modified [...], the L1 state cache must be invalidated to ensure
1650 * the new surface or sampler state is fetched from system memory.
1651 *
1652 * XXX - Investigate why exactly this invalidation is necessary to
1653 * avoid Vulkan regressions on ICL. It's possible that the
1654 * MI_ATOMIC used to update the clear color isn't correctly
1655 * ordered with the pre-existing invalidation in
1656 * blorp_update_clear_color().
1657 */
1658 blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
1659 pipe.StateCacheInvalidationEnable = true;
1660 }
1661 }
1662 #endif
1663
1664 return bind_offset;
1665 }
1666
1667 static void
blorp_emit_btp(struct blorp_batch * batch,uint32_t bind_offset)1668 blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
1669 {
1670 #if GFX_VER >= 7
1671 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
1672 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
1673 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
1674 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
1675
1676 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
1677 bt.PointertoPSBindingTable =
1678 blorp_binding_table_offset_to_pointer(batch, bind_offset);
1679 }
1680 #elif GFX_VER >= 6
1681 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
1682 bt.PSBindingTableChange = true;
1683 bt.PointertoPSBindingTable =
1684 blorp_binding_table_offset_to_pointer(batch, bind_offset);
1685 }
1686 #else
1687 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
1688 bt.PointertoPSBindingTable =
1689 blorp_binding_table_offset_to_pointer(batch, bind_offset);
1690 }
1691 #endif
1692 }
1693
1694 static void
blorp_emit_depth_stencil_config(struct blorp_batch * batch,const struct blorp_params * params)1695 blorp_emit_depth_stencil_config(struct blorp_batch *batch,
1696 const struct blorp_params *params)
1697 {
1698 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1699
1700 uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
1701 if (dw == NULL)
1702 return;
1703
1704 struct isl_depth_stencil_hiz_emit_info info = { };
1705
1706 if (params->depth.enabled) {
1707 info.view = ¶ms->depth.view;
1708 info.mocs = params->depth.addr.mocs;
1709 } else if (params->stencil.enabled) {
1710 info.view = ¶ms->stencil.view;
1711 info.mocs = params->stencil.addr.mocs;
1712 } else {
1713 info.mocs = isl_mocs(isl_dev, 0, false);
1714 }
1715
1716 if (params->depth.enabled) {
1717 info.depth_surf = ¶ms->depth.surf;
1718
1719 info.depth_address =
1720 blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
1721 params->depth.addr, 0);
1722
1723 info.hiz_usage = params->depth.aux_usage;
1724 if (isl_aux_usage_has_hiz(info.hiz_usage)) {
1725 info.hiz_surf = ¶ms->depth.aux_surf;
1726
1727 struct blorp_address hiz_address = params->depth.aux_addr;
1728 #if GFX_VER == 6
1729 /* Sandy bridge hardware does not technically support mipmapped HiZ.
1730 * However, we have a special layout that allows us to make it work
1731 * anyway by manually offsetting to the specified miplevel.
1732 */
1733 assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
1734 uint64_t offset_B;
1735 isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
1736 info.view->base_level, 0, 0,
1737 &offset_B, NULL, NULL);
1738 hiz_address.offset += offset_B;
1739 #endif
1740
1741 info.hiz_address =
1742 blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
1743 hiz_address, 0);
1744
1745 info.depth_clear_value = params->depth.clear_color.f32[0];
1746 }
1747 }
1748
1749 if (params->stencil.enabled) {
1750 info.stencil_surf = ¶ms->stencil.surf;
1751
1752 info.stencil_aux_usage = params->stencil.aux_usage;
1753 struct blorp_address stencil_address = params->stencil.addr;
1754 #if GFX_VER == 6
1755 /* Sandy bridge hardware does not technically support mipmapped stencil.
1756 * However, we have a special layout that allows us to make it work
1757 * anyway by manually offsetting to the specified miplevel.
1758 */
1759 assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GFX6_STENCIL_HIZ);
1760 uint64_t offset_B;
1761 isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
1762 info.view->base_level, 0, 0,
1763 &offset_B, NULL, NULL);
1764 stencil_address.offset += offset_B;
1765 #endif
1766
1767 info.stencil_address =
1768 blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
1769 stencil_address, 0);
1770 }
1771
1772 isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
1773
1774 #if GFX_VER >= 12
1775 /* Wa_1408224581
1776 *
1777 * Workaround: Gfx12LP Astep only An additional pipe control with
1778 * post-sync = store dword operation would be required.( w/a is to
1779 * have an additional pipe control after the stencil state whenever
1780 * the surface state bits of this state is changing).
1781 *
1782 * This also seems sufficient to handle Wa_14014148106.
1783 */
1784 blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
1785 pc.PostSyncOperation = WriteImmediateData;
1786 pc.Address = blorp_get_workaround_address(batch);
1787 }
1788 #endif
1789 }
1790
1791 #if GFX_VER >= 8
/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 *
 * batch:  batch the commands are emitted into; its flags decide whether the
 *         depth/stencil configuration may be emitted here.
 * params: describes the operation (params->hiz_op), which of depth/stencil
 *         are enabled, the sample count and the rectangle x0/y0/x1/y1.
 *
 * Emission order: multisample state, the CC viewport (depth fast clears
 * only), a default 3DSTATE_WM, the depth/stencil config (unless suppressed
 * by the batch flags), the programmed 3DSTATE_WM_HZ_OP, a PIPE_CONTROL with
 * only a post-sync write, and finally a 3DSTATE_WM_HZ_OP with no fields
 * programmed. The sequence is bracketed by blorp_measure_start/end.
 */
static void
blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer.
    */
   assert(params->depth.enabled || params->stencil.enabled);

   blorp_measure_start(batch, params);

   /* The stencil buffer should only be enabled if a fast clear operation is
    * requested.
    */
   if (params->stencil.enabled)
      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
    * the Number of Multisamples. This packet must not be used to change
    * Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and always
    * emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* From the BDW PRM Volume 7, Depth Buffer Clear:
    *
    * The clear value must be between the min and max depth values
    * (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
    * D32_FLOAT, then +/-DENORM values are also allowed.
    *
    * Set the bounds to match our hardware limits, [0.0, 1.0].
    */
   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
      assert(params->depth.clear_color.f32[0] >= 0.0f);
      assert(params->depth.clear_color.f32[0] <= 1.0f);
      blorp_emit_cc_viewport(batch);
   }

   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
    * even when WM_HZ_OP is active. However, WM thread dispatch is normally
    * disabled for HiZ ops and it appears that force-enabling it can lead to
    * GPU hangs on at least Skylake. Since we don't know the current state of
    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail. This is because the op requires that a
    * new config is emitted for each additional layer.
    */
   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
      assert(params->num_layers <= 1);
   } else {
      blorp_emit_depth_stencil_config(batch, params);
   }

   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
      /* Exactly one kind of operation is enabled per packet; partial
       * resolves and "no op" are not valid requests for this function.
       */
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hzp.StencilBufferClearEnable = params->stencil.enabled;
         hzp.DepthBufferClearEnable = params->depth.enabled;
         hzp.StencilClearValue = params->stencil_ref;
         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hzp.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hzp.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      /* The sample count is programmed as log2(num_samples); ffs() - 1
       * yields that for a power-of-two count.
       */
      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
      hzp.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hzp.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hzp.ClearRectangleXMin = params->x0;
      hzp.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hzp.ClearRectangleXMax = params->x1;
      hzp.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL w/ all bits clear except for "Post-Sync Operation" must set
    * to "Write Immediate Data" enabled.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }

   /* Second 3DSTATE_WM_HZ_OP with no fields programmed here; presumably this
    * turns the HiZ operation back off -- confirm against the PRM sequence.
    */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);

   blorp_measure_end(batch, params);
}
1904 #endif
1905
1906 static void
blorp_update_clear_color(UNUSED struct blorp_batch * batch,const struct brw_blorp_surface_info * info,enum isl_aux_op op)1907 blorp_update_clear_color(UNUSED struct blorp_batch *batch,
1908 const struct brw_blorp_surface_info *info,
1909 enum isl_aux_op op)
1910 {
1911 if (info->clear_color_addr.buffer && op == ISL_AUX_OP_FAST_CLEAR) {
1912 #if GFX_VER == 11
1913 blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
1914 pipe.CommandStreamerStallEnable = true;
1915 }
1916
1917 /* 2 QWORDS */
1918 const unsigned inlinedata_dw = 2 * 2;
1919 const unsigned num_dwords = GENX(MI_ATOMIC_length) + inlinedata_dw;
1920
1921 struct blorp_address clear_addr = info->clear_color_addr;
1922 uint32_t *dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
1923 .DataSize = MI_ATOMIC_QWORD,
1924 .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
1925 .InlineData = true,
1926 .MemoryAddress = clear_addr);
1927 /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
1928 dw[2] = info->clear_color.u32[0];
1929 dw[3] = 0;
1930 dw[4] = info->clear_color.u32[1];
1931 dw[5] = 0;
1932
1933 clear_addr.offset += 8;
1934 dw = blorp_emitn(batch, GENX(MI_ATOMIC), num_dwords,
1935 .DataSize = MI_ATOMIC_QWORD,
1936 .ATOMICOPCODE = MI_ATOMIC_OP_MOVE8B,
1937 .CSSTALL = true,
1938 .ReturnDataControl = true,
1939 .InlineData = true,
1940 .MemoryAddress = clear_addr);
1941 /* dw starts at dword 1, but we need to fill dwords 3 and 5 */
1942 dw[2] = info->clear_color.u32[2];
1943 dw[3] = 0;
1944 dw[4] = info->clear_color.u32[3];
1945 dw[5] = 0;
1946
1947 blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
1948 pipe.StateCacheInvalidationEnable = true;
1949 pipe.TextureCacheInvalidationEnable = true;
1950 }
1951 #elif GFX_VER >= 9
1952
1953 /* According to Wa_2201730850, in the Clear Color Programming Note
1954 * under the Red channel, "Software shall write the converted Depth
1955 * Clear to this dword." The only depth formats listed under the red
1956 * channel are IEEE_FP and UNORM24_X8. These two requirements are
1957 * incompatible with the UNORM16 depth format, so just ignore that case
1958 * and simply perform the conversion for all depth formats.
1959 */
1960 union isl_color_value fixed_color = info->clear_color;
1961 if (GFX_VER == 12 && isl_surf_usage_is_depth(info->surf.usage)) {
1962 isl_color_value_pack(&info->clear_color, info->surf.format,
1963 fixed_color.u32);
1964 }
1965
1966 for (int i = 0; i < 4; i++) {
1967 blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
1968 sdi.Address = info->clear_color_addr;
1969 sdi.Address.offset += i * 4;
1970 sdi.ImmediateData = fixed_color.u32[i];
1971 #if GFX_VER >= 12
1972 if (i == 3)
1973 sdi.ForceWriteCompletionCheck = true;
1974 #endif
1975 }
1976 }
1977
1978 /* The RENDER_SURFACE_STATE::ClearColor field states that software should
1979 * write the converted depth value 16B after the clear address:
1980 *
1981 * 3D Sampler will always fetch clear depth from the location 16-bytes
1982 * above this address, where the clear depth, converted to native
1983 * surface format by software, will be stored.
1984 *
1985 */
1986 #if GFX_VER >= 12
1987 if (isl_surf_usage_is_depth(info->surf.usage)) {
1988 blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
1989 sdi.Address = info->clear_color_addr;
1990 sdi.Address.offset += 4 * 4;
1991 sdi.ImmediateData = fixed_color.u32[0];
1992 sdi.ForceWriteCompletionCheck = true;
1993 }
1994 }
1995 #endif
1996
1997 #elif GFX_VER >= 7
1998 blorp_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
1999 sdi.Address = info->clear_color_addr;
2000 sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
2001 ISL_CHANNEL_SELECT_GREEN << 22 |
2002 ISL_CHANNEL_SELECT_BLUE << 19 |
2003 ISL_CHANNEL_SELECT_ALPHA << 16;
2004 if (isl_format_has_int_channel(info->view.format)) {
2005 for (unsigned i = 0; i < 4; i++) {
2006 assert(info->clear_color.u32[i] == 0 ||
2007 info->clear_color.u32[i] == 1);
2008 }
2009 sdi.ImmediateData |= (info->clear_color.u32[0] != 0) << 31;
2010 sdi.ImmediateData |= (info->clear_color.u32[1] != 0) << 30;
2011 sdi.ImmediateData |= (info->clear_color.u32[2] != 0) << 29;
2012 sdi.ImmediateData |= (info->clear_color.u32[3] != 0) << 28;
2013 } else {
2014 for (unsigned i = 0; i < 4; i++) {
2015 assert(info->clear_color.f32[i] == 0.0f ||
2016 info->clear_color.f32[i] == 1.0f);
2017 }
2018 sdi.ImmediateData |= (info->clear_color.f32[0] != 0.0f) << 31;
2019 sdi.ImmediateData |= (info->clear_color.f32[1] != 0.0f) << 30;
2020 sdi.ImmediateData |= (info->clear_color.f32[2] != 0.0f) << 29;
2021 sdi.ImmediateData |= (info->clear_color.f32[3] != 0.0f) << 28;
2022 }
2023 }
2024 #endif
2025 }
2026 }
2027
2028 static void
blorp_exec_3d(struct blorp_batch * batch,const struct blorp_params * params)2029 blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
2030 {
2031 if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
2032 blorp_update_clear_color(batch, ¶ms->dst, params->fast_clear_op);
2033 blorp_update_clear_color(batch, ¶ms->depth, params->hiz_op);
2034 }
2035
2036 #if GFX_VER >= 8
2037 if (params->hiz_op != ISL_AUX_OP_NONE) {
2038 blorp_emit_gfx8_hiz_op(batch, params);
2039 return;
2040 }
2041 #endif
2042
2043 blorp_measure_start(batch, params);
2044
2045 blorp_emit_vertex_buffers(batch, params);
2046 blorp_emit_vertex_elements(batch, params);
2047
2048 blorp_emit_pipeline(batch, params);
2049
2050 blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));
2051
2052 if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
2053 blorp_emit_depth_stencil_config(batch, params);
2054
2055 blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
2056 prim.VertexAccessType = SEQUENTIAL;
2057 prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
2058 #if GFX_VER >= 7
2059 prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
2060 #endif
2061 prim.VertexCountPerInstance = 3;
2062 prim.InstanceCount = params->num_layers;
2063 }
2064
2065 blorp_measure_end(batch, params);
2066 }
2067
2068 #if GFX_VER >= 7
2069
2070 static void
blorp_get_compute_push_const(struct blorp_batch * batch,const struct blorp_params * params,uint32_t threads,uint32_t * state_offset,unsigned * state_size)2071 blorp_get_compute_push_const(struct blorp_batch *batch,
2072 const struct blorp_params *params,
2073 uint32_t threads,
2074 uint32_t *state_offset,
2075 unsigned *state_size)
2076 {
2077 const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
2078 const unsigned push_const_size =
2079 ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
2080 assert(cs_prog_data->push.cross_thread.size +
2081 cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));
2082
2083 if (push_const_size == 0) {
2084 *state_offset = 0;
2085 *state_size = 0;
2086 return;
2087 }
2088
2089 uint32_t push_const_offset;
2090 uint32_t *push_const =
2091 GFX_VERx10 >= 125 ?
2092 blorp_alloc_general_state(batch, push_const_size, 64,
2093 &push_const_offset) :
2094 blorp_alloc_dynamic_state(batch, push_const_size, 64,
2095 &push_const_offset);
2096 memset(push_const, 0x0, push_const_size);
2097
2098 void *dst = push_const;
2099 const void *src = (char *)¶ms->wm_inputs;
2100
2101 if (cs_prog_data->push.cross_thread.size > 0) {
2102 memcpy(dst, src, cs_prog_data->push.cross_thread.size);
2103 dst += cs_prog_data->push.cross_thread.size;
2104 src += cs_prog_data->push.cross_thread.size;
2105 }
2106
2107 assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
2108 #if GFX_VERx10 < 125
2109 if (cs_prog_data->push.per_thread.size > 0) {
2110 for (unsigned t = 0; t < threads; t++) {
2111 memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);
2112
2113 uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
2114 *subgroup_id = t;
2115
2116 dst += cs_prog_data->push.per_thread.size;
2117 }
2118 }
2119 #endif
2120
2121 *state_offset = push_const_offset;
2122 *state_size = push_const_size;
2123 }
2124
2125 #endif /* GFX_VER >= 7 */
2126
/* Runs a blorp operation as a compute dispatch.
 *
 * Requires a batch that allows clear color updates and does not use
 * predication, and params without a HiZ operation (all asserted).
 *
 * The dispatch covers workgroups [group_x0, ...] x [group_y0, ...] derived
 * from params->x0/x1/y0/y1 and the shader's local size, with Z taken from
 * the destination z_offset and the layer count.  Gfx12.5+ programs
 * CFE_STATE + COMPUTE_WALKER; Gfx7..Gfx12 program MEDIA_VFE_STATE,
 * MEDIA_CURBE_LOAD, MEDIA_INTERFACE_DESCRIPTOR_LOAD and GPGPU_WALKER.
 * Older generations are not supported.
 */
static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
   assert(!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR));
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   blorp_measure_start(batch, params);

#if GFX_VER >= 7

   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct brw_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(batch->blorp->compiler->devinfo, cs_prog_data,
                               NULL);
   const struct intel_device_info *devinfo = batch->blorp->compiler->devinfo;

   /* Convert the pixel rectangle to workgroup indices: the start is rounded
    * down and the end is rounded up to whole workgroups.
    */
   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
   uint32_t group_z0 = params->dst.z_offset;
   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
   assert(params->num_layers >= 1);
   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
   /* The Z range above counts layers directly, which needs a local Z of 1. */
   assert(cs_prog_data->local_size[2] == 1);

#endif /* GFX_VER >= 7 */

#if GFX_VERx10 >= 125

   /* NOTE(review): the MEDIA_VFE_STATE path below programs this product
    * minus one; confirm that CFE_STATE expects the un-biased count.
    */
   blorp_emit(batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;
   }

   assert(cs_prog_data->push.per_thread.regs == 0);
   blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.LocalXMaximum                  = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum                  = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum                  = cs_prog_data->local_size[2] - 1;
      cw.ThreadGroupIDStartingX         = group_x0;
      cw.ThreadGroupIDStartingY         = group_y0;
      cw.ThreadGroupIDStartingZ         = group_z0;
      cw.ThreadGroupIDXDimension        = group_x1;
      cw.ThreadGroupIDYDimension        = group_y1;
      cw.ThreadGroupIDZDimension        = group_z1;
      cw.ExecutionMask                  = 0xffffffff;
      cw.PostSync.MOCS                  = isl_mocs(batch->blorp->isl_dev, 0, false);

      /* State referenced by the walker is allocated while the packet is
       * being filled in: binding table, optional sampler, push constants.
       */
      uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

      /* A sampler is only needed when there is a source surface. */
      uint32_t samplers_offset =
         params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

      uint32_t push_const_offset;
      unsigned push_const_size;
      blorp_get_compute_push_const(batch, params, dispatch.threads,
                                   &push_const_offset, &push_const_size);
      cw.IndirectDataStartAddress       = push_const_offset;
      cw.IndirectDataLength             = push_const_size;

      /* The interface descriptor is embedded in the walker on this path. */
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = params->cs_prog_kernel,
         .SamplerStatePointer = samplers_offset,
         .SamplerCount = params->src.enabled ? 1 : 0,
         .BindingTableEntryCount = params->src.enabled ? 2 : 1,
         .BindingTablePointer = surfaces_offset,
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
            encode_slm_size(GFX_VER, prog_data->total_shared),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
      };
   }

#elif GFX_VER >= 7

   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
    *
    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *  the only bits that are changed are scoreboard related: Scoreboard
    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
   }

   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
      /* Scratch space is not set up here, so the shader must not need any. */
      assert(prog_data->total_scratch == 0);
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
#if GFX_VER < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
#if GFX_VER < 9
      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
#endif
#if GFX_VER == 7
      vfe.GPGPUMode = true;
#endif
      vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;

      /* CURBE space: per-thread registers for every thread plus the
       * cross-thread registers, rounded up to an even count.
       */
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);

   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
      curbe.CURBETotalDataLength = push_const_size;
      curbe.CURBEDataStartAddress = push_const_offset;
   }

   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   /* A sampler is only needed when there is a source surface. */
   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = params->cs_prog_kernel,
      .SamplerStatePointer = samplers_offset,
      .SamplerCount = params->src.enabled ? 1 : 0,
      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
      .SharedLocalMemorySize = encode_slm_size(GFX_VER,
                                               prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
#if GFX_VER >= 8 || GEN_IS_HASWELL
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
   };

   /* On this path the descriptor lives in dynamic state and is loaded with
    * a separate command.
    */
   uint32_t idd_offset;
   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);

   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
      mid.InterfaceDescriptorTotalLength        = size;
      mid.InterfaceDescriptorDataStartAddress   = idd_offset;
   }

   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDStartingX       = group_x0;
      ggw.ThreadGroupIDStartingY       = group_y0;
#if GFX_VER >= 8
      ggw.ThreadGroupIDStartingResumeZ = group_z0;
#else
      ggw.ThreadGroupIDStartingZ       = group_z0;
#endif
      ggw.ThreadGroupIDXDimension      = group_x1;
      ggw.ThreadGroupIDYDimension      = group_y1;
      ggw.ThreadGroupIDZDimension      = group_z1;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

#else /* GFX_VER >= 7 */

   unreachable("Compute blorp is not supported on SNB and earlier");

#endif /* GFX_VER >= 7 */

   blorp_measure_end(batch, params);
}
2312
2313 /* -----------------------------------------------------------------------
2314 * -- BLORP on blitter
2315 * -----------------------------------------------------------------------
2316 */
2317
2318 #include "isl/isl_genX_helpers.h"
2319
2320 #if GFX_VER >= 12
2321 static uint32_t
xy_bcb_tiling(const struct isl_surf * surf)2322 xy_bcb_tiling(const struct isl_surf *surf)
2323 {
2324 switch (surf->tiling) {
2325 case ISL_TILING_LINEAR:
2326 return XY_TILE_LINEAR;
2327 #if GFX_VERx10 >= 125
2328 case ISL_TILING_X:
2329 return XY_TILE_X;
2330 case ISL_TILING_4:
2331 return XY_TILE_4;
2332 case ISL_TILING_64:
2333 return XY_TILE_64;
2334 #else
2335 case ISL_TILING_Y0:
2336 return XY_TILE_Y;
2337 #endif
2338 default:
2339 unreachable("Invalid tiling for XY_BLOCK_COPY_BLT");
2340 }
2341 }
2342
2343 static uint32_t
xy_color_depth(const struct isl_format_layout * fmtl)2344 xy_color_depth(const struct isl_format_layout *fmtl)
2345 {
2346 switch (fmtl->bpb) {
2347 case 128: return XY_BPP_128_BIT;
2348 case 96: return XY_BPP_96_BIT;
2349 case 64: return XY_BPP_64_BIT;
2350 case 32: return XY_BPP_32_BIT;
2351 case 16: return XY_BPP_16_BIT;
2352 case 8: return XY_BPP_8_BIT;
2353 default:
2354 unreachable("Invalid bpp");
2355 }
2356 }
2357 #endif
2358
2359 #if GFX_VERx10 >= 125
2360 static uint32_t
xy_bcb_surf_dim(const struct isl_surf * surf)2361 xy_bcb_surf_dim(const struct isl_surf *surf)
2362 {
2363 switch (surf->dim) {
2364 case ISL_SURF_DIM_1D:
2365 return XY_SURFTYPE_1D;
2366 case ISL_SURF_DIM_2D:
2367 return XY_SURFTYPE_2D;
2368 case ISL_SURF_DIM_3D:
2369 return XY_SURFTYPE_3D;
2370 default:
2371 unreachable("Invalid dimensionality for XY_BLOCK_COPY_BLT");
2372 }
2373 }
2374
2375 static uint32_t
xy_bcb_surf_depth(const struct isl_surf * surf)2376 xy_bcb_surf_depth(const struct isl_surf *surf)
2377 {
2378 return surf->dim == ISL_SURF_DIM_3D ? surf->logical_level0_px.depth
2379 : surf->logical_level0_px.array_len;
2380 }
2381
2382 static uint32_t
xy_aux_mode(const struct brw_blorp_surface_info * info)2383 xy_aux_mode(const struct brw_blorp_surface_info *info)
2384 {
2385 switch (info->aux_usage) {
2386 case ISL_AUX_USAGE_CCS_E:
2387 case ISL_AUX_USAGE_GFX12_CCS_E:
2388 return XY_CCS_E;
2389 case ISL_AUX_USAGE_NONE:
2390 return XY_NONE;
2391 default:
2392 unreachable("Unsupported aux mode");
2393 }
2394 }
2395 #endif
2396
/* Emits one XY_BLOCK_COPY_BLT that copies a rectangle from params->src to
 * params->dst.
 *
 * Only available on Gfx12+.  The batch must be a blitter batch without
 * predication and with clear color updates allowed; the operation must be
 * single-layer, single-level, single-sampled and must not be a HiZ op (all
 * asserted).  The destination rectangle comes from params->x0/y0/x1/y1 and
 * the source rectangle is obtained by subtracting the coordinate transform
 * offsets, so source and destination must have the same size.
 */
UNUSED static void
blorp_xy_block_copy_blt(struct blorp_batch *batch,
                        const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only suppotred on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR));
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(params->src.view.levels == 1);

#if GFX_VERx10 < 125
   /* Array index / LOD fields are only programmed on Gfx12.5+ below, so
    * older hardware can only target the first layer.
    */
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   /* Source coordinates are the destination coordinates shifted by the
    * per-axis coordinate transform offset.
    */
   unsigned dst_x0 = params->x0;
   unsigned dst_x1 = params->x1;
   unsigned src_x0 =
      dst_x0 - params->wm_inputs.coord_transform[0].offset;
   ASSERTED unsigned src_x1 =
      dst_x1 - params->wm_inputs.coord_transform[0].offset;
   unsigned dst_y0 = params->y0;
   unsigned dst_y1 = params->y1;
   unsigned src_y0 =
      dst_y0 - params->wm_inputs.coord_transform[1].offset;
   ASSERTED unsigned src_y1 =
      dst_y1 - params->wm_inputs.coord_transform[1].offset;

   /* The command only takes one extent, so no scaling is possible. */
   assert(src_x1 - src_x0 == dst_x1 - dst_x0);
   assert(src_y1 - src_y0 == dst_y1 - dst_y0);

   const struct isl_surf *src_surf = &params->src.surf;
   const struct isl_surf *dst_surf = &params->dst.surf;

   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   /* 96-bit formats are only handled for linear surfaces. */
   if (fmtl->bpb == 96) {
      assert(src_surf->tiling == ISL_TILING_LINEAR &&
             dst_surf->tiling == ISL_TILING_LINEAR);
   }

   assert(src_surf->samples == 1);
   assert(dst_surf->samples == 1);

   /* The byte pitch is divided by 4 for tiled surfaces and used as-is for
    * linear ones (presumably dword vs. byte units -- confirm in the PRM).
    */
   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
   unsigned src_pitch_unit = src_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d src_align = isl_get_image_alignment(src_surf);
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

   blorp_emit(batch, GENX(XY_BLOCK_COPY_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      /* Destination description. */
      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationMOCS = params->dst.addr.mocs;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = dst_x0;
      blt.DestinationY1 = dst_y0;
      blt.DestinationX2 = dst_x1;
      blt.DestinationY2 = dst_y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

#if GFX_VERx10 >= 125
      /* Size fields are programmed as (value - 1). */
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = 15;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
      blt.DestinationDepthStencilResource = false;
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      /* Compression state is only programmed for compressed surfaces. */
      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
      }
#endif

      /* Source description; only the top-left corner is given, the extent
       * is implied by the destination rectangle.
       */
      blt.SourceX1 = src_x0;
      blt.SourceY1 = src_y0;
      blt.SourcePitch = (src_surf->row_pitch_B / src_pitch_unit) - 1;
      blt.SourceMOCS = params->src.addr.mocs;
      blt.SourceTiling = xy_bcb_tiling(src_surf);
      blt.SourceBaseAddress = params->src.addr;
      blt.SourceXOffset = params->src.tile_x_sa;
      blt.SourceYOffset = params->src.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.SourceSurfaceType = xy_bcb_surf_dim(src_surf);
      blt.SourceSurfaceWidth = src_surf->logical_level0_px.w - 1;
      blt.SourceSurfaceHeight = src_surf->logical_level0_px.h - 1;
      blt.SourceSurfaceDepth = xy_bcb_surf_depth(src_surf) - 1;
      blt.SourceArrayIndex =
         params->src.view.base_array_layer + params->src.z_offset;
      blt.SourceSurfaceQPitch = isl_get_qpitch(src_surf) >> 2;
      blt.SourceLOD = params->src.view.base_level;
      blt.SourceMipTailStartLOD = 15;
      blt.SourceHorizontalAlign = isl_encode_halign(src_align.width);
      blt.SourceVerticalAlign = isl_encode_valign(src_align.height);
      blt.SourceDepthStencilResource = false;
      blt.SourceTargetMemory =
         params->src.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->src.aux_usage != ISL_AUX_USAGE_NONE) {
         blt.SourceAuxiliarySurfaceMode = xy_aux_mode(&params->src);
         blt.SourceCompressionEnable = true;
         blt.SourceCompressionFormat =
            isl_get_render_compression_format(src_surf->format);
         blt.SourceClearValueEnable = !!params->src.clear_color_addr.buffer;
         blt.SourceClearAddress = params->src.clear_color_addr;
      }

      /* XeHP needs special MOCS values for the blitter; these override the
       * per-address MOCS values assigned earlier in this packet.
       */
      blt.DestinationMOCS = isl_dev->mocs.blitter_dst;
      blt.SourceMOCS = isl_dev->mocs.blitter_src;
#endif
   }
#endif
}
2539
2540 static void
blorp_exec_blitter(struct blorp_batch * batch,const struct blorp_params * params)2541 blorp_exec_blitter(struct blorp_batch *batch,
2542 const struct blorp_params *params)
2543 {
2544 blorp_measure_start(batch, params);
2545
2546 /* Someday, if we implement clears on the blit enginer, we can
2547 * use params->src.enabled to determine which case we're in.
2548 */
2549 assert(params->src.enabled);
2550 blorp_xy_block_copy_blt(batch, params);
2551
2552 blorp_measure_end(batch, params);
2553 }
2554
2555 /**
2556 * \brief Execute a blit or render pass operation.
2557 *
2558 * To execute the operation, this function manually constructs and emits a
2559 * batch to draw a rectangle primitive. The batchbuffer is flushed before
2560 * constructing and after emitting the batch.
2561 *
2562 * This function alters no GL state.
2563 */
2564 static void
blorp_exec(struct blorp_batch * batch,const struct blorp_params * params)2565 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
2566 {
2567 if (batch->flags & BLORP_BATCH_USE_BLITTER) {
2568 blorp_exec_blitter(batch, params);
2569 } else if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
2570 blorp_exec_compute(batch, params);
2571 } else {
2572 blorp_exec_3d(batch, params);
2573 }
2574 }
2575
2576 #endif /* BLORP_GENX_EXEC_H */
2577