/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BLORP_GENX_EXEC_BRW_H
#define BLORP_GENX_EXEC_BRW_H

#include "blorp_priv.h"
#include "dev/intel_device_info.h"
#include "common/intel_compute_slm.h"
#include "common/intel_sample_positions.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "intel/compiler/brw_compiler.h"

/**
 * This file provides the blorp pipeline setup and execution functionality.
 * It defines the following function:
 *
 * static void
 * blorp_exec(struct blorp_context *blorp, void *batch_data,
 *            const struct blorp_params *params);
 *
 * It is the job of whoever includes this header to wrap this in something
 * to get an externally visible symbol.
 *
 * In order for the blorp_exec function to work, the driver must provide
 * implementations of the following static helper functions.
 */
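
/* A minimal sketch of the driver glue (hypothetical driver code, not part
 * of blorp): the two most fundamental hooks reserve command-stream space in
 * the driver's batch and combine a blorp_address into a GPU address. A real
 * driver must additionally handle batch wrap-around and relocations when
 * addresses are not pinned.
 *
 *    static void *
 *    blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
 *    {
 *       struct my_batch *mb = batch->driver_batch;  // driver-owned data
 *       assert(mb->next + n <= mb->end);            // space was reserved
 *       uint32_t *dw = mb->next;
 *       mb->next += n;
 *       return dw;
 *    }
 *
 *    static uint64_t
 *    blorp_emit_reloc(struct blorp_batch *batch,
 *                     void *location, struct blorp_address address,
 *                     uint32_t delta)
 *    {
 *       // With pinned (softpin) buffers this can be a plain addition;
 *       // otherwise, record a relocation for the kernel to patch.
 *       return my_bo_gpu_address(address.buffer) + address.offset + delta;
 *    }
 */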

static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta);

static void
blorp_measure_start(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_measure_end(struct blorp_batch *batch,
                  const struct blorp_params *params);

static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

static uint32_t
blorp_get_dynamic_state(struct blorp_batch *batch,
                        enum blorp_dynamic_state name);

static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr);
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           uint32_t *sizes,
                                           unsigned num_vbs);

UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch);

static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps);

static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset);

static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta);

static uint64_t
blorp_get_surface_address(struct blorp_batch *batch,
                          struct blorp_address address);

#if GFX_VER < 10
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch);

static void
blorp_pre_emit_urb_config(struct blorp_batch *batch,
                          struct intel_urb_config *urb_config);

static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      struct intel_urb_config *urb_config);

static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_emit_pre_draw(struct blorp_batch *batch,
                    const struct blorp_params *params);
static void
blorp_emit_post_draw(struct blorp_batch *batch,
                     const struct blorp_params *params);

static inline unsigned
brw_blorp_get_urb_length(const struct brw_wm_prog_data *prog_data)
{
   if (prog_data == NULL)
      return 1;

   /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
    *
    *    read_length = ceiling((max_source_attr + 1) / 2)
    */
   return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
}
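
/* For example, a blit shader with 3 varying inputs yields
 * MAX2((3 + 1) / 2, 1) = 2, i.e. the SBE reads two 256-bit URB rows (two
 * vec4 attributes per row). With no program at all we still request the
 * minimum read length of 1.
 */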

/***** BEGIN blorp_exec implementation ******/

static uint64_t
_blorp_combine_address(struct blorp_batch *batch, void *location,
                       struct blorp_address address, uint32_t delta)
{
   if (address.buffer == NULL) {
      return address.offset + delta;
   } else {
      return blorp_emit_reloc(batch, location, address, delta);
   }
}

#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address

#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"

#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

#define blorp_emitn(batch, cmd, n, ...) ({                  \
      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
      if (_dw) {                                            \
         struct cmd template = {                            \
            _blorp_cmd_header(cmd),                         \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__                                     \
         };                                                 \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
      }                                                     \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
   })
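
/* blorp_emit() is used like a block-scoped statement: the body fills in the
 * packet template and the packing into the batch happens when the
 * single-iteration loop exits. For example:
 *
 *    blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
 *       clip.PerspectiveDivideDisable = true;
 *    }
 *
 * An empty body (a bare semicolon) emits the packet with default fields.
 */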

#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

#define blorp_context_upload_dynamic(context, state, name,             \
                                     align, dynamic_name)              \
   for (struct state name = STRUCT_ZERO(state), *_dst = &name;         \
        _dst != NULL;                                                  \
        ({                                                             \
           uint32_t _dw[_blorp_cmd_length(state)];                     \
           _blorp_cmd_pack(state)(NULL, (void *)_dw, &name);           \
           context->upload_dynamic_state(context, _dw,                 \
                                         _blorp_cmd_length(state) * 4, \
                                         align, dynamic_name);         \
           _dst = NULL;                                                \
        }))

#define blorp_emit_dynamic(batch, state, name, align, offset)           \
   for (struct state name = STRUCT_ZERO(state),                         \
        *_dst = blorp_alloc_dynamic_state(batch,                        \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset);               \
        __builtin_expect(_dst != NULL, 1);                              \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
        _dst = NULL)
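
/* blorp_emit_dynamic() follows the same pattern, but the packed state goes
 * into the dynamic state heap rather than the batch and its offset is
 * returned through the last argument. For example (as used for the
 * viewport below):
 *
 *    uint32_t vp_offset;
 *    blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &vp_offset) {
 *       vp.MinimumDepth = 0.0;
 *       vp.MaximumDepth = 1.0;
 *    }
 */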

/* 3DSTATE_URB
 * 3DSTATE_URB_VS
 * 3DSTATE_URB_HS
 * 3DSTATE_URB_DS
 * 3DSTATE_URB_GS
 *
 * Assign the entire URB to the VS. Even though the VS is disabled, URB
 * space is still needed because the clipper loads the VUEs from the URB.
 * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
 * Dword 1.15:0 "VS Number of URB Entries":
 *
 *    This field is always used (even if VS Function Enable is DISABLED).
 *
 * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
 * safely ignore it because this batch contains only one draw call.
 *
 *    Because of URB corruption caused by allocating a previous GS unit
 *    URB entry to the VS unit, software is required to send a "GS NULL
 *    Fence" (Send URB fence with VS URB size == 1 and GS URB size == 0)
 *    plus a dummy DRAW call before any case where VS will be taking over
 *    GS URB space.
 *
 * If 3DSTATE_URB_VS is emitted, then the others must be emitted as well.
 * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
 *
 *    3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
 *    programmed in order for the programming of this state to be
 *    valid.
 */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once the vertex fetcher has written full VUE entries with complete
    * headers, the space requirement per vertex is as follows (in bytes):
    *
    *     Header    Position   Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |      n x 16       |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for the number of varying inputs expressed as vec4s.
    */
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);
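
   /* For example, a blit with one varying input needs 16 + 16 + 1 * 16 =
    * 48 bytes, so DIV_ROUND_UP(48, 64) = 1 allocation unit per VS URB
    * entry.
    */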
   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 1, 1, 1 },
   };

   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->brw->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, &urb_cfg,
                        deref_block_size, &constrained);

   /* Tell drivers about the config. */
   blorp_pre_emit_urb_config(batch, &urb_cfg);

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
         urb.VSURBStartingAddressSlice0 = urb_cfg.start[i];
         urb.VSURBStartingAddressSliceN = urb_cfg.start[i];
         urb.VSNumberofURBEntriesSlice0 = urb_cfg.entries[i];
         urb.VSNumberofURBEntriesSliceN = urb_cfg.entries[i];
      }
#else
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = urb_cfg.start[i];
         urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries = urb_cfg.entries[i];
      }
#endif
   }

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
#endif
   }
}

static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size);

static void
blorp_emit_vertex_data(struct blorp_batch *batch,
                       const struct blorp_params *params,
                       struct blorp_address *addr,
                       uint32_t *size)
{
   const float vertices[] = {
      /* v0 */ (float)params->x1, (float)params->y1, params->z,
      /* v1 */ (float)params->x0, (float)params->y1, params->z,
      /* v2 */ (float)params->x0, (float)params->y0, params->z,
   };

   void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
   if (data == NULL)
      return;
   memcpy(data, vertices, sizeof(vertices));
   *size = sizeof(vertices);
   blorp_flush_range(batch, data, *size);
}
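
/* For a blit covering the rectangle (x0, y0) = (0, 0) to
 * (x1, y1) = (100, 50) at z = 0, this writes v0 = (100, 50, 0),
 * v1 = (0, 50, 0), and v2 = (0, 0, 0); the fourth RECTLIST corner is
 * implied by the hardware.
 */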

static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;

   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   if (data == NULL)
      return;
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used
       * by the program, and when necessary copy the values from the input
       * storage to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         const int input_index = wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE. We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in
       * the loop above. Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      const unsigned clear_color_size =
         GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
   }
}
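
/* The resulting varying buffer layout is thus:
 *
 *   bytes  0..15 : vs_inputs (VUE header data, see the VUE layout below)
 *   bytes 16..31 : flat input 0 (or the indirect clear color)
 *   bytes 32..   : flat inputs 1 .. n-1
 */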

static void
blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
                               unsigned idx,
                               struct blorp_address addr, uint32_t size,
                               uint32_t stride)
{
   vb[idx].VertexBufferIndex = idx;
   vb[idx].BufferStartingAddress = addr;
   vb[idx].BufferPitch = stride;
   vb[idx].MOCS = addr.mocs;
   vb[idx].AddressModifyEnable = true;
   vb[idx].BufferSize = size;

#if GFX_VER >= 12
   vb[idx].L3BypassDisable = true;
#endif
}

static void
blorp_emit_vertex_buffers(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   struct GENX(VERTEX_BUFFER_STATE) vb[2] = {};
   const uint32_t num_vbs = ARRAY_SIZE(vb);

   struct blorp_address addrs[2] = {};
   uint32_t sizes[2] = {};
   blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
   if (sizes[0] == 0)
      return;
   blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
                                  3 * sizeof(float));

   blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
   blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);

   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);

   const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_vbs; i++) {
      GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
      dw += GENX(VERTEX_BUFFER_STATE_length);
   }
}

static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned num_elements = 2 + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Set up the VBO for the rectangle primitive.
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. The vertex fetcher is programmed
    *        below to fill this with the primitive instance identifier,
    *        which is used for layered clears. All other renders have only
    *        one instance and therefore the value is effectively zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable; Z is fixed to
    * zero and W to one. Header words dw0,2,3 are zero. There is no need to
    * include the fixed values in the vertex buffer. The vertex fetcher can
    * be instructed to fill vertex elements with constant values of one and
    * zero instead of reading them from the buffer.
    * Flat inputs are program constants that are not interpolated. Moreover,
    * their values will be the same between vertices.
    *
    * See the vertex element setup below.
    */
   unsigned slot = 0;

   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards, hardware is no longer instructed to overwrite
       * components using an element specifier. Instead, there is a
       * separate 3DSTATE_VF_SGVS (System Generated Value Setup) state
       * packet for it.
       */
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   slot++;

   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
   };
   slot++;

   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

   /* Overwrite Render Target Array Index (2nd dword) in the VUE header
    * with the primitive instance identifier. This is used for layered
    * clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

#if GFX_VER >= 11
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
}

/* 3DSTATE_VIEWPORT_STATE_POINTERS */
static uint32_t
blorp_emit_cc_viewport(struct blorp_batch *batch)
{
   uint32_t cc_vp_offset;

   /* Somehow reusing CC_VIEWPORT on Gfx9 is causing issues:
    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
    */
   if (GFX_VER != 9 && batch->blorp->config.use_cached_dynamic_states) {
      cc_vp_offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_CC_VIEWPORT);
   } else {
      blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
         vp.MinimumDepth = batch->blorp->config.use_unrestricted_depth_range ?
            -FLT_MAX : 0.0;
         vp.MaximumDepth = batch->blorp->config.use_unrestricted_depth_range ?
            FLT_MAX : 1.0;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
      vsp.CCViewportPointer = cc_vp_offset;
   }

   return cc_vp_offset;
}

static uint32_t
blorp_emit_sampler_state(struct blorp_batch *batch)
{
   uint32_t offset;
   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }

   return offset;
}

UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch *batch)
{
   uint32_t offset = batch->blorp->config.use_cached_dynamic_states ?
      blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_SAMPLER) :
      blorp_emit_sampler_state(batch);

   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
      ssp.PointertoPSSamplerState = offset;
   }

   return offset;
}

/* What follows is the code for setting up a "pipeline". */

static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
   assert(!vs_prog_data || GFX_VER < 11 ||
          vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

         assert(vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
#if GFX_VER < 20
         vs.SIMD8DispatchEnable = true;
#endif

#if GFX_VER >= 30
         vs.RegistersPerThread = ptl_register_blocks(vs_prog_data->base.base.grf_used);
#endif
      }
   }
}

static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *
    *    RECTLIST: Viewport Mapping must be DISABLED (as is typical with
    *    the use of screen-space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *
    *    SOLID: Any triangle or rectangle object found to be front-facing
    *    is rendered as a solid object. This setting is required when
    *    rendering rectangle (RECTLIST) objects.
    */

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }
}

static void
blorp_emit_ps_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must
    * be nonzero to prevent the GPU from hanging. While the documentation
    * doesn't mention this explicitly, it notes that the valid range for
    * the field is [1,39] = [2,40] threads, which excludes zero.
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;

   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      if (params->src.enabled) {
         ps.SamplerCount = 1; /* Up to 4 samplers */
         ps.BindingTableEntryCount = 2;
      } else {
         ps.BindingTableEntryCount = 1;
      }

      /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
      if (GFX_VER == 11)
         ps.SamplerCount = 0;

      /* 3DSTATE_PS expects the number of threads per PSD, which is always
       * 64 for pre-Gfx11 and 128 for Gfx11+. On Gfx11+, a programmed value
       * of k implies 2 * (k + 1) threads. It implicitly scales for
       * different GT levels (which have some number of PSDs).
       */
      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;

      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
#if GFX_VER < 20
#if GFX_VER >= 10
      case ISL_AUX_OP_AMBIGUATE:
         ps.RenderTargetFastClearEnable = true;
         ps.RenderTargetResolveType = FAST_CLEAR_0;
         break;
#endif /* GFX_VER >= 10 */
      case ISL_AUX_OP_PARTIAL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_PARTIAL;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_FULL;
         break;
#endif /* GFX_VER < 20 */
      case ISL_AUX_OP_FAST_CLEAR:
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }

#if GFX_VERx10 == 120
      /* The 3DSTATE_PS_BODY page for TGL says:
       *
       *    3D/Volumetric surfaces do not support Fast Clear operation.
       *
       *    [...]
       *
       *    3D/Volumetric surfaces do not support in-place resolve pass
       *    operation.
       *
       * HSD 1406738321 suggests a more limited scope of restrictions, but
       * there should be no harm in complying with the Bspec restrictions.
       */
      if (params->dst.surf.dim == ISL_SURF_DIM_3D)
         assert(params->fast_clear_op == ISL_AUX_OP_NONE);

      /* The RENDER_SURFACE_STATE page for TGL says:
       *
       *    For an 8 bpp surface with NUM_MULTISAMPLES = 1, Surface Width
       *    not multiple of 64 pixels and more than 1 mip level in the
       *    view, Fast Clear is not supported when AUX_CCS_E is set in
       *    this field.
       *
       * The granularity of a fast-clear or ambiguate operation is likely
       * one CCS element. For an 8 bpp primary surface, this maps to
       * 32px x 4rows. Due to the surface layout parameters, if LOD0's
       * width isn't a multiple of 64px, LOD1 and LOD2+ will share CCS
       * elements. Assert that these operations aren't occurring on these
       * LODs.
       */
      if (isl_format_get_layout(params->dst.surf.format)->bpb == 8 &&
          params->dst.surf.logical_level0_px.width % 64 != 0 &&
          params->dst.surf.levels >= 3 &&
          params->dst.view.base_level >= 1) {
         assert(params->num_samples == 1);
         assert(!ps.RenderTargetFastClearEnable);
      }

      /* From the TGL BSpec 44930 (r47128):
       *
       *    Compression of 3D Ys surfaces with 64 or 128 bpp is not
       *    supported in Gen12. Moreover, "Render Target Fast-clear
       *    Enable" command is not supported for any 3D Ys surfaces,
       *    except when Surface is a Procedural Texture.
       *
       * It's not clear where the exception applies, but either way, we
       * don't support Procedural Textures.
       */
      if (params->dst.surf.dim == ISL_SURF_DIM_3D &&
          params->dst.surf.tiling == ISL_TILING_ICL_Ys &&
          isl_format_get_layout(params->dst.surf.format)->bpb >= 64) {
         assert(params->dst.aux_usage != ISL_AUX_USAGE_CCS_D);
         assert(!ps.RenderTargetFastClearEnable);
      }
#endif

      if (prog_data) {
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
#if GFX_VER < 20
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
#endif

         ps.KernelStartPointer0 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 1);
#if GFX_VER < 20
         ps.KernelStartPointer2 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 2);
#endif

#if GFX_VER >= 30
         ps.RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used);
#endif
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      if (params->src.enabled)
         psx.PixelShaderKillsPixel = true;

      if (prog_data) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
         psx.PixelShaderComputesStencil = prog_data->computed_stencil;
         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;

#if INTEL_WA_18038825448_GFX_VER
         psx.EnablePSDependencyOnCPsizeChange =
            batch->flags & BLORP_BATCH_FORCE_CPS_DEPENDENCY;
#endif

#if GFX_VER < 20
         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
#else
         /* Bspec 57340 (r59562):
          *
          *    For MSAA fast clear, it (clear shader) must be in per-pixel
          *    dispatch mode.
          *
          * Bspec 56424 (r58933):
          *
          *    Bit 6 of Bit Group 0: Pixel Shader Is Per Sample
          *    If this bit is DISABLED, the dispatch rate is determined by
          *    the value of Pixel Shader Is Per Coarse Pixel.
          *
          *    Bit 4 of Bit Group 0: Pixel Shader Is Per Coarse Pixel
          *    If Pixel Shader Is Per Sample is DISABLED and this bit is
          *    DISABLED, the pixel shader is dispatched at the per pixel
          *    shading rate.
          *
          * The below assertion ensures the MSAA clear shader is in
          * per-pixel dispatch mode.
          */
         if (params->fast_clear_op == ISL_AUX_OP_FAST_CLEAR &&
             params->num_samples > 1) {
            assert(!psx.PixelShaderIsPerSample &&
                   !psx.PixelShaderIsPerCoarsePixel);
         }
#endif
      }
   }
}

static void
blorp_emit_blend_state(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   uint32_t offset;
   if (!batch->blorp->config.use_cached_dynamic_states) {
      struct GENX(BLEND_STATE) blend = { };

      const unsigned size = 96;
      uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
      if (state == NULL)
         return;
      uint32_t *pos = state;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,

            .WriteDisableRed = params->color_write_disable & 1,
            .WriteDisableGreen = params->color_write_disable & 2,
            .WriteDisableBlue = params->color_write_disable & 4,
            .WriteDisableAlpha = params->color_write_disable & 8,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      blorp_flush_range(batch, state, size);
   } else {
      /* We only cached this case. */
      assert(params->color_write_disable == 0);
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_BLEND);
   }

   blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
      sp.BlendStatePointer = offset;
      sp.BlendStatePointerValid = true;
   }

   blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }
}

static void
blorp_emit_color_calc_state(struct blorp_batch *batch,
                            UNUSED const struct blorp_params *params)
{
   uint32_t offset;

   if (batch->blorp->config.use_cached_dynamic_states)
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_COLOR_CALC);
   else
      blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {}

   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
      sp.ColorCalcStatePointer = offset;
      sp.ColorCalcStatePointerValid = true;
   }
}

static void
blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
      if (params->depth.enabled) {
         ds.DepthBufferWriteEnable = true;

         switch (params->hiz_op) {
         /* See the following sections of the Sandy Bridge PRM, Volume 2,
          * Part 1:
          *   - 7.5.3.1 Depth Buffer Clear
          *   - 7.5.3.2 Depth Buffer Resolve
          *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
          */
         case ISL_AUX_OP_FULL_RESOLVE:
            ds.DepthTestEnable = true;
            ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
            break;

         case ISL_AUX_OP_NONE:
         case ISL_AUX_OP_FAST_CLEAR:
         case ISL_AUX_OP_AMBIGUATE:
            ds.DepthTestEnable = false;
            break;
         case ISL_AUX_OP_PARTIAL_RESOLVE:
            unreachable("Invalid HIZ op");
         }
      }

      if (params->stencil.enabled) {
         ds.StencilBufferWriteEnable = true;
         ds.StencilTestEnable = true;
         ds.DoubleSidedStencilEnable = false;

         ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
         ds.StencilPassDepthPassOp = STENCILOP_REPLACE;

         ds.StencilWriteMask = params->stencil_mask;
         ds.StencilReferenceValue = params->stencil_ref;
      }
   }

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
      db.DepthBoundsTestEnable = false;
      db.DepthBoundsTestMinValue = 0.0;
      db.DepthBoundsTestMaxValue = 1.0;
   }
#endif
}

static void
blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples = __builtin_ffs(params->num_samples) - 1;
      ms.PixelLocation = CENTER;
   }
}

static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   if (params->wm_prog_data) {
      blorp_emit_blend_state(batch, params);
   }
   blorp_emit_color_calc_state(batch, params);
   blorp_emit_depth_stencil_state(batch, params);

   UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      pc.ShaderUpdateEnable = 0x1f;
      pc.MOCS = mocs;
   }
#else
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { xs.MOCS = mocs; }
#endif

   if (params->src.enabled)
      blorp_emit_sampler_state_ps(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *    [DevSNB] A pipeline flush must be programmed prior to a
    *    3DSTATE_VS command that causes the VS Function Enable to
    *    toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *    command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
      blorp_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
#endif
   }
}

/******** This is the end of the pipeline setup code ********/

static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size)
{
   assert(size % 4 == 0);

   for (unsigned dw = 0; dw < size; dw += 4) {
      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
         cp.DestinationMemoryAddress = dst;
         cp.SourceMemoryAddress = src;
      }
      dst.offset += 4;
      src.offset += 4;
   }
}
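
/* MI_COPY_MEM_MEM moves a single dword, so copying e.g. a 16-byte clear
 * color value emits four back-to-back packets. This is only intended for
 * small copies such as the indirect clear color handled above.
 */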

static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         uint8_t color_write_disable,
                         bool is_render_target)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   const bool use_clear_address =
      GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);

   /* On gfx12 (and optionally on gfx11), hardware will read and write to
    * the clear color address, converting the raw clear color channels to a
    * pixel during a fast-clear. To avoid the restrictions associated with
    * the hardware feature, we instead write a software-converted pixel
    * ourselves. If we're performing a fast-clear, provide a substitute
    * address to avoid a collision with hardware. Outside of gfx11 and
    * gfx12, indirect clear color BOs are not used during fast-clears.
    */
   const struct blorp_address op_clear_addr =
      aux_op == ISL_AUX_OP_FAST_CLEAR ? blorp_get_workaround_address(batch) :
                                        surface->clear_color_addr;

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .clear_address = !use_clear_address ? 0 :
                          blorp_get_surface_address(batch, op_clear_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .use_clear_address = use_clear_address);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information. This should be ok, however,
       * because surface buffer addresses are always 4K page aligned.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 10
      assert((surface->clear_color_addr.offset & 0x3f) == 0);
      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
      blorp_surface_reloc(batch, state_offset +
                          isl_dev->ss.clear_color_state_offset,
                          op_clear_addr, *clear_addr);
#else
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything. We can avoid the MI memcpy in that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}

static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct blorp_surface_info *surface,
                              uint32_t *state)
{
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
      .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),

      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,

#if GFX_VERx10 >= 125
      .TileMode = TILE4,
#else
      .TileMode = YMAJOR,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}

static uint32_t
blorp_setup_binding_table(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t surface_offsets[2], bind_offset = 0;
   void *surface_maps[2];

   if (params->use_pre_baked_binding_table) {
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      if (!blorp_alloc_binding_table(batch, num_surfaces,
                                     isl_dev->ss.size, isl_dev->ss.align,
                                     &bind_offset, surface_offsets, surface_maps))
         return 0;

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
      } else {
         assert(params->depth.enabled || params->stencil.enabled);
         const struct blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  0, false);
      }
   }

   return bind_offset;
}

static void
blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
{
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
}

static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct intel_device_info *devinfo =
      batch->blorp->compiler->brw->devinfo;

   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   } else {
      info.mocs = isl_mocs(isl_dev, 0, false);
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);

   if (intel_needs_workaround(devinfo, 1408224581) ||
       intel_needs_workaround(devinfo, 14014097488) ||
       intel_needs_workaround(devinfo, 14016712196)) {
      /* Wa_1408224581
       *
       *    Workaround: Gfx12LP Astep only An additional pipe control with
       *    post-sync = store dword operation would be required. (w/a is to
       *    have an additional pipe control after the stencil state
       *    whenever the surface state bits of this state is changing).
       *
       * This also seems sufficient to handle Wa_14014097488 and
       * Wa_14016712196.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = blorp_get_workaround_address(batch);
      }
   }
}

/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 */
static void
blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer. */
   assert(params->depth.enabled || params->stencil.enabled);

   blorp_measure_start(batch, params);

   /* The stencil buffer should only be enabled if a fast clear operation
    * is requested.
    */
   if (params->stencil.enabled)
      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    *    3DSTATE_MULTISAMPLE packet must be used prior to this packet to
    *    change the Number of Multisamples. This packet must not be used
    *    to change Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and
    * always emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* From the BDW PRM Volume 7, Depth Buffer Clear:
    *
    *    The clear value must be between the min and max depth values
    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer
    *    format is D32_FLOAT, then +/-DENORM values are also allowed.
    *
    * Set the bounds to match our hardware limits.
    */
   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR)
      blorp_emit_cc_viewport(batch);

   /* Make sure to disable the fragment shader: a previous draw might have
    * enabled a SIMD32 shader and we could be dispatching threads here
    * with MSAA 16x, which does not support SIMD32.
    *
    * dEQP-VK.pipeline.monolithic.multisample.misc.clear_attachments.
    * r8g8b8a8_unorm_r16g16b16a16_sfloat_r32g32b32a32_uint_d16_unorm.
    * 16x.ds_resolve_sample_zero.sub_framebuffer
    * exercises this case.
    */
   blorp_emit(batch, GENX(3DSTATE_PS), ps);
   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx);

   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable,
    * the 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread
    * dispatch even when WM_HZ_OP is active. However, WM thread dispatch
    * is normally disabled for HiZ ops and it appears that force-enabling
    * it can lead to GPU hangs on at least Skylake. Since we don't know
    * the current state of the 3DSTATE_WM packet, just emit a dummy one
    * prior to 3DSTATE_WM_HZ_OP.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail. This is because the op requires that
    * a new config is emitted for each additional layer.
    */
   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
      assert(params->num_layers <= 1);
   } else {
      blorp_emit_depth_stencil_config(batch, params);
   }

   /* TODO - If we ever start using
    * 3DSTATE_WM_HZ_OP::StencilBufferResolveEnable, we need to implement
    * the required steps and flushes documented in Wa_1605967699.
    */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hzp.StencilBufferClearEnable = params->stencil.enabled;
         hzp.DepthBufferClearEnable = params->depth.enabled;
         hzp.StencilClearValue = params->stencil_ref;
         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
#if GFX_VER >= 20
         hzp.DepthClearValue = params->depth.clear_color.f32[0];

         /* From the Xe2 Bspec 56437 (r61349):
          *
          *    The Depth Clear value cannot be a NAN (Not-A-Number) if the
          *    depth format is Float32.
          *
          * We're not required to support NaN in APIs, so flush to zero.
          */
         if (util_is_nan(hzp.DepthClearValue))
            hzp.DepthClearValue = 0;
#endif
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hzp.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hzp.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
      hzp.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hzp.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hzp.ClearRectangleXMin = params->x0;
      hzp.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hzp.ClearRectangleXMax = params->x1;
      hzp.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL with all bits clear except for "Post-Sync Operation",
    * which must be set to "Write Immediate Data".
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }

   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);

   blorp_measure_end(batch, params);
}
1573
1574 static bool
blorp_uses_bti_rt_writes(const struct blorp_batch * batch,const struct blorp_params * params)1575 blorp_uses_bti_rt_writes(const struct blorp_batch *batch, const struct blorp_params *params)
1576 {
1577 if (batch->flags & (BLORP_BATCH_USE_BLITTER | BLORP_BATCH_USE_COMPUTE))
1578 return false;
1579
1580 /* HIZ clears use WM_HZ ops rather than a clear shader using RT writes. */
1581 return params->hiz_op == ISL_AUX_OP_NONE;
1582 }
1583
1584 static void
blorp_exec_3d(struct blorp_batch * batch,const struct blorp_params * params)1585 blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
1586 {
1587 if (params->hiz_op != ISL_AUX_OP_NONE) {
1588 blorp_emit_gfx8_hiz_op(batch, params);
1589 return;
1590 }
1591
1592 blorp_emit_vertex_buffers(batch, params);
1593 blorp_emit_vertex_elements(batch, params);
1594
1595 blorp_emit_pipeline(batch, params);
1596
1597 blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));
1598
1599 if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
1600 blorp_emit_depth_stencil_config(batch, params);
1601
1602 const UNUSED bool use_tbimr = false;
1603 blorp_emit_pre_draw(batch, params);
1604 blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
1605 prim.VertexAccessType = SEQUENTIAL;
1606 prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
1607 prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
1608 #if GFX_VERx10 >= 125
1609 prim.TBIMREnable = use_tbimr;
1610 #endif
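      /* A RECTLIST is specified with only three of its four corners; the
       * hardware derives the remaining vertex. One instance is drawn per
       * destination layer.
       */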
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
   blorp_emit_post_draw(batch, params);
}

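/* Build the push constant buffer for a compute dispatch: the cross-thread
 * data comes first, followed (on GFX_VERx10 < 125) by one per-thread block
 * per HW thread. As an illustrative sketch with hypothetical sizes,
 * cross_thread.size == 32 and per_thread.size == 8 (one payload dword plus
 * the trailing subgroup id dword) over two threads lay out as:
 *
 *    bytes  0..31 : cross-thread data
 *    bytes 32..35 : thread 0 payload    bytes 36..39 : subgroup_id = 0
 *    bytes 40..43 : thread 1 payload    bytes 44..47 : subgroup_id = 1
 */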
static void
blorp_get_compute_push_const(struct blorp_batch *batch,
                             const struct blorp_params *params,
                             uint32_t threads,
                             uint32_t *state_offset,
                             unsigned *state_size)
{
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const unsigned push_const_size =
      ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
   assert(cs_prog_data->push.cross_thread.size +
          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

   if (push_const_size == 0) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }

   uint32_t push_const_offset;
   uint32_t *push_const =
      GFX_VERx10 >= 125 ?
      blorp_alloc_general_state(batch, push_const_size, 64,
                                &push_const_offset) :
      blorp_alloc_dynamic_state(batch, push_const_size, 64,
                                &push_const_offset);
   if (push_const == NULL) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }
   memset(push_const, 0x0, push_const_size);

   void *dst = push_const;
   const void *src = (char *)&params->wm_inputs;

   if (cs_prog_data->push.cross_thread.size > 0) {
      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
      dst += cs_prog_data->push.cross_thread.size;
      src += cs_prog_data->push.cross_thread.size;
   }

   assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
#if GFX_VERx10 < 125
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < threads; t++) {
         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);

         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
         *subgroup_id = t;

         dst += cs_prog_data->push.per_thread.size;
      }
   }
#endif

   *state_offset = push_const_offset;
   *state_size = push_const_size;
}

static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   blorp_measure_start(batch, params);

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
   uint32_t group_z0 = params->dst.z_offset;
   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
   assert(params->num_layers >= 1);
   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
   assert(cs_prog_data->local_size[2] == 1);
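   /* For example, a dispatch with a hypothetical 8x8 local size and
    * x1 == 100 spans DIV_ROUND_UP(100, 8) == 13 thread groups in X.
    */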

#if GFX_VERx10 >= 125
   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);
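   /* dispatch.simd_size / 16 yields the walker's SIMD encoding:
    * 1 for SIMD16 and 2 for SIMD32.
    */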
   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch.simd_size / 16,
      .MessageSIMD = dispatch.simd_size / 16,
      .LocalXMaximum = cs_prog_data->local_size[0] - 1,
      .LocalYMaximum = cs_prog_data->local_size[1] - 1,
      .LocalZMaximum = cs_prog_data->local_size[2] - 1,
      .ThreadGroupIDStartingX = group_x0,
      .ThreadGroupIDStartingY = group_y0,
      .ThreadGroupIDStartingZ = group_z0,
      .ThreadGroupIDXDimension = group_x1,
      .ThreadGroupIDYDimension = group_y1,
      .ThreadGroupIDZDimension = group_z1,
      .ExecutionMask = dispatch.right_mask,
      .PostSync.MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),

      .IndirectDataStartAddress = push_const_offset,
      .IndirectDataLength = push_const_size,

#if GFX_VERx10 >= 125
      .GenerateLocalID = cs_prog_data->generate_local_id != 0,
      .EmitLocal = cs_prog_data->generate_local_id,
      .WalkOrder = cs_prog_data->walk_order,
      .TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                    TileY32bpe : Linear,
#endif

      .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = params->cs_prog_kernel,
         .SamplerStatePointer = samplers_offset,
         .SamplerCount = params->src.enabled ? 1 : 0,
         .BindingTableEntryCount = params->src.enabled ? 2 : 1,
         .BindingTablePointer = surfaces_offset,
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
            intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
         .PreferredSLMAllocationSize =
            intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                         prog_data->total_shared,
                                                         dispatch.group_size,
                                                         dispatch.simd_size),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
#if GFX_VER >= 30
         .RegistersPerThread = ptl_register_blocks(prog_data->grf_used),
#endif
      },
   };

   assert(cs_prog_data->push.per_thread.regs == 0);
   blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
      cw.body = body;
   }

#else

   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
    *
    *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *     the only bits that are changed are scoreboard related: Scoreboard
    *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
    *     these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
   }

   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
      assert(prog_data->total_scratch == 0);
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
      vfe.URBEntryAllocationSize = 2;

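      /* CURBE space is allocated in register-sized (256-bit) units: one
       * block of cross-thread registers plus per-thread registers for each
       * HW thread, rounded up to an even register count.
       */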
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);

   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
      curbe.CURBETotalDataLength = push_const_size;
      curbe.CURBEDataStartAddress = push_const_offset;
   }

   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = params->cs_prog_kernel,
      .SamplerStatePointer = samplers_offset,
      .SamplerCount = params->src.enabled ? 1 : 0,
      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
   };

   uint32_t idd_offset;
   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
   if (state == NULL)
      return;
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);

   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
      mid.InterfaceDescriptorTotalLength = size;
      mid.InterfaceDescriptorDataStartAddress = idd_offset;
   }

   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDStartingX = group_x0;
      ggw.ThreadGroupIDStartingY = group_y0;
      ggw.ThreadGroupIDStartingResumeZ = group_z0;
      ggw.ThreadGroupIDXDimension = group_x1;
      ggw.ThreadGroupIDYDimension = group_y1;
      ggw.ThreadGroupIDZDimension = group_z1;
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

#endif

   blorp_measure_end(batch, params);
}

/* -----------------------------------------------------------------------
 * -- BLORP on blitter
 * -----------------------------------------------------------------------
 */

#include "isl/isl_genX_helpers.h"

#if GFX_VER >= 12
static uint32_t
xy_bcb_tiling(const struct isl_surf *surf)
{
   switch (surf->tiling) {
   case ISL_TILING_LINEAR:
      return XY_TILE_LINEAR;
#if GFX_VERx10 >= 125
   case ISL_TILING_X:
      return XY_TILE_X;
   case ISL_TILING_4:
      return XY_TILE_4;
   case ISL_TILING_64:
   case ISL_TILING_64_XE2:
      return XY_TILE_64;
#else
   case ISL_TILING_Y0:
      return XY_TILE_Y;
#endif
   default:
      unreachable("Invalid tiling for XY_BLOCK_COPY_BLT");
   }
}

static uint32_t
xy_color_depth(const struct isl_format_layout *fmtl)
{
   switch (fmtl->bpb) {
   case 128: return XY_BPP_128_BIT;
   case 96: return XY_BPP_96_BIT;
   case 64: return XY_BPP_64_BIT;
   case 32: return XY_BPP_32_BIT;
   case 16: return XY_BPP_16_BIT;
   case 8: return XY_BPP_8_BIT;
   default:
      unreachable("Invalid bpp");
   }
}
#endif

#if GFX_VERx10 >= 125
static uint32_t
xy_bcb_surf_dim(const struct isl_surf *surf)
{
   switch (surf->dim) {
   case ISL_SURF_DIM_1D:
      /* An undocumented assertion in simulation is that 1D surfaces must use
       * LINEAR tiling. That doesn't work in practice, so instead treat tiled
       * 1D surfaces as 2D surfaces with Height = 1.
       */
      return surf->tiling != ISL_TILING_LINEAR ? XY_SURFTYPE_2D : XY_SURFTYPE_1D;
   case ISL_SURF_DIM_2D:
      return XY_SURFTYPE_2D;
   case ISL_SURF_DIM_3D:
      return XY_SURFTYPE_3D;
   default:
      unreachable("Invalid dimensionality for XY_BLOCK_COPY_BLT");
   }
}

static uint32_t
xy_bcb_surf_depth(const struct isl_surf *surf)
{
   return surf->dim == ISL_SURF_DIM_3D ? surf->logical_level0_px.depth
                                       : surf->logical_level0_px.array_len;
}

#if GFX_VER < 20
static uint32_t
xy_aux_mode(const struct blorp_surface_info *info)
{
   switch (info->aux_usage) {
   case ISL_AUX_USAGE_CCS_E:
   case ISL_AUX_USAGE_FCV_CCS_E:
   case ISL_AUX_USAGE_STC_CCS:
      return XY_CCS_E;
   case ISL_AUX_USAGE_NONE:
      return XY_NONE;
   default:
      unreachable("Unsupported aux mode");
   }
}
#endif // GFX_VER < 20
#endif // GFX_VERx10 >= 125

UNUSED static void
blorp_xy_block_copy_blt(struct blorp_batch *batch,
                        const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(params->src.view.levels == 1);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

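   /* The blitter path only handles pure copies, so wm_inputs.coord_transform
    * reduces to a per-axis translation; subtracting the offset from the
    * destination rectangle recovers the source rectangle.
    */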
   unsigned dst_x0 = params->x0;
   unsigned dst_x1 = params->x1;
   unsigned src_x0 =
      dst_x0 - params->wm_inputs.coord_transform[0].offset;
   ASSERTED unsigned src_x1 =
      dst_x1 - params->wm_inputs.coord_transform[0].offset;
   unsigned dst_y0 = params->y0;
   unsigned dst_y1 = params->y1;
   unsigned src_y0 =
      dst_y0 - params->wm_inputs.coord_transform[1].offset;
   ASSERTED unsigned src_y1 =
      dst_y1 - params->wm_inputs.coord_transform[1].offset;

   assert(src_x1 - src_x0 == dst_x1 - dst_x0);
   assert(src_y1 - src_y0 == dst_y1 - dst_y0);

   const struct isl_surf *src_surf = &params->src.surf;
   const struct isl_surf *dst_surf = &params->dst.surf;

   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   if (fmtl->bpb == 96) {
      assert(src_surf->tiling == ISL_TILING_LINEAR &&
             dst_surf->tiling == ISL_TILING_LINEAR);
   }

   assert(src_surf->samples == 1);
   assert(dst_surf->samples == 1);

   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
   unsigned src_pitch_unit = src_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
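   /* The pitch fields below are programmed in bytes for linear surfaces and
    * in 4-byte units for tiled surfaces, hence the pitch units above.
    */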

#if GFX_VERx10 >= 125
   struct isl_extent3d src_align = isl_get_image_alignment(src_surf);
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

   blorp_emit(batch, GENX(XY_BLOCK_COPY_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationMOCS = params->dst.addr.mocs;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = dst_x0;
      blt.DestinationY1 = dst_y0;
      blt.DestinationX2 = dst_x1;
      blt.DestinationY2 = dst_y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
      }
#endif

      blt.SourceX1 = src_x0;
      blt.SourceY1 = src_y0;
      blt.SourcePitch = (src_surf->row_pitch_B / src_pitch_unit) - 1;
      blt.SourceMOCS = params->src.addr.mocs;
      blt.SourceTiling = xy_bcb_tiling(src_surf);
      blt.SourceBaseAddress = params->src.addr;
      blt.SourceXOffset = params->src.tile_x_sa;
      blt.SourceYOffset = params->src.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.SourceSurfaceType = xy_bcb_surf_dim(src_surf);
      blt.SourceSurfaceWidth = src_surf->logical_level0_px.w - 1;
      blt.SourceSurfaceHeight = src_surf->logical_level0_px.h - 1;
      blt.SourceSurfaceDepth = xy_bcb_surf_depth(src_surf) - 1;
      blt.SourceArrayIndex =
         params->src.view.base_array_layer + params->src.z_offset;
      blt.SourceSurfaceQPitch = isl_get_qpitch(src_surf) >> 2;
      blt.SourceLOD = params->src.view.base_level;
      blt.SourceMipTailStartLOD = src_surf->miptail_start_level;
      blt.SourceHorizontalAlign = isl_encode_halign(src_align.width);
      blt.SourceVerticalAlign = isl_encode_valign(src_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.SourceDepthStencilResource =
         params->src.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.SourceTargetMemory =
         params->src.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->src.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.SourceAuxiliarySurfaceMode = xy_aux_mode(&params->src);
         blt.SourceCompressionEnable = true;
#endif
         blt.SourceCompressionFormat =
            isl_get_render_compression_format(src_surf->format);
         blt.SourceClearValueEnable = !!params->src.clear_color_addr.buffer;
         blt.SourceClearAddress = params->src.clear_color_addr;
      }
#endif
   }
#endif
}

UNUSED static void
blorp_xy_fast_color_blit(struct blorp_batch *batch,
                         const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct isl_surf *dst_surf = &params->dst.surf;
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(dst_surf->samples == 1);
   assert(fmtl->bpb != 96 || dst_surf->tiling == ISL_TILING_LINEAR);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

#if INTEL_NEEDS_WA_16021021469
   assert(fmtl->bpb != 96);
#endif

   blorp_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = params->x0;
      blt.DestinationY1 = params->y0;
      blt.DestinationX2 = params->x1;
      blt.DestinationY2 = params->y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

      isl_color_value_pack((union isl_color_value *)
                           params->wm_inputs.clear_color,
                           params->dst.view.format, blt.FillColor);

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
      /* XY_FAST_COLOR_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VERx10 == 125
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
      }

      blt.DestinationMOCS = params->dst.addr.mocs;
#endif
   }
#endif
}

static void
blorp_exec_blitter(struct blorp_batch *batch,
                   const struct blorp_params *params)
{
   blorp_measure_start(batch, params);

   if (params->src.enabled)
      blorp_xy_block_copy_blt(batch, params);
   else
      blorp_xy_fast_color_blit(batch, params);

   blorp_measure_end(batch, params);
}

/**
 * \brief Execute a blit or render pass operation.
 *
 * To execute the operation, this function manually constructs and emits a
 * batch to draw a rectangle primitive. The batchbuffer is flushed before
 * constructing and after emitting the batch.
 *
 * This function alters no GL state.
 */
static void
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
{
   if (batch->flags & BLORP_BATCH_USE_BLITTER) {
      blorp_exec_blitter(batch, params);
   } else if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
      blorp_exec_compute(batch, params);
   } else {
      blorp_exec_3d(batch, params);
   }
}

static void
blorp_init_dynamic_states(struct blorp_context *context)
{
   {
      struct GENX(BLEND_STATE) blend = { };

      uint32_t dws[GENX(BLEND_STATE_length) * 4 +
                   GENX(BLEND_STATE_ENTRY_length) * 4 * 8 /* MAX_RTS */];
      uint32_t *pos = dws;
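      /* BLEND_STATE is followed contiguously in memory by one
       * BLEND_STATE_ENTRY per render target, so pack the header first and
       * then eight clamp-only entries.
       */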

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < 8; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      context->upload_dynamic_state(context, dws, sizeof(dws), 64,
                                    BLORP_DYNAMIC_STATE_BLEND);
   }

   blorp_context_upload_dynamic(context, GENX(CC_VIEWPORT), vp, 32,
                                BLORP_DYNAMIC_STATE_CC_VIEWPORT) {
      vp.MinimumDepth = context->config.use_unrestricted_depth_range ?
                        -FLT_MAX : 0.0;
      vp.MaximumDepth = context->config.use_unrestricted_depth_range ?
                        FLT_MAX : 1.0;
   }

   blorp_context_upload_dynamic(context, GENX(COLOR_CALC_STATE), cc, 64,
                                BLORP_DYNAMIC_STATE_COLOR_CALC) {
      /* Nothing */
   }

   blorp_context_upload_dynamic(context, GENX(SAMPLER_STATE), sampler, 32,
                                BLORP_DYNAMIC_STATE_SAMPLER) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }
}

#endif /* BLORP_GENX_EXEC_BRW_H */