• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #ifndef BLORP_GENX_EXEC_BRW_H
25 #define BLORP_GENX_EXEC_BRW_H
26 
27 #include "blorp_priv.h"
28 #include "dev/intel_device_info.h"
29 #include "common/intel_compute_slm.h"
30 #include "common/intel_sample_positions.h"
31 #include "common/intel_l3_config.h"
32 #include "genxml/gen_macros.h"
33 #include "intel/compiler/brw_compiler.h"
34 
35 /**
36  * This file provides the blorp pipeline setup and execution functionality.
37  * It defines the following function:
38  *
39  * static void
40  * blorp_exec(struct blorp_context *blorp, void *batch_data,
41  *            const struct blorp_params *params);
42  *
43  * It is the job of whoever includes this header to wrap this in something
44  * to get an externally visible symbol.
45  *
46  * In order for the blorp_exec function to work, the driver must provide
47  * implementations of the following static helper functions.
48  */
49 
/* Allocate `n` dwords of batch command space and return a CPU pointer to
 * them, or NULL on failure.
 */
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

/* Record a relocation for `address` + `delta` at `location` in the batch
 * and return the value to be written there.
 */
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta);

/* Driver hooks bracketing a blorp operation (e.g. for perf measurement). */
static void
blorp_measure_start(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_measure_end(struct blorp_batch *batch,
                  const struct blorp_params *params);

/* Allocate `size` bytes of dynamic state with the given alignment.
 * Returns a CPU mapping and stores the state offset in `*offset`; may
 * return NULL on failure.
 */
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Like blorp_alloc_dynamic_state(), but from the general state pool. */
UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Return the offset of a driver-cached piece of dynamic state (see the
 * BLORP_DYNAMIC_STATE_* enum values used below).
 */
static uint32_t
blorp_get_dynamic_state(struct blorp_batch *batch,
                        enum blorp_dynamic_state name);

/* Allocate `size` bytes of vertex-buffer memory; fills `*addr` with its
 * GPU address and returns the CPU mapping, or NULL on failure.
 */
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr);

/* Workaround hook invoked with all vertex-buffer addresses/sizes before
 * 3DSTATE_VERTEX_BUFFERS is emitted (48-bit VB address transitions).
 */
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           uint32_t *sizes,
                                           unsigned num_vbs);

/* Scratch address the driver provides for workaround writes. */
UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch);

/* Allocate a binding table with `num_entries` surface states.  Returns
 * false on allocation failure.
 */
static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps);

/* Convert a binding-table allocation offset into the pointer value that
 * gets programmed into the 3D pipeline binding-table state.
 */
static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset);

/* Make CPU writes in [start, start + size) visible to the GPU. */
static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

/* Record a relocation for the surface-state entry at `ss_offset`. */
static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta);

/* Resolve `address` to the 64-bit GPU address it will have at execution. */
static uint64_t
blorp_get_surface_address(struct blorp_batch *batch,
                          struct blorp_address address);

#if GFX_VER < 10
/* Base address that surface-state offsets are relative to (pre-Gfx10). */
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

/* L3 cache configuration currently in effect for this batch. */
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch);

/* Driver notification hooks around URB reconfiguration. */
static void
blorp_pre_emit_urb_config(struct blorp_batch *batch,
                          struct intel_urb_config *urb_config);

static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      struct intel_urb_config *urb_config);

/* Driver hook to emit any additional pipeline state it requires. */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);

/* Driver hooks emitted immediately before/after the draw call. */
static void
blorp_emit_pre_draw(struct blorp_batch *batch,
                    const struct blorp_params *params);
static void
blorp_emit_post_draw(struct blorp_batch *batch,
                     const struct blorp_params *params);
140 
141 static inline unsigned
brw_blorp_get_urb_length(const struct brw_wm_prog_data * prog_data)142 brw_blorp_get_urb_length(const struct brw_wm_prog_data *prog_data)
143 {
144    if (prog_data == NULL)
145       return 1;
146 
147    /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
148     *
149     * read_length = ceiling((max_source_attr+1)/2)
150     */
151    return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
152 }
153 
154 /***** BEGIN blorp_exec implementation ******/
155 
156 static uint64_t
_blorp_combine_address(struct blorp_batch * batch,void * location,struct blorp_address address,uint32_t delta)157 _blorp_combine_address(struct blorp_batch *batch, void *location,
158                        struct blorp_address address, uint32_t delta)
159 {
160    if (address.buffer == NULL) {
161       return address.offset + delta;
162    } else {
163       return blorp_emit_reloc(batch, location, address, delta);
164    }
165 }
166 
/* Wire up the genxml pack helpers to blorp's address/batch types. */
#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address

#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"

/* Token-pasting helpers that look up the genxml-generated length, bias,
 * header, and pack function for a given command/state struct.
 */
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emit a fixed-length command into the batch.  Used as:
 *
 *    blorp_emit(batch, GENX(CMD), name) {
 *       name.Field = value;
 *    }
 *
 * The for-loop runs the attached block exactly once with `name` in scope
 * and packs the struct into the allocated batch space on loop exit.
 */
#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

/* Emit a variable-length command of `n` dwords.  Evaluates to a pointer to
 * the payload after the packed fixed part (dw[1]), or NULL when the batch
 * allocation failed.
 */
#define blorp_emitn(batch, cmd, n, ...) ({                  \
      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
      if (_dw) {                                            \
         struct cmd template = {                            \
            _blorp_cmd_header(cmd),                         \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__                                     \
         };                                                 \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
      }                                                     \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
   })

/* Statement expression yielding a zero-filled `struct S` value, usable as
 * an initializer.
 */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Pack `state` and hand the dwords to the context's upload_dynamic_state
 * hook rather than writing them into this batch's dynamic state stream.
 */
#define blorp_context_upload_dynamic(context, state, name,              \
                                     align, dynamic_name)               \
   for (struct state name = STRUCT_ZERO(state), *_dst = &name;          \
        _dst != NULL;                                                   \
        ({                                                              \
           uint32_t _dw[_blorp_cmd_length(state)];                      \
           _blorp_cmd_pack(state)(NULL, (void *)_dw, &name);            \
           context->upload_dynamic_state(context, _dw,                  \
                                         _blorp_cmd_length(state) * 4,  \
                                         align, dynamic_name);          \
           _dst = NULL;                                                 \
        }))

/* Pack `state` into freshly allocated dynamic state (offset returned via
 * `offset`) and flush the written range for GPU visibility.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset)           \
   for (struct state name = STRUCT_ZERO(state),                         \
        *_dst = blorp_alloc_dynamic_state(batch,                   \
                                          _blorp_cmd_length(state) * 4, \
                                          align, offset);               \
        __builtin_expect(_dst != NULL, 1);                              \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),             \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),   \
        _dst = NULL)
223 
224 /* 3DSTATE_URB
225  * 3DSTATE_URB_VS
226  * 3DSTATE_URB_HS
227  * 3DSTATE_URB_DS
228  * 3DSTATE_URB_GS
229  *
 * Assign the entire URB to the VS. Even though the VS is disabled, URB space
231  * is still needed because the clipper loads the VUE's from the URB. From
232  * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
233  * Dword 1.15:0 "VS Number of URB Entries":
234  *     This field is always used (even if VS Function Enable is DISABLED).
235  *
236  * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
237  * safely ignore it because this batch contains only one draw call.
238  *     Because of URB corruption caused by allocating a previous GS unit
239  *     URB entry to the VS unit, software is required to send a “GS NULL
240  *     Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
241  *     plus a dummy DRAW call before any case where VS will be taking over
242  *     GS URB space.
243  *
 * If the 3DSTATE_URB_VS is emitted, then the others must be also.
245  * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
246  *
247  *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
248  *     programmed in order for the programming of this state to be
249  *     valid.
250  */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once vertex fetcher has written full VUE entries with complete
    * header the space requirement is as follows per vertex (in bytes):
    *
    *     Header    Position    Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |      n x 16       |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for number of varying inputs expressed as vec4s.
    */
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);

   /* Size the VS entries for our needs; the other stages get the minimum
    * (entry allocation sizes are programmed as size - 1 below).
    */
   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 1, 1, 1 },
   };

   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->brw->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, &urb_cfg,
                        deref_block_size, &constrained);

   /* Tell drivers about the config. */
   blorp_pre_emit_urb_config(batch, &urb_cfg);

   /* Emit 3DSTATE_URB_{VS,HS,DS,GS} (the _ALLOC_ variants on Gfx12+).  The
    * stages' sub-opcodes are consecutive, so the VS packet layout is reused
    * for all four by bumping _3DCommandSubOpcode.
    */
   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
#if GFX_VER >= 12
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_VS), urb) {
         urb._3DCommandSubOpcode            += i;
         urb.VSURBEntryAllocationSize        = urb_cfg.size[i] - 1;
         urb.VSURBStartingAddressSlice0      = urb_cfg.start[i];
         urb.VSURBStartingAddressSliceN      = urb_cfg.start[i];
         urb.VSNumberofURBEntriesSlice0      = urb_cfg.entries[i];
         urb.VSNumberofURBEntriesSliceN      = urb_cfg.entries[i];
      }
#else
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode      += i;
         urb.VSURBStartingAddress      = urb_cfg.start[i];
         urb.VSURBEntryAllocationSize  = urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries      = urb_cfg.entries[i];
      }
#endif
   }

   /* Emit zeroed MESH/TASK URB allocations when the driver uses mesh
    * shading, so those stages get no URB space for blorp's draw.
    */
   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
#endif
   }
}
314 
/* Emit GPU commands that copy `size` bytes from `src` to `dst`. */
static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size);
320 
321 static void
blorp_emit_vertex_data(struct blorp_batch * batch,const struct blorp_params * params,struct blorp_address * addr,uint32_t * size)322 blorp_emit_vertex_data(struct blorp_batch *batch,
323                        const struct blorp_params *params,
324                        struct blorp_address *addr,
325                        uint32_t *size)
326 {
327    const float vertices[] = {
328       /* v0 */ (float)params->x1, (float)params->y1, params->z,
329       /* v1 */ (float)params->x0, (float)params->y1, params->z,
330       /* v2 */ (float)params->x0, (float)params->y0, params->z,
331    };
332 
333    void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
334    if (data == NULL)
335       return;
336    memcpy(data, vertices, sizeof(vertices));
337    *size = sizeof(vertices);
338    blorp_flush_range(batch, data, *size);
339 }
340 
/* Build the vertex buffer carrying the flat (non-interpolated) inputs: a
 * 16-byte header of VS inputs followed by one vec4 per varying slot the
 * WM program actually reads.  `*size` is always set; the buffer contents
 * are only written when the allocation succeeds.
 */
static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;

   /* 16-byte VS-input header plus one vec4 per used varying. */
   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   if (data == NULL)
      return;
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used by
       * the program and when necessary copy the values from the input storage
       * to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         /* Negative urb_setup entries mean the slot is unused. */
         const int input_index = wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE.  We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in the
       * loop above.  Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      const unsigned clear_color_size =
         GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
   }
}
408 
409 static void
blorp_fill_vertex_buffer_state(struct GENX (VERTEX_BUFFER_STATE)* vb,unsigned idx,struct blorp_address addr,uint32_t size,uint32_t stride)410 blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
411                                unsigned idx,
412                                struct blorp_address addr, uint32_t size,
413                                uint32_t stride)
414 {
415    vb[idx].VertexBufferIndex = idx;
416    vb[idx].BufferStartingAddress = addr;
417    vb[idx].BufferPitch = stride;
418    vb[idx].MOCS = addr.mocs;
419    vb[idx].AddressModifyEnable = true;
420    vb[idx].BufferSize = size;
421 
422 #if GFX_VER >= 12
423    vb[idx].L3BypassDisable = true;
424 #endif
425 }
426 
427 static void
blorp_emit_vertex_buffers(struct blorp_batch * batch,const struct blorp_params * params)428 blorp_emit_vertex_buffers(struct blorp_batch *batch,
429                           const struct blorp_params *params)
430 {
431    struct GENX(VERTEX_BUFFER_STATE) vb[2] = {};
432    const uint32_t num_vbs = ARRAY_SIZE(vb);
433 
434    struct blorp_address addrs[2] = {};
435    uint32_t sizes[2] = {};
436    blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
437    if (sizes[0] == 0)
438       return;
439    blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
440                                   3 * sizeof(float));
441 
442    blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
443    blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);
444 
445    blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);
446 
447    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
448    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
449    if (!dw)
450       return;
451 
452    for (unsigned i = 0; i < num_vbs; i++) {
453       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
454       dw += GENX(VERTEX_BUFFER_STATE_length);
455    }
456 }
457 
/* Program the vertex-element layout and VF state for the RECTLIST draw:
 * element 0 is the constant VUE header seed, element 1 the position, and
 * one element per flat varying follows.  Also emits VF_SGVS (instance ID
 * into the RTAI slot), per-element instancing, and the topology.
 */
static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   /* Header element + position element + one per varying. */
   const unsigned num_elements = 2 + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Setup VBO for the rectangle primitive..
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. Below vertex fetcher gets programmed
    *        to assign this with primitive instance identifier which will be
    *        used for layered clears. All other renders have only one instance
    *        and therefore the value will be effectively zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable, Z is fixed to
    * zero and W to one. Header words dw0,2,3 are zero. There is no need to
    * include the fixed values in the vertex buffer. Vertex fetcher can be
    * instructed to fill vertex elements with constant values of one and zero
    * instead of reading them from the buffer.
    * Flat inputs are program constants that are not interpolated. Moreover
    * their values will be the same between vertices.
    *
    * See the vertex element setup below.
    */
   unsigned slot = 0;

   /* Element 0: VUE header, stored as constant zeros (dw1 is overwritten
    * with the instance ID via 3DSTATE_VF_SGVS below).
    */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards hardware is no more instructed to overwrite
       * components using an element specifier. Instead one has separate
       * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
       */
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   slot++;

   /* Element 1: vertex position (x, y, z) from VB 0; w forced to 1.0. */
   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
   };
   slot++;

   /* One vec4 element per flat varying, read from VB 1 past its 16-byte
    * header (matches the layout written by blorp_emit_input_varying_data).
    */
   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

   /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
    * primitive instance identifier. This is used for layered clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

#if GFX_VER >= 11
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   /* No element is instanced; each vertex fetches its own data. */
   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
}
600 
601 /* 3DSTATE_VIEWPORT_STATE_POINTERS */
602 static uint32_t
blorp_emit_cc_viewport(struct blorp_batch * batch)603 blorp_emit_cc_viewport(struct blorp_batch *batch)
604 {
605    uint32_t cc_vp_offset;
606 
607    /* Somehow reusing CC_VIEWPORT on Gfx9 is causing issues :
608     *    https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
609     */
610    if (GFX_VER != 9 && batch->blorp->config.use_cached_dynamic_states) {
611       cc_vp_offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_CC_VIEWPORT);
612    } else {
613       blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
614          vp.MinimumDepth = batch->blorp->config.use_unrestricted_depth_range ?
615                            -FLT_MAX : 0.0;
616          vp.MaximumDepth = batch->blorp->config.use_unrestricted_depth_range ?
617                            FLT_MAX : 1.0;
618       }
619    }
620 
621    blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
622       vsp.CCViewportPointer = cc_vp_offset;
623    }
624 
625    return cc_vp_offset;
626 }
627 
628 static uint32_t
blorp_emit_sampler_state(struct blorp_batch * batch)629 blorp_emit_sampler_state(struct blorp_batch *batch)
630 {
631    uint32_t offset;
632    blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
633       sampler.MipModeFilter = MIPFILTER_NONE;
634       sampler.MagModeFilter = MAPFILTER_LINEAR;
635       sampler.MinModeFilter = MAPFILTER_LINEAR;
636       sampler.MinLOD = 0;
637       sampler.MaxLOD = 0;
638       sampler.TCXAddressControlMode = TCM_CLAMP;
639       sampler.TCYAddressControlMode = TCM_CLAMP;
640       sampler.TCZAddressControlMode = TCM_CLAMP;
641       sampler.MaximumAnisotropy = RATIO21;
642       sampler.RAddressMinFilterRoundingEnable = true;
643       sampler.RAddressMagFilterRoundingEnable = true;
644       sampler.VAddressMinFilterRoundingEnable = true;
645       sampler.VAddressMagFilterRoundingEnable = true;
646       sampler.UAddressMinFilterRoundingEnable = true;
647       sampler.UAddressMagFilterRoundingEnable = true;
648       sampler.NonnormalizedCoordinateEnable = true;
649    }
650 
651    return offset;
652 }
653 
654 UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch * batch)655 blorp_emit_sampler_state_ps(struct blorp_batch *batch)
656 {
657    uint32_t offset = batch->blorp->config.use_cached_dynamic_states ?
658       blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_SAMPLER) :
659       blorp_emit_sampler_state(batch);
660 
661    blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
662       ssp.PointertoPSSamplerState = offset;
663    }
664 
665    return offset;
666 }
667 
668 /* What follows is the code for setting up a "pipeline". */
669 
/* Program 3DSTATE_VS.  The packet is left disabled (all-zero fields)
 * unless params->vs_prog_data supplies a vertex shader.
 */
static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
   /* Gfx11+ VS programs used here must be SIMD8. */
   assert(!vs_prog_data || GFX_VER < 11 ||
          vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

         /* Only SIMD8 dispatch is supported; pre-Gfx20 requires the
          * explicit enable bit.
          */
         assert(vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
#if GFX_VER < 20
         vs.SIMD8DispatchEnable = true;
#endif

#if GFX_VER >= 30
         vs.RegistersPerThread = ptl_register_blocks(vs_prog_data->base.base.grf_used);
#endif
      }
   }
}
704 
/* Program the SF/raster/SBE stages: viewport transform off, no culling,
 * and SBE forwarding the flat varyings (if any) to the fragment shader.
 */
static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
    *     use of screen-space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *     SOLID: Any triangle or rectangle object found to be front-facing
    *     is rendered as a solid object. This setting is required when
    *     rendering rectangle (RECTLIST) objects.
    */

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      /* Skip the VUE header; attributes start one pair of slots in. */
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      /* All attribute slots forward full XYZW components. */
      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }
}
758 
/* Emit all pixel-shader related pipeline state: 3DSTATE_WM, 3DSTATE_PS and
 * 3DSTATE_PS_EXTRA.  The state is fully programmed whether or not
 * params->wm_prog_data is present (see the comment below about max threads).
 */
static void
blorp_emit_ps_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
    * nonzero to prevent the GPU from hanging.  While the documentation doesn't
    * mention this explicitly, it notes that the valid range for the field is
    * [1,39] = [2,40] threads, which excludes zero.
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;

   /* 3DSTATE_WM with every field at its default (zero) value. */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      /* With a source surface there are two binding-table entries (render
       * target + texture) and one sampler; otherwise just the render target.
       */
      if (params->src.enabled) {
         ps.SamplerCount = 1; /* Up to 4 samplers */
         ps.BindingTableEntryCount = 2;
      } else {
         ps.BindingTableEntryCount = 1;
      }

      /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
      if (GFX_VER == 11)
         ps.SamplerCount = 0;

      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
       * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
       * k, it implies 2(k+1) threads. It implicitly scales for different GT
       * levels (which have some # of PSDs).
       */
      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;

      /* Program fast-clear / resolve controls from the requested aux op.
       * The resolve enums only exist on GFX_VER < 20.
       */
      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
#if GFX_VER < 20
#if GFX_VER >= 10
      case ISL_AUX_OP_AMBIGUATE:
         ps.RenderTargetFastClearEnable = true;
         ps.RenderTargetResolveType = FAST_CLEAR_0;
         break;
#endif /* GFX_VER >= 10 */
      case ISL_AUX_OP_PARTIAL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_PARTIAL;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_FULL;
         break;
#endif /* GFX_VER < 20 */
      case ISL_AUX_OP_FAST_CLEAR:
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }

#if GFX_VERx10 == 120
      /* The 3DSTATE_PS_BODY page for TGL says:
       *
       *   3D/Volumetric surfaces do not support Fast Clear operation.
       *
       *   [...]
       *
       *   3D/Volumetric surfaces do not support in-place resolve pass
       *   operation.
       *
       * HSD 1406738321 suggests a more limited scope of restrictions, but
       * there should be no harm in complying with the Bspec restrictions.
       */
      if (params->dst.surf.dim == ISL_SURF_DIM_3D)
         assert(params->fast_clear_op == ISL_AUX_OP_NONE);

      /* The RENDER_SURFACE_STATE page for TGL says:
       *
       *   For an 8 bpp surface with NUM_MULTISAMPLES = 1, Surface Width not
       *   multiple of 64 pixels and more than 1 mip level in the view, Fast
       *   Clear is not supported when AUX_CCS_E is set in this field.
       *
       * The granularity of a fast-clear or ambiguate operation is likely one
       * CCS element. For an 8 bpp primary surface, this maps to 32px x 4rows.
       * Due to the surface layout parameters, if LOD0's width isn't a
       * multiple of 64px, LOD1 and LOD2+ will share CCS elements. Assert that
       * these operations aren't occurring on these LODs.
       */
      if (isl_format_get_layout(params->dst.surf.format)->bpb == 8 &&
          params->dst.surf.logical_level0_px.width % 64 != 0 &&
          params->dst.surf.levels >= 3 &&
          params->dst.view.base_level >= 1) {
         assert(params->num_samples == 1);
         assert(!ps.RenderTargetFastClearEnable);
      }

      /* From the TGL BSpec 44930 (r47128):
       *
       *   Compression of 3D Ys surfaces with 64 or 128 bpp is not supported
       *   in Gen12. Moreover, "Render Target Fast-clear Enable" command is
       *   not supported for any 3D Ys surfaces. except when Surface is a
       *   Procdural Texture.
       *
       * It's not clear where the exception applies, but either way, we don't
       * support Procedural Textures.
       */
      if (params->dst.surf.dim == ISL_SURF_DIM_3D &&
          params->dst.surf.tiling == ISL_TILING_ICL_Ys &&
          isl_format_get_layout(params->dst.surf.format)->bpb >= 64) {
         assert(params->dst.aux_usage != ISL_AUX_USAGE_CCS_D);
         assert(!ps.RenderTargetFastClearEnable);
      }
#endif

      if (prog_data) {
         /* Pick SIMD8/16/32 dispatch and then program the per-dispatch GRF
          * start registers and kernel entry points.  Dispatch slot 2 does
          * not exist on Xe2+ (GFX_VER >= 20).
          */
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
#if GFX_VER < 20
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
#endif

         ps.KernelStartPointer0 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
#if GFX_VER < 20
         ps.KernelStartPointer2 = params->wm_prog_kernel +
                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
#endif

#if GFX_VER >= 30
         ps.RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used);
#endif
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      /* Blit shaders that sample may discard pixels (e.g. for clipping). */
      if (params->src.enabled)
         psx.PixelShaderKillsPixel = true;

      if (prog_data) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
         psx.PixelShaderComputesStencil = prog_data->computed_stencil;
         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;

#if INTEL_WA_18038825448_GFX_VER
         psx.EnablePSDependencyOnCPsizeChange =
            batch->flags & BLORP_BATCH_FORCE_CPS_DEPENDENCY;
#endif

#if GFX_VER < 20
         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
#else
         /* Bspec 57340 (r59562):
          *
          *   For MSAA fast clear, it (clear shader) must be in per-pixel
          *   dispatch mode.
          *
          * Bspec 56424 (r58933):
          *
          *   Bit 6 of Bit Group 0: Pixel Shader Is Per Sample
          *   If this bit is DISABLED, the dispatch rate is determined by the
          *   value of Pixel Shader Is Per Coarse Pixel.
          *
          *   Bit 4 of Bit Group 0: Pixel Shader Is Per Coarse Pixel
          *   If Pixel Shader Is Per Sample is DISABLED and this bit is
          *   DISABLED, the pixel shader is dispatched at the per pixel
          *   shading rate.
          *
          * The below assertion ensures the MSAA clear shader is in per-pixel
          * dispatch mode.
          */
         if (params->fast_clear_op == ISL_AUX_OP_FAST_CLEAR &&
             params->num_samples > 1) {
            assert(!psx.PixelShaderIsPerSample &&
                   !psx.PixelShaderIsPerCoarsePixel);
         }
#endif
      }
   }
}
950 
/* Emit BLEND_STATE (freshly packed, or reused from the driver's cached
 * dynamic-state pool) plus 3DSTATE_BLEND_STATE_POINTERS and 3DSTATE_PS_BLEND.
 * Blending is left disabled; only per-channel write masks are programmed
 * from params->color_write_disable.
 */
static void
blorp_emit_blend_state(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   uint32_t offset;
   if (!batch->blorp->config.use_cached_dynamic_states) {
      /* All-default BLEND_STATE header (no blending, no logic op). */
      struct GENX(BLEND_STATE) blend = { };

      /* Fixed allocation: header plus one entry per draw buffer.  96 bytes
       * is presumably sized for the maximum draw-buffer count — confirm
       * against GENX(BLEND_STATE*_length) if num_draw_buffers grows.
       */
      const unsigned size = 96;
      uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
      if (state == NULL)
         return;
      uint32_t *pos = state;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,

            /* One bit of color_write_disable per channel (R,G,B,A). */
            .WriteDisableRed = params->color_write_disable & 1,
            .WriteDisableGreen = params->color_write_disable & 2,
            .WriteDisableBlue = params->color_write_disable & 4,
            .WriteDisableAlpha = params->color_write_disable & 8,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      blorp_flush_range(batch, state, size);
   } else {
      /* We only cached this case. */
      assert(params->color_write_disable == 0);
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_BLEND);
   }

   blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
      sp.BlendStatePointer = offset;
      sp.BlendStatePointerValid = true;
   }

   blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }
}
999 
/* Emit an all-default COLOR_CALC_STATE (or reuse the driver's cached copy)
 * and point the pipeline at it via 3DSTATE_CC_STATE_POINTERS.
 */
static void
blorp_emit_color_calc_state(struct blorp_batch *batch,
                            UNUSED const struct blorp_params *params)
{
   uint32_t offset;

   if (batch->blorp->config.use_cached_dynamic_states)
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_COLOR_CALC);
   else
      blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {}

   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
      sp.ColorCalcStatePointer = offset;
      sp.ColorCalcStatePointerValid = true;
   }
}
1016 
/* Emit 3DSTATE_WM_DEPTH_STENCIL configured for the requested blorp
 * operation, and (Gfx12+) disable depth-bounds testing.
 */
static void
blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
      if (params->depth.enabled) {
         ds.DepthBufferWriteEnable = true;

         switch (params->hiz_op) {
         /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
          *   - 7.5.3.1 Depth Buffer Clear
          *   - 7.5.3.2 Depth Buffer Resolve
          *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
          */
         case ISL_AUX_OP_FULL_RESOLVE:
            /* Depth-test with COMPAREFUNCTION_NEVER per the resolve
             * programming sequence referenced above.
             */
            ds.DepthTestEnable = true;
            ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
            break;

         case ISL_AUX_OP_NONE:
         case ISL_AUX_OP_FAST_CLEAR:
         case ISL_AUX_OP_AMBIGUATE:
            ds.DepthTestEnable = false;
            break;
         case ISL_AUX_OP_PARTIAL_RESOLVE:
            unreachable("Invalid HIZ op");
         }
      }

      if (params->stencil.enabled) {
         /* Unconditionally replace stencil with the reference value. */
         ds.StencilBufferWriteEnable = true;
         ds.StencilTestEnable = true;
         ds.DoubleSidedStencilEnable = false;

         ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
         ds.StencilPassDepthPassOp = STENCILOP_REPLACE;

         ds.StencilWriteMask = params->stencil_mask;
         ds.StencilReferenceValue = params->stencil_ref;
      }
   }

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
      db.DepthBoundsTestEnable = false;
      db.DepthBoundsTestMinValue = 0.0;
      db.DepthBoundsTestMaxValue = 1.0;
   }
#endif
}
1067 
/* Emit 3DSTATE_MULTISAMPLE for params->num_samples with centered pixel
 * location.
 */
static void
blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      /* ffs(n) - 1 == log2(n) for the power-of-two sample counts used here */
      ms.NumberofMultisamples       = __builtin_ffs(params->num_samples) - 1;
      ms.PixelLocation              = CENTER;
   }
}
1077 
/* Emit the full 3D pipeline state required for a blorp rectangle draw:
 * URB config, blend/CC/depth-stencil state, empty push constants, samplers,
 * multisample state, all shader stages (only VS/PS active), clip/SF/raster
 * and the viewport.  Emission order follows hardware requirements.
 */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   /* Blend state is only needed when a fragment shader writes color. */
   if (params->wm_prog_data) {
      blorp_emit_blend_state(batch, params);
   }
   blorp_emit_color_calc_state(batch, params);
   blorp_emit_depth_stencil_state(batch, params);

   UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      pc.ShaderUpdateEnable = 0x1f;
      pc.MOCS = mocs;
   }
#else
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { xs.MOCS = mocs; }
#endif

   if (params->src.enabled)
      blorp_emit_sampler_state_ps(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      /* Enable all samples in the draw. */
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a
    *   3DSTATE_VS command that causes the VS Function Enable to
    *   toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *   command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
   /* Tessellation, streamout and geometry stages are all disabled
    * (emitted with default/zero fields).
    */
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      /* Blorp vertices are already in screen space. */
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      /* Make sure mesh/task pipelines are disabled for the legacy draw. */
      blorp_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
      blorp_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
#endif
   }
}
1154 
1155 /******** This is the end of the pipeline setup code ********/
1156 
1157 static void
blorp_emit_memcpy(struct blorp_batch * batch,struct blorp_address dst,struct blorp_address src,uint32_t size)1158 blorp_emit_memcpy(struct blorp_batch *batch,
1159                   struct blorp_address dst,
1160                   struct blorp_address src,
1161                   uint32_t size)
1162 {
1163    assert(size % 4 == 0);
1164 
1165    for (unsigned dw = 0; dw < size; dw += 4) {
1166       blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
1167          cp.DestinationMemoryAddress = dst;
1168          cp.SourceMemoryAddress = src;
1169       }
1170       dst.offset += 4;
1171       src.offset += 4;
1172    }
1173 }
1174 
/* Fill a RENDER_SURFACE_STATE for a blorp source or destination surface and
 * record the relocations for its base, aux and clear-color addresses.
 *
 * @param aux_op      the aux operation being performed; a fast-clear changes
 *                    which clear-color address is programmed (see below)
 * @param state       CPU-mapped surface-state memory to fill
 * @param state_offset  offset of that state for relocation bookkeeping
 * @param is_render_target  true for the draw's render target, false for a
 *                    texture source
 */
static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         uint8_t color_write_disable,
                         bool is_render_target)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   /* A 1D surface laid out like a 2D one can be described as a 1-row 2D
    * surface instead.
    */
   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   const bool use_clear_address =
      GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);

   /* On gfx12 (and optionally on gfx11), hardware will read and write to the
    * clear color address, converting the raw clear color channels to a pixel
    * during a fast-clear. To avoid the restrictions associated with the
    * hardware feature, we instead write a software-converted pixel ourselves.
    * If we're performing a fast-clear, provide a substitute address to avoid
    * a collision with hardware. Outside of gfx11 and gfx12, indirect clear
    * color BOs are not used during fast-clears.
    */
   const struct blorp_address op_clear_addr =
      aux_op == ISL_AUX_OP_FAST_CLEAR ? blorp_get_workaround_address(batch) :
                                        surface->clear_color_addr;

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .clear_address = !use_clear_address ? 0 :
                          blorp_get_surface_address(batch, op_clear_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .use_clear_address = use_clear_address);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information.  This should be ok, however, because
       * surface buffer addresses are always 4K page alinged.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 10
      /* Clear-color state must be 64-byte aligned. */
      assert((surface->clear_color_addr.offset & 0x3f) == 0);
      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
      blorp_surface_reloc(batch, state_offset +
                          isl_dev->ss.clear_color_state_offset,
                          op_clear_addr, *clear_addr);
#else
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything.  We can avoid the MI memcpy on that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1269 
/* Pack a SURFTYPE_NULL RENDER_SURFACE_STATE matching the dimensions of the
 * given surface.  Used as the render-target slot when blorp draws only to
 * depth/stencil.
 */
static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct blorp_surface_info *surface,
                              uint32_t *state)
{
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      /* Width/Height/Depth fields are programmed minus one. */
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
      /* ffs(n) - 1 == log2(n) for power-of-two sample counts */
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
      .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),

      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,

#if GFX_VERx10 >= 125
      .TileMode = TILE4,
#else
      .TileMode = YMAJOR,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}
1300 
/* Allocate and fill the pixel-shader binding table: slot
 * BLORP_RENDERBUFFER_BT_INDEX holds the destination (or a null surface when
 * only depth/stencil is drawn) and slot BLORP_TEXTURE_BT_INDEX holds the
 * source, if any.
 *
 * Returns the binding-table offset, or 0 if allocation failed.
 */
static uint32_t
blorp_setup_binding_table(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t surface_offsets[2], bind_offset = 0;
   void *surface_maps[2];

   if (params->use_pre_baked_binding_table) {
      /* The driver provided a ready-made table; nothing to emit. */
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      if (!blorp_alloc_binding_table(batch, num_surfaces,
                                     isl_dev->ss.size, isl_dev->ss.align,
                                     &bind_offset, surface_offsets, surface_maps))
         return 0;

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
      } else {
         /* No color destination: use a null surface sized like whichever of
          * depth/stencil is enabled.
          */
         assert(params->depth.enabled || params->stencil.enabled);
         const struct blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  0, false);
      }
   }

   return bind_offset;
}
1343 
/* Emit binding-table pointers for all stages.  Only the PS table is real;
 * the geometry stages get default (zero) pointers since blorp never binds
 * surfaces there.
 */
static void
blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
{
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
}
1357 
/* Emit the 3DSTATE_DEPTH_BUFFER / HIER_DEPTH / STENCIL packet group via
 * isl, wiring up relocations for whichever of depth/HiZ/stencil surfaces
 * the params enable, followed by a workaround PIPE_CONTROL where required.
 */
static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct intel_device_info *devinfo =
      batch->blorp->compiler->brw->devinfo;

   /* Reserve batch space up front; isl packs directly into it. */
   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   /* The view/MOCS come from depth if enabled, else stencil, else a
    * default-MOCS null configuration.
    */
   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   } else {
      info.mocs = isl_mocs(isl_dev, 0, false);
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);

   if (intel_needs_workaround(devinfo, 1408224581) ||
       intel_needs_workaround(devinfo, 14014097488) ||
       intel_needs_workaround(devinfo, 14016712196)) {
      /* Wa_1408224581
       *
       * Workaround: Gfx12LP Astep only An additional pipe control with
       * post-sync = store dword operation would be required.( w/a is to
       * have an additional pipe control after the stencil state whenever
       * the surface state bits of this state is changing).
       *
       * This also seems sufficient to handle Wa_14014097488 and
       * Wa_14016712196.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = blorp_get_workaround_address(batch);
      }
   }
}
1435 
/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 */
1440 static void
blorp_emit_gfx8_hiz_op(struct blorp_batch * batch,const struct blorp_params * params)1441 blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
1442                        const struct blorp_params *params)
1443 {
1444    /* We should be performing an operation on a depth or stencil buffer.
1445     */
1446    assert(params->depth.enabled || params->stencil.enabled);
1447 
1448    blorp_measure_start(batch, params);
1449 
1450    /* The stencil buffer should only be enabled if a fast clear operation is
1451     * requested.
1452     */
1453    if (params->stencil.enabled)
1454       assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);
1455 
1456    /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
1457     *
1458     * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
1459     * the Number of Multisamples. This packet must not be used to change
1460     * Number of Multisamples in a rendering sequence.
1461     *
1462     * Since HIZ may be the first thing in a batch buffer, play safe and always
1463     * emit 3DSTATE_MULTISAMPLE.
1464     */
1465    blorp_emit_3dstate_multisample(batch, params);
1466 
1467    /* From the BDW PRM Volume 7, Depth Buffer Clear:
1468     *
1469     *    The clear value must be between the min and max depth values
1470     *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
1471     *    D32_FLOAT, then +/-DENORM values are also allowed.
1472     *
1473     * Set the bounds to match our hardware limits.
1474     */
1475    if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR)
1476       blorp_emit_cc_viewport(batch);
1477 
1478    /* Make sure to disable fragment shader, a previous draw might have enabled
1479     * a SIMD32 shader and we could be dispatching threads here with MSAA 16x
1480     * which does not support SIMD32.
1481     *
1482     * dEQP-VK.pipeline.monolithic.multisample.misc.clear_attachments.
1483     * r8g8b8a8_unorm_r16g16b16a16_sfloat_r32g32b32a32_uint_d16_unorm.
1484     * 16x.ds_resolve_sample_zero.sub_framebuffer
1485     * exercises this case.
1486     */
1487    blorp_emit(batch, GENX(3DSTATE_PS), ps);
1488    blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx);
1489 
1490    /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
1491     * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
1492     * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
1493     * disabled for HiZ ops and it appears that force-enabling it can lead to
1494     * GPU hangs on at least Skylake.  Since we don't know the current state of
1495     * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
1496     */
1497    blorp_emit(batch, GENX(3DSTATE_WM), wm);
1498 
1499    /* If we can't alter the depth stencil config and multiple layers are
1500     * involved, the HiZ op will fail. This is because the op requires that a
1501     * new config is emitted for each additional layer.
1502     */
1503    if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
1504       assert(params->num_layers <= 1);
1505    } else {
1506       blorp_emit_depth_stencil_config(batch, params);
1507    }
1508 
1509    /* TODO - If we ever start using 3DSTATE_WM_HZ_OP::StencilBufferResolveEnable
1510     * we need to implement required steps, flushes documented in Wa_1605967699.
1511     */
1512    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
1513       switch (params->hiz_op) {
1514       case ISL_AUX_OP_FAST_CLEAR:
1515          hzp.StencilBufferClearEnable = params->stencil.enabled;
1516          hzp.DepthBufferClearEnable = params->depth.enabled;
1517          hzp.StencilClearValue = params->stencil_ref;
1518          hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
1519 #if GFX_VER >= 20
1520          hzp.DepthClearValue = params->depth.clear_color.f32[0];
1521 
1522          /* From the Xe2 Bspec 56437 (r61349):
1523           *
1524           *    The Depth Clear value cannot be a NAN (Not-A-Number) if the
1525           *    depth format is Float32.
1526           *
1527           * We're not required to support NaN in APIs, so flush to zero.
1528           */
1529          if (util_is_nan(hzp.DepthClearValue))
1530             hzp.DepthClearValue = 0;
1531 #endif
1532          break;
1533       case ISL_AUX_OP_FULL_RESOLVE:
1534          assert(params->full_surface_hiz_op);
1535          hzp.DepthBufferResolveEnable = true;
1536          break;
1537       case ISL_AUX_OP_AMBIGUATE:
1538          assert(params->full_surface_hiz_op);
1539          hzp.HierarchicalDepthBufferResolveEnable = true;
1540          break;
1541       case ISL_AUX_OP_PARTIAL_RESOLVE:
1542       case ISL_AUX_OP_NONE:
1543          unreachable("Invalid HIZ op");
1544       }
1545 
1546       hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
1547       hzp.SampleMask = 0xFFFF;
1548 
1549       /* Due to a hardware issue, this bit MBZ */
1550       assert(hzp.ScissorRectangleEnable == false);
1551 
1552       /* Contrary to the HW docs both fields are inclusive */
1553       hzp.ClearRectangleXMin = params->x0;
1554       hzp.ClearRectangleYMin = params->y0;
1555 
1556       /* Contrary to the HW docs both fields are exclusive */
1557       hzp.ClearRectangleXMax = params->x1;
1558       hzp.ClearRectangleYMax = params->y1;
1559    }
1560 
   /* Emit a PIPE_CONTROL with all bits clear except for “Post-Sync
    * Operation”, which must be set to “Write Immediate Data” enabled.
    */
1564    blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
1565       pc.PostSyncOperation = WriteImmediateData;
1566       pc.Address = blorp_get_workaround_address(batch);
1567    }
1568 
1569    blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);
1570 
1571    blorp_measure_end(batch, params);
1572 }
1573 
1574 static bool
blorp_uses_bti_rt_writes(const struct blorp_batch * batch,const struct blorp_params * params)1575 blorp_uses_bti_rt_writes(const struct blorp_batch *batch, const struct blorp_params *params)
1576 {
1577    if (batch->flags & (BLORP_BATCH_USE_BLITTER | BLORP_BATCH_USE_COMPUTE))
1578       return false;
1579 
1580    /* HIZ clears use WM_HZ ops rather than a clear shader using RT writes. */
1581    return params->hiz_op == ISL_AUX_OP_NONE;
1582 }
1583 
/* Execute a blorp operation on the 3D (render) pipeline.  HiZ operations
 * are routed to the dedicated WM_HZ_OP path; everything else is drawn as a
 * rectangle-list primitive with one instance per destination layer.
 */
static void
blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
{
   /* HiZ ops use 3DSTATE_WM_HZ_OP instead of a regular draw. */
   if (params->hiz_op != ISL_AUX_OP_NONE) {
      blorp_emit_gfx8_hiz_op(batch, params);
      return;
   }

   blorp_emit_vertex_buffers(batch, params);
   blorp_emit_vertex_elements(batch, params);

   blorp_emit_pipeline(batch, params);

   blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));

   /* The driver may have already set up depth/stencil and asked us not to
    * touch it (BLORP_BATCH_NO_EMIT_DEPTH_STENCIL).
    */
   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
      blorp_emit_depth_stencil_config(batch, params);

   const UNUSED bool use_tbimr = false;
   blorp_emit_pre_draw(batch, params);
   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = use_tbimr;
#endif
      /* A RECTLIST takes 3 vertices; one instance per destination layer. */
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
   blorp_emit_post_draw(batch, params);
}
1616 
/* Allocate and fill the push-constant buffer for a blorp compute dispatch.
 *
 * Writes the state offset and size through state_offset/state_size; both
 * are set to 0 when there is nothing to push or allocation fails.  The
 * buffer layout is the cross-thread data first, then (pre-Gfx12.5 only)
 * one per-thread copy per HW thread with the trailing dword replaced by
 * the thread's subgroup ID.
 */
static void
blorp_get_compute_push_const(struct blorp_batch *batch,
                             const struct blorp_params *params,
                             uint32_t threads,
                             uint32_t *state_offset,
                             unsigned *state_size)
{
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const unsigned push_const_size =
      ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
   /* All push data comes from wm_inputs; the compiler-reported sizes must
    * cover exactly that struct.
    */
   assert(cs_prog_data->push.cross_thread.size +
          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

   if (push_const_size == 0) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }

   /* Gfx12.5+ reads indirect data from general state; older gens use
    * dynamic state (MEDIA_CURBE_LOAD).
    */
   uint32_t push_const_offset;
   uint32_t *push_const =
      GFX_VERx10 >= 125 ?
      blorp_alloc_general_state(batch, push_const_size, 64,
                                &push_const_offset) :
      blorp_alloc_dynamic_state(batch, push_const_size, 64,
                                &push_const_offset);
   if (push_const == NULL) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }
   memset(push_const, 0x0, push_const_size);

   /* NOTE: void-pointer arithmetic below is a GNU C extension (byte
    * granularity), used throughout Mesa.
    */
   void *dst = push_const;
   const void *src = (char *)&params->wm_inputs;

   if (cs_prog_data->push.cross_thread.size > 0) {
      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
      dst += cs_prog_data->push.cross_thread.size;
      src += cs_prog_data->push.cross_thread.size;
   }

   assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
#if GFX_VERx10 < 125
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < threads; t++) {
         /* Copy everything but the last dword, which holds the
          * per-thread subgroup ID.
          */
         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);

         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
         *subgroup_id = t;

         dst += cs_prog_data->push.per_thread.size;
      }
   }
#endif

   *state_offset = push_const_offset;
   *state_size = push_const_size;
}
1676 
/* Execute a blorp operation on the compute pipeline.
 *
 * Dispatches one workgroup per local_size tile of the destination
 * rectangle and one Z group per destination layer.  On Gfx12.5+ this is a
 * single COMPUTE_WALKER; on older gens it is the MEDIA_VFE_STATE /
 * MEDIA_CURBE_LOAD / MEDIA_INTERFACE_DESCRIPTOR_LOAD / GPGPU_WALKER
 * sequence.  Predication and HiZ ops are not supported on this path.
 */
static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   blorp_measure_start(batch, params);

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   /* Workgroup grid covering [x0,x1)x[y0,y1) and the destination layers. */
   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
   uint32_t group_z0 = params->dst.z_offset;
   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
   assert(params->num_layers >= 1);
   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
   assert(cs_prog_data->local_size[2] == 1);

#if GFX_VERx10 >= 125
   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);
   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize                       = dispatch.simd_size / 16,
      .MessageSIMD                    = dispatch.simd_size / 16,
      .LocalXMaximum                  = cs_prog_data->local_size[0] - 1,
      .LocalYMaximum                  = cs_prog_data->local_size[1] - 1,
      .LocalZMaximum                  = cs_prog_data->local_size[2] - 1,
      .ThreadGroupIDStartingX         = group_x0,
      .ThreadGroupIDStartingY         = group_y0,
      .ThreadGroupIDStartingZ         = group_z0,
      .ThreadGroupIDXDimension        = group_x1,
      .ThreadGroupIDYDimension        = group_y1,
      .ThreadGroupIDZDimension        = group_z1,
      .ExecutionMask                  = dispatch.right_mask,
      .PostSync.MOCS                  = isl_mocs(batch->blorp->isl_dev, 0, false),

      .IndirectDataStartAddress       = push_const_offset,
      .IndirectDataLength             = push_const_size,

#if GFX_VERx10 >= 125
      .GenerateLocalID                = cs_prog_data->generate_local_id != 0,
      .EmitLocal                      = cs_prog_data->generate_local_id,
      .WalkOrder                      = cs_prog_data->walk_order,
      .TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                    TileY32bpe : Linear,
#endif

      .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = params->cs_prog_kernel,
         .SamplerStatePointer = samplers_offset,
         .SamplerCount = params->src.enabled ? 1 : 0,
         .BindingTableEntryCount = params->src.enabled ? 2 : 1,
         .BindingTablePointer = surfaces_offset,
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
            intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
         .PreferredSLMAllocationSize =
            intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                         prog_data->total_shared,
                                                         dispatch.group_size,
                                                         dispatch.simd_size),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
#if GFX_VER >= 30
         .RegistersPerThread = ptl_register_blocks(prog_data->grf_used),
#endif
      },
   };

   /* COMPUTE_WALKER has no per-thread CURBE; per-thread push must be 0. */
   assert(cs_prog_data->push.per_thread.regs == 0);
   blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
      cw.body = body;
   }

#else

   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
    *
    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *  the only bits that are changed are scoreboard related: Scoreboard
    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
   }

   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
      assert(prog_data->total_scratch == 0);
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
      vfe.URBEntryAllocationSize = 2;

      /* CURBE holds cross-thread constants plus one per-thread copy per
       * HW thread, rounded up to an even register count.
       */
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);

   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
      curbe.CURBETotalDataLength = push_const_size;
      curbe.CURBEDataStartAddress = push_const_offset;
   }

   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = params->cs_prog_kernel,
      .SamplerStatePointer = samplers_offset,
      .SamplerCount = params->src.enabled ? 1 : 0,
      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
   };

   /* The interface descriptor lives in dynamic state and is referenced by
    * offset from MEDIA_INTERFACE_DESCRIPTOR_LOAD.
    */
   uint32_t idd_offset;
   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
   if (state == NULL)
      return;
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);

   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
      mid.InterfaceDescriptorTotalLength        = size;
      mid.InterfaceDescriptorDataStartAddress   = idd_offset;
   }

   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDStartingX       = group_x0;
      ggw.ThreadGroupIDStartingY       = group_y0;
      ggw.ThreadGroupIDStartingResumeZ = group_z0;
      ggw.ThreadGroupIDXDimension      = group_x1;
      ggw.ThreadGroupIDYDimension      = group_y1;
      ggw.ThreadGroupIDZDimension      = group_z1;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

#endif

   blorp_measure_end(batch, params);
}
1857 
1858 /* -----------------------------------------------------------------------
1859  * -- BLORP on blitter
1860  * -----------------------------------------------------------------------
1861  */
1862 
1863 #include "isl/isl_genX_helpers.h"
1864 
1865 #if GFX_VER >= 12
/* Translate an ISL tiling into the blitter's XY_BLOCK_COPY_BLT tiling
 * encoding.  The supported set differs per generation: Gfx12.5+ exposes
 * X/Tile4/Tile64, earlier Gfx12 exposes legacy Y tiling.
 */
static uint32_t
xy_bcb_tiling(const struct isl_surf *surf)
{
   switch (surf->tiling) {
   case ISL_TILING_LINEAR:
      return XY_TILE_LINEAR;
#if GFX_VERx10 >= 125
   case ISL_TILING_X:
      return XY_TILE_X;
   case ISL_TILING_4:
      return XY_TILE_4;
   case ISL_TILING_64:
   case ISL_TILING_64_XE2:
      return XY_TILE_64;
#else
   case ISL_TILING_Y0:
      return XY_TILE_Y;
#endif
   default:
      unreachable("Invalid tiling for XY_BLOCK_COPY_BLT");
   }
}
1888 
1889 static uint32_t
xy_color_depth(const struct isl_format_layout * fmtl)1890 xy_color_depth(const struct isl_format_layout *fmtl)
1891 {
1892    switch (fmtl->bpb) {
1893    case 128: return XY_BPP_128_BIT;
1894    case  96: return XY_BPP_96_BIT;
1895    case  64: return XY_BPP_64_BIT;
1896    case  32: return XY_BPP_32_BIT;
1897    case  16: return XY_BPP_16_BIT;
1898    case   8: return XY_BPP_8_BIT;
1899    default:
1900       unreachable("Invalid bpp");
1901    }
1902 }
1903 #endif
1904 
1905 #if GFX_VERx10 >= 125
/* Translate an ISL surface dimensionality into the blitter's surface-type
 * encoding (Gfx12.5+ only).
 */
static uint32_t
xy_bcb_surf_dim(const struct isl_surf *surf)
{
   switch (surf->dim) {
   case ISL_SURF_DIM_1D:
      /* An undocumented assertion in simulation is that 1D surfaces must use
       * LINEAR tiling. But that doesn't work, so instead consider 1D tiled
       * surfaces as 2D with a Height=1.
       */
      return surf->tiling != ISL_TILING_LINEAR ? XY_SURFTYPE_2D: XY_SURFTYPE_1D;
   case ISL_SURF_DIM_2D:
      return XY_SURFTYPE_2D;
   case ISL_SURF_DIM_3D:
      return XY_SURFTYPE_3D;
   default:
      unreachable("Invalid dimensionality for XY_BLOCK_COPY_BLT");
   }
}
1924 
1925 static uint32_t
xy_bcb_surf_depth(const struct isl_surf * surf)1926 xy_bcb_surf_depth(const struct isl_surf *surf)
1927 {
1928    return surf->dim == ISL_SURF_DIM_3D ? surf->logical_level0_px.depth
1929                                        : surf->logical_level0_px.array_len;
1930 }
1931 
1932 #if GFX_VER < 20
/* Translate a surface's aux usage into the blitter's auxiliary-surface-mode
 * encoding.  Only CCS_E-style compression (or none) is representable; any
 * other aux usage is a caller bug.
 */
static uint32_t
xy_aux_mode(const struct blorp_surface_info *info)
{
   switch (info->aux_usage) {
   case ISL_AUX_USAGE_CCS_E:
   case ISL_AUX_USAGE_FCV_CCS_E:
   case ISL_AUX_USAGE_STC_CCS:
      return XY_CCS_E;
   case ISL_AUX_USAGE_NONE:
      return XY_NONE;
   default:
      unreachable("Unsupported aux mode");
   }
}
1947 #endif // GFX_VER < 20
1948 #endif // GFX_VERx10 >= 125
1949 
/* Copy a rectangle between two surfaces using the blitter's
 * XY_BLOCK_COPY_BLT command.  Only 1:1 (unscaled), single-layer,
 * single-level, single-sample copies are supported on this path.
 */
UNUSED static void
blorp_xy_block_copy_blt(struct blorp_batch *batch,
                        const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(params->src.view.levels == 1);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   /* Recover the source rectangle from the destination rectangle and the
    * coordinate-transform offsets that were computed for the shader path.
    */
   unsigned dst_x0 = params->x0;
   unsigned dst_x1 = params->x1;
   unsigned src_x0 =
      dst_x0 - params->wm_inputs.coord_transform[0].offset;
   ASSERTED unsigned src_x1 =
      dst_x1 - params->wm_inputs.coord_transform[0].offset;
   unsigned dst_y0 = params->y0;
   unsigned dst_y1 = params->y1;
   unsigned src_y0 =
      dst_y0 - params->wm_inputs.coord_transform[1].offset;
   ASSERTED unsigned src_y1 =
      dst_y1 - params->wm_inputs.coord_transform[1].offset;

   /* The blitter cannot scale: source and destination must match in size. */
   assert(src_x1 - src_x0 == dst_x1 - dst_x0);
   assert(src_y1 - src_y0 == dst_y1 - dst_y0);

   const struct isl_surf *src_surf = &params->src.surf;
   const struct isl_surf *dst_surf = &params->dst.surf;

   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   /* 96bpp is only handled for linear surfaces. */
   if (fmtl->bpb == 96) {
      assert(src_surf->tiling == ISL_TILING_LINEAR &&
             dst_surf->tiling == ISL_TILING_LINEAR);
   }

   assert(src_surf->samples == 1);
   assert(dst_surf->samples == 1);

   /* Pitch is programmed in bytes for linear surfaces and in 4-byte units
    * for tiled ones.
    */
   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
   unsigned src_pitch_unit = src_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d src_align = isl_get_image_alignment(src_surf);
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

   blorp_emit(batch, GENX(XY_BLOCK_COPY_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationMOCS = params->dst.addr.mocs;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = dst_x0;
      blt.DestinationY1 = dst_y0;
      blt.DestinationX2 = dst_x1;
      blt.DestinationY2 = dst_y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
      }
#endif

      blt.SourceX1 = src_x0;
      blt.SourceY1 = src_y0;
      blt.SourcePitch = (src_surf->row_pitch_B / src_pitch_unit) - 1;
      blt.SourceMOCS = params->src.addr.mocs;
      blt.SourceTiling = xy_bcb_tiling(src_surf);
      blt.SourceBaseAddress = params->src.addr;
      blt.SourceXOffset = params->src.tile_x_sa;
      blt.SourceYOffset = params->src.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.SourceSurfaceType = xy_bcb_surf_dim(src_surf);
      blt.SourceSurfaceWidth = src_surf->logical_level0_px.w - 1;
      blt.SourceSurfaceHeight = src_surf->logical_level0_px.h - 1;
      blt.SourceSurfaceDepth = xy_bcb_surf_depth(src_surf) - 1;
      blt.SourceArrayIndex =
         params->src.view.base_array_layer + params->src.z_offset;
      blt.SourceSurfaceQPitch = isl_get_qpitch(src_surf) >> 2;
      blt.SourceLOD = params->src.view.base_level;
      blt.SourceMipTailStartLOD = src_surf->miptail_start_level;
      blt.SourceHorizontalAlign = isl_encode_halign(src_align.width);
      blt.SourceVerticalAlign = isl_encode_valign(src_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.SourceDepthStencilResource =
         params->src.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.SourceTargetMemory =
         params->src.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->src.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.SourceAuxiliarySurfaceMode = xy_aux_mode(&params->src);
         blt.SourceCompressionEnable = true;
#endif
         blt.SourceCompressionFormat =
            isl_get_render_compression_format(src_surf->format);
         blt.SourceClearValueEnable = !!params->src.clear_color_addr.buffer;
         blt.SourceClearAddress = params->src.clear_color_addr;
      }
#endif
   }
#endif
}
2099 
/* Fill a rectangle of the destination surface with a solid color using the
 * blitter's XY_FAST_COLOR_BLT command.  Only single-layer, single-level,
 * single-sample destinations are supported.
 */
UNUSED static void
blorp_xy_fast_color_blit(struct blorp_batch *batch,
                         const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct isl_surf *dst_surf = &params->dst.surf;
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(dst_surf->samples == 1);
   /* 96bpp fills are only handled for linear destinations. */
   assert(fmtl->bpb != 96 || dst_surf->tiling == ISL_TILING_LINEAR);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   /* Pitch is in bytes for linear surfaces and in 4-byte units otherwise. */
   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

#if INTEL_NEEDS_WA_16021021469
   assert(fmtl->bpb != 96);
#endif

   blorp_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = params->x0;
      blt.DestinationY1 = params->y0;
      blt.DestinationX2 = params->x1;
      blt.DestinationY2 = params->y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

      /* Pack the clear color in the destination view's format directly
       * into the command's fill-color field.
       */
      isl_color_value_pack((union isl_color_value *)
                           params->wm_inputs.clear_color,
                           params->dst.view.format, blt.FillColor);

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
      /* XY_FAST_COLOR_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VERx10 == 125
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
      }

      blt.DestinationMOCS = params->dst.addr.mocs;
#endif
   }
#endif
}
2187 
2188 static void
blorp_exec_blitter(struct blorp_batch * batch,const struct blorp_params * params)2189 blorp_exec_blitter(struct blorp_batch *batch,
2190                    const struct blorp_params *params)
2191 {
2192    blorp_measure_start(batch, params);
2193 
2194    if (params->src.enabled)
2195       blorp_xy_block_copy_blt(batch, params);
2196    else
2197       blorp_xy_fast_color_blit(batch, params);
2198 
2199    blorp_measure_end(batch, params);
2200 }
2201 
2202 /**
2203  * \brief Execute a blit or render pass operation.
2204  *
2205  * To execute the operation, this function manually constructs and emits a
2206  * batch to draw a rectangle primitive. The batchbuffer is flushed before
2207  * constructing and after emitting the batch.
2208  *
2209  * This function alters no GL state.
2210  */
2211 static void
blorp_exec(struct blorp_batch * batch,const struct blorp_params * params)2212 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
2213 {
2214    if (batch->flags & BLORP_BATCH_USE_BLITTER) {
2215       blorp_exec_blitter(batch, params);
2216    } else if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
2217       blorp_exec_compute(batch, params);
2218    } else {
2219       blorp_exec_3d(batch, params);
2220    }
2221 }
2222 
/* Pre-upload the dynamic state objects blorp reuses across operations:
 * a blend state (with one entry per possible render target), a CC
 * viewport, a color-calc state, and a nearest/linear clamp sampler.
 */
static void
blorp_init_dynamic_states(struct blorp_context *context)
{
   {
      struct GENX(BLEND_STATE) blend = { };

      /* Room for the BLEND_STATE header followed by 8 (MAX_RTS) entries. */
      uint32_t dws[GENX(BLEND_STATE_length) * 4 +
                   GENX(BLEND_STATE_ENTRY_length) * 4 * 8 /* MAX_RTS */];
      uint32_t *pos = dws;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < 8; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      context->upload_dynamic_state(context, dws, sizeof(dws), 64,
                                    BLORP_DYNAMIC_STATE_BLEND);
   }

   /* Depth bounds: the full float range when the driver opted into
    * unrestricted depth, otherwise the standard [0, 1].
    */
   blorp_context_upload_dynamic(context, GENX(CC_VIEWPORT), vp, 32,
                                BLORP_DYNAMIC_STATE_CC_VIEWPORT) {
      vp.MinimumDepth = context->config.use_unrestricted_depth_range ?
                        -FLT_MAX : 0.0;
      vp.MaximumDepth = context->config.use_unrestricted_depth_range ?
                        FLT_MAX : 1.0;
   }

   blorp_context_upload_dynamic(context, GENX(COLOR_CALC_STATE), cc, 64,
                                BLORP_DYNAMIC_STATE_COLOR_CALC) {
      /* Nothing */
   }

   /* Linear min/mag sampler, clamped in all three directions, with
    * mipmapping disabled and non-normalized coordinates.
    */
   blorp_context_upload_dynamic(context, GENX(SAMPLER_STATE), sampler, 32,
                                BLORP_DYNAMIC_STATE_SAMPLER) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }
}
2283 
2284 #endif /* BLORP_GENX_EXEC_BRW_H */
2285