/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "common/intel_l3_config.h"

/**
 * This file implements some lightweight memcpy/memset operations on the GPU
 * using a vertex buffer and streamout.
 */

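/*
 * The copy itself is driven entirely by fixed-function hardware: the source
 * buffer is bound as vertex buffer 32, all shader stages are disabled, and a
 * POINTLIST draw makes the VF fetch one "vertex" per block of data, which
 * the SOL unit then streams back out to the destination buffer.
 */
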
/**
 * Returns the greatest common divisor of a and b that is a power of two.
 */
static uint64_t
gcd_pow2_u64(uint64_t a, uint64_t b)
{
   assert(a > 0 || b > 0);

   unsigned a_log2 = ffsll(a) - 1;
   unsigned b_log2 = ffsll(b) - 1;

   /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX in which
    * case, the MIN2() will take the other one. If both are 0 then we will
    * hit the assert above.
    */
   return 1 << MIN2(a_log2, b_log2);
}

static void
emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device,
                      const struct intel_urb_config *urb_cfg_in,
                      struct intel_urb_config *urb_cfg_out,
                      const struct intel_l3_config *l3_config)
{
   anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
      vfi.InstancingEnable = false;
      vfi.VertexElementIndex = 0;
   }
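   /* No system-generated vertex values (VertexID/InstanceID); the only thing
    * the VF should produce is the element fetched from the source buffer.
    */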
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
#if GFX_VER >= 11
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   /* Disable all shader stages */
   anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
   anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
   anv_batch_emit(batch, GENX(3DSTATE_TE), te);
   anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
   anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
   anv_batch_emit(batch, GENX(3DSTATE_PS), ps);

#if GFX_VERx10 >= 125
   /* Disable Mesh; we can't have it and streamout enabled at the same
    * time.
    */
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
   }
#endif

#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout. */
   if (intel_needs_workaround(device->info, 16013994831))
      genX(batch_set_preemption)(batch, device->info, _3D, false);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      sbe.NumberofSFOutputAttributes = 1;
      sbe.VertexURBEntryReadLength = 1;
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }

   /* Emit URB setup. We tell it that the VS is active because we want it to
    * allocate space for the VS. Even though one isn't run, we need VUEs to
    * store the data that VF is going to pass to SOL.
    */
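   /* A single VUE here is just the 16B header plus at most 16B of copied
    * data, so the minimum 64B entry size is enough for every stage.
    */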
   const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
   memcpy(urb_cfg_out->size, &entry_size, sizeof(entry_size));

   genX(emit_urb_setup)(device, batch, l3_config,
                        VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, urb_cfg_out,
                        NULL);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
   }

   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }
}

static void
emit_so_memcpy(struct anv_batch *batch, struct anv_device *device,
               struct anv_address dst, struct anv_address src,
               uint32_t size)
{
   /* The maximum copy block size is 4 32-bit components at a time. */
   assert(size % 4 == 0);
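   /* The copy block size is the largest power of two that divides both 16
    * and size, so each vertex moves 4, 8, or 16 bytes.
    */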
   unsigned bs = gcd_pow2_u64(16, size);

   enum isl_format format;
   switch (bs) {
   case 4:  format = ISL_FORMAT_R32_UINT;          break;
   case 8:  format = ISL_FORMAT_R32G32_UINT;       break;
   case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break;
   default:
      unreachable("Invalid size");
   }

   uint32_t *dw;
   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
   GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_BUFFER_STATE)) {
         .VertexBufferIndex = 32, /* Reserved for this */
         .AddressModifyEnable = true,
         .BufferStartingAddress = src,
         .BufferPitch = bs,
         .MOCS = anv_mocs(device, src.bo, 0),
#if GFX_VER >= 12
         .L3BypassDisable = true,
#endif
         .BufferSize = size,
      });

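   /* A single vertex element fetches one bs-byte block; components beyond
    * the block size are stored as zero and masked off again in the SO_DECL
    * below.
    */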
   dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
   GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 32,
         .Valid = true,
         .SourceElementFormat = format,
         .SourceElementOffset = 0,
         .Component0Control = (bs >= 4)  ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component1Control = (bs >= 8)  ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
      });

   /* Wa_16011411144:
    *
    * SW must insert a PIPE_CONTROL cmd before and after the
    * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
    * state is not combined with other state changes.
    */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);

   anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) {
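      /* On Gfx12+, the SO buffer is selected by the command sub-opcode
       * instead of an index field in the packet body.
       */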
#if GFX_VER < 12
      sob.SOBufferIndex = 0;
#else
      sob._3DCommandOpcode = 0;
      sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD;
#endif
      sob.MOCS = anv_mocs(device, dst.bo, ISL_SURF_USAGE_STREAM_OUT_BIT);
      sob.SurfaceBaseAddress = dst;

      sob.SOBufferEnable = true;
      sob.SurfaceSize = size / 4 - 1;

      /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with
       * the end position of the stream. We need to reset this value to 0 at
       * the beginning of the run or else SOL will start at the offset from
       * the previous draw.
       */
      sob.StreamOffsetWriteEnable = true;
      sob.StreamOffset = 0;
   }

   /* Wa_16011411144: also CS_STALL after touching the SO_BUFFER state. */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);

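   /* A single SO_DECL entry writes bs / 4 components per vertex to SO
    * buffer 0.
    */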
   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST),
                        .StreamtoBufferSelects0 = (1 << 0),
                        .NumEntries0 = 1);
   GENX(SO_DECL_ENTRY_pack)(batch, dw + 3,
      &(struct GENX(SO_DECL_ENTRY)) {
         .Stream0Decl = {
            .OutputBufferSlot = 0,
            .RegisterIndex = 0,
            .ComponentMask = (1 << (bs / 4)) - 1,
         },
      });

#if GFX_VERx10 == 125
   /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
   genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
#endif

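   /* Enable SOL and discard the primitives after streamout (RenderingDisable)
    * so nothing reaches the rasterizer.
    */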
   anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {
      so.SOFunctionEnable = true;
      so.RenderingDisable = true;
      so.Stream0VertexReadOffset = 0;
      so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64);
      so.Buffer0SurfacePitch = bs;
   }

   genX(emit_breakpoint)(batch, device, true);
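   /* One point per bs-byte block: the VF fetches size / bs vertices from src
    * and SOL writes each one back out to dst.
    */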
   anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.VertexCountPerInstance = size / bs;
      prim.StartVertexLocation = 0;
      prim.InstanceCount = 1;
      prim.StartInstanceLocation = 0;
      prim.BaseVertexLocation = 0;
   }

   genX(batch_emit_post_3dprimitive_was)(batch,
                                         device,
                                         _3DPRIM_POINTLIST, size / bs);

   genX(emit_breakpoint)(batch, device, false);
}

void
genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
                          struct anv_device *device,
                          struct anv_batch *batch)
{
   memset(state, 0, sizeof(*state));

   state->batch = batch;
   state->device = device;

   const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
   genX(emit_l3_config)(batch, device, cfg);
   genX(emit_pipeline_select)(batch, _3D, device);

   struct intel_urb_config urb_cfg_in = { 0 };
   struct intel_urb_config urb_cfg = { 0 };

   emit_common_so_memcpy(batch, device, &urb_cfg_in, &urb_cfg, cfg);
}

void
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
{
   genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                                 NULL);
}

void
genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
{
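   /* Wa_16013994831: re-enable preemption now that streamout is done. */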
   if (intel_needs_workaround(state->device->info, 16013994831))
      genX(batch_set_preemption)(state->batch, state->device->info, _3D, true);

   anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);

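   /* Round the batch up to an even number of dwords. */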
   if ((state->batch->next - state->batch->start) & 4)
      anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
}

void
genX(emit_so_memcpy)(struct anv_memcpy_state *state,
                     struct anv_address dst, struct anv_address src,
                     uint32_t size)
{
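   /* Gfx9: if the new source range could alias something still in the VF
    * cache, stall and invalidate it before reusing VB slot 32.
    */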
   if (GFX_VER == 9 &&
       anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
                                                  &state->vb_dirty,
                                                  src, size)) {
      genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                    ANV_PIPE_CS_STALL_BIT |
                                    ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                                    NULL);
      memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
   }

   emit_so_memcpy(state->batch, state->device, dst, src, size);
}

void
genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address dst, struct anv_address src,
                           uint32_t size)
{
   if (size == 0)
      return;

   if (!cmd_buffer->state.current_l3_config) {
      const struct intel_l3_config *cfg =
         intel_get_default_l3_config(cmd_buffer->device->info);
      genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
   }

#if GFX_VER == 9
   genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size);
#endif

   /* Wa_14015814527 */
   genX(apply_task_urb_workaround)(cmd_buffer);

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   struct intel_urb_config urb_cfg;

   emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device,
                         &cmd_buffer->state.gfx.urb_cfg,
                         &urb_cfg,
                         cmd_buffer->state.current_l3_config);
   emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size);

#if GFX_VER == 9
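   /* Mark VB slot 32 (the one the memcpy uses) as dirty for the Gfx8/9
    * vertex buffer cache tracking.
    */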
   genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
                                                       1ull << 32);
#endif

   /* Update the URB config after the memcpy. */
   memcpy(&cmd_buffer->state.gfx.urb_cfg, &urb_cfg,
          sizeof(struct intel_urb_config));

   /* Flag all the state instructions emitted by the memcpy as dirty. */
   struct anv_gfx_dynamic_state *hw_state =
      &cmd_buffer->state.gfx.dyn_state;

   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
#if GFX_VER >= 11
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
#endif
#if GFX_VER >= 12
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
   BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
   if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
   }

   cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE |
                                    ANV_CMD_DIRTY_INDEX_BUFFER);
}