/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "common/intel_l3_config.h"

/**
 * This file implements some lightweight memcpy/memset operations on the GPU
 * using a vertex buffer and streamout.
 */
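
/*
 * Typical usage (a sketch, mirroring genX(cmd_buffer_so_memcpy) at the
 * bottom of this file; standalone batches use genX(emit_so_memcpy_end)
 * instead of _fini to terminate the batch):
 *
 *    struct anv_memcpy_state state;
 *    genX(emit_so_memcpy_init)(&state, device, cmd_buffer, batch);
 *    genX(emit_so_memcpy)(&state, dst, src, size);
 *    genX(emit_so_memcpy_fini)(&state);
 */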

/**
 * Returns the greatest common divisor of a and b that is a power of two.
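 *
 * For example, gcd_pow2_u64(16, 24) == 8, gcd_pow2_u64(16, 20) == 4 and
 * gcd_pow2_u64(16, 32) == 16.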
 */
static uint64_t
gcd_pow2_u64(uint64_t a, uint64_t b)
{
   assert(a > 0 || b > 0);

   unsigned a_log2 = ffsll(a) - 1;
   unsigned b_log2 = ffsll(b) - 1;

   /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX, in which
    * case MIN2() will take the other one. If both are 0 then we will hit the
    * assert above.
    */
   return 1ull << MIN2(a_log2, b_log2);
}

static void
emit_common_so_memcpy(struct anv_memcpy_state *state,
                      const struct intel_urb_config *urb_cfg_in,
                      const struct intel_l3_config *l3_config)
{
   struct anv_batch *batch = state->batch;
   struct anv_device *device = state->device;

   if (state->cmd_buffer) {
      /* Wa_14015814527 */
      genX(apply_task_urb_workaround)(state->cmd_buffer);

      genX(cmd_buffer_apply_pipe_flushes)(state->cmd_buffer);

      genX(flush_pipeline_select_3d)(state->cmd_buffer);

#if GFX_VER == 9
      genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(
         state->cmd_buffer, SEQUENTIAL, 1ull << 32);
#endif
   }

   anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
      vfi.InstancingEnable = false;
      vfi.VertexElementIndex = 0;
   }
   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vfs);
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
#if GFX_VER >= 11
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   /* Disable all shader stages */
   anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
   anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
   anv_batch_emit(batch, GENX(3DSTATE_TE), te);
   anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
   anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
   anv_batch_emit(batch, GENX(3DSTATE_PS), ps);

#if GFX_VERx10 >= 125
   /* Disable Mesh; it can't be enabled at the same time as streamout. */
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
   }
#endif

#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout. */
   if (intel_needs_workaround(device->info, 16013994831))
      genX(batch_set_preemption)(batch, device, _3D, false);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      sbe.NumberofSFOutputAttributes = 1;
      sbe.VertexURBEntryReadLength = 1;
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }

   /* Emit URB setup. We tell it that the VS is active because we want it to
    * allocate space for the VS. Even though one isn't run, we need VUEs to
    * store the data that VF is going to pass to SOL.
    */
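   /* 32 bytes is all each VUE needs to hold here, and URB entry sizes are
    * expressed in 64-byte units, so DIV_ROUND_UP(32, 64) (i.e. one unit)
    * suffices.
    */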
   const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
   memcpy(state->urb_cfg.size, &entry_size, sizeof(entry_size));

   genX(emit_urb_setup)(device, batch, l3_config,
                        VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in,
                        &state->urb_cfg, NULL);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
   }

   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }
}

static void
emit_so_memcpy(struct anv_memcpy_state *state,
               struct anv_address dst, struct anv_address src,
               uint32_t size)
{
   struct anv_batch *batch = state->batch;
   struct anv_device *device = state->device;

   /* The maximum copy block size is four 32-bit components (16 bytes) at a
    * time.
    */
   assert(size % 4 == 0);
   unsigned bs = gcd_pow2_u64(16, size);
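   /* e.g. size == 20 yields bs == 4 (R32_UINT, 5 vertices) while size == 32
    * yields bs == 16 (R32G32B32A32_UINT, 2 vertices).
    */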

   enum isl_format format;
   switch (bs) {
   case 4:  format = ISL_FORMAT_R32_UINT;          break;
   case 8:  format = ISL_FORMAT_R32G32_UINT;       break;
   case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break;
   default:
      unreachable("Invalid size");
   }

   uint32_t *dw;
   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
   GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_BUFFER_STATE)) {
         .VertexBufferIndex = 32, /* Reserved for this */
         .AddressModifyEnable = true,
         .BufferStartingAddress = src,
         .BufferPitch = bs,
         .MOCS = anv_mocs(device, src.bo, 0),
#if GFX_VER >= 12
         .L3BypassDisable = true,
#endif
         .BufferSize = size,
      });

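   /* A single vertex element fetches one bs-byte block per vertex.  The
    * components beyond bs / 4 are filled with zeros, but they are never
    * written out: the SO_DECL ComponentMask below only covers bs / 4
    * components.
    */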
   dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
   GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 32,
         .Valid = true,
         .SourceElementFormat = format,
         .SourceElementOffset = 0,
         .Component0Control = (bs >= 4)  ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component1Control = (bs >= 8)  ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
      });

   /* Wa_16011411144:
    *
    * SW must insert a PIPE_CONTROL cmd before and after the
    * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
    * state is not combined with other state changes.
    */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D,
                                   ANV_PIPE_CS_STALL_BIT);

   anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
      sob.SOBufferIndex = 0;
#else
      sob._3DCommandOpcode = 0;
      sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD;
#endif
      sob.MOCS = anv_mocs(device, dst.bo, ISL_SURF_USAGE_STREAM_OUT_BIT);
      sob.SurfaceBaseAddress = dst;

      sob.SOBufferEnable = true;
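      /* SurfaceSize is expressed in DWords, minus one. */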
      sob.SurfaceSize = size / 4 - 1;

      /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with
       * the end position of the stream. We need to reset this value to 0 at
       * the beginning of the run or else SOL will start at the offset from
       * the previous draw.
       */
      sob.StreamOffsetWriteEnable = true;
      sob.StreamOffset = 0;
   }

   /* Wa_16011411144: also CS stall after touching the SO_BUFFER state */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D,
                                   ANV_PIPE_CS_STALL_BIT);

   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST),
                        .StreamtoBufferSelects0 = (1 << 0),
                        .NumEntries0 = 1);
   GENX(SO_DECL_ENTRY_pack)(batch, dw + 3,
      &(struct GENX(SO_DECL_ENTRY)) {
         .Stream0Decl = {
            .OutputBufferSlot = 0,
            .RegisterIndex = 0,
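            /* 0x1, 0x3 or 0xf for bs == 4, 8 or 16 respectively. */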
            .ComponentMask = (1 << (bs / 4)) - 1,
         },
      });

#if GFX_VERx10 == 125
   /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
   genx_batch_emit_pipe_control(batch, device->info, _3D,
                                ANV_PIPE_CS_STALL_BIT);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {
      so.SOFunctionEnable = true;
      so.RenderingDisable = true;
      so.Stream0VertexReadOffset = 0;
      so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64);
      so.Buffer0SurfacePitch = bs;
   }

   genX(emit_breakpoint)(batch, device, true);
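   /* One point-list vertex per bs-byte block: size / bs vertices copy the
    * whole range.
    */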
   anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.VertexCountPerInstance = size / bs;
      prim.StartVertexLocation = 0;
      prim.InstanceCount = 1;
      prim.StartInstanceLocation = 0;
      prim.BaseVertexLocation = 0;
   }

   genX(batch_emit_post_3dprimitive_was)(batch,
                                         device,
                                         _3DPRIM_POINTLIST, size / bs);

   genX(emit_breakpoint)(batch, device, false);
}

void
genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
                          struct anv_device *device,
                          struct anv_cmd_buffer *cmd_buffer,
                          struct anv_batch *batch)
{
   memset(state, 0, sizeof(*state));

   state->cmd_buffer = cmd_buffer;
   state->batch = batch;
   state->device = device;

   if (state->cmd_buffer) {
      if (!cmd_buffer->state.current_l3_config) {
         genX(cmd_buffer_config_l3)(cmd_buffer,
                                    intel_get_default_l3_config(device->info));
      }
      emit_common_so_memcpy(state,
                            &state->cmd_buffer->state.gfx.urb_cfg,
                            cmd_buffer->state.current_l3_config);
   } else {
      const struct intel_l3_config *cfg =
         intel_get_default_l3_config(device->info);
      genX(emit_l3_config)(batch, device, cfg);
      genX(emit_pipeline_select)(batch, _3D, device);

      /* Dummy URB config; will trigger URB re-emission */
      struct intel_urb_config urb_cfg_in = { 0 };
      emit_common_so_memcpy(state, &urb_cfg_in, cfg);
   }
}

void
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
{
   genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                                 NULL);

   if (state->cmd_buffer) {
      /* Mark all the state emitted by the memcpy as dirty so it gets
       * re-emitted before the next draw.
       */
      struct anv_gfx_dynamic_state *hw_state =
         &state->cmd_buffer->state.gfx.dyn_state;

#if INTEL_WA_14018283232_GFX_VER
      genX(cmd_buffer_ensure_wa_14018283232)(state->cmd_buffer, false);
#endif

      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
#if GFX_VER >= 11
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
#endif
#if GFX_VER >= 12
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
      if (state->cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
      }

      state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE |
                                              ANV_CMD_DIRTY_INDEX_BUFFER);

      memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &state->urb_cfg,
             sizeof(struct intel_urb_config));
   }
}

void
genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
{
   if (intel_needs_workaround(state->device->info, 16013994831))
      genX(batch_set_preemption)(state->batch, state->device, _3D, true);

   anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);

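   /* Batches must end on a QWord boundary and MI_BATCH_BUFFER_END is only
    * one DWord, so pad with a MI_NOOP if needed.
    */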
   if ((state->batch->next - state->batch->start) & 4)
      anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
}

void
genX(emit_so_memcpy)(struct anv_memcpy_state *state,
                     struct anv_address dst, struct anv_address src,
                     uint32_t size)
{
   if (GFX_VER == 9 &&
       anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
                                                  &state->vb_dirty,
                                                  src, size)) {
      genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                    ANV_PIPE_CS_STALL_BIT |
                                    ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                                    NULL);
      memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
   }

   emit_so_memcpy(state, dst, src, size);
}

void
genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address dst, struct anv_address src,
                           uint32_t size)
{
   if (size == 0)
      return;

   struct anv_memcpy_state state;
   genX(emit_so_memcpy_init)(&state,
                             cmd_buffer->device,
                             cmd_buffer,
                             &cmd_buffer->batch);
   emit_so_memcpy(&state, dst, src, size);
   genX(emit_so_memcpy_fini)(&state);
}