/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "brw_batch.h"
#include "brw_buffer_objects.h"
#include "brw_bufmgr.h"
#include "brw_buffers.h"
#include "brw_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/intel_decoder.h"
#include "common/intel_gem.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"

#define FILE_DEBUG_FLAG DEBUG_BUFMGR

/**
 * Target sizes of the batch and state buffers. We create the initial
 * buffers at these sizes, and flush when they're nearly full. If we
 * underestimate how close we are to the end, and suddenly need more space
 * in the middle of a draw, we can grow the buffers, and finish the draw.
 * At that point, we'll be over our target size, so the next operation
 * should flush. Each time we flush the batch, we recreate both buffers
 * at the original target size, so it doesn't grow without bound.
 */
#define BATCH_SZ (20 * 1024)
#define STATE_SZ (16 * 1024)

static void
brw_batch_reset(struct brw_context *brw);
static void
brw_new_batch(struct brw_context *brw);

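/**
 * Return the number of fences currently queued in batch->exec_fences for the
 * next execbuf (submitted via I915_EXEC_FENCE_ARRAY).
 */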
static unsigned
num_fences(struct brw_batch *batch)
{
   return util_dynarray_num_elements(&batch->exec_fences,
                                     struct drm_i915_gem_exec_fence);
}


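/**
 * Dump the current execbuf validation list to stderr for debugging.
 */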
static void
dump_validation_list(struct brw_batch *batch)
{
   fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);

   for (int i = 0; i < batch->exec_count; i++) {
      uint64_t flags = batch->validation_list[i].flags;
      assert(batch->validation_list[i].handle ==
             batch->exec_bos[i]->gem_handle);
      fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%"PRIx64"%s (%"PRIu64"B)\n",
              i,
              batch->validation_list[i].handle,
              batch->exec_bos[i]->name,
              batch->exec_bos[i],
              (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
              (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
              (uint64_t)batch->validation_list[i].offset,
              (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
              batch->exec_bos[i]->size);
   }
}

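/**
 * Batch decoder callback: find the BO on the validation list that contains
 * the given GPU address, so its contents can be mapped and decoded.
 */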
static struct intel_batch_decode_bo
decode_get_bo(void *v_brw, bool ppgtt, uint64_t address)
{
   struct brw_context *brw = v_brw;
   struct brw_batch *batch = &brw->batch;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];
      /* The decoder zeroes out the top 16 bits, so we need to as well */
      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);

      if (address >= bo_address && address < bo_address + bo->size) {
         return (struct intel_batch_decode_bo) {
            .addr = bo_address,
            .size = bo->size,
            .map = brw_bo_map(brw, bo, MAP_READ),
         };
      }
   }

   return (struct intel_batch_decode_bo) { };
}

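/**
 * Batch decoder callback: return the size recorded for a piece of indirect
 * state, looked up by its offset from the state base address.
 */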
static unsigned
decode_get_state_size(void *v_brw, uint64_t address, uint64_t base_address)
{
   struct brw_context *brw = v_brw;
   struct brw_batch *batch = &brw->batch;
   unsigned size = (uintptr_t)
      _mesa_hash_table_u64_search(batch->state_batch_sizes,
                                  address - base_address);
   return size;
}

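/**
 * Allocate an empty relocation list with the given initial capacity.
 */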
static void
init_reloc_list(struct brw_reloc_list *rlist, int count)
{
   rlist->reloc_count = 0;
   rlist->reloc_array_size = count;
   rlist->relocs = malloc(rlist->reloc_array_size *
                          sizeof(struct drm_i915_gem_relocation_entry));
}

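/**
 * One-time batch setup: allocate the relocation and validation lists, set up
 * the batch decoder when batch debugging is enabled, and create the initial
 * batch/state buffers via brw_batch_reset().
 */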
void
brw_batch_init(struct brw_context *brw)
{
   struct brw_screen *screen = brw->screen;
   struct brw_batch *batch = &brw->batch;
   const struct intel_device_info *devinfo = &screen->devinfo;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      /* The shadow doesn't get relocs written so state decode fails. */
      batch->use_shadow_copy = false;
   } else
      batch->use_shadow_copy = !devinfo->has_llc;

   init_reloc_list(&batch->batch_relocs, 250);
   init_reloc_list(&batch->state_relocs, 250);

   batch->batch.map = NULL;
   batch->state.map = NULL;
   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
   batch->contains_fence_signal = false;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      batch->state_batch_sizes =
         _mesa_hash_table_u64_create(NULL);

      const unsigned decode_flags =
         INTEL_BATCH_DECODE_FULL |
         (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
         INTEL_BATCH_DECODE_OFFSETS |
         INTEL_BATCH_DECODE_FLOATS;

      intel_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
                                  decode_flags, NULL, decode_get_bo,
                                  decode_get_state_size, brw);
      batch->decoder.max_vbo_decoded_lines = 100;
   }

   batch->use_batch_first =
      screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;

   /* PIPE_CONTROL needs a w/a but only on gfx6 */
   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->ver == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   brw_batch_reset(brw);
}

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

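/**
 * Add a BO to the validation list for the next execbuf and return its index.
 * If the BO is already on the list, the existing index is returned.
 */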
static unsigned
add_exec_bo(struct brw_batch *batch, struct brw_bo *bo)
{
   assert(bo->bufmgr == batch->batch.bo->bufmgr);

   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }

   brw_bo_reference(bo);

   if (batch->exec_count == batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos =
         realloc(batch->exec_bos,
                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }

   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .offset = bo->gtt_offset,
         .flags = bo->kflags,
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   return batch->exec_count++;
}

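/**
 * Allocate a fresh batch or state buffer at its target size and (re)create
 * its CPU mapping, either a malloc'd shadow copy or a direct GEM mapping.
 */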
static void
recreate_growing_buffer(struct brw_context *brw,
                        struct brw_growing_bo *grow,
                        const char *name, unsigned size,
                        enum brw_memory_zone memzone)
{
   struct brw_screen *screen = brw->screen;
   struct brw_batch *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = screen->bufmgr;

   /* We can't grow buffers when using softpin, so just overallocate them. */
   if (brw_using_softpin(bufmgr))
      size *= 2;

   grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
   grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;
   grow->memzone = memzone;

   if (batch->use_shadow_copy)
      grow->map = realloc(grow->map, grow->bo->size);
   else
      grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
}

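/**
 * Start a fresh batch: recreate the batch and state buffers at their target
 * sizes and reset the per-batch bookkeeping.
 */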
static void
brw_batch_reset(struct brw_context *brw)
{
   struct brw_batch *batch = &brw->batch;

   if (batch->last_bo != NULL) {
      brw_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->batch.bo;

   recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
                           BRW_MEMZONE_OTHER);
   batch->map_next = batch->batch.map;

   recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
                           BRW_MEMZONE_DYNAMIC);

   /* Avoid making 0 a valid state offset - otherwise the decoder will try
    * to decode data when we use offset 0 as a null pointer.
    */
   batch->state_used = 1;

   add_exec_bo(batch, batch->batch.bo);
   assert(batch->batch.bo->index == 0);

   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   if (batch->state_batch_sizes)
      _mesa_hash_table_u64_clear(batch->state_batch_sizes);

   /* Always add workaround_bo which contains a driver identifier to be
    * recorded in error states.
    */
   struct brw_bo *identifier_bo = brw->workaround_bo;
   if (identifier_bo)
      add_exec_bo(batch, identifier_bo);

   if (batch->contains_fence_signal)
      batch->contains_fence_signal = false;
}

static void
brw_batch_reset_and_clear_render_cache(struct brw_context *brw)
{
   brw_batch_reset(brw);
   brw_cache_sets_clear(brw);
}

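/**
 * Snapshot the current batch write pointer, relocation counts, and exec list
 * size so the batch can later be rolled back with brw_batch_reset_to_saved().
 */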
void
brw_batch_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
   brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
   brw->batch.saved.exec_count = brw->batch.exec_count;
}

bool
brw_batch_saved_state_is_empty(struct brw_context *brw)
{
   struct brw_batch *batch = &brw->batch;
   return (batch->saved.map_next == batch->batch.map);
}

void
brw_batch_reset_to_saved(struct brw_context *brw)
{
   for (int i = brw->batch.saved.exec_count;
        i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
   }
   brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
   brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
   brw->batch.exec_count = brw->batch.saved.exec_count;

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw_new_batch(brw);
}

void
brw_batch_free(struct brw_batch *batch)
{
   if (batch->use_shadow_copy) {
      free(batch->batch.map);
      free(batch->state.map);
   }

   for (int i = 0; i < batch->exec_count; i++) {
      brw_bo_unreference(batch->exec_bos[i]);
   }
   free(batch->batch_relocs.relocs);
   free(batch->state_relocs.relocs);
   free(batch->exec_bos);
   free(batch->validation_list);

   brw_bo_unreference(batch->last_bo);
   brw_bo_unreference(batch->batch.bo);
   brw_bo_unreference(batch->state.bo);
   if (batch->state_batch_sizes) {
      _mesa_hash_table_u64_destroy(batch->state_batch_sizes);
      intel_batch_decode_ctx_finish(&batch->decoder);
   }
}

/**
 * Finish copying the old batch/state buffer's contents to the new one
 * after we tried to "grow" the buffer in an earlier operation.
 */
static void
finish_growing_bos(struct brw_growing_bo *grow)
{
   struct brw_bo *old_bo = grow->partial_bo;
   if (!old_bo)
      return;

   memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);

   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;

   brw_bo_unreference(old_bo);
}

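/**
 * Rewrite every relocation that targets old_handle to point at new_handle,
 * used when a buffer is swapped out while growing.
 */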
static void
replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
                         uint32_t old_handle, uint32_t new_handle)
{
   for (int i = 0; i < rlist->reloc_count; i++) {
      if (rlist->relocs[i].target_handle == old_handle)
         rlist->relocs[i].target_handle = new_handle;
   }
}

/**
 * Grow either the batch or state buffer to a new larger size.
 *
 * We can't actually grow buffers, so we allocate a new one, copy over
 * the existing contents, and update our lists to refer to the new one.
 *
 * Note that this is only temporary - each new batch recreates the buffers
 * at their original target size (BATCH_SZ or STATE_SZ).
 */
static void
grow_buffer(struct brw_context *brw,
            struct brw_growing_bo *grow,
            unsigned existing_bytes,
            unsigned new_size)
{
   struct brw_batch *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = brw->bufmgr;
   struct brw_bo *bo = grow->bo;

   /* We can't grow buffers that are softpinned, as the growing mechanism
    * involves putting a larger buffer at the same gtt_offset...and we've
    * only allocated the smaller amount of VMA.  Without relocations, this
    * simply won't work.  This should never happen, however.
    */
   assert(!(bo->kflags & EXEC_OBJECT_PINNED));

   perf_debug("Growing %s - ran out of space\n", bo->name);

   if (grow->partial_bo) {
      /* We've already grown once, and now we need to do it again.
       * Finish our last grow operation so we can start a new one.
       * This should basically never happen.
       */
      perf_debug("Had to grow multiple times");
      finish_growing_bos(grow);
   }

   struct brw_bo *new_bo =
      brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);

   /* Copy existing data to the new larger buffer */
   grow->partial_bo_map = grow->map;

   if (batch->use_shadow_copy) {
      /* We can't safely use realloc, as it may move the existing buffer,
       * breaking existing pointers the caller may still be using.  Just
       * malloc a new copy and memcpy it like the normal BO path.
       *
       * Use bo->size rather than new_size because the bufmgr may have
       * rounded up the size, and we want the shadow size to match.
       */
      grow->map = malloc(new_bo->size);
   } else {
      grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
   }

   /* Try to put the new BO at the same GTT offset as the old BO (which
    * we're throwing away, so it doesn't need to be there).
    *
    * This guarantees that our relocations continue to work: values we've
    * already written into the buffer, values we're going to write into the
    * buffer, and the validation/relocation lists all will match.
    *
    * Also preserve kflags for EXEC_OBJECT_CAPTURE.
    */
   new_bo->gtt_offset = bo->gtt_offset;
   new_bo->index = bo->index;
   new_bo->kflags = bo->kflags;

   /* Batch/state buffers are per-context, and if we've run out of space,
    * we must have actually used them before, so...they will be in the list.
    */
   assert(bo->index < batch->exec_count);
   assert(batch->exec_bos[bo->index] == bo);

   /* Update the validation list to use the new BO. */
   batch->validation_list[bo->index].handle = new_bo->gem_handle;

   if (!batch->use_batch_first) {
      /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
       * update the relocation list entries to point at the new BO as well.
       * (With newer kernels, the "handle" is an offset into the validation
       * list, which remains unchanged, so we can skip this.)
       */
      replace_bo_in_reloc_list(&batch->batch_relocs,
                               bo->gem_handle, new_bo->gem_handle);
      replace_bo_in_reloc_list(&batch->state_relocs,
                               bo->gem_handle, new_bo->gem_handle);
   }

   /* Exchange the two BOs...without breaking pointers to the old BO.
    *
    * Consider this scenario:
    *
    * 1. Somebody calls brw_state_batch() to get a region of memory, and
    *    then creates a brw_address pointing to brw->batch.state.bo.
    * 2. They then call brw_state_batch() a second time, which happens to
    *    grow and replace the state buffer.  They then try to emit a
    *    relocation to their first section of memory.
    *
    * If we replace the brw->batch.state.bo pointer at step 2, we would
    * break the address created in step 1.  They'd have a pointer to the
    * old destroyed BO.  Emitting a relocation would add this dead BO to
    * the validation list...causing /both/ statebuffers to be in the list,
    * and all kinds of disasters.
    *
    * This is not a contrived case - BLORP vertex data upload hits this.
    *
    * There are worse scenarios too.  Fences for GL sync objects reference
    * brw->batch.batch.bo.  If we replaced the batch pointer when growing,
    * we'd need to chase down every fence and update it to point to the
    * new BO.  Otherwise, it would refer to a "batch" that never actually
    * gets submitted, and would fail to trigger.
    *
    * To work around both of these issues, we transmute the buffers in
    * place, making the existing struct brw_bo represent the new buffer,
    * and "new_bo" represent the old BO.  This is highly unusual, but it
    * seems like a necessary evil.
    *
    * We also defer the memcpy of the existing batch's contents.  Callers
    * may make multiple brw_state_batch calls, and retain pointers to the
    * old BO's map.  We'll perform the memcpy in finish_growing_bos() when
    * we finally submit the batch, at which point we've finished uploading
    * state, and nobody should have any old references anymore.
    *
    * To do that, we keep a reference to the old BO in grow->partial_bo,
    * and store the number of bytes to copy in grow->partial_bytes.  We
    * can monkey with the refcounts directly without atomics because these
    * are per-context BOs and they can only be touched by this thread.
    */
   assert(new_bo->refcount == 1);
   new_bo->refcount = bo->refcount;
   bo->refcount = 1;

   assert(list_is_empty(&bo->exports));
   assert(list_is_empty(&new_bo->exports));

   struct brw_bo tmp;
   memcpy(&tmp, bo, sizeof(struct brw_bo));
   memcpy(bo, new_bo, sizeof(struct brw_bo));
   memcpy(new_bo, &tmp, sizeof(struct brw_bo));

   list_inithead(&bo->exports);
   list_inithead(&new_bo->exports);

   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
   grow->partial_bytes = existing_bytes;
}

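/**
 * Make sure there is room for sz more bytes of commands: flush if we have
 * passed the target batch size (and wrapping is allowed), or grow the batch
 * buffer when we cannot flush mid-draw.
 */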
void
brw_batch_require_space(struct brw_context *brw, GLuint sz)
{
   struct brw_batch *batch = &brw->batch;

   const unsigned batch_used = USED_BATCH(*batch) * 4;
   if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
      brw_batch_flush(brw);
   } else if (batch_used + sz >= batch->batch.bo->size) {
      const unsigned new_size =
         MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
              MAX_BATCH_SIZE);
      grow_buffer(brw, &batch->batch, batch_used, new_size);
      batch->map_next = (void *) batch->batch.map + batch_used;
      assert(batch_used + sz < batch->batch.bo->size);
   }
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Unreference any BOs held by the previous batch, and reset counts. */
   for (int i = 0; i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
      brw->batch.exec_bos[i] = NULL;
   }
   brw->batch.batch_relocs.reloc_count = 0;
   brw->batch.state_relocs.reloc_count = 0;
   brw->batch.exec_count = 0;
   brw->batch.aperture_space = 0;

   brw_bo_unreference(brw->batch.state.bo);

   /* Create a new batchbuffer and reset the associated state: */
   brw_batch_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == 0) {
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
      brw_upload_invariant_state(brw);
   }

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.index_size = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG(DEBUG_SHADER_TIME))
      brw_collect_and_report_shader_time(brw);

   brw_batch_maybe_noop(brw);
}

/**
 * Called from brw_batch_flush before emitting MI_BATCH_BUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   brw->batch.no_wrap = true;

   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* Work around L3 state leaking into contexts which set MI_RESTORE_INHIBIT
    * and assume that the L3 cache is configured according to the hardware
    * defaults.  On Kernel 4.16+, we no longer need to do this.
    */
   if (devinfo->ver >= 7 &&
       !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
      gfx7_restore_default_l3_config(brw);

   if (devinfo->is_haswell) {
      /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
       * 3DSTATE_CC_STATE_POINTERS > "Note":
       *
       * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
       * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
       *
       * From the example in the docs, it seems to expect a regular pipe control
       * flush here as well. We may have done it already, but meh.
       *
       * See also WaAvoidRCZCounterRollover.
       */
      brw_emit_mi_flush(brw);
      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
      OUT_BATCH(brw->cc.state_offset | 1);
      ADVANCE_BATCH();
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                       PIPE_CONTROL_CS_STALL);
   }

   /* Do not restore push constant packets during context restore. */
   if (devinfo->ver >= 7)
      gfx7_emit_isp_disable(brw);

   /* Emit MI_BATCH_BUFFER_END to finish our batch.  Note that execbuf2
    * requires our batch size to be QWord aligned, so we pad it out if
    * necessary by emitting an extra MI_NOOP after the end.
    */
   brw_batch_require_space(brw, 8);
   *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
   if (USED_BATCH(brw->batch) & 1) {
      *brw->batch.map_next++ = MI_NOOP;
   }

   brw->batch.no_wrap = false;
}

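/**
 * Throttle our rendering: wait on an earlier frame's batch (or call the
 * kernel throttle ioctl) so the CPU doesn't run too far ahead of the GPU.
 */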
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point, the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling) {
            brw_bo_wait_rendering(brw->throttle_batch[1]);
         }
         brw_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      drmCommandNone(brw->screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

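/**
 * Thin wrapper around the DRM_IOCTL_I915_GEM_EXECBUFFER2 ioctl: submits the
 * validation list, handles in/out fences, and records the offsets the kernel
 * assigned to each BO.
 */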
static int
execbuffer(int fd,
           struct brw_batch *batch,
           uint32_t ctx_id,
           int used,
           int in_fence,
           int *out_fence,
           int flags)
{
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      .batch_len = used,
      .flags = flags,
      .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
   };

   unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;

   if (in_fence != -1) {
      execbuf.rsvd2 = in_fence;
      execbuf.flags |= I915_EXEC_FENCE_IN;
   }

   if (out_fence != NULL) {
      cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
      *out_fence = -1;
      execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   if (num_fences(batch)) {
      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.num_cliprects = num_fences(batch);
      execbuf.cliprects_ptr =
         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
   }

   int ret = drmIoctl(fd, cmd, &execbuf);
   if (ret != 0)
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      bo->index = -1;

      /* Update brw_bo::gtt_offset */
      if (batch->validation_list[i].offset != bo->gtt_offset) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
             bo->gem_handle, bo->gtt_offset,
             (uint64_t)batch->validation_list[i].offset);
         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
         bo->gtt_offset = batch->validation_list[i].offset;
      }
   }

   if (ret == 0 && out_fence != NULL)
      *out_fence = execbuf.rsvd2 >> 32;

   return ret;
}

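/**
 * Copy any shadow maps into the real BOs, attach the relocation lists to the
 * validation entries, and hand the batch off to the kernel via execbuffer().
 */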
static int
submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct brw_batch *batch = &brw->batch;
   int ret = 0;

   if (batch->use_shadow_copy) {
      void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
      memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));

      bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
      memcpy(bo_map, batch->state.map, batch->state_used);
   }

   brw_bo_unmap(batch->batch.bo);
   brw_bo_unmap(batch->state.bo);

   if (!brw->screen->devinfo.no_hw) {
      /* The requirements for using I915_EXEC_NO_RELOC are:
       *
       * The addresses written in the objects must match the corresponding
       * reloc.gtt_offset which in turn must match the corresponding
       * execobject.offset.
       *
       * Any render targets written to in the batch must be flagged with
       * EXEC_OBJECT_WRITE.
       *
       * To avoid stalling, execobject.offset should match the current
       * address of that object within the active context.
       */
      int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;

      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      /* Set statebuffer relocations */
      const unsigned state_index = batch->state.bo->index;
      if (state_index < batch->exec_count &&
          batch->exec_bos[state_index] == batch->state.bo) {
         struct drm_i915_gem_exec_object2 *entry =
            &batch->validation_list[state_index];
         assert(entry->handle == batch->state.bo->gem_handle);
         entry->relocation_count = batch->state_relocs.reloc_count;
         entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
      }

      /* Set batchbuffer relocations */
      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
      assert(entry->handle == batch->batch.bo->gem_handle);
      entry->relocation_count = batch->batch_relocs.reloc_count;
      entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;

      if (batch->use_batch_first) {
         flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
      } else {
         /* Move the batch to the end of the validation list */
         struct drm_i915_gem_exec_object2 tmp;
         struct brw_bo *tmp_bo;
         const unsigned index = batch->exec_count - 1;

         tmp = *entry;
         *entry = batch->validation_list[index];
         batch->validation_list[index] = tmp;

         tmp_bo = batch->exec_bos[0];
         batch->exec_bos[0] = batch->exec_bos[index];
         batch->exec_bos[index] = tmp_bo;
      }

      ret = execbuffer(brw->screen->fd, batch, brw->hw_ctx,
                       4 * USED_BATCH(*batch),
                       in_fence_fd, out_fence_fd, flags);

      throttle(brw);
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      intel_print_batch(&batch->decoder, batch->batch.map,
                        4 * USED_BATCH(*batch),
                        batch->batch.bo->gtt_offset, false);
   }

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
              strerror(-ret));
      abort();
   }

   return ret;
}

/**
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_brw_batch_flush_fence(struct brw_context *brw,
                       int in_fence_fd, int *out_fence_fd,
                       const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0 && !brw->batch.contains_fence_signal)
      return 0;

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->batch.no_wrap);

   brw_finish_batch(brw);
   brw_upload_finish(&brw->upload);

   finish_growing_bos(&brw->batch.batch);
   finish_growing_bos(&brw->batch.state);

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.batch.bo;
      brw_bo_reference(brw->throttle_batch[0]);
   }

   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.state_used;
      fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
              " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
              " %4d batch relocs, %4d state relocs\n", file, line,
              bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
              bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
              brw->batch.exec_count,
              (float) (brw->batch.aperture_space / (1024 * 1024)),
              brw->batch.batch_relocs.reloc_count,
              brw->batch.state_relocs.reloc_count);

      dump_validation_list(&brw->batch);
   }

   ret = submit_batch(brw, in_fence_fd, out_fence_fd);

   if (INTEL_DEBUG(DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      brw_bo_wait_rendering(brw->batch.batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}

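/**
 * When the frontend has requested no-op rendering and the batch is still
 * empty, terminate it immediately with MI_BATCH_BUFFER_END so that any
 * commands emitted afterwards are never executed.
 */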
void
brw_batch_maybe_noop(struct brw_context *brw)
{
   if (!brw->frontend_noop || USED_BATCH(brw->batch) != 0)
      return;

   BEGIN_BATCH(1);
   OUT_BATCH(MI_BATCH_BUFFER_END);
   ADVANCE_BATCH();
}

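/**
 * Return true if the given BO is on the current batch's validation list.
 */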
bool
brw_batch_references(struct brw_batch *batch, struct brw_bo *bo)
{
   unsigned index = READ_ONCE(bo->index);
   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return true;

   for (int i = 0; i < batch->exec_count; i++) {
      if (batch->exec_bos[i] == bo)
         return true;
   }
   return false;
}

/* This is the only way buffers get added to the validate list.
 */
static uint64_t
emit_reloc(struct brw_batch *batch,
           struct brw_reloc_list *rlist, uint32_t offset,
           struct brw_bo *target, int32_t target_offset,
           unsigned int reloc_flags)
{
   assert(target != NULL);

   if (target->kflags & EXEC_OBJECT_PINNED) {
      brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
      return intel_canonical_address(target->gtt_offset + target_offset);
   }

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   if (rlist->reloc_count == rlist->reloc_array_size) {
      rlist->reloc_array_size *= 2;
      rlist->relocs = realloc(rlist->relocs,
                              rlist->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   if (reloc_flags & RELOC_32BIT) {
      /* Restrict this buffer to the low 32 bits of the address space.
       *
       * Altering the validation list flags restricts it for this batch,
       * but we also alter the BO's kflags to restrict it permanently
       * (until the BO is destroyed and put back in the cache).  Buffers
       * may stay bound across batches, and we want to keep it constrained.
       */
      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
      reloc_flags &= ~RELOC_32BIT;
   }

   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   rlist->relocs[rlist->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = offset,
         .delta = target_offset,
         .target_handle = batch->use_batch_first ? index : target->gem_handle,
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return entry->offset + target_offset;
}

void
brw_use_pinned_bo(struct brw_batch *batch, struct brw_bo *bo,
                  unsigned writable_flag)
{
   assert(bo->kflags & EXEC_OBJECT_PINNED);
   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);

   unsigned int index = add_exec_bo(batch, bo);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
   assert(entry->offset == bo->gtt_offset);

   if (writable_flag)
      entry->flags |= EXEC_OBJECT_WRITE;
}

uint64_t
brw_batch_reloc(struct brw_batch *batch, uint32_t batch_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->batch_relocs, batch_offset,
                     target, target_offset, reloc_flags);
}

uint64_t
brw_state_reloc(struct brw_batch *batch, uint32_t state_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->state_relocs, state_offset,
                     target, target_offset, reloc_flags);
}

/**
 * Reserve some space in the statebuffer, or flush.
 *
 * This is used to estimate when we're near the end of the batch,
 * so we can flush early.
 */
void
brw_require_statebuffer_space(struct brw_context *brw, int size)
{
   if (brw->batch.state_used + size >= STATE_SZ)
      brw_batch_flush(brw);
}

/**
 * Allocates a block of space in the batchbuffer for indirect state.
 */
void *
brw_state_batch(struct brw_context *brw,
                int size,
                int alignment,
                uint32_t *out_offset)
{
   struct brw_batch *batch = &brw->batch;

   assert(size < batch->state.bo->size);

   uint32_t offset = ALIGN(batch->state_used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      brw_batch_flush(brw);
      offset = ALIGN(batch->state_used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      grow_buffer(brw, &batch->state, batch->state_used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      _mesa_hash_table_u64_insert(batch->state_batch_sizes,
                                  offset, (void *) (uintptr_t) size);
   }

   batch->state_used = offset + size;

   *out_offset = offset;
   return batch->state.map + (offset >> 2);
}

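/**
 * Copy a block of dword-aligned data into the batch at the current write
 * pointer.
 */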
void
brw_batch_data(struct brw_context *brw,
               const void *data, GLuint bytes)
{
   assert((bytes & 3) == 0);
   brw_batch_require_space(brw, bytes);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

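/**
 * Load 'size' consecutive 32-bit registers from a buffer via
 * MI_LOAD_REGISTER_MEM (one command per register).
 */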
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset,
                        int size)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gfx7+. */
   assert(devinfo->ver >= 7);

   if (devinfo->ver >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   if (devinfo->ver >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (devinfo->ver >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->screen->devinfo.ver >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->screen->devinfo.ver >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
{
   assert(brw->screen->devinfo.verx10 >= 75);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t dest, uint32_t src)
{
   assert(brw->screen->devinfo.verx10 >= 75);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (devinfo->ver >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (devinfo->ver >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}