/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "gpu_cmds.h"

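/*
 * Upload the CPU-side batch contents (4KiB) to the batch bo and submit it for
 * execution; gen7_render_context_flush() below does the same, but executes
 * against the context stored in the batch.
 */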
void
gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
					    NULL, 0, 0, 0);
	igt_assert(ret == 0);
}

void
gen7_render_context_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_gem_bo_context_exec(batch->bo, batch->ctx,
						    batch_end, 0);
	igt_assert(ret == 0);
}

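/*
 * Reserve an 8-dword CURBE entry in the batch, store the fill color in its
 * first byte and return the batch offset of the entry.
 */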
uint32_t
gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch,
			    uint8_t color)
{
	uint8_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch,
						       sizeof(uint32_t) * 8,
						       64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer = color;

	return offset;
}

uint32_t
gen11_fill_curbe_buffer_data(struct intel_batchbuffer *batch)
{
	uint32_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch,
						       sizeof(uint32_t) * 8,
						       64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer++ = 0;
	*curbe_buffer = 1;

	return offset;
}

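/*
 * Write a gen7 SURFACE_STATE for buf into the batch and return its batch
 * offset. The relocation lets the kernel patch the surface base address
 * (dword 1) at execution time; is_dst selects render-target vs. sampler
 * domains.
 */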
uint32_t
gen7_fill_surface_state(struct intel_batchbuffer *batch,
			const struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen7_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss1.base_addr = buf->bo->offset;
	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 4,
				buf->bo, 0,
				read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;

	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

uint32_t
gen7_fill_binding_table(struct intel_batchbuffer *batch,
			const struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = intel_batchbuffer_subdata_alloc(batch, 32, 64);
	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
	if (IS_GEN7(batch->devid))
		binding_table[0] = gen7_fill_surface_state(batch, dst,
						SURFACEFORMAT_R8_UNORM, 1);
	else
		binding_table[0] = gen8_fill_surface_state(batch, dst,
						SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

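/*
 * Two-entry binding table for the gen11 path: slot 0 binds the source as a
 * 1D float surface, slot 1 binds the destination as a RAW buffer surface.
 */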
uint32_t
gen11_fill_binding_table(struct intel_batchbuffer *batch,
			 const struct igt_buf *src, const struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = intel_batchbuffer_subdata_alloc(batch, 64, 64);
	offset = intel_batchbuffer_subdata_offset(batch, binding_table);
	binding_table[0] = gen11_fill_surface_state(batch, src,
						SURFACE_1D,
						SURFACEFORMAT_R32G32B32A32_FLOAT,
						0, 0,
						0);
	binding_table[1] = gen11_fill_surface_state(batch, dst,
						SURFACE_BUFFER,
						SURFACEFORMAT_RAW,
						1, 1,
						1);

	return offset;
}

uint32_t
gen7_fill_kernel(struct intel_batchbuffer *batch,
		 const uint32_t kernel[][4],
		 size_t size)
{
	uint32_t offset;

	offset = intel_batchbuffer_copy_data(batch, kernel, size, 64);

	return offset;
}

uint32_t
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch,
			       const struct igt_buf *dst,
			       const uint32_t kernel[][4],
			       size_t size)
{
	struct gen7_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc1.single_program_flow = 1;
	idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;

	idd->desc2.sampler_count = 0; /* 0 samplers used */
	idd->desc2.sampler_state_pointer = 0;

	idd->desc3.binding_table_entry_count = 0;
	idd->desc3.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc4.constant_urb_entry_read_offset = 0;
	idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

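/*
 * The surface, dynamic and instruction state bases are all pointed at the
 * batch bo itself, so the offsets returned by
 * intel_batchbuffer_subdata_offset() can be programmed directly as state
 * pointers.
 */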
void
gen7_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));

	/* general */
	OUT_BATCH(0);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general/dynamic/indirect/instruction access upper bounds */
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
}

void
gen7_emit_vfe_state(struct intel_batchbuffer *batch, uint32_t threads,
		    uint32_t urb_entries, uint32_t urb_size,
		    uint32_t curbe_size, uint32_t mode)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));

	/* scratch buffer */
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(threads << 16 |
		  urb_entries << 8 |
		  mode << 2); /* GPGPU vs media mode */

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(urb_size << 16 |	/* in 256-bit units */
		  curbe_size);		/* in 256-bit units */

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

void
gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
{
	OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* curbe total data length */
	OUT_BATCH(64);
	/* curbe data start address, relative to the dynamic state base
	 * address
	 */
	OUT_BATCH(curbe_buffer);
}

void
gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch,
				    uint32_t interface_descriptor)
{
	OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	if (IS_GEN7(batch->devid))
		OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
	else
		OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base
	 * address
	 */
	OUT_BATCH(interface_descriptor);
}

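/*
 * Emit one MEDIA_OBJECT per 16x16 block of the width x height region,
 * passing each block's origin as inline data (see gen_emit_media_object()).
 */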
void
gen7_emit_media_objects(struct intel_batchbuffer *batch,
			unsigned int x, unsigned int y,
			unsigned int width, unsigned int height)
{
	int i, j;

	for (i = 0; i < width / 16; i++) {
		for (j = 0; j < height / 16; j++) {
			gen_emit_media_object(batch, x + i * 16, y + j * 16);
		}
	}
}

void
gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned int x, unsigned int y,
		     unsigned int width, unsigned int height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Simply do SIMD16-based dispatch, so every thread uses
	 * SIMD16 channels.
	 *
	 * Define our own thread group size, e.g. 16x1 for every group, then
	 * we will have 1 thread per group in SIMD16 dispatch. So thread
	 * width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (width aligned up to a multiple of 16)
	 *      thread group Y = height;
	 */
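	/*
	 * Worked example (hypothetical values): width = 70, height = 50
	 * gives x_dim = 5, y_dim = 50 and right_mask = 0x3f, so only the
	 * 6 leftover channels run in the right-most column of groups.
	 */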
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;

	OUT_BATCH(GEN7_GPGPU_WALKER | 9);

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 |	/* SIMD16 */
		  0 << 16 |	/* depth:1 */
		  0 << 8 |	/* height:1 */
		  0);		/* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

uint32_t
gen8_spin_curbe_buffer_data(struct intel_batchbuffer *batch,
			    uint32_t iters)
{
	uint32_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = intel_batchbuffer_subdata_alloc(batch, 64, 64);
	offset = intel_batchbuffer_subdata_offset(batch, curbe_buffer);
	*curbe_buffer = iters;

	return offset;
}

uint32_t
gen8_fill_surface_state(struct intel_batchbuffer *batch,
			const struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = 1;		/* align 4 */
	ss->ss0.horizontal_alignment = 1;	/* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 8 * 4,
				buf->bo, 0, read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;
	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

uint32_t
gen11_fill_surface_state(struct intel_batchbuffer *batch,
			 const struct igt_buf *buf,
			 uint32_t surface_type,
			 uint32_t format,
			 uint32_t vertical_alignment,
			 uint32_t horizontal_alignment,
			 int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = intel_batchbuffer_subdata_alloc(batch, sizeof(*ss), 64);
	offset = intel_batchbuffer_subdata_offset(batch, ss);

	ss->ss0.surface_type = surface_type;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = vertical_alignment;	/* align 4 */
	ss->ss0.horizontal_alignment = horizontal_alignment;	/* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;
	else
		ss->ss0.tiled_mode = 0;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				intel_batchbuffer_subdata_offset(batch, ss) + 8 * 4,
				buf->bo, 0, read_domain, write_domain);
	igt_assert(ret == 0);

	if (is_dst) {
		ss->ss1.memory_object_control = 2;
		ss->ss2.height = 1;
		ss->ss2.width = 95;
		ss->ss3.pitch = 0;
		ss->ss7.shader_chanel_select_r = 4;
		ss->ss7.shader_chanel_select_g = 5;
		ss->ss7.shader_chanel_select_b = 6;
		ss->ss7.shader_chanel_select_a = 7;
	} else {
		ss->ss1.qpitch = 4040;
		ss->ss1.base_mip_level = 31;
		ss->ss2.height = 9216;
		ss->ss2.width = 1019;
		ss->ss3.pitch = 64;
		ss->ss5.mip_count = 2;
	}

	return offset;
}

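/*
 * Gen8+ variant of gen7_fill_interface_descriptor(): it reuses the same
 * binding table and kernel helpers, but the descriptor fields live in
 * different dwords and a thread-group size of one thread is programmed
 * explicitly.
 */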
uint32_t
gen8_fill_interface_descriptor(struct intel_batchbuffer *batch,
			       const struct igt_buf *dst,
			       const uint32_t kernel[][4],
			       size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0; /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	idd->desc6.num_threads_in_tg = 1;

	return offset;
}

uint32_t
gen11_fill_interface_descriptor(struct intel_batchbuffer *batch,
				const struct igt_buf *src,
				const struct igt_buf *dst,
				const uint32_t kernel[][4],
				size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen11_fill_binding_table(batch, src, dst);
	kernel_offset = gen7_fill_kernel(batch, kernel, size);

	idd = intel_batchbuffer_subdata_alloc(batch, sizeof(*idd), 64);
	offset = intel_batchbuffer_subdata_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0; /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	idd->desc6.num_threads_in_tg = 1;

	return offset;
}

void
gen8_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo,
		  I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set,
	 * otherwise it may result in a GPU hang
	 */
	OUT_BATCH(1 << 12 | 1);
}

void
gen8_emit_media_state_flush(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_MEDIA_STATE_FLUSH | (2 - 2));
	OUT_BATCH(0);
}

void
gen8_emit_vfe_state(struct intel_batchbuffer *batch, uint32_t threads,
		    uint32_t urb_entries, uint32_t urb_size,
		    uint32_t curbe_size)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (9 - 2));

	/* scratch buffer */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(threads << 16 |
		  urb_entries << 8);

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(urb_size << 16 |
		  curbe_size);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

void
gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned int x, unsigned int y,
		     unsigned int width, unsigned int height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Simply do SIMD16-based dispatch, so every thread uses
	 * SIMD16 channels.
	 *
	 * Define our own thread group size, e.g. 16x1 for every group, then
	 * we will have 1 thread per group in SIMD16 dispatch. So thread
	 * width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (width aligned up to a multiple of 16)
	 *      thread group Y = height;
	 */
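	/*
	 * Same dispatch math as gen7_emit_gpgpu_walk(); e.g. width = 32
	 * gives x_dim = 2 and right_mask = 0xffff (no partial column).
	 */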
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;

	OUT_BATCH(GEN7_GPGPU_WALKER | 13);

	OUT_BATCH(0); /* kernel offset */
	OUT_BATCH(0); /* indirect data length */
	OUT_BATCH(0); /* indirect data offset */

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 |	/* SIMD16 */
		  0 << 16 |	/* depth:1 */
		  0 << 8 |	/* height:1 */
		  0);		/* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

void
gen_emit_media_object(struct intel_batchbuffer *batch,
		      unsigned int xoffset, unsigned int yoffset)
{
	OUT_BATCH(GEN7_MEDIA_OBJECT | (8 - 2));

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* without indirect data */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* inline data (xoffset, yoffset) */
	OUT_BATCH(xoffset);
	OUT_BATCH(yoffset);
	if (AT_LEAST_GEN(batch->devid, 8) && !IS_CHERRYVIEW(batch->devid))
		gen8_emit_media_state_flush(batch);
}

void
gen9_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo,
		  I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		  BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set,
	 * otherwise it may result in a GPU hang
	 */
	OUT_BATCH(1 << 12 | 1);

	/* Bindless surface state base address */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0xfffff000);
}