/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_fb_preload.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

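/* These helpers drive Mali Job Manager (JM) GPUs (v4-v9): work is recorded as
 * chains of hardware job descriptors and submitted through the panfrost DRM
 * driver. v10+ parts use the CSF backend instead, hence the guard above.
 */
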
int
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif
   if (!batch->framebuffer.gpu)
      return -1;

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

   if (!batch->tls.cpu)
      return -1;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(&ptr, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   /* XXX: THIS IS A BUG, FIXME */
   batch->tls.gpu = ptr.opaque[0];
#endif
#endif

   return 0;
}

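/* Build a drm_panfrost_submit for the job chain starting at first_job_desc,
 * attach every BO the batch references, and hand it to the kernel. Returns 0
 * on success or an errno value on failure.
 */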
static int
jm_submit_jc(struct panfrost_batch *batch, uint64_t first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so fall back to the context's
    * own syncobj if we weren't given one to use. That way we can wait on
    * the job below without risking a double-free of a caller-provided
    * syncobj. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. The tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list comes from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (uint64_t)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if blackhole rendering, that's ok */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc,
                                  panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out_sync corresponding to the later of the two (since there will be an
 * implicit dep between them) */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do that
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

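/* Queue any tile-buffer preload jobs needed before the first draw. Preload
 * jobs are only generated on Midgard; on v6+ no jobs are expected since
 * preloading happens as part of the framebuffer setup, which is what the
 * assert below checks. */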
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count =
      GENX(pan_preload_fb)(&dev->fb_preload_cache, &batch->pool.base, fb,
                           batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

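/* Emit the framebuffer descriptor(s). pan_emit_fbd() returns the tag bits the
 * hardware expects in the low bits of the FBD pointer, so OR them into the
 * GPU address used by the fragment job. */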
void
GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
                   struct pan_tls_info *tls)
{
   batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
      fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
}

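/* Allocate and pack the FRAGMENT job pointing at the batch's framebuffer
 * descriptor; its GPU address is stashed in batch->jm.jobs.frag and submitted
 * as a separate job chain in jm_submit_batch(). */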
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
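/* Valhall-only helper: common shader environment setup (resource tables,
 * thread storage, shader program and the FAU push-constant block) shared by
 * the compute, fragment, position and varying stages. */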
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, uint64_t shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each FAU entry is 64 bits, i.e. two 32-bit push uniforms */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

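/* Record a compute dispatch as a COMPUTE job. For indirect dispatches, the
 * workgroup counts are packed as 1x1x1 and a helper job emitted below patches
 * them from the indirect buffer before this job runs (when the GPU/driver
 * combination supports it). */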
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
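/* Lazily allocate the tiler heap and tiler context descriptors for the batch
 * and cache the result in tiler_ctx, so that every tiler job in the batch
 * shares a single tiler context. */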
static uint64_t
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   uint64_t tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_cast_and_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   uint64_t heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) {
      tiler.hierarchy_mask =
         pan_select_tiler_hierarchy_mask(batch->key.width,
                                         batch->key.height,
                                         dev->tiler_features.max_levels);

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         batch->first_provoking_vertex == U_TRISTATE_YES;
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
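/* Fill the DRAW descriptor fields shared by the vertex and tiler jobs on
 * Midgard/Bifrost: instancing parameters plus the per-stage resource
 * tables. */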
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

static void
jm_emit_vertex_draw(struct panfrost_batch *batch,
                    struct mali_draw_packed *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

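/* Pack the DRAW section of a tiler job (or of a Valhall IDVS job): culling
 * state, occlusion queries, and on Valhall the fragment shader environment
 * and forward-pixel-kill configuration. */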
static void
jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,
                   bool fs_required, enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader */
         cfg.evaluate_per_sample |=
            (fs->info.fs.sample_shading && rast->multisample);

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, struct mali_primitive_packed *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
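/* Valhall IDVS: a single MALLOC_VERTEX job covers position shading, optional
 * varying shading and tiling, with per-vertex storage sized through the
 * ALLOCATION section rather than pre-allocated varying buffers. */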
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
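/* Pack a tiler job for a direct draw on Midgard/Bifrost. This is also used
 * for the tiler half of Bifrost IDVS (INDEXED_VERTEX) jobs, relying on the
 * two descriptors sharing the same section layout (see jm_launch_draw). */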
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

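/* Transform feedback is emulated by re-running the vertex shader as a
 * compute-style job, one invocation per vertex per instance, with varyings
 * disabled since the XFB outputs are written directly by the shader. */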
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   uint64_t saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

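/* Record a direct (non-indirect) draw: emit the vertex/tiler or IDVS job
 * descriptors and push them onto the batch's job chain with the appropriate
 * dependencies. */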
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

   if ((!idvs && !vertex.cpu) || !tiler.cpu) {
      mesa_loge("jm_launch_draw failed");
      return;
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

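/* Emit a WRITE_VALUE job that stores the GPU system timestamp at the given
 * byte offset within dst, and mark dst as written by this batch. */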
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}