/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_fb_preload.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

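/* Allocate the per-batch descriptors that later emits patch in place: the
 * framebuffer descriptor (plus ZS/CRC extension and render targets on v5+)
 * and the thread local storage descriptor. Returns 0 on success, -1 if the
 * descriptor pool allocation fails.
 */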
int
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif
   if (!batch->framebuffer.gpu)
      return -1;

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
   if (!batch->tls.cpu)
      return -1;
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(&ptr, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   /* XXX: THIS IS A BUG, FIXME */
   batch->tls.gpu = ptr.opaque[0];
#endif
#endif

   return 0;
}

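/* Submit a single job chain through the legacy DRM_IOCTL_PANFROST_SUBMIT
 * interface. Collects every BO handle the batch references so the kernel can
 * handle residency and implicit sync, and optionally waits on and decodes the
 * chain when tracing/sync debugging is enabled.
 */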
static int
jm_submit_jc(struct panfrost_batch *batch, uint64_t first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so make one of our own if we
    * weren't given one to use. Remember that we did so, so we can free it
    * after we're done without double-freeing if we were given a syncobj. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

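   /* batch->bos is indexed by GEM handle, so the index can be handed to the
    * kernel directly as the BO handle. */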
   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. Tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list is coming from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (uint64_t)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if blackhole rendering, that's ok */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out_sync corresponding to the later of the two (since there will be an
 * implicit dependency between them). */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do so
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count =
      GENX(pan_preload_fb)(&dev->fb_preload_cache, &batch->pool.base, fb,
                           batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

void
GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
                   struct pan_tls_info *tls)
{
   batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
      fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
}

void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
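/* Fill a Valhall SHADER_ENVIRONMENT section: resource tables, thread storage,
 * shader pointer and the FAU (uniform) region for the given stage. */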
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, uint64_t shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64-bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

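/* Record a compute job for a grid launch. Indirect dispatches are packed with
 * a placeholder 1x1x1 workgroup count, which is patched on the GPU by an
 * indirect-dispatch job where supported. */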
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
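/* Lazily allocate the tiler heap and tiler context descriptors for this batch
 * and cache the GPU pointer so every tiler job in the batch shares the same
 * tiler context. */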
static uint64_t
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   uint64_t tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_cast_and_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   uint64_t heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         batch->first_provoking_vertex == U_TRISTATE_YES;
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
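/* Pack the draw-descriptor state that the vertex and tiler DRAW sections share
 * on Midgard/Bifrost. */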
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

static void
jm_emit_vertex_draw(struct panfrost_batch *batch,
                    struct mali_draw_packed *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

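/* Pack the DRAW section of a tiler (or Valhall IDVS) job: culling, occlusion
 * query state and, on Valhall, the fragment shader environment and
 * pixel-kill/FPK configuration. */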
static void
jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,
                   bool fs_required, enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled by a
       * single ST_TILE in the blend shader with the current sample ID,
       * requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, struct mali_primitive_packed *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
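/* Emit a Valhall MALLOC_VERTEX job, the memory-allocating IDVS variant where
 * vertex packet storage is allocated by the hardware rather than pre-allocated
 * by the driver, replacing the separate vertex/tiler jobs used on earlier
 * architectures. */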
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
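/* Emit a Midgard/Bifrost TILER job for a direct draw; the matching vertex (or
 * indexed-vertex) work is emitted separately by the caller. */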
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

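/* Transform feedback is emulated by running the vertex shader as a compute
 * job (a plain vertex job on Midgard) with no varyings bound. */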
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   uint64_t saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

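/* Build and queue the jobs for a direct draw: a MALLOC_VERTEX job on Valhall,
 * an INDEXED_VERTEX job for Bifrost IDVS, or a separate vertex + tiler job
 * pair otherwise. */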
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

   if ((!idvs && !vertex.cpu) || !tiler.cpu) {
      mesa_loge("jm_launch_draw failed");
      return;
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

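/* Emit a WRITE_VALUE job that stores the GPU system timestamp into the
 * destination resource at the given byte offset, and mark the resource as
 * written by the batch. */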
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}