/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_blitter.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

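/* Pre-allocate the descriptors every batch needs up front: the framebuffer
 * descriptor (plus ZS/CRC extension and render targets on v5+) and the
 * thread-local storage descriptor, which on Midgard is embedded in the FB
 * descriptor (with a packed framebuffer pointer on v5). */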
void
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(ptr.opaque, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   batch->tls.gpu = ptr.opaque[0];
#endif
#endif
}

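/* Submit a single job chain to the kernel through the Panfrost DRM submit
 * ioctl, collecting the BO handles referenced by the batch and wiring up the
 * optional in/out sync objects. Returns 0 on success or an errno value. */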
static int
jm_submit_jc(struct panfrost_batch *batch, mali_ptr first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so fall back to our own if we
    * weren't given one to use. Remember that we did so, so we can free it
    * once we're done without double-freeing a syncobj we were given. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. Tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list is coming from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);
   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (u64)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret =
         drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX, 0, NULL);

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if we're blackhole rendering; that's OK */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc,
                                  panfrost_device_gpu_id(dev));
   }

   return 0;
}


/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out_sync corresponding to the later of the two (since there will be an
 * implicit dep between them) */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do so
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, 0,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret =
         jm_submit_jc(batch, batch->jm.jobs.frag, PANFROST_JD_REQ_FS, out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

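/* Record the blitter jobs needed to preload existing framebuffer contents
 * into the batch's vertex/tiler job chain before any new rendering. */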
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   GENX(pan_preload_fb)
   (&dev->blitter, &batch->pool.base, &batch->jm.jobs.vtc_jc, fb,
    batch->tls.gpu, PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
}

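/* Allocate and pack the FRAGMENT job descriptor for the batch's framebuffer
 * and remember its GPU address so jm_submit_batch() can submit it. */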
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, transfer.cpu);

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
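/* Fill in a Valhall SHADER_ENVIRONMENT: resource tables, thread storage,
 * shader program pointer and the FAU (push uniform) buffer for a stage. */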
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, mali_ptr shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64 bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

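/* Build and enqueue a COMPUTE job for a grid launch. With GPU-side indirect
 * dispatch, the workgroup counts are patched in by a helper job that the
 * compute job depends on. */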
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_INDIRECTS
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_COMPUTE, true, false, indirect_dep, 0, &t,
                  false);
}

#if PAN_ARCH >= 6
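/* Lazily create the Bifrost/Valhall tiler context for this batch: a
 * TILER_HEAP descriptor pointing at the shared device heap, plus a
 * TILER_CONTEXT sized to the current framebuffer. The result is cached in
 * batch->tiler_ctx.bifrost. */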
static mali_ptr
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   if (batch->tiler_ctx.bifrost)
      return batch->tiler_ctx.bifrost;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   mali_ptr heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         pan_tristate_get(batch->first_provoking_vertex);
#endif
   }

   batch->tiler_ctx.bifrost = t.gpu;
   return batch->tiler_ctx.bifrost;
}
#endif

#if PAN_ARCH <= 7
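/* Fill the descriptor pointers shared by vertex and fragment DRAW sections on
 * Midgard/Bifrost: UBOs, push uniforms, textures and samplers, plus the
 * instancing parameters. */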
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

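/* Pack the DRAW section of a vertex job: renderer state, attributes, varyings
 * and thread storage for the vertex shader. */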
static void
jm_emit_vertex_draw(struct panfrost_batch *batch, void *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

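/* Pack a standalone vertex COMPUTE_JOB from a pre-built invocation template,
 * used for non-IDVS draws and transform feedback. */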
static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

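/* Pack the DRAW section of a tiler job. This is where culling, occlusion
 * queries and (on Valhall) the fragment shader environment, blend state and
 * early-ZS/FPK policies are configured, with a reduced depth-only setup when
 * no fragment shader is required. */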
static void
jm_emit_tiler_draw(void *out, struct panfrost_batch *batch, bool fs_required,
                   enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled by a
       * single ST_TILE in the blend shader with the current sample ID,
       * requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, void *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
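/* Pack a Valhall MALLOC_VERTEX (memory-allocated IDVS) job: primitive,
 * instance count, varying allocation sizes, tiler context, scissor, indices,
 * the DRAW section and the position and (optional) varying shader
 * environments. */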
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
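/* Pack a Midgard/Bifrost TILER job: copy the invocation template, pack the
 * primitive, attach the tiler context (v6+), and emit the DRAW and
 * primitive size sections. */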
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

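/* Emit the compute (or, on Midgard, vertex) job that runs the transform
 * feedback program for a draw, with one invocation per output vertex and
 * instance. */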
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   mali_ptr saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc, job_type, true,
                  false, 0, 0, &t, false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex = pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                                    MALI_JOB_TYPE_VERTEX, false, false, 0, 0,
                                    vertex_job, false);

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_TILER, false, false, vertex, 0, tiler_job,
                  false);
}
#endif

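/* Emit and enqueue the jobs for a single draw: an IDVS job on Bifrost/Valhall
 * when the vertex shader supports it, otherwise a separate vertex COMPUTE job
 * followed by a dependent TILER job. */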
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, 0, &tiler,
                  false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                     MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, 0, &tiler,
                     false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}