/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_fb_preload.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

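/* Allocate the per-batch descriptors: the framebuffer descriptor, and the
 * thread-local storage descriptor where it is not embedded in the FB
 * descriptor. Returns 0 on success and -1 on allocation failure.
 */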
int
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif
   if (!batch->framebuffer.gpu)
      return -1;

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

   if (!batch->tls.cpu)
      return -1;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(&ptr, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   /* XXX: THIS IS A BUG, FIXME */
   batch->tls.gpu = ptr.opaque[0];
#endif
#endif

   return 0;
}

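/* Submit a single job chain through the Panfrost DRM submit ioctl, gathering
 * the BO handles referenced by the batch and its pools. When tracing or sync
 * debugging is enabled, also waits on the submission and decodes the job
 * chain. Returns 0 on success or an errno value on failure.
 */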
static int
jm_submit_jc(struct panfrost_batch *batch, uint64_t first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so make one of our own if we
    * weren't given one to use. Remember that we did so, so we can free it
    * after we're done, while preventing double-frees if we were given a
    * syncobj */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. The tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list comes from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (uint64_t)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if blackhole rendering, that's ok */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * outsync corresponding to the later of the two (since there will be an
 * implicit dep between them) */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do that
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

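/* Emit the framebuffer preload jobs, if any, and add them to the vertex/tiler
 * job chain. As the assert below documents, preload jobs are only generated
 * on Midgard (PAN_ARCH < 6).
 */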
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count =
      GENX(pan_preload_fb)(&dev->fb_preload_cache, &batch->pool.base, fb,
                           batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

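/* Emit the framebuffer descriptor contents for the batch and fold the value
 * returned by pan_emit_fbd() (the FBD tag bits) into the stored GPU address.
 */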
void
GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
                   struct pan_tls_info *tls)
{
   batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
      fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
}

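/* Emit the fragment job descriptor for the batch and record its GPU address
 * so it can be submitted after the vertex/tiler chain.
 */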
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
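/* Fill a Valhall SHADER_ENVIRONMENT with the resource tables, thread-local
 * storage, shader pointer and FAU (push uniform) buffer for the given stage.
 */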
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, uint64_t shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64-bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

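/* Emit and enqueue a compute job for a grid launch, covering both the
 * Midgard/Bifrost COMPUTE_JOB layout and the Valhall payload, plus indirect
 * dispatch where the GPU supports it.
 */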
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
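/* Return the GPU address of the batch's tiler context descriptor, allocating
 * and initializing it (along with the tiler heap descriptor) on first use.
 */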
static uint64_t
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   uint64_t tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_cast_and_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   uint64_t heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         batch->first_provoking_vertex == U_TRISTATE_YES;
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
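/* Fill the DRAW fields shared by the vertex and fragment sides: instancing
 * state, uniform buffers, push uniforms, textures and samplers.
 */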
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

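/* Pack the DRAW section of a vertex job from the vertex-stage state captured
 * in the batch.
 */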
static void
jm_emit_vertex_draw(struct panfrost_batch *batch,
                    struct mali_draw_packed *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

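/* Emit a vertex (compute) job: copy the invocation template, set the task
 * split and pack the DRAW section.
 */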
static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

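/* Pack the DRAW section of a tiler job: culling, occlusion queries and the
 * per-arch fragment shading state.
 */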
static void
jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,
                   bool fs_required, enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it
       * when a blend shader is used with multisampling, as this is
       * handled by a single ST_TILE in the blend shader with the
       * current sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines, DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, struct mali_primitive_packed *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
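/* Emit a Valhall MALLOC_VERTEX job, packing the primitive, allocation, tiler,
 * scissor, position and varying sections of an IDVS draw.
 */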
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
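/* Emit a Midgard/Bifrost tiler job for a draw: invocation, primitive, tiler
 * context (Bifrost only), DRAW section and primitive size.
 */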
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

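/* Launch the transform feedback program as a compute-style job (a VERTEX job
 * on Midgard) covering count vertices and all instances.
 */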
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   uint64_t saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

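/* Emit and enqueue the jobs for a direct draw: a MALLOC_VERTEX job on
 * Valhall, an INDEXED_VERTEX job for IDVS on Bifrost, or separate vertex and
 * tiler jobs otherwise.
 */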
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

   if ((!idvs && !vertex.cpu) || !tiler.cpu) {
      mesa_loge("jm_launch_draw failed");
      return;
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

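/* Indirect draws are not implemented for the JM backend. */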
void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

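/* Emit a WRITE_VALUE job that stores the system timestamp at the given offset
 * in the destination resource, and mark the resource as written by the batch.
 */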
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}