/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_fb_preload.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif
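
/*
 * These helpers build and submit Job Manager (JM) job chains, the submission
 * model used by Mali GPUs before v10; v10 and later use the command-stream
 * frontend (CSF) instead, hence the guard above.
 */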

int
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif
   if (!batch->framebuffer.gpu)
      return -1;

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

   if (!batch->tls.cpu)
      return -1;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(&ptr, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   /* XXX: THIS IS A BUG, FIXME */
   batch->tls.gpu = ptr.opaque[0];
#endif
#endif

   return 0;
}

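/* Submit one job chain to the kernel through DRM_IOCTL_PANFROST_SUBMIT,
 * gathering the handles of every BO the batch touches so the kernel keeps
 * them alive while the jobs run.
 */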
static int
jm_submit_jc(struct panfrost_batch *batch, uint64_t first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj, so make one of our own if we
    * weren't given one to use. Remember that we did so, so we can free it
    * after we're done, while preventing double-frees if we were given a
    * syncobj. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. The tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list comes from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (uint64_t)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if blackhole rendering is enabled; that's OK */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out-sync corresponding to the later of the two (since there will be an
 * implicit dependency between them). */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do that
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

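/* Queue the jobs that preload the existing framebuffer contents at the start
 * of the batch. On Midgard these are real tiler jobs; on Bifrost and later
 * the preload is handled through the framebuffer descriptor instead, so no
 * jobs are expected there (see the assert below).
 */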
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count =
      GENX(pan_preload_fb)(&dev->fb_preload_cache, &batch->pool.base, fb,
                           batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

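/* Emit the framebuffer descriptor(s). pan_emit_fbd() returns tag bits that
 * are OR'ed into the descriptor's GPU address before it is handed to the
 * fragment job.
 */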
void
GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
                   struct pan_tls_info *tls)
{
   batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
      fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
}

void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, uint64_t shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64-bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
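   /* e.g. 5 32-bit push words pack into DIV_ROUND_UP(5, 2) = 3 FAU entries */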
}
#endif

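/* Build and queue a COMPUTE_JOB for a grid launch. For indirect dispatches
 * the workgroup count is unknown on the CPU, so a placeholder 1x1x1 grid is
 * packed and, where the GPU supports it, an indirect dispatch job is emitted
 * to patch the real counts in from the indirect buffer.
 */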
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

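   /* e.g. an 8x8x1 workgroup block gives a job_task_split of
    * util_logbase2_ceil(9) + util_logbase2_ceil(9) + util_logbase2_ceil(2) =
    * 4 + 4 + 1 = 9.
    */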
   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
static uint64_t
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   uint64_t tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

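   /* The shared tiler heap is described to the job chain by a TILER_HEAP
    * descriptor: it starts out empty (bottom == base) and fills towards
    * top == base + size as the tiler allocates from it.
    */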
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_cast_and_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   uint64_t heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_cast_and_pack(t.cpu, TILER_CONTEXT, tiler) {
      tiler.hierarchy_mask =
         pan_select_tiler_hierarchy_mask(batch->key.width,
                                         batch->key.height,
                                         dev->tiler_features.max_levels);

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         batch->first_provoking_vertex == U_TRISTATE_YES;
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
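/* Fill the DRAW descriptor fields shared by the vertex and fragment sides of
 * a draw: instancing state plus the per-stage UBO, push-constant, texture and
 * sampler tables.
 */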
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

static void
jm_emit_vertex_draw(struct panfrost_batch *batch,
                    struct mali_draw_packed *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

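/* Pack the DRAW section of a tiler job (or, on Valhall, of an IDVS job):
 * face culling and provoking-vertex state, occlusion query setup, and, on
 * Valhall, the fragment shader environment when a fragment shader is bound.
 */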
static void
jm_emit_tiler_draw(struct mali_draw_packed *out, struct panfrost_batch *batch,
                   bool fs_required, enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader. */
         cfg.evaluate_per_sample |=
            (fs->info.fs.sample_shading && rast->multisample);

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage, which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written, so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines, DRAW.flat_shading_vertex must
       * be set to 0, and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, struct mali_primitive_packed *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

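/* Launch transform feedback by re-running the vertex shader as a compute
 * style workload, one invocation per vertex and instance. On pre-Valhall
 * parts the varying pointers are zeroed first, since only the XFB stores
 * matter here.
 */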
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   uint64_t saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

   if ((!idvs && !vertex.cpu) || !tiler.cpu) {
      mesa_loge("jm_launch_draw failed");
      return;
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

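/* Emit a WRITE_VALUE job that stores the GPU system timestamp into the
 * destination resource at the given byte offset, and record the write so the
 * batch tracks the resource as GPU-written.
 */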
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}