/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_blitter.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

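/* Per-batch initialization for the JM (Job Manager) backend: reserve the
 * framebuffer and thread-local storage descriptors that every later job in
 * the batch will point at. */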
void
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(ptr.opaque, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   batch->tls.gpu = ptr.opaque[0];
#endif
#endif
}

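/* Submit a single job chain through the Panfrost DRM submit ioctl. This
 * gathers the BO handles referenced by the batch (batch BOs, pool BOs, the
 * shared tiler heap and sample positions), wires up the in/out sync objects,
 * and optionally decodes or waits on the submission when the corresponding
 * debug flags are set. */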
static int
jm_submit_jc(struct panfrost_batch *batch, mali_ptr first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* If we trace, we always need a syncobj to wait on, so if we weren't
    * given one to use, fall back to the context's syncobj. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. Tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list is coming from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (u64)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX, 0, NULL);

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete if blackhole rendering, that's ok */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * outsync corresponding to the later of the two (since there will be an
 * implicit dep between them) */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do that
    * might result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, 0,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret =
         jm_submit_jc(batch, batch->jm.jobs.frag, PANFROST_JD_REQ_FS, out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

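/* Emit the jobs that read back the existing framebuffer contents ahead of
 * this batch's own draws, so attachments that aren't cleared keep their
 * previous color/ZS values. */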
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   GENX(pan_preload_fb)
   (&dev->blitter, &batch->pool.base, &batch->jm.jobs.vtc_jc, fb,
    batch->tls.gpu, PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
}

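/* Allocate and fill the FRAGMENT job for this batch, pointing it at the
 * framebuffer descriptor reserved in jm_init_batch(), and record its GPU
 * address so jm_submit_batch() can submit it after the vertex/tiler chain. */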
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, transfer.cpu);

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
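/* Fill a Valhall SHADER_ENVIRONMENT from the per-stage state cached on the
 * batch: resource tables, thread storage, the shader program pointer and the
 * FAU (push uniform) buffer. */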
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, mali_ptr shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64-bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

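/* Emit and queue a compute job for a grid launch. For indirect dispatches on
 * architectures without native support, a helper job emitted through
 * pan_indirect_dispatch_emit() resolves the workgroup counts on the GPU and
 * the compute job is made to depend on it. */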
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_INDIRECTS
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_COMPUTE, true, false, indirect_dep, 0, &t,
                  false);
}

#if PAN_ARCH >= 6
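/* Lazily create the Bifrost/Valhall tiler descriptors for a batch: a
 * TILER_HEAP descriptor spanning the device-wide tiler heap BO and a
 * TILER_CONTEXT descriptor referencing it. The result is cached in
 * batch->tiler_ctx.bifrost so every job in the batch shares one tiler
 * context. */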
static mali_ptr
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   if (batch->tiler_ctx.bifrost)
      return batch->tiler_ctx.bifrost;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   mali_ptr heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         pan_tristate_get(batch->first_provoking_vertex);
#endif
   }

   batch->tiler_ctx.bifrost = t.gpu;
   return batch->tiler_ctx.bifrost;
}
#endif

#if PAN_ARCH <= 7
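/* Descriptor pointers shared by the vertex and fragment DRAW sections on
 * Midgard/Bifrost: instancing parameters plus the per-stage UBO, push
 * uniform, texture and sampler tables cached on the batch. */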
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

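/* Pack the DRAW section of a vertex job from the vertex-stage state cached
 * on the batch (RSD, attributes, varyings, thread storage). */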
static void
jm_emit_vertex_draw(struct panfrost_batch *batch, void *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

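/* Fill the COMPUTE_JOB used as the vertex job of a non-IDVS draw. The
 * invocation section is copied from a template shared with the matching
 * tiler job. */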
static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

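/* Pack the DRAW section of a tiler job: culling state, occlusion queries
 * and, depending on the architecture, either the Valhall shader environment
 * and pixel-kill state or the Midgard/Bifrost descriptor pointers. */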
static void
jm_emit_tiler_draw(void *out, struct panfrost_batch *batch, bool fs_required,
                   enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, void *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
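/* Fill a Valhall MALLOC_VERTEX job, which subsumes the separate vertex and
 * tiler jobs of older architectures: primitive state, varying allocation
 * sizes, the tiler context, scissor and the position/varying shader
 * environments all live in one job descriptor. */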
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
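/* Fill a Midgard/Bifrost TILER job for a draw. The invocation section is
 * copied from the same template as the matching vertex job, and on Bifrost
 * the job additionally points at the shared tiler context. */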
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

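/* Launch a transform feedback job for a draw. XFB runs as a compute-style
 * job (a vertex job on Midgard) with one invocation per captured vertex and
 * instance, and with vertex varyings disabled. */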
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   mali_ptr saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc, job_type, true,
                  false, 0, 0, &t, false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex = pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                                    MALI_JOB_TYPE_VERTEX, false, false, 0, 0,
                                    vertex_job, false);

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_TILER, false, false, vertex, 0, tiler_job,
                  false);
}
#endif

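/* Emit and queue the jobs for a single draw: a MALLOC_VERTEX job on Valhall,
 * an INDEXED_VERTEX job for Bifrost IDVS, or a separate vertex + tiler job
 * pair otherwise. */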
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                  MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, 0, &tiler,
                  false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->pool.base, &batch->jm.jobs.vtc_jc,
                     MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, 0, &tiler,
                     false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}