/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/macros.h"
#include "util/u_prim.h"
#include "util/u_vbuf.h"
#include "util/u_helpers.h"
#include "util/u_draw.h"
#include "util/u_memory.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "gallium/auxiliary/util/u_blend.h"

#include "genxml/gen_macros.h"

#include "pan_pool.h"
#include "pan_bo.h"
#include "pan_blend.h"
#include "pan_context.h"
#include "pan_job.h"
#include "pan_shader.h"
#include "pan_texture.h"
#include "pan_util.h"
#include "pan_indirect_draw.h"
#include "pan_indirect_dispatch.h"
#include "pan_blitter.h"

#define PAN_GPU_INDIRECTS (PAN_ARCH == 7)

struct panfrost_rasterizer {
        struct pipe_rasterizer_state base;

#if PAN_ARCH <= 7
        /* Partially packed RSD words */
        struct mali_multisample_misc_packed multisample;
        struct mali_stencil_mask_misc_packed stencil_misc;
#endif
};

struct panfrost_zsa_state {
        struct pipe_depth_stencil_alpha_state base;

        /* Is any depth, stencil, or alpha testing enabled? */
        bool enabled;

        /* Do the depth and stencil tests always pass? This ignores write
         * masks; we are only interested in whether pixels may be killed. */
        bool zs_always_passes;

        /* Are depth or stencil writes possible? */
        bool writes_zs;

#if PAN_ARCH <= 7
        /* Prepacked words from the RSD */
        struct mali_multisample_misc_packed rsd_depth;
        struct mali_stencil_mask_misc_packed rsd_stencil;
        struct mali_stencil_packed stencil_front, stencil_back;
#else
        /* Depth/stencil descriptor template */
        struct mali_depth_stencil_packed desc;
#endif
};

struct panfrost_sampler_state {
        struct pipe_sampler_state base;
        struct mali_sampler_packed hw;
};

/* Misnomer: Sampler view corresponds to textures, not samplers */
struct panfrost_sampler_view {
        struct pipe_sampler_view base;
        struct panfrost_pool_ref state;
        struct mali_texture_packed bifrost_descriptor;
        mali_ptr texture_bo;
        uint64_t modifier;

        /* Pool used to allocate the descriptor. If NULL, defaults to the
         * global descriptor pool. Can be set for short lived descriptors,
         * useful for shader images on Valhall. */
        struct panfrost_pool *pool;
};

struct panfrost_vertex_state {
        unsigned num_elements;
        struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS];

#if PAN_ARCH >= 9
        /* Packed attribute descriptor. All fields are set at CSO create time
         * except for stride, which must be ORed in at draw time */
        struct mali_attribute_packed attributes[PIPE_MAX_ATTRIBS];
#else
        /* buffers corresponds to attribute buffers; element_buffer maps each
         * vertex element to an index in buffers */
        struct pan_vertex_buffer buffers[PIPE_MAX_ATTRIBS];
        unsigned element_buffer[PIPE_MAX_ATTRIBS];
        unsigned nr_bufs;

        unsigned formats[PIPE_MAX_ATTRIBS];
#endif
};

/* Statically assert that PIPE_* enums match the hardware enums.
 * (As long as they match, we don't need to translate them.) */
static_assert((int)PIPE_FUNC_NEVER == MALI_FUNC_NEVER, "must match");
static_assert((int)PIPE_FUNC_LESS == MALI_FUNC_LESS, "must match");
static_assert((int)PIPE_FUNC_EQUAL == MALI_FUNC_EQUAL, "must match");
static_assert((int)PIPE_FUNC_LEQUAL == MALI_FUNC_LEQUAL, "must match");
static_assert((int)PIPE_FUNC_GREATER == MALI_FUNC_GREATER, "must match");
static_assert((int)PIPE_FUNC_NOTEQUAL == MALI_FUNC_NOT_EQUAL, "must match");
static_assert((int)PIPE_FUNC_GEQUAL == MALI_FUNC_GEQUAL, "must match");
static_assert((int)PIPE_FUNC_ALWAYS == MALI_FUNC_ALWAYS, "must match");

static inline enum mali_sample_pattern
panfrost_sample_pattern(unsigned samples)
{
        switch (samples) {
        case 1:  return MALI_SAMPLE_PATTERN_SINGLE_SAMPLED;
        case 4:  return MALI_SAMPLE_PATTERN_ROTATED_4X_GRID;
        case 8:  return MALI_SAMPLE_PATTERN_D3D_8X_GRID;
        case 16: return MALI_SAMPLE_PATTERN_D3D_16X_GRID;
        default: unreachable("Unsupported sample count");
        }
}

static unsigned
translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest)
{
        /* CLAMP is only supported on Midgard, where it is broken for nearest
         * filtering. Use CLAMP_TO_EDGE in that case. */

        switch (w) {
        case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
        case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
        case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
        case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;

#if PAN_ARCH <= 5
        case PIPE_TEX_WRAP_CLAMP:
                return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE :
                                       MALI_WRAP_MODE_CLAMP;
        case PIPE_TEX_WRAP_MIRROR_CLAMP:
                return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE :
                                       MALI_WRAP_MODE_MIRRORED_CLAMP;
#endif

        default: unreachable("Invalid wrap");
        }
}

/* The hardware compares in the wrong order, so we have to flip before
 * encoding. Yes, really. */
static enum mali_func
panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
{
        return !cso->compare_mode ? MALI_FUNC_NEVER :
                panfrost_flip_compare_func((enum mali_func) cso->compare_func);
}
static enum mali_mipmap_mode
pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
{
        switch (f) {
        case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
        case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
#if PAN_ARCH >= 6
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
#else
        case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NEAREST;
#endif
        default: unreachable("Invalid");
        }
}

static void *
panfrost_create_sampler_state(
        struct pipe_context *pctx,
        const struct pipe_sampler_state *cso)
{
        struct panfrost_sampler_state *so = CALLOC_STRUCT(panfrost_sampler_state);
        so->base = *cso;

        bool using_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;

        pan_pack(&so->hw, SAMPLER, cfg) {
                cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
                cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;

                cfg.normalized_coordinates = cso->normalized_coords;
                cfg.lod_bias = FIXED_16(cso->lod_bias, true);
                cfg.minimum_lod = FIXED_16(cso->min_lod, false);
                cfg.maximum_lod = FIXED_16(cso->max_lod, false);

                cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s, using_nearest);
                cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t, using_nearest);
                cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r, using_nearest);

                cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
                cfg.compare_function = panfrost_sampler_compare_func(cso);
                cfg.seamless_cube_map = cso->seamless_cube_map;

                cfg.border_color_r = cso->border_color.ui[0];
                cfg.border_color_g = cso->border_color.ui[1];
                cfg.border_color_b = cso->border_color.ui[2];
                cfg.border_color_a = cso->border_color.ui[3];

#if PAN_ARCH >= 6
                if (cso->max_anisotropy > 1) {
                        cfg.maximum_anisotropy = cso->max_anisotropy;
                        cfg.lod_algorithm = MALI_LOD_ALGORITHM_ANISOTROPIC;
                }
#else
                /* Emulate disabled mipmapping by clamping the LOD as tight as
                 * possible (from 0 to epsilon = 1/256) */
                if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
                        cfg.maximum_lod = cfg.minimum_lod + 1;
#endif
        }

        return so;
}

static bool
panfrost_fs_required(
        struct panfrost_shader_state *fs,
        struct panfrost_blend_state *blend,
        struct pipe_framebuffer_state *state,
        const struct panfrost_zsa_state *zsa)
{
        /* If we generally have side effects. This includes use of discard,
         * which can affect the results of an occlusion query. */
        if (fs->info.fs.sidefx)
                return true;

        /* Using an empty FS requires early-z to be enabled, but alpha test
         * needs it disabled. Alpha test is only native on Midgard, so only
         * check there. */
        if (PAN_ARCH <= 5 && zsa->base.alpha_func != PIPE_FUNC_ALWAYS)
                return true;

        /* If colour is written we need to execute */
        for (unsigned i = 0; i < state->nr_cbufs; ++i) {
                if (state->cbufs[i] && !blend->info[i].no_colour)
                        return true;
        }

        /* If depth is written and not implied we need to execute.
         * TODO: Predicate on Z/S writes being enabled */
        return (fs->info.fs.writes_depth || fs->info.fs.writes_stencil);
}

/* Get pointers to the blend shaders bound to each active render target. Used
 * to emit the blend descriptors, as well as the fragment renderer state
 * descriptor. */
static void
panfrost_get_blend_shaders(struct panfrost_batch *batch,
                           mali_ptr *blend_shaders)
{
        unsigned shader_offset = 0;
        struct panfrost_bo *shader_bo = NULL;

        for (unsigned c = 0; c < batch->key.nr_cbufs; ++c) {
                if (batch->key.cbufs[c]) {
                        blend_shaders[c] = panfrost_get_blend(batch,
                                        c, &shader_bo, &shader_offset);
                }
        }
}

#if PAN_ARCH >= 5
UNUSED static uint16_t
pack_blend_constant(enum pipe_format format, float cons)
{
        const struct util_format_description *format_desc =
                util_format_description(format);

        unsigned chan_size = 0;

        for (unsigned i = 0; i < format_desc->nr_channels; i++)
                chan_size = MAX2(format_desc->channel[i].size, chan_size);

        uint16_t unorm = (cons * ((1 << chan_size) - 1));
        return unorm << (16 - chan_size);
}

/*
 * Determine whether to set the respective overdraw alpha flag.
 *
 * The overdraw alpha=1 flag should be set when alpha=1 implies full overdraw,
 * equivalently, all enabled render targets have alpha_one_store set. Likewise,
 * overdraw alpha=0 should be set when alpha=0 implies no overdraw,
 * equivalently, all enabled render targets have alpha_zero_nop set.
 */
static bool
panfrost_overdraw_alpha(const struct panfrost_context *ctx, bool zero)
{
        const struct panfrost_blend_state *so = ctx->blend;

        for (unsigned i = 0; i < ctx->pipe_framebuffer.nr_cbufs; ++i) {
                const struct pan_blend_info info = so->info[i];

                bool enabled = ctx->pipe_framebuffer.cbufs[i] && !info.no_colour;
                bool flag = zero ? info.alpha_zero_nop : info.alpha_one_store;

                if (enabled && !flag)
                        return false;
        }

        return true;
}

static void
panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_shaders)
{
        unsigned rt_count = batch->key.nr_cbufs;
        struct panfrost_context *ctx = batch->ctx;
        const struct panfrost_blend_state *so = ctx->blend;
        bool dithered = so->base.dither;

        /* Always have at least one render target for depth-only passes */
        for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) {
                struct mali_blend_packed *packed = rts + (i * pan_size(BLEND));

                /* Disable blending for unbacked render targets */
                if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) {
                        pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) {
                                cfg.enable = false;
#if PAN_ARCH >= 6
                                cfg.internal.mode = MALI_BLEND_MODE_OFF;
#endif
                        }

                        continue;
                }

                struct pan_blend_info info = so->info[i];
                enum pipe_format format = batch->key.cbufs[i]->format;
                float cons = pan_blend_get_constant(info.constant_mask,
                                ctx->blend_color.color);

                /* Word 0: Flags and constant */
                pan_pack(packed, BLEND, cfg) {
                        cfg.srgb = util_format_is_srgb(format);
                        cfg.load_destination = info.load_dest;
                        cfg.round_to_fb_precision = !dithered;
                        cfg.alpha_to_one = ctx->blend->base.alpha_to_one;
#if PAN_ARCH >= 6
                        if (!blend_shaders[i])
                                cfg.constant = pack_blend_constant(format, cons);
#else
                        cfg.blend_shader = (blend_shaders[i] != 0);

                        if (blend_shaders[i])
                                cfg.shader_pc = blend_shaders[i];
                        else
                                cfg.constant = cons;
#endif
                }

                if (!blend_shaders[i]) {
                        /* Word 1: Blend Equation */
                        STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4);
                        packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i];
                }

#if PAN_ARCH >= 6
                const struct panfrost_device *dev = pan_device(ctx->base.screen);
                struct panfrost_shader_state *fs =
                        panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

                /* Words 2 and 3: Internal blend */
                if (blend_shaders[i]) {
                        /* The blend shader's address needs to have the same
                         * top 32 bits as the fragment shader's.
                         * TODO: Ensure that's always the case.
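                         *
                         * For example (illustrative addresses only): a
                         * fragment shader at 0x0000004100001000 and a blend
                         * shader at 0x0000004100008000 share the top word
                         * 0x00000041, so truncating the blend shader pointer
                         * to 32 bits below loses nothing.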
*/ assert(!fs->bin.bo || (blend_shaders[i] & (0xffffffffull << 32)) == (fs->bin.gpu & (0xffffffffull << 32))); pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { cfg.mode = MALI_BLEND_MODE_SHADER; cfg.shader.pc = (u32) blend_shaders[i]; #if PAN_ARCH <= 7 unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; assert(!(ret_offset & 0x7)); cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0; #endif } } else { pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { cfg.mode = info.opaque ? MALI_BLEND_MODE_OPAQUE : MALI_BLEND_MODE_FIXED_FUNCTION; /* If we want the conversion to work properly, * num_comps must be set to 4 */ cfg.fixed_function.num_comps = 4; cfg.fixed_function.conversion.memory_format = panfrost_format_to_bifrost_blend(dev, format, dithered); cfg.fixed_function.conversion.register_format = fs->info.bifrost.blend[i].format; cfg.fixed_function.rt = i; #if PAN_ARCH <= 7 if (!info.opaque) { cfg.fixed_function.alpha_zero_nop = info.alpha_zero_nop; cfg.fixed_function.alpha_one_store = info.alpha_one_store; } #endif } } #endif } } #endif static inline bool pan_allow_forward_pixel_to_kill(struct panfrost_context *ctx, struct panfrost_shader_state *fs) { /* Track if any colour buffer is reused across draws, either * from reading it directly, or from failing to write it */ unsigned rt_mask = ctx->fb_rt_mask; uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0); bool blend_reads_dest = (ctx->blend->load_dest_mask & rt_mask); bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; return fs->info.fs.can_fpk && !(rt_mask & ~rt_written) && !alpha_to_coverage && !blend_reads_dest; } static mali_ptr panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage) { struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage); panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_VERTEX); panfrost_batch_add_bo(batch, ss->state.bo, PIPE_SHADER_VERTEX); return ss->state.gpu; } #if PAN_ARCH <= 7 /* Construct a partial RSD corresponding to no executed fragment shader, and * merge with the existing partial RSD. 
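 *
 * (Sketch of the mechanism: pan_merge below ORs the two packed descriptors
 * word by word, so the empty-FS RSD only sets the fields an absent fragment
 * shader must override; everything else comes through from the other half.)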
*/ static void pan_merge_empty_fs(struct mali_renderer_state_packed *rsd) { struct mali_renderer_state_packed empty_rsd; pan_pack(&empty_rsd, RENDERER_STATE, cfg) { #if PAN_ARCH >= 6 cfg.properties.shader_modifies_coverage = true; cfg.properties.allow_forward_pixel_to_kill = true; cfg.properties.allow_forward_pixel_to_be_killed = true; cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; /* Alpha isn't written so these are vacuous */ cfg.multisample_misc.overdraw_alpha0 = true; cfg.multisample_misc.overdraw_alpha1 = true; #else cfg.shader.shader = 0x1; cfg.properties.work_register_count = 1; cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; cfg.properties.force_early_z = true; #endif } pan_merge((*rsd), empty_rsd, RENDERER_STATE); } static void panfrost_prepare_fs_state(struct panfrost_context *ctx, mali_ptr *blend_shaders, struct mali_renderer_state_packed *rsd) { struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; const struct panfrost_zsa_state *zsa = ctx->depth_stencil; struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); struct panfrost_blend_state *so = ctx->blend; bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; bool msaa = rast->multisample; unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; bool has_blend_shader = false; for (unsigned c = 0; c < rt_count; ++c) has_blend_shader |= (blend_shaders[c] != 0); bool has_oq = ctx->occlusion_query && ctx->active_queries; pan_pack(rsd, RENDERER_STATE, cfg) { if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) { #if PAN_ARCH >= 6 struct pan_earlyzs_state earlyzs = pan_earlyzs_get(fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, ctx->blend->base.alpha_to_coverage, ctx->depth_stencil->zs_always_passes); cfg.properties.pixel_kill_operation = earlyzs.kill; cfg.properties.zs_update_operation = earlyzs.update; cfg.properties.allow_forward_pixel_to_kill = pan_allow_forward_pixel_to_kill(ctx, fs); #else cfg.properties.force_early_z = fs->info.fs.can_early_z && !alpha_to_coverage && ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS); /* TODO: Reduce this limit? */ if (has_blend_shader) cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8); else cfg.properties.work_register_count = fs->info.work_reg_count; /* Hardware quirks around early-zs forcing without a * depth buffer. Note this breaks occlusion queries. 
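                         *
                         * Sketch of the intended effect: when the quirk
                         * applies and the shader can discard, we report
                         * "reads tilebuffer" below instead of "contains
                         * discard", which keeps early-zs forced on despite
                         * the discard.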
                         */
                        bool force_ez_with_discard = !zsa->enabled && !has_oq;

                        cfg.properties.shader_reads_tilebuffer =
                                force_ez_with_discard && fs->info.fs.can_discard;
                        cfg.properties.shader_contains_discard =
                                !force_ez_with_discard && fs->info.fs.can_discard;
#endif
                }

#if PAN_ARCH == 4
                if (rt_count > 0) {
                        cfg.multisample_misc.load_destination = so->info[0].load_dest;
                        cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0);
                        cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour;
                        cfg.stencil_mask_misc.srgb =
                                util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
                        cfg.stencil_mask_misc.dither_disable = !so->base.dither;
                        cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one;

                        if (blend_shaders[0]) {
                                cfg.blend_shader = blend_shaders[0];
                        } else {
                                cfg.blend_constant = pan_blend_get_constant(
                                                so->info[0].constant_mask,
                                                ctx->blend_color.color);
                        }
                } else {
                        /* If there is no colour buffer, leaving fields default
                         * is fine, except for blending which is nonnullable */
                        cfg.blend_equation.color_mask = 0xf;
                        cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
                        cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
                        cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
                        cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
                }
#elif PAN_ARCH == 5
                /* Workaround */
                cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count);
#endif

                cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;

                cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1);

#if PAN_ARCH >= 6
                /* MSAA blend shaders need to pass their sample ID to
                 * LD_TILE/ST_TILE, so we must preload it. Additionally, we
                 * need per-sample shading for the blend shader, accomplished
                 * by forcing per-sample shading for the whole program. */
                if (msaa && has_blend_shader) {
                        cfg.multisample_misc.evaluate_per_sample = true;
                        cfg.preload.fragment.sample_mask_id = true;
                }

                /* Flip gl_PointCoord (and point sprites) depending on API
                 * setting on framebuffer orientation. We do not use
                 * lower_wpos_pntc on Bifrost. */
                cfg.properties.point_sprite_coord_origin_max_y =
                        (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);

                cfg.multisample_misc.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
                cfg.multisample_misc.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
#endif

                cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage;
                cfg.depth_units = rast->offset_units * 2.0f;
                cfg.depth_factor = rast->offset_scale;

                bool back_enab = zsa->base.stencil[1].enabled;
                cfg.stencil_front.reference_value = ctx->stencil_ref.ref_value[0];
                cfg.stencil_back.reference_value =
                        ctx->stencil_ref.ref_value[back_enab ? 1 : 0];

#if PAN_ARCH <= 5
                /* v6+ fits register preload here, no alpha testing */
                cfg.alpha_reference = zsa->base.alpha_ref_value;
#endif
        }
}

static void
panfrost_emit_frag_shader(struct panfrost_context *ctx,
                          struct mali_renderer_state_packed *fragmeta,
                          mali_ptr *blend_shaders)
{
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        const struct panfrost_rasterizer *rast = ctx->rasterizer;
        struct panfrost_shader_state *fs =
                panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);

        /* We need to merge several partial renderer state descriptors, so
         * stage to temporary storage rather than reading back write-combine
         * memory, which will trash performance.
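         *
         * (Staging in a stack-local RSD keeps the read-modify-write merges
         * below in cached memory; only the final memcpy touches the
         * write-combine mapping.)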
*/ struct mali_renderer_state_packed rsd; panfrost_prepare_fs_state(ctx, blend_shaders, &rsd); #if PAN_ARCH == 4 if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) { /* Word 14: SFBD Blend Equation */ STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); rsd.opaque[14] = ctx->blend->equation[0]; } #endif /* Merge with CSO state and upload */ if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) { struct mali_renderer_state_packed *partial_rsd = (struct mali_renderer_state_packed *)&fs->partial_rsd; STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd)); pan_merge(rsd, *partial_rsd, RENDERER_STATE); } else { pan_merge_empty_fs(&rsd); } /* Word 8, 9 Misc state */ rsd.opaque[8] |= zsa->rsd_depth.opaque[0] | rast->multisample.opaque[0]; rsd.opaque[9] |= zsa->rsd_stencil.opaque[0] | rast->stencil_misc.opaque[0]; /* Word 10, 11 Stencil Front and Back */ rsd.opaque[10] |= zsa->stencil_front.opaque[0]; rsd.opaque[11] |= zsa->stencil_back.opaque[0]; memcpy(fragmeta, &rsd, sizeof(rsd)); } static mali_ptr panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) { struct panfrost_context *ctx = batch->ctx; struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); panfrost_batch_add_bo(batch, ss->bin.bo, PIPE_SHADER_FRAGMENT); struct panfrost_ptr xfer; #if PAN_ARCH == 4 xfer = pan_pool_alloc_desc(&batch->pool.base, RENDERER_STATE); #else unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1); xfer = pan_pool_alloc_desc_aggregate(&batch->pool.base, PAN_DESC(RENDERER_STATE), PAN_DESC_ARRAY(rt_count, BLEND)); #endif mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 }; panfrost_get_blend_shaders(batch, blend_shaders); panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders); #if PAN_ARCH >= 5 panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders); #endif return xfer.gpu; } #endif static mali_ptr panfrost_emit_viewport(struct panfrost_batch *batch) { struct panfrost_context *ctx = batch->ctx; const struct pipe_viewport_state *vp = &ctx->pipe_viewport; const struct pipe_scissor_state *ss = &ctx->scissor; const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; /* Derive min/max from translate/scale. Note since |x| >= 0 by * definition, we have that -|x| <= |x| hence translate - |scale| <= * translate + |scale|, so the ordering is correct here. 
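         *
         * Worked example (illustrative numbers): an 800x600 viewport with
         * scale.xy = (400, 300) and translate.xy = (400, 300) yields
         * vp_minx = 400 - 400 = 0, vp_maxx = 400 + 400 = 800,
         * vp_miny = 300 - 300 = 0, vp_maxy = 300 + 300 = 600.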
         */
        float vp_minx = vp->translate[0] - fabsf(vp->scale[0]);
        float vp_maxx = vp->translate[0] + fabsf(vp->scale[0]);
        float vp_miny = vp->translate[1] - fabsf(vp->scale[1]);
        float vp_maxy = vp->translate[1] + fabsf(vp->scale[1]);
        float minz = (vp->translate[2] - fabsf(vp->scale[2]));
        float maxz = (vp->translate[2] + fabsf(vp->scale[2]));

        /* Scissor to the intersection of the viewport and the scissor,
         * clamped to the framebuffer */

        unsigned minx = MIN2(batch->key.width, MAX2((int) vp_minx, 0));
        unsigned maxx = MIN2(batch->key.width, MAX2((int) vp_maxx, 0));
        unsigned miny = MIN2(batch->key.height, MAX2((int) vp_miny, 0));
        unsigned maxy = MIN2(batch->key.height, MAX2((int) vp_maxy, 0));

        if (ss && rast->scissor) {
                minx = MAX2(ss->minx, minx);
                miny = MAX2(ss->miny, miny);
                maxx = MIN2(ss->maxx, maxx);
                maxy = MIN2(ss->maxy, maxy);
        }

        /* Set the range to [1, 1) so max values don't wrap round */
        if (maxx == 0 || maxy == 0)
                maxx = maxy = minx = miny = 1;

        panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
        batch->scissor_culls_everything = (minx >= maxx || miny >= maxy);

        /* [minx, maxx) and [miny, maxy) are exclusive ranges in the hardware */
        maxx--;
        maxy--;

        batch->minimum_z = rast->depth_clip_near ? minz : -INFINITY;
        batch->maximum_z = rast->depth_clip_far ? maxz : +INFINITY;

#if PAN_ARCH <= 7
        struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, VIEWPORT);

        pan_pack(T.cpu, VIEWPORT, cfg) {
                cfg.scissor_minimum_x = minx;
                cfg.scissor_minimum_y = miny;
                cfg.scissor_maximum_x = maxx;
                cfg.scissor_maximum_y = maxy;

                cfg.minimum_z = batch->minimum_z;
                cfg.maximum_z = batch->maximum_z;
        }

        return T.gpu;
#else
        pan_pack(&batch->scissor, SCISSOR, cfg) {
                cfg.scissor_minimum_x = minx;
                cfg.scissor_minimum_y = miny;
                cfg.scissor_maximum_x = maxx;
                cfg.scissor_maximum_y = maxy;
        }

        return 0;
#endif
}

#if PAN_ARCH >= 9
/**
 * Emit a Valhall depth/stencil descriptor at draw-time. The bulk of the
 * descriptor corresponds to a pipe_depth_stencil_alpha CSO and is packed at
 * CSO create time. However, the stencil reference values and shader
 * interactions are dynamic state. Pack only the dynamic state here and OR
 * together.
 */
static mali_ptr
panfrost_emit_depth_stencil(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
        struct panfrost_rasterizer *rast = ctx->rasterizer;
        struct panfrost_shader_state *fs =
                panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
        bool back_enab = zsa->base.stencil[1].enabled;

        struct panfrost_ptr T = pan_pool_alloc_desc(&batch->pool.base, DEPTH_STENCIL);
        struct mali_depth_stencil_packed dynamic;

        pan_pack(&dynamic, DEPTH_STENCIL, cfg) {
                cfg.front_reference_value = ctx->stencil_ref.ref_value[0];
                cfg.back_reference_value = ctx->stencil_ref.ref_value[back_enab ? 1 : 0];

                cfg.stencil_from_shader = fs->info.fs.writes_stencil;
                cfg.depth_source = pan_depth_source(&fs->info);

                cfg.depth_bias_enable = rast->base.offset_tri;
                cfg.depth_units = rast->base.offset_units * 2.0f;
                cfg.depth_factor = rast->base.offset_scale;
                cfg.depth_bias_clamp = rast->base.offset_clamp;
        }

        pan_merge(dynamic, zsa->desc, DEPTH_STENCIL);
        memcpy(T.cpu, &dynamic, pan_size(DEPTH_STENCIL));

        return T.gpu;
}

/**
 * Emit Valhall blend descriptor at draw-time. The descriptor itself is shared
 * with Bifrost, but the container data structure is simplified.
 */
static mali_ptr
panfrost_emit_blend_valhall(struct panfrost_batch *batch)
{
        unsigned rt_count = MAX2(batch->key.nr_cbufs, 1);

        struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, rt_count, BLEND);

        mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 };
        panfrost_get_blend_shaders(batch, blend_shaders);

        panfrost_emit_blend(batch, T.cpu, blend_shaders);

        /* Precalculate for the per-draw path */
        bool has_blend_shader = false;

        for (unsigned i = 0; i < rt_count; ++i)
                has_blend_shader |= !!blend_shaders[i];

        batch->ctx->valhall_has_blend_shader = has_blend_shader;

        return T.gpu;
}

/**
 * Emit Valhall buffer descriptors for bound vertex buffers at draw-time.
 */
static mali_ptr
panfrost_emit_vertex_buffers(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned buffer_count = util_last_bit(ctx->vb_mask);
        struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
                                                          buffer_count, BUFFER);
        struct mali_buffer_packed *buffers = T.cpu;

        u_foreach_bit(i, ctx->vb_mask) {
                struct pipe_vertex_buffer vb = ctx->vertex_buffers[i];
                struct pipe_resource *prsrc = vb.buffer.resource;
                struct panfrost_resource *rsrc = pan_resource(prsrc);
                assert(!vb.is_user_buffer);

                panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX);

                pan_pack(buffers + i, BUFFER, cfg) {
                        cfg.address = rsrc->image.data.bo->ptr.gpu +
                                      vb.buffer_offset;

                        cfg.size = prsrc->width0 - vb.buffer_offset;
                }
        }

        return T.gpu;
}

/**
 * Emit Valhall attribute descriptors and associated (vertex) buffer
 * descriptors at draw-time. The attribute descriptors are packed at CSO
 * create time except for the stride field, which must be ORed in at draw
 * time. The buffer descriptors are packed here, though that could be moved
 * into panfrost_set_vertex_buffers if needed.
 */
static mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch *batch)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *vtx = ctx->vertex;

        struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base,
                                                          vtx->num_elements,
                                                          ATTRIBUTE);
        struct mali_attribute_packed *attributes = T.cpu;

        for (unsigned i = 0; i < vtx->num_elements; ++i) {
                struct mali_attribute_packed packed;
                unsigned vbi = vtx->pipe[i].vertex_buffer_index;

                pan_pack(&packed, ATTRIBUTE, cfg) {
                        cfg.stride = ctx->vertex_buffers[vbi].stride;
                }

                pan_merge(packed, vtx->attributes[i], ATTRIBUTE);
                attributes[i] = packed;
        }

        return T.gpu;
}

/*
 * Emit Valhall descriptors for shader images. Unlike previous generations,
 * Valhall does not have a special descriptor for images. Standard texture
 * descriptors are used. The binding is different in Gallium, however, so we
 * translate.
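 *
 * For example (sketch): a pipe_image_view of a 2D texture bound at level 2
 * becomes a sampler view with first_level = last_level = 2 and an identity
 * swizzle, as constructed below.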
 */
static struct pipe_sampler_view
panfrost_pipe_image_to_sampler_view(struct pipe_image_view *v)
{
        struct pipe_sampler_view out = {
                .format = v->format,
                .texture = v->resource,
                .target = v->resource->target,
                .swizzle_r = PIPE_SWIZZLE_X,
                .swizzle_g = PIPE_SWIZZLE_Y,
                .swizzle_b = PIPE_SWIZZLE_Z,
                .swizzle_a = PIPE_SWIZZLE_W
        };

        if (out.target == PIPE_BUFFER) {
                out.u.buf.offset = v->u.buf.offset;
                out.u.buf.size = v->u.buf.size;
        } else {
                out.u.tex.first_layer = v->u.tex.first_layer;
                out.u.tex.last_layer = v->u.tex.last_layer;

                /* Single level only */
                out.u.tex.first_level = v->u.tex.level;
                out.u.tex.last_level = v->u.tex.level;
        }

        return out;
}

static void panfrost_update_sampler_view(struct panfrost_sampler_view *view,
                                         struct pipe_context *pctx);

static mali_ptr
panfrost_emit_images(struct panfrost_batch *batch, enum pipe_shader_type stage)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned last_bit = util_last_bit(ctx->image_mask[stage]);

        struct panfrost_ptr T =
                pan_pool_alloc_desc_array(&batch->pool.base, last_bit, TEXTURE);

        struct mali_texture_packed *out = (struct mali_texture_packed *) T.cpu;

        for (int i = 0; i < last_bit; ++i) {
                struct pipe_image_view *image = &ctx->images[stage][i];

                if (!(ctx->image_mask[stage] & BITFIELD_BIT(i))) {
                        memset(&out[i], 0, sizeof(out[i]));
                        continue;
                }

                /* Construct a synthetic sampler view so we can use our usual
                 * sampler view code for the actual descriptor packing.
                 *
                 * Use the batch pool for a transient allocation, rather than
                 * allocating a long-lived descriptor. */
                struct panfrost_sampler_view view = {
                        .base = panfrost_pipe_image_to_sampler_view(image),
                        .pool = &batch->pool
                };

                /* If we specify a cube map, the hardware internally treats it
                 * as a 2D array. Since cube maps as images can confuse our
                 * common texturing code, explicitly use a 2D array.
                 *
                 * Similar concerns apply to 3D textures.
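                 *
                 * (e.g. a cube map image is simply retargeted to
                 * PIPE_TEXTURE_2D_ARRAY, its faces exposed as plain layers;
                 * illustrative of the retargeting done just below.)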
                 */
                if (view.base.target != PIPE_BUFFER)
                        view.base.target = PIPE_TEXTURE_2D_ARRAY;

                panfrost_update_sampler_view(&view, &ctx->base);
                out[i] = view.bifrost_descriptor;

                panfrost_track_image_access(batch, stage, image);
        }

        return T.gpu;
}
#endif

static mali_ptr
panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
                                 enum pipe_shader_type st,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_batch_read_rsrc(batch, rsrc, st);

                /* Alignment guaranteed by
                 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
                return rsrc->image.data.bo->ptr.gpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return pan_pool_upload_aligned(&batch->pool.base,
                                               cb->user_buffer +
                                               cb->buffer_offset,
                                               cb->buffer_size, 16);
        } else {
                unreachable("No constant buffer");
        }
}

struct sysval_uniform {
        union {
                float f[4];
                int32_t i[4];
                uint32_t u[4];
                uint64_t du[2];
        };
};

static void
panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
                                      struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->scale[0];
        uniform->f[1] = vp->scale[1];
        uniform->f[2] = vp->scale[2];
}

static void
panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
                                       struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        const struct pipe_viewport_state *vp = &ctx->pipe_viewport;

        uniform->f[0] = vp->translate[0];
        uniform->f[1] = vp->translate[1];
        uniform->f[2] = vp->translate[2];
}

static void
panfrost_upload_txs_sysval(struct panfrost_batch *batch,
                           enum pipe_shader_type st,
                           unsigned int sysvalid,
                           struct sysval_uniform *uniform)
{
        struct panfrost_context *ctx = batch->ctx;
        unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
        unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
        bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
        struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;

        assert(dim);

        if (tex->target == PIPE_BUFFER) {
                assert(dim == 1);
                uniform->i[0] = tex->u.buf.size /
                                util_format_get_blocksize(tex->format);
                return;
        }

        uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);

        if (dim > 1)
                uniform->i[1] = u_minify(tex->texture->height0,
                                         tex->u.tex.first_level);

        if (dim > 2)
                uniform->i[2] = u_minify(tex->texture->depth0,
                                         tex->u.tex.first_level);

        if (is_array) {
                unsigned size = tex->texture->array_size;

                /* Internally, we store the number of 2D images (faces * array
                 * size). Externally, we report the array size in terms of
                 * complete cubes. So divide by the # of faces per cube.
*/ if (tex->target == PIPE_TEXTURE_CUBE_ARRAY) size /= 6; uniform->i[dim] = size; } } static void panfrost_upload_image_size_sysval(struct panfrost_batch *batch, enum pipe_shader_type st, unsigned int sysvalid, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; unsigned idx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid); unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid); unsigned is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid); assert(dim && dim < 4); struct pipe_image_view *image = &ctx->images[st][idx]; if (image->resource->target == PIPE_BUFFER) { unsigned blocksize = util_format_get_blocksize(image->format); uniform->i[0] = image->resource->width0 / blocksize; return; } uniform->i[0] = u_minify(image->resource->width0, image->u.tex.level); if (dim > 1) uniform->i[1] = u_minify(image->resource->height0, image->u.tex.level); if (dim > 2) uniform->i[2] = u_minify(image->resource->depth0, image->u.tex.level); if (is_array) uniform->i[dim] = image->resource->array_size; } static void panfrost_upload_ssbo_sysval(struct panfrost_batch *batch, enum pipe_shader_type st, unsigned ssbo_id, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; assert(ctx->ssbo_mask[st] & (1 << ssbo_id)); struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id]; /* Compute address */ struct panfrost_resource *rsrc = pan_resource(sb.buffer); struct panfrost_bo *bo = rsrc->image.data.bo; panfrost_batch_write_rsrc(batch, rsrc, st); util_range_add(&rsrc->base, &rsrc->valid_buffer_range, sb.buffer_offset, sb.buffer_size); /* Upload address and size as sysval */ uniform->du[0] = bo->ptr.gpu + sb.buffer_offset; uniform->u[2] = sb.buffer_size; } static void panfrost_upload_sampler_sysval(struct panfrost_batch *batch, enum pipe_shader_type st, unsigned samp_idx, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base; uniform->f[0] = sampl->min_lod; uniform->f[1] = sampl->max_lod; uniform->f[2] = sampl->lod_bias; /* Even without any errata, Midgard represents "no mipmapping" as * fixing the LOD with the clamps; keep behaviour consistent. c.f. * panfrost_create_sampler_state which also explains our choice of * epsilon value (again to keep behaviour consistent) */ if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) uniform->f[1] = uniform->f[0] + (1.0/256.0); } static void panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; uniform->u[0] = ctx->compute_grid->grid[0]; uniform->u[1] = ctx->compute_grid->grid[1]; uniform->u[2] = ctx->compute_grid->grid[2]; } static void panfrost_upload_local_group_size_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; uniform->u[0] = ctx->compute_grid->block[0]; uniform->u[1] = ctx->compute_grid->block[1]; uniform->u[2] = ctx->compute_grid->block[2]; } static void panfrost_upload_work_dim_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; uniform->u[0] = ctx->compute_grid->work_dim; } /* Sample positions are pushed in a Bifrost specific format on Bifrost. On * Midgard, we emulate the Bifrost path with some extra arithmetic in the * shader, to keep the code as unified as possible. 
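 *
 * (The sysval itself is simply the GPU address of the device's
 * sample-position table for the current pattern, as uploaded below.)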
*/ static void panfrost_upload_sample_positions_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); unsigned samples = util_framebuffer_get_num_samples(&batch->key); uniform->du[0] = panfrost_sample_positions(dev, panfrost_sample_pattern(samples)); } static void panfrost_upload_multisampled_sysval(struct panfrost_batch *batch, struct sysval_uniform *uniform) { unsigned samples = util_framebuffer_get_num_samples(&batch->key); uniform->u[0] = samples > 1; } #if PAN_ARCH >= 6 static void panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch, unsigned size_and_rt, struct sysval_uniform *uniform) { struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); unsigned rt = size_and_rt & 0xF; unsigned size = size_and_rt >> 4; if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) { enum pipe_format format = batch->key.cbufs[rt]->format; uniform->u[0] = GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32; } else { pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg) cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw; } } #endif static unsigned panfrost_xfb_offset(unsigned stride, struct pipe_stream_output_target *target) { return target->buffer_offset + (pan_so_target(target)->offset * stride); } static void panfrost_upload_sysvals(struct panfrost_batch *batch, const struct panfrost_ptr *ptr, struct panfrost_shader_state *ss, enum pipe_shader_type st) { struct sysval_uniform *uniforms = ptr->cpu; for (unsigned i = 0; i < ss->info.sysvals.sysval_count; ++i) { int sysval = ss->info.sysvals.sysvals[i]; switch (PAN_SYSVAL_TYPE(sysval)) { case PAN_SYSVAL_VIEWPORT_SCALE: panfrost_upload_viewport_scale_sysval(batch, &uniforms[i]); break; case PAN_SYSVAL_VIEWPORT_OFFSET: panfrost_upload_viewport_offset_sysval(batch, &uniforms[i]); break; case PAN_SYSVAL_TEXTURE_SIZE: panfrost_upload_txs_sysval(batch, st, PAN_SYSVAL_ID(sysval), &uniforms[i]); break; case PAN_SYSVAL_SSBO: panfrost_upload_ssbo_sysval(batch, st, PAN_SYSVAL_ID(sysval), &uniforms[i]); break; case PAN_SYSVAL_XFB: { unsigned buf = PAN_SYSVAL_ID(sysval); struct panfrost_shader_state *vs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_VERTEX); struct pipe_stream_output_info *so = &vs->stream_output; unsigned stride = so->stride[buf] * 4; struct pipe_stream_output_target *target = NULL; if (buf < batch->ctx->streamout.num_targets) target = batch->ctx->streamout.targets[buf]; if (!target) { /* Memory sink */ uniforms[i].du[0] = 0x8ull << 60; break; } struct panfrost_resource *rsrc = pan_resource(target->buffer); unsigned offset = panfrost_xfb_offset(stride, target); util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset, target->buffer_size - offset); panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); uniforms[i].du[0] = rsrc->image.data.bo->ptr.gpu + offset; break; } case PAN_SYSVAL_NUM_VERTICES: uniforms[i].u[0] = batch->ctx->vertex_count; break; case PAN_SYSVAL_NUM_WORK_GROUPS: for (unsigned j = 0; j < 3; j++) { batch->num_wg_sysval[j] = ptr->gpu + (i * sizeof(*uniforms)) + (j * 4); } panfrost_upload_num_work_groups_sysval(batch, &uniforms[i]); break; case PAN_SYSVAL_LOCAL_GROUP_SIZE: panfrost_upload_local_group_size_sysval(batch, &uniforms[i]); break; case PAN_SYSVAL_WORK_DIM: panfrost_upload_work_dim_sysval(batch, &uniforms[i]); break; case PAN_SYSVAL_SAMPLER: panfrost_upload_sampler_sysval(batch, st, PAN_SYSVAL_ID(sysval), 
                                                      &uniforms[i]);
                        break;
                case PAN_SYSVAL_IMAGE_SIZE:
                        panfrost_upload_image_size_sysval(batch, st,
                                                          PAN_SYSVAL_ID(sysval),
                                                          &uniforms[i]);
                        break;
                case PAN_SYSVAL_SAMPLE_POSITIONS:
                        panfrost_upload_sample_positions_sysval(batch,
                                                        &uniforms[i]);
                        break;
                case PAN_SYSVAL_MULTISAMPLED:
                        panfrost_upload_multisampled_sysval(batch,
                                                        &uniforms[i]);
                        break;
#if PAN_ARCH >= 6
                case PAN_SYSVAL_RT_CONVERSION:
                        panfrost_upload_rt_conversion_sysval(batch,
                                        PAN_SYSVAL_ID(sysval), &uniforms[i]);
                        break;
#endif
                case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS:
                        batch->ctx->first_vertex_sysval_ptr =
                                ptr->gpu + (i * sizeof(*uniforms));
                        batch->ctx->base_vertex_sysval_ptr =
                                batch->ctx->first_vertex_sysval_ptr + 4;
                        batch->ctx->base_instance_sysval_ptr =
                                batch->ctx->first_vertex_sysval_ptr + 8;

                        uniforms[i].u[0] = batch->ctx->offset_start;
                        uniforms[i].u[1] = batch->ctx->base_vertex;
                        uniforms[i].u[2] = batch->ctx->base_instance;
                        break;
                case PAN_SYSVAL_DRAWID:
                        uniforms[i].u[0] = batch->ctx->drawid;
                        break;
                default:
                        assert(0);
                }
        }
}

static const void *
panfrost_map_constant_buffer_cpu(struct panfrost_context *ctx,
                                 struct panfrost_constant_buffer *buf,
                                 unsigned index)
{
        struct pipe_constant_buffer *cb = &buf->cb[index];
        struct panfrost_resource *rsrc = pan_resource(cb->buffer);

        if (rsrc) {
                panfrost_bo_mmap(rsrc->image.data.bo);
                panfrost_flush_writer(ctx, rsrc, "CPU constant buffer mapping");
                panfrost_bo_wait(rsrc->image.data.bo, INT64_MAX, false);

                return rsrc->image.data.bo->ptr.cpu + cb->buffer_offset;
        } else if (cb->user_buffer) {
                return cb->user_buffer + cb->buffer_offset;
        } else
                unreachable("No constant buffer");
}

/* Emit a single UBO record. On Valhall, UBOs are dumb buffers and are
 * implemented with buffer descriptors in the resource table, sized in terms
 * of bytes. On Bifrost and older, UBOs have a special uniform buffer data
 * structure, sized in terms of entries. */
static void
panfrost_emit_ubo(void *base, unsigned index, mali_ptr address, size_t size)
{
#if PAN_ARCH >= 9
        struct mali_buffer_packed *out = base;

        pan_pack(out + index, BUFFER, cfg) {
                cfg.size = size;
                cfg.address = address;
        }
#else
        struct mali_uniform_buffer_packed *out = base;

        /* Issue (57) for the ARB_uniform_buffer_object spec says that
         * the buffer can be larger than the uniform data inside it,
         * so clamp ubo size to what hardware supports. */

        pan_pack(out + index, UNIFORM_BUFFER, cfg) {
                cfg.entries = MIN2(DIV_ROUND_UP(size, 16), 1 << 12);
                cfg.pointer = address;
        }
#endif
}

static mali_ptr
panfrost_emit_const_buf(struct panfrost_batch *batch,
                        enum pipe_shader_type stage,
                        unsigned *buffer_count,
                        mali_ptr *push_constants,
                        unsigned *pushed_words)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_variants *all = ctx->shader[stage];

        if (!all)
                return 0;

        struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
        struct panfrost_shader_state *ss = &all->variants[all->active_variant];

        /* Allocate room for the sysval and the uniforms */
        size_t sys_size = sizeof(float) * 4 * ss->info.sysvals.sysval_count;
        struct panfrost_ptr transfer =
                pan_pool_alloc_aligned(&batch->pool.base, sys_size, 16);

        /* Upload sysvals requested by the shader */
        panfrost_upload_sysvals(batch, &transfer, ss, stage);

        /* Next up, attach UBOs. UBO count includes gaps but no sysval UBO */
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, stage);
        unsigned ubo_count = shader->info.ubo_count - (sys_size ? 1 : 0);
        unsigned sysval_ubo = sys_size ?
ubo_count : ~0; struct panfrost_ptr ubos = { 0 }; #if PAN_ARCH >= 9 ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, BUFFER); #else ubos = pan_pool_alloc_desc_array(&batch->pool.base, ubo_count + 1, UNIFORM_BUFFER); #endif if (buffer_count) *buffer_count = ubo_count + (sys_size ? 1 : 0); /* Upload sysval as a final UBO */ if (sys_size) panfrost_emit_ubo(ubos.cpu, ubo_count, transfer.gpu, sys_size); /* The rest are honest-to-goodness UBOs */ u_foreach_bit(ubo, ss->info.ubo_mask & buf->enabled_mask) { size_t usz = buf->cb[ubo].buffer_size; mali_ptr address = 0; if (usz > 0) { address = panfrost_map_constant_buffer_gpu(batch, stage, buf, ubo); } panfrost_emit_ubo(ubos.cpu, ubo, address, usz); } if (pushed_words) *pushed_words = ss->info.push.count; if (ss->info.push.count == 0) return ubos.gpu; /* Copy push constants required by the shader */ struct panfrost_ptr push_transfer = pan_pool_alloc_aligned(&batch->pool.base, ss->info.push.count * 4, 16); uint32_t *push_cpu = (uint32_t *) push_transfer.cpu; *push_constants = push_transfer.gpu; for (unsigned i = 0; i < ss->info.push.count; ++i) { struct panfrost_ubo_word src = ss->info.push.words[i]; if (src.ubo == sysval_ubo) { unsigned sysval_idx = src.offset / 16; unsigned sysval_comp = (src.offset % 16) / 4; unsigned sysval_type = PAN_SYSVAL_TYPE(ss->info.sysvals.sysvals[sysval_idx]); mali_ptr ptr = push_transfer.gpu + (4 * i); switch (sysval_type) { case PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS: switch (sysval_comp) { case 0: batch->ctx->first_vertex_sysval_ptr = ptr; break; case 1: batch->ctx->base_vertex_sysval_ptr = ptr; break; case 2: batch->ctx->base_instance_sysval_ptr = ptr; break; case 3: /* Spurious (Midgard doesn't pack) */ break; default: unreachable("Invalid vertex/instance offset component\n"); } break; case PAN_SYSVAL_NUM_WORK_GROUPS: batch->num_wg_sysval[sysval_comp] = ptr; break; default: break; } } /* Map the UBO, this should be cheap. However this is reading * from write-combine memory which is _very_ slow. It might pay * off to upload sysvals to a staging buffer on the CPU on the * assumption sysvals will get pushed (TODO) */ const void *mapped_ubo = (src.ubo == sysval_ubo) ? 
transfer.cpu : panfrost_map_constant_buffer_cpu(ctx, buf, src.ubo); /* TODO: Is there any benefit to combining ranges */ memcpy(push_cpu + i, (uint8_t *) mapped_ubo + src.offset, 4); } return ubos.gpu; } static mali_ptr panfrost_emit_shared_memory(struct panfrost_batch *batch, const struct pipe_grid_info *grid) { struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE]; struct panfrost_shader_state *ss = &all->variants[all->active_variant]; struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE); struct pan_tls_info info = { .tls.size = ss->info.tls_size, .wls.size = ss->info.wls_size, .wls.dim.x = grid->grid[0], .wls.dim.y = grid->grid[1], .wls.dim.z = grid->grid[2], }; if (ss->info.tls_size) { struct panfrost_bo *bo = panfrost_batch_get_scratchpad(batch, ss->info.tls_size, dev->thread_tls_alloc, dev->core_id_range); info.tls.ptr = bo->ptr.gpu; } if (ss->info.wls_size) { unsigned size = pan_wls_adjust_size(info.wls.size) * pan_wls_instances(&info.wls.dim) * dev->core_id_range; struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch, size, 1); info.wls.ptr = bo->ptr.gpu; } GENX(pan_emit_tls)(&info, t.cpu); return t.gpu; } #if PAN_ARCH <= 5 static mali_ptr panfrost_get_tex_desc(struct panfrost_batch *batch, enum pipe_shader_type st, struct panfrost_sampler_view *view) { if (!view) return (mali_ptr) 0; struct pipe_sampler_view *pview = &view->base; struct panfrost_resource *rsrc = pan_resource(pview->texture); panfrost_batch_read_rsrc(batch, rsrc, st); panfrost_batch_add_bo(batch, view->state.bo, st); return view->state.gpu; } #endif static void panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, struct pipe_context *pctx, struct pipe_resource *texture) { struct panfrost_device *device = pan_device(pctx->screen); struct panfrost_context *ctx = pan_context(pctx); struct panfrost_resource *prsrc = (struct panfrost_resource *)texture; enum pipe_format format = so->base.format; assert(prsrc->image.data.bo); /* Format to access the stencil/depth portion of a Z32_S8 texture */ if (format == PIPE_FORMAT_X32_S8X24_UINT) { assert(prsrc->separate_stencil); texture = &prsrc->separate_stencil->base; prsrc = (struct panfrost_resource *)texture; format = texture->format; } else if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { format = PIPE_FORMAT_Z32_FLOAT; } const struct util_format_description *desc = util_format_description(format); bool fake_rgtc = !panfrost_supports_compressed_format(device, MALI_BC4_UNORM); if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC && fake_rgtc) { if (desc->is_snorm) format = PIPE_FORMAT_R8G8B8A8_SNORM; else format = PIPE_FORMAT_R8G8B8A8_UNORM; desc = util_format_description(format); } so->texture_bo = prsrc->image.data.bo->ptr.gpu; so->modifier = prsrc->image.layout.modifier; /* MSAA only supported for 2D textures */ assert(texture->nr_samples <= 1 || so->base.target == PIPE_TEXTURE_2D || so->base.target == PIPE_TEXTURE_2D_ARRAY); enum mali_texture_dimension type = panfrost_translate_texture_dimension(so->base.target); bool is_buffer = (so->base.target == PIPE_BUFFER); unsigned first_level = is_buffer ? 0 : so->base.u.tex.first_level; unsigned last_level = is_buffer ? 0 : so->base.u.tex.last_level; unsigned first_layer = is_buffer ? 0 : so->base.u.tex.first_layer; unsigned last_layer = is_buffer ? 0 : so->base.u.tex.last_layer; unsigned buf_offset = is_buffer ? so->base.u.buf.offset : 0; unsigned buf_size = (is_buffer ? 
so->base.u.buf.size : 0) / util_format_get_blocksize(format); if (so->base.target == PIPE_TEXTURE_3D) { first_layer /= prsrc->image.layout.depth; last_layer /= prsrc->image.layout.depth; assert(!first_layer && !last_layer); } struct pan_image_view iview = { .format = format, .dim = type, .first_level = first_level, .last_level = last_level, .first_layer = first_layer, .last_layer = last_layer, .swizzle = { so->base.swizzle_r, so->base.swizzle_g, so->base.swizzle_b, so->base.swizzle_a, }, .image = &prsrc->image, .buf.offset = buf_offset, .buf.size = buf_size, }; unsigned size = (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) + GENX(panfrost_estimate_texture_payload_size)(&iview); struct panfrost_pool *pool = so->pool ?: &ctx->descs; struct panfrost_ptr payload = pan_pool_alloc_aligned(&pool->base, size, 64); so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu); void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu; if (PAN_ARCH <= 5) { payload.cpu += pan_size(TEXTURE); payload.gpu += pan_size(TEXTURE); } GENX(panfrost_new_texture)(device, &iview, tex, &payload); } static void panfrost_update_sampler_view(struct panfrost_sampler_view *view, struct pipe_context *pctx) { struct panfrost_resource *rsrc = pan_resource(view->base.texture); if (view->texture_bo != rsrc->image.data.bo->ptr.gpu || view->modifier != rsrc->image.layout.modifier) { panfrost_bo_unreference(view->state.bo); panfrost_create_sampler_view_bo(view, pctx, &rsrc->base); } } static mali_ptr panfrost_emit_texture_descriptors(struct panfrost_batch *batch, enum pipe_shader_type stage) { struct panfrost_context *ctx = batch->ctx; if (!ctx->sampler_view_count[stage]) return 0; #if PAN_ARCH >= 6 struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, ctx->sampler_view_count[stage], TEXTURE); struct mali_texture_packed *out = (struct mali_texture_packed *) T.cpu; for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; if (!view) { memset(&out[i], 0, sizeof(out[i])); continue; } struct pipe_sampler_view *pview = &view->base; struct panfrost_resource *rsrc = pan_resource(pview->texture); panfrost_update_sampler_view(view, &ctx->base); out[i] = view->bifrost_descriptor; panfrost_batch_read_rsrc(batch, rsrc, stage); panfrost_batch_add_bo(batch, view->state.bo, stage); } return T.gpu; #else uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS]; for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; if (!view) { trampolines[i] = 0; continue; } panfrost_update_sampler_view(view, &ctx->base); trampolines[i] = panfrost_get_tex_desc(batch, stage, view); } return pan_pool_upload_aligned(&batch->pool.base, trampolines, sizeof(uint64_t) * ctx->sampler_view_count[stage], sizeof(uint64_t)); #endif } static mali_ptr panfrost_emit_sampler_descriptors(struct panfrost_batch *batch, enum pipe_shader_type stage) { struct panfrost_context *ctx = batch->ctx; if (!ctx->sampler_count[stage]) return 0; struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, ctx->sampler_count[stage], SAMPLER); struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu; for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) { struct panfrost_sampler_state *st = ctx->samplers[stage][i]; out[i] = st ? st->hw : (struct mali_sampler_packed){0}; } return T.gpu; } #if PAN_ARCH <= 7 /* Packs all image attribute descs and attribute buffer descs. 
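 * Each image takes one attribute descriptor plus two buffer descriptors (a
 * base record and a 3D continuation), so image i uses buffer slots
 * first_buf + 2*i and first_buf + 2*i + 1 (as derived in the loops below).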
* `first_image_buf_index` must be the index of the first image attribute buffer descriptor. */ static void emit_image_attribs(struct panfrost_context *ctx, enum pipe_shader_type shader, struct mali_attribute_packed *attribs, unsigned first_buf) { struct panfrost_device *dev = pan_device(ctx->base.screen); unsigned last_bit = util_last_bit(ctx->image_mask[shader]); for (unsigned i = 0; i < last_bit; ++i) { enum pipe_format format = ctx->images[shader][i].format; pan_pack(attribs + i, ATTRIBUTE, cfg) { /* Continuation record means 2 buffers per image */ cfg.buffer_index = first_buf + (i * 2); cfg.offset_enable = (PAN_ARCH <= 5); cfg.format = dev->formats[format].hw; } } } static enum mali_attribute_type pan_modifier_to_attr_type(uint64_t modifier) { switch (modifier) { case DRM_FORMAT_MOD_LINEAR: return MALI_ATTRIBUTE_TYPE_3D_LINEAR; case DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED: return MALI_ATTRIBUTE_TYPE_3D_INTERLEAVED; default: unreachable("Invalid modifier for attribute record"); } } static void emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader, struct mali_attribute_buffer_packed *bufs, unsigned first_image_buf_index) { struct panfrost_context *ctx = batch->ctx; unsigned last_bit = util_last_bit(ctx->image_mask[shader]); for (unsigned i = 0; i < last_bit; ++i) { struct pipe_image_view *image = &ctx->images[shader][i]; if (!(ctx->image_mask[shader] & (1 << i)) || !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) { /* Unused image bindings */ pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg); pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER, cfg); continue; } struct panfrost_resource *rsrc = pan_resource(image->resource); /* TODO: MSAA */ assert(image->resource->nr_samples <= 1 && "MSAA'd images not supported"); bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D; bool is_buffer = rsrc->base.target == PIPE_BUFFER; unsigned offset = is_buffer ? image->u.buf.offset : panfrost_texture_offset(&rsrc->image.layout, image->u.tex.level, is_3d ? 0 : image->u.tex.first_layer, is_3d ? image->u.tex.first_layer : 0); panfrost_track_image_access(batch, shader, image); pan_pack(bufs + (i * 2), ATTRIBUTE_BUFFER, cfg) { cfg.type = pan_modifier_to_attr_type(rsrc->image.layout.modifier); cfg.pointer = rsrc->image.data.bo->ptr.gpu + offset; cfg.stride = util_format_get_blocksize(image->format); cfg.size = rsrc->image.data.bo->size - offset; } if (is_buffer) { pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { cfg.s_dimension = rsrc->base.width0 / util_format_get_blocksize(image->format); cfg.t_dimension = cfg.r_dimension = 1; } continue; } pan_pack(bufs + (i * 2) + 1, ATTRIBUTE_BUFFER_CONTINUATION_3D, cfg) { unsigned level = image->u.tex.level; cfg.s_dimension = u_minify(rsrc->base.width0, level); cfg.t_dimension = u_minify(rsrc->base.height0, level); cfg.r_dimension = is_3d ? 
u_minify(rsrc->base.depth0, level) :
                                image->u.tex.last_layer - image->u.tex.first_layer + 1;

                        cfg.row_stride = rsrc->image.layout.slices[level].row_stride;

                        if (rsrc->base.target != PIPE_TEXTURE_2D) {
                                cfg.slice_stride =
                                        panfrost_get_layer_stride(&rsrc->image.layout,
                                                                  level);
                        }
                }
        }
}

static mali_ptr
panfrost_emit_image_attribs(struct panfrost_batch *batch, mali_ptr *buffers,
                            enum pipe_shader_type type)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_shader_state *shader = panfrost_get_shader_state(ctx, type);

        if (!shader->info.attribute_count) {
                *buffers = 0;
                return 0;
        }

        /* Images always need a MALI_ATTRIBUTE_BUFFER_CONTINUATION_3D */
        unsigned attr_count = shader->info.attribute_count;
        unsigned buf_count = (attr_count * 2) + (PAN_ARCH >= 6 ? 1 : 0);

        struct panfrost_ptr bufs =
                pan_pool_alloc_desc_array(&batch->pool.base, buf_count, ATTRIBUTE_BUFFER);

        struct panfrost_ptr attribs =
                pan_pool_alloc_desc_array(&batch->pool.base, attr_count, ATTRIBUTE);

        emit_image_attribs(ctx, type, attribs.cpu, 0);
        emit_image_bufs(batch, type, bufs.cpu, 0);

        /* We need an empty attrib buf to stop the prefetching on Bifrost */
#if PAN_ARCH >= 6
        pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)),
                 ATTRIBUTE_BUFFER, cfg);
#endif

        *buffers = bufs.gpu;
        return attribs.gpu;
}

static mali_ptr
panfrost_emit_vertex_data(struct panfrost_batch *batch,
                          mali_ptr *buffers)
{
        struct panfrost_context *ctx = batch->ctx;
        struct panfrost_vertex_state *so = ctx->vertex;
        struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
        bool instanced = ctx->indirect_draw || ctx->instance_count > 1;
        uint32_t image_mask = ctx->image_mask[PIPE_SHADER_VERTEX];
        unsigned nr_images = util_last_bit(image_mask);

        /* Worst case: everything is NPOT, which is only possible if instancing
         * is enabled. Otherwise a single record is guaranteed. Also, we
         * allocate more memory than needed here if either instancing is
         * enabled or images are present; this can be improved. */
        unsigned bufs_per_attrib = (instanced || nr_images > 0) ? 2 : 1;
        unsigned nr_bufs = ((so->nr_bufs + nr_images) * bufs_per_attrib) +
                           (PAN_ARCH >= 6 ? 1 : 0);
1 : 0); unsigned count = vs->info.attribute_count; if (vs->xfb) count = MAX2(count, vs->xfb->info.attribute_count); #if PAN_ARCH <= 5 /* Midgard needs vertexid/instanceid handled specially */ bool special_vbufs = count >= PAN_VERTEX_ID; if (special_vbufs) nr_bufs += 2; #endif if (!nr_bufs) { *buffers = 0; return 0; } struct panfrost_ptr S = pan_pool_alloc_desc_array(&batch->pool.base, nr_bufs, ATTRIBUTE_BUFFER); struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, count, ATTRIBUTE); struct mali_attribute_buffer_packed *bufs = (struct mali_attribute_buffer_packed *) S.cpu; struct mali_attribute_packed *out = (struct mali_attribute_packed *) T.cpu; unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 }; unsigned k = 0; for (unsigned i = 0; i < so->nr_bufs; ++i) { unsigned vbi = so->buffers[i].vbi; unsigned divisor = so->buffers[i].divisor; attrib_to_buffer[i] = k; if (!(ctx->vb_mask & (1 << vbi))) continue; struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; struct panfrost_resource *rsrc; rsrc = pan_resource(buf->buffer.resource); if (!rsrc) continue; panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); /* Mask off lower bits, see offset fixup below */ mali_ptr raw_addr = rsrc->image.data.bo->ptr.gpu + buf->buffer_offset; mali_ptr addr = raw_addr & ~63; /* Since we advanced the base pointer, we shrink the buffer * size, but add the offset we subtracted */ unsigned size = rsrc->base.width0 + (raw_addr - addr) - buf->buffer_offset; /* When there is a divisor, the hardware-level divisor is * the product of the instance divisor and the padded count */ unsigned stride = buf->stride; if (ctx->indirect_draw) { /* We allocated 2 records for each attribute buffer */ assert((k & 1) == 0); /* With indirect draws we can't guess the vertex_count. * Pre-set the address, stride and size fields; the * compute shader will do the rest. */ pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D; cfg.pointer = addr; cfg.stride = stride; cfg.size = size; } /* We store the unmodified divisor in the continuation * slot so the compute shader can retrieve it.
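*
* As an illustrative sketch (numbers hypothetical): with an instance
* divisor of 3 and a padded vertex count of 4, the direct path below
* computes hw_divisor = 4 * 3 = 12; for an indirect draw the padded
* count is not known yet, so the raw divisor 3 is parked in the
* continuation record for the patching compute shader to finish the
* job.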
*/ pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { cfg.divisor = divisor; } k += 2; continue; } unsigned hw_divisor = ctx->padded_count * divisor; if (ctx->instance_count <= 1) { /* Per-instance would be every attribute equal */ if (divisor) stride = 0; pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { cfg.pointer = addr; cfg.stride = stride; cfg.size = size; } } else if (!divisor) { pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS; cfg.pointer = addr; cfg.stride = stride; cfg.size = size; cfg.divisor = ctx->padded_count; } } else if (util_is_power_of_two_or_zero(hw_divisor)) { pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR; cfg.pointer = addr; cfg.stride = stride; cfg.size = size; cfg.divisor_r = __builtin_ctz(hw_divisor); } } else { unsigned shift = 0, extra_flags = 0; unsigned magic_divisor = panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); /* Records with continuations must be aligned */ k = ALIGN_POT(k, 2); attrib_to_buffer[i] = k; pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) { cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR; cfg.pointer = addr; cfg.stride = stride; cfg.size = size; cfg.divisor_r = shift; cfg.divisor_e = extra_flags; } pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) { cfg.divisor_numerator = magic_divisor; cfg.divisor = divisor; } ++k; } ++k; } #if PAN_ARCH <= 5 /* Add special gl_VertexID/gl_InstanceID buffers */ if (special_vbufs) { panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) { cfg.buffer_index = k++; cfg.format = so->formats[PAN_VERTEX_ID]; } panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1); pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) { cfg.buffer_index = k++; cfg.format = so->formats[PAN_INSTANCE_ID]; } } #endif if (nr_images) { k = ALIGN_POT(k, 2); emit_image_attribs(ctx, PIPE_SHADER_VERTEX, out + so->num_elements, k); emit_image_bufs(batch, PIPE_SHADER_VERTEX, bufs + k, k); k += (util_last_bit(ctx->image_mask[PIPE_SHADER_VERTEX]) * 2); } #if PAN_ARCH >= 6 /* We need an empty attrib buf to stop the prefetching on Bifrost */ pan_pack(&bufs[k], ATTRIBUTE_BUFFER, cfg); #endif /* Attribute addresses require 64-byte alignment, so let: * * base' = base & ~63 = base - (base & 63) * offset' = offset + (base & 63) * * Since base' + offset' = base + offset, these are equivalent * addressing modes and now base is 64 aligned. */ /* While these are usually equal, they are not required to be. In some * cases, u_blitter passes too high a value for num_elements. 
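*
* Worked example for the fixup below (illustrative numbers): with a
* 64-byte-aligned BO and buffer_offset = 0x47, the record pointer was
* rounded down by 0x47 & 63 = 7 bytes above, so each ATTRIBUTE gains
* offset' = src_offset + 7; base' + offset' equals the original
* address while the record pointer stays 64-byte aligned.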
*/ assert(vs->info.attributes_read_count <= so->num_elements); for (unsigned i = 0; i < vs->info.attributes_read_count; ++i) { unsigned vbi = so->pipe[i].vertex_buffer_index; struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi]; /* BOs are aligned; just fixup for buffer_offset */ signed src_offset = so->pipe[i].src_offset; src_offset += (buf->buffer_offset & 63); /* Base instance offset */ if (ctx->base_instance && so->pipe[i].instance_divisor) { src_offset += (ctx->base_instance * buf->stride) / so->pipe[i].instance_divisor; } /* Also, somewhat obscurely per-instance data needs to be * offset in response to a delayed start in an indexed draw */ if (so->pipe[i].instance_divisor && ctx->instance_count > 1) src_offset -= buf->stride * ctx->offset_start; pan_pack(out + i, ATTRIBUTE, cfg) { cfg.buffer_index = attrib_to_buffer[so->element_buffer[i]]; cfg.format = so->formats[i]; cfg.offset = src_offset; } } *buffers = S.gpu; return T.gpu; } static mali_ptr panfrost_emit_varyings(struct panfrost_batch *batch, struct mali_attribute_buffer_packed *slot, unsigned stride, unsigned count) { unsigned size = stride * count; mali_ptr ptr = batch->ctx->indirect_draw ? 0 : pan_pool_alloc_aligned(&batch->invisible_pool.base, size, 64).gpu; pan_pack(slot, ATTRIBUTE_BUFFER, cfg) { cfg.stride = stride; cfg.size = size; cfg.pointer = ptr; } return ptr; } /* Given a varying, figure out which index it corresponds to */ static inline unsigned pan_varying_index(unsigned present, enum pan_special_varying v) { return util_bitcount(present & BITFIELD_MASK(v)); } /* Determines which varying buffers are required */ static inline unsigned pan_varying_present(const struct panfrost_device *dev, struct pan_shader_info *producer, struct pan_shader_info *consumer, uint16_t point_coord_mask) { /* At the moment we always emit general and position buffers. Not * strictly necessary but usually harmless */ unsigned present = BITFIELD_BIT(PAN_VARY_GENERAL) | BITFIELD_BIT(PAN_VARY_POSITION); /* Enable special buffers by the shader info */ if (producer->vs.writes_point_size) present |= BITFIELD_BIT(PAN_VARY_PSIZ); #if PAN_ARCH <= 5 /* On Midgard, these exist as real varyings. Later architectures use * LD_VAR_SPECIAL reads instead. 
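* For example, a Midgard fragment shader reading gl_PointCoord,
* gl_FrontFacing or gl_FragCoord consumes one dedicated varying buffer
* per builtin, which is why each read below flips its PAN_VARY_* bit.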
*/ if (consumer->fs.reads_point_coord) present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); if (consumer->fs.reads_face) present |= BITFIELD_BIT(PAN_VARY_FACE); if (consumer->fs.reads_frag_coord) present |= BITFIELD_BIT(PAN_VARY_FRAGCOORD); /* Also, if we have a point sprite, we need a point coord buffer */ for (unsigned i = 0; i < consumer->varyings.input_count; i++) { gl_varying_slot loc = consumer->varyings.input[i].location; if (util_varying_is_point_coord(loc, point_coord_mask)) present |= BITFIELD_BIT(PAN_VARY_PNTCOORD); } #endif return present; } /* Emitters for varying records */ static void pan_emit_vary(const struct panfrost_device *dev, struct mali_attribute_packed *out, unsigned buffer_index, mali_pixel_format format, unsigned offset) { pan_pack(out, ATTRIBUTE, cfg) { cfg.buffer_index = buffer_index; cfg.offset_enable = (PAN_ARCH <= 5); cfg.format = format; cfg.offset = offset; } } /* Special records */ static const struct { unsigned components; enum mali_format format; } pan_varying_formats[PAN_VARY_MAX] = { [PAN_VARY_POSITION] = { 4, MALI_SNAP_4 }, [PAN_VARY_PSIZ] = { 1, MALI_R16F }, [PAN_VARY_PNTCOORD] = { 1, MALI_R16F }, [PAN_VARY_FACE] = { 1, MALI_R32I }, [PAN_VARY_FRAGCOORD] = { 4, MALI_RGBA32F }, }; static mali_pixel_format pan_special_format(const struct panfrost_device *dev, enum pan_special_varying buf) { assert(buf < PAN_VARY_MAX); mali_pixel_format format = (pan_varying_formats[buf].format << 12); #if PAN_ARCH <= 6 unsigned nr = pan_varying_formats[buf].components; format |= panfrost_get_default_swizzle(nr); #endif return format; } static void pan_emit_vary_special(const struct panfrost_device *dev, struct mali_attribute_packed *out, unsigned present, enum pan_special_varying buf) { pan_emit_vary(dev, out, pan_varying_index(present, buf), pan_special_format(dev, buf), 0); } /* Negative indicates a varying is not found */ static signed pan_find_vary(const struct pan_shader_varying *vary, unsigned vary_count, unsigned loc) { for (unsigned i = 0; i < vary_count; ++i) { if (vary[i].location == loc) return i; } return -1; } /* Assign varying locations for the general buffer. Returns the calculated * per-vertex stride, and outputs offsets into the passed array. Negative * offset indicates a varying is not used. */ static unsigned pan_assign_varyings(const struct panfrost_device *dev, struct pan_shader_info *producer, struct pan_shader_info *consumer, signed *offsets) { unsigned producer_count = producer->varyings.output_count; unsigned consumer_count = consumer->varyings.input_count; const struct pan_shader_varying *producer_vars = producer->varyings.output; const struct pan_shader_varying *consumer_vars = consumer->varyings.input; unsigned stride = 0; for (unsigned i = 0; i < producer_count; ++i) { signed loc = pan_find_vary(consumer_vars, consumer_count, producer_vars[i].location); if (loc >= 0) { offsets[i] = stride; enum pipe_format format = consumer_vars[loc].format; stride += util_format_get_blocksize(format); } else { offsets[i] = -1; } } return stride; } /* Emitter for a single varying (attribute) descriptor */ static void panfrost_emit_varying(const struct panfrost_device *dev, struct mali_attribute_packed *out, const struct pan_shader_varying varying, enum pipe_format pipe_format, unsigned present, uint16_t point_sprite_mask, signed offset, enum pan_special_varying pos_varying) { /* Note: varying.format != pipe_format in some obscure cases due to a * limitation of the NIR linker. This should be fixed in the future to * eliminate the additional lookups. 
See: * dEQP-GLES3.functional.shaders.conditionals.if.sequence_statements_vertex */ gl_varying_slot loc = varying.location; mali_pixel_format format = dev->formats[pipe_format].hw; if (util_varying_is_point_coord(loc, point_sprite_mask)) { pan_emit_vary_special(dev, out, present, PAN_VARY_PNTCOORD); } else if (loc == VARYING_SLOT_POS) { pan_emit_vary_special(dev, out, present, pos_varying); } else if (loc == VARYING_SLOT_PSIZ) { pan_emit_vary_special(dev, out, present, PAN_VARY_PSIZ); } else if (loc == VARYING_SLOT_FACE) { pan_emit_vary_special(dev, out, present, PAN_VARY_FACE); } else if (offset < 0) { pan_emit_vary(dev, out, 0, (MALI_CONSTANT << 12), 0); } else { STATIC_ASSERT(PAN_VARY_GENERAL == 0); pan_emit_vary(dev, out, 0, format, offset); } } /* Links varyings and uploads ATTRIBUTE descriptors. Can execute at link time, * rather than draw time (under good conditions). */ static void panfrost_emit_varying_descs( struct panfrost_pool *pool, struct panfrost_shader_state *producer, struct panfrost_shader_state *consumer, uint16_t point_coord_mask, struct pan_linkage *out) { struct panfrost_device *dev = pool->base.dev; unsigned producer_count = producer->info.varyings.output_count; unsigned consumer_count = consumer->info.varyings.input_count; /* Offsets within the general varying buffer, indexed by location */ signed offsets[PAN_MAX_VARYINGS]; assert(producer_count <= ARRAY_SIZE(offsets)); assert(consumer_count <= ARRAY_SIZE(offsets)); /* Allocate enough descriptors for both shader stages */ struct panfrost_ptr T = pan_pool_alloc_desc_array(&pool->base, producer_count + consumer_count, ATTRIBUTE); /* Take a reference if we're being put on the CSO */ if (!pool->owned) { out->bo = pool->transient_bo; panfrost_bo_reference(out->bo); } struct mali_attribute_packed *descs = T.cpu; out->producer = producer_count ? T.gpu : 0; out->consumer = consumer_count ? T.gpu + (pan_size(ATTRIBUTE) * producer_count) : 0; /* Lay out the varyings. Must use producer to lay out, in order to * respect transform feedback precisions. */ out->present = pan_varying_present(dev, &producer->info, &consumer->info, point_coord_mask); out->stride = pan_assign_varyings(dev, &producer->info, &consumer->info, offsets); for (unsigned i = 0; i < producer_count; ++i) { signed j = pan_find_vary(consumer->info.varyings.input, consumer->info.varyings.input_count, producer->info.varyings.output[i].location); enum pipe_format format = (j >= 0) ? consumer->info.varyings.input[j].format : producer->info.varyings.output[i].format; panfrost_emit_varying(dev, descs + i, producer->info.varyings.output[i], format, out->present, 0, offsets[i], PAN_VARY_POSITION); } for (unsigned i = 0; i < consumer_count; ++i) { signed j = pan_find_vary(producer->info.varyings.output, producer->info.varyings.output_count, consumer->info.varyings.input[i].location); signed offset = (j >= 0) ? 
offsets[j] : -1; panfrost_emit_varying(dev, descs + producer_count + i, consumer->info.varyings.input[i], consumer->info.varyings.input[i].format, out->present, point_coord_mask, offset, PAN_VARY_FRAGCOORD); } } #if PAN_ARCH <= 5 static void pan_emit_special_input(struct mali_attribute_buffer_packed *out, unsigned present, enum pan_special_varying v, unsigned special) { if (present & BITFIELD_BIT(v)) { unsigned idx = pan_varying_index(present, v); pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) { cfg.special = special; cfg.type = 0; } } } #endif static void panfrost_emit_varying_descriptor(struct panfrost_batch *batch, unsigned vertex_count, mali_ptr *vs_attribs, mali_ptr *fs_attribs, mali_ptr *buffers, unsigned *buffer_count, mali_ptr *position, mali_ptr *psiz, bool point_coord_replace) { /* Load the shaders */ struct panfrost_context *ctx = batch->ctx; struct panfrost_shader_state *vs, *fs; vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); uint16_t point_coord_mask = 0; #if PAN_ARCH <= 5 /* Point sprites are lowered on Bifrost and newer */ if (point_coord_replace) point_coord_mask = ctx->rasterizer->base.sprite_coord_enable; #endif /* In good conditions, we only need to link varyings once */ bool prelink = (point_coord_mask == 0) && !vs->info.separable && !fs->info.separable; /* Try to reduce copies */ struct pan_linkage _linkage; struct pan_linkage *linkage = prelink ? &vs->linkage : &_linkage; /* Emit ATTRIBUTE descriptors if needed */ if (!prelink || vs->linkage.bo == NULL) { struct panfrost_pool *pool = prelink ? &ctx->descs : &batch->pool; panfrost_emit_varying_descs(pool, vs, fs, point_coord_mask, linkage); } unsigned present = linkage->present, stride = linkage->stride; unsigned count = util_bitcount(present); struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, count + 1, ATTRIBUTE_BUFFER); struct mali_attribute_buffer_packed *varyings = (struct mali_attribute_buffer_packed *) T.cpu; if (buffer_count) *buffer_count = count; #if PAN_ARCH >= 6 /* Suppress prefetch on Bifrost */ memset(varyings + count, 0, sizeof(*varyings)); #endif if (stride) { panfrost_emit_varyings(batch, &varyings[pan_varying_index(present, PAN_VARY_GENERAL)], stride, vertex_count); } else { /* The indirect draw code reads the stride field; make sure * that it is initialised */ memset(varyings + pan_varying_index(present, PAN_VARY_GENERAL), 0, sizeof(*varyings)); } /* fp32 vec4 gl_Position */ *position = panfrost_emit_varyings(batch, &varyings[pan_varying_index(present, PAN_VARY_POSITION)], sizeof(float) * 4, vertex_count); if (present & BITFIELD_BIT(PAN_VARY_PSIZ)) { *psiz = panfrost_emit_varyings(batch, &varyings[pan_varying_index(present, PAN_VARY_PSIZ)], 2, vertex_count); } #if PAN_ARCH <= 5 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD); pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING); pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD); #endif *buffers = T.gpu; *vs_attribs = linkage->producer; *fs_attribs = linkage->consumer; } /* * Emit jobs required for the rasterization pipeline. If there are side effects * from the vertex shader, these are handled ahead-of-time with a compute * shader. This function should not be called if rasterization is skipped.
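*
* The pair is ordered through the scoreboard: the index returned for
* the vertex job is passed as a dependency of the tiler job, so tiling
* only starts once shaded positions are available.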
*/ static void panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch, const struct panfrost_ptr *vertex_job, const struct panfrost_ptr *tiler_job) { struct panfrost_context *ctx = batch->ctx; /* XXX - set job_barrier in case buffers get ping-ponged and we need to * enforce ordering; this has a perf hit! See * KHR-GLES31.core.vertex_attrib_binding.advanced-iterations */ unsigned vertex = panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, true, false, ctx->indirect_draw ? batch->indirect_draw_job_id : 0, 0, vertex_job, false); panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, false, vertex, 0, tiler_job, false); } #endif static void emit_tls(struct panfrost_batch *batch) { struct panfrost_device *dev = pan_device(batch->ctx->base.screen); /* Emitted with the FB descriptor on Midgard. */ if (PAN_ARCH <= 5 && batch->framebuffer.gpu) return; struct panfrost_bo *tls_bo = batch->stack_size ? panfrost_batch_get_scratchpad(batch, batch->stack_size, dev->thread_tls_alloc, dev->core_id_range): NULL; struct pan_tls_info tls = { .tls = { .ptr = tls_bo ? tls_bo->ptr.gpu : 0, .size = batch->stack_size, }, }; assert(batch->tls.cpu); GENX(pan_emit_tls)(&tls, batch->tls.cpu); } static void emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb) { struct panfrost_device *dev = pan_device(batch->ctx->base.screen); struct panfrost_bo *tls_bo = batch->stack_size ? panfrost_batch_get_scratchpad(batch, batch->stack_size, dev->thread_tls_alloc, dev->core_id_range): NULL; struct pan_tls_info tls = { .tls = { .ptr = tls_bo ? tls_bo->ptr.gpu : 0, .size = batch->stack_size, }, }; batch->framebuffer.gpu |= GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx, batch->framebuffer.cpu); } /* Mark a surface as written */ static void panfrost_initialize_surface(struct panfrost_batch *batch, struct pipe_surface *surf) { if (surf) { struct panfrost_resource *rsrc = pan_resource(surf->texture); BITSET_SET(rsrc->valid.data, surf->u.tex.level); } } /* Generate a fragment job. This should be called once per frame. (Usually, * this corresponds to eglSwapBuffers or one of glFlush, glFinish) */ static mali_ptr emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb) { /* Mark the affected buffers as initialized, since we're writing to them. * Also, add the surfaces we're writing to to the batch */ struct pipe_framebuffer_state *fb = &batch->key; for (unsigned i = 0; i < fb->nr_cbufs; ++i) panfrost_initialize_surface(batch, fb->cbufs[i]); panfrost_initialize_surface(batch, fb->zsbuf); /* The passed tile coords can be out of range in some cases, so we need * to clamp them to the framebuffer size to avoid a TILE_RANGE_FAULT. * Theoretically we also need to clamp the coordinates positive, but we * avoid that edge case as all four values are unsigned. Also, * theoretically we could clamp the minima, but if that has to happen * the asserts would fail anyway (since the maxima would get clamped * and then be smaller than the minima). An edge case of sorts occurs * when no scissors are added to draw, so by default min=~0 and max=0. * But that can't happen if any actual drawing occurs (beyond a * wallpaper reload), so this is again irrelevant in practice. */ batch->maxx = MIN2(batch->maxx, fb->width); batch->maxy = MIN2(batch->maxy, fb->height); /* Rendering region must be at least 1x1; otherwise, there is nothing * to do and the whole job chain should have been discarded.
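*
* Worked example (illustrative numbers): on a 1920x1080 framebuffer
* with batch->maxx = 2048, the clamp above yields maxx = 1920, and the
* asserts below then require minx < 1920 and miny < maxy.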
*/ assert(batch->maxx > batch->minx); assert(batch->maxy > batch->miny); struct panfrost_ptr transfer = pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB); GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu, transfer.cpu); return transfer.gpu; } #define DEFINE_CASE(c) case PIPE_PRIM_##c: return MALI_DRAW_MODE_##c; static uint8_t pan_draw_mode(enum pipe_prim_type mode) { switch (mode) { DEFINE_CASE(POINTS); DEFINE_CASE(LINES); DEFINE_CASE(LINE_LOOP); DEFINE_CASE(LINE_STRIP); DEFINE_CASE(TRIANGLES); DEFINE_CASE(TRIANGLE_STRIP); DEFINE_CASE(TRIANGLE_FAN); DEFINE_CASE(QUADS); DEFINE_CASE(POLYGON); #if PAN_ARCH <= 6 DEFINE_CASE(QUAD_STRIP); #endif default: unreachable("Invalid draw mode"); } } #undef DEFINE_CASE /* Count generated primitives (when there are no geom/tess shaders) for * transform feedback */ static void panfrost_statistics_record( struct panfrost_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw) { if (!ctx->active_queries) return; uint32_t prims = u_prims_for_vertices(info->mode, draw->count); ctx->prims_generated += prims; if (!ctx->streamout.num_targets) return; ctx->tf_prims_generated += prims; ctx->dirty |= PAN_DIRTY_SO; } static void panfrost_update_streamout_offsets(struct panfrost_context *ctx) { unsigned count = u_stream_outputs_for_vertices(ctx->active_prim, ctx->vertex_count); for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) { if (!ctx->streamout.targets[i]) continue; pan_so_target(ctx->streamout.targets[i])->offset += count; } } static inline enum mali_index_type panfrost_translate_index_size(unsigned size) { STATIC_ASSERT(MALI_INDEX_TYPE_NONE == 0); STATIC_ASSERT(MALI_INDEX_TYPE_UINT8 == 1); STATIC_ASSERT(MALI_INDEX_TYPE_UINT16 == 2); return (size == 4) ? MALI_INDEX_TYPE_UINT32 : size; } #if PAN_ARCH <= 7 static inline void pan_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d, enum pipe_shader_type st) { d->offset_start = batch->ctx->offset_start; d->instance_size = batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1; d->uniform_buffers = batch->uniform_buffers[st]; d->push_uniforms = batch->push_uniforms[st]; d->textures = batch->textures[st]; d->samplers = batch->samplers[st]; } static void panfrost_draw_emit_vertex_section(struct panfrost_batch *batch, mali_ptr vs_vary, mali_ptr varyings, mali_ptr attribs, mali_ptr attrib_bufs, void *section) { pan_pack(section, DRAW, cfg) { cfg.state = batch->rsd[PIPE_SHADER_VERTEX]; cfg.attributes = attribs; cfg.attribute_buffers = attrib_bufs; cfg.varyings = vs_vary; cfg.varying_buffers = vs_vary ?
varyings : 0; cfg.thread_storage = batch->tls.gpu; pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX); } } static void panfrost_draw_emit_vertex(struct panfrost_batch *batch, const struct pipe_draw_info *info, void *invocation_template, mali_ptr vs_vary, mali_ptr varyings, mali_ptr attribs, mali_ptr attrib_bufs, void *job) { void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION); memcpy(section, invocation_template, pan_size(INVOCATION)); pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) { cfg.job_task_split = 5; } section = pan_section_ptr(job, COMPUTE_JOB, DRAW); panfrost_draw_emit_vertex_section(batch, vs_vary, varyings, attribs, attrib_bufs, section); } #endif static void panfrost_emit_primitive_size(struct panfrost_context *ctx, bool points, mali_ptr size_array, void *prim_size) { struct panfrost_rasterizer *rast = ctx->rasterizer; pan_pack(prim_size, PRIMITIVE_SIZE, cfg) { if (panfrost_writes_point_size(ctx)) { cfg.size_array = size_array; } else { cfg.constant = points ? rast->base.point_size : rast->base.line_width; } } } static bool panfrost_is_implicit_prim_restart(const struct pipe_draw_info *info) { /* As a reminder, primitive_restart should always be checked before any access to restart_index. */ return info->primitive_restart && info->restart_index == (unsigned)BITFIELD_MASK(info->index_size * 8); } /* On Bifrost and older, the Renderer State Descriptor aggregates many pieces of * 3D state. In particular, it groups the fragment shader descriptor with * depth/stencil, blend, polygon offset, and multisampling state. These pieces * of state are dirty tracked independently for the benefit of newer GPUs that * separate the descriptors. FRAGMENT_RSD_DIRTY_MASK contains the list of 3D * dirty flags that trigger re-emits of the fragment RSD. * * Obscurely, occlusion queries are included. Occlusion query state is nominally * specified in the draw call descriptor, but must be considered when determining * early-Z state which is part of the RSD. */ #define FRAGMENT_RSD_DIRTY_MASK ( \ PAN_DIRTY_ZS | PAN_DIRTY_BLEND | PAN_DIRTY_MSAA | \ PAN_DIRTY_RASTERIZER | PAN_DIRTY_OQ) static inline void panfrost_update_shader_state(struct panfrost_batch *batch, enum pipe_shader_type st) { struct panfrost_context *ctx = batch->ctx; struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st); bool frag = (st == PIPE_SHADER_FRAGMENT); unsigned dirty_3d = ctx->dirty; unsigned dirty = ctx->dirty_shader[st]; if (dirty & PAN_DIRTY_STAGE_TEXTURE) { batch->textures[st] = panfrost_emit_texture_descriptors(batch, st); } if (dirty & PAN_DIRTY_STAGE_SAMPLER) { batch->samplers[st] = panfrost_emit_sampler_descriptors(batch, st); } /* On Bifrost and older, the fragment shader descriptor is fused * together with the renderer state; the combined renderer state * descriptor is emitted below. Otherwise, the shader descriptor is * standalone and is emitted here. */ if ((dirty & PAN_DIRTY_STAGE_SHADER) && !((PAN_ARCH <= 7) && frag)) { batch->rsd[st] = panfrost_emit_compute_shader_meta(batch, st); } #if PAN_ARCH >= 9 if (dirty & PAN_DIRTY_STAGE_IMAGE) batch->images[st] = panfrost_emit_images(batch, st); #endif if ((dirty & ss->dirty_shader) || (dirty_3d & ss->dirty_3d)) { batch->uniform_buffers[st] = panfrost_emit_const_buf(batch, st, NULL, &batch->push_uniforms[st], NULL); } #if PAN_ARCH <= 7 /* On Bifrost and older, if the fragment shader changes OR any renderer * state specified with the fragment shader, the whole renderer state * descriptor is dirtied and must be re-emitted.
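* For example, merely toggling the depth test sets PAN_DIRTY_ZS, which
* is part of FRAGMENT_RSD_DIRTY_MASK, so the fused descriptor is
* rebuilt below even though the fragment shader itself is unchanged.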
*/ if (frag && ((dirty & PAN_DIRTY_STAGE_SHADER) || (dirty_3d & FRAGMENT_RSD_DIRTY_MASK))) { batch->rsd[st] = panfrost_emit_frag_shader_meta(batch); } if (frag && (dirty & PAN_DIRTY_STAGE_IMAGE)) { batch->attribs[st] = panfrost_emit_image_attribs(batch, &batch->attrib_bufs[st], st); } #endif } static inline void panfrost_update_state_3d(struct panfrost_batch *batch) { struct panfrost_context *ctx = batch->ctx; unsigned dirty = ctx->dirty; if (dirty & PAN_DIRTY_TLS_SIZE) panfrost_batch_adjust_stack_size(batch); if (dirty & PAN_DIRTY_BLEND) panfrost_set_batch_masks_blend(batch); if (dirty & PAN_DIRTY_ZS) panfrost_set_batch_masks_zs(batch); #if PAN_ARCH >= 9 if ((dirty & (PAN_DIRTY_ZS | PAN_DIRTY_RASTERIZER)) || (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & PAN_DIRTY_STAGE_SHADER)) batch->depth_stencil = panfrost_emit_depth_stencil(batch); if (dirty & PAN_DIRTY_BLEND) batch->blend = panfrost_emit_blend_valhall(batch); if (dirty & PAN_DIRTY_VERTEX) { batch->attribs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_data(batch); batch->attrib_bufs[PIPE_SHADER_VERTEX] = panfrost_emit_vertex_buffers(batch); } #endif } #if PAN_ARCH >= 6 static mali_ptr panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_count) { struct panfrost_device *dev = pan_device(batch->ctx->base.screen); if (!vertex_count) return 0; if (batch->tiler_ctx.bifrost) return batch->tiler_ctx.bifrost; struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP); GENX(pan_emit_tiler_heap)(dev, t.cpu); mali_ptr heap = t.gpu; t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT); GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height, util_framebuffer_get_num_samples(&batch->key), pan_tristate_get(batch->first_provoking_vertex), heap, t.cpu); batch->tiler_ctx.bifrost = t.gpu; return batch->tiler_ctx.bifrost; } #endif /* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler * jobs and Valhall IDVS jobs */ static void panfrost_emit_primitive(struct panfrost_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, mali_ptr indices, bool secondary_shader, void *out) { UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; bool lines = (info->mode == PIPE_PRIM_LINES || info->mode == PIPE_PRIM_LINE_LOOP || info->mode == PIPE_PRIM_LINE_STRIP); pan_pack(out, PRIMITIVE, cfg) { cfg.draw_mode = pan_draw_mode(info->mode); if (panfrost_writes_point_size(ctx)) cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16; #if PAN_ARCH <= 8 /* For line primitives, PRIMITIVE.first_provoking_vertex must * be set to true and the provoking vertex is selected with * DRAW.flat_shading_vertex. */ if (lines) cfg.first_provoking_vertex = true; else cfg.first_provoking_vertex = rast->flatshade_first; if (panfrost_is_implicit_prim_restart(info)) { cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT; } else if (info->primitive_restart) { cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT; cfg.primitive_restart_index = info->restart_index; } cfg.job_task_split = 6; #else struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); cfg.allow_rotating_primitives = !(lines || fs->info.bifrost.uses_flat_shading); cfg.primitive_restart = info->primitive_restart; /* Non-fixed restart indices should have been lowered */ assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info)); #endif cfg.index_count = ctx->indirect_draw ? 
1 : draw->count; cfg.index_type = panfrost_translate_index_size(info->index_size); if (PAN_ARCH >= 9) { /* Base vertex offset on Valhall is used for both * indexed and non-indexed draws, in a simple way for * either. Handle both cases. */ if (cfg.index_type) cfg.base_vertex_offset = draw->index_bias; else cfg.base_vertex_offset = draw->start; /* Indices are moved outside the primitive descriptor * on Valhall, so we don't need to set that here */ } else if (cfg.index_type) { cfg.base_vertex_offset = draw->index_bias - ctx->offset_start; #if PAN_ARCH <= 7 cfg.indices = indices; #endif } #if PAN_ARCH >= 6 cfg.secondary_shader = secondary_shader; #endif } } #if PAN_ARCH >= 9 static mali_ptr panfrost_emit_resources(struct panfrost_batch *batch, enum pipe_shader_type stage, mali_ptr ubos, unsigned ubo_count) { struct panfrost_context *ctx = batch->ctx; struct panfrost_ptr T; unsigned nr_tables = 12; /* Although individual resources need only 16 byte alignment, the * resource table as a whole must be 64-byte aligned. */ T = pan_pool_alloc_aligned(&batch->pool.base, nr_tables * pan_size(RESOURCE), 64); memset(T.cpu, 0, nr_tables * pan_size(RESOURCE)); panfrost_make_resource_table(T, PAN_TABLE_UBO, ubos, ubo_count); panfrost_make_resource_table(T, PAN_TABLE_TEXTURE, batch->textures[stage], ctx->sampler_view_count[stage]); panfrost_make_resource_table(T, PAN_TABLE_SAMPLER, batch->samplers[stage], ctx->sampler_count[stage]); panfrost_make_resource_table(T, PAN_TABLE_IMAGE, batch->images[stage], util_last_bit(ctx->image_mask[stage])); if (stage == PIPE_SHADER_VERTEX) { panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE, batch->attribs[stage], ctx->vertex->num_elements); panfrost_make_resource_table(T, PAN_TABLE_ATTRIBUTE_BUFFER, batch->attrib_bufs[stage], util_last_bit(ctx->vb_mask)); } return T.gpu | nr_tables; } static void panfrost_emit_shader(struct panfrost_batch *batch, struct MALI_SHADER_ENVIRONMENT *cfg, enum pipe_shader_type stage, mali_ptr shader_ptr, mali_ptr thread_storage) { unsigned fau_words = 0, ubo_count = 0; mali_ptr ubos, resources; ubos = panfrost_emit_const_buf(batch, stage, &ubo_count, &cfg->fau, &fau_words); resources = panfrost_emit_resources(batch, stage, ubos, ubo_count); cfg->thread_storage = thread_storage; cfg->shader = shader_ptr; cfg->resources = resources; /* Each entry of FAU is 64-bits */ cfg->fau_count = DIV_ROUND_UP(fau_words, 2); } #endif static void panfrost_emit_draw(void *out, struct panfrost_batch *batch, bool fs_required, enum pipe_prim_type prim, mali_ptr pos, mali_ptr fs_vary, mali_ptr varyings) { struct panfrost_context *ctx = batch->ctx; struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; bool polygon = (prim == PIPE_PRIM_TRIANGLES); pan_pack(out, DRAW, cfg) { /* * From the Gallium documentation, * pipe_rasterizer_state::cull_face "indicates which faces of * polygons to cull". Points and lines are not considered * polygons and should be drawn even if all faces are culled. * The hardware does not take primitive type into account when * culling, so we need to do that check ourselves. 
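* For instance, with cull_face == PIPE_FACE_FRONT_AND_BACK a triangle
* draw is culled entirely, yet a PIPE_PRIM_LINES draw must still be
* rasterized; the polygon guard below implements exactly that.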
cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT); cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK); cfg.front_face_ccw = rast->front_ccw; if (ctx->occlusion_query && ctx->active_queries) { if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER) cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER; else cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE; struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc); cfg.occlusion = rsrc->image.data.bo->ptr.gpu; panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT); } #if PAN_ARCH >= 9 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); cfg.multisample_enable = rast->multisample; cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF; /* Use per-sample shading if required by the API. Also use it when a * blend shader is used with multisampling, as this is handled * by a single ST_TILE in the blend shader with the current * sample ID, requiring per-sample shading. */ cfg.evaluate_per_sample = (rast->multisample && ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader)); cfg.single_sampled_lines = !rast->multisample; cfg.vertex_array.packet = true; cfg.minimum_z = batch->minimum_z; cfg.maximum_z = batch->maximum_z; cfg.depth_stencil = batch->depth_stencil; if (fs_required) { bool has_oq = ctx->occlusion_query && ctx->active_queries; struct pan_earlyzs_state earlyzs = pan_earlyzs_get(fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq, ctx->blend->base.alpha_to_coverage, ctx->depth_stencil->zs_always_passes); cfg.pixel_kill_operation = earlyzs.kill; cfg.zs_update_operation = earlyzs.update; cfg.allow_forward_pixel_to_kill = pan_allow_forward_pixel_to_kill(ctx, fs); cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global; /* Mask of render targets that may be written. A render * target may be written if the fragment shader writes * to it AND it actually exists. If the render target * doesn't actually exist, the blend descriptor will be * OFF so it may be omitted from the mask. * * Only set when there is a fragment shader, since * otherwise no colour updates are possible. */ cfg.render_target_mask = (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask; /* Also use per-sample shading if required by the shader */ cfg.evaluate_per_sample |= fs->info.fs.sample_shading; /* Unlike Bifrost, alpha-to-coverage must be included in * this identically-named flag. Confusing, isn't it? */ cfg.shader_modifies_coverage = fs->info.fs.writes_coverage || fs->info.fs.can_discard || ctx->blend->base.alpha_to_coverage; /* Blend descriptors are only accessed by a BLEND * instruction on Valhall. It follows that if the * fragment shader is omitted, we may also omit the * blend descriptors. */ cfg.blend = batch->blend; cfg.blend_count = MAX2(batch->key.nr_cbufs, 1); cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage; cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0); cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1); panfrost_emit_shader(batch, &cfg.shader, PIPE_SHADER_FRAGMENT, batch->rsd[PIPE_SHADER_FRAGMENT], batch->tls.gpu); } else { /* These operations need to be FORCE to benefit from the * depth-only pass optimizations. */ cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY; cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY; /* No shader and no blend => no shader or blend * reasons to disable FPK. The only FPK-related state * not covered is alpha-to-coverage which we don't set * without blend.
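* (Forward Pixel Kill lets a later opaque fragment quad kill older
* occluded fragment threads; with no fragment shader there are no
* side effects that could make that unsafe.)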
*/ cfg.allow_forward_pixel_to_kill = true; /* No shader => no shader side effects */ cfg.allow_forward_pixel_to_be_killed = true; /* Alpha isn't written so these are vacuous */ cfg.overdraw_alpha0 = true; cfg.overdraw_alpha1 = true; } #else cfg.position = pos; cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT]; cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT]; cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT]; cfg.viewport = batch->viewport; cfg.varyings = fs_vary; cfg.varying_buffers = fs_vary ? varyings : 0; cfg.thread_storage = batch->tls.gpu; /* For all primitives but lines DRAW.flat_shading_vertex must * be set to 0 and the provoking vertex is selected with the * PRIMITIVE.first_provoking_vertex field. */ if (prim == PIPE_PRIM_LINES) { /* The logic is inverted across arches. */ cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5); } pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT); #endif } } #if PAN_ARCH >= 9 static void panfrost_emit_malloc_vertex(struct panfrost_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, mali_ptr indices, bool secondary_shader, void *job) { struct panfrost_context *ctx = batch->ctx; struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT); bool fs_required = panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil); /* Varying shaders only feed data to the fragment shader, so if we omit * the fragment shader, we should omit the varying shader too. */ secondary_shader &= fs_required; panfrost_emit_primitive(ctx, info, draw, 0, secondary_shader, pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE)); pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) { cfg.count = info->instance_count; } pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) { if (secondary_shader) { unsigned v = vs->info.varyings.output_count; unsigned f = fs->info.varyings.input_count; unsigned slots = MAX2(v, f); slots += util_bitcount(fs->key.fixed_varying_mask); unsigned size = slots * 16; /* Assumes 16 byte slots. We could do better. */ cfg.vertex_packet_stride = size + 16; cfg.vertex_attribute_stride = size; } else { /* Hardware requirement for "no varyings" */ cfg.vertex_packet_stride = 16; cfg.vertex_attribute_stride = 0; } } pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) { cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); } STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR)); memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor, pan_size(SCISSOR)); panfrost_emit_primitive_size(ctx, info->mode == PIPE_PRIM_POINTS, 0, pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE)); pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) { cfg.address = indices; } panfrost_emit_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch, fs_required, u_reduced_prim(info->mode), 0, 0, 0); pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) { /* IDVS/points vertex shader */ mali_ptr vs_ptr = batch->rsd[PIPE_SHADER_VERTEX]; /* IDVS/triangle vertex shader */ if (vs_ptr && info->mode != PIPE_PRIM_POINTS) vs_ptr += pan_size(SHADER_PROGRAM); panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, vs_ptr, batch->tls.gpu); } pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) { /* If a varying shader is used, we configure it with the same * state as the position shader for backwards compatible * behaviour with Bifrost. 
This could be optimized. */ if (!secondary_shader) continue; mali_ptr ptr = batch->rsd[PIPE_SHADER_VERTEX] + (2 * pan_size(SHADER_PROGRAM)); panfrost_emit_shader(batch, &cfg, PIPE_SHADER_VERTEX, ptr, batch->tls.gpu); } } #endif #if PAN_ARCH <= 7 static void panfrost_draw_emit_tiler(struct panfrost_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count_bias *draw, void *invocation_template, mali_ptr indices, mali_ptr fs_vary, mali_ptr varyings, mali_ptr pos, mali_ptr psiz, bool secondary_shader, void *job) { struct panfrost_context *ctx = batch->ctx; void *section = pan_section_ptr(job, TILER_JOB, INVOCATION); memcpy(section, invocation_template, pan_size(INVOCATION)); panfrost_emit_primitive(ctx, info, draw, indices, secondary_shader, pan_section_ptr(job, TILER_JOB, PRIMITIVE)); void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE); enum pipe_prim_type prim = u_reduced_prim(info->mode); #if PAN_ARCH >= 6 pan_section_pack(job, TILER_JOB, TILER, cfg) { cfg.address = panfrost_batch_get_bifrost_tiler(batch, ~0); } pan_section_pack(job, TILER_JOB, PADDING, cfg); #endif panfrost_emit_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim, pos, fs_vary, varyings); panfrost_emit_primitive_size(ctx, prim == PIPE_PRIM_POINTS, psiz, prim_size); } #endif static void panfrost_launch_xfb(struct panfrost_batch *batch, const struct pipe_draw_info *info, mali_ptr attribs, mali_ptr attrib_bufs, unsigned count) { struct panfrost_context *ctx = batch->ctx; struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); /* Nothing to do */ if (batch->ctx->streamout.num_targets == 0) return; /* TODO: XFB with index buffers */ //assert(info->index_size == 0); u_trim_pipe_prim(info->mode, &count); if (count == 0) return; struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); struct panfrost_shader_variants v = { .variants = vs->xfb }; vs->xfb->stream_output = vs->stream_output; struct panfrost_shader_variants *saved_vs = ctx->shader[PIPE_SHADER_VERTEX]; mali_ptr saved_rsd = batch->rsd[PIPE_SHADER_VERTEX]; mali_ptr saved_ubo = batch->uniform_buffers[PIPE_SHADER_VERTEX]; mali_ptr saved_push = batch->push_uniforms[PIPE_SHADER_VERTEX]; ctx->shader[PIPE_SHADER_VERTEX] = &v; batch->rsd[PIPE_SHADER_VERTEX] = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_VERTEX); #if PAN_ARCH >= 9 pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) { cfg.workgroup_size_x = 1; cfg.workgroup_size_y = 1; cfg.workgroup_size_z = 1; cfg.workgroup_count_x = count; cfg.workgroup_count_y = info->instance_count; cfg.workgroup_count_z = 1; panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_VERTEX, batch->rsd[PIPE_SHADER_VERTEX], batch->tls.gpu); /* TODO: Indexing. Also, this is a legacy feature... */ cfg.compute.attribute_offset = batch->ctx->offset_start; /* Transform feedback shaders do not use barriers or shared * memory, so we may merge workgroups. 
*/ cfg.allow_merging_workgroups = true; cfg.task_increment = 1; cfg.task_axis = MALI_TASK_AXIS_Z; } #else struct mali_invocation_packed invocation; panfrost_pack_work_groups_compute(&invocation, 1, count, info->instance_count, 1, 1, 1, PAN_ARCH <= 5, false); batch->uniform_buffers[PIPE_SHADER_VERTEX] = panfrost_emit_const_buf(batch, PIPE_SHADER_VERTEX, NULL, &batch->push_uniforms[PIPE_SHADER_VERTEX], NULL); panfrost_draw_emit_vertex(batch, info, &invocation, 0, 0, attribs, attrib_bufs, t.cpu); #endif enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE; #if PAN_ARCH <= 5 job_type = MALI_JOB_TYPE_VERTEX; #endif panfrost_add_job(&batch->pool.base, &batch->scoreboard, job_type, true, false, 0, 0, &t, false); ctx->shader[PIPE_SHADER_VERTEX] = saved_vs; batch->rsd[PIPE_SHADER_VERTEX] = saved_rsd; batch->uniform_buffers[PIPE_SHADER_VERTEX] = saved_ubo; batch->push_uniforms[PIPE_SHADER_VERTEX] = saved_push; } static void panfrost_direct_draw(struct panfrost_batch *batch, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_start_count_bias *draw) { if (!draw->count || !info->instance_count) return; struct panfrost_context *ctx = batch->ctx; /* If we change whether we're drawing points, or whether point sprites * are enabled (specified in the rasterizer), we may need to rebind * shaders accordingly. This implicitly covers the case of rebinding * framebuffers, because all dirty flags are set there. */ if ((ctx->dirty & PAN_DIRTY_RASTERIZER) || ((ctx->active_prim == PIPE_PRIM_POINTS) ^ (info->mode == PIPE_PRIM_POINTS))) { ctx->active_prim = info->mode; panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT); } /* Take into account a negative bias */ ctx->indirect_draw = false; ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0); ctx->instance_count = info->instance_count; ctx->base_vertex = info->index_size ? draw->index_bias : 0; ctx->base_instance = info->start_instance; ctx->active_prim = info->mode; ctx->drawid = drawid_offset; struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); bool idvs = vs->info.vs.idvs; bool secondary_shader = vs->info.vs.secondary_enable; UNUSED struct panfrost_ptr tiler, vertex; if (idvs) { #if PAN_ARCH >= 9 tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB); #elif PAN_ARCH >= 6 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); #else unreachable("IDVS is unsupported on Midgard"); #endif } else { vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); } unsigned vertex_count = ctx->vertex_count; unsigned min_index = 0, max_index = 0; mali_ptr indices = 0; if (info->index_size && PAN_ARCH >= 9) { indices = panfrost_get_index_buffer(batch, info, draw); } else if (info->index_size) { indices = panfrost_get_index_buffer_bounded(batch, info, draw, &min_index, &max_index); /* Use the corresponding values */ vertex_count = max_index - min_index + 1; ctx->offset_start = min_index + draw->index_bias; } else { ctx->offset_start = draw->start; } if (info->instance_count > 1) { unsigned count = vertex_count; /* Index-Driven Vertex Shading requires different instances to * have different cache lines for position results. Each vertex * position is 16 bytes and the Mali cache line is 64 bytes, so * the instance count must be aligned to 4 vertices. 
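* Worked example (illustrative count): 4 positions x 16 bytes = 64
* bytes, exactly one cache line, so a 6-vertex instance is padded to
* ALIGN_POT(6, 4) = 8 before the padded vertex count is computed.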
*/ if (idvs) count = ALIGN_POT(count, 4); ctx->padded_count = panfrost_padded_vertex_count(count); } else ctx->padded_count = vertex_count; panfrost_statistics_record(ctx, info, draw); #if PAN_ARCH <= 7 struct mali_invocation_packed invocation; if (info->instance_count > 1) { panfrost_pack_work_groups_compute(&invocation, 1, vertex_count, info->instance_count, 1, 1, 1, true, false); } else { pan_pack(&invocation, INVOCATION, cfg) { cfg.invocations = MALI_POSITIVE(vertex_count); cfg.size_y_shift = 0; cfg.size_z_shift = 0; cfg.workgroups_x_shift = 0; cfg.workgroups_y_shift = 0; cfg.workgroups_z_shift = 32; cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT; } } /* Emit all sorts of descriptors. */ mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; panfrost_emit_varying_descriptor(batch, ctx->padded_count * ctx->instance_count, &vs_vary, &fs_vary, &varyings, NULL, &pos, &psiz, info->mode == PIPE_PRIM_POINTS); mali_ptr attribs, attrib_bufs; attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); #endif panfrost_update_state_3d(batch); panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT); panfrost_clean_state_3d(ctx); if (vs->xfb) { #if PAN_ARCH >= 9 mali_ptr attribs = 0, attrib_bufs = 0; #endif panfrost_launch_xfb(batch, info, attribs, attrib_bufs, draw->count); } /* Increment transform feedback offsets */ panfrost_update_streamout_offsets(ctx); /* Any side effects must be handled by the XFB shader, so we only need * to run vertex shaders if we need rasterization. */ if (panfrost_batch_skip_rasterization(batch)) return; #if PAN_ARCH >= 9 assert(idvs && "Memory allocated IDVS required on Valhall"); panfrost_emit_malloc_vertex(batch, info, draw, indices, secondary_shader, tiler.cpu); panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_MALLOC_VERTEX, false, false, 0, 0, &tiler, false); #else /* Fire off the draw itself */ panfrost_draw_emit_tiler(batch, info, draw, &invocation, indices, fs_vary, varyings, pos, psiz, secondary_shader, tiler.cpu); if (idvs) { #if PAN_ARCH >= 6 panfrost_draw_emit_vertex_section(batch, vs_vary, varyings, attribs, attrib_bufs, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, 0, &tiler, false); #endif } else { panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); } #endif } #if PAN_GPU_INDIRECTS static void panfrost_indirect_draw(struct panfrost_batch *batch, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draw) { /* Indirect draw count and multi-draw not supported.
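* These cases should never reach this path: panfrost_draw_vbo only
* takes the GPU indirect route behind the PAN_DBG_INDIRECT debug flag
* and otherwise falls back to util_draw_indirect emulation, so the
* assert below holds.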
*/ assert(indirect->draw_count == 1 && !indirect->indirect_draw_count); struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); /* TODO: update statistics (see panfrost_statistics_record()) */ /* TODO: Increment transform feedback offsets */ assert(ctx->streamout.num_targets == 0); ctx->active_prim = info->mode; ctx->drawid = drawid_offset; ctx->indirect_draw = true; struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX); bool idvs = vs->info.vs.idvs; bool secondary_shader = vs->info.vs.secondary_enable; struct panfrost_ptr tiler = { 0 }, vertex = { 0 }; if (idvs) { #if PAN_ARCH >= 6 tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB); #else unreachable("IDVS is unsupported on Midgard"); #endif } else { vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB); tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB); } struct panfrost_bo *index_buf = NULL; if (info->index_size) { assert(!info->has_user_indices); struct panfrost_resource *rsrc = pan_resource(info->index.resource); index_buf = rsrc->image.data.bo; panfrost_batch_read_rsrc(batch, rsrc, PIPE_SHADER_VERTEX); } mali_ptr varyings = 0, vs_vary = 0, fs_vary = 0, pos = 0, psiz = 0; unsigned varying_buf_count; /* We want to create templates, set all count fields to 0 to reflect * that. */ ctx->instance_count = ctx->vertex_count = ctx->padded_count = 0; ctx->offset_start = 0; /* Set the {first,base}_vertex sysvals to NULL. Will be updated if the * vertex shader uses gl_VertexID or gl_BaseVertex. */ ctx->first_vertex_sysval_ptr = 0; ctx->base_vertex_sysval_ptr = 0; ctx->base_instance_sysval_ptr = 0; panfrost_update_state_3d(batch); panfrost_update_shader_state(batch, PIPE_SHADER_VERTEX); panfrost_update_shader_state(batch, PIPE_SHADER_FRAGMENT); panfrost_clean_state_3d(ctx); bool point_coord_replace = (info->mode == PIPE_PRIM_POINTS); panfrost_emit_varying_descriptor(batch, 0, &vs_vary, &fs_vary, &varyings, &varying_buf_count, &pos, &psiz, point_coord_replace); mali_ptr attribs, attrib_bufs; attribs = panfrost_emit_vertex_data(batch, &attrib_bufs); /* Zero-ed invocation, the compute job will update it. */ static struct mali_invocation_packed invocation; /* Fire off the draw itself */ panfrost_draw_emit_tiler(batch, info, draw, &invocation, index_buf ? index_buf->ptr.gpu : 0, fs_vary, varyings, pos, psiz, secondary_shader, tiler.cpu); if (idvs) { #if PAN_ARCH >= 6 panfrost_draw_emit_vertex_section(batch, vs_vary, varyings, attribs, attrib_bufs, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW)); #endif } else { panfrost_draw_emit_vertex(batch, info, &invocation, vs_vary, varyings, attribs, attrib_bufs, vertex.cpu); } /* Add the varying heap BO to the batch if we're allocating varyings. */ if (varyings) { panfrost_batch_add_bo(batch, dev->indirect_draw_shaders.varying_heap, PIPE_SHADER_VERTEX); } assert(indirect->buffer); struct panfrost_resource *draw_buf = pan_resource(indirect->buffer); /* Don't count images: those attributes don't need to be patched. */ unsigned attrib_count = vs->info.attribute_count - util_bitcount(ctx->image_mask[PIPE_SHADER_VERTEX]); panfrost_batch_read_rsrc(batch, draw_buf, PIPE_SHADER_VERTEX); struct pan_indirect_draw_info draw_info = { .last_indirect_draw = batch->indirect_draw_job_id, .draw_buf = draw_buf->image.data.bo->ptr.gpu + indirect->offset, .index_buf = index_buf ? 
index_buf->ptr.gpu : 0, .first_vertex_sysval = ctx->first_vertex_sysval_ptr, .base_vertex_sysval = ctx->base_vertex_sysval_ptr, .base_instance_sysval = ctx->base_instance_sysval_ptr, .vertex_job = vertex.gpu, .tiler_job = tiler.gpu, .attrib_bufs = attrib_bufs, .attribs = attribs, .attrib_count = attrib_count, .varying_bufs = varyings, .index_size = info->index_size, }; if (panfrost_writes_point_size(ctx)) draw_info.flags |= PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE; if (vs->info.vs.writes_point_size) draw_info.flags |= PAN_INDIRECT_DRAW_HAS_PSIZ; if (idvs) draw_info.flags |= PAN_INDIRECT_DRAW_IDVS; if (info->primitive_restart) { draw_info.restart_index = info->restart_index; draw_info.flags |= PAN_INDIRECT_DRAW_PRIMITIVE_RESTART; } batch->indirect_draw_job_id = GENX(panfrost_emit_indirect_draw)(&batch->pool.base, &batch->scoreboard, &draw_info, &batch->indirect_draw_ctx); if (idvs) { panfrost_add_job(&batch->pool.base, &batch->scoreboard, MALI_JOB_TYPE_INDEXED_VERTEX, false, false, 0, 0, &tiler, false); } else { panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); } } #endif static bool panfrost_compatible_batch_state(struct panfrost_batch *batch, bool points) { /* Only applies on Valhall */ if (PAN_ARCH < 9) return true; struct panfrost_context *ctx = batch->ctx; struct pipe_rasterizer_state *rast = &ctx->rasterizer->base; bool coord = (rast->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT); bool first = rast->flatshade_first; /* gl_PointCoord orientation only matters when drawing points, but * provoking vertex doesn't matter for points. */ if (points) return pan_tristate_set(&batch->sprite_coord_origin, coord); else return pan_tristate_set(&batch->first_provoking_vertex, first); } static void panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, unsigned num_draws) { struct panfrost_context *ctx = pan_context(pipe); struct panfrost_device *dev = pan_device(pipe->screen); if (!panfrost_render_condition_check(ctx)) return; /* Emulate indirect draws unless we're using the experimental path */ if ((!(dev->debug & PAN_DBG_INDIRECT) || !PAN_GPU_INDIRECTS) && indirect && indirect->buffer) { assert(num_draws == 1); util_draw_indirect(pipe, info, indirect); return; } /* Do some common setup */ struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); /* Don't add too many jobs to a single batch. Hardware has a hard limit * of 65536 jobs, but we choose a smaller soft limit (arbitrary) to * avoid the risk of timeouts. This might not be a good idea. */ if (unlikely(batch->scoreboard.job_index > 10000)) batch = panfrost_get_fresh_batch_for_fbo(ctx, "Too many draws"); bool points = (info->mode == PIPE_PRIM_POINTS); if (unlikely(!panfrost_compatible_batch_state(batch, points))) { batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change"); ASSERTED bool succ = panfrost_compatible_batch_state(batch, points); assert(succ && "must be able to set state for a fresh batch"); } /* panfrost_batch_skip_rasterization reads * batch->scissor_culls_everything, which is set by * panfrost_emit_viewport, so call that first. 
*/ if (ctx->dirty & (PAN_DIRTY_VIEWPORT | PAN_DIRTY_SCISSOR)) batch->viewport = panfrost_emit_viewport(batch); /* Mark everything dirty when debugging */ if (unlikely(dev->debug & PAN_DBG_DIRTY)) panfrost_dirty_state_all(ctx); /* Conservatively assume draw parameters always change */ ctx->dirty |= PAN_DIRTY_PARAMS | PAN_DIRTY_DRAWID; if (indirect) { assert(num_draws == 1); assert(PAN_GPU_INDIRECTS); #if PAN_GPU_INDIRECTS if (indirect->count_from_stream_output) { struct pipe_draw_start_count_bias tmp_draw = *draws; struct panfrost_streamout_target *so = pan_so_target(indirect->count_from_stream_output); tmp_draw.start = 0; tmp_draw.count = so->offset; tmp_draw.index_bias = 0; panfrost_direct_draw(batch, info, drawid_offset, &tmp_draw); return; } panfrost_indirect_draw(batch, info, drawid_offset, indirect, &draws[0]); return; #endif } struct pipe_draw_info tmp_info = *info; unsigned drawid = drawid_offset; for (unsigned i = 0; i < num_draws; i++) { panfrost_direct_draw(batch, &tmp_info, drawid, &draws[i]); if (tmp_info.increment_draw_id) { ctx->dirty |= PAN_DIRTY_DRAWID; drawid++; } } } /* Launch grid is the compute equivalent of draw_vbo, so in this routine, we * construct the COMPUTE job and some of its payload. */ static void panfrost_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) { struct panfrost_context *ctx = pan_context(pipe); /* XXX - shouldn't be necessary with working memory barriers. Affected * test: KHR-GLES31.core.compute_shader.pipeline-post-xfb */ panfrost_flush_all_batches(ctx, "Launch grid pre-barrier"); struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); struct panfrost_shader_state *cs = &ctx->shader[PIPE_SHADER_COMPUTE]->variants[0]; /* Indirect dispatch can't handle workgroup local storage since that * would require dynamic memory allocation. Bail in this case. 
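* The fallback below maps the three indirect workgroup counts on the
* CPU (a synchronous read), re-invokes panfrost_launch_grid as a
* direct dispatch, and skips the launch entirely when any dimension
* is zero.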
   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split =
         util_logbase2_ceil(info->block[0] + 1) +
         util_logbase2_ceil(info->block[1] + 1) +
         util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = panfrost_emit_image_attribs(batch,
                                                   &cfg.attribute_buffers,
                                                   PIPE_SHADER_COMPUTE);
      cfg.thread_storage = panfrost_emit_shared_memory(batch, info);
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }
#else
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];
      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      panfrost_emit_shader(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                           batch->rsd[PIPE_SHADER_COMPUTE],
                           panfrost_emit_shared_memory(batch, info));

      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_INDIRECTS
   if (info->indirect) {
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.bo->ptr.gpu +
                         info->indirect_offset,
         .num_wg_sysval = {
            batch->num_wg_sysval[0],
            batch->num_wg_sysval[1],
            batch->num_wg_sysval[2],
         },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base,
                                                      &batch->scoreboard,
                                                      &indirect);
   }
#endif

   panfrost_add_job(&batch->pool.base, &batch->scoreboard,
                    MALI_JOB_TYPE_COMPUTE, true, false,
                    indirect_dep, 0, &t, false);
   panfrost_flush_all_batches(ctx, "Launch grid post-barrier");
}

static void *
panfrost_create_rasterizer_state(struct pipe_context *pctx,
                                 const struct pipe_rasterizer_state *cso)
{
   struct panfrost_rasterizer *so = CALLOC_STRUCT(panfrost_rasterizer);
   so->base = *cso;

   /* Guaranteed with the core GL call, so don't expose ARB_polygon_offset */
   assert(cso->offset_clamp == 0.0);

#if PAN_ARCH <= 7
   pan_pack(&so->multisample, MULTISAMPLE_MISC, cfg) {
      cfg.multisample_enable = cso->multisample;
      cfg.fixed_function_near_discard = cso->depth_clip_near;
      cfg.fixed_function_far_discard = cso->depth_clip_far;
      cfg.shader_depth_range_fixed = true;
   }

   pan_pack(&so->stencil_misc, STENCIL_MASK_MISC, cfg) {
      cfg.front_facing_depth_bias = cso->offset_tri;
      cfg.back_facing_depth_bias = cso->offset_tri;
      cfg.single_sampled_lines = !cso->multisample;
   }
#endif

   return so;
}

#if PAN_ARCH >= 9
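/* Instanced attributes fetch one element per instance (or per group of
 * instances when a divisor is set), so the hardware must divide the
 * instance ID by the divisor on every fetch. Power-of-two divisors reduce
 * to a right shift by divisor_r, e.g. a divisor of 8 becomes
 * (instance >> 3). Non-power-of-two divisors instead use a precomputed
 * multiply-and-shift, in the spirit of the standard
 * division-by-invariant-integer trick: panfrost_compute_magic_divisor
 * derives the magic multiplier (divisor_d) and the shift/rounding terms
 * (divisor_r, divisor_e) once at CSO create time, so no per-fetch integer
 * division is needed. (Sketch of the idea; the exact hardware formula is
 * defined by the descriptor XML.)
 */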
/*
 * Given a pipe_vertex_element, pack the corresponding Valhall attribute
 * descriptor. This function is called at CSO create time. Since
 * pipe_vertex_element lacks a stride, the packed attribute descriptor will
 * not be uploaded until draw time.
 */
static void
panfrost_pack_attribute(struct panfrost_device *dev,
                        const struct pipe_vertex_element el,
                        struct mali_attribute_packed *out)
{
   pan_pack(out, ATTRIBUTE, cfg) {
      cfg.table = PAN_TABLE_ATTRIBUTE_BUFFER;
      cfg.frequency = (el.instance_divisor > 0) ?
         MALI_ATTRIBUTE_FREQUENCY_INSTANCE :
         MALI_ATTRIBUTE_FREQUENCY_VERTEX;
      cfg.format = dev->formats[el.src_format].hw;
      cfg.offset = el.src_offset;
      cfg.buffer_index = el.vertex_buffer_index;

      if (el.instance_divisor == 0) {
         /* Per-vertex */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
         cfg.offset_enable = true;
      } else if (util_is_power_of_two_or_zero(el.instance_divisor)) {
         /* Per-instance, POT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
         cfg.divisor_r = __builtin_ctz(el.instance_divisor);
      } else {
         /* Per-instance, NPOT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;

         cfg.divisor_d =
            panfrost_compute_magic_divisor(el.instance_divisor,
                                           &cfg.divisor_r,
                                           &cfg.divisor_e);
      }
   }
}
#endif

static void *
panfrost_create_vertex_elements_state(struct pipe_context *pctx,
                                      unsigned num_elements,
                                      const struct pipe_vertex_element *elements)
{
   struct panfrost_vertex_state *so = CALLOC_STRUCT(panfrost_vertex_state);
   struct panfrost_device *dev = pan_device(pctx->screen);

   so->num_elements = num_elements;
   memcpy(so->pipe, elements, sizeof(*elements) * num_elements);

#if PAN_ARCH >= 9
   for (unsigned i = 0; i < num_elements; ++i)
      panfrost_pack_attribute(dev, elements[i], &so->attributes[i]);
#else
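   /* For example (a hypothetical CSO): two elements both sourced from
    * vertex buffer 0, one per-vertex (divisor 0) and one per-instance
    * (divisor 4), get two distinct attribute buffer descriptors here,
    * because on these GPUs the divisor is a property of the buffer
    * descriptor rather than of the attribute itself. */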
   /* Assign attribute buffers corresponding to the vertex buffers, keyed
    * for a particular divisor since that's how instancing works on Mali */
   for (unsigned i = 0; i < num_elements; ++i) {
      so->element_buffer[i] = pan_assign_vertex_buffer(
            so->buffers, &so->nr_bufs,
            elements[i].vertex_buffer_index,
            elements[i].instance_divisor);
   }

   for (int i = 0; i < num_elements; ++i) {
      enum pipe_format fmt = elements[i].src_format;
      so->formats[i] = dev->formats[fmt].hw;
   }

   /* Let's also prepare vertex builtins */
   so->formats[PAN_VERTEX_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
   so->formats[PAN_INSTANCE_ID] = dev->formats[PIPE_FORMAT_R32_UINT].hw;
#endif

   return so;
}

static inline unsigned
pan_pipe_to_stencil_op(enum pipe_stencil_op in)
{
   switch (in) {
   case PIPE_STENCIL_OP_KEEP:      return MALI_STENCIL_OP_KEEP;
   case PIPE_STENCIL_OP_ZERO:      return MALI_STENCIL_OP_ZERO;
   case PIPE_STENCIL_OP_REPLACE:   return MALI_STENCIL_OP_REPLACE;
   case PIPE_STENCIL_OP_INCR:      return MALI_STENCIL_OP_INCR_SAT;
   case PIPE_STENCIL_OP_DECR:      return MALI_STENCIL_OP_DECR_SAT;
   case PIPE_STENCIL_OP_INCR_WRAP: return MALI_STENCIL_OP_INCR_WRAP;
   case PIPE_STENCIL_OP_DECR_WRAP: return MALI_STENCIL_OP_DECR_WRAP;
   case PIPE_STENCIL_OP_INVERT:    return MALI_STENCIL_OP_INVERT;
   default: unreachable("Invalid stencil op");
   }
}

#if PAN_ARCH <= 7
static inline void
pan_pipe_to_stencil(const struct pipe_stencil_state *in,
                    struct mali_stencil_packed *out)
{
   pan_pack(out, STENCIL, s) {
      s.mask = in->valuemask;
      s.compare_function = (enum mali_func) in->func;
      s.stencil_fail = pan_pipe_to_stencil_op(in->fail_op);
      s.depth_fail = pan_pipe_to_stencil_op(in->zfail_op);
      s.depth_pass = pan_pipe_to_stencil_op(in->zpass_op);
   }
}
#endif

static bool
pipe_zs_always_passes(const struct pipe_depth_stencil_alpha_state *zsa)
{
   if (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS)
      return false;

   if (zsa->stencil[0].enabled && zsa->stencil[0].func != PIPE_FUNC_ALWAYS)
      return false;

   if (zsa->stencil[1].enabled && zsa->stencil[1].func != PIPE_FUNC_ALWAYS)
      return false;

   return true;
}

static void *
panfrost_create_depth_stencil_state(struct pipe_context *pipe,
                                    const struct pipe_depth_stencil_alpha_state *zsa)
{
   struct panfrost_zsa_state *so = CALLOC_STRUCT(panfrost_zsa_state);
   so->base = *zsa;

   const struct pipe_stencil_state front = zsa->stencil[0];
   const struct pipe_stencil_state back =
      zsa->stencil[1].enabled ? zsa->stencil[1] : front;

   enum mali_func depth_func = zsa->depth_enabled ?
      (enum mali_func) zsa->depth_func : MALI_FUNC_ALWAYS;

   /* Normalize (there's no separate alpha test enable on the hardware) */
   if (PAN_ARCH <= 5 && !zsa->alpha_enabled)
      so->base.alpha_func = MALI_FUNC_ALWAYS;

#if PAN_ARCH <= 7
   /* Prepack relevant parts of the Renderer State Descriptor. They will
    * be ORed in at draw-time */
   pan_pack(&so->rsd_depth, MULTISAMPLE_MISC, cfg) {
      cfg.depth_function = depth_func;
      cfg.depth_write_mask = zsa->depth_writemask;
   }

   pan_pack(&so->rsd_stencil, STENCIL_MASK_MISC, cfg) {
      cfg.stencil_enable = front.enabled;
      cfg.stencil_mask_front = front.writemask;
      cfg.stencil_mask_back = back.writemask;
#if PAN_ARCH <= 5
      cfg.alpha_test_compare_function =
         (enum mali_func) so->base.alpha_func;
#endif
   }

   /* Stencil tests have their own words in the RSD */
   pan_pipe_to_stencil(&front, &so->stencil_front);
   pan_pipe_to_stencil(&back, &so->stencil_back);
#else
   pan_pack(&so->desc, DEPTH_STENCIL, cfg) {
      cfg.front_compare_function = (enum mali_func) front.func;
      cfg.front_stencil_fail = pan_pipe_to_stencil_op(front.fail_op);
      cfg.front_depth_fail = pan_pipe_to_stencil_op(front.zfail_op);
      cfg.front_depth_pass = pan_pipe_to_stencil_op(front.zpass_op);

      cfg.back_compare_function = (enum mali_func) back.func;
      cfg.back_stencil_fail = pan_pipe_to_stencil_op(back.fail_op);
      cfg.back_depth_fail = pan_pipe_to_stencil_op(back.zfail_op);
      cfg.back_depth_pass = pan_pipe_to_stencil_op(back.zpass_op);

      cfg.stencil_test_enable = front.enabled;
      cfg.front_write_mask = front.writemask;
      cfg.back_write_mask = back.writemask;
      cfg.front_value_mask = front.valuemask;
      cfg.back_value_mask = back.valuemask;

      cfg.depth_write_enable = zsa->depth_writemask;
      cfg.depth_function = depth_func;
   }
#endif

   so->enabled = zsa->stencil[0].enabled ||
      (zsa->depth_enabled && zsa->depth_func != PIPE_FUNC_ALWAYS);

   so->zs_always_passes = pipe_zs_always_passes(zsa);
   so->writes_zs = util_writes_depth_stencil(zsa);

   /* TODO: Bounds test should be easy */
   assert(!zsa->depth_bounds_test);

   return so;
}
static struct pipe_sampler_view *
panfrost_create_sampler_view(struct pipe_context *pctx,
                             struct pipe_resource *texture,
                             const struct pipe_sampler_view *template)
{
   struct panfrost_context *ctx = pan_context(pctx);
   struct panfrost_sampler_view *so =
      rzalloc(pctx, struct panfrost_sampler_view);

   pan_legalize_afbc_format(ctx, pan_resource(texture), template->format);

   pipe_reference(NULL, &texture->reference);

   so->base = *template;
   so->base.texture = texture;
   so->base.reference.count = 1;
   so->base.context = pctx;

   panfrost_create_sampler_view_bo(so, pctx, texture);

   return (struct pipe_sampler_view *) so;
}

/* A given Gallium blend state can be encoded to the hardware in numerous,
 * dramatically divergent ways due to the interactions of blending with
 * framebuffer formats. Conceptually, there are two modes:
 *
 * - Fixed-function blending (for suitable framebuffer formats, suitable
 *   blend state, and suitable blend constant)
 *
 * - Blend shaders (for everything else)
 *
 * A given Gallium blend configuration will compile to exactly one
 * fixed-function blend state, if it compiles to any, although the constant
 * will vary across runs as that is tracked outside of the Gallium CSO.
 *
 * However, that same blend configuration will compile to many different
 * blend shaders, depending on the framebuffer formats active. The rationale
 * is that blend shaders override not just fixed-function blending but also
 * fixed-function format conversion, so blend shaders are keyed to a
 * particular framebuffer format. As an example, the tilebuffer format is
 * identical for RG16F and RG16UI -- both are simply 32-bit raw pixels -- so
 * both require blend shaders.
 *
 * All of this state is encapsulated in the panfrost_blend_state struct
 * (our subclass of pipe_blend_state).
 */
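/* Concretely (an illustrative example, not an exhaustive rule): classic
 * alpha blending (SRC_ALPHA, ONE_MINUS_SRC_ALPHA) over a UNORM render
 * target can use the single fixed-function encoding precomputed below,
 * while the same CSO bound to an RG16F target needs a blend shader
 * specialized to that format, and enabling a logic op forces the
 * destination to be loaded regardless. */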
/* Create a blend CSO. Essentially, try to compile a fixed-function
 * expression and initialize blend shaders */
static void *
panfrost_create_blend_state(struct pipe_context *pipe,
                            const struct pipe_blend_state *blend)
{
   struct panfrost_blend_state *so = CALLOC_STRUCT(panfrost_blend_state);
   so->base = *blend;

   so->pan.logicop_enable = blend->logicop_enable;
   so->pan.logicop_func = blend->logicop_func;
   so->pan.rt_count = blend->max_rt + 1;

   for (unsigned c = 0; c < so->pan.rt_count; ++c) {
      unsigned g = blend->independent_blend_enable ? c : 0;
      const struct pipe_rt_blend_state pipe = blend->rt[g];

      struct pan_blend_equation equation = {0};

      equation.color_mask = pipe.colormask;
      equation.blend_enable = pipe.blend_enable;

      if (pipe.blend_enable) {
         equation.rgb_func = util_blend_func_to_shader(pipe.rgb_func);
         equation.rgb_src_factor =
            util_blend_factor_to_shader(pipe.rgb_src_factor);
         equation.rgb_invert_src_factor =
            util_blend_factor_is_inverted(pipe.rgb_src_factor);
         equation.rgb_dst_factor =
            util_blend_factor_to_shader(pipe.rgb_dst_factor);
         equation.rgb_invert_dst_factor =
            util_blend_factor_is_inverted(pipe.rgb_dst_factor);
         equation.alpha_func = util_blend_func_to_shader(pipe.alpha_func);
         equation.alpha_src_factor =
            util_blend_factor_to_shader(pipe.alpha_src_factor);
         equation.alpha_invert_src_factor =
            util_blend_factor_is_inverted(pipe.alpha_src_factor);
         equation.alpha_dst_factor =
            util_blend_factor_to_shader(pipe.alpha_dst_factor);
         equation.alpha_invert_dst_factor =
            util_blend_factor_is_inverted(pipe.alpha_dst_factor);
      }

      /* Determine some common properties */
      unsigned constant_mask = pan_blend_constant_mask(equation);
      const bool supports_2src = pan_blend_supports_2src(PAN_ARCH);

      so->info[c] = (struct pan_blend_info) {
         .no_colour = (equation.color_mask == 0),
         .opaque = pan_blend_is_opaque(equation),
         .constant_mask = constant_mask,

         /* TODO: check the dest for the logicop */
         .load_dest = blend->logicop_enable ||
            pan_blend_reads_dest(equation),

         /* Could this possibly be fixed-function? */
         .fixed_function = !blend->logicop_enable &&
            pan_blend_can_fixed_function(equation, supports_2src) &&
            (!constant_mask || pan_blend_supports_constant(PAN_ARCH, c)),

         .alpha_zero_nop = pan_blend_alpha_zero_nop(equation),
         .alpha_one_store = pan_blend_alpha_one_store(equation),
      };

      so->pan.rts[c].equation = equation;

      /* Bifrost needs to know if any render target loads its
       * destination in the hot draw path, so precompute this */
      if (so->info[c].load_dest)
         so->load_dest_mask |= BITFIELD_BIT(c);

      /* Converting equations to Mali style is expensive, do it at
       * CSO create time instead of draw-time */
      if (so->info[c].fixed_function) {
         so->equation[c] = pan_pack_blend(equation);
      }
   }

   return so;
}

static void
prepare_shader(struct panfrost_shader_state *state,
               struct panfrost_pool *pool, bool upload)
{
#if PAN_ARCH <= 7
   void *out = &state->partial_rsd;

   if (upload) {
      struct panfrost_ptr ptr =
         pan_pool_alloc_desc(&pool->base, RENDERER_STATE);

      state->state = panfrost_pool_take_ref(pool, ptr.gpu);
      out = ptr.cpu;
   }

   pan_pack(out, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
   }
#else
   assert(upload);

   /* The address in the shader program descriptor must be non-null, but
    * the entire shader program descriptor may be omitted.
    *
    * See dEQP-GLES31.functional.compute.basic.empty
    */
   if (!state->bin.gpu)
      return;

   bool vs = (state->info.stage == MESA_SHADER_VERTEX);
   bool secondary_enable = (vs && state->info.vs.secondary_enable);
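   /* Valhall vertex shaders are emitted as a small array of shader program
    * descriptors: slot 0 is the generic (or IDVS point) variant, slot 1
    * the IDVS triangle variant (whose binary skips the gl_PointSize
    * write), and slot 2 the optional secondary (varying) shader. Other
    * stages need just one descriptor. */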
   unsigned nr_variants = secondary_enable ? 3 : vs ? 2 : 1;

   struct panfrost_ptr ptr =
      pan_pool_alloc_desc_array(&pool->base, nr_variants, SHADER_PROGRAM);

   state->state = panfrost_pool_take_ref(pool, ptr.gpu);

   /* Generic, or IDVS/points */
   pan_pack(ptr.cpu, SHADER_PROGRAM, cfg) {
      cfg.stage = pan_shader_stage(&state->info);
      cfg.primary_shader = true;
      cfg.register_allocation =
         pan_register_allocation(state->info.work_reg_count);
      cfg.binary = state->bin.gpu;
      cfg.preload.r48_r63 = (state->info.preload >> 48);

      if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
         cfg.requires_helper_threads = state->info.contains_barrier;
   }

   if (!vs)
      return;

   /* IDVS/triangles */
   pan_pack(ptr.cpu + pan_size(SHADER_PROGRAM), SHADER_PROGRAM, cfg) {
      cfg.stage = pan_shader_stage(&state->info);
      cfg.primary_shader = true;
      cfg.register_allocation =
         pan_register_allocation(state->info.work_reg_count);
      cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
      cfg.preload.r48_r63 = (state->info.preload >> 48);
   }

   if (!secondary_enable)
      return;

   /* Secondary (varying) shader */
   pan_pack(ptr.cpu + (pan_size(SHADER_PROGRAM) * 2), SHADER_PROGRAM, cfg) {
      unsigned work_count = state->info.vs.secondary_work_reg_count;

      cfg.stage = pan_shader_stage(&state->info);
      cfg.primary_shader = false;
      cfg.register_allocation = pan_register_allocation(work_count);
      cfg.binary = state->bin.gpu + state->info.vs.secondary_offset;
      cfg.preload.r48_r63 = (state->info.vs.secondary_preload >> 48);
   }
#endif
}

static void
panfrost_get_sample_position(struct pipe_context *context,
                             unsigned sample_count,
                             unsigned sample_index,
                             float *out_value)
{
   panfrost_query_sample_position(panfrost_sample_pattern(sample_count),
                                  sample_index, out_value);
}

static void
screen_destroy(struct pipe_screen *pscreen)
{
   struct panfrost_device *dev = pan_device(pscreen);
   GENX(pan_blitter_cleanup)(dev);

#if PAN_GPU_INDIRECTS
   GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
   GENX(pan_indirect_dispatch_cleanup)(dev);
#endif
}
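/* On a tiler, the previous framebuffer contents are not implicitly
 * available: when a batch doesn't clear, the prior contents must be read
 * back into the tilebuffer at the start of the render pass (the tile-based
 * analogue of LOAD_OP_LOAD). Panfrost implements this with the common
 * preload helpers, which emit blit-style shader work; the exact mechanism
 * varies by GPU generation. */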
static void
preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb,
                        batch->tls.gpu,
                        PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0,
                        NULL);
}

static void
init_batch(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(&batch->pool.base,
                                    PAN_DESC(FRAMEBUFFER),
                                    PAN_DESC(ZS_CRC_EXTENSION),
                                    PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1),
                                                   RENDER_TARGET));

      batch->framebuffer.gpu |= MALI_FBD_TAG_IS_MFBD;
#endif

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;
#endif
}

static void
panfrost_sampler_view_destroy(struct pipe_context *pctx,
                              struct pipe_sampler_view *pview)
{
   struct panfrost_sampler_view *view =
      (struct panfrost_sampler_view *) pview;

   pipe_resource_reference(&pview->texture, NULL);
   panfrost_bo_unreference(view->state.bo);
   ralloc_free(view);
}

static void
context_init(struct pipe_context *pipe)
{
   pipe->draw_vbo = panfrost_draw_vbo;
   pipe->launch_grid = panfrost_launch_grid;

   pipe->create_vertex_elements_state = panfrost_create_vertex_elements_state;
   pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
   pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
   pipe->create_sampler_view = panfrost_create_sampler_view;
   pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
   pipe->create_sampler_state = panfrost_create_sampler_state;
   pipe->create_blend_state = panfrost_create_blend_state;

   pipe->get_sample_position = panfrost_get_sample_position;
}
#if PAN_ARCH <= 5
/* Returns the polygon list's GPU address if available, or otherwise
 * allocates the polygon list. It's perfectly fast to allocate and free the
 * BO directly, since we'll hit the BO cache and this is one-per-batch
 * anyway. */
static mali_ptr
batch_get_polygon_list(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   if (!batch->tiler_ctx.midgard.polygon_list) {
      bool has_draws = batch->scoreboard.first_tiler != NULL;
      unsigned size =
         panfrost_tiler_get_polygon_list_size(dev,
                                              batch->key.width,
                                              batch->key.height,
                                              has_draws);
      size = util_next_power_of_two(size);

      /* Create the BO as invisible if we can. In the non-hierarchical
       * tiler case, we need to write the polygon list manually because
       * there's no WRITE_VALUE job in the chain (maybe we should add
       * one...). */
      bool init_polygon_list =
         !has_draws && dev->model->quirks.no_hierarchical_tiling;

      batch->tiler_ctx.midgard.polygon_list =
         panfrost_batch_create_bo(batch, size,
                                  init_polygon_list ? 0 : PAN_BO_INVISIBLE,
                                  PIPE_SHADER_VERTEX,
                                  "Polygon list");
      panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list,
                            PIPE_SHADER_FRAGMENT);

      if (init_polygon_list) {
         assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu);
         uint32_t *polygon_list_body =
            batch->tiler_ctx.midgard.polygon_list->ptr.cpu +
            MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE;

         /* Magic for Mali T720 */
         polygon_list_body[0] = 0xa0000000;
      }

      batch->tiler_ctx.midgard.disable = !has_draws;
   }

   return batch->tiler_ctx.midgard.polygon_list->ptr.gpu;
}
#endif

static void
init_polygon_list(struct panfrost_batch *batch)
{
#if PAN_ARCH <= 5
   mali_ptr polygon_list = batch_get_polygon_list(batch);
   panfrost_scoreboard_initialize_tiler(&batch->pool.base,
                                        &batch->scoreboard,
                                        polygon_list);
#endif
}

void
GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
{
   struct panfrost_device *dev = &screen->dev;

   screen->vtbl.prepare_shader = prepare_shader;
   screen->vtbl.emit_tls = emit_tls;
   screen->vtbl.emit_fbd = emit_fbd;
   screen->vtbl.emit_fragment_job = emit_fragment_job;
   screen->vtbl.screen_destroy = screen_destroy;
   screen->vtbl.preload = preload;
   screen->vtbl.context_init = context_init;
   screen->vtbl.init_batch = init_batch;
   screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked);
   screen->vtbl.init_polygon_list = init_polygon_list;
   screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options);
   screen->vtbl.compile_shader = GENX(pan_shader_compile);

   GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base,
                          &screen->blitter.desc_pool.base);

#if PAN_GPU_INDIRECTS
   GENX(pan_indirect_dispatch_init)(dev);
   GENX(panfrost_init_indirect_draw_shaders)(dev,
                                             &screen->indirect_draw.bin_pool.base);
#endif
}