/**************************************************************************
 *
 * Copyright 2007 VMware, Inc.
 * Copyright 2012 Marek Olšák
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * This converts the VBO's vertex attribute/array information into
 * Gallium vertex state and binds it.
 *
 * Authors:
 *   Keith Whitwell
 *   Marek Olšák
 */

#include "st_context.h"
#include "st_atom.h"
#include "st_draw.h"
#include "st_program.h"

#include "cso_cache/cso_context.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/u_upload_mgr.h"
#include "util/u_threaded_context.h"
#include "main/bufferobj.h"
#include "main/glformats.h"
#include "main/varray.h"
#include "main/arrayobj.h"

enum st_fill_tc_set_vb {
   FILL_TC_SET_VB_OFF, /* always works */
   FILL_TC_SET_VB_ON,  /* specialized version (faster) */
};

enum st_use_vao_fast_path {
   VAO_FAST_PATH_OFF, /* more complicated version (slower) */
   VAO_FAST_PATH_ON,  /* always works (faster) */
};

enum st_allow_zero_stride_attribs {
   ZERO_STRIDE_ATTRIBS_OFF, /* specialized version (faster) */
   ZERO_STRIDE_ATTRIBS_ON,  /* always works */
};

/* Whether vertex attrib indices are equal to their vertex buffer indices. */
enum st_identity_attrib_mapping {
   IDENTITY_ATTRIB_MAPPING_OFF, /* always works */
   IDENTITY_ATTRIB_MAPPING_ON,  /* specialized version (faster) */
};

enum st_allow_user_buffers {
   USER_BUFFERS_OFF, /* specialized version (faster) */
   USER_BUFFERS_ON,  /* always works */
};

enum st_update_velems {
   UPDATE_VELEMS_OFF, /* specialized version (faster) */
   UPDATE_VELEMS_ON,  /* always works */
};

/* Always inline the non-64bit element code, so that the compiler can see
 * that velements is on the stack.
 */
static void ALWAYS_INLINE
init_velement(struct pipe_vertex_element *velements,
              const struct gl_vertex_format *vformat,
              int src_offset, unsigned src_stride,
              unsigned instance_divisor, int vbo_index,
              bool dual_slot, int idx)
{
   velements[idx].src_offset = src_offset;
   velements[idx].src_stride = src_stride;
   velements[idx].src_format = vformat->_PipeFormat;
   velements[idx].instance_divisor = instance_divisor;
   velements[idx].vertex_buffer_index = vbo_index;
   velements[idx].dual_slot = dual_slot;
   assert(velements[idx].src_format);
}
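/* Example (illustrative values, not from the original code): for a tightly
 * packed 3x float position array with per-vertex stepping, a call like
 *
 *    init_velement(velems, &attrib->Format, 0, 12, 0, 0, false, 0);
 *
 * produces a pipe_vertex_element with src_stride = 12 (3 * sizeof(float)),
 * instance_divisor = 0 (per-vertex data) and src_format taken from the
 * precomputed gl_vertex_format::_PipeFormat, e.g.
 * PIPE_FORMAT_R32G32B32_FLOAT.
 */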
/* ALWAYS_INLINE helps the compiler realize that most of the parameters are
 * on the stack.
 */
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_use_vao_fast_path USE_VAO_FAST_PATH,
         st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
         st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
         st_allow_user_buffers ALLOW_USER_BUFFERS,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
setup_arrays(struct gl_context *ctx,
             const struct gl_vertex_array_object *vao,
             const GLbitfield dual_slot_inputs,
             const GLbitfield inputs_read,
             GLbitfield mask,
             struct cso_velems_state *velements,
             struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   /* Set up enabled vertex arrays. */
   if (USE_VAO_FAST_PATH) {
      const GLubyte *attribute_map = !HAS_IDENTITY_ATTRIB_MAPPING ?
         _mesa_vao_attribute_map[vao->_AttributeMapMode] : NULL;
      struct pipe_context *pipe = ctx->pipe;
      struct tc_buffer_list *next_buffer_list = NULL;

      if (FILL_TC_SET_VB)
         next_buffer_list = tc_get_next_buffer_list(pipe);

      /* Note: I did try to unroll this loop by passing the number of
       * iterations as a template parameter, but it resulted in more
       * overhead.
       */
      while (mask) {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&mask);
         const struct gl_array_attributes *attrib;
         const struct gl_vertex_buffer_binding *binding;

         if (HAS_IDENTITY_ATTRIB_MAPPING) {
            attrib = &vao->VertexAttrib[attr];
            binding = &vao->BufferBinding[attr];
         } else {
            attrib = &vao->VertexAttrib[attribute_map[attr]];
            binding = &vao->BufferBinding[attrib->BufferBindingIndex];
         }
         const unsigned bufidx = (*num_vbuffers)++;

         /* Set the vertex buffer. */
         if (!ALLOW_USER_BUFFERS || binding->BufferObj) {
            assert(binding->BufferObj);
            struct pipe_resource *buf =
               _mesa_get_bufferobj_reference(ctx, binding->BufferObj);
            vbuffer[bufidx].buffer.resource = buf;
            vbuffer[bufidx].is_user_buffer = false;
            vbuffer[bufidx].buffer_offset = binding->Offset +
                                            attrib->RelativeOffset;
            if (FILL_TC_SET_VB)
               tc_track_vertex_buffer(pipe, bufidx, buf, next_buffer_list);
         } else {
            vbuffer[bufidx].buffer.user = attrib->Ptr;
            vbuffer[bufidx].is_user_buffer = true;
            vbuffer[bufidx].buffer_offset = 0;
            assert(!FILL_TC_SET_VB);
         }

         if (!UPDATE_VELEMS)
            continue;

         /* Determine the vertex element index without popcnt
          * if !ALLOW_ZERO_STRIDE_ATTRIBS, which means that we don't need
          * to leave any holes for zero-stride attribs, thus the mapping from
          * vertex elements to vertex buffers is identity.
          */
         unsigned index;

         if (ALLOW_ZERO_STRIDE_ATTRIBS) {
            assert(POPCNT != POPCNT_INVALID);
            index = util_bitcount_fast<POPCNT>(inputs_read &
                                               BITFIELD_MASK(attr));
         } else {
            index = bufidx;
            assert(index == util_bitcount(inputs_read & BITFIELD_MASK(attr)));
         }

         /* Set the vertex element. */
         init_velement(velements->velems, &attrib->Format, 0, binding->Stride,
                       binding->InstanceDivisor, bufidx,
                       dual_slot_inputs & BITFIELD_BIT(attr), index);
      }
      return;
   }

   /* The slow path needs more fields initialized, which is not done if it's
    * disabled.
    */
   assert(!ctx->Const.UseVAOFastPath || vao->SharedAndImmutable);
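   /* How the slow path below differs from the fast path above: instead of
    * visiting one attrib per iteration, it pulls in one
    * gl_vertex_buffer_binding at a time and then walks every attrib sharing
    * that binding, so e.g. an interleaved position+normal array yields one
    * vertex buffer with two vertex elements.
    */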
   /* Require these because we don't use them here and we don't want to
    * generate identical template variants.
    */
   assert(!FILL_TC_SET_VB);
   assert(ALLOW_ZERO_STRIDE_ATTRIBS);
   assert(!HAS_IDENTITY_ATTRIB_MAPPING);
   assert(ALLOW_USER_BUFFERS);
   assert(UPDATE_VELEMS);

   while (mask) {
      /* The attribute index to start pulling a binding */
      const gl_vert_attrib i = (gl_vert_attrib)(ffs(mask) - 1);
      const struct gl_vertex_buffer_binding *const binding =
         _mesa_draw_buffer_binding(vao, i);
      const unsigned bufidx = (*num_vbuffers)++;

      if (binding->BufferObj) {
         /* Set the binding */
         vbuffer[bufidx].buffer.resource =
            _mesa_get_bufferobj_reference(ctx, binding->BufferObj);
         vbuffer[bufidx].is_user_buffer = false;
         vbuffer[bufidx].buffer_offset = _mesa_draw_binding_offset(binding);
      } else {
         /* Set the binding */
         const void *ptr = (const void *)_mesa_draw_binding_offset(binding);
         vbuffer[bufidx].buffer.user = ptr;
         vbuffer[bufidx].is_user_buffer = true;
         vbuffer[bufidx].buffer_offset = 0;
      }

      const GLbitfield boundmask = _mesa_draw_bound_attrib_bits(binding);
      GLbitfield attrmask = mask & boundmask;

      /* Mark those attributes as processed */
      mask &= ~boundmask;
      /* We can assume that we have an array for the binding */
      assert(attrmask);

      /* Walk attributes belonging to the binding */
      do {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&attrmask);
         const struct gl_array_attributes *const attrib =
            _mesa_draw_array_attrib(vao, attr);
         const GLuint off = _mesa_draw_attributes_relative_offset(attrib);

         assert(POPCNT != POPCNT_INVALID);
         init_velement(velements->velems, &attrib->Format, off,
                       binding->Stride, binding->InstanceDivisor, bufidx,
                       dual_slot_inputs & BITFIELD_BIT(attr),
                       util_bitcount_fast<POPCNT>(inputs_read &
                                                  BITFIELD_MASK(attr)));
      } while (attrmask);
   }
}

/* Only used by the select/feedback mode. */
void
st_setup_arrays(struct st_context *st,
                const struct gl_vertex_program *vp,
                const struct st_common_variant *vp_variant,
                struct cso_velems_state *velements,
                struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   struct gl_context *ctx = st->ctx;
   GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);

   setup_arrays<POPCNT_NO, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                ZERO_STRIDE_ATTRIBS_ON, IDENTITY_ATTRIB_MAPPING_OFF,
                USER_BUFFERS_ON, UPDATE_VELEMS_ON>
      (ctx, ctx->Array._DrawVAO, vp->Base.DualSlotInputs,
       vp_variant->vert_attrib_mask,
       vp_variant->vert_attrib_mask & enabled_arrays,
       velements, vbuffer, num_vbuffers);
}

/* ALWAYS_INLINE helps the compiler realize that most of the parameters are
 * on the stack.
 *
 * Return the index of the vertex buffer where current attribs have been
 * uploaded.
 */
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
st_setup_current(struct st_context *st,
                 const GLbitfield dual_slot_inputs,
                 const GLbitfield inputs_read,
                 GLbitfield curmask,
                 struct cso_velems_state *velements,
                 struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   /* Process values that would have been better as uniforms in the
    * application.
    */
   if (curmask) {
      struct gl_context *ctx = st->ctx;

      assert(POPCNT != POPCNT_INVALID);
      unsigned num_attribs = util_bitcount_fast<POPCNT>(curmask);
      unsigned num_dual_attribs = util_bitcount_fast<POPCNT>(curmask &
                                                             dual_slot_inputs);
      /* num_attribs includes num_dual_attribs, so adding num_dual_attribs
       * doubles the size of those attribs.
       */
      unsigned max_size = (num_attribs + num_dual_attribs) * 16;

      const unsigned bufidx = (*num_vbuffers)++;
      vbuffer[bufidx].is_user_buffer = false;
      vbuffer[bufidx].buffer.resource = NULL;
      /* vbuffer[bufidx].buffer_offset is set below */
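      /* Worked example (assumed counts, for illustration only): if curmask
       * covers 3 current attribs and 1 of them is dual-slot, then
       * max_size = (3 + 1) * 16 = 64 bytes, i.e. each attrib reserves
       * 16 bytes and each dual-slot attrib reserves 32.
       */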
      /* Use const_uploader for zero-stride vertex attributes, because
       * it may use a better memory placement than stream_uploader.
       * The reason is that zero-stride attributes can be fetched many
       * times (thousands of times), so a better placement is going to
       * perform better.
       */
      struct u_upload_mgr *uploader = st->can_bind_const_buffer_as_vertex ?
         st->pipe->const_uploader : st->pipe->stream_uploader;
      uint8_t *ptr = NULL;

      u_upload_alloc(uploader, 0, max_size, 16,
                     &vbuffer[bufidx].buffer_offset,
                     &vbuffer[bufidx].buffer.resource, (void**)&ptr);
      uint8_t *cursor = ptr;

      if (FILL_TC_SET_VB) {
         struct pipe_context *pipe = ctx->pipe;
         tc_track_vertex_buffer(pipe, bufidx,
                                vbuffer[bufidx].buffer.resource,
                                tc_get_next_buffer_list(pipe));
      }

      do {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask);
         const struct gl_array_attributes *const attrib =
            _mesa_draw_current_attrib(ctx, attr);
         const unsigned size = attrib->Format._ElementSize;

         /* When the current attribs are set (e.g. via glColor3ub or
          * glVertexAttrib2s), they are always converted to float32 or int32
          * or dual slots being 2x int32, so they are always dword-aligned.
          * glBegin/End behaves in the same way. It's really an internal Mesa
          * inefficiency that is convenient here, which is why this assertion
          * is always true.
          */
         assert(size % 4 == 0); /* assume a hw-friendly alignment */
         memcpy(cursor, attrib->Ptr, size);

         if (UPDATE_VELEMS) {
            init_velement(velements->velems, &attrib->Format, cursor - ptr,
                          0, 0, bufidx,
                          dual_slot_inputs & BITFIELD_BIT(attr),
                          util_bitcount_fast<POPCNT>(inputs_read &
                                                     BITFIELD_MASK(attr)));
         }

         cursor += size;
      } while (curmask);

      /* Always unmap. The uploader might use explicit flushes. */
      u_upload_unmap(uploader);
   }
}

/* Only used by the select/feedback mode. */
void
st_setup_current_user(struct st_context *st,
                      const struct gl_vertex_program *vp,
                      const struct st_common_variant *vp_variant,
                      struct cso_velems_state *velements,
                      struct pipe_vertex_buffer *vbuffer,
                      unsigned *num_vbuffers)
{
   struct gl_context *ctx = st->ctx;
   const GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);
   const GLbitfield inputs_read = vp_variant->vert_attrib_mask;
   const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs;

   /* Process values that would have been better as uniforms in the
    * application.
    */
   GLbitfield curmask = inputs_read & ~enabled_arrays;

   /* For each attribute, make its own user buffer binding. */
   while (curmask) {
      const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask);
      const struct gl_array_attributes *const attrib =
         _mesa_draw_current_attrib(ctx, attr);
      const unsigned bufidx = (*num_vbuffers)++;

      init_velement(velements->velems, &attrib->Format, 0, 0, 0,
                    bufidx, dual_slot_inputs & BITFIELD_BIT(attr),
                    util_bitcount(inputs_read & BITFIELD_MASK(attr)));

      vbuffer[bufidx].is_user_buffer = true;
      vbuffer[bufidx].buffer.user = attrib->Ptr;
      vbuffer[bufidx].buffer_offset = 0;
   }
}
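/* Illustrative sketch (hypothetical call, not from the original code):
 * a caller resolving one specialization of st_update_array_templ below by
 * hand would look like
 *
 *    st_update_array_templ<POPCNT_YES, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_ON,
 *                          ZERO_STRIDE_ATTRIBS_ON,
 *                          IDENTITY_ATTRIB_MAPPING_OFF,
 *                          USER_BUFFERS_ON, UPDATE_VELEMS_ON>
 *       (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
 *
 * In practice the table further below selects the variant from runtime
 * flags instead of hardcoding it.
 */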
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_use_vao_fast_path USE_VAO_FAST_PATH,
         st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
         st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
         st_allow_user_buffers ALLOW_USER_BUFFERS,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
st_update_array_templ(struct st_context *st,
                      const GLbitfield enabled_arrays,
                      const GLbitfield enabled_user_arrays,
                      const GLbitfield nonzero_divisor_arrays)
{
   struct gl_context *ctx = st->ctx;

   /* vertex program validation must be done before this */
   /* _NEW_PROGRAM, ST_NEW_VS_STATE */
   const struct gl_vertex_program *vp =
      (struct gl_vertex_program *)ctx->VertexProgram._Current;
   const struct st_common_variant *vp_variant = st->vp_variant;
   const GLbitfield inputs_read = vp_variant->vert_attrib_mask;
   const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs;
   const GLbitfield userbuf_arrays =
      ALLOW_USER_BUFFERS ? inputs_read & enabled_user_arrays : 0;
   bool uses_user_vertex_buffers = userbuf_arrays != 0;

   st->draw_needs_minmax_index =
      (userbuf_arrays & ~nonzero_divisor_arrays) != 0;

   struct pipe_vertex_buffer vbuffer_local[PIPE_MAX_ATTRIBS];
   struct pipe_vertex_buffer *vbuffer;
   unsigned num_vbuffers = 0, num_vbuffers_tc;
   struct cso_velems_state velements;

   if (FILL_TC_SET_VB) {
      assert(!uses_user_vertex_buffers);
      assert(POPCNT != POPCNT_INVALID);
      num_vbuffers_tc = util_bitcount_fast<POPCNT>(inputs_read &
                                                   enabled_arrays);
      /* Add up to 1 vertex buffer for zero-stride vertex attribs. */
      num_vbuffers_tc += ALLOW_ZERO_STRIDE_ATTRIBS &&
                         inputs_read & ~enabled_arrays;
      vbuffer = tc_add_set_vertex_buffers_call(st->pipe, num_vbuffers_tc);
   } else {
      vbuffer = vbuffer_local;
   }

   /* ST_NEW_VERTEX_ARRAYS */
   /* Setup arrays */
   setup_arrays<POPCNT, FILL_TC_SET_VB, USE_VAO_FAST_PATH,
                ALLOW_ZERO_STRIDE_ATTRIBS, HAS_IDENTITY_ATTRIB_MAPPING,
                ALLOW_USER_BUFFERS, UPDATE_VELEMS>
      (ctx, ctx->Array._DrawVAO, dual_slot_inputs, inputs_read,
       inputs_read & enabled_arrays, &velements, vbuffer, &num_vbuffers);

   /* _NEW_CURRENT_ATTRIB */
   /* Setup zero-stride attribs. */
   if (ALLOW_ZERO_STRIDE_ATTRIBS) {
      st_setup_current<POPCNT, FILL_TC_SET_VB, UPDATE_VELEMS>
         (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays,
          &velements, vbuffer, &num_vbuffers);
   } else {
      assert(!(inputs_read & ~enabled_arrays));
   }

   if (FILL_TC_SET_VB)
      assert(num_vbuffers == num_vbuffers_tc);

   if (UPDATE_VELEMS) {
      struct cso_context *cso = st->cso_context;

      velements.count = vp->num_inputs +
                        vp_variant->key.passthrough_edgeflags;

      /* Set vertex buffers and elements. */
      if (FILL_TC_SET_VB) {
         cso_set_vertex_elements(cso, &velements);
      } else {
         cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers,
                                             uses_user_vertex_buffers,
                                             vbuffer);
      }
      /* The driver should clear this after it has processed the update. */
      ctx->Array.NewVertexElements = false;
      st->uses_user_vertex_buffers = uses_user_vertex_buffers;
   } else {
      /* Only vertex buffers. */
      if (!FILL_TC_SET_VB)
         cso_set_vertex_buffers(st->cso_context, num_vbuffers, true, vbuffer);

      /* This can change only when we update vertex elements. */
      assert(st->uses_user_vertex_buffers == uses_user_vertex_buffers);
   }
}

typedef void (*update_array_func)(struct st_context *st,
                                  const GLbitfield enabled_arrays,
                                  const GLbitfield enabled_user_attribs,
                                  const GLbitfield nonzero_divisor_attribs);

/* This just initializes the table of all st_update_array variants. */
struct st_update_array_table {
   update_array_func funcs[2][2][2][2][2][2];

   template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
            st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
            st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
            st_allow_user_buffers ALLOW_USER_BUFFERS,
            st_update_velems UPDATE_VELEMS>
   void init_one()
   {
      /* These conditions reduce the number of compiled variants. */
      /* The TC path is only valid without user buffers. */
      constexpr st_fill_tc_set_vb fill_tc_set_vb =
         !ALLOW_USER_BUFFERS ? FILL_TC_SET_VB : FILL_TC_SET_VB_OFF;
      /* POPCNT is unused without zero-stride attribs and without TC. */
      constexpr util_popcnt popcnt =
         !ALLOW_ZERO_STRIDE_ATTRIBS && !fill_tc_set_vb ? POPCNT_INVALID
                                                       : POPCNT;

      funcs[POPCNT][FILL_TC_SET_VB][ALLOW_ZERO_STRIDE_ATTRIBS]
           [HAS_IDENTITY_ATTRIB_MAPPING][ALLOW_USER_BUFFERS][UPDATE_VELEMS] =
         st_update_array_templ<popcnt, fill_tc_set_vb, VAO_FAST_PATH_ON,
                               ALLOW_ZERO_STRIDE_ATTRIBS,
                               HAS_IDENTITY_ATTRIB_MAPPING,
                               ALLOW_USER_BUFFERS, UPDATE_VELEMS>;
   }
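   /* Sizing note: funcs has 2^6 = 64 slots. init_one() above fills exactly
    * one slot, init_last_3_args() below fills 8, and the constructor calls
    * it 8 times, so all 64 slots are populated at startup. Thanks to the
    * constexpr reductions in init_one(), several slots share the same
    * underlying instantiation.
    */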
   /* We have to do this in stages because of the combinatorial explosion of
    * variants.
    */
   template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
            st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS>
   void init_last_3_args()
   {
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_OFF,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_OFF,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_ON,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_ON,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_OFF,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_OFF,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_ON,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_ON,
               UPDATE_VELEMS_ON>();
   }

   st_update_array_table()
   {
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_ON>();
   }
};

static st_update_array_table update_array_table;

template<util_popcnt POPCNT, st_use_vao_fast_path USE_VAO_FAST_PATH>
void ALWAYS_INLINE
st_update_array_impl(struct st_context *st)
{
   struct gl_context *ctx = st->ctx;
   struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
   const GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);
   GLbitfield enabled_user_arrays;
   GLbitfield nonzero_divisor_arrays;

   assert(vao->_EnabledWithMapMode ==
          _mesa_vao_enable_to_vp_inputs(vao->_AttributeMapMode,
                                        vao->Enabled));

   if (!USE_VAO_FAST_PATH && !vao->SharedAndImmutable)
      _mesa_update_vao_derived_arrays(ctx, vao, false);

   _mesa_get_derived_vao_masks(ctx, enabled_arrays, &enabled_user_arrays,
                               &nonzero_divisor_arrays);

   /* Execute the slow path without using multiple C++ template variants. */
   if (!USE_VAO_FAST_PATH) {
      st_update_array_templ<POPCNT, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                            ZERO_STRIDE_ATTRIBS_ON,
                            IDENTITY_ATTRIB_MAPPING_OFF,
                            USER_BUFFERS_ON, UPDATE_VELEMS_ON>
         (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
      return;
   }

   /* The fast path that selects from multiple C++ template variants. */
   const GLbitfield inputs_read = st->vp_variant->vert_attrib_mask;
   const GLbitfield enabled_arrays_read = inputs_read & enabled_arrays;

   /* Check whether cso_context forwards draws directly to TC. */
   bool fill_tc_set_vbs = st->cso_context->draw_vbo == tc_draw_vbo;
   bool has_zero_stride_attribs = inputs_read & ~enabled_arrays;

   uint32_t non_identity_attrib_mapping =
      vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_IDENTITY ? 0 :
      vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_POSITION ?
         VERT_BIT_GENERIC0 : VERT_BIT_POS;
   bool has_identity_mapping =
      !(enabled_arrays_read & (vao->NonIdentityBufferAttribMapping |
                               non_identity_attrib_mapping));

   /* has_user_buffers is always false with glthread. */
   bool has_user_buffers = inputs_read & enabled_user_arrays;
   /* Changing from user to non-user buffers and vice versa can switch
    * between cso and u_vbuf, which means that we need to update vertex
    * elements even when they have not changed.
    */
   bool update_velems = ctx->Array.NewVertexElements ||
                        st->uses_user_vertex_buffers != has_user_buffers;

   update_array_table.funcs[POPCNT][fill_tc_set_vbs]
                           [has_zero_stride_attribs][has_identity_mapping]
                           [has_user_buffers][update_velems]
      (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
}
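/* Dispatch example (assumed flags, for illustration only): a draw with
 * popcnt support, TC enabled, no zero-stride attribs, an identity attrib
 * mapping, no user buffers and unchanged vertex elements resolves to
 *
 *    update_array_table.funcs[POPCNT_YES][true][false][true][false][false]
 *
 * i.e. the fully specialized FILL_TC_SET_VB_ON / UPDATE_VELEMS_OFF variant
 * of st_update_array_templ.
 */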
/* The default callback that must be present before st_init_update_array
 * selects the driver-dependent variant.
 */
void
st_update_array(struct st_context *st)
{
   unreachable("st_init_update_array not called");
}

void
st_init_update_array(struct st_context *st)
{
   st_update_func_t *func =
      &st->update_functions[ST_NEW_VERTEX_ARRAYS_INDEX];

   if (util_get_cpu_caps()->has_popcnt) {
      if (st->ctx->Const.UseVAOFastPath)
         *func = st_update_array_impl<POPCNT_YES, VAO_FAST_PATH_ON>;
      else
         *func = st_update_array_impl<POPCNT_YES, VAO_FAST_PATH_OFF>;
   } else {
      if (st->ctx->Const.UseVAOFastPath)
         *func = st_update_array_impl<POPCNT_NO, VAO_FAST_PATH_ON>;
      else
         *func = st_update_array_impl<POPCNT_NO, VAO_FAST_PATH_OFF>;
   }
}

struct pipe_vertex_state *
st_create_gallium_vertex_state(struct gl_context *ctx,
                               const struct gl_vertex_array_object *vao,
                               struct gl_buffer_object *indexbuf,
                               uint32_t enabled_arrays)
{
   struct st_context *st = st_context(ctx);
   const GLbitfield inputs_read = enabled_arrays;
   const GLbitfield dual_slot_inputs = 0; /* always zero */
   struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS];
   unsigned num_vbuffers = 0;
   struct cso_velems_state velements;

   /* This should use the slow path because there is only 1 interleaved
    * vertex buffer.
    */
   setup_arrays<POPCNT_NO, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                ZERO_STRIDE_ATTRIBS_ON, IDENTITY_ATTRIB_MAPPING_OFF,
                USER_BUFFERS_ON, UPDATE_VELEMS_ON>
      (ctx, vao, dual_slot_inputs, inputs_read, inputs_read,
       &velements, vbuffer, &num_vbuffers);

   if (num_vbuffers != 1) {
      assert(!"this should never happen with display lists");
      return NULL;
   }

   velements.count = util_bitcount(inputs_read);

   struct pipe_screen *screen = st->screen;
   struct pipe_vertex_state *state =
      screen->create_vertex_state(screen, &vbuffer[0], velements.velems,
                                  velements.count,
                                  indexbuf ? indexbuf->buffer : NULL,
                                  enabled_arrays);

   for (unsigned i = 0; i < num_vbuffers; i++)
      pipe_vertex_buffer_unreference(&vbuffer[i]);

   return state;
}
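/* Usage sketch (hypothetical, not from the original code): display-list
 * compilation would create the state object once,
 *
 *    struct pipe_vertex_state *vs =
 *       st_create_gallium_vertex_state(ctx, vao, indexbuf, enabled_arrays);
 *
 * and later executions could then draw from it without re-running
 * st_update_array, e.g. via the pipe_context::draw_vertex_state hook.
 */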