/**************************************************************************
 *
 * Copyright 2007 VMware, Inc.
 * Copyright 2012 Marek Olšák
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * This converts the VBO's vertex attribute/array information into
 * Gallium vertex state and binds it.
 *
 * Authors:
 *   Keith Whitwell
 *   Marek Olšák
 */

#include "st_context.h"
#include "st_atom.h"
#include "st_draw.h"
#include "st_program.h"

#include "cso_cache/cso_context.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/u_upload_mgr.h"
#include "util/u_threaded_context.h"
#include "main/bufferobj.h"
#include "main/glformats.h"
#include "main/varray.h"
#include "main/arrayobj.h"

enum st_fill_tc_set_vb {
   FILL_TC_SET_VB_OFF, /* always works */
   FILL_TC_SET_VB_ON,  /* specialized version (faster) */
};

enum st_use_vao_fast_path {
   VAO_FAST_PATH_OFF, /* more complicated version (slower) */
   VAO_FAST_PATH_ON,  /* always works (faster) */
};

enum st_allow_zero_stride_attribs {
   ZERO_STRIDE_ATTRIBS_OFF, /* specialized version (faster) */
   ZERO_STRIDE_ATTRIBS_ON,  /* always works */
};

/* Whether vertex attrib indices are equal to their vertex buffer indices. */
enum st_identity_attrib_mapping {
   IDENTITY_ATTRIB_MAPPING_OFF, /* always works */
   IDENTITY_ATTRIB_MAPPING_ON,  /* specialized version (faster) */
};

enum st_allow_user_buffers {
   USER_BUFFERS_OFF, /* specialized version (faster) */
   USER_BUFFERS_ON,  /* always works */
};

enum st_update_velems {
   UPDATE_VELEMS_OFF, /* specialized version (faster) */
   UPDATE_VELEMS_ON,  /* always works */
};

/* Always inline the non-64bit element code, so that the compiler can see
 * that velements is on the stack.
 */
static void ALWAYS_INLINE
init_velement(struct pipe_vertex_element *velements,
              const struct gl_vertex_format *vformat,
              int src_offset, unsigned src_stride,
              unsigned instance_divisor, int vbo_index,
              bool dual_slot, int idx)
{
   velements[idx].src_offset = src_offset;
   velements[idx].src_stride = src_stride;
   velements[idx].src_format = vformat->_PipeFormat;
   velements[idx].instance_divisor = instance_divisor;
   velements[idx].vertex_buffer_index = vbo_index;
   velements[idx].dual_slot = dual_slot;
   assert(velements[idx].src_format);
}
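/* Example (illustrative values, not from the original code): for a tightly
 * packed 3x float position array with per-vertex stepping, a call like
 *
 *    init_velement(velems, &attrib->Format, 0, 12, 0, 0, false, 0);
 *
 * produces a pipe_vertex_element with src_stride = 12 (3 * sizeof(float)),
 * instance_divisor = 0 (per-vertex data) and src_format taken from the
 * precomputed gl_vertex_format::_PipeFormat, e.g.
 * PIPE_FORMAT_R32G32B32_FLOAT.
 */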
/* ALWAYS_INLINE helps the compiler realize that most of the parameters are
 * on the stack.
 */
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_use_vao_fast_path USE_VAO_FAST_PATH,
         st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
         st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
         st_allow_user_buffers ALLOW_USER_BUFFERS,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
setup_arrays(struct gl_context *ctx,
             const struct gl_vertex_array_object *vao,
             const GLbitfield dual_slot_inputs,
             const GLbitfield inputs_read,
             GLbitfield mask,
             struct cso_velems_state *velements,
             struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   /* Set up enabled vertex arrays. */
   if (USE_VAO_FAST_PATH) {
      const GLubyte *attribute_map = !HAS_IDENTITY_ATTRIB_MAPPING ?
         _mesa_vao_attribute_map[vao->_AttributeMapMode] : NULL;
      struct pipe_context *pipe = ctx->pipe;
      struct tc_buffer_list *next_buffer_list = NULL;

      if (FILL_TC_SET_VB)
         next_buffer_list = tc_get_next_buffer_list(pipe);

      /* Note: I did try to unroll this loop by passing the number of
       * iterations as a template parameter, but it resulted in more
       * overhead.
       */
      while (mask) {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&mask);
         const struct gl_array_attributes *attrib;
         const struct gl_vertex_buffer_binding *binding;

         if (HAS_IDENTITY_ATTRIB_MAPPING) {
            attrib = &vao->VertexAttrib[attr];
            binding = &vao->BufferBinding[attr];
         } else {
            attrib = &vao->VertexAttrib[attribute_map[attr]];
            binding = &vao->BufferBinding[attrib->BufferBindingIndex];
         }
         const unsigned bufidx = (*num_vbuffers)++;

         /* Set the vertex buffer. */
         if (!ALLOW_USER_BUFFERS || binding->BufferObj) {
            assert(binding->BufferObj);
            struct pipe_resource *buf =
               _mesa_get_bufferobj_reference(ctx, binding->BufferObj);
            vbuffer[bufidx].buffer.resource = buf;
            vbuffer[bufidx].is_user_buffer = false;
            vbuffer[bufidx].buffer_offset = binding->Offset +
                                            attrib->RelativeOffset;
            if (FILL_TC_SET_VB)
               tc_track_vertex_buffer(pipe, bufidx, buf, next_buffer_list);
         } else {
            vbuffer[bufidx].buffer.user = attrib->Ptr;
            vbuffer[bufidx].is_user_buffer = true;
            vbuffer[bufidx].buffer_offset = 0;
            assert(!FILL_TC_SET_VB);
         }

         if (!UPDATE_VELEMS)
            continue;

         /* Determine the vertex element index without popcnt
          * if !ALLOW_ZERO_STRIDE_ATTRIBS, which means that we don't need
          * to leave any holes for zero-stride attribs, thus the mapping from
          * vertex elements to vertex buffers is identity.
          */
         unsigned index;

         if (ALLOW_ZERO_STRIDE_ATTRIBS) {
            assert(POPCNT != POPCNT_INVALID);
            index = util_bitcount_fast<POPCNT>(inputs_read &
                                               BITFIELD_MASK(attr));
         } else {
            index = bufidx;
            assert(index == util_bitcount(inputs_read & BITFIELD_MASK(attr)));
         }

         /* Set the vertex element. */
         init_velement(velements->velems, &attrib->Format, 0, binding->Stride,
                       binding->InstanceDivisor, bufidx,
                       dual_slot_inputs & BITFIELD_BIT(attr), index);
      }
      return;
   }

   /* The slow path needs more fields initialized, which is not done if it's
    * disabled.
    */
   assert(!ctx->Const.UseVAOFastPath || vao->SharedAndImmutable);
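   /* How the slow path below differs from the fast path above: instead of
    * visiting one attrib per iteration, it pulls in one
    * gl_vertex_buffer_binding at a time and then walks every attrib sharing
    * that binding, so e.g. an interleaved position+normal array yields one
    * vertex buffer with two vertex elements.
    */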
   /* Require these because we don't use them here and we don't want to
    * generate identical template variants.
    */
   assert(!FILL_TC_SET_VB);
   assert(ALLOW_ZERO_STRIDE_ATTRIBS);
   assert(!HAS_IDENTITY_ATTRIB_MAPPING);
   assert(ALLOW_USER_BUFFERS);
   assert(UPDATE_VELEMS);

   while (mask) {
      /* The attribute index to start pulling a binding */
      const gl_vert_attrib i = (gl_vert_attrib)(ffs(mask) - 1);
      const struct gl_vertex_buffer_binding *const binding =
         _mesa_draw_buffer_binding(vao, i);
      const unsigned bufidx = (*num_vbuffers)++;

      if (binding->BufferObj) {
         /* Set the binding */
         vbuffer[bufidx].buffer.resource =
            _mesa_get_bufferobj_reference(ctx, binding->BufferObj);
         vbuffer[bufidx].is_user_buffer = false;
         vbuffer[bufidx].buffer_offset = _mesa_draw_binding_offset(binding);
      } else {
         /* Set the binding */
         const void *ptr = (const void *)_mesa_draw_binding_offset(binding);
         vbuffer[bufidx].buffer.user = ptr;
         vbuffer[bufidx].is_user_buffer = true;
         vbuffer[bufidx].buffer_offset = 0;
      }

      const GLbitfield boundmask = _mesa_draw_bound_attrib_bits(binding);
      GLbitfield attrmask = mask & boundmask;

      /* Mark those attributes as processed */
      mask &= ~boundmask;
      /* We can assume that we have an array for the binding */
      assert(attrmask);

      /* Walk attributes belonging to the binding */
      do {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&attrmask);
         const struct gl_array_attributes *const attrib =
            _mesa_draw_array_attrib(vao, attr);
         const GLuint off = _mesa_draw_attributes_relative_offset(attrib);

         assert(POPCNT != POPCNT_INVALID);
         init_velement(velements->velems, &attrib->Format, off,
                       binding->Stride, binding->InstanceDivisor, bufidx,
                       dual_slot_inputs & BITFIELD_BIT(attr),
                       util_bitcount_fast<POPCNT>(inputs_read &
                                                  BITFIELD_MASK(attr)));
      } while (attrmask);
   }
}

/* Only used by the select/feedback mode. */
void
st_setup_arrays(struct st_context *st,
                const struct gl_vertex_program *vp,
                const struct st_common_variant *vp_variant,
                struct cso_velems_state *velements,
                struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   struct gl_context *ctx = st->ctx;
   GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);

   setup_arrays<POPCNT_NO, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                ZERO_STRIDE_ATTRIBS_ON, IDENTITY_ATTRIB_MAPPING_OFF,
                USER_BUFFERS_ON, UPDATE_VELEMS_ON>
      (ctx, ctx->Array._DrawVAO, vp->Base.DualSlotInputs,
       vp_variant->vert_attrib_mask,
       vp_variant->vert_attrib_mask & enabled_arrays,
       velements, vbuffer, num_vbuffers);
}

/* ALWAYS_INLINE helps the compiler realize that most of the parameters are
 * on the stack.
 *
 * Return the index of the vertex buffer where current attribs have been
 * uploaded.
 */
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
st_setup_current(struct st_context *st,
                 const GLbitfield dual_slot_inputs,
                 const GLbitfield inputs_read,
                 GLbitfield curmask,
                 struct cso_velems_state *velements,
                 struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   /* Process values that would have been better as uniforms in the
    * application.
    */
   if (curmask) {
      struct gl_context *ctx = st->ctx;

      assert(POPCNT != POPCNT_INVALID);
      unsigned num_attribs = util_bitcount_fast<POPCNT>(curmask);
      unsigned num_dual_attribs = util_bitcount_fast<POPCNT>(curmask &
                                                             dual_slot_inputs);
      /* num_attribs includes num_dual_attribs, so adding num_dual_attribs
       * doubles the size of those attribs.
       */
      unsigned max_size = (num_attribs + num_dual_attribs) * 16;

      const unsigned bufidx = (*num_vbuffers)++;
      vbuffer[bufidx].is_user_buffer = false;
      vbuffer[bufidx].buffer.resource = NULL;
      /* vbuffer[bufidx].buffer_offset is set below */
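      /* Worked example (assumed counts, for illustration only): if curmask
       * covers 3 current attribs and 1 of them is dual-slot, then
       * max_size = (3 + 1) * 16 = 64 bytes, i.e. each attrib reserves
       * 16 bytes and each dual-slot attrib reserves 32.
       */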
      /* Use const_uploader for zero-stride vertex attributes, because
       * it may use a better memory placement than stream_uploader.
       * The reason is that zero-stride attributes can be fetched many
       * times (thousands of times), so a better placement is going to
       * perform better.
       */
      struct u_upload_mgr *uploader = st->can_bind_const_buffer_as_vertex ?
         st->pipe->const_uploader : st->pipe->stream_uploader;
      uint8_t *ptr = NULL;

      u_upload_alloc(uploader, 0, max_size, 16,
                     &vbuffer[bufidx].buffer_offset,
                     &vbuffer[bufidx].buffer.resource, (void**)&ptr);
      uint8_t *cursor = ptr;

      if (FILL_TC_SET_VB) {
         struct pipe_context *pipe = ctx->pipe;
         tc_track_vertex_buffer(pipe, bufidx,
                                vbuffer[bufidx].buffer.resource,
                                tc_get_next_buffer_list(pipe));
      }

      do {
         const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask);
         const struct gl_array_attributes *const attrib =
            _mesa_draw_current_attrib(ctx, attr);
         const unsigned size = attrib->Format._ElementSize;

         /* When the current attribs are set (e.g. via glColor3ub or
          * glVertexAttrib2s), they are always converted to float32 or int32
          * or dual slots being 2x int32, so they are always dword-aligned.
          * glBegin/End behaves in the same way. It's really an internal Mesa
          * inefficiency that is convenient here, which is why this assertion
          * is always true.
          */
         assert(size % 4 == 0); /* assume a hw-friendly alignment */
         memcpy(cursor, attrib->Ptr, size);

         if (UPDATE_VELEMS) {
            init_velement(velements->velems, &attrib->Format, cursor - ptr,
                          0, 0, bufidx,
                          dual_slot_inputs & BITFIELD_BIT(attr),
                          util_bitcount_fast<POPCNT>(inputs_read &
                                                     BITFIELD_MASK(attr)));
         }

         cursor += size;
      } while (curmask);

      /* Always unmap. The uploader might use explicit flushes. */
      u_upload_unmap(uploader);
   }
}

/* Only used by the select/feedback mode. */
void
st_setup_current_user(struct st_context *st,
                      const struct gl_vertex_program *vp,
                      const struct st_common_variant *vp_variant,
                      struct cso_velems_state *velements,
                      struct pipe_vertex_buffer *vbuffer,
                      unsigned *num_vbuffers)
{
   struct gl_context *ctx = st->ctx;
   const GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);
   const GLbitfield inputs_read = vp_variant->vert_attrib_mask;
   const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs;

   /* Process values that would have been better as uniforms in the
    * application.
    */
   GLbitfield curmask = inputs_read & ~enabled_arrays;

   /* For each attribute, make its own user buffer binding. */
   while (curmask) {
      const gl_vert_attrib attr = (gl_vert_attrib)u_bit_scan(&curmask);
      const struct gl_array_attributes *const attrib =
         _mesa_draw_current_attrib(ctx, attr);
      const unsigned bufidx = (*num_vbuffers)++;

      init_velement(velements->velems, &attrib->Format, 0, 0, 0,
                    bufidx, dual_slot_inputs & BITFIELD_BIT(attr),
                    util_bitcount(inputs_read & BITFIELD_MASK(attr)));

      vbuffer[bufidx].is_user_buffer = true;
      vbuffer[bufidx].buffer.user = attrib->Ptr;
      vbuffer[bufidx].buffer_offset = 0;
   }
}
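/* Illustrative sketch (hypothetical call, not from the original code):
 * a caller resolving one specialization of st_update_array_templ below by
 * hand would look like
 *
 *    st_update_array_templ<POPCNT_YES, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_ON,
 *                          ZERO_STRIDE_ATTRIBS_ON,
 *                          IDENTITY_ATTRIB_MAPPING_OFF,
 *                          USER_BUFFERS_ON, UPDATE_VELEMS_ON>
 *       (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
 *
 * In practice the table further below selects the variant from runtime
 * flags instead of hardcoding it.
 */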
template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
         st_use_vao_fast_path USE_VAO_FAST_PATH,
         st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
         st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
         st_allow_user_buffers ALLOW_USER_BUFFERS,
         st_update_velems UPDATE_VELEMS>
void ALWAYS_INLINE
st_update_array_templ(struct st_context *st,
                      const GLbitfield enabled_arrays,
                      const GLbitfield enabled_user_arrays,
                      const GLbitfield nonzero_divisor_arrays)
{
   struct gl_context *ctx = st->ctx;

   /* vertex program validation must be done before this */
   /* _NEW_PROGRAM, ST_NEW_VS_STATE */
   const struct gl_vertex_program *vp =
      (struct gl_vertex_program *)ctx->VertexProgram._Current;
   const struct st_common_variant *vp_variant = st->vp_variant;
   const GLbitfield inputs_read = vp_variant->vert_attrib_mask;
   const GLbitfield dual_slot_inputs = vp->Base.DualSlotInputs;
   const GLbitfield userbuf_arrays =
      ALLOW_USER_BUFFERS ? inputs_read & enabled_user_arrays : 0;
   bool uses_user_vertex_buffers = userbuf_arrays != 0;

   st->draw_needs_minmax_index =
      (userbuf_arrays & ~nonzero_divisor_arrays) != 0;

   struct pipe_vertex_buffer vbuffer_local[PIPE_MAX_ATTRIBS];
   struct pipe_vertex_buffer *vbuffer;
   unsigned num_vbuffers = 0, num_vbuffers_tc;
   struct cso_velems_state velements;

   if (FILL_TC_SET_VB) {
      assert(!uses_user_vertex_buffers);
      assert(POPCNT != POPCNT_INVALID);
      num_vbuffers_tc = util_bitcount_fast<POPCNT>(inputs_read &
                                                   enabled_arrays);
      /* Add up to 1 vertex buffer for zero-stride vertex attribs. */
      num_vbuffers_tc += ALLOW_ZERO_STRIDE_ATTRIBS &&
                         inputs_read & ~enabled_arrays;
      vbuffer = tc_add_set_vertex_buffers_call(st->pipe, num_vbuffers_tc);
   } else {
      vbuffer = vbuffer_local;
   }

   /* ST_NEW_VERTEX_ARRAYS */
   /* Setup arrays */
   setup_arrays<POPCNT, FILL_TC_SET_VB, USE_VAO_FAST_PATH,
                ALLOW_ZERO_STRIDE_ATTRIBS, HAS_IDENTITY_ATTRIB_MAPPING,
                ALLOW_USER_BUFFERS, UPDATE_VELEMS>
      (ctx, ctx->Array._DrawVAO, dual_slot_inputs, inputs_read,
       inputs_read & enabled_arrays, &velements, vbuffer, &num_vbuffers);

   /* _NEW_CURRENT_ATTRIB */
   /* Setup zero-stride attribs. */
   if (ALLOW_ZERO_STRIDE_ATTRIBS) {
      st_setup_current<POPCNT, FILL_TC_SET_VB, UPDATE_VELEMS>
         (st, dual_slot_inputs, inputs_read, inputs_read & ~enabled_arrays,
          &velements, vbuffer, &num_vbuffers);
   } else {
      assert(!(inputs_read & ~enabled_arrays));
   }

   if (FILL_TC_SET_VB)
      assert(num_vbuffers == num_vbuffers_tc);

   if (UPDATE_VELEMS) {
      struct cso_context *cso = st->cso_context;

      velements.count = vp->num_inputs +
                        vp_variant->key.passthrough_edgeflags;

      /* Set vertex buffers and elements. */
      if (FILL_TC_SET_VB) {
         cso_set_vertex_elements(cso, &velements);
      } else {
         cso_set_vertex_buffers_and_elements(cso, &velements, num_vbuffers,
                                             uses_user_vertex_buffers,
                                             vbuffer);
      }
      /* The driver should clear this after it has processed the update. */
      ctx->Array.NewVertexElements = false;
      st->uses_user_vertex_buffers = uses_user_vertex_buffers;
   } else {
      /* Only vertex buffers. */
      if (!FILL_TC_SET_VB)
         cso_set_vertex_buffers(st->cso_context, num_vbuffers, true, vbuffer);

      /* This can change only when we update vertex elements. */
      assert(st->uses_user_vertex_buffers == uses_user_vertex_buffers);
   }
}

typedef void (*update_array_func)(struct st_context *st,
                                  const GLbitfield enabled_arrays,
                                  const GLbitfield enabled_user_attribs,
                                  const GLbitfield nonzero_divisor_attribs);

/* This just initializes the table of all st_update_array variants. */
struct st_update_array_table {
   update_array_func funcs[2][2][2][2][2][2];

   template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
            st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS,
            st_identity_attrib_mapping HAS_IDENTITY_ATTRIB_MAPPING,
            st_allow_user_buffers ALLOW_USER_BUFFERS,
            st_update_velems UPDATE_VELEMS>
   void init_one()
   {
      /* These conditions reduce the number of compiled variants. */
      /* The TC path is only valid without user buffers. */
      constexpr st_fill_tc_set_vb fill_tc_set_vb =
         !ALLOW_USER_BUFFERS ? FILL_TC_SET_VB : FILL_TC_SET_VB_OFF;
      /* POPCNT is unused without zero-stride attribs and without TC. */
      constexpr util_popcnt popcnt =
         !ALLOW_ZERO_STRIDE_ATTRIBS && !fill_tc_set_vb ? POPCNT_INVALID
                                                       : POPCNT;

      funcs[POPCNT][FILL_TC_SET_VB][ALLOW_ZERO_STRIDE_ATTRIBS]
           [HAS_IDENTITY_ATTRIB_MAPPING][ALLOW_USER_BUFFERS][UPDATE_VELEMS] =
         st_update_array_templ<popcnt, fill_tc_set_vb, VAO_FAST_PATH_ON,
                               ALLOW_ZERO_STRIDE_ATTRIBS,
                               HAS_IDENTITY_ATTRIB_MAPPING,
                               ALLOW_USER_BUFFERS, UPDATE_VELEMS>;
   }
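   /* Sizing note: funcs has 2^6 = 64 slots. init_one() above fills exactly
    * one slot, init_last_3_args() below fills 8, and the constructor calls
    * it 8 times, so all 64 slots are populated at startup. Thanks to the
    * constexpr reductions in init_one(), several slots share the same
    * underlying instantiation.
    */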
   /* We have to do this in stages because of the combinatorial explosion of
    * variants.
    */
   template<util_popcnt POPCNT, st_fill_tc_set_vb FILL_TC_SET_VB,
            st_allow_zero_stride_attribs ALLOW_ZERO_STRIDE_ATTRIBS>
   void init_last_3_args()
   {
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_OFF,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_OFF,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_ON,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_OFF, USER_BUFFERS_ON,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_OFF,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_OFF,
               UPDATE_VELEMS_ON>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_ON,
               UPDATE_VELEMS_OFF>();
      init_one<POPCNT, FILL_TC_SET_VB, ALLOW_ZERO_STRIDE_ATTRIBS,
               IDENTITY_ATTRIB_MAPPING_ON, USER_BUFFERS_ON,
               UPDATE_VELEMS_ON>();
   }

   st_update_array_table()
   {
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_NO, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_OFF,
                       ZERO_STRIDE_ATTRIBS_ON>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_OFF>();
      init_last_3_args<POPCNT_YES, FILL_TC_SET_VB_ON,
                       ZERO_STRIDE_ATTRIBS_ON>();
   }
};

static st_update_array_table update_array_table;

template<util_popcnt POPCNT, st_use_vao_fast_path USE_VAO_FAST_PATH>
void ALWAYS_INLINE
st_update_array_impl(struct st_context *st)
{
   struct gl_context *ctx = st->ctx;
   struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
   const GLbitfield enabled_arrays = _mesa_get_enabled_vertex_arrays(ctx);
   GLbitfield enabled_user_arrays;
   GLbitfield nonzero_divisor_arrays;

   assert(vao->_EnabledWithMapMode ==
          _mesa_vao_enable_to_vp_inputs(vao->_AttributeMapMode,
                                        vao->Enabled));

   if (!USE_VAO_FAST_PATH && !vao->SharedAndImmutable)
      _mesa_update_vao_derived_arrays(ctx, vao, false);

   _mesa_get_derived_vao_masks(ctx, enabled_arrays, &enabled_user_arrays,
                               &nonzero_divisor_arrays);

   /* Execute the slow path without using multiple C++ template variants. */
   if (!USE_VAO_FAST_PATH) {
      st_update_array_templ<POPCNT, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                            ZERO_STRIDE_ATTRIBS_ON,
                            IDENTITY_ATTRIB_MAPPING_OFF,
                            USER_BUFFERS_ON, UPDATE_VELEMS_ON>
         (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
      return;
   }

   /* The fast path that selects from multiple C++ template variants. */
   const GLbitfield inputs_read = st->vp_variant->vert_attrib_mask;
   const GLbitfield enabled_arrays_read = inputs_read & enabled_arrays;

   /* Check whether cso_context forwards draws directly to TC. */
   bool fill_tc_set_vbs = st->cso_context->draw_vbo == tc_draw_vbo;
   bool has_zero_stride_attribs = inputs_read & ~enabled_arrays;

   uint32_t non_identity_attrib_mapping =
      vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_IDENTITY ? 0 :
      vao->_AttributeMapMode == ATTRIBUTE_MAP_MODE_POSITION ?
         VERT_BIT_GENERIC0 : VERT_BIT_POS;
   bool has_identity_mapping =
      !(enabled_arrays_read & (vao->NonIdentityBufferAttribMapping |
                               non_identity_attrib_mapping));

   /* has_user_buffers is always false with glthread. */
   bool has_user_buffers = inputs_read & enabled_user_arrays;
   /* Changing from user to non-user buffers and vice versa can switch
    * between cso and u_vbuf, which means that we need to update vertex
    * elements even when they have not changed.
    */
   bool update_velems = ctx->Array.NewVertexElements ||
                        st->uses_user_vertex_buffers != has_user_buffers;

   update_array_table.funcs[POPCNT][fill_tc_set_vbs]
                           [has_zero_stride_attribs][has_identity_mapping]
                           [has_user_buffers][update_velems]
      (st, enabled_arrays, enabled_user_arrays, nonzero_divisor_arrays);
}
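/* Dispatch example (assumed flags, for illustration only): a draw with
 * popcnt support, TC enabled, no zero-stride attribs, an identity attrib
 * mapping, no user buffers and unchanged vertex elements resolves to
 *
 *    update_array_table.funcs[POPCNT_YES][true][false][true][false][false]
 *
 * i.e. the fully specialized FILL_TC_SET_VB_ON / UPDATE_VELEMS_OFF variant
 * of st_update_array_templ.
 */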
/* The default callback that must be present before st_init_update_array
 * selects the driver-dependent variant.
 */
void
st_update_array(struct st_context *st)
{
   unreachable("st_init_update_array not called");
}

void
st_init_update_array(struct st_context *st)
{
   st_update_func_t *func =
      &st->update_functions[ST_NEW_VERTEX_ARRAYS_INDEX];

   if (util_get_cpu_caps()->has_popcnt) {
      if (st->ctx->Const.UseVAOFastPath)
         *func = st_update_array_impl<POPCNT_YES, VAO_FAST_PATH_ON>;
      else
         *func = st_update_array_impl<POPCNT_YES, VAO_FAST_PATH_OFF>;
   } else {
      if (st->ctx->Const.UseVAOFastPath)
         *func = st_update_array_impl<POPCNT_NO, VAO_FAST_PATH_ON>;
      else
         *func = st_update_array_impl<POPCNT_NO, VAO_FAST_PATH_OFF>;
   }
}

struct pipe_vertex_state *
st_create_gallium_vertex_state(struct gl_context *ctx,
                               const struct gl_vertex_array_object *vao,
                               struct gl_buffer_object *indexbuf,
                               uint32_t enabled_arrays)
{
   struct st_context *st = st_context(ctx);
   const GLbitfield inputs_read = enabled_arrays;
   const GLbitfield dual_slot_inputs = 0; /* always zero */
   struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS];
   unsigned num_vbuffers = 0;
   struct cso_velems_state velements;

   /* This should use the slow path because there is only 1 interleaved
    * vertex buffer.
    */
   setup_arrays<POPCNT_NO, FILL_TC_SET_VB_OFF, VAO_FAST_PATH_OFF,
                ZERO_STRIDE_ATTRIBS_ON, IDENTITY_ATTRIB_MAPPING_OFF,
                USER_BUFFERS_ON, UPDATE_VELEMS_ON>
      (ctx, vao, dual_slot_inputs, inputs_read, inputs_read,
       &velements, vbuffer, &num_vbuffers);

   if (num_vbuffers != 1) {
      assert(!"this should never happen with display lists");
      return NULL;
   }

   velements.count = util_bitcount(inputs_read);

   struct pipe_screen *screen = st->screen;
   struct pipe_vertex_state *state =
      screen->create_vertex_state(screen, &vbuffer[0], velements.velems,
                                  velements.count,
                                  indexbuf ? indexbuf->buffer : NULL,
                                  enabled_arrays);

   for (unsigned i = 0; i < num_vbuffers; i++)
      pipe_vertex_buffer_unreference(&vbuffer[i]);

   return state;
}
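/* Usage sketch (hypothetical, not from the original code): display-list
 * compilation would create the state object once,
 *
 *    struct pipe_vertex_state *vs =
 *       st_create_gallium_vertex_state(ctx, vao, indexbuf, enabled_arrays);
 *
 * and later executions could then draw from it without re-running
 * st_update_array, e.g. via the pipe_context::draw_vertex_state hook.
 */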