/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */

#include <stdio.h>

#include "main/glheader.h"
#include "main/arrayobj.h"
#include "main/bufferobj.h"
#include "main/condrender.h"
#include "main/context.h"

#include "main/mtypes.h"
#include "main/macros.h"
#include "main/enums.h"
#include "main/varray.h"
#include "util/half_float.h"

#include "t_context.h"
#include "t_rebase.h"
#include "tnl.h"


/**
 * Allocate \p bytes of scratch memory with malloc() and append the
 * pointer to the TNL context's block list, so that free_space() can
 * release it later.
 *
 * The malloc() result is recorded and returned without being checked.
 */
static GLubyte *get_space(struct gl_context *ctx, GLuint bytes)
{
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   GLubyte *mem = malloc(bytes);

   tnl->block[tnl->nr_blocks] = mem;
   tnl->nr_blocks++;

   return mem;
}


/**
 * Release every scratch block recorded by get_space() and empty the
 * TNL context's block list.
 */
static void free_space(struct gl_context *ctx)
{
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   GLuint idx = 0;

   while (idx < tnl->nr_blocks) {
      free(tnl->block[idx]);
      idx++;
   }

   tnl->nr_blocks = 0;
}


/* Convert the incoming array to GLfloats, writing 'sz' components for
 * each of 'count' vertices.  When attrib->Format.Normalized is set the
 * per-component conversion is MACRO(value), otherwise it is a plain
 * cast to GLfloat.
 *
 * The macro reads 'attrib', 'binding', 'count' and 'sz' from the
 * enclosing scope and advances the enclosing 'ptr' (by binding->Stride
 * per vertex) and 'fptr' (by one per component written).
 */
#define CONVERT( TYPE, MACRO ) do {                                   \
   GLuint vert, comp;                                                 \
   if (!attrib->Format.Normalized) {                                  \
      for (vert = 0; vert < count; vert++, ptr += binding->Stride) {  \
         const TYPE *src = (TYPE *)ptr;                               \
         for (comp = 0; comp < sz; comp++)                            \
            *fptr++ = (GLfloat)(src[comp]);                           \
      }                                                               \
   } else {                                                           \
      for (vert = 0; vert < count; vert++, ptr += binding->Stride) {  \
         const TYPE *src = (TYPE *)ptr;                               \
         for (comp = 0; comp < sz; comp++)                            \
            *fptr++ = MACRO(src[comp]);                               \
      }                                                               \
   }                                                                  \
} while (0)


/**
 * Convert an array of BGRA/GLubyte[4] values to RGBA/float[4].
 *
 * Asserts that the attribute is normalized and has four components.
 *
 * \param binding  supplies the byte stride between consecutive inputs
 * \param attrib   attribute format description (only checked by asserts)
 * \param ptr      input/ubyte array
 * \param fptr     output/float array, written tightly packed
 * \param count    number of elements to convert
 */
static void
convert_bgra_to_float(const struct gl_vertex_buffer_binding *binding,
                      const struct gl_array_attributes *attrib,
                      const GLubyte *ptr, GLfloat *fptr,
                      GLuint count)
{
   assert(attrib->Format.Normalized);
   assert(attrib->Format.Size == 4);

   for (GLuint v = 0; v < count; v++, ptr += binding->Stride, fptr += 4) {
      /* source bytes are stored in B, G, R, A order */
      fptr[0] = UBYTE_TO_FLOAT(ptr[2]);  /* red */
      fptr[1] = UBYTE_TO_FLOAT(ptr[1]);  /* green */
      fptr[2] = UBYTE_TO_FLOAT(ptr[0]);  /* blue */
      fptr[3] = UBYTE_TO_FLOAT(ptr[3]);  /* alpha */
   }
}

/**
 * Convert an array of half-float vectors to tightly packed GLfloats.
 *
 * \param binding  supplies the byte stride between consecutive inputs
 * \param ptr      input/half-float array
 * \param fptr     output/float array
 * \param count    number of vertices to convert
 * \param sz       number of components per vertex
 */
static void
convert_half_to_float(const struct gl_vertex_buffer_binding *binding,
                      const GLubyte *ptr, GLfloat *fptr,
                      GLuint count, GLuint sz)
{
   for (GLuint v = 0; v < count; v++) {
      GLhalfARB *src = (GLhalfARB *)ptr;

      for (GLuint c = 0; c < sz; c++) {
         fptr[c] = _mesa_half_to_float(src[c]);
      }

      fptr += sz;
      ptr += binding->Stride;
   }
}

/**
 * \brief Convert fixed-point to floating-point.
 *
 * In OpenGL, a fixed-point number is a "signed 2's complement 16.16 scaled
 * integer" (Table 2.2 of the OpenGL ES 2.0 spec).
 *
 * If the buffer has the \c normalized flag set, the formula
 *     \code normalize(x) := (2*x + 1) / (2^16 - 1) \endcode
 * is used to map the fixed-point numbers into the range [-1, 1].
 * Otherwise each value is simply divided by 2^16.
 *
 * \param binding  supplies the byte stride between consecutive inputs
 * \param attrib   supplies the component count and the normalized flag
 * \param ptr      input/GLfixed array
 * \param fptr     output/float array, written tightly packed
 * \param count    number of vertices to convert
 */
static void
convert_fixed_to_float(const struct gl_vertex_buffer_binding *binding,
                       const struct gl_array_attributes *attrib,
                       const GLubyte *ptr, GLfloat *fptr,
                       GLuint count)
{
   GLuint i;
   GLint j;
   const GLint size = attrib->Format.Size;

   if (attrib->Format.Normalized) {
      for (i = 0; i < count; ++i) {
         const GLfixed *in = (GLfixed *) ptr;
         for (j = 0; j < size; ++j) {
            /* Widen before doubling: in 32-bit signed arithmetic
             * 2 * x + 1 overflows (undefined behaviour) once |x| >= 2^30.
             * The result is unchanged for every input that did not
             * overflow before.
             */
            const long long numerator = 2 * (long long) in[j] + 1;
            *fptr++ = (GLfloat) numerator / (GLfloat) ((1 << 16) - 1);
         }
         ptr += binding->Stride;
      }
   } else {
      for (i = 0; i < count; ++i) {
         const GLfixed *in = (GLfixed *) ptr;
         for (j = 0; j < size; ++j) {
            *fptr++ = in[j] / (GLfloat) (1 << 16);
         }
         ptr += binding->Stride;
      }
   }
}

/* Import one vertex attribute array into the TNL vertex buffer.
 *
 * Arrays that are not already GL_FLOAT are converted into temporary
 * float storage obtained from get_space(); GL_FLOAT arrays are
 * referenced in place with the binding's stride.  The result is
 * described by tnl->tmp_inputs[attr], which VB->AttribPtr[attr] is
 * pointed at.
 */
static void _tnl_import_array(struct gl_context *ctx,
                              GLuint attr,
                              GLuint count,
                              const struct gl_vertex_buffer_binding *binding,
                              const struct gl_array_attributes *attrib,
                              const GLubyte *ptr)
{
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   GLvector4f *vec = &tnl->tmp_inputs[attr];
   GLuint stride = binding->Stride;
   GLuint flags;

   if (attrib->Format.Type != GL_FLOAT) {
      const GLuint sz = attrib->Format.Size;
      GLubyte *buf = get_space(ctx, count * sz * sizeof(GLfloat));
      GLfloat *fptr = (GLfloat *)buf;

      /* Note: the CONVERT macro consumes ptr/fptr/sz/count/binding/attrib
       * from this scope.
       */
      switch (attrib->Format.Type) {
      case GL_BYTE:
         CONVERT(GLbyte, BYTE_TO_FLOAT);
         break;
      case GL_UNSIGNED_BYTE:
         if (attrib->Format.Format != GL_BGRA) {
            CONVERT(GLubyte, UBYTE_TO_FLOAT);
         }
         else {
            /* See GL_EXT_vertex_array_bgra */
            convert_bgra_to_float(binding, attrib, ptr, fptr, count);
         }
         break;
      case GL_SHORT:
         CONVERT(GLshort, SHORT_TO_FLOAT);
         break;
      case GL_UNSIGNED_SHORT:
         CONVERT(GLushort, USHORT_TO_FLOAT);
         break;
      case GL_INT:
         CONVERT(GLint, INT_TO_FLOAT);
         break;
      case GL_UNSIGNED_INT:
         CONVERT(GLuint, UINT_TO_FLOAT);
         break;
      case GL_DOUBLE:
         CONVERT(GLdouble, (GLfloat));
         break;
      case GL_HALF_FLOAT:
         convert_half_to_float(binding, ptr, fptr, count, sz);
         break;
      case GL_FIXED:
         convert_fixed_to_float(binding, attrib, ptr, fptr, count);
         break;
      default:
         unreachable("Invalid type.");
      }

      /* From here on the attribute lives in the packed float copy. */
      stride = sz * sizeof(GLfloat);
      ptr = buf;
   }

   /* This should die, but so should the whole GLvector4f concept:
    */
   flags = ((1<<attrib->Format.Size)-1) | VEC_NOT_WRITEABLE;
   if (stride != 4*sizeof(GLfloat))
      flags |= VEC_BAD_STRIDE;

   tnl->vb.AttribPtr[attr] = vec;
   vec->data = (GLfloat (*)[4])ptr;
   vec->start = (GLfloat *)ptr;
   vec->count = count;
   vec->stride = stride;
   vec->size = attrib->Format.Size;
   vec->flags = flags;
   vec->storage = NULL;
}

#define CLIPVERTS  ((6 + MAX_CLIP_PLANES) * 2)


/**
 * Build a packed GLboolean edge-flag array from a float input vector.
 *
 * Each output flag is true exactly when the first float of the
 * corresponding input element equals 1.0.  The array comes from
 * get_space() and is allocated with CLIPVERTS extra entries beyond
 * \p count; only the first \p count entries are written here.
 */
static GLboolean *_tnl_import_edgeflag(struct gl_context *ctx,
                                       const GLvector4f *input,
                                       GLuint count)
{
   GLboolean *flags = (GLboolean *)get_space(ctx, count + CLIPVERTS);
   const GLubyte *src = (const GLubyte *)input->data;
   const GLuint step = input->stride;
   GLuint n = 0;

   while (n < count) {
      flags[n] = ((GLfloat *)src)[0] == 1.0F;
      src += step;
      n++;
   }

   return flags;
}


/**
 * Import all VERT_ATTRIB_MAX vertex arrays into the TNL vertex buffer.
 *
 * Buffer objects that do not yet have a MAP_INTERNAL mapping are mapped
 * for reading and appended to \p bo (incrementing \p *nr_bo) so the
 * caller can unmap them afterwards.
 *
 * \param inputs  array of VERT_ATTRIB_MAX vertex array descriptions
 * \param count   number of vertices to import per array
 * \param bo      list receiving the buffer objects mapped here
 * \param nr_bo   in/out number of entries used in \p bo
 */
static void bind_inputs(struct gl_context *ctx,
                        const struct tnl_vertex_array *inputs,
                        GLint count,
                        struct gl_buffer_object **bo,
                        GLuint *nr_bo)
{
   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
   GLboolean unfilled;

   /* Map all the VBOs
    */
   for (unsigned i = 0; i < VERT_ATTRIB_MAX; i++) {
      const struct gl_vertex_buffer_binding *binding = inputs[i].BufferBinding;
      const struct gl_array_attributes *attrib = inputs[i].VertexAttrib;
      const void *ptr;

      if (!binding->BufferObj) {
         /* no buffer object: use the attribute's own pointer */
         ptr = attrib->Ptr;
      } else {
         if (!binding->BufferObj->Mappings[MAP_INTERNAL].Pointer) {
            /* remember the buffer so the caller can unmap it later */
            bo[*nr_bo] = binding->BufferObj;
            (*nr_bo)++;
            ctx->Driver.MapBufferRange(ctx, 0, binding->BufferObj->Size,
                                       GL_MAP_READ_BIT,
                                       binding->BufferObj,
                                       MAP_INTERNAL);

            assert(binding->BufferObj->Mappings[MAP_INTERNAL].Pointer);
         }

         ptr = ADD_POINTERS(binding->BufferObj->Mappings[MAP_INTERNAL].Pointer,
                            binding->Offset + attrib->RelativeOffset);
      }

      /* Just make sure the array is floating point, otherwise convert to
       * temporary storage.
       *
       * XXX: remove the GLvector4f type at some stage and just use
       * client arrays.
       */
      _tnl_import_array(ctx, i, count, binding, attrib, ptr);
   }

   /* These should perhaps be part of _TNL_ATTRIB_* */
   VB->BackfaceSecondaryColorPtr = NULL;
   VB->BackfaceIndexPtr = NULL;
   VB->BackfaceColorPtr = NULL;

   /* We process only the vertices between min & max index:
    */
   VB->Count = count;

   /* Clipping and drawing code still requires this to be a packed
    * array of ubytes which can be written into.  TODO: Fix and
    * remove.
    */
   unfilled = ctx->Polygon.FrontMode != GL_FILL ||
              ctx->Polygon.BackMode != GL_FILL;

   if (!unfilled) {
      /* the data previously pointed to by EdgeFlag may have been freed */
      VB->EdgeFlag = NULL;
   } else {
      VB->EdgeFlag = _tnl_import_edgeflag(ctx,
                                          VB->AttribPtr[_TNL_ATTRIB_EDGEFLAG],
                                          VB->Count);
   }
}


/* Make VB->Elts point at GLuint indices for the current draw.
 *
 * With no index buffer, VB->Elts is set to NULL.  If the indices are
 * already 32 bits wide (index_size_shift == 2) and the first bound
 * primitive has a zero basevertex, the source indices are used in
 * place.  Otherwise they are widened to GLuint, with the first
 * primitive's basevertex added, into storage from get_space().
 *
 * An index buffer object that is not yet mapped is mapped here and
 * appended to 'bo' (incrementing *nr_bo) so the caller can unmap it.
 */
static void bind_indices(struct gl_context *ctx,
                         const struct _mesa_index_buffer *ib,
                         struct gl_buffer_object **bo,
                         GLuint *nr_bo)
{
   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
   const void *src;
   GLuint *out;
   GLuint k;

   if (!ib) {
      VB->Elts = NULL;
      return;
   }

   if (!ib->obj) {
      /* user-space elements */
      src = ib->ptr;
   } else if (_mesa_bufferobj_mapped(ib->obj, MAP_INTERNAL)) {
      /* buffer already mapped: ib->ptr is an offset into the mapping */
      src = ADD_POINTERS(ib->obj->Mappings[MAP_INTERNAL].Pointer, ib->ptr);
   } else {
      /* if the buffer object isn't mapped yet, map it now */
      bo[*nr_bo] = ib->obj;
      (*nr_bo)++;
      src = ctx->Driver.MapBufferRange(ctx, (GLsizeiptr) ib->ptr,
                                       ib->count << ib->index_size_shift,
                                       GL_MAP_READ_BIT, ib->obj,
                                       MAP_INTERNAL);
      assert(ib->obj->Mappings[MAP_INTERNAL].Pointer);
   }

   if (ib->index_size_shift == 2 && VB->Primitive[0].basevertex == 0) {
      /* nothing to translate */
      VB->Elts = (GLuint *) src;
      return;
   }

   out = (GLuint *)get_space(ctx, ib->count * sizeof(GLuint));
   VB->Elts = out;

   switch (ib->index_size_shift) {
   case 2: {
      const GLuint *in = (GLuint *)src;
      for (k = 0; k < ib->count; k++)
         out[k] = (GLuint)in[k] + VB->Primitive[0].basevertex;
      break;
   }
   case 1: {
      const GLushort *in = (GLushort *)src;
      for (k = 0; k < ib->count; k++)
         out[k] = (GLuint)in[k] + VB->Primitive[0].basevertex;
      break;
   }
   default: {
      const GLubyte *in = (GLubyte *)src;
      for (k = 0; k < ib->count; k++)
         out[k] = (GLuint)in[k] + VB->Primitive[0].basevertex;
      break;
   }
   }
}

/**
 * Record the primitive list and its length in the TNL vertex buffer.
 */
static void bind_prims(struct gl_context *ctx,
                       const struct _mesa_prim *prim,
                       GLuint nr_prims)
{
   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;

   VB->PrimitiveCount = nr_prims;
   VB->Primitive = prim;
}

/**
 * Unmap the MAP_INTERNAL mapping of each of the first \p nr_bo buffer
 * objects in \p bo.
 */
static void unmap_vbos(struct gl_context *ctx,
                       struct gl_buffer_object **bo,
                       GLuint nr_bo)
{
   GLuint idx = 0;

   while (idx < nr_bo) {
      ctx->Driver.UnmapBuffer(ctx, bo[idx], MAP_INTERNAL);
      idx++;
   }
}


/* This is the main workhorse doing all the rendering work.
 *
 * If the index bounds are not known they are computed first.  Nothing
 * is drawn when conditional rendering says so.  Then one of three
 * paths is taken:
 *   - min_index != 0: the draw is handed to t_rebase_prims(), which
 *     calls back into this function;
 *   - max_index plus the largest basevertex exceeds the vertex buffer
 *     limit: the draw is handed to _tnl_split_prims(), which calls
 *     back into this function;
 *   - otherwise: for each run of primitives sharing a basevertex, and
 *     for each instance, the inputs, indices and primitives are bound,
 *     the pipeline is run, and the buffers mapped and the scratch
 *     memory allocated for that run are released again.
 */
void _tnl_draw_prims(struct gl_context *ctx,
                     const struct tnl_vertex_array *arrays,
                     const struct _mesa_prim *prim,
                     GLuint nr_prims,
                     const struct _mesa_index_buffer *ib,
                     GLboolean index_bounds_valid,
                     GLuint min_index,
                     GLuint max_index,
                     GLuint num_instances,
                     GLuint base_instance)
{
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   const GLuint TEST_SPLIT = 0;
   const GLint max = TEST_SPLIT ? 8 : tnl->vb.Size - MAX_CLIPPED_VERTICES;
   GLint max_basevertex = prim->basevertex;
   GLuint i;

   if (!index_bounds_valid)
      vbo_get_minmax_indices(ctx, prim, ib, &min_index, &max_index, nr_prims);

   /* Mesa core state should have been validated already */
   assert(ctx->NewState == 0x0);

   if (!_mesa_check_conditional_render(ctx))
      return; /* don't draw */

   for (i = 1; i < nr_prims; i++)
      max_basevertex = MAX2(max_basevertex, prim[i].basevertex);

   if (0) {
      printf("%s %d..%d\n", __func__, min_index, max_index);
      for (i = 0; i < nr_prims; i++)
         printf("prim %d: %s start %d count %d\n", i,
                _mesa_enum_to_string(prim[i].mode),
                prim[i].start,
                prim[i].count);
   }

   if (min_index) {
      /* We always translate away calls with min_index != 0.
       */
      t_rebase_prims(ctx, arrays, prim, nr_prims, ib,
                     min_index, max_index, num_instances, base_instance,
                     _tnl_draw_prims);
      return;
   }
   else if ((GLint)max_index + max_basevertex > max) {
      /* The software TNL pipeline has a fixed amount of storage for
       * vertices and it is necessary to split incoming drawing commands
       * if they exceed that limit.
       */
      struct split_limits limits;
      limits.max_verts = max;
      limits.max_vb_size = ~0;
      limits.max_indices = ~0;

      /* This will split the buffers one way or another and
       * recursively call back into this function.
       */
      _tnl_split_prims(ctx, arrays, prim, nr_prims, ib,
                       0, max_index + prim->basevertex,
                       num_instances, base_instance,
                       _tnl_draw_prims,
                       &limits);
   }
   else {
      /* May need to map a vertex buffer object for every attribute plus
       * one for the index buffer.
       */
      struct gl_buffer_object *bo[VERT_ATTRIB_MAX + 1];
      GLuint nr_bo = 0;
      GLuint inst;

      assert(num_instances > 0);

      for (i = 0; i < nr_prims;) {
         GLuint this_nr_prims;

         /* Our SW TNL pipeline doesn't handle basevertex yet, so bind_indices
          * will rebase the elements to the basevertex, and we'll only
          * emit strings of prims with the same basevertex in one draw call.
          */
         for (this_nr_prims = 1; i + this_nr_prims < nr_prims;
              this_nr_prims++) {
            if (prim[i].basevertex != prim[i + this_nr_prims].basevertex)
               break;
         }

         /* Binding inputs may imply mapping some vertex buffer objects.
          * They will need to be unmapped below.
          */
         for (inst = 0; inst < num_instances; inst++) {

            bind_prims(ctx, &prim[i], this_nr_prims);
            bind_inputs(ctx, arrays, max_index + prim[i].basevertex + 1,
                        bo, &nr_bo);
            bind_indices(ctx, ib, bo, &nr_bo);

            tnl->CurInstance = inst;
            TNL_CONTEXT(ctx)->Driver.RunPipeline(ctx);

            unmap_vbos(ctx, bo, nr_bo);
            /* Everything in bo[] has just been unmapped.  Start the next
             * iteration with an empty list; otherwise the same buffers
             * would be appended again, unmapped a second time, and the
             * list could grow past the end of bo[].
             */
            nr_bo = 0;
            free_space(ctx);
         }

         i += this_nr_prims;
      }
   }
}


/**
 * Reset a tnl_inputs structure: no attribute is marked as coming from
 * current values and the recorded vertex processing mode is VP_MODE_FF.
 */
void
_tnl_init_inputs(struct tnl_inputs *inputs)
{
   inputs->vertex_processing_mode = VP_MODE_FF;
   inputs->current = 0;
}


/**
 * Point the tnl_inputs's arrays at the draw VAO's attributes and buffer
 * bindings for every attribute set in the 'enable' bitmask.
 * \param enable  bitfield of VERT_BIT_x flags; must be a subset of the
 *                VAO's vertex-program inputs (asserted).
 */
static inline void
update_vao_inputs(struct gl_context *ctx,
                  struct tnl_inputs *inputs, GLbitfield enable)
{
   const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;

   /* Make sure we process only arrays enabled in the VAO */
   assert((enable & ~_mesa_get_vao_vp_inputs(vao)) == 0);

   /* Fill in the client arrays from the VAO, one set bit at a time */
   while (enable) {
      const int attr = u_bit_scan(&enable);
      const struct gl_array_attributes *attrib =
         _mesa_draw_array_attrib(vao, attr);
      struct tnl_vertex_array *input = &inputs->inputs[attr];

      input->BufferBinding = &vao->BufferBinding[attrib->BufferBindingIndex];
      input->VertexAttrib = attrib;
   }
}


/**
 * Point the tnl_inputs's arrays at the vbo current-value attributes for
 * the attributes in the 'current' bitmask that need refreshing, then
 * record the new 'current' mask and vertex processing mode.
 * \param current  bitfield of VERT_BIT_x flags.
 */
static inline void
update_current_inputs(struct gl_context *ctx,
                      struct tnl_inputs *inputs, GLbitfield current)
{
   const gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;

   /* Slots that were not current before always need an update. */
   GLbitfield stale = ~inputs->current;

   /* On mode change, the slots aliasing with materials need update too */
   if (inputs->vertex_processing_mode != mode)
      stale |= VERT_BIT_MAT_ALL;

   for (GLbitfield mask = current & stale; mask; ) {
      const int attr = u_bit_scan(&mask);
      struct tnl_vertex_array *input = &inputs->inputs[attr];

      input->VertexAttrib = _vbo_current_attrib(ctx, attr);
      input->BufferBinding = _vbo_current_binding(ctx);
   }

   inputs->vertex_processing_mode = mode;
   inputs->current = current;
}


/**
 * Update the tnl_inputs's arrays: attributes set in
 * Array._DrawVAOEnabledAttribs are taken from the draw VAO, and all
 * remaining VERT_BIT_ALL attributes are taken from the current values.
 */
void
_tnl_update_inputs(struct gl_context *ctx, struct tnl_inputs *inputs)
{
   const GLbitfield from_vao = ctx->Array._DrawVAOEnabledAttribs;
   const GLbitfield from_current = VERT_BIT_ALL & ~from_vao;

   /* Update array input pointers */
   update_vao_inputs(ctx, inputs, from_vao);

   /* The rest must be current inputs. */
   update_current_inputs(ctx, inputs, from_current);
}


const struct tnl_vertex_array *
_tnl_bind_inputs(struct gl_context *ctx)
{
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   _tnl_update_inputs(ctx, &tnl->draw_arrays);
   return tnl->draw_arrays.inputs;
}


/* This is the main entrypoint into the slimmed-down software tnl
 * module.  In a regular swtnl driver, this can be plugged straight
 * into the ctx->Driver.Draw() callback.
 *
 * It refreshes TNLcontext::draw_arrays via _tnl_bind_inputs() and
 * forwards the draw to _tnl_draw_prims().  The transform feedback
 * object and stream arguments are unused.
 */
void
_tnl_draw(struct gl_context *ctx,
          const struct _mesa_prim *prim, GLuint nr_prims,
          const struct _mesa_index_buffer *ib,
          GLboolean index_bounds_valid, GLuint min_index, GLuint max_index,
          GLuint num_instances, GLuint base_instance,
          UNUSED struct gl_transform_feedback_object *tfb_vertcount,
          UNUSED unsigned stream)
{
   _tnl_draw_prims(ctx, _tnl_bind_inputs(ctx), prim, nr_prims, ib,
                   index_bounds_valid, min_index, max_index,
                   num_instances, base_instance);
}


/**
 * Install _tnl_draw() as the Draw callback in the given driver
 * function table.
 */
void
_tnl_init_driver_draw_function(struct dd_function_table *functions)
{
   functions->Draw = _tnl_draw;
}