vpx_scale/generic/bicubic_scaler.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

// Selects the integer (Q-format) coefficient generators below instead of the
// floating-point reference versions.
#define FIXED_POINT

// Dimension limits.  MAX_OUT_DIMENSION sizes the static scratch rows below;
// the other MAX_* values are not referenced elsewhere in this part of the file.
#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

// Global scaler state: cached dimensions, reduced ratios, lookup tables and
// coefficient tables built by bicubic_coefficient_setup().
BICUBIC_SCALER_STRUCT g_b_scaler;
// Non-zero until g_b_scaler has been zero-initialised for the first time.
static int g_first_time = 1;

// Scratch row that receives the vertically filtered samples; g_b_scaler.hbuf
// is pointed at it during setup.
// NOTE(review): DATA_SECTION / DATA_ALIGN look like toolchain-specific pragmas
// (placement in "VP6_HEAP", 32-byte alignment) - confirm the target compiler.
#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

// Second scratch row, assigned to g_b_scaler.hbuf_uv during setup.
#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


// Bicubic sharpness parameter.  The fixed-point build stores its magnitude
// (0.6) in Q16 and the c*_fixed() helpers apply the sign in their formulas;
// the floating-point build stores the signed value directly.
#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
// C0 = a*t^3 - a*t^2, with a = a_i (the 0.6 magnitude in Q16).
//
// t is the phase offset in Q16.  The return value is the tap weight in Q12.
static short c0_fixed(unsigned int t) {
  unsigned short a_t2, a_t3;

  // a*t, then a*t^2; each product is shifted back down to Q16.
  a_t2 = (a_i * t) >> 16;
  a_t2 = (a_t2 * t) >> 16;

  // One further multiply by t yields a*t^3, still in Q16.
  a_t3 = (a_t2 * t) >> 16;

  // Q16 -> Q12, then negate to get a*t^3 - a*t^2.
  return -((a_t2 - a_t3) >> 4);
}

// C1 = a*t + (3 - 2*a)*t^2 - (2 - a)*t^3, with a = a_i (0.6 in Q16).
//
// t is the phase offset in Q16.  The return value is the tap weight in Q12.
static short c1_fixed(unsigned int t) {
  unsigned short linear, square, cubic;
  int pass;

  // a*t in Q16.
  linear = (a_i * t) >> 16;

  // (2 - a) in Q13, multiplied by t three times.
  cubic = (2 << 13) - (a_i >> 3);
  for (pass = 0; pass < 3; pass++)
    cubic = (cubic * t) >> 16;

  // (3 - 2*a) in Q13, multiplied by t twice.
  square = (3 << 13) - (2 * (a_i >> 3));
  for (pass = 0; pass < 2; pass++)
    square = (square * t) >> 16;

  // Bring the linear term to Q13, combine, then drop one bit for Q12.
  return (((linear >> 3) - cubic + square) >> 1);
}

// C2 = 1 - (3 - a)*t^2 + (2 - a)*t^3, with a = a_i (0.6 in Q16).
//
// t is the phase offset in Q16.  The return value is the tap weight in Q12.
static short c2_fixed(unsigned int t) {
  const unsigned short one_q13 = 1 << 13;
  unsigned short square = (3 << 13) - (a_i >> 3);  // (3 - a) in Q13
  unsigned short cubic = (2 << 13) - (a_i >> 3);   // (2 - a) in Q13
  int pass;

  // (3 - a) * t^2, kept in Q13.
  for (pass = 0; pass < 2; pass++)
    square = (square * t) >> 16;

  // (2 - a) * t^3, kept in Q13.
  for (pass = 0; pass < 3; pass++)
    cubic = (cubic * t) >> 16;

  // Combine in Q13 and drop one bit for Q12.
  return (one_q13 - square + cubic) >> 1;
}

// C3 = -(a*t - 2*a*t^2 + a*t^3), with a = a_i (0.6 in Q16); equivalently
// a*t - 2*a*t^2 + a*t^3 for the signed a = -0.6.
//
// t is the phase offset in Q16.  The return value is the tap weight in Q12.
static short c3_fixed(unsigned int t) {
  int linear, square, cubic;
  int pass;

  // a*t in Q16.
  linear = (a_i * t) >> 16;

  // 2*a expressed in Q15, multiplied by t twice.
  square = 2 * (a_i >> 1);
  for (pass = 0; pass < 2; pass++)
    square = (square * t) >> 16;

  // a*t^3 in Q16: start from a*t and multiply by t two more times.
  cubic = linear;
  for (pass = 0; pass < 2; pass++)
    cubic = (cubic * t) >> 16;

  // Align the Q16 terms to Q15, combine, then drop three bits for Q12.
  return ((square - (linear >> 1) - (cubic >> 1)) >> 3);
}
#else
//          3     2
// C0 = -a*t + a*t
//
// Floating-point version of tap C0 for a fractional phase t, using the
// file-scope parameter `a`.  Only compiled when FIXED_POINT is not defined.
float C0(float t) {
  return -a * t * t * t + a * t * t;
}

//                      2          3
// C1 = -a*t + (2*a+3)*t  - (a+2)*t
//
// Floating-point version of tap C1 for a fractional phase t, using the
// file-scope parameter `a`.  Only compiled when FIXED_POINT is not defined.
float C1(float t) {
  return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

//                 2          3
// C2 = 1 - (a+3)*t  + (a+2)*t
//
// Floating-point version of tap C2 for a fractional phase t, using the
// file-scope parameter `a`.  Evaluates to 1 at t == 0.
// Only compiled when FIXED_POINT is not defined.
float C2(float t) {
  return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

//                 2      3
// C3 = a*t - 2*a*t  + a*t
//
// Floating-point version of tap C3 for a fractional phase t, using the
// file-scope parameter `a`.  Only compiled when FIXED_POINT is not defined.
float C3(float t) {
  return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif

#if 0
// Self-check that compares the fixed-point taps against the floating-point
// ones for 10000 evenly spaced phases in [0, 1).
//
// For every phase each c*_fixed() result is compared with the matching C*()
// result scaled by 4096 (Q12); a magnitude difference of more than 1 counts
// as an error.  The sum of the four taps is also checked against a window
// around 4096 for both variants.  Returns the number of failed checks.
//
// NOTE(review): this is compiled out with #if 0.  It calls both the
// c*_fixed() and the C*() functions, which live in opposite branches of the
// FIXED_POINT conditional, so enabling it as-is would need both sets to be
// built.  abs() would presumably also need <stdlib.h> - confirm before use.
int compare_real_fixed() {
  int i, errors = 0;
  float mult = 1.0 / 10000.0;
  // Phase step in Q32; the trailing comment records the earlier Q16 scale.
  unsigned int fixed_mult = mult * 4294967296;// 65536;
  unsigned int phase_offset_int;
  float phase_offset_real;

  for (i = 0; i < 10000; i++) {
    int fixed0, fixed1, fixed2, fixed3, fixed_total;
    int real0, real1, real2, real3, real_total;

    phase_offset_real = (float)i * mult;
    // Q32 step times i, shifted down to the Q16 phase the fixed taps expect.
    phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

    fixed0 = c0_fixed(phase_offset_int);
    real0 = C0(phase_offset_real) * 4096.0;

    if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
      errors++;

    fixed1 = c1_fixed(phase_offset_int);
    real1 = C1(phase_offset_real) * 4096.0;

    if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
      errors++;

    fixed2 = c2_fixed(phase_offset_int);
    real2 = C2(phase_offset_real) * 4096.0;

    if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
      errors++;

    fixed3 = c3_fixed(phase_offset_int);
    real3 = C3(phase_offset_real) * 4096.0;

    if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
      errors++;

    // The four taps should add up to 1.0, i.e. about 4096 in Q12.
    fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
    real_total = real0 + real1 + real2 + real3;

    if ((fixed_total > 4097) || (fixed_total < 4094))
      errors++;

    if ((real_total > 4097) || (real_total < 4095))
      errors++;
  }

  return errors;
}
#endif

// Find the greatest common divisor of two integers.  Repeated subtraction is
//  slower than Euclid's remainder algorithm, but it needs no division.
int gcd(int a, int b) {
  // Subtraction never terminates when either operand is zero (or negative).
  //  Returning 0 is not an option because callers divide by the result, so
  //  the degenerate case fails soft by reporting a divisor of 1.
  if (a <= 0 || b <= 0)
    return 1;

  // Keep taking the smaller value away from the larger one until they meet.
  while (a != b) {
    if (a > b)
      a -= b;
    else
      b -= a;
  }

  return a;
}

// Zero the global scaler state and record that it has been initialised, so
//  the table pointers start out NULL and are safe to hand to vpx_free().
void bicubic_coefficient_init() {
  g_first_time = 0;
  vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
}

// Release every table owned by the global scaler and return it to the
//  all-zero state.  Does nothing if the scaler was never initialised.
void bicubic_coefficient_destroy() {
  if (g_first_time)
    return;

  // Line-index tables.
  vpx_free(g_b_scaler.l_w);
  vpx_free(g_b_scaler.l_h);
  vpx_free(g_b_scaler.l_h_uv);

  // Coefficient tables.
  vpx_free(g_b_scaler.c_w);
  vpx_free(g_b_scaler.c_h);
  vpx_free(g_b_scaler.c_h_uv);

  // Clears the dangling pointers and the cached dimensions in one go.
  vpx_memset(&g_b_scaler, 0, sizeof(g_b_scaler));
}

// Create the coeffients that will be used for the cubic interpolation.
//  Because scaling does not have to be equal in the vertical and horizontal
//  regimes the phase offsets will be different.  There are 4 coefficents
//  for each point, two on each side.  The layout is that there are the
//  4 coefficents for each phase in the array and then the next phase.
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height) {
  int i;
#ifdef FIXED_POINT
  int phase_offset_int;
  unsigned int fixed_mult;
  int product_val = 0;
#else
  float phase_offset;
#endif
  int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

  if (g_first_time)
    bicubic_coefficient_init();


  // check to see if the coefficents have already been set up correctly
  if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
      && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
    return 0;

  g_b_scaler.in_width = in_width;
  g_b_scaler.in_height = in_height;
  g_b_scaler.out_width = out_width;
  g_b_scaler.out_height = out_height;

  // Don't want to allow crazy scaling, just try and prevent a catastrophic
  //  failure here.  Want to fail after setting the member functions so if
  //  if the scaler is called the member functions will not scale.
  if (out_width <= 0 || out_height <= 0)
    return -1;

  // reduce in/out width and height ratios using the gcd
  gcd_w = gcd(out_width, in_width);
  gcd_h = gcd(out_height, in_height);
  gcd_h_uv = gcd(out_height, in_height / 2);

  // the numerator width and height are to be saved in
  //  globals so they can be used during the scaling process
  //  without having to be recalculated.
  g_b_scaler.nw = out_width / gcd_w;
  d_w = in_width / gcd_w;

  g_b_scaler.nh = out_height / gcd_h;
  d_h = in_height / gcd_h;

  g_b_scaler.nh_uv = out_height / gcd_h_uv;
  d_h_uv = (in_height / 2) / gcd_h_uv;

  // allocate memory for the coefficents
  vpx_free(g_b_scaler.l_w);

  vpx_free(g_b_scaler.l_h);

  vpx_free(g_b_scaler.l_h_uv);

  g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
  g_b_scaler.l_h = (short *)vpx_memalign(32, out_height * 2);
  g_b_scaler.l_h_uv = (short *)vpx_memalign(32, out_height * 2);

  vpx_free(g_b_scaler.c_w);

  vpx_free(g_b_scaler.c_h);

  vpx_free(g_b_scaler.c_h_uv);

  g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
  g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
  g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);

  g_b_scaler.hbuf = g_hbuf;
  g_b_scaler.hbuf_uv = g_hbuf_uv;

  // Set up polyphase filter taps.  This needs to be done before
  //  the scaling because of the floating point math required.  The
  //  coefficients are multiplied by 2^12 so that fixed point math
  //  can be used in the main scaling loop.
#ifdef FIXED_POINT
  fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nw; i++) {
    if (product_val > g_b_scaler.nw)
      product_val -= g_b_scaler.nw;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_w[i * 4]   = c3_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 1] = c2_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 2] = c1_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 3] = c0_fixed(phase_offset_int);

    product_val += d_w;
  }


  fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nh; i++) {
    if (product_val > g_b_scaler.nh)
      product_val -= g_b_scaler.nh;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_h[i * 4]   = c0_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 1] = c1_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 2] = c2_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 3] = c3_fixed(phase_offset_int);

    product_val += d_h;
  }

  fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nh_uv; i++) {
    if (product_val > g_b_scaler.nh_uv)
      product_val -= g_b_scaler.nh_uv;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_h_uv[i * 4]   = c0_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 1] = c1_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 2] = c2_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 3] = c3_fixed(phase_offset_int);

    product_val += d_h_uv;
  }

#else

  for (i = 0; i < g_nw; i++) {
    phase_offset = (float)((i * d_w) % g_nw) / (float)g_nw;
    g_c_w[i * 4]   = (C3(phase_offset) * 4096.0);
    g_c_w[i * 4 + 1] = (C2(phase_offset) * 4096.0);
    g_c_w[i * 4 + 2] = (C1(phase_offset) * 4096.0);
    g_c_w[i * 4 + 3] = (C0(phase_offset) * 4096.0);
  }

  for (i = 0; i < g_nh; i++) {
    phase_offset = (float)((i * d_h) % g_nh) / (float)g_nh;
    g_c_h[i * 4]   = (C0(phase_offset) * 4096.0);
    g_c_h[i * 4 + 1] = (C1(phase_offset) * 4096.0);
    g_c_h[i * 4 + 2] = (C2(phase_offset) * 4096.0);
    g_c_h[i * 4 + 3] = (C3(phase_offset) * 4096.0);
  }

  for (i = 0; i < g_nh_uv; i++) {
    phase_offset = (float)((i * d_h_uv) % g_nh_uv) / (float)g_nh_uv;
    g_c_h_uv[i * 4]   = (C0(phase_offset) * 4096.0);
    g_c_h_uv[i * 4 + 1] = (C1(phase_offset) * 4096.0);
    g_c_h_uv[i * 4 + 2] = (C2(phase_offset) * 4096.0);
    g_c_h_uv[i * 4 + 3] = (C3(phase_offset) * 4096.0);
  }

#endif

  // Create an array that corresponds input lines to output lines.
  //  This doesn't require floating point math, but it does require
  //  a division and because hardware division is not present that
  //  is a call.
  for (i = 0; i < out_width; i++) {
    g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

    if ((g_b_scaler.l_w[i] + 2) <= in_width)
      g_b_scaler.max_usable_out_width = i;

  }

  for (i = 0; i < out_height + 1; i++) {
    g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
    g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
  }

  return 0;
}

int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image) {
  short *RESTRICT l_w, * RESTRICT l_h;
  short *RESTRICT c_w, * RESTRICT c_h;
  unsigned char *RESTRICT ip, * RESTRICT op;
  unsigned char *RESTRICT hbuf;
  int h, w, lw, lh;
  int temp_sum;
  int phase_offset_w, phase_offset_h;

  c_w = g_b_scaler.c_w;
  c_h = g_b_scaler.c_h;

  op = output_image;

  l_w = g_b_scaler.l_w;
  l_h = g_b_scaler.l_h;

  phase_offset_h = 0;

  for (h = 0; h < out_height; h++) {
    // select the row to work on
    lh = l_h[h];
    ip = input_image + (in_stride * lh);

    // vp8_filter the row vertically into an temporary buffer.
    //  If the phase offset == 0 then all the multiplication
    //  is going to result in the output equalling the input.
    //  So instead point the temporary buffer to the input.
    //  Also handle the boundry condition of not being able to
    //  filter that last lines.
    if (phase_offset_h && (lh < in_height - 2)) {
      hbuf = g_b_scaler.hbuf;

      for (w = 0; w < in_width; w++) {
        temp_sum =  c_h[phase_offset_h * 4 + 3] * ip[w - in_stride];
        temp_sum += c_h[phase_offset_h * 4 + 2] * ip[w];
        temp_sum += c_h[phase_offset_h * 4 + 1] * ip[w + in_stride];
        temp_sum += c_h[phase_offset_h * 4]   * ip[w + 2 * in_stride];

        hbuf[w] = temp_sum >> 12;
      }
    } else
      hbuf = ip;

    // increase the phase offset for the next time around.
    if (++phase_offset_h >= g_b_scaler.nh)
      phase_offset_h = 0;

    // now filter and expand it horizontally into the final
    //  output buffer
    phase_offset_w = 0;

    for (w = 0; w < out_width; w++) {
      // get the index to use to expand the image
      lw = l_w[w];

      temp_sum =  c_w[phase_offset_w * 4]   * hbuf[lw - 1];
      temp_sum += c_w[phase_offset_w * 4 + 1] * hbuf[lw];
      temp_sum += c_w[phase_offset_w * 4 + 2] * hbuf[lw + 1];
      temp_sum += c_w[phase_offset_w * 4 + 3] * hbuf[lw + 2];
      temp_sum = temp_sum >> 12;

      if (++phase_offset_w >= g_b_scaler.nw)
        phase_offset_w = 0;

      // boundry conditions
      if ((lw + 2) >= in_width)
        temp_sum = hbuf[lw];

      if (lw == 0)
        temp_sum = hbuf[0];

      op[w] = temp_sum;
    }

    op += out_stride;
  }

  return 0;
}

// Forget the cached output size.  The next bicubic_coefficient_setup() call
//  with a positive output size then fails its "already set up" comparison
//  and rebuilds the tables.
void bicubic_scale_frame_reset() {
  g_b_scaler.out_height = 0;
  g_b_scaler.out_width = 0;
}

// Scale the Y, U and V planes of `src` into `dst` at new_width x new_height.
//
//  The destination dimensions are overwritten first: chroma gets half the
//  luma size in each direction and both strides are set equal to the plane
//  widths.  This function does not build the coefficient tables itself;
//  bicubic_scale() reads whatever is currently held in g_b_scaler.
void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height) {
  const int chroma_width = new_width / 2;
  const int chroma_height = new_height / 2;

  // describe the destination planes
  dst->y_width = new_width;
  dst->y_height = new_height;
  dst->uv_width = chroma_width;
  dst->uv_height = chroma_height;

  dst->y_stride = dst->y_width;
  dst->uv_stride = dst->uv_width;

  // luma
  bicubic_scale(src->y_width, src->y_height, src->y_stride,
                new_width, new_height, dst->y_stride,
                src->y_buffer, dst->y_buffer);

  // chroma U
  bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                chroma_width, chroma_height, dst->uv_stride,
                src->u_buffer, dst->u_buffer);

  // chroma V
  bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                chroma_width, chroma_height, dst->uv_stride,
                src->v_buffer, dst->v_buffer);
}