/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

#define FIXED_POINT

#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

BICUBIC_SCALER_STRUCT g_b_scaler;
static int g_first_time = 1;

#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];


#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif
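
// Note: a_i holds the magnitude of the bicubic parameter (0.6) in Q16
// (16 fractional bits); the fixed point kernels below fold the sign of the
// parameter into their return expressions, while the floating point path
// keeps a = -0.6 and carries the sign in the formulas.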

#ifdef FIXED_POINT
// C0 = a*t^3 - a*t^2
//
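// Worked example (illustrative values): for t = 0.5 (32768 in Q16) and
// a_i = 0.6 * 65536 (39321), v1 ends up as a*t^2 ~= 9830 (0.15 in Q16) and
// v2 as a*t^3 ~= 4915 (0.075 in Q16), so the return value is
// -((9830 - 4915) >> 4) = -307, i.e. -0.075 in Q12, matching C0(0.5).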
static short c0_fixed(unsigned int t)
{
    // put t in Q16 notation
    unsigned short v1, v2;

    // Q16
    v1 = (a_i * t) >> 16;
    v1 = (v1 * t) >> 16;

    // Q16
    v2 = (a_i * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q12
    return -((v1 - v2) >> 4);
}

// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
//
static short c1_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q13
    two = 2 << 13;
    v2 = two - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    three = 3 << 13;
    v3 = three - (2 * (a_i >> 3));
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (((v1 >> 3) - v2 + v3) >> 1);
}

// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
//
static short c2_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q13
    v1 = 1 << 13;

    // Q13
    three = 3 << 13;
    v2 = three - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    two = 2 << 13;
    v3 = two - (a_i >> 3);
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (v1 - v2 + v3) >> 1;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
static short c3_fixed(unsigned int t)
{
    int v1, v2, v3;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q15
    v2 = 2 * (a_i >> 1);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q16
    v3 = (a_i * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
}
#else
// C0 = -a*t^3 + a*t^2
//
float C0(float t)
{
    return -a * t * t * t + a * t * t;
}

// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
//
float C1(float t)
{
    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
//
float C2(float t)
{
    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
float C3(float t)
{
    return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif
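
// C0..C3 are the four segments of the standard cubic convolution (Keys)
// interpolation kernel evaluated at the distances to the four neighboring
// samples.  For any phase t the four weights sum to 1.0 (4096 in Q12) to
// within rounding, which is what the disabled compare_real_fixed() check
// below verifies.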

#if 0
int compare_real_fixed()
{
    int i, errors = 0;
    float mult = 1.0 / 10000.0;
    unsigned int fixed_mult = mult * 4294967296;//65536;
    unsigned int phase_offset_int;
    float phase_offset_real;

    for (i = 0; i < 10000; i++)
    {
        int fixed0, fixed1, fixed2, fixed3, fixed_total;
        int real0, real1, real2, real3, real_total;

        phase_offset_real = (float)i * mult;
        phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

        fixed0 = c0_fixed(phase_offset_int);
        real0 = C0(phase_offset_real) * 4096.0;

        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
            errors++;

        fixed1 = c1_fixed(phase_offset_int);
        real1 = C1(phase_offset_real) * 4096.0;

        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
            errors++;

        fixed2 = c2_fixed(phase_offset_int);
        real2 = C2(phase_offset_real) * 4096.0;

        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
            errors++;

        fixed3 = c3_fixed(phase_offset_int);
        real3 = C3(phase_offset_real) * 4096.0;

        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
            errors++;

        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
        real_total = real0 + real1 + real2 + real3;

        if ((fixed_total > 4097) || (fixed_total < 4094))
            errors++;

        if ((real_total > 4097) || (real_total < 4095))
            errors++;
    }

    return errors;
}
#endif

// Find the greatest common divisor of two integers.  The method used here
// is slow compared to Euclid's algorithm, but it does not require any
// division.
int gcd(int a, int b)
{
    // A problem with this algorithm is that if a or b is 0 the loop below
    // never exits.  Returning 0 is not an option, because any computation
    // that reduced by the common divisor would then divide by 0, so the
    // safest fallback is to return 1.
    if (a <= 0 || b <= 0)
        return 1;

    while (a != b)
    {
        if (b > a)
            b = b - a;
        else
        {
            int tmp = a;    // swap large and
            a = b;          // small
            b = tmp;
        }
    }

    return b;
}
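
// For example, gcd(800, 640) = 160 and gcd(600, 480) = 120, so producing an
// 800x600 output from a 640x480 input reduces both ratios to 5:4.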

void bicubic_coefficient_init()
{
    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    g_first_time = 0;
}

void bicubic_coefficient_destroy()
{
    if (!g_first_time)
    {
        vpx_free(g_b_scaler.l_w);
        vpx_free(g_b_scaler.l_h);
        vpx_free(g_b_scaler.l_h_uv);

        vpx_free(g_b_scaler.c_w);
        vpx_free(g_b_scaler.c_h);
        vpx_free(g_b_scaler.c_h_uv);

        vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    }
}

// Create the coefficients that will be used for the cubic interpolation.
// Because scaling does not have to be equal in the vertical and horizontal
// directions, the phase offsets will be different.  There are 4 coefficients
// for each point, two on each side.  The arrays are laid out with the 4
// coefficients for one phase followed by the 4 for the next phase.
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
{
    int i;
#ifdef FIXED_POINT
    int phase_offset_int;
    unsigned int fixed_mult;
    int product_val = 0;
#else
    float phase_offset;
#endif
    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

    if (g_first_time)
        bicubic_coefficient_init();


    // check to see if the coefficients have already been set up correctly
    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
        return 0;

    g_b_scaler.in_width   = in_width;
    g_b_scaler.in_height  = in_height;
    g_b_scaler.out_width  = out_width;
    g_b_scaler.out_height = out_height;

    // Don't want to allow crazy scaling, just try and prevent a catastrophic
    // failure here.  Want to fail after setting the members so that if the
    // scaler is called anyway it will not scale.
    if (out_width <= 0 || out_height <= 0)
        return -1;

    // reduce the in/out width and height ratios using the gcd
    gcd_w    = gcd(out_width, in_width);
    gcd_h    = gcd(out_height, in_height);
    gcd_h_uv = gcd(out_height, in_height / 2);
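
    // The chroma (UV) planes of a YV12 frame are half the height of the luma
    // plane, so the vertical ratio for chroma is reduced separately against
    // in_height / 2.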

    // The numerator width and height are to be saved in globals so they can
    // be used during the scaling process without having to be recalculated.
    g_b_scaler.nw = out_width / gcd_w;
    d_w = in_width / gcd_w;

    g_b_scaler.nh = out_height / gcd_h;
    d_h = in_height / gcd_h;

    g_b_scaler.nh_uv = out_height / gcd_h_uv;
    d_h_uv = (in_height / 2) / gcd_h_uv;
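
    // Continuing the 800x600-from-640x480 example above: nw = 800 / 160 = 5
    // and d_w = 640 / 160 = 4, so 5 output pixels are produced for every 4
    // input pixels and only 5 distinct horizontal filter phases are needed.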

    // allocate memory for the coefficients
    vpx_free(g_b_scaler.l_w);
    vpx_free(g_b_scaler.l_h);
    vpx_free(g_b_scaler.l_h_uv);

    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
    // l_h and l_h_uv need out_height + 1 entries: the mapping loop at the
    // end of this function fills one extra value.
    g_b_scaler.l_h    = (short *)vpx_memalign(32, (out_height + 1) * 2);
    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * 2);

    vpx_free(g_b_scaler.c_w);
    vpx_free(g_b_scaler.c_h);
    vpx_free(g_b_scaler.c_h_uv);

    g_b_scaler.c_w    = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
    g_b_scaler.c_h    = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);
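
    // Layout: each c_* table holds 4 taps (16-bit, Q12) per phase -- nw, nh
    // or nh_uv phases respectively -- and each l_* table holds one 16-bit
    // input index per output column/row, hence the "* 2" byte counts above.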

    g_b_scaler.hbuf    = g_hbuf;
    g_b_scaler.hbuf_uv = g_hbuf_uv;

    // Set up the polyphase filter taps.  This needs to be done before the
    // scaling because of the floating point math required.  The coefficients
    // are multiplied by 2^12 so that fixed point math can be used in the
    // main scaling loop.
#ifdef FIXED_POINT
    fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;
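
    // fixed_mult is 2^32 / nw, so (fixed_mult * k) >> 16 evaluates k / nw as
    // a Q16 phase offset without a divide inside the loop.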

    product_val = 0;

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        if (product_val > g_b_scaler.nw)
            product_val -= g_b_scaler.nw;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);

        product_val += d_w;
    }
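
    // product_val steps through (i * d_w) reduced modulo nw, which mirrors
    // the explicit "% nw" used in the floating point path below.  Note that
    // c_w stores its taps in reverse order (C3..C0) relative to c_h.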

    fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        if (product_val > g_b_scaler.nh)
            product_val -= g_b_scaler.nh;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h;
    }

    fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        if (product_val > g_b_scaler.nh_uv)
            product_val -= g_b_scaler.nh_uv;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h_uv;
    }

#else

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
        g_b_scaler.c_w[i*4]   = (short)(C3(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+1] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+2] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+3] = (short)(C0(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
        g_b_scaler.c_h[i*4]   = (short)(C0(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+1] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+2] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+3] = (short)(C3(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
        g_b_scaler.c_h_uv[i*4]   = (short)(C0(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+1] = (short)(C1(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+2] = (short)(C2(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+3] = (short)(C3(phase_offset) * 4096.0);
    }

#endif

    // Create the arrays that map output pixels and lines back to input
    // pixels and lines.  This doesn't require floating point math, but it
    // does require a division, and because hardware division is not present
    // each division becomes a function call.
    for (i = 0; i < out_width; i++)
    {
        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

        if ((g_b_scaler.l_w[i] + 2) <= in_width)
            g_b_scaler.max_usable_out_width = i;
    }
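
    // max_usable_out_width tracks the last output column for which
    // l_w[i] + 2 does not exceed in_width; columns beyond that rely on the
    // boundary handling in bicubic_scale().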

    for (i = 0; i < out_height + 1; i++)
    {
        g_b_scaler.l_h[i]    = (i * d_h) / g_b_scaler.nh;
        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
    }

    return 0;
}

int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image)
{
    short *RESTRICT l_w, *RESTRICT l_h;
    short *RESTRICT c_w, *RESTRICT c_h;
    unsigned char *RESTRICT ip, *RESTRICT op;
    unsigned char *RESTRICT hbuf;
    int h, w, lw, lh;
    int temp_sum;
    int phase_offset_w, phase_offset_h;

    c_w = g_b_scaler.c_w;
    c_h = g_b_scaler.c_h;

    op = output_image;

    l_w = g_b_scaler.l_w;
    l_h = g_b_scaler.l_h;
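
    // Each output row is produced in two passes: the source row is first
    // filtered vertically into hbuf (or simply aliased to the input when the
    // vertical phase is 0), then filtered and expanded horizontally into the
    // output row.  c_h is stored as (C0,C1,C2,C3) and indexed in reverse,
    // while c_w is stored pre-reversed as (C3,C2,C1,C0); either way C3
    // weights the sample before the mapped position and C0 the sample two
    // after it.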

    phase_offset_h = 0;

    for (h = 0; h < out_height; h++)
    {
        // select the row to work on
        lh = l_h[h];
        ip = input_image + (in_stride * lh);

        // Filter the row vertically into a temporary buffer.  If the phase
        // offset == 0 then all the multiplication would just reproduce the
        // input, so instead point the temporary buffer at the input.  Also
        // handle the boundary condition of not being able to filter the
        // last lines.
        if (phase_offset_h && (lh < in_height - 2))
        {
            hbuf = g_b_scaler.hbuf;

            for (w = 0; w < in_width; w++)
            {
                temp_sum  = c_h[phase_offset_h*4+3] * ip[w - in_stride];
                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

                hbuf[w] = temp_sum >> 12;
            }
        }
        else
            hbuf = ip;

        // increase the phase offset for the next time around.
        if (++phase_offset_h >= g_b_scaler.nh)
            phase_offset_h = 0;

        // now filter and expand it horizontally into the final
        // output buffer
        phase_offset_w = 0;

        for (w = 0; w < out_width; w++)
        {
            // get the index to use to expand the image
            lw = l_w[w];

            temp_sum  = c_w[phase_offset_w*4]   * hbuf[lw - 1];
            temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
            temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
            temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
            temp_sum  = temp_sum >> 12;

            if (++phase_offset_w >= g_b_scaler.nw)
                phase_offset_w = 0;

            // boundary conditions: fall back to the nearest sample where
            // the 4-tap window would read outside the row
            if ((lw + 2) >= in_width)
                temp_sum = hbuf[lw];

            if (lw == 0)
                temp_sum = hbuf[0];

            op[w] = temp_sum;
        }

        op += out_stride;
    }

    return 0;
}

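// Zeroing the cached output dimensions forces bicubic_coefficient_setup()
// to rebuild the filter tables the next time it is called.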
void bicubic_scale_frame_reset()
{
    g_b_scaler.out_width  = 0;
    g_b_scaler.out_height = 0;
}

void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height)
{
    dst->y_width   = new_width;
    dst->y_height  = new_height;
    dst->uv_width  = new_width / 2;
    dst->uv_height = new_height / 2;

    dst->y_stride  = dst->y_width;
    dst->uv_stride = dst->uv_width;
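
    // The destination strides are set equal to the plane widths, so the
    // output planes are written as packed buffers; chroma is scaled at half
    // the luma dimensions, matching YV12 4:2:0 subsampling.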

    bicubic_scale(src->y_width, src->y_height, src->y_stride,
                  new_width, new_height, dst->y_stride,
                  src->y_buffer, dst->y_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->u_buffer, dst->u_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->v_buffer, dst->v_buffer);
}