/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

#define FIXED_POINT

#define MAX_IN_WIDTH      800
#define MAX_IN_HEIGHT     600
#define MAX_OUT_WIDTH     800
#define MAX_OUT_HEIGHT    600
#define MAX_OUT_DIMENSION ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                           MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

BICUBIC_SCALER_STRUCT g_b_scaler;
static int g_first_time = 1;

#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];

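// The bicubic parameter a is -0.6. The fixed-point build stores its
// magnitude (0.6) in Q16 as a_i; the sign is folded into the c*_fixed()
// formulas below.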
#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
// C0 = a*t^3 - a*t^2
//
static short c0_fixed(unsigned int t) {
  // put t in Q16 notation
  unsigned short v1, v2;

  // Q16
  v1 = (a_i * t) >> 16;
  v1 = (v1 * t) >> 16;

  // Q16
  v2 = (a_i * t) >> 16;
  v2 = (v2 * t) >> 16;
  v2 = (v2 * t) >> 16;

  // Q12
  return -((v1 - v2) >> 4);
}

// C1 = a*t + (3-2*a)*t^2 - (2-a)*t^3
//
static short c1_fixed(unsigned int t) {
  unsigned short v1, v2, v3;
  unsigned short two, three;

  // Q16
  v1 = (a_i * t) >> 16;

  // Q13
  two = 2 << 13;
  v2 = two - (a_i >> 3);
  v2 = (v2 * t) >> 16;
  v2 = (v2 * t) >> 16;
  v2 = (v2 * t) >> 16;

  // Q13
  three = 3 << 13;
  v3 = three - (2 * (a_i >> 3));
  v3 = (v3 * t) >> 16;
  v3 = (v3 * t) >> 16;

  // Q12
  return (((v1 >> 3) - v2 + v3) >> 1);
}

// C2 = 1 - (3-a)*t^2 + (2-a)*t^3
//
static short c2_fixed(unsigned int t) {
  unsigned short v1, v2, v3;
  unsigned short two, three;

  // Q13
  v1 = 1 << 13;

  // Q13
  three = 3 << 13;
  v2 = three - (a_i >> 3);
  v2 = (v2 * t) >> 16;
  v2 = (v2 * t) >> 16;

  // Q13
  two = 2 << 13;
  v3 = two - (a_i >> 3);
  v3 = (v3 * t) >> 16;
  v3 = (v3 * t) >> 16;
  v3 = (v3 * t) >> 16;

  // Q12
  return (v1 - v2 + v3) >> 1;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
static short c3_fixed(unsigned int t) {
  int v1, v2, v3;

  // Q16
  v1 = (a_i * t) >> 16;

  // Q15
  v2 = 2 * (a_i >> 1);
  v2 = (v2 * t) >> 16;
  v2 = (v2 * t) >> 16;

  // Q16
  v3 = (a_i * t) >> 16;
  v3 = (v3 * t) >> 16;
  v3 = (v3 * t) >> 16;

  // Q12
  return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
}
#else
// C0 = -a*t^3 + a*t^2
//
float C0(float t) {
  return -a * t * t * t + a * t * t;
}

// C1 = -a*t + (2*a+3)*t^2 - (a+2)*t^3
//
float C1(float t) {
  return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

// C2 = 1 - (a+3)*t^2 + (a+2)*t^3
//
float C2(float t) {
  return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

// C3 = a*t - 2*a*t^2 + a*t^3
//
float C3(float t) {
  return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif

#if 0
int compare_real_fixed() {
  int i, errors = 0;
  float mult = 1.0 / 10000.0;
  unsigned int fixed_mult = mult * 4294967296;  // 65536;
  unsigned int phase_offset_int;
  float phase_offset_real;

  for (i = 0; i < 10000; i++) {
    int fixed0, fixed1, fixed2, fixed3, fixed_total;
    int real0, real1, real2, real3, real_total;

    phase_offset_real = (float)i * mult;
    phase_offset_int = (fixed_mult * i) >> 16;
    // phase_offset_int = phase_offset_real * 65536;

    fixed0 = c0_fixed(phase_offset_int);
    real0 = C0(phase_offset_real) * 4096.0;

    if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
      errors++;

    fixed1 = c1_fixed(phase_offset_int);
    real1 = C1(phase_offset_real) * 4096.0;

    if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
      errors++;

    fixed2 = c2_fixed(phase_offset_int);
    real2 = C2(phase_offset_real) * 4096.0;

    if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
      errors++;

    fixed3 = c3_fixed(phase_offset_int);
    real3 = C3(phase_offset_real) * 4096.0;

    if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
      errors++;

    fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
    real_total = real0 + real1 + real2 + real3;

    if ((fixed_total > 4097) || (fixed_total < 4094))
      errors++;

    if ((real_total > 4097) || (real_total < 4095))
      errors++;
  }

  return errors;
}
#endif

// Find the greatest common divisor of two integers. The method used here is
// slow compared to Euclid's algorithm, but it does not require any division.
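// For example, gcd(640, 800) == 160, which reduces an 800-to-640 rescale
// to the ratio 4/5 in bicubic_coefficient_setup() below.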
int gcd(int a, int b) {
  // A problem with this algorithm is that if a or b is 0 the function
  // will never exit. We don't want to return 0, because any computation
  // that used the result as a common divisor and tried to reduce by
  // dividing by 0 would fail. The best option is to fail by returning 1.
  if (a <= 0 || b <= 0)
    return 1;

  while (a != b) {
    if (b > a)
      b = b - a;
    else {
      int tmp = a;  // swap large and
      a = b;        // small
      b = tmp;
    }
  }

  return b;
}

void bicubic_coefficient_init() {
  vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
  g_first_time = 0;
}

void bicubic_coefficient_destroy() {
  if (!g_first_time) {
    vpx_free(g_b_scaler.l_w);
    vpx_free(g_b_scaler.l_h);
    vpx_free(g_b_scaler.l_h_uv);

    vpx_free(g_b_scaler.c_w);
    vpx_free(g_b_scaler.c_h);
    vpx_free(g_b_scaler.c_h_uv);

    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
  }
}

// Create the coefficients that will be used for the cubic interpolation.
// Because the scaling does not have to be equal in the vertical and
// horizontal directions, the phase offsets will be different. There are 4
// coefficients for each point, two on each side. The array is laid out with
// the 4 coefficients for one phase followed by the 4 for the next phase.
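// For the horizontal array this gives, per phase p:
//   c_w[4*p + 0..3] = { c3(p), c2(p), c1(p), c0(p) }
// while the vertical arrays store c0..c3 in ascending order.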
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height) {
  int i;
#ifdef FIXED_POINT
  int phase_offset_int;
  unsigned int fixed_mult;
  int product_val = 0;
#else
  float phase_offset;
#endif
  int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

  if (g_first_time)
    bicubic_coefficient_init();

  // Check to see if the coefficients have already been set up correctly.
  if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
      && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
    return 0;

  g_b_scaler.in_width = in_width;
  g_b_scaler.in_height = in_height;
  g_b_scaler.out_width = out_width;
  g_b_scaler.out_height = out_height;

  // We don't want to allow crazy scaling, just try to prevent a
  // catastrophic failure here. Fail after the members have been set so
  // that if the scaler is called anyway it will not scale.
  if (out_width <= 0 || out_height <= 0)
    return -1;

  // Reduce the in/out width and height ratios using the gcd.
  gcd_w = gcd(out_width, in_width);
  gcd_h = gcd(out_height, in_height);
  gcd_h_uv = gcd(out_height, in_height / 2);

  // The numerator width and height are saved in globals so they can be
  // used during the scaling process without having to be recalculated.
  g_b_scaler.nw = out_width / gcd_w;
  d_w = in_width / gcd_w;

  g_b_scaler.nh = out_height / gcd_h;
  d_h = in_height / gcd_h;

  g_b_scaler.nh_uv = out_height / gcd_h_uv;
  d_h_uv = (in_height / 2) / gcd_h_uv;
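  // e.g. scaling 800x600 down to 640x480 gives nw/d_w = 4/5, nh/d_h = 4/5
  // and nh_uv/d_h_uv = 8/5 (the chroma plane is half the luma height).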

  // Allocate memory for the coefficients.
  vpx_free(g_b_scaler.l_w);
  vpx_free(g_b_scaler.l_h);
  vpx_free(g_b_scaler.l_h_uv);

  // Note: the height line maps get out_height + 1 entries (see the loop that
  // fills l_h[] and l_h_uv[] below), so allocate one extra short for each.
  g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
  g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * 2);
  g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * 2);

  vpx_free(g_b_scaler.c_w);
  vpx_free(g_b_scaler.c_h);
  vpx_free(g_b_scaler.c_h_uv);

  g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
  g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
  g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);

  g_b_scaler.hbuf = g_hbuf;
  g_b_scaler.hbuf_uv = g_hbuf_uv;

  // Set up the polyphase filter taps. This needs to be done before the
  // scaling because of the floating point math required. The coefficients
  // are multiplied by 2^12 so that fixed point math can be used in the
  // main scaling loop.
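  // Each output pixel i has phase (i * d) mod n within the n-phase cycle;
  // product_val steps through that phase and fixed_mult converts it to a
  // Q16 fraction before it is handed to the c*_fixed() taps (Q12 results).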
#ifdef FIXED_POINT
  fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nw; i++) {
    if (product_val > g_b_scaler.nw)
      product_val -= g_b_scaler.nw;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_w[i * 4] = c3_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 1] = c2_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 2] = c1_fixed(phase_offset_int);
    g_b_scaler.c_w[i * 4 + 3] = c0_fixed(phase_offset_int);

    product_val += d_w;
  }


  fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nh; i++) {
    if (product_val > g_b_scaler.nh)
      product_val -= g_b_scaler.nh;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_h[i * 4] = c0_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 1] = c1_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 2] = c2_fixed(phase_offset_int);
    g_b_scaler.c_h[i * 4 + 3] = c3_fixed(phase_offset_int);

    product_val += d_h;
  }

  fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;

  product_val = 0;

  for (i = 0; i < g_b_scaler.nh_uv; i++) {
    if (product_val > g_b_scaler.nh_uv)
      product_val -= g_b_scaler.nh_uv;

    phase_offset_int = (fixed_mult * product_val) >> 16;

    g_b_scaler.c_h_uv[i * 4] = c0_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 1] = c1_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 2] = c2_fixed(phase_offset_int);
    g_b_scaler.c_h_uv[i * 4 + 3] = c3_fixed(phase_offset_int);

    product_val += d_h_uv;
  }

#else

  for (i = 0; i < g_b_scaler.nw; i++) {
    phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
    g_b_scaler.c_w[i * 4] = (C3(phase_offset) * 4096.0);
    g_b_scaler.c_w[i * 4 + 1] = (C2(phase_offset) * 4096.0);
    g_b_scaler.c_w[i * 4 + 2] = (C1(phase_offset) * 4096.0);
    g_b_scaler.c_w[i * 4 + 3] = (C0(phase_offset) * 4096.0);
  }

  for (i = 0; i < g_b_scaler.nh; i++) {
    phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
    g_b_scaler.c_h[i * 4] = (C0(phase_offset) * 4096.0);
    g_b_scaler.c_h[i * 4 + 1] = (C1(phase_offset) * 4096.0);
    g_b_scaler.c_h[i * 4 + 2] = (C2(phase_offset) * 4096.0);
    g_b_scaler.c_h[i * 4 + 3] = (C3(phase_offset) * 4096.0);
  }

  for (i = 0; i < g_b_scaler.nh_uv; i++) {
    phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
    g_b_scaler.c_h_uv[i * 4] = (C0(phase_offset) * 4096.0);
    g_b_scaler.c_h_uv[i * 4 + 1] = (C1(phase_offset) * 4096.0);
    g_b_scaler.c_h_uv[i * 4 + 2] = (C2(phase_offset) * 4096.0);
    g_b_scaler.c_h_uv[i * 4 + 3] = (C3(phase_offset) * 4096.0);
  }

#endif

  // Create an array that maps output lines to their corresponding input
  // lines. This doesn't require floating point math, but it does require a
  // division, and because hardware division is not present each division
  // becomes a function call.
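  // e.g. with nw = 4 and d_w = 5 (800 -> 640), l_w[] starts
  // 0, 1, 2, 3, 5, 6, 7, 8, 10, ...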
  for (i = 0; i < out_width; i++) {
    g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

    if ((g_b_scaler.l_w[i] + 2) <= in_width)
      g_b_scaler.max_usable_out_width = i;
  }

  for (i = 0; i < out_height + 1; i++) {
    g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
    g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
  }

  return 0;
}

int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image) {
  short *RESTRICT l_w, *RESTRICT l_h;
  short *RESTRICT c_w, *RESTRICT c_h;
  unsigned char *RESTRICT ip, *RESTRICT op;
  unsigned char *RESTRICT hbuf;
  int h, w, lw, lh;
  int temp_sum;
  int phase_offset_w, phase_offset_h;

  c_w = g_b_scaler.c_w;
  c_h = g_b_scaler.c_h;

  op = output_image;

  l_w = g_b_scaler.l_w;
  l_h = g_b_scaler.l_h;

  phase_offset_h = 0;

  for (h = 0; h < out_height; h++) {
    // select the row to work on
    lh = l_h[h];
    ip = input_image + (in_stride * lh);

    // Filter the row vertically into a temporary buffer. If the phase
    // offset == 0 then all the multiplication is going to result in the
    // output equalling the input, so instead point the temporary buffer
    // at the input. Also handle the boundary condition of not being able
    // to filter the last lines.
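    // The 4-tap vertical window reads input rows lh-1 .. lh+2 (ip points
    // at row lh).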
    if (phase_offset_h && (lh < in_height - 2)) {
      hbuf = g_b_scaler.hbuf;

      for (w = 0; w < in_width; w++) {
        temp_sum = c_h[phase_offset_h * 4 + 3] * ip[w - in_stride];
        temp_sum += c_h[phase_offset_h * 4 + 2] * ip[w];
        temp_sum += c_h[phase_offset_h * 4 + 1] * ip[w + in_stride];
        temp_sum += c_h[phase_offset_h * 4] * ip[w + 2 * in_stride];

        hbuf[w] = temp_sum >> 12;
      }
    } else
      hbuf = ip;

    // increase the phase offset for the next time around.
    if (++phase_offset_h >= g_b_scaler.nh)
      phase_offset_h = 0;

    // now filter and expand it horizontally into the final
    // output buffer
    phase_offset_w = 0;

    for (w = 0; w < out_width; w++) {
      // get the index to use to expand the image
      lw = l_w[w];

      temp_sum = c_w[phase_offset_w * 4] * hbuf[lw - 1];
      temp_sum += c_w[phase_offset_w * 4 + 1] * hbuf[lw];
      temp_sum += c_w[phase_offset_w * 4 + 2] * hbuf[lw + 1];
      temp_sum += c_w[phase_offset_w * 4 + 3] * hbuf[lw + 2];
      temp_sum = temp_sum >> 12;

      if (++phase_offset_w >= g_b_scaler.nw)
        phase_offset_w = 0;

      // boundary conditions
      if ((lw + 2) >= in_width)
        temp_sum = hbuf[lw];

      if (lw == 0)
        temp_sum = hbuf[0];

      op[w] = temp_sum;
    }

    op += out_stride;
  }

  return 0;
}

void bicubic_scale_frame_reset() {
  g_b_scaler.out_width = 0;
  g_b_scaler.out_height = 0;
}

void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height) {
  dst->y_width = new_width;
  dst->y_height = new_height;
  dst->uv_width = new_width / 2;
  dst->uv_height = new_height / 2;

  dst->y_stride = dst->y_width;
  dst->uv_stride = dst->uv_width;

  bicubic_scale(src->y_width, src->y_height, src->y_stride,
                new_width, new_height, dst->y_stride,
                src->y_buffer, dst->y_buffer);

  bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                new_width / 2, new_height / 2, dst->uv_stride,
                src->u_buffer, dst->u_buffer);

  bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                new_width / 2, new_height / 2, dst->uv_stride,
                src->v_buffer, dst->v_buffer);
}
