1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 /****************************************************************************
13 *
14 * Module Title : gen_scalers.c
15 *
16 * Description : Generic image scaling functions.
17 *
18 ***************************************************************************/
19
20 /****************************************************************************
21 * Header Files
22 ****************************************************************************/
23 #include "vpx_scale/vpxscale.h"
24
25 /****************************************************************************
26 * Imports
27 ****************************************************************************/
28
29 /****************************************************************************
30 *
31 * ROUTINE : horizontal_line_4_5_scale_c64
32 *
33 * INPUTS : const unsigned char *source : Pointer to source data.
34 * unsigned int source_width : Stride of source.
35 * unsigned char *dest : Pointer to destination data.
36 * unsigned int dest_width : Stride of destination (NOT USED).
37 *
38 * OUTPUTS : None.
39 *
40 * RETURNS : void
41 *
42 * FUNCTION : Copies horizontal line of pixels from source to
43 * destination scaling up by 4 to 5.
44 *
45 * SPECIAL NOTES : None.
46 *
47 ****************************************************************************/
static
void horizontal_line_4_5_scale_c64
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    // Expands one line of pixels by 4:5. Each group of four source
    // pixels yields five destination pixels. source_width is consumed
    // as a pixel count, four at a time; dest_width is not used.
    unsigned int i;
    unsigned int ba, cb, dc, ed;
    unsigned char *restrict des = dest;
    unsigned int *restrict src = (unsigned int *)source;
    unsigned int const_51_205, const_102_154,
             const_205_51, const_154_102;

    unsigned int src_current, src_next;

    (void) dest_width;

    // Constants that are to be used for the filtering.  For
    // best speed we are going to want to right shift by 16.
    // In the generic version they were shift by 8, so put
    // an extra 8 in now so that 16 will come out later.
    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // 5 points are needed to filter to give 5 output points.
    // A load can pull up 4 at a time, and one needs to be
    // "borrowed" from the next set of data.  So instead of
    // loading those 5 points each time, "steal" a point from
    // the next set and only load up 4 each time through.
    src_current = _mem4(src);

    // "i + 4 < source_width" is the same bound as "i < source_width - 4"
    // for widths of 4 or more, but does not wrap around when
    // source_width is smaller than 4.
    for (i = 0; i + 4 < source_width; i += 4)
    {
        // Advance first, then load: the borrowed point must come from
        // the word that FOLLOWS src_current.  A post-increment here
        // would reload the current word and leave the output one group
        // behind the input.
        src_next = _mem4(++src);

        // Reorder the data so that it is ready for the
        // dot product.
        ba = _unpklu4(src_current);
        cb = _unpkhu4(_rotl(src_current, 8));
        dc = _unpkhu4(src_current);
        ed = _unpkhu4(_shrmb(src_next, src_current));

        // Use the dot product with round and shift.
        des [0] = src_current & 0xff;
        des [1] = _dotprsu2(ba, const_205_51);
        des [2] = _dotprsu2(cb, const_154_102);
        des [3] = _dotprsu2(dc, const_102_154);
        des [4] = _dotprsu2(ed, const_51_205);

        des += 5;

        // Reuse the loaded values next time around.
        src_current = src_next;
    }

    // Filter the last set of points.  Normally a point from the next set
    // would be used, but there is no next set, so just fill.
    ba = _unpklu4(src_current);
    cb = _unpkhu4(_rotl(src_current, 8));
    dc = _unpkhu4(src_current);

    des [0] = src_current & 0xff;
    des [1] = _dotprsu2(ba, const_205_51);
    des [2] = _dotprsu2(cb, const_154_102);
    des [3] = _dotprsu2(dc, const_102_154);
    // Fill with the LAST pixel of the group.  The first pixel sits in
    // the low byte (see des[0]), so the fourth is the top byte.
    des [4] = (unsigned char)(src_current >> 24);
}
121 /****************************************************************************
122 *
123 * ROUTINE : vertical_band_4_5_scale_c64
124 *
125 * INPUTS : unsigned char *dest : Pointer to destination data.
126 * unsigned int dest_pitch : Stride of destination data.
127 * unsigned int dest_width : Width of destination data.
128 *
129 * OUTPUTS : None.
130 *
131 * RETURNS : void
132 *
133 * FUNCTION : Scales vertical band of pixels by scale 4 to 5. The
134 * height of the band scaled is 4-pixels.
135 *
136 * SPECIAL NOTES : The routine uses the first line of the band below
137 * the current band.
138 *
139 ****************************************************************************/
static
void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Filter weights, two per word, each pre-shifted left by 8.
    const unsigned int wt_51_205 = 0x3300CD00;  //_pack2 (51 << 8, 205 << 8);
    const unsigned int wt_205_51 = 0xCD003300;  //_pack2 (205 << 8, 51 << 8);
    const unsigned int wt_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const unsigned int wt_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Byte offsets of the rows touched below, relative to row 0.
    const unsigned int pitch2 = dest_pitch * 2;
    const unsigned int pitch3 = dest_pitch * 3;
    const unsigned int pitch4 = dest_pitch * 4;
    const unsigned int pitch5 = dest_pitch * 5;

    unsigned char *restrict in = dest;
    unsigned char *restrict out = dest;
    unsigned int p0, p1, p2, p3, p5;
    unsigned int pair10, pair21, pair32, pair53;
    unsigned int remaining;

    // Prime the pipeline with column 0 so that each pass of the loop
    // can fetch the next column while it filters the current one.
    // Rows 0-3 belong to this band; row 5 is the first line of the
    // band below.
    p0 = in[0];
    p1 = in[dest_pitch];
    p2 = in[pitch2];
    p3 = in[pitch3];
    p5 = in[pitch5];
    in++;

    for (remaining = dest_width; remaining != 0; remaining--)
    {
        // Pair up vertically adjacent pixels of the current column.
        pair10 = _pack2(p1, p0);
        pair21 = _pack2(p2, p1);
        pair32 = _pack2(p3, p2);
        pair53 = _pack2(p5, p3);

        // Look-ahead loads for the next column.  These are issued
        // before the stores below; on the final pass they touch
        // column dest_width.
        p0 = in[0];
        p1 = in[dest_pitch];
        p2 = in[pitch2];
        p3 = in[pitch3];
        p5 = in[pitch5];
        in++;

        // Rows 1-4 of the current column receive the filtered values.
        out[dest_pitch] = _dotprsu2(pair10, wt_205_51);
        out[pitch2] = _dotprsu2(pair21, wt_154_102);
        out[pitch3] = _dotprsu2(pair32, wt_102_154);
        out[pitch4] = _dotprsu2(pair53, wt_51_205);

        out++;
    }
}
187
188 /****************************************************************************
189 *
190 * ROUTINE : last_vertical_band_4_5_scale_c64
191 *
192 * INPUTS : unsigned char *dest : Pointer to destination data.
193 * unsigned int dest_pitch : Stride of destination data.
194 * unsigned int dest_width : Width of destination data.
195 *
196 * OUTPUTS : None.
197 *
198 * RETURNS : void
199 *
200 * FUNCTION : Scales last vertical band of pixels by scale 4 to 5. The
201 * height of the band scaled is 4-pixels.
202 *
203 * SPECIAL NOTES : The routine does not have available the first line of
204 * the band below the current band, since this is the
205 * last band.
206 *
207 ****************************************************************************/
static
void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Scales the last 4-line band to 5 lines in place.  There is no
    // band below to interpolate against, so row 4 is filled with a copy
    // of this band's row 3.
    unsigned int i;
    unsigned int a, b, c, d;
    unsigned int ba, cb, dc;
    unsigned int fill;
    unsigned char *restrict src = dest;
    unsigned char *restrict des = dest;
    unsigned int const_102_154, const_205_51, const_154_102;

    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Prime the pipeline with column 0.
    a = src [0];
    b = src [dest_pitch];
    c = src [dest_pitch*2];
    d = src [dest_pitch*3];
    src ++;

    for (i = 0; i < dest_width; ++i)
    {
        ba = _pack2(b, a);
        cb = _pack2(c, b);
        dc = _pack2(d, c);

        // Remember this column's row-3 pixel before the look-ahead
        // loads overwrite d with the NEXT column's value.  Storing d
        // after the reload would shift the fill row by one column.
        fill = d;

        // Look-ahead loads for the next column.
        a = src [0];
        b = src [dest_pitch];
        c = src [dest_pitch*2];
        d = src [dest_pitch*3];
        src ++;

        des [dest_pitch] = _dotprsu2(ba, const_205_51);
        des [dest_pitch*2] = _dotprsu2(cb, const_154_102);
        des [dest_pitch*3] = _dotprsu2(dc, const_102_154);
        des [dest_pitch*4] = (unsigned char) fill;

        des++;
    }
}
248
249 /****************************************************************************
250 *
251 * ROUTINE : horizontal_line_3_5_scale_c64
252 *
253 * INPUTS : const unsigned char *source : Pointer to source data.
254 * unsigned int source_width : Stride of source.
255 * unsigned char *dest : Pointer to destination data.
256 * unsigned int dest_width : Stride of destination (NOT USED).
257 *
258 * OUTPUTS : None.
259 *
260 * RETURNS : void
261 *
262 * FUNCTION : Copies horizontal line of pixels from source to
263 * destination scaling up by 3 to 5.
264 *
265 * SPECIAL NOTES : None.
266 *
267 *
268 ****************************************************************************/
static
void horizontal_line_3_5_scale_c64
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    // Expands one line of pixels by 3:5. Each group of three source
    // pixels yields five destination pixels. source_width is consumed
    // as a pixel count, three at a time; dest_width is not used.
    unsigned int i;
    unsigned int ba, cb, dc;
    unsigned int src_current;
    unsigned char *restrict des = dest;
    unsigned char *restrict src = (unsigned char *)source;
    unsigned int const_51_205, const_102_154,
             const_205_51, const_154_102;

    (void) dest_width;

    // Filter weights, two per word, each pre-shifted left by 8.
    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // "i + 3 < source_width" is the same bound as "i < source_width - 3"
    // for widths of 3 or more, but does not wrap around when
    // source_width is smaller than 3.
    for (i = 0; i + 3 < source_width; i += 3)
    {
        // Four bytes: the three pixels of this group plus the first
        // pixel of the next group, which the fifth output needs.
        src_current = _mem4(src);

        // Reorder the data so that it is ready for the
        // dot product.
        ba = _unpklu4(src_current);
        cb = _unpkhu4(_rotl(src_current, 8));
        dc = _unpkhu4(src_current);

        des [0] = src_current & 0xff;
        des [1] = _dotprsu2(ba, const_154_102);
        des [2] = _dotprsu2(cb, const_51_205);
        des [3] = _dotprsu2(cb, const_205_51);
        des [4] = _dotprsu2(dc, const_102_154);

        src += 3;
        des += 5;
    }

    // Last group: only three source pixels remain.  Assemble them from
    // byte loads rather than a 4-byte load, which would read one byte
    // past the end of the line.  The first pixel goes in the low byte,
    // the same lane order the loop relies on for des[0].
    src_current = (unsigned int)src[0]
                  | ((unsigned int)src[1] << 8)
                  | ((unsigned int)src[2] << 16);

    ba = _unpklu4(src_current);
    cb = _unpkhu4(_rotl(src_current, 8));

    des [0] = src_current & 0xff;
    des [1] = _dotprsu2(ba, const_154_102);
    des [2] = _dotprsu2(cb, const_51_205);
    des [3] = _dotprsu2(cb, const_205_51);
    // No next group to interpolate against: repeat the last pixel.
    des [4] = src[2];
}
327
328 /****************************************************************************
329 *
330 * ROUTINE : vertical_band_3_5_scale_c64
331 *
332 * INPUTS : unsigned char *dest : Pointer to destination data.
333 * unsigned int dest_pitch : Stride of destination data.
334 * unsigned int dest_width : Width of destination data.
335 *
336 * OUTPUTS : None.
337 *
338 * RETURNS : void
339 *
340 * FUNCTION : Scales vertical band of pixels by scale 3 to 5. The
341 * height of the band scaled is 3-pixels.
342 *
343 * SPECIAL NOTES : The routine uses the first line of the band below
344 * the current band.
345 *
346 ****************************************************************************/
static
void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Filter weights, two per word, each pre-shifted left by 8.
    const unsigned int wt_51_205 = 0x3300CD00;  //_pack2 (51 << 8, 205 << 8);
    const unsigned int wt_205_51 = 0xCD003300;  //_pack2 (205 << 8, 51 << 8);
    const unsigned int wt_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const unsigned int wt_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Byte offsets of the rows touched below, relative to row 0.
    const unsigned int pitch2 = dest_pitch * 2;
    const unsigned int pitch3 = dest_pitch * 3;
    const unsigned int pitch4 = dest_pitch * 4;
    const unsigned int pitch5 = dest_pitch * 5;

    unsigned char *restrict in = dest;
    unsigned char *restrict out = dest;
    unsigned int p0, p1, p2, p5;
    unsigned int pair10, pair21, pair52;
    unsigned int remaining;

    // Prime the pipeline with column 0.  Rows 0-2 belong to this band;
    // row 5 is the first line of the band below.
    p0 = in[0];
    p1 = in[dest_pitch];
    p2 = in[pitch2];
    p5 = in[pitch5];
    in++;

    for (remaining = dest_width; remaining != 0; remaining--)
    {
        // Pair up vertically adjacent pixels of the current column.
        pair10 = _pack2(p1, p0);
        pair21 = _pack2(p2, p1);
        pair52 = _pack2(p5, p2);

        // Look-ahead loads for the next column.  These are issued
        // before the stores below; on the final pass they touch
        // column dest_width.
        p0 = in[0];
        p1 = in[dest_pitch];
        p2 = in[pitch2];
        p5 = in[pitch5];
        in++;

        // Rows 1-4 of the current column receive the filtered values.
        // The row1/row2 pair feeds two outputs with mirrored weights.
        out[dest_pitch] = _dotprsu2(pair10, wt_154_102);
        out[pitch2] = _dotprsu2(pair21, wt_51_205);
        out[pitch3] = _dotprsu2(pair21, wt_205_51);
        out[pitch4] = _dotprsu2(pair52, wt_102_154);

        out++;
    }
}
389
390 /****************************************************************************
391 *
392 * ROUTINE : last_vertical_band_3_5_scale_c64
393 *
394 * INPUTS : unsigned char *dest : Pointer to destination data.
395 * unsigned int dest_pitch : Stride of destination data.
396 * unsigned int dest_width : Width of destination data.
397 *
398 * OUTPUTS : None.
399 *
400 * RETURNS : void
401 *
402 * FUNCTION : Scales last vertical band of pixels by scale 3 to 5. The
403 * height of the band scaled is 3-pixels.
404 *
405 * SPECIAL NOTES : The routine does not have available the first line of
406 * the band below the current band, since this is the
407 * last band.
408 *
409 ****************************************************************************/
static
void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Scales the last 3-line band to 5 lines in place.  There is no
    // band below to interpolate against, so row 4 is filled with a copy
    // of this band's row 2.
    unsigned int i;
    unsigned int a, b, c;
    unsigned int ba, cb;
    unsigned int fill;
    unsigned char *restrict src = dest;
    unsigned char *restrict des = dest;
    unsigned int const_51_205, const_205_51, const_154_102;

    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Prime the pipeline with column 0.
    a = src [0];
    b = src [dest_pitch];
    c = src [dest_pitch*2];
    src ++;

    for (i = 0; i < dest_width; ++i)
    {
        ba = _pack2(b, a);
        cb = _pack2(c, b);

        // Remember this column's row-2 pixel before the look-ahead
        // loads overwrite c with the NEXT column's value.  Storing c
        // after the reload would shift the fill row by one column.
        fill = c;

        // Look-ahead loads for the next column.
        a = src [0];
        b = src [dest_pitch];
        c = src [dest_pitch*2];
        src ++;

        des [dest_pitch] = _dotprsu2(ba, const_154_102);
        des [dest_pitch*2] = _dotprsu2(cb, const_51_205);
        des [dest_pitch*3] = _dotprsu2(cb, const_205_51);
        des [dest_pitch*4] = (unsigned char) fill;

        des++;
    }
}
447
448 /****************************************************************************
449 *
450 * ROUTINE : horizontal_line_1_2_scale_c64
451 *
452 * INPUTS : const unsigned char *source : Pointer to source data.
453 * unsigned int source_width : Stride of source.
454 * unsigned char *dest : Pointer to destination data.
455 * unsigned int dest_width : Stride of destination (NOT USED).
456 *
457 * OUTPUTS : None.
458 *
459 * RETURNS : void
460 *
461 * FUNCTION : Copies horizontal line of pixels from source to
462 * destination scaling up by 1 to 2.
463 *
464 * SPECIAL NOTES : source width must be a multiple of 4.
465 *
466 ****************************************************************************/
horizontal_line_1_2_scale_c64(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)467 void horizontal_line_1_2_scale_c64
468 (
469 const unsigned char *source,
470 unsigned int source_width,
471 unsigned char *dest,
472 unsigned int dest_width
473 )
474 {
475 unsigned int i;
476 unsigned char *restrict des = dest;
477 unsigned char *restrict src = (unsigned char *)source;
478 unsigned int src7_4i, src4_1i, src3_0i;
479 unsigned int a4_0i, ahi, alo;
480 double src7_0d, src3_0d;
481 const unsigned int k01 = 0x01010101;
482
483 for (i = 0; i < source_width / 4; i += 1)
484 {
485 // Load up the data from src. Here a wide load is
486 // used to get 8 bytes at once, only 5 will be used
487 // for the actual computation.
488 src7_0d = _memd8(src);
489 src3_0i = _lo(src7_0d);
490 src7_4i = _hi(src7_0d);
491
492 // Need to average between points. Shift byte 5 into
493 // the lower word. This will result in bytes 5-1
494 // averaged with 4-0.
495 src4_1i = _shrmb(src7_4i, src3_0i);
496 a4_0i = _avgu4(src4_1i, src3_0i);
497
498 // Expand the data out. Could do an unpack, however
499 // all but the multiply units are getting pretty hard
500 // here the multiply unit can take some of the computations.
501 src3_0d = _mpyu4(src3_0i, k01);
502
503 // The averages need to be unpacked so that they are in 16
504 // bit form and will be able to be interleaved with the
505 // original data
506 ahi = _unpkhu4(a4_0i);
507 alo = _unpklu4(a4_0i);
508
509 ahi = _swap4(ahi);
510 alo = _swap4(alo);
511
512 // Mix the average result in with the orginal data.
513 ahi = _hi(src3_0d) | ahi;
514 alo = _lo(src3_0d) | alo;
515
516 _memd8(des) = _itod(ahi, alo);
517
518 des += 8;
519 src += 4;
520 }
521 }
522
523
524 /****************************************************************************
525 *
526 * ROUTINE : vertical_band_1_2_scale_c64
527 *
528 * INPUTS : unsigned char *dest : Pointer to destination data.
529 * unsigned int dest_pitch : Stride of destination data.
530 * unsigned int dest_width : Width of destination data.
531 *
532 * OUTPUTS : None.
533 *
534 * RETURNS : void
535 *
536 * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
537 * height of the band scaled is 1-pixel.
538 *
539 * SPECIAL NOTES : The routine uses the first line of the band below
540 * the current band.
541 * Destination width must be a multiple of 4: the input
542 * width must be, and therefore so must the output.
543 *
544 ****************************************************************************/
545 static
vertical_band_1_2_scale_c64(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)546 void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
547 {
548 unsigned int i;
549 unsigned int a, b;
550 unsigned int *restrict line_a = (unsigned int *)dest;
551 unsigned int *restrict line_b = (unsigned int *)(dest + (dest_pitch * 2));
552 unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
553
554 for (i = 0; i < dest_width / 4; i++)
555 {
556 a = _mem4(line_a++);
557 b = _mem4(line_b++);
558
559 _mem4(des++) = _avgu4(a, b);
560 }
561 }
562
563 /****************************************************************************
564 *
565 * ROUTINE : last_vertical_band_1_2_scale_c64
566 *
567 * INPUTS : unsigned char *dest : Pointer to destination data.
568 * unsigned int dest_pitch : Stride of destination data.
569 * unsigned int dest_width : Width of destination data.
570 *
571 * OUTPUTS : None.
572 *
573 * RETURNS : void
574 *
575 * FUNCTION : Scales last vertical band of pixels by scale 1 to 2. The
576 * height of the band scaled is 1-pixel.
577 *
578 * SPECIAL NOTES : The routine does not have available the first line of
579 * the band below the current band, since this is the
580 * last band. Again, width must be a multiple of 4.
581 *
582 ****************************************************************************/
583 static
last_vertical_band_1_2_scale_c64(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)584 void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
585 {
586 unsigned int i;
587 unsigned int *restrict src = (unsigned int *)dest;
588 unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
589
590 for (i = 0; i < dest_width / 4; ++i)
591 {
592 _mem4(des++) = _mem4(src++);
593 }
594 }
595
596 void
register_generic_scalers(void)597 register_generic_scalers(void)
598 {
599 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_c64;
600 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_c64;
601 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_c64;
602 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_c64;
603 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_c64;
604 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_c64;
605 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_c64;
606 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_c64;
607 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_c64;
608 }
609