1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 /****************************************************************************
13 *
14 * Module Title : scaleopt.cpp
15 *
16 * Description : Optimized scaling functions
17 *
18 ****************************************************************************/
19 #include "pragmas.h"
20
21
22
23 /****************************************************************************
24 * Module Statics
25 ****************************************************************************/
/*
 * Blend weights expressed as fractions of 256. Every table holds four 16-bit
 * lanes (mask45 holds eight bytes) so it can be loaded into an MMX register
 * with a single movq. The scalers in this file multiply unpacked pixels by
 * these weights with pmullw, add round_values and shift right by 8.
 */
26 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 }; // 51/256 ~= 1/5
27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 }; // 102/256 ~= 2/5
28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 }; // 154/256 ~= 3/5
29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 }; // 205/256 ~= 4/5
30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; // half of 256: rounding term added before the shift by 8
31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; // rounding term for the (a + b + 1) >> 1 averages of the 1:2 scalers
32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; // horizontal 4->5: per-lane weights of the right-hand neighbour
33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; // horizontal 4->5: per-lane weights of the left-hand pixel
34 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; // keeps byte 6 only; the 4->5 tail uses it to duplicate the final pixel
35 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; // horizontal 3->5: per-lane weights of the right-hand neighbour
36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; // horizontal 3->5: per-lane weights of the left-hand pixel
37
38
39
40 #include "vpx_scale/vpxscale.h"
41 #include "vpx_mem/vpx_mem.h"
42
43 /****************************************************************************
44 *
45 * ROUTINE : horizontal_line_3_5_scale_mmx
46 *
47 * INPUTS : const unsigned char *source :
48 * unsigned int source_width :
49 * unsigned char *dest :
50 * unsigned int dest_width :
51 *
52 * OUTPUTS : None.
53 *
54 * RETURNS : void
55 *
56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
57 *
58 * SPECIAL NOTES : None.
59 *
60 ****************************************************************************/
/*
 * horizontal_line_3_5_scale_mmx: stretch one row of pixels from 3 to 5
 * samples per group with MMX inline assembly (MSVC __asm syntax).
 *
 *   source       - input row; a DWORD is loaded per group, pointer steps by 3.
 *   source_width - number of source bytes; only used to form the loop bound.
 *   dest         - output row; 5 bytes are produced per source group.
 *   dest_width   - unused (cast to void).
 *
 * For the loaded pixels p0..p3 each group emits
 *   p0,
 *   (102*p0 + 154*p1 + 128) >> 8,
 *   (205*p1 +  51*p2 + 128) >> 8,
 *   ( 51*p1 + 205*p2 + 128) >> 8,
 *   (154*p2 + 102*p3 + 128) >> 8.
 * The final group (code after the loop) uses p2 in place of p3.
 *
 * Byte-layout comments list bytes in memory order, lowest address first.
 *
 * NOTE(review): the loop body runs before its bound is tested and the final
 * group still loads a full DWORD, so one byte past the last source pixel is
 * read - presumably callers guarantee padding; confirm.
 */
61 static
horizontal_line_3_5_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)62 void horizontal_line_3_5_scale_mmx
63 (
64 const unsigned char *source,
65 unsigned int source_width,
66 unsigned char *dest,
67 unsigned int dest_width
68 )
69 {
70 (void) dest_width;
71
72 __asm
73 {
74
75 push ebx // ebx is used as scratch below; restored by the pop at the end
76
77 mov esi, source
78 mov edi, dest
79
80 mov ecx, source_width
81 lea edx, [esi+ecx-3]; // edx = source + source_width - 3, the loop bound
82
83 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
84 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
85
86 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
87 pxor mm7, mm7 // clear mm7 (zero source for byte-to-word unpacks)
88
89 horiz_line_3_5_loop:
90
91 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
92 mov ebx, eax
93
94 and ebx, 0xffff00 // ebx = xx 01 02 xx
95 mov ecx, eax // ecx = 00 01 02 03
96
97 and eax, 0xffff0000 // eax = xx xx 02 03
98 xor ecx, eax // ecx = 00 01 xx xx
99
100 shr ebx, 8 // ebx = 01 02 xx xx
101 or eax, ebx // eax = 01 02 02 03
102
103 shl ebx, 16 // ebx = xx xx 01 02
104 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
105
106 or ebx, ecx // ebx = 00 01 01 02
107 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
108
109 movd mm0, ebx // mm0 = 00 01 01 02
110 pmullw mm1, mm6 // right-hand neighbours times const35_2
111
112 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
113 pmullw mm0, mm5 // left-hand pixels times const35_1
114
115 mov [edi], ebx // write output 0; bytes 1-3 of this store are overwritten by the movd below
116 add esi, 3
117
118 add edi, 5
119 paddw mm0, mm1 // weighted sums for outputs 1-4
120
121 paddw mm0, mm4 // + 128
122 psrlw mm0, 8
123
124 cmp esi, edx // flags are consumed by the jl below (MMX ops leave them alone)
125 packuswb mm0, mm7
126
127 movd DWORD Ptr [edi-4], mm0 // write outputs 1-4 of the group just processed
128 jl horiz_line_3_5_loop
129
130 //Exit: final group, same arithmetic with pixel 02 standing in for pixel 03
131 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
132 mov ebx, eax
133
134 and ebx, 0xffff00 // ebx = xx 01 02 xx
135 mov ecx, eax // ecx = 00 01 02 03
136
137 and eax, 0xffff0000 // eax = xx xx 02 03
138 xor ecx, eax // ecx = 00 01 xx xx
139
140 shr ebx, 8 // ebx = 01 02 xx xx
141 or eax, ebx // eax = 01 02 02 03
142
143 shl eax, 8 // eax = xx 01 02 02
144 and eax, 0xffff0000 // eax = xx xx 02 02
145
146 or eax, ebx // eax = 01 02 02 02
147
148 shl ebx, 16 // ebx = xx xx 01 02
149 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
150
151 or ebx, ecx // ebx = 00 01 01 02
152 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
153
154 movd mm0, ebx // mm0 = 00 01 01 02
155 pmullw mm1, mm6 // right-hand neighbours times const35_2
156
157 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
158 pmullw mm0, mm5 // left-hand pixels times const35_1
159
160 mov [edi], ebx // write output 0; bytes 1-3 are overwritten by the movd below
161 paddw mm0, mm1
162
163 paddw mm0, mm4 // + 128
164 psrlw mm0, 8
165
166 packuswb mm0, mm7
167 movd DWORD Ptr [edi+1], mm0 // write outputs 1-4 of the final group
168
169 pop ebx
170
171 }
172
173 }
174
175
176 /****************************************************************************
177 *
178 * ROUTINE : horizontal_line_4_5_scale_mmx
179 *
180 * INPUTS : const unsigned char *source :
181 * unsigned int source_width :
182 * unsigned char *dest :
183 * unsigned int dest_width :
184 *
185 * OUTPUTS : None.
186 *
187 * RETURNS : void
188 *
189 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
190 *
191 * SPECIAL NOTES : None.
192 *
193 ****************************************************************************/
/*
 * horizontal_line_4_5_scale_mmx: stretch one row of pixels from 4 to 5
 * samples per group with MMX inline assembly. Eight source pixels (two
 * groups) become ten output bytes per pass.
 *
 *   source       - input row, read 8 bytes at a time.
 *   source_width - number of source bytes; only used to form the loop bound.
 *   dest         - output row, advanced by 10 bytes per pass.
 *   dest_width   - unused (cast to void).
 *
 * For pixels p0..p4 a group emits
 *   p0,
 *   ( 51*p0 + 205*p1 + 128) >> 8,
 *   (102*p1 + 154*p2 + 128) >> 8,
 *   (154*p2 + 102*p3 + 128) >> 8,
 *   (205*p3 +  51*p4 + 128) >> 8.
 * The code after the loop handles the last eight pixels and repeats pixel 07
 * in place of the missing pixel 08.
 *
 * In the comments, loads and intermediate values are labelled with source
 * pixel indices, while the write-output comments use output byte indices.
 *
 * NOTE(review): the loop body runs before its bound is tested, so a
 * source_width of 8 or less still executes one loop pass plus the tail -
 * presumably callers only pass larger widths; confirm.
 */
194 static
horizontal_line_4_5_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)195 void horizontal_line_4_5_scale_mmx
196 (
197 const unsigned char *source,
198 unsigned int source_width,
199 unsigned char *dest,
200 unsigned int dest_width
201 )
202 {
203 (void)dest_width;
204
205 __asm
206 {
207
208 mov esi, source
209 mov edi, dest
210
211 mov ecx, source_width
212 lea edx, [esi+ecx-8]; // edx = source + source_width - 8, the loop bound
213
214 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
215 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
216
217 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
218 pxor mm7, mm7 // clear mm7 (zero source for byte-to-word unpacks)
219
220 horiz_line_4_5_loop:
221
222 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
223 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
224
225 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
226 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
227
228 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx (bytes 1-3 are overwritten below)
229 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
230
231 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
232 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
233
234 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
235 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
236
237 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx (source pixel 04; later bytes are overwritten below)
238 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
239
240 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
241 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
242
243 paddw mm0, mm1 // left + right weighted terms
244 paddw mm0, mm4 // added round values
245
246 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
247 packuswb mm0, mm7
248
249 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
250 add edi, 10
251
252 add esi, 8
253 paddw mm2, mm3 // left + right weighted terms
254
255 paddw mm2, mm4 // added round values
256 cmp esi, edx // flags are consumed by the jl below
257
258 psrlw mm2, 8
259 packuswb mm2, mm7
260
261 movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09
262 jl horiz_line_4_5_loop
263
264 //Exit: last eight pixels; pixel 07 is repeated instead of reading pixel 08
265 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
266 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
267
268 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
269 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
270
271 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
272 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
273
274 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
275 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
276
277 movq mm3, mm1
278
279 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx
280 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
281
282 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
283 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
284
285 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
286 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
287
288 movd DWORD PTR [edi+5], mm2 // write output 05 xx xx xx (source pixel 04)
289 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
290
291 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
292 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
293
294 paddw mm0, mm1 // left + right weighted terms
295 paddw mm0, mm4 // added round values
296
297 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
298 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
299
300 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
301 paddw mm2, mm3 // left + right weighted terms
302
303 paddw mm2, mm4 // added round values
304 psrlw mm2, 8
305
306 packuswb mm2, mm7
307 movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09
308
309
310 }
311 }
312
313 /****************************************************************************
314 *
315 * ROUTINE : vertical_band_4_5_scale_mmx
316 *
317 * INPUTS : unsigned char *dest :
318 * unsigned int dest_pitch :
319 * unsigned int dest_width :
320 *
321 * OUTPUTS : None.
322 *
323 * RETURNS : void
324 *
325 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
326 *
327 * SPECIAL NOTES : The routine uses the first line of the band below
328 * the current band. The function also has a "C" only
329 * version.
330 *
331 ****************************************************************************/
/*
 * vertical_band_4_5_scale_mmx: in-place vertical 4-to-5 interpolation of one
 * band, eight columns per loop pass.
 *
 *   dest       - address of the first line of the band.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * With r0..r3 the four source lines at dest + 0..3 * dest_pitch and r5 the
 * line at dest + 5 * dest_pitch (first line of the next band), the routine
 * overwrites lines 1..4 with
 *   line1 = ( 51*r0 + 205*r1 + 128) >> 8
 *   line2 = (102*r1 + 154*r2 + 128) >> 8
 *   line3 = (154*r2 + 102*r3 + 128) >> 8
 *   line4 = (205*r3 +  51*r5 + 128) >> 8
 * Each source line is unpacked into registers before its slot is overwritten,
 * so the formulas always see the original values.
 */
332 static
vertical_band_4_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)333 void vertical_band_4_5_scale_mmx
334 (
335 unsigned char *dest,
336 unsigned int dest_pitch,
337 unsigned int dest_width
338 )
339 {
340 __asm
341 {
342
343 mov esi, dest // Get the source and destination pointer
344 mov ecx, dest_pitch // Get the pitch size
345
346 lea edi, [esi+ecx*2] // two lines below
347 add edi, ecx // three lines below
348
349 pxor mm7, mm7 // clear out mm7
350 mov edx, dest_width // Loop counter
351
352 vs_4_5_loop:
353
354 movq mm0, QWORD ptr [esi] // src[0];
355 movq mm1, QWORD ptr [esi+ecx] // src[1];
356
357 movq mm2, mm0 // Make a copy
358 punpcklbw mm0, mm7 // unpack low to word
359
360 movq mm5, one_fifth
361 punpckhbw mm2, mm7 // unpack high to word
362
363 pmullw mm0, mm5 // a * 1/5
364
365 movq mm3, mm1 // make a copy
366 punpcklbw mm1, mm7 // unpack low to word
367
368 pmullw mm2, mm5 // a * 1/5
369 movq mm6, four_fifths // constant 4/5
370
371 movq mm4, mm1 // copy of low b
372 pmullw mm4, mm6 // b * 4/5
373
374 punpckhbw mm3, mm7 // unpack high to word
375 movq mm5, mm3 // copy of high b
376
377 pmullw mm5, mm6 // b * 4/5
378 paddw mm0, mm4 // a * 1/5 + b * 4/5
379
380 paddw mm2, mm5 // a * 1/5 + b * 4/5
381 paddw mm0, round_values // + 128
382
383 paddw mm2, round_values // + 128
384 psrlw mm0, 8
385
386 psrlw mm2, 8
387 packuswb mm0, mm2 // des [1]
388
389 movq QWORD ptr [esi+ecx], mm0 // write des[1]
390 movq mm0, [esi+ecx*2] // mm0 = src[2]
391
392 // mm1, mm3 --- Src[1] (unpacked low / high halves)
393 // mm0 --- Src[2]
394 // mm7 for unpacking
395
396 movq mm5, two_fifths
397 movq mm2, mm0 // make a copy
398
399 pmullw mm1, mm5 // b * 2/5
400 movq mm6, three_fifths
401
402
403 punpcklbw mm0, mm7 // unpack low to word
404 pmullw mm3, mm5 // b * 2/5
405
406 movq mm4, mm0 // make copy of c
407 punpckhbw mm2, mm7 // unpack high to word
408
409 pmullw mm4, mm6 // c * 3/5
410 movq mm5, mm2
411
412 pmullw mm5, mm6 // c * 3/5
413 paddw mm1, mm4 // b * 2/5 + c * 3/5
414
415 paddw mm3, mm5 // b * 2/5 + c * 3/5
416 paddw mm1, round_values // + 128
417
418 paddw mm3, round_values // + 128
419 psrlw mm1, 8
420
421 psrlw mm3, 8
422 packuswb mm1, mm3 // des[2]
423
424 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
425 movq mm1, [edi] // mm1=Src[3];
426
427 // mm0, mm2 --- Src[2] (unpacked low / high halves)
428 // mm1 --- Src[3]
429 // mm6 --- 3/5
430 // mm7 for unpacking
431
432 pmullw mm0, mm6 // c * 3/5
433 movq mm5, two_fifths // mm5 = 2/5
434
435 movq mm3, mm1 // make a copy
436 pmullw mm2, mm6 // c * 3/5
437
438 punpcklbw mm1, mm7 // unpack low
439 movq mm4, mm1 // make a copy
440
441 punpckhbw mm3, mm7 // unpack high
442 pmullw mm4, mm5 // d * 2/5
443
444 movq mm6, mm3 // make a copy
445 pmullw mm6, mm5 // d * 2/5
446
447 paddw mm0, mm4 // c * 3/5 + d * 2/5
448 paddw mm2, mm6 // c * 3/5 + d * 2/5
449
450 paddw mm0, round_values // + 128
451 paddw mm2, round_values // + 128
452
453 psrlw mm0, 8
454 psrlw mm2, 8
455
456 packuswb mm0, mm2 // des[3]
457 movq QWORD ptr [edi], mm0 // write des[3]
458
459 // mm1, mm3 --- Src[3] (unpacked low / high halves)
460 // mm7 -- cleared for unpacking
461
462 movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group (five lines below esi)
463
464 movq mm5, four_fifths // mm5 = 4/5
465 pmullw mm1, mm5 // d * 4/5
466
467 movq mm6, one_fifth // mm6 = 1/5
468 movq mm2, mm0 // make a copy
469
470 pmullw mm3, mm5 // d * 4/5
471 punpcklbw mm0, mm7 // unpack low
472
473 pmullw mm0, mm6 // an * 1/5
474 punpckhbw mm2, mm7 // unpack high
475
476 paddw mm1, mm0 // d * 4/5 + an * 1/5
477 pmullw mm2, mm6 // an * 1/5
478
479 paddw mm3, mm2 // d * 4/5 + an * 1/5
480 paddw mm1, round_values // + 128
481
482 paddw mm3, round_values // + 128
483 psrlw mm1, 8
484
485 psrlw mm3, 8
486 packuswb mm1, mm3 // des[4]
487
488 movq QWORD ptr [edi+ecx], mm1 // write des[4]
489
490 add edi, 8 // advance both line pointers to the next eight columns
491 add esi, 8
492
493 sub edx, 8
494 jg vs_4_5_loop
495 }
496 }
497
498 /****************************************************************************
499 *
500 * ROUTINE : last_vertical_band_4_5_scale_mmx
501 *
502 * INPUTS : unsigned char *dest :
503 * unsigned int dest_pitch :
504 * unsigned int dest_width :
505 *
506 * OUTPUTS : None.
507 *
508 * RETURNS : None
509 *
510 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
511 *
512 * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does
513 * not read the band below; the fifth output line is a copy
514 * of the fourth source line. The function also has a "C" only version.
515 *
516 ****************************************************************************/
/*
 * last_vertical_band_4_5_scale_mmx: in-place vertical 4-to-5 interpolation
 * for a band that has no band below it, eight columns per loop pass.
 *
 *   dest       - address of the first line of the band.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * With r0..r3 the source lines at dest + 0..3 * dest_pitch, lines 1..4 are
 * overwritten with
 *   line1 = ( 51*r0 + 205*r1 + 128) >> 8
 *   line2 = (102*r1 + 154*r2 + 128) >> 8
 *   line3 = (154*r2 + 102*r3 + 128) >> 8
 *   line4 = r3 (plain copy; nothing below the band is read)
 */
517 static
last_vertical_band_4_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)518 void last_vertical_band_4_5_scale_mmx
519 (
520 unsigned char *dest,
521 unsigned int dest_pitch,
522 unsigned int dest_width
523 )
524 {
525 __asm
526 {
527 mov esi, dest // Get the source and destination pointer
528 mov ecx, dest_pitch // Get the pitch size
529
530 lea edi, [esi+ecx*2] // two lines below
531 add edi, ecx // three lines below
532
533 pxor mm7, mm7 // clear out mm7
534 mov edx, dest_width // Loop counter
535
536 last_vs_4_5_loop:
537
538 movq mm0, QWORD ptr [esi] // src[0];
539 movq mm1, QWORD ptr [esi+ecx] // src[1];
540
541 movq mm2, mm0 // Make a copy
542 punpcklbw mm0, mm7 // unpack low to word
543
544 movq mm5, one_fifth
545 punpckhbw mm2, mm7 // unpack high to word
546
547 pmullw mm0, mm5 // a * 1/5
548
549 movq mm3, mm1 // make a copy
550 punpcklbw mm1, mm7 // unpack low to word
551
552 pmullw mm2, mm5 // a * 1/5
553 movq mm6, four_fifths // constant 4/5
554
555 movq mm4, mm1 // copy of low b
556 pmullw mm4, mm6 // b * 4/5
557
558 punpckhbw mm3, mm7 // unpack high to word
559 movq mm5, mm3 // copy of high b
560
561 pmullw mm5, mm6 // b * 4/5
562 paddw mm0, mm4 // a * 1/5 + b * 4/5
563
564 paddw mm2, mm5 // a * 1/5 + b * 4/5
565 paddw mm0, round_values // + 128
566
567 paddw mm2, round_values // + 128
568 psrlw mm0, 8
569
570 psrlw mm2, 8
571 packuswb mm0, mm2 // des [1]
572
573 movq QWORD ptr [esi+ecx], mm0 // write des[1]
574 movq mm0, [esi+ecx*2] // mm0 = src[2]
575
576 // mm1, mm3 --- Src[1] (unpacked low / high halves)
577 // mm0 --- Src[2]
578 // mm7 for unpacking
579
580 movq mm5, two_fifths
581 movq mm2, mm0 // make a copy
582
583 pmullw mm1, mm5 // b * 2/5
584 movq mm6, three_fifths
585
586
587 punpcklbw mm0, mm7 // unpack low to word
588 pmullw mm3, mm5 // b * 2/5
589
590 movq mm4, mm0 // make copy of c
591 punpckhbw mm2, mm7 // unpack high to word
592
593 pmullw mm4, mm6 // c * 3/5
594 movq mm5, mm2
595
596 pmullw mm5, mm6 // c * 3/5
597 paddw mm1, mm4 // b * 2/5 + c * 3/5
598
599 paddw mm3, mm5 // b * 2/5 + c * 3/5
600 paddw mm1, round_values // + 128
601
602 paddw mm3, round_values // + 128
603 psrlw mm1, 8
604
605 psrlw mm3, 8
606 packuswb mm1, mm3 // des[2]
607
608 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
609 movq mm1, [edi] // mm1=Src[3];
610
611 movq QWORD ptr [edi+ecx], mm1 // write des[4]: unmodified copy of Src[3]
612
613 // mm0, mm2 --- Src[2] (unpacked low / high halves)
614 // mm1 --- Src[3]
615 // mm6 --- 3/5
616 // mm7 for unpacking
617
618 pmullw mm0, mm6 // c * 3/5
619 movq mm5, two_fifths // mm5 = 2/5
620
621 movq mm3, mm1 // make a copy
622 pmullw mm2, mm6 // c * 3/5
623
624 punpcklbw mm1, mm7 // unpack low
625 movq mm4, mm1 // make a copy
626
627 punpckhbw mm3, mm7 // unpack high
628 pmullw mm4, mm5 // d * 2/5
629
630 movq mm6, mm3 // make a copy
631 pmullw mm6, mm5 // d * 2/5
632
633 paddw mm0, mm4 // c * 3/5 + d * 2/5
634 paddw mm2, mm6 // c * 3/5 + d * 2/5
635
636 paddw mm0, round_values // + 128
637 paddw mm2, round_values // + 128
638
639 psrlw mm0, 8
640 psrlw mm2, 8
641
642 packuswb mm0, mm2 // des[3]
643 movq QWORD ptr [edi], mm0 // write des[3]
644
645 // mm1, mm3 --- Src[3] (unpacked; not used again)
646 // mm7 -- still zero for the next pass
647 add edi, 8 // advance both line pointers to the next eight columns
648 add esi, 8
649
650 sub edx, 8
651 jg last_vs_4_5_loop
652 }
653 }
654
655 /****************************************************************************
656 *
657 * ROUTINE : vertical_band_3_5_scale_mmx
658 *
659 * INPUTS : unsigned char *dest :
660 * unsigned int dest_pitch :
661 * unsigned int dest_width :
662 *
663 * OUTPUTS : None.
664 *
665 * RETURNS : void
666 *
667 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
668 *
669 * SPECIAL NOTES : The routine uses the first line of the band below
670 * the current band. The function also has an "C" only
671 * version.
672 *
673 ****************************************************************************/
/*
 * vertical_band_3_5_scale_mmx: in-place vertical 3-to-5 interpolation of one
 * band, eight columns per loop pass.
 *
 *   dest       - address of the first line of the band.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * With r0..r2 the three source lines at dest + 0..2 * dest_pitch and r5 the
 * line at dest + 5 * dest_pitch (first line of the next band), lines 1..4
 * are overwritten with
 *   line1 = (102*r0 + 154*r1 + 128) >> 8
 *   line2 = (205*r1 +  51*r2 + 128) >> 8
 *   line3 = ( 51*r1 + 205*r2 + 128) >> 8
 *   line4 = (154*r2 + 102*r5 + 128) >> 8
 * mm7 doubles as a scratch register in the middle of the loop and is zeroed
 * again before the next unpack that needs it.
 */
674 static
vertical_band_3_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)675 void vertical_band_3_5_scale_mmx
676 (
677 unsigned char *dest,
678 unsigned int dest_pitch,
679 unsigned int dest_width
680 )
681 {
682 __asm
683 {
684 mov esi, dest // Get the source and destination pointer
685 mov ecx, dest_pitch // Get the pitch size
686
687 lea edi, [esi+ecx*2] // two lines below
688 add edi, ecx // three lines below
689
690 pxor mm7, mm7 // clear out mm7
691 mov edx, dest_width // Loop counter
692
693 vs_3_5_loop:
694
695 movq mm0, QWORD ptr [esi] // src[0];
696 movq mm1, QWORD ptr [esi+ecx] // src[1];
697
698 movq mm2, mm0 // Make a copy
699 punpcklbw mm0, mm7 // unpack low to word
700
701 movq mm5, two_fifths // mm5 = 2/5
702 punpckhbw mm2, mm7 // unpack high to word
703
704 pmullw mm0, mm5 // a * 2/5
705
706 movq mm3, mm1 // make a copy
707 punpcklbw mm1, mm7 // unpack low to word
708
709 pmullw mm2, mm5 // a * 2/5
710 movq mm6, three_fifths // mm6 = 3/5
711
712 movq mm4, mm1 // copy of low b
713 pmullw mm4, mm6 // b * 3/5
714
715 punpckhbw mm3, mm7 // unpack high to word
716 movq mm5, mm3 // copy of high b
717
718 pmullw mm5, mm6 // b * 3/5
719 paddw mm0, mm4 // a * 2/5 + b * 3/5
720
721 paddw mm2, mm5 // a * 2/5 + b * 3/5
722 paddw mm0, round_values // + 128
723
724 paddw mm2, round_values // + 128
725 psrlw mm0, 8
726
727 psrlw mm2, 8
728 packuswb mm0, mm2 // des [1]
729
730 movq QWORD ptr [esi+ecx], mm0 // write des[1]
731 movq mm0, [esi+ecx*2] // mm0 = src[2]
732
733 // mm1, mm3 --- Src[1] (unpacked low / high halves)
734 // mm0 --- Src[2]
735 // mm7 for unpacking
736
737 movq mm4, mm1 // b low
738 pmullw mm1, four_fifths // b * 4/5 low
739
740 movq mm5, mm3 // b high
741 pmullw mm3, four_fifths // b * 4/5 high
742
743 movq mm2, mm0 // c
744 pmullw mm4, one_fifth // b * 1/5
745
746 punpcklbw mm0, mm7 // c low
747 pmullw mm5, one_fifth // b * 1/5
748
749 movq mm6, mm0 // make copy of c low
750 punpckhbw mm2, mm7 // c high
751
752 pmullw mm6, one_fifth // c * 1/5 low
753 movq mm7, mm2 // make copy of c high (mm7 is scratch from here until the pxor below)
754
755 pmullw mm7, one_fifth // c * 1/5 high
756 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
757
758 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
759 movq mm6, mm0 // make copy of c low
760
761 pmullw mm6, four_fifths // c * 4/5 low
762 movq mm7, mm2 // make copy of c high
763
764 pmullw mm7, four_fifths // c * 4/5 high
765
766 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
767 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
768
769 paddw mm1, round_values // + 128
770 paddw mm3, round_values // + 128
771
772 psrlw mm1, 8
773 psrlw mm3, 8
774
775 packuswb mm1, mm3 // des[2]
776 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
777
778 paddw mm4, round_values // + 128
779 paddw mm5, round_values // + 128
780
781 psrlw mm4, 8
782 psrlw mm5, 8
783
784 packuswb mm4, mm5 // des[3]
785 movq QWORD ptr [edi], mm4 // write des[3]
786
787 // mm0, mm2 --- still the unpacked Src[2] (c) loaded above
788
789 pxor mm7, mm7 // clear mm7 for unpacking
790 movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group (five lines below esi)
791
792 movq mm5, three_fifths // mm5 = 3/5
793 pmullw mm0, mm5 // c * 3/5
794
795 movq mm6, two_fifths // mm6 = 2/5
796 movq mm3, mm1 // make a copy
797
798 pmullw mm2, mm5 // c * 3/5
799 punpcklbw mm1, mm7 // unpack low
800
801 pmullw mm1, mm6 // an * 2/5
802 punpckhbw mm3, mm7 // unpack high
803
804 paddw mm0, mm1 // c * 3/5 + an * 2/5
805 pmullw mm3, mm6 // an * 2/5
806
807 paddw mm2, mm3 // c * 3/5 + an * 2/5
808 paddw mm0, round_values // + 128
809
810 paddw mm2, round_values // + 128
811 psrlw mm0, 8
812
813 psrlw mm2, 8
814 packuswb mm0, mm2 // des[4]
815
816 movq QWORD ptr [edi+ecx], mm0 // write des[4]
817
818 add edi, 8 // advance both line pointers to the next eight columns
819 add esi, 8
820
821 sub edx, 8
822 jg vs_3_5_loop
823 }
824 }
825
826 /****************************************************************************
827 *
828 * ROUTINE : last_vertical_band_3_5_scale_mmx
829 *
830 * INPUTS : unsigned char *dest :
831 * unsigned int dest_pitch :
832 * unsigned int dest_width :
833 *
834 * OUTPUTS : None.
835 *
836 * RETURNS : void
837 *
838 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
839 *
840 * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, this routine does
841 * not read the band below; the fifth output line is a copy
842 * of the third source line. The function also has a "C" only version.
843 *
844 ****************************************************************************/
/*
 * last_vertical_band_3_5_scale_mmx: in-place vertical 3-to-5 interpolation
 * for a band that has no band below it, eight columns per loop pass.
 *
 *   dest       - address of the first line of the band.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * With r0..r2 the three source lines at dest + 0..2 * dest_pitch, lines 1..4
 * are overwritten with
 *   line1 = (102*r0 + 154*r1 + 128) >> 8
 *   line2 = (205*r1 +  51*r2 + 128) >> 8
 *   line3 = ( 51*r1 + 205*r2 + 128) >> 8
 *   line4 = r2 (plain copy; nothing below the band is read)
 *
 * mm7 is the zero operand of every byte-to-word unpack but is also borrowed
 * as a scratch register in the middle of the loop. It is therefore zeroed
 * again at the bottom of the loop; without that, every pass after the first
 * would unpack against leftover products and write wrong pixels.
 */
845 static
last_vertical_band_3_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)846 void last_vertical_band_3_5_scale_mmx
847 (
848 unsigned char *dest,
849 unsigned int dest_pitch,
850 unsigned int dest_width
851 )
852 {
853 __asm
854 {
855 mov esi, dest // Get the source and destination pointer
856 mov ecx, dest_pitch // Get the pitch size
857
858 lea edi, [esi+ecx*2] // two lines below
859 add edi, ecx // three lines below
860
861 pxor mm7, mm7 // clear out mm7
862 mov edx, dest_width // Loop counter
863
864
865 last_vs_3_5_loop:
866
867 movq mm0, QWORD ptr [esi] // src[0];
868 movq mm1, QWORD ptr [esi+ecx] // src[1];
869
870 movq mm2, mm0 // Make a copy
871 punpcklbw mm0, mm7 // unpack low to word
872
873 movq mm5, two_fifths // mm5 = 2/5
874 punpckhbw mm2, mm7 // unpack high to word
875
876 pmullw mm0, mm5 // a * 2/5
877
878 movq mm3, mm1 // make a copy
879 punpcklbw mm1, mm7 // unpack low to word
880
881 pmullw mm2, mm5 // a * 2/5
882 movq mm6, three_fifths // mm6 = 3/5
883
884 movq mm4, mm1 // copy of low b
885 pmullw mm4, mm6 // b * 3/5
886
887 punpckhbw mm3, mm7 // unpack high to word
888 movq mm5, mm3 // copy of high b
889
890 pmullw mm5, mm6 // b * 3/5
891 paddw mm0, mm4 // a * 2/5 + b * 3/5
892
893 paddw mm2, mm5 // a * 2/5 + b * 3/5
894 paddw mm0, round_values // + 128
895
896 paddw mm2, round_values // + 128
897 psrlw mm0, 8
898
899 psrlw mm2, 8
900 packuswb mm0, mm2 // des [1]
901
902 movq QWORD ptr [esi+ecx], mm0 // write des[1]
903 movq mm0, [esi+ecx*2] // mm0 = src[2]
904
905
906
907 // mm1, mm3 --- Src[1] (unpacked low / high halves)
908 // mm0 --- Src[2]
909 // mm7 for unpacking
910
911 movq mm4, mm1 // b low
912 pmullw mm1, four_fifths // b * 4/5 low
913
914 movq QWORD ptr [edi+ecx], mm0 // write des[4]: unmodified copy of Src[2]
915
916 movq mm5, mm3 // b high
917 pmullw mm3, four_fifths // b * 4/5 high
918
919 movq mm2, mm0 // c
920 pmullw mm4, one_fifth // b * 1/5
921
922 punpcklbw mm0, mm7 // c low
923 pmullw mm5, one_fifth // b * 1/5
924
925 movq mm6, mm0 // make copy of c low
926 punpckhbw mm2, mm7 // c high
927
928 pmullw mm6, one_fifth // c * 1/5 low
929 movq mm7, mm2 // make copy of c high (mm7 is scratch from here on)
930
931 pmullw mm7, one_fifth // c * 1/5 high
932 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
933
934 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
935 movq mm6, mm0 // make copy of c low
936
937 pmullw mm6, four_fifths // c * 4/5 low
938 movq mm7, mm2 // make copy of c high
939
940 pmullw mm7, four_fifths // c * 4/5 high
941
942 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
943 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
944
945 paddw mm1, round_values // + 128
946 paddw mm3, round_values // + 128
947
948 psrlw mm1, 8
949 psrlw mm3, 8
950
951 packuswb mm1, mm3 // des[2]
952 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
953
954 paddw mm4, round_values // + 128
955 paddw mm5, round_values // + 128
956
957 psrlw mm4, 8
958 psrlw mm5, 8
959
960 packuswb mm4, mm5 // des[3]
961 movq QWORD ptr [edi], mm4 // write des[3]
962
963 // mm0, mm2 --- unpacked Src[2] (not used again)
964
pxor mm7, mm7 // restore the zero operand: mm7 was used as scratch above and the next pass unpacks against it
965 add edi, 8 // advance both line pointers to the next eight columns
966 add esi, 8
967
968 sub edx, 8
969 jg last_vs_3_5_loop
970 }
971 }
972
973 /****************************************************************************
974 *
975 * ROUTINE : vertical_band_1_2_scale_mmx
976 *
977 * INPUTS : unsigned char *dest :
978 * unsigned int dest_pitch :
979 * unsigned int dest_width :
980 *
981 * OUTPUTS : None.
982 *
983 * RETURNS : void
984 *
985 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
986 *
987 * SPECIAL NOTES : The routine uses the first line of the band below
988 * the current band. The function also has an "C" only
989 * version.
990 *
991 ****************************************************************************/
/*
 * vertical_band_1_2_scale_mmx: fill the line between two source lines with
 * their rounded average, eight columns per loop pass.
 *
 *   dest       - address of the upper source line.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * Reads the lines at dest and dest + 2 * dest_pitch and writes
 *   (upper + lower + 1) >> 1
 * to the line at dest + dest_pitch.
 */
992 static
vertical_band_1_2_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)993 void vertical_band_1_2_scale_mmx
994 (
995 unsigned char *dest,
996 unsigned int dest_pitch,
997 unsigned int dest_width
998 )
999 {
1000 __asm
1001 {
1002
1003 mov esi, dest // Get the source and destination pointer
1004 mov ecx, dest_pitch // Get the pitch size
1005
1006 pxor mm7, mm7 // clear out mm7
1007 mov edx, dest_width // Loop counter
1008
1009 vs_1_2_loop:
1010
1011 movq mm0, [esi] // get Src[0]
1012 movq mm1, [esi + ecx * 2] // get Src[1] (two lines below Src[0])
1013
1014 movq mm2, mm0 // make copy before unpack
1015 movq mm3, mm1 // make copy before unpack
1016
1017 punpcklbw mm0, mm7 // low Src[0]
1018 movq mm6, four_ones // mm6= 1, 1, 1, 1
1019
1020 punpcklbw mm1, mm7 // low Src[1]
1021 paddw mm0, mm1 // low (a + b)
1022
1023 punpckhbw mm2, mm7 // high Src[0]
1024 paddw mm0, mm6 // low (a + b + 1)
1025
1026 punpckhbw mm3, mm7 // high Src[1]
1027 paddw mm2, mm3 // high (a + b )
1028
1029 psraw mm0, 1 // low (a + b +1 )/2
1030 paddw mm2, mm6 // high (a + b + 1)
1031
1032 psraw mm2, 1 // high (a + b + 1)/2
1033 packuswb mm0, mm2 // pack results
1034
1035 movq [esi+ecx], mm0 // write out eight bytes to the line in between
1036 add esi, 8
1037
1038 sub edx, 8
1039 jg vs_1_2_loop
1040 }
1041
1042 }
1043
1044 /****************************************************************************
1045 *
1046 * ROUTINE : last_vertical_band_1_2_scale_mmx
1047 *
1048 * INPUTS : unsigned char *dest :
1049 * unsigned int dest_pitch :
1050 * unsigned int dest_width :
1051 *
1052 * OUTPUTS : None.
1053 *
1054 * RETURNS : void
1055 *
1056 * FUNCTION : 1 to 2 up-scaling of the last band of pixels.
1057 *
1058 * SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx, this routine does
1059 * not read the band below; it copies the source line into
1060 * the line beneath it. The function also has a "C" only version.
1061 *
1062 ****************************************************************************/
/*
 * last_vertical_band_1_2_scale_mmx: duplicate a line into the line directly
 * below it, eight columns per loop pass.
 *
 *   dest       - address of the source line.
 *   dest_pitch - byte offset between consecutive lines.
 *   dest_width - column count; decremented by 8 per pass, and the body always
 *                runs at least once.
 *
 * No averaging is done: the bytes at dest are copied to dest + dest_pitch.
 */
1063 static
last_vertical_band_1_2_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1064 void last_vertical_band_1_2_scale_mmx
1065 (
1066 unsigned char *dest,
1067 unsigned int dest_pitch,
1068 unsigned int dest_width
1069 )
1070 {
1071 __asm
1072 {
1073 mov esi, dest // Get the source and destination pointer
1074 mov ecx, dest_pitch // Get the pitch size
1075
1076 mov edx, dest_width // Loop counter
1077
1078 last_vs_1_2_loop:
1079
1080 movq mm0, [esi] // get Src[0]
1081 movq [esi+ecx], mm0 // write out eight bytes, one line below
1082
1083 add esi, 8
1084 sub edx, 8
1085
1086 jg last_vs_1_2_loop
1087 }
1088 }
1089
1090 /****************************************************************************
1091 *
1092 * ROUTINE : horizontal_line_1_2_scale_mmx
1093 *
1094 * INPUTS : const unsigned char *source :
1095 * unsigned int source_width :
1096 * unsigned char *dest :
1097 * unsigned int dest_width :
1098 *
1099 * OUTPUTS : None.
1100 *
1101 * RETURNS : void
1102 *
1103 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1104 *
1105 * SPECIAL NOTES : None.
1106 *
1107 ****************************************************************************/
/*
 * horizontal_line_1_2_scale_mmx: double the length of one row of pixels.
 * Every eight source pixels p0..p7 become sixteen output bytes
 *   p0, avg(p0,p1), p1, avg(p1,p2), ..., p7, avg(p7,p8)
 * where avg(a,b) = (a + b + 1) >> 1.
 *
 *   source       - input row, read 8 bytes at a time.
 *   source_width - number of source bytes; counted down by 8 per loop pass.
 *   dest         - output row, advanced by 16 bytes per pass.
 *   dest_width   - unused (cast to void).
 *
 * The code after the loop handles the last eight pixels without reading
 * beyond them: p7 is used in place of p8, so the final output byte is p7.
 *
 * NOTE(review): the loop body runs before the count is tested, so a
 * source_width of 8 or less still executes one loop pass plus the tail -
 * presumably callers only pass widths of at least 16; confirm.
 */
1108 static
horizontal_line_1_2_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1109 void horizontal_line_1_2_scale_mmx
1110 (
1111 const unsigned char *source,
1112 unsigned int source_width,
1113 unsigned char *dest,
1114 unsigned int dest_width
1115 )
1116 {
1117 (void) dest_width;
1118
1119 __asm
1120 {
1121 mov esi, source
1122 mov edi, dest
1123
1124 pxor mm7, mm7 // zero operand for byte-to-word unpacks
1125 movq mm6, four_ones // rounding term
1126
1127 mov ecx, source_width // remaining source pixels
1128
1129 hs_1_2_loop:
1130
1131 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
1132 movq mm1, [esi+1] // mm1 = 01 02 03 04 05 06 07 08
1133
1134 movq mm2, mm0
1135 movq mm3, mm1
1136
1137 movq mm4, mm0 // keep the original pixels for the interleave
1138 punpcklbw mm0, mm7 // low four pixels as words
1139
1140 punpcklbw mm1, mm7 // low four right-hand neighbours as words
1141 paddw mm0, mm1 // low (a + b)
1142
1143 paddw mm0, mm6 // low (a + b + 1)
1144 punpckhbw mm2, mm7 // high four pixels as words
1145
1146 punpckhbw mm3, mm7 // high four right-hand neighbours as words
1147 paddw mm2, mm3 // high (a + b)
1148
1149 paddw mm2, mm6 // high (a + b + 1)
1150 psraw mm0, 1 // low (a + b + 1)/2
1151
1152 psraw mm2, 1 // high (a + b + 1)/2
1153 packuswb mm0, mm2 // mm0 = eight averages as bytes
1154
1155 movq mm2, mm4
1156 punpcklbw mm2, mm0 // interleave pixels 0-3 with their averages
1157
1158 movq [edi], mm2 // write outputs 0-7
1159 punpckhbw mm4, mm0 // interleave pixels 4-7 with their averages
1160
1161 movq [edi+8], mm4 // write outputs 8-15
1162 add esi, 8
1163
1164 add edi, 16
1165 sub ecx, 8
1166
1167 cmp ecx, 8 // keep looping while more than eight pixels remain
1168 jg hs_1_2_loop
1169
1170 // last eight pixels: build the neighbour vector from the same load
1171
1172 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
1173 movq mm1, mm0
1174
1175 movq mm2, mm0
1176 movq mm3, mm1
1177
1178 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
1179 psrlq mm3, 56 // mm3 = 07 00 00 00 00 00 00 00
1180
1181 psllq mm3, 56 // mm3 = 00 00 00 00 00 00 00 07
1182 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
1183
1184 movq mm3, mm1
1185 movq mm4, mm0 // keep the original pixels for the interleave
1186
1187 punpcklbw mm0, mm7
1188 punpcklbw mm1, mm7
1189
1190 paddw mm0, mm1 // low (a + b)
1191 paddw mm0, mm6 // low (a + b + 1)
1192
1193 punpckhbw mm2, mm7
1194 punpckhbw mm3, mm7
1195
1196 paddw mm2, mm3 // high (a + b)
1197 paddw mm2, mm6 // high (a + b + 1)
1198
1199 psraw mm0, 1
1200 psraw mm2, 1
1201
1202 packuswb mm0, mm2 // mm0 = eight averages as bytes
1203 movq mm2, mm4
1204
1205 punpcklbw mm2, mm0 // interleave pixels 0-3 with their averages
1206 movq [edi], mm2 // write outputs 0-7
1207
1208 punpckhbw mm4, mm0 // interleave pixels 4-7 with their averages
1209 movq [edi+8], mm4 // write outputs 8-15
1210 }
1211 }
1212
1213
1214
1215
1216
/*
 * Per-lane weights (fractions of 256) for horizontal_line_5_4_scale_mmx.
 * Lane i of const54_1 scales source pixel i and lane i of const54_2 scales
 * source pixel i + 1; the two weights of every lane add up to 256.
 */
1217 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; // weights of pixels 1..4
1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; // weights of pixels 0..3
1219
1220
1221 /****************************************************************************
1222 *
1223 * ROUTINE : horizontal_line_5_4_scale_mmx
1224 *
1225 * INPUTS : const unsigned char *source : Pointer to source data.
1226 * unsigned int source_width : Number of source pixels in the line.
1227 * unsigned char *dest : Pointer to destination data.
1228 * unsigned int dest_width : Stride of destination (NOT USED).
1229 *
1230 * OUTPUTS : None.
1231 *
1232 * RETURNS : void
1233 *
1234 * FUNCTION : Copies horizontal line of pixels from source to
1235 * destination scaling down from 5 to 4.
1236 *
1237 * SPECIAL NOTES : None.
1238 *
1239 ****************************************************************************/
/*
 * horizontal_line_5_4_scale_mmx: shrink one row of pixels from 5 to 4
 * samples per group with MMX inline assembly.
 *
 *   source       - input row; 8 bytes are loaded per group, pointer steps by 5.
 *   source_width - number of source bytes; source + source_width ends the loop.
 *   dest         - output row; 4 bytes are written per group.
 *   dest_width   - unused (cast to void).
 *
 * For source pixels p0..p4 each group emits
 *   p0,
 *   (192*p1 +  64*p2 + 128) >> 8,
 *   (128*p2 + 128*p3 + 128) >> 8,
 *   ( 64*p3 + 192*p4 + 128) >> 8,
 * which is what the commented-out C loop kept inside the body computes.
 *
 * The register-layout notes used to sit on lines of their own with no
 * comment marker, which made them part of the instruction stream; they are
 * now ordinary trailing comments. Layouts list bytes in memory order.
 *
 * NOTE(review): every group loads 8 bytes but consumes 5, so the last group
 * reads 3 bytes past the end of the row - presumably callers guarantee
 * padding; confirm.
 */
1240 static
horizontal_line_5_4_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1241 void horizontal_line_5_4_scale_mmx
1242 (
1243 const unsigned char *source,
1244 unsigned int source_width,
1245 unsigned char *dest,
1246 unsigned int dest_width
1247 )
1248 {
1249 /*
1250 unsigned i;
1251 unsigned int a, b, c, d, e;
1252 unsigned char *des = dest;
1253 const unsigned char *src = source;
1254
1255 (void) dest_width;
1256
1257 for ( i=0; i<source_width; i+=5 )
1258 {
1259 a = src[0];
1260 b = src[1];
1261 c = src[2];
1262 d = src[3];
1263 e = src[4];
1264
1265 des[0] = a;
1266 des[1] = ((b*192 + c* 64 + 128)>>8);
1267 des[2] = ((c*128 + d*128 + 128)>>8);
1268 des[3] = ((d* 64 + e*192 + 128)>>8);
1269
1270 src += 5;
1271 des += 4;
1272 }
1273 */
1274 (void) dest_width;
1275
1276 __asm
1277 {
1278
1279 mov esi, source
1280 mov edi, dest
1281
1282 mov ecx, source_width
1283 movq mm5, const54_1 // mm5 = weights 256 192 128 64 for pixels 0..3
1284
1285 pxor mm7, mm7 // zero operand for byte-to-word unpacks
1286 movq mm6, const54_2 // mm6 = weights 0 64 128 192 for pixels 1..4
1287
1288 movq mm4, round_values // mm4 = 128 in every lane
1289 lea edx, [esi+ecx] // edx = source + source_width, the loop bound
1290 horizontal_line_5_4_loop:
1291
1292 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
1294 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
1296
1297 psrlq mm0, 8 // mm0 = 01 02 03 04 05 06 07 00
1299 punpcklbw mm1, mm7 // mm1 = 00 xx 01 xx 02 xx 03 xx
1301
1302 punpcklbw mm0, mm7 // mm0 = 01 xx 02 xx 03 xx 04 xx
1304 pmullw mm1, mm5 // pixels 0..3 times const54_1
1305
1306 pmullw mm0, mm6 // pixels 1..4 times const54_2
1307 add esi, 5
1308
1309 add edi, 4
1310 paddw mm1, mm0 // weighted sums for the four outputs
1311
1312 paddw mm1, mm4 // + 128
1313 psrlw mm1, 8
1314
1315 cmp esi, edx // flags are consumed by the jl below
1316 packuswb mm1, mm7
1317
1318 movd DWORD PTR [edi-4], mm1 // write the four outputs of the group just processed
1319
1320 jl horizontal_line_5_4_loop
1321
1322 }
1323
1324 }
1325 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
1326 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
1328
1329 static
vertical_band_5_4_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1331 {
1332
1333 __asm
1334 {
1335 push ebx
1336
1337 mov esi, source // Get the source and destination pointer
1338 mov ecx, src_pitch // Get the pitch size
1339
1340 mov edi, dest // tow lines below
1341 pxor mm7, mm7 // clear out mm7
1342
1343 mov edx, dest_pitch // Loop counter
1344 mov ebx, dest_width
1345
1346 vs_5_4_loop:
1347
1348 movd mm0, DWORD ptr [esi] // src[0];
1349 movd mm1, DWORD ptr [esi+ecx] // src[1];
1350
1351 movd mm2, DWORD ptr [esi+ecx*2]
1352 lea eax, [esi+ecx*2] //
1353
1354 punpcklbw mm1, mm7
1355 punpcklbw mm2, mm7
1356
1357 movq mm3, mm2
1358 pmullw mm1, three_fourths
1359
1360 pmullw mm2, one_fourths
1361 movd mm4, [eax+ecx]
1362
1363 pmullw mm3, two_fourths
1364 punpcklbw mm4, mm7
1365
1366 movq mm5, mm4
1367 pmullw mm4, two_fourths
1368
1369 paddw mm1, mm2
1370 movd mm6, [eax+ecx*2]
1371
1372 pmullw mm5, one_fourths
1373 paddw mm1, round_values;
1374
1375 paddw mm3, mm4
1376 psrlw mm1, 8
1377
1378 punpcklbw mm6, mm7
1379 paddw mm3, round_values
1380
1381 pmullw mm6, three_fourths
1382 psrlw mm3, 8
1383
1384 packuswb mm1, mm7
1385 packuswb mm3, mm7
1386
1387 movd DWORD PTR [edi], mm0
1388 movd DWORD PTR [edi+edx], mm1
1389
1390
1391 paddw mm5, mm6
1392 movd DWORD PTR [edi+edx*2], mm3
1393
1394 lea eax, [edi+edx*2]
1395 paddw mm5, round_values
1396
1397 psrlw mm5, 8
1398 add edi, 4
1399
1400 packuswb mm5, mm7
1401 movd DWORD PTR [eax+edx], mm5
1402
1403 add esi, 4
1404 sub ebx, 4
1405
1406 jg vs_5_4_loop
1407
1408 pop ebx
1409 }
1410 }
1411
1412
1413 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
1415
1416
1417 static
horizontal_line_5_3_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1418 void horizontal_line_5_3_scale_mmx
1419 (
1420 const unsigned char *source,
1421 unsigned int source_width,
1422 unsigned char *dest,
1423 unsigned int dest_width
1424 )
1425 {
1426
1427 (void) dest_width;
1428 __asm
1429 {
1430
1431 mov esi, source ;
1432 mov edi, dest ;
1433
1434 mov ecx, source_width ;
1435 movq mm5, const53_1 ;
1436
1437 pxor mm7, mm7 ;
1438 movq mm6, const53_2 ;
1439
1440 movq mm4, round_values ;
1441 lea edx, [esi+ecx-5] ;
1442 horizontal_line_5_3_loop:
1443
1444 movq mm0, QWORD PTR [esi] ;
1445 00 01 02 03 04 05 06 07
1446 movq mm1, mm0 ;
1447 00 01 02 03 04 05 06 07
1448
1449 psllw mm0, 8 ;
1450 xx 00 xx 02 xx 04 xx 06
1451 psrlw mm1, 8 ;
1452 01 xx 03 xx 05 xx 07 xx
1453
1454 psrlw mm0, 8 ;
1455 00 xx 02 xx 04 xx 06 xx
1456 psllq mm1, 16 ;
1457 xx xx 01 xx 03 xx 05 xx
1458
1459 pmullw mm0, mm6
1460
1461 pmullw mm1, mm5
1462 add esi, 5
1463
1464 add edi, 3
1465 paddw mm1, mm0
1466
1467 paddw mm1, mm4
1468 psrlw mm1, 8
1469
1470 cmp esi, edx
1471 packuswb mm1, mm7
1472
1473 movd DWORD PTR [edi-3], mm1
1474 jl horizontal_line_5_3_loop
1475
1476 //exit condition
1477 movq mm0, QWORD PTR [esi] ;
1478 00 01 02 03 04 05 06 07
1479 movq mm1, mm0 ;
1480 00 01 02 03 04 05 06 07
1481
1482 psllw mm0, 8 ;
1483 xx 00 xx 02 xx 04 xx 06
1484 psrlw mm1, 8 ;
1485 01 xx 03 xx 05 xx 07 xx
1486
1487 psrlw mm0, 8 ;
1488 00 xx 02 xx 04 xx 06 xx
1489 psllq mm1, 16 ;
1490 xx xx 01 xx 03 xx 05 xx
1491
1492 pmullw mm0, mm6
1493
1494 pmullw mm1, mm5
1495 paddw mm1, mm0
1496
1497 paddw mm1, mm4
1498 psrlw mm1, 8
1499
1500 packuswb mm1, mm7
1501 movd eax, mm1
1502
1503 mov edx, eax
1504 shr edx, 16
1505
1506 mov WORD PTR[edi], ax
1507 mov BYTE PTR[edi+2], dl
1508
1509 }
1510
1511 }
1512
1513 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1515
1516 static
vertical_band_5_3_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1518 {
1519
1520 __asm
1521 {
1522 push ebx
1523
1524 mov esi, source // Get the source and destination pointer
1525 mov ecx, src_pitch // Get the pitch size
1526
1527 mov edi, dest // tow lines below
1528 pxor mm7, mm7 // clear out mm7
1529
1530 mov edx, dest_pitch // Loop counter
1531 movq mm5, one_thirds
1532
1533 movq mm6, two_thirds
1534 mov ebx, dest_width;
1535
1536 vs_5_3_loop:
1537
1538 movd mm0, DWORD ptr [esi] // src[0];
1539 movd mm1, DWORD ptr [esi+ecx] // src[1];
1540
1541 movd mm2, DWORD ptr [esi+ecx*2]
1542 lea eax, [esi+ecx*2] //
1543
1544 punpcklbw mm1, mm7
1545 punpcklbw mm2, mm7
1546
1547 pmullw mm1, mm5
1548 pmullw mm2, mm6
1549
1550 movd mm3, DWORD ptr [eax+ecx]
1551 movd mm4, DWORD ptr [eax+ecx*2]
1552
1553 punpcklbw mm3, mm7
1554 punpcklbw mm4, mm7
1555
1556 pmullw mm3, mm6
1557 pmullw mm4, mm5
1558
1559
1560 movd DWORD PTR [edi], mm0
1561 paddw mm1, mm2
1562
1563 paddw mm1, round_values
1564 psrlw mm1, 8
1565
1566 packuswb mm1, mm7
1567 paddw mm3, mm4
1568
1569 paddw mm3, round_values
1570 movd DWORD PTR [edi+edx], mm1
1571
1572 psrlw mm3, 8
1573 packuswb mm3, mm7
1574
1575 movd DWORD PTR [edi+edx*2], mm3
1576
1577
1578 add edi, 4
1579 add esi, 4
1580
1581 sub ebx, 4
1582 jg vs_5_3_loop
1583
1584 pop ebx
1585 }
1586 }
1587
1588
1589
1590
1591 /****************************************************************************
1592 *
1593 * ROUTINE : horizontal_line_2_1_scale
1594 *
1595 * INPUTS : const unsigned char *source :
1596 * unsigned int source_width :
1597 * unsigned char *dest :
1598 * unsigned int dest_width :
1599 *
1600 * OUTPUTS : None.
1601 *
1602 * RETURNS : void
1603 *
1604 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1605 *
1606 * SPECIAL NOTES : None.
1607 *
1608 ****************************************************************************/
1609 static
horizontal_line_2_1_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1610 void horizontal_line_2_1_scale_mmx
1611 (
1612 const unsigned char *source,
1613 unsigned int source_width,
1614 unsigned char *dest,
1615 unsigned int dest_width
1616 )
1617 {
1618 (void) dest_width;
1619 (void) source_width;
1620 __asm
1621 {
1622 mov esi, source
1623 mov edi, dest
1624
1625 pxor mm7, mm7
1626 mov ecx, dest_width
1627
1628 xor edx, edx
1629 hs_2_1_loop:
1630
1631 movq mm0, [esi+edx*2]
1632 psllw mm0, 8
1633
1634 psrlw mm0, 8
1635 packuswb mm0, mm7
1636
1637 movd DWORD Ptr [edi+edx], mm0;
1638 add edx, 4
1639
1640 cmp edx, ecx
1641 jl hs_2_1_loop
1642
1643 }
1644 }
1645
1646
1647
/*
 * 2-to-1 vertical scaling by decimation: the output row is a straight copy
 * of the row at `source`, dest_width bytes long.  Both pitch arguments are
 * unused.
 */
static
void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    (void) dest_pitch;
    (void) src_pitch;
    vpx_memcpy(dest, source, dest_width);
}
1655
1656
1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
1658 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
1659
1660 static
vertical_band_2_1_scale_i_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1662 {
1663
1664 (void) dest_pitch;
1665 __asm
1666 {
1667 mov esi, source
1668 mov edi, dest
1669
1670 mov eax, src_pitch
1671 mov edx, dest_width
1672
1673 pxor mm7, mm7
1674 sub esi, eax //back one line
1675
1676
1677 lea ecx, [esi+edx];
1678 movq mm6, round_values;
1679
1680 movq mm5, three_sixteenths;
1681 movq mm4, ten_sixteenths;
1682
1683 vs_2_1_i_loop:
1684 movd mm0, [esi] //
1685 movd mm1, [esi+eax] //
1686
1687 movd mm2, [esi+eax*2] //
1688 punpcklbw mm0, mm7
1689
1690 pmullw mm0, mm5
1691 punpcklbw mm1, mm7
1692
1693 pmullw mm1, mm4
1694 punpcklbw mm2, mm7
1695
1696 pmullw mm2, mm5
1697 paddw mm0, round_values
1698
1699 paddw mm1, mm2
1700 paddw mm0, mm1
1701
1702 psrlw mm0, 8
1703 packuswb mm0, mm7
1704
1705 movd DWORD PTR [edi], mm0
1706 add esi, 4
1707
1708 add edi, 4;
1709 cmp esi, ecx
1710 jl vs_2_1_i_loop
1711
1712 }
1713 }
1714
1715
1716
1717 void
register_mmxscalers(void)1718 register_mmxscalers(void)
1719 {
1720 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1721 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1722 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1723 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1724 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1725 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1726 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1727 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1728 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1729
1730 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
1731 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
1732 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
1733 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
1734 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
1735 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
1736
1737
1738
1739 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1740 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1741 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1742 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1743 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1744 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1745 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
1746
1747
1748
1749
1750 }
1751