• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 /****************************************************************************
13 *
14 *   Module Title :     scaleopt.cpp
15 *
16 *   Description  :     Optimized scaling functions
17 *
18 ****************************************************************************/
19 #include "pragmas.h"
20 
21 
22 
23 /****************************************************************************
24 *  Module Statics
25 ****************************************************************************/
26 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
34 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
35 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
37 
38 
39 
40 #include "vpx_scale/vpxscale.h"
41 #include "vpx_mem/vpx_mem.h"
42 
43 /****************************************************************************
44  *
45  *  ROUTINE       : horizontal_line_3_5_scale_mmx
46  *
47  *  INPUTS        : const unsigned char *source :
48  *                  unsigned int source_width    :
49  *                  unsigned char *dest         :
50  *                  unsigned int dest_width      :
51  *
52  *  OUTPUTS       : None.
53  *
54  *  RETURNS       : void
55  *
56  *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
57  *
58  *  SPECIAL NOTES : None.
59  *
60  ****************************************************************************/
61 static
horizontal_line_3_5_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)62 void horizontal_line_3_5_scale_mmx
63 (
64     const unsigned char *source,
65     unsigned int source_width,
66     unsigned char *dest,
67     unsigned int dest_width
68 )
69 {
70     (void) dest_width;
71 
72     __asm
73     {
74 
75         push ebx
76 
77         mov         esi,    source
78         mov         edi,    dest
79 
80         mov         ecx,    source_width
81         lea         edx,    [esi+ecx-3];
82 
83         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
84         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
85 
86         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
87         pxor        mm7,    mm7             // clear mm7
88 
89         horiz_line_3_5_loop:
90 
91         mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
92         mov        ebx,    eax
93 
94         and         ebx,    0xffff00        // ebx = xx 01 02 xx
95         mov         ecx,    eax             // ecx = 00 01 02 03
96 
97         and         eax,    0xffff0000      // eax = xx xx 02 03
98         xor         ecx,    eax             // ecx = 00 01 xx xx
99 
100         shr         ebx,    8               // ebx = 01 02 xx xx
101         or          eax,    ebx             // eax = 01 02 02 03
102 
103         shl         ebx,    16              // ebx = xx xx 01 02
104         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
105 
106         or          ebx,    ecx             // ebx = 00 01 01 02
107         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
108 
109         movd        mm0,    ebx             // mm0 = 00 01 01 02
110         pmullw      mm1,    mm6             //
111 
112         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
113         pmullw      mm0,    mm5             //
114 
115         mov         [edi],  ebx             // writeoutput 00 xx xx xx
116         add         esi,    3
117 
118         add         edi,    5
119         paddw       mm0,    mm1
120 
121         paddw       mm0,    mm4
122         psrlw       mm0,    8
123 
124         cmp         esi,    edx
125         packuswb    mm0,    mm7
126 
127         movd        DWORD Ptr [edi-4], mm0
128         jl          horiz_line_3_5_loop
129 
130 //Exit:
131         mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
132         mov         ebx,    eax
133 
134         and         ebx,    0xffff00        // ebx = xx 01 02 xx
135         mov         ecx,    eax             // ecx = 00 01 02 03
136 
137         and         eax,    0xffff0000      // eax = xx xx 02 03
138         xor         ecx,    eax             // ecx = 00 01 xx xx
139 
140         shr         ebx,    8               // ebx = 01 02 xx xx
141         or          eax,    ebx             // eax = 01 02 02 03
142 
143         shl         eax,    8               // eax = xx 01 02 02
144         and         eax,    0xffff0000      // eax = xx xx 02 02
145 
146         or          eax,    ebx             // eax = 01 02 02 02
147 
148         shl         ebx,    16              // ebx = xx xx 01 02
149         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
150 
151         or          ebx,    ecx             // ebx = 00 01 01 02
152         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
153 
154         movd        mm0,    ebx             // mm0 = 00 01 01 02
155         pmullw      mm1,    mm6             //
156 
157         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
158         pmullw      mm0,    mm5             //
159 
160         mov         [edi],  ebx             // writeoutput 00 xx xx xx
161         paddw       mm0,    mm1
162 
163         paddw       mm0,    mm4
164         psrlw       mm0,    8
165 
166         packuswb    mm0,    mm7
167         movd        DWORD Ptr [edi+1], mm0
168 
169         pop ebx
170 
171     }
172 
173 }
174 
175 
176 /****************************************************************************
177  *
178  *  ROUTINE       : horizontal_line_4_5_scale_mmx
179  *
180  *  INPUTS        : const unsigned char *source :
181  *                  unsigned int source_width    :
182  *                  unsigned char *dest         :
183  *                  unsigned int dest_width      :
184  *
185  *  OUTPUTS       : None.
186  *
187  *  RETURNS       : void
188  *
189  *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
190  *
191  *  SPECIAL NOTES : None.
192  *
193  ****************************************************************************/
194 static
horizontal_line_4_5_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)195 void horizontal_line_4_5_scale_mmx
196 (
197     const unsigned char *source,
198     unsigned int source_width,
199     unsigned char *dest,
200     unsigned int dest_width
201 )
202 {
203     (void)dest_width;
204 
205     __asm
206     {
207 
208         mov         esi,    source
209         mov         edi,    dest
210 
211         mov         ecx,    source_width
212         lea         edx,    [esi+ecx-8];
213 
214         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
215         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
216 
217         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
218         pxor        mm7,    mm7             // clear mm7
219 
220         horiz_line_4_5_loop:
221 
222         movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
223         movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
224 
225         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
226         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
227 
228         movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
229         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
230 
231         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
232         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
233 
234         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
235         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
236 
237         movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
238         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
239 
240         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
241         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
242 
243         paddw       mm0,    mm1             // added round values
244         paddw       mm0,    mm4
245 
246         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
247         packuswb    mm0,    mm7
248 
249         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
250         add         edi,    10
251 
252         add         esi,    8
253         paddw       mm2,    mm3             //
254 
255         paddw       mm2,    mm4             // added round values
256         cmp         esi,    edx
257 
258         psrlw       mm2,    8
259         packuswb    mm2,    mm7
260 
261         movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
262         jl         horiz_line_4_5_loop
263 
264 //Exit:
265         movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
266         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
267 
268         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
269         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
270 
271         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
272         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
273 
274         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
275         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
276 
277         movq        mm3,    mm1
278 
279         movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
280         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
281 
282         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
283         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
284 
285         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
286         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
287 
288         movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
289         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
290 
291         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
292         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
293 
294         paddw       mm0,    mm1             // added round values
295         paddw       mm0,    mm4
296 
297         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
298         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
299 
300         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
301         paddw       mm2,    mm3             //
302 
303         paddw       mm2,    mm4             // added round values
304         psrlw       mm2,    8
305 
306         packuswb    mm2,    mm7
307         movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
308 
309 
310     }
311 }
312 
313 /****************************************************************************
314  *
315  *  ROUTINE       : vertical_band_4_5_scale_mmx
316  *
317  *  INPUTS        : unsigned char *dest    :
318  *                  unsigned int dest_pitch :
319  *                  unsigned int dest_width :
320  *
321  *  OUTPUTS       : None.
322  *
323  *  RETURNS       : void
324  *
325  *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
326  *
327  *  SPECIAL NOTES : The routine uses the first line of the band below
328  *                  the current band. The function also has a "C" only
329  *                  version.
330  *
331  ****************************************************************************/
332 static
vertical_band_4_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)333 void vertical_band_4_5_scale_mmx
334 (
335     unsigned char *dest,
336     unsigned int dest_pitch,
337     unsigned int dest_width
338 )
339 {
340     __asm
341     {
342 
343         mov         esi,    dest                    // Get the source and destination pointer
344         mov         ecx,    dest_pitch               // Get the pitch size
345 
346         lea         edi,    [esi+ecx*2]             // tow lines below
347         add         edi,    ecx                     // three lines below
348 
349         pxor        mm7,    mm7                     // clear out mm7
350         mov         edx,    dest_width               // Loop counter
351 
352         vs_4_5_loop:
353 
354         movq        mm0,    QWORD ptr [esi]         // src[0];
355         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
356 
357         movq        mm2,    mm0                     // Make a copy
358         punpcklbw   mm0,    mm7                     // unpack low to word
359 
360         movq        mm5,    one_fifth
361         punpckhbw   mm2,    mm7                     // unpack high to word
362 
363         pmullw      mm0,    mm5                     // a * 1/5
364 
365         movq        mm3,    mm1                     // make a copy
366         punpcklbw   mm1,    mm7                     // unpack low to word
367 
368         pmullw      mm2,    mm5                     // a * 1/5
369         movq        mm6,    four_fifths               // constan
370 
371         movq        mm4,    mm1                     // copy of low b
372         pmullw      mm4,    mm6                     // b * 4/5
373 
374         punpckhbw   mm3,    mm7                     // unpack high to word
375         movq        mm5,    mm3                     // copy of high b
376 
377         pmullw      mm5,    mm6                     // b * 4/5
378         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
379 
380         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
381         paddw       mm0,    round_values             // + 128
382 
383         paddw       mm2,    round_values             // + 128
384         psrlw       mm0,    8
385 
386         psrlw       mm2,    8
387         packuswb    mm0,    mm2                     // des [1]
388 
389         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
390         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
391 
392         // mm1, mm3 --- Src[1]
393         // mm0 --- Src[2]
394         // mm7 for unpacking
395 
396         movq        mm5,    two_fifths
397         movq        mm2,    mm0                     // make a copy
398 
399         pmullw      mm1,    mm5                     // b * 2/5
400         movq        mm6,    three_fifths
401 
402 
403         punpcklbw   mm0,    mm7                     // unpack low to word
404         pmullw      mm3,    mm5                     // b * 2/5
405 
406         movq        mm4,    mm0                     // make copy of c
407         punpckhbw   mm2,    mm7                     // unpack high to word
408 
409         pmullw      mm4,    mm6                     // c * 3/5
410         movq        mm5,    mm2
411 
412         pmullw      mm5,    mm6                     // c * 3/5
413         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
414 
415         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
416         paddw       mm1,    round_values             // + 128
417 
418         paddw       mm3,    round_values             // + 128
419         psrlw       mm1,    8
420 
421         psrlw       mm3,    8
422         packuswb    mm1,    mm3                     // des[2]
423 
424         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
425         movq        mm1,    [edi]                   // mm1=Src[3];
426 
427         // mm0, mm2 --- Src[2]
428         // mm1 --- Src[3]
429         // mm6 --- 3/5
430         // mm7 for unpacking
431 
432         pmullw      mm0,    mm6                     // c * 3/5
433         movq        mm5,    two_fifths               // mm5 = 2/5
434 
435         movq        mm3,    mm1                     // make a copy
436         pmullw      mm2,    mm6                     // c * 3/5
437 
438         punpcklbw   mm1,    mm7                     // unpack low
439         movq        mm4,    mm1                     // make a copy
440 
441         punpckhbw   mm3,    mm7                     // unpack high
442         pmullw      mm4,    mm5                     // d * 2/5
443 
444         movq        mm6,    mm3                     // make a copy
445         pmullw      mm6,    mm5                     // d * 2/5
446 
447         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
448         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
449 
450         paddw       mm0,    round_values             // + 128
451         paddw       mm2,    round_values             // + 128
452 
453         psrlw       mm0,    8
454         psrlw       mm2,    8
455 
456         packuswb    mm0,    mm2                     // des[3]
457         movq        QWORD ptr [edi], mm0            // write des[3]
458 
459         //  mm1, mm3 --- Src[3]
460         //  mm7 -- cleared for unpacking
461 
462         movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
463 
464         movq        mm5,    four_fifths              // mm5 = 4/5
465         pmullw      mm1,    mm5                     // d * 4/5
466 
467         movq        mm6,    one_fifth                // mm6 = 1/5
468         movq        mm2,    mm0                     // make a copy
469 
470         pmullw      mm3,    mm5                     // d * 4/5
471         punpcklbw   mm0,    mm7                     // unpack low
472 
473         pmullw      mm0,    mm6                     // an * 1/5
474         punpckhbw   mm2,    mm7                     // unpack high
475 
476         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
477         pmullw      mm2,    mm6                     // an * 1/5
478 
479         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
480         paddw       mm1,    round_values             // + 128
481 
482         paddw       mm3,    round_values             // + 128
483         psrlw       mm1,    8
484 
485         psrlw       mm3,    8
486         packuswb    mm1,    mm3                     // des[4]
487 
488         movq        QWORD ptr [edi+ecx], mm1        // write des[4]
489 
490         add         edi,    8
491         add         esi,    8
492 
493         sub         edx,    8
494         jg         vs_4_5_loop
495     }
496 }
497 
498 /****************************************************************************
499  *
500  *  ROUTINE       : last_vertical_band_4_5_scale_mmx
501  *
502  *  INPUTS        : unsigned char *dest    :
503  *                  unsigned int dest_pitch :
504  *                  unsigned int dest_width :
505  *
506  *  OUTPUTS       : None.
507  *
508  *  RETURNS       : None
509  *
510  *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
511  *
512  *  SPECIAL NOTES : The routine uses the first line of the band below
513  *                  the current band. The function also has an "C" only
514  *                  version.
515  *
516  ****************************************************************************/
517 static
last_vertical_band_4_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)518 void last_vertical_band_4_5_scale_mmx
519 (
520     unsigned char *dest,
521     unsigned int dest_pitch,
522     unsigned int dest_width
523 )
524 {
525     __asm
526     {
527         mov         esi,    dest                    // Get the source and destination pointer
528         mov         ecx,    dest_pitch               // Get the pitch size
529 
530         lea         edi,    [esi+ecx*2]             // tow lines below
531         add         edi,    ecx                     // three lines below
532 
533         pxor        mm7,    mm7                     // clear out mm7
534         mov         edx,    dest_width               // Loop counter
535 
536         last_vs_4_5_loop:
537 
538         movq        mm0,    QWORD ptr [esi]         // src[0];
539         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
540 
541         movq        mm2,    mm0                     // Make a copy
542         punpcklbw   mm0,    mm7                     // unpack low to word
543 
544         movq        mm5,    one_fifth
545         punpckhbw   mm2,    mm7                     // unpack high to word
546 
547         pmullw      mm0,    mm5                     // a * 1/5
548 
549         movq        mm3,    mm1                     // make a copy
550         punpcklbw   mm1,    mm7                     // unpack low to word
551 
552         pmullw      mm2,    mm5                     // a * 1/5
553         movq        mm6,    four_fifths               // constan
554 
555         movq        mm4,    mm1                     // copy of low b
556         pmullw      mm4,    mm6                     // b * 4/5
557 
558         punpckhbw   mm3,    mm7                     // unpack high to word
559         movq        mm5,    mm3                     // copy of high b
560 
561         pmullw      mm5,    mm6                     // b * 4/5
562         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
563 
564         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
565         paddw       mm0,    round_values             // + 128
566 
567         paddw       mm2,    round_values             // + 128
568         psrlw       mm0,    8
569 
570         psrlw       mm2,    8
571         packuswb    mm0,    mm2                     // des [1]
572 
573         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
574         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
575 
576         // mm1, mm3 --- Src[1]
577         // mm0 --- Src[2]
578         // mm7 for unpacking
579 
580         movq        mm5,    two_fifths
581         movq        mm2,    mm0                     // make a copy
582 
583         pmullw      mm1,    mm5                     // b * 2/5
584         movq        mm6,    three_fifths
585 
586 
587         punpcklbw   mm0,    mm7                     // unpack low to word
588         pmullw      mm3,    mm5                     // b * 2/5
589 
590         movq        mm4,    mm0                     // make copy of c
591         punpckhbw   mm2,    mm7                     // unpack high to word
592 
593         pmullw      mm4,    mm6                     // c * 3/5
594         movq        mm5,    mm2
595 
596         pmullw      mm5,    mm6                     // c * 3/5
597         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
598 
599         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
600         paddw       mm1,    round_values             // + 128
601 
602         paddw       mm3,    round_values             // + 128
603         psrlw       mm1,    8
604 
605         psrlw       mm3,    8
606         packuswb    mm1,    mm3                     // des[2]
607 
608         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
609         movq        mm1,    [edi]                   // mm1=Src[3];
610 
611         movq        QWORD ptr [edi+ecx], mm1        // write des[4];
612 
613         // mm0, mm2 --- Src[2]
614         // mm1 --- Src[3]
615         // mm6 --- 3/5
616         // mm7 for unpacking
617 
618         pmullw      mm0,    mm6                     // c * 3/5
619         movq        mm5,    two_fifths               // mm5 = 2/5
620 
621         movq        mm3,    mm1                     // make a copy
622         pmullw      mm2,    mm6                     // c * 3/5
623 
624         punpcklbw   mm1,    mm7                     // unpack low
625         movq        mm4,    mm1                     // make a copy
626 
627         punpckhbw   mm3,    mm7                     // unpack high
628         pmullw      mm4,    mm5                     // d * 2/5
629 
630         movq        mm6,    mm3                     // make a copy
631         pmullw      mm6,    mm5                     // d * 2/5
632 
633         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
634         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
635 
636         paddw       mm0,    round_values             // + 128
637         paddw       mm2,    round_values             // + 128
638 
639         psrlw       mm0,    8
640         psrlw       mm2,    8
641 
642         packuswb    mm0,    mm2                     // des[3]
643         movq        QWORD ptr [edi], mm0            // write des[3]
644 
645         //  mm1, mm3 --- Src[3]
646         //  mm7 -- cleared for unpacking
647         add         edi,    8
648         add         esi,    8
649 
650         sub         edx,    8
651         jg          last_vs_4_5_loop
652     }
653 }
654 
655 /****************************************************************************
656  *
657  *  ROUTINE       : vertical_band_3_5_scale_mmx
658  *
659  *  INPUTS        : unsigned char *dest    :
660  *                  unsigned int dest_pitch :
661  *                  unsigned int dest_width :
662  *
663  *  OUTPUTS       : None.
664  *
665  *  RETURNS       : void
666  *
667  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
668  *
669  *  SPECIAL NOTES : The routine uses the first line of the band below
670  *                  the current band. The function also has an "C" only
671  *                  version.
672  *
673  ****************************************************************************/
674 static
vertical_band_3_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)675 void vertical_band_3_5_scale_mmx
676 (
677     unsigned char *dest,
678     unsigned int dest_pitch,
679     unsigned int dest_width
680 )
681 {
682     __asm
683     {
684         mov         esi,    dest                    // Get the source and destination pointer
685         mov         ecx,    dest_pitch               // Get the pitch size
686 
687         lea         edi,    [esi+ecx*2]             // tow lines below
688         add         edi,    ecx                     // three lines below
689 
690         pxor        mm7,    mm7                     // clear out mm7
691         mov         edx,    dest_width               // Loop counter
692 
693         vs_3_5_loop:
694 
695         movq        mm0,    QWORD ptr [esi]         // src[0];
696         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
697 
698         movq        mm2,    mm0                     // Make a copy
699         punpcklbw   mm0,    mm7                     // unpack low to word
700 
701         movq        mm5,    two_fifths               // mm5 = 2/5
702         punpckhbw   mm2,    mm7                     // unpack high to word
703 
704         pmullw      mm0,    mm5                     // a * 2/5
705 
706         movq        mm3,    mm1                     // make a copy
707         punpcklbw   mm1,    mm7                     // unpack low to word
708 
709         pmullw      mm2,    mm5                     // a * 2/5
710         movq        mm6,    three_fifths             // mm6 = 3/5
711 
712         movq        mm4,    mm1                     // copy of low b
713         pmullw      mm4,    mm6                     // b * 3/5
714 
715         punpckhbw   mm3,    mm7                     // unpack high to word
716         movq        mm5,    mm3                     // copy of high b
717 
718         pmullw      mm5,    mm6                     // b * 3/5
719         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
720 
721         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
722         paddw       mm0,    round_values             // + 128
723 
724         paddw       mm2,    round_values             // + 128
725         psrlw       mm0,    8
726 
727         psrlw       mm2,    8
728         packuswb    mm0,    mm2                     // des [1]
729 
730         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
731         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
732 
733         // mm1, mm3 --- Src[1]
734         // mm0 --- Src[2]
735         // mm7 for unpacking
736 
737         movq        mm4,    mm1                     // b low
738         pmullw      mm1,    four_fifths              // b * 4/5 low
739 
740         movq        mm5,    mm3                     // b high
741         pmullw      mm3,    four_fifths              // b * 4/5 high
742 
743         movq        mm2,    mm0                     // c
744         pmullw      mm4,    one_fifth                // b * 1/5
745 
746         punpcklbw   mm0,    mm7                     // c low
747         pmullw      mm5,    one_fifth                // b * 1/5
748 
749         movq        mm6,    mm0                     // make copy of c low
750         punpckhbw   mm2,    mm7                     // c high
751 
752         pmullw      mm6,    one_fifth                // c * 1/5 low
753         movq        mm7,    mm2                     // make copy of c high
754 
755         pmullw      mm7,    one_fifth                // c * 1/5 high
756         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
757 
758         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
759         movq        mm6,    mm0                     // make copy of c low
760 
761         pmullw      mm6,    four_fifths              // c * 4/5 low
762         movq        mm7,    mm2                     // make copy of c high
763 
764         pmullw      mm7,    four_fifths              // c * 4/5 high
765 
766         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
767         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
768 
769         paddw       mm1,    round_values             // + 128
770         paddw       mm3,    round_values             // + 128
771 
772         psrlw       mm1,    8
773         psrlw       mm3,    8
774 
775         packuswb    mm1,    mm3                     // des[2]
776         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
777 
778         paddw       mm4,    round_values             // + 128
779         paddw       mm5,    round_values             // + 128
780 
781         psrlw       mm4,    8
782         psrlw       mm5,    8
783 
784         packuswb    mm4,    mm5                     // des[3]
785         movq        QWORD ptr [edi], mm4            // write des[3]
786 
787         //  mm0, mm2 --- Src[3]
788 
789         pxor        mm7,    mm7                     // clear mm7 for unpacking
790         movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
791 
792         movq        mm5,    three_fifths             // mm5 = 3/5
793         pmullw      mm0,    mm5                     // d * 3/5
794 
795         movq        mm6,    two_fifths                // mm6 = 2/5
796         movq        mm3,    mm1                     // make a copy
797 
798         pmullw      mm2,    mm5                     // d * 3/5
799         punpcklbw   mm1,    mm7                     // unpack low
800 
801         pmullw      mm1,    mm6                     // an * 2/5
802         punpckhbw   mm3,    mm7                     // unpack high
803 
804         paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
805         pmullw      mm3,    mm6                     // an * 2/5
806 
807         paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
808         paddw       mm0,    round_values             // + 128
809 
810         paddw       mm2,    round_values             // + 128
811         psrlw       mm0,    8
812 
813         psrlw       mm2,    8
814         packuswb    mm0,    mm2                     // des[4]
815 
816         movq        QWORD ptr [edi+ecx], mm0        // write des[4]
817 
818         add         edi,    8
819         add         esi,    8
820 
821         sub         edx,    8
822         jg          vs_3_5_loop
823     }
824 }
825 
826 /****************************************************************************
827  *
828  *  ROUTINE       : last_vertical_band_3_5_scale_mmx
829  *
830  *  INPUTS        : unsigned char *dest    :
831  *                  unsigned int dest_pitch :
832  *                  unsigned int dest_width :
833  *
834  *  OUTPUTS       : None.
835  *
836  *  RETURNS       : void
837  *
838  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
839  *
840  *  SPECIAL NOTES : The routine uses the first line of the band below
841  *                  the current band. The function also has an "C" only
842  *                  version.
843  *
844  ****************************************************************************/
845 static
last_vertical_band_3_5_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)846 void last_vertical_band_3_5_scale_mmx
847 (
848     unsigned char *dest,
849     unsigned int dest_pitch,
850     unsigned int dest_width
851 )
852 {
853     __asm
854     {
855         mov         esi,    dest                    // Get the source and destination pointer
856         mov         ecx,    dest_pitch               // Get the pitch size
857 
858         lea         edi,    [esi+ecx*2]             // tow lines below
859         add         edi,    ecx                     // three lines below
860 
861         pxor        mm7,    mm7                     // clear out mm7
862         mov         edx,    dest_width               // Loop counter
863 
864 
865         last_vs_3_5_loop:
866 
867         movq        mm0,    QWORD ptr [esi]         // src[0];
868         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
869 
870         movq        mm2,    mm0                     // Make a copy
871         punpcklbw   mm0,    mm7                     // unpack low to word
872 
873         movq        mm5,    two_fifths               // mm5 = 2/5
874         punpckhbw   mm2,    mm7                     // unpack high to word
875 
876         pmullw      mm0,    mm5                     // a * 2/5
877 
878         movq        mm3,    mm1                     // make a copy
879         punpcklbw   mm1,    mm7                     // unpack low to word
880 
881         pmullw      mm2,    mm5                     // a * 2/5
882         movq        mm6,    three_fifths             // mm6 = 3/5
883 
884         movq        mm4,    mm1                     // copy of low b
885         pmullw      mm4,    mm6                     // b * 3/5
886 
887         punpckhbw   mm3,    mm7                     // unpack high to word
888         movq        mm5,    mm3                     // copy of high b
889 
890         pmullw      mm5,    mm6                     // b * 3/5
891         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
892 
893         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
894         paddw       mm0,    round_values             // + 128
895 
896         paddw       mm2,    round_values             // + 128
897         psrlw       mm0,    8
898 
899         psrlw       mm2,    8
900         packuswb    mm0,    mm2                     // des [1]
901 
902         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
903         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
904 
905 
906 
907         // mm1, mm3 --- Src[1]
908         // mm0 --- Src[2]
909         // mm7 for unpacking
910 
911         movq        mm4,    mm1                     // b low
912         pmullw      mm1,    four_fifths              // b * 4/5 low
913 
914         movq        QWORD ptr [edi+ecx], mm0        // write des[4]
915 
916         movq        mm5,    mm3                     // b high
917         pmullw      mm3,    four_fifths              // b * 4/5 high
918 
919         movq        mm2,    mm0                     // c
920         pmullw      mm4,    one_fifth                // b * 1/5
921 
922         punpcklbw   mm0,    mm7                     // c low
923         pmullw      mm5,    one_fifth                // b * 1/5
924 
925         movq        mm6,    mm0                     // make copy of c low
926         punpckhbw   mm2,    mm7                     // c high
927 
928         pmullw      mm6,    one_fifth                // c * 1/5 low
929         movq        mm7,    mm2                     // make copy of c high
930 
931         pmullw      mm7,    one_fifth                // c * 1/5 high
932         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
933 
934         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
935         movq        mm6,    mm0                     // make copy of c low
936 
937         pmullw      mm6,    four_fifths              // c * 4/5 low
938         movq        mm7,    mm2                     // make copy of c high
939 
940         pmullw      mm7,    four_fifths              // c * 4/5 high
941 
942         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
943         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
944 
945         paddw       mm1,    round_values             // + 128
946         paddw       mm3,    round_values             // + 128
947 
948         psrlw       mm1,    8
949         psrlw       mm3,    8
950 
951         packuswb    mm1,    mm3                     // des[2]
952         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
953 
954         paddw       mm4,    round_values             // + 128
955         paddw       mm5,    round_values             // + 128
956 
957         psrlw       mm4,    8
958         psrlw       mm5,    8
959 
960         packuswb    mm4,    mm5                     // des[3]
961         movq        QWORD ptr [edi], mm4            // write des[3]
962 
963         //  mm0, mm2 --- Src[3]
964 
965         add         edi,    8
966         add         esi,    8
967 
968         sub         edx,    8
969         jg          last_vs_3_5_loop
970     }
971 }
972 
973 /****************************************************************************
974  *
975  *  ROUTINE       : vertical_band_1_2_scale_mmx
976  *
977  *  INPUTS        : unsigned char *dest    :
978  *                  unsigned int dest_pitch :
979  *                  unsigned int dest_width :
980  *
981  *  OUTPUTS       : None.
982  *
983  *  RETURNS       : void
984  *
985  *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
986  *
987  *  SPECIAL NOTES : The routine uses the first line of the band below
988  *                  the current band. The function also has an "C" only
989  *                  version.
990  *
991  ****************************************************************************/
992 static
vertical_band_1_2_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)993 void vertical_band_1_2_scale_mmx
994 (
995     unsigned char *dest,
996     unsigned int dest_pitch,
997     unsigned int dest_width
998 )
999 {
1000     __asm
1001     {
1002 
1003         mov         esi,    dest                    // Get the source and destination pointer
1004         mov         ecx,    dest_pitch               // Get the pitch size
1005 
1006         pxor        mm7,    mm7                     // clear out mm7
1007         mov         edx,    dest_width               // Loop counter
1008 
1009         vs_1_2_loop:
1010 
1011         movq        mm0,    [esi]                   // get Src[0]
1012         movq        mm1,    [esi + ecx * 2]         // get Src[1]
1013 
1014         movq        mm2,    mm0                     // make copy before unpack
1015         movq        mm3,    mm1                     // make copy before unpack
1016 
1017         punpcklbw   mm0,    mm7                     // low Src[0]
1018         movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
1019 
1020         punpcklbw   mm1,    mm7                     // low Src[1]
1021         paddw       mm0,    mm1                     // low (a + b)
1022 
1023         punpckhbw   mm2,    mm7                     // high Src[0]
1024         paddw       mm0,    mm6                     // low (a + b + 1)
1025 
1026         punpckhbw   mm3,    mm7
1027         paddw       mm2,    mm3                     // high (a + b )
1028 
1029         psraw       mm0,    1                       // low (a + b +1 )/2
1030         paddw       mm2,    mm6                     // high (a + b + 1)
1031 
1032         psraw       mm2,    1                       // high (a + b + 1)/2
1033         packuswb    mm0,    mm2                     // pack results
1034 
1035         movq        [esi+ecx], mm0                  // write out eight bytes
1036         add         esi,    8
1037 
1038         sub         edx,    8
1039         jg          vs_1_2_loop
1040     }
1041 
1042 }
1043 
1044 /****************************************************************************
1045  *
1046  *  ROUTINE       : last_vertical_band_1_2_scale_mmx
1047  *
1048  *  INPUTS        : unsigned char *dest    :
1049  *                  unsigned int dest_pitch :
1050  *                  unsigned int dest_width :
1051  *
1052  *  OUTPUTS       : None.
1053  *
1054  *  RETURNS       : void
1055  *
1056  *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
1057  *
1058  *  SPECIAL NOTES : The routine uses the first line of the band below
1059  *                  the current band. The function also has an "C" only
1060  *                  version.
1061  *
1062  ****************************************************************************/
1063 static
last_vertical_band_1_2_scale_mmx(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1064 void last_vertical_band_1_2_scale_mmx
1065 (
1066     unsigned char *dest,
1067     unsigned int dest_pitch,
1068     unsigned int dest_width
1069 )
1070 {
1071     __asm
1072     {
1073         mov         esi,    dest                    // Get the source and destination pointer
1074         mov         ecx,    dest_pitch               // Get the pitch size
1075 
1076         mov         edx,    dest_width               // Loop counter
1077 
1078         last_vs_1_2_loop:
1079 
1080         movq        mm0,    [esi]                   // get Src[0]
1081         movq        [esi+ecx], mm0                  // write out eight bytes
1082 
1083         add         esi,    8
1084         sub         edx,    8
1085 
1086         jg         last_vs_1_2_loop
1087     }
1088 }
1089 
1090 /****************************************************************************
1091  *
1092  *  ROUTINE       : horizontal_line_1_2_scale
1093  *
1094  *  INPUTS        : const unsigned char *source :
1095  *                  unsigned int source_width    :
1096  *                  unsigned char *dest         :
1097  *                  unsigned int dest_width      :
1098  *
1099  *  OUTPUTS       : None.
1100  *
1101  *  RETURNS       : void
1102  *
1103  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
1104  *
1105  *  SPECIAL NOTES : None.
1106  *
1107  ****************************************************************************/
1108 static
horizontal_line_1_2_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1109 void horizontal_line_1_2_scale_mmx
1110 (
1111     const unsigned char *source,
1112     unsigned int source_width,
1113     unsigned char *dest,
1114     unsigned int dest_width
1115 )
1116 {
1117     (void) dest_width;
1118 
1119     __asm
1120     {
1121         mov         esi,    source
1122         mov         edi,    dest
1123 
1124         pxor        mm7,    mm7
1125         movq        mm6,    four_ones
1126 
1127         mov         ecx,    source_width
1128 
1129         hs_1_2_loop:
1130 
1131         movq        mm0,    [esi]
1132         movq        mm1,    [esi+1]
1133 
1134         movq        mm2,    mm0
1135         movq        mm3,    mm1
1136 
1137         movq        mm4,    mm0
1138         punpcklbw   mm0,    mm7
1139 
1140         punpcklbw   mm1,    mm7
1141         paddw       mm0,    mm1
1142 
1143         paddw       mm0,    mm6
1144         punpckhbw   mm2,    mm7
1145 
1146         punpckhbw   mm3,    mm7
1147         paddw       mm2,    mm3
1148 
1149         paddw       mm2,    mm6
1150         psraw       mm0,    1
1151 
1152         psraw       mm2,    1
1153         packuswb    mm0,    mm2
1154 
1155         movq        mm2,    mm4
1156         punpcklbw   mm2,    mm0
1157 
1158         movq        [edi],  mm2
1159         punpckhbw   mm4,    mm0
1160 
1161         movq        [edi+8], mm4
1162         add         esi,    8
1163 
1164         add         edi,    16
1165         sub         ecx,    8
1166 
1167         cmp         ecx,    8
1168         jg          hs_1_2_loop
1169 
1170 // last eight pixel
1171 
1172         movq        mm0,    [esi]
1173         movq        mm1,    mm0
1174 
1175         movq        mm2,    mm0
1176         movq        mm3,    mm1
1177 
1178         psrlq       mm1,    8
1179         psrlq       mm3,    56
1180 
1181         psllq       mm3,    56
1182         por         mm1,    mm3
1183 
1184         movq        mm3,    mm1
1185         movq        mm4,    mm0
1186 
1187         punpcklbw   mm0,    mm7
1188         punpcklbw   mm1,    mm7
1189 
1190         paddw       mm0,    mm1
1191         paddw       mm0,    mm6
1192 
1193         punpckhbw   mm2,    mm7
1194         punpckhbw   mm3,    mm7
1195 
1196         paddw       mm2,    mm3
1197         paddw       mm2,    mm6
1198 
1199         psraw       mm0,    1
1200         psraw       mm2,    1
1201 
1202         packuswb    mm0,    mm2
1203         movq        mm2,    mm4
1204 
1205         punpcklbw   mm2,    mm0
1206         movq        [edi],  mm2
1207 
1208         punpckhbw   mm4,    mm0
1209         movq        [edi+8], mm4
1210     }
1211 }
1212 
1213 
1214 
1215 
1216 
1217 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
1219 
1220 
1221 /****************************************************************************
1222  *
1223  *  ROUTINE       : horizontal_line_5_4_scale_mmx
1224  *
1225  *  INPUTS        : const unsigned char *source : Pointer to source data.
1226  *                  unsigned int source_width    : Stride of source.
1227  *                  unsigned char *dest         : Pointer to destination data.
1228  *                  unsigned int dest_width      : Stride of destination (NOT USED).
1229  *
1230  *  OUTPUTS       : None.
1231  *
1232  *  RETURNS       : void
1233  *
1234  *  FUNCTION      : Copies horizontal line of pixels from source to
1235  *                  destination scaling up by 4 to 5.
1236  *
1237  *  SPECIAL NOTES : None.
1238  *
1239  ****************************************************************************/
1240 static
horizontal_line_5_4_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1241 void horizontal_line_5_4_scale_mmx
1242 (
1243     const unsigned char *source,
1244     unsigned int source_width,
1245     unsigned char *dest,
1246     unsigned int dest_width
1247 )
1248 {
1249     /*
1250     unsigned i;
1251     unsigned int a, b, c, d, e;
1252     unsigned char *des = dest;
1253     const unsigned char *src = source;
1254 
1255     (void) dest_width;
1256 
1257     for ( i=0; i<source_width; i+=5 )
1258     {
1259         a = src[0];
1260         b = src[1];
1261         c = src[2];
1262         d = src[3];
1263         e = src[4];
1264 
1265         des[0] = a;
1266         des[1] = ((b*192 + c* 64 + 128)>>8);
1267         des[2] = ((c*128 + d*128 + 128)>>8);
1268         des[3] = ((d* 64 + e*192 + 128)>>8);
1269 
1270         src += 5;
1271         des += 4;
1272     }
1273     */
1274     (void) dest_width;
1275 
1276     __asm
1277     {
1278 
1279         mov         esi,        source              ;
1280         mov         edi,        dest                ;
1281 
1282         mov         ecx,        source_width         ;
1283         movq        mm5,        const54_1           ;
1284 
1285         pxor        mm7,        mm7                 ;
1286         movq        mm6,        const54_2           ;
1287 
1288         movq        mm4,        round_values         ;
1289         lea         edx,        [esi+ecx]           ;
1290         horizontal_line_5_4_loop:
1291 
1292         movq        mm0,        QWORD PTR  [esi]    ;
1293         00 01 02 03 04 05 06 07
1294         movq        mm1,        mm0                 ;
1295         00 01 02 03 04 05 06 07
1296 
1297         psrlq       mm0,        8                   ;
1298         01 02 03 04 05 06 07 xx
1299         punpcklbw   mm1,        mm7                 ;
1300         xx 00 xx 01 xx 02 xx 03
1301 
1302         punpcklbw   mm0,        mm7                 ;
1303         xx 01 xx 02 xx 03 xx 04
1304         pmullw      mm1,        mm5
1305 
1306         pmullw      mm0,        mm6
1307         add         esi,        5
1308 
1309         add         edi,        4
1310         paddw       mm1,        mm0
1311 
1312         paddw       mm1,        mm4
1313         psrlw       mm1,        8
1314 
1315         cmp         esi,        edx
1316         packuswb    mm1,        mm7
1317 
1318         movd        DWORD PTR [edi-4], mm1
1319 
1320         jl          horizontal_line_5_4_loop
1321 
1322     }
1323 
1324 }
1325 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
1326 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
1328 
1329 static
vertical_band_5_4_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1331 {
1332 
1333     __asm
1334     {
1335         push        ebx
1336 
1337         mov         esi,    source                    // Get the source and destination pointer
1338         mov         ecx,    src_pitch               // Get the pitch size
1339 
1340         mov         edi,    dest                    // tow lines below
1341         pxor        mm7,    mm7                     // clear out mm7
1342 
1343         mov         edx,    dest_pitch               // Loop counter
1344         mov         ebx,    dest_width
1345 
1346         vs_5_4_loop:
1347 
1348         movd        mm0,    DWORD ptr [esi]         // src[0];
1349         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
1350 
1351         movd        mm2,    DWORD ptr [esi+ecx*2]
1352         lea         eax,    [esi+ecx*2]             //
1353 
1354         punpcklbw   mm1,    mm7
1355         punpcklbw   mm2,    mm7
1356 
1357         movq        mm3,    mm2
1358         pmullw      mm1,    three_fourths
1359 
1360         pmullw      mm2,    one_fourths
1361         movd        mm4,    [eax+ecx]
1362 
1363         pmullw      mm3,    two_fourths
1364         punpcklbw   mm4,    mm7
1365 
1366         movq        mm5,    mm4
1367         pmullw      mm4,    two_fourths
1368 
1369         paddw       mm1,    mm2
1370         movd        mm6,    [eax+ecx*2]
1371 
1372         pmullw      mm5,    one_fourths
1373         paddw       mm1,    round_values;
1374 
1375         paddw       mm3,    mm4
1376         psrlw       mm1,    8
1377 
1378         punpcklbw   mm6,    mm7
1379         paddw       mm3,    round_values
1380 
1381         pmullw      mm6,    three_fourths
1382         psrlw       mm3,    8
1383 
1384         packuswb    mm1,    mm7
1385         packuswb    mm3,    mm7
1386 
1387         movd        DWORD PTR [edi], mm0
1388         movd        DWORD PTR [edi+edx], mm1
1389 
1390 
1391         paddw       mm5,    mm6
1392         movd        DWORD PTR [edi+edx*2], mm3
1393 
1394         lea         eax,    [edi+edx*2]
1395         paddw       mm5,    round_values
1396 
1397         psrlw       mm5,    8
1398         add         edi,    4
1399 
1400         packuswb    mm5,    mm7
1401         movd        DWORD PTR [eax+edx], mm5
1402 
1403         add         esi,    4
1404         sub         ebx,    4
1405 
1406         jg         vs_5_4_loop
1407 
1408         pop         ebx
1409     }
1410 }
1411 
1412 
1413 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
1415 
1416 
1417 static
horizontal_line_5_3_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1418 void horizontal_line_5_3_scale_mmx
1419 (
1420     const unsigned char *source,
1421     unsigned int source_width,
1422     unsigned char *dest,
1423     unsigned int dest_width
1424 )
1425 {
1426 
1427     (void) dest_width;
1428     __asm
1429     {
1430 
1431         mov         esi,        source              ;
1432         mov         edi,        dest                ;
1433 
1434         mov         ecx,        source_width         ;
1435         movq        mm5,        const53_1           ;
1436 
1437         pxor        mm7,        mm7                 ;
1438         movq        mm6,        const53_2           ;
1439 
1440         movq        mm4,        round_values         ;
1441         lea         edx,        [esi+ecx-5]         ;
1442         horizontal_line_5_3_loop:
1443 
1444         movq        mm0,        QWORD PTR  [esi]    ;
1445         00 01 02 03 04 05 06 07
1446         movq        mm1,        mm0                 ;
1447         00 01 02 03 04 05 06 07
1448 
1449         psllw       mm0,        8                   ;
1450         xx 00 xx 02 xx 04 xx 06
1451         psrlw       mm1,        8                   ;
1452         01 xx 03 xx 05 xx 07 xx
1453 
1454         psrlw       mm0,        8                   ;
1455         00 xx 02 xx 04 xx 06 xx
1456         psllq       mm1,        16                  ;
1457         xx xx 01 xx 03 xx 05 xx
1458 
1459         pmullw      mm0,        mm6
1460 
1461         pmullw      mm1,        mm5
1462         add         esi,        5
1463 
1464         add         edi,        3
1465         paddw       mm1,        mm0
1466 
1467         paddw       mm1,        mm4
1468         psrlw       mm1,        8
1469 
1470         cmp         esi,        edx
1471         packuswb    mm1,        mm7
1472 
1473         movd        DWORD PTR [edi-3], mm1
1474         jl          horizontal_line_5_3_loop
1475 
1476 //exit condition
1477         movq        mm0,        QWORD PTR  [esi]    ;
1478         00 01 02 03 04 05 06 07
1479         movq        mm1,        mm0                 ;
1480         00 01 02 03 04 05 06 07
1481 
1482         psllw       mm0,        8                   ;
1483         xx 00 xx 02 xx 04 xx 06
1484         psrlw       mm1,        8                   ;
1485         01 xx 03 xx 05 xx 07 xx
1486 
1487         psrlw       mm0,        8                   ;
1488         00 xx 02 xx 04 xx 06 xx
1489         psllq       mm1,        16                  ;
1490         xx xx 01 xx 03 xx 05 xx
1491 
1492         pmullw      mm0,        mm6
1493 
1494         pmullw      mm1,        mm5
1495         paddw       mm1,        mm0
1496 
1497         paddw       mm1,        mm4
1498         psrlw       mm1,        8
1499 
1500         packuswb    mm1,        mm7
1501         movd        eax,        mm1
1502 
1503         mov         edx,        eax
1504         shr         edx,        16
1505 
1506         mov         WORD PTR[edi],   ax
1507         mov         BYTE PTR[edi+2], dl
1508 
1509     }
1510 
1511 }
1512 
1513 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1515 
1516 static
vertical_band_5_3_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1518 {
1519 
1520     __asm
1521     {
1522         push        ebx
1523 
1524         mov         esi,    source                    // Get the source and destination pointer
1525         mov         ecx,    src_pitch               // Get the pitch size
1526 
1527         mov         edi,    dest                    // tow lines below
1528         pxor        mm7,    mm7                     // clear out mm7
1529 
1530         mov         edx,    dest_pitch               // Loop counter
1531         movq        mm5,    one_thirds
1532 
1533         movq        mm6,    two_thirds
1534         mov         ebx,    dest_width;
1535 
1536         vs_5_3_loop:
1537 
1538         movd        mm0,    DWORD ptr [esi]         // src[0];
1539         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
1540 
1541         movd        mm2,    DWORD ptr [esi+ecx*2]
1542         lea         eax,    [esi+ecx*2]             //
1543 
1544         punpcklbw   mm1,    mm7
1545         punpcklbw   mm2,    mm7
1546 
1547         pmullw      mm1,    mm5
1548         pmullw      mm2,    mm6
1549 
1550         movd        mm3,    DWORD ptr [eax+ecx]
1551         movd        mm4,    DWORD ptr [eax+ecx*2]
1552 
1553         punpcklbw   mm3,    mm7
1554         punpcklbw   mm4,    mm7
1555 
1556         pmullw      mm3,    mm6
1557         pmullw      mm4,    mm5
1558 
1559 
1560         movd        DWORD PTR [edi], mm0
1561         paddw       mm1,    mm2
1562 
1563         paddw       mm1,    round_values
1564         psrlw       mm1,    8
1565 
1566         packuswb    mm1,    mm7
1567         paddw       mm3,    mm4
1568 
1569         paddw       mm3,    round_values
1570         movd        DWORD PTR [edi+edx], mm1
1571 
1572         psrlw       mm3,    8
1573         packuswb    mm3,    mm7
1574 
1575         movd        DWORD PTR [edi+edx*2], mm3
1576 
1577 
1578         add         edi,    4
1579         add         esi,    4
1580 
1581         sub         ebx,    4
1582         jg          vs_5_3_loop
1583 
1584         pop         ebx
1585     }
1586 }
1587 
1588 
1589 
1590 
1591 /****************************************************************************
1592  *
1593  *  ROUTINE       : horizontal_line_2_1_scale
1594  *
1595  *  INPUTS        : const unsigned char *source :
1596  *                  unsigned int source_width    :
1597  *                  unsigned char *dest         :
1598  *                  unsigned int dest_width      :
1599  *
1600  *  OUTPUTS       : None.
1601  *
1602  *  RETURNS       : void
1603  *
1604  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
1605  *
1606  *  SPECIAL NOTES : None.
1607  *
1608  ****************************************************************************/
1609 static
horizontal_line_2_1_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)1610 void horizontal_line_2_1_scale_mmx
1611 (
1612     const unsigned char *source,
1613     unsigned int source_width,
1614     unsigned char *dest,
1615     unsigned int dest_width
1616 )
1617 {
1618     (void) dest_width;
1619     (void) source_width;
1620     __asm
1621     {
1622         mov         esi,    source
1623         mov         edi,    dest
1624 
1625         pxor        mm7,    mm7
1626         mov         ecx,    dest_width
1627 
1628         xor         edx,    edx
1629         hs_2_1_loop:
1630 
1631         movq        mm0,    [esi+edx*2]
1632         psllw       mm0,    8
1633 
1634         psrlw       mm0,    8
1635         packuswb    mm0,    mm7
1636 
1637         movd        DWORD Ptr [edi+edx], mm0;
1638         add         edx,    4
1639 
1640         cmp         edx,    ecx
1641         jl          hs_2_1_loop
1642 
1643     }
1644 }
1645 
1646 
1647 
/*
 * 2 to 1 vertical down-scaling without filtering: copies dest_width bytes
 * from source to dest. Both pitch arguments are ignored.
 */
static
void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    (void) dest_pitch;
    (void) src_pitch;
    vpx_memcpy(dest, source, dest_width);
}
1655 
1656 
1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
1658 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
1659 
1660 static
vertical_band_2_1_scale_i_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1662 {
1663 
1664     (void) dest_pitch;
1665     __asm
1666     {
1667         mov         esi,        source
1668         mov         edi,        dest
1669 
1670         mov         eax,        src_pitch
1671         mov         edx,        dest_width
1672 
1673         pxor        mm7,        mm7
1674         sub         esi,        eax             //back one line
1675 
1676 
1677         lea         ecx,        [esi+edx];
1678         movq        mm6,        round_values;
1679 
1680         movq        mm5,        three_sixteenths;
1681         movq        mm4,        ten_sixteenths;
1682 
1683         vs_2_1_i_loop:
1684         movd        mm0,        [esi]           //
1685         movd        mm1,        [esi+eax]       //
1686 
1687         movd        mm2,        [esi+eax*2]     //
1688         punpcklbw   mm0,        mm7
1689 
1690         pmullw      mm0,        mm5
1691         punpcklbw   mm1,        mm7
1692 
1693         pmullw      mm1,        mm4
1694         punpcklbw   mm2,        mm7
1695 
1696         pmullw      mm2,        mm5
1697         paddw       mm0,        round_values
1698 
1699         paddw       mm1,        mm2
1700         paddw       mm0,        mm1
1701 
1702         psrlw       mm0,        8
1703         packuswb    mm0,        mm7
1704 
1705         movd        DWORD PTR [edi],        mm0
1706         add         esi,        4
1707 
1708         add         edi,        4;
1709         cmp         esi,        ecx
1710         jl          vs_2_1_i_loop
1711 
1712     }
1713 }
1714 
1715 
1716 
1717 void
register_mmxscalers(void)1718 register_mmxscalers(void)
1719 {
1720     vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
1721     vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
1722     vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
1723     vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
1724     vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
1725     vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
1726     vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
1727     vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
1728     vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
1729 
1730     vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
1731     vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
1732     vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
1733     vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
1734     vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
1735     vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
1736 
1737 
1738 
1739     vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
1740     vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
1741     vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
1742     vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
1743     vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
1744     vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
1745     vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
1746 
1747 
1748 
1749 
1750 }
1751