• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3  *
4  * For Intel x86 CPU and Microsoft Visual C++ compiler
5  *
6  * Last changed in libpng 1.2.19 August 18, 2007
7  * For conditions of distribution and use, see copyright notice in png.h
8  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
9  * Copyright (c) 1998, Intel Corporation
10  *
11  * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
12  * Interface to libpng contributed by Gilles Vollant, 1999
13  *
14  *
15  * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
16  * a sign error in the post-MMX cleanup code for each pixel_depth resulted
17  * in bad pixels at the beginning of some rows of some images, and also
18  * (due to out-of-range memory reads and writes) caused heap corruption
19  * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
20  *
21  * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22  *
23  * [runtime MMX configuration, GRR 20010102]
24  *
25  * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
26  *  second loop of interlace processing of 48-bit pixels, GR-P 20070717]
27  *
28  * [move instances of uAll union into local, except for two constant
29  * instances, GR-P 20070805]
30  */
31 
32 #define PNG_INTERNAL
33 #include "png.h"
34 
35 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
36 
37 
38 static int mmx_supported=2;
39 
40 int PNGAPI
png_mmx_support(void)41 png_mmx_support(void)
42 {
43   int mmx_supported_local = 0;
44   _asm {
45     push ebx          //CPUID will trash these
46     push ecx
47     push edx
48 
49     pushfd            //Save Eflag to stack
50     pop eax           //Get Eflag from stack into eax
51     mov ecx, eax      //Make another copy of Eflag in ecx
52     xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
53     push eax          //Save modified Eflag back to stack
54 
55     popfd             //Restored modified value back to Eflag reg
56     pushfd            //Save Eflag to stack
57     pop eax           //Get Eflag from stack
58     push ecx          // save original Eflag to stack
59     popfd             // restore original Eflag
60     xor eax, ecx      //Compare the new Eflag with the original Eflag
61     jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
62                       //skip following instructions and jump to
63                       //NOT_SUPPORTED label
64 
65     xor eax, eax      //Set eax to zero
66 
67     _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
68     _asm _emit 0xa2
69 
70     cmp eax, 1        //make sure eax return non-zero value
71     jl NOT_SUPPORTED  //If eax is zero, mmx not supported
72 
73     xor eax, eax      //set eax to zero
74     inc eax           //Now increment eax to 1.  This instruction is
75                       //faster than the instruction "mov eax, 1"
76 
77     _asm _emit 0x0f   //CPUID instruction
78     _asm _emit 0xa2
79 
80     and edx, 0x00800000  //mask out all bits but mmx bit(24)
81     cmp edx, 0        // 0 = mmx not supported
82     jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
83 
84     mov  mmx_supported_local, 1  //set return value to 1
85 
86 NOT_SUPPORTED:
87     mov  eax, mmx_supported_local  //move return value to eax
88     pop edx          //CPUID trashed these
89     pop ecx
90     pop ebx
91   }
92 
93   //mmx_supported_local=0; // test code for force don't support MMX
94   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
95 
96   mmx_supported = mmx_supported_local;
97   return mmx_supported_local;
98 }
99 
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask.  */

/* Use this routine for x86 platform - uses faster MMX routine if machine
   supports MMX */
113 
114 void /* PRIVATE */
png_combine_row(png_structp png_ptr,png_bytep row,int mask)115 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
116 {
117 #ifdef PNG_USE_LOCAL_ARRAYS
118    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
119 #endif
120 
121    png_debug(1,"in png_combine_row_asm\n");
122 
123    if (mmx_supported == 2) {
124 #if !defined(PNG_1_0_X)
125        /* this should have happened in png_init_mmx_flags() already */
126        png_warning(png_ptr, "asm_flags may not have been initialized");
127 #endif
128        png_mmx_support();
129    }
130 
131    if (mask == 0xff)
132    {
133       png_memcpy(row, png_ptr->row_buf + 1,
134        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
135        png_ptr->width));
136    }
137    /* GRR:  add "else if (mask == 0)" case?
138     *       or does png_combine_row() not even get called in that case? */
139    else
140    {
141       switch (png_ptr->row_info.pixel_depth)
142       {
143          case 24:
144          {
145             png_bytep srcptr;
146             png_bytep dstptr;
147             png_uint_32 len;
148             int unmask, diff;
149 
150             __int64 mask2=0x0101010202020404,  //24bpp
151                     mask1=0x0408080810101020,
152                     mask0=0x2020404040808080;
153 
154             srcptr = png_ptr->row_buf + 1;
155             dstptr = row;
156 
157             unmask = ~mask;
158             len     = (png_ptr->width)&~7;
159             diff = (png_ptr->width)&7;
160 
161 #if !defined(PNG_1_0_X)
162             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
163                 /* && mmx_supported */ )
164 #else
165             if (mmx_supported)
166 #endif
167             {
168                _asm
169                {
170                   movd       mm7, unmask       //load bit pattern
171                   psubb      mm6,mm6           //zero mm6
172                   punpcklbw  mm7,mm7
173                   punpcklwd  mm7,mm7
174                   punpckldq  mm7,mm7           //fill register with 8 masks
175 
176                   movq       mm0,mask0
177                   movq       mm1,mask1
178                   movq       mm2,mask2
179 
180                   pand       mm0,mm7
181                   pand       mm1,mm7
182                   pand       mm2,mm7
183 
184                   pcmpeqb    mm0,mm6
185                   pcmpeqb    mm1,mm6
186                   pcmpeqb    mm2,mm6
187 
188                   mov        ecx,len           //load length of line
189                   mov        esi,srcptr        //load source
190                   mov        ebx,dstptr        //load dest
191                   cmp        ecx,0
192                   jz         mainloop24end
193 
194 mainloop24:
195                   movq       mm4,[esi]
196                   pand       mm4,mm0
197                   movq       mm6,mm0
198                   movq       mm7,[ebx]
199                   pandn      mm6,mm7
200                   por        mm4,mm6
201                   movq       [ebx],mm4
202 
203 
204                   movq       mm5,[esi+8]
205                   pand       mm5,mm1
206                   movq       mm7,mm1
207                   movq       mm6,[ebx+8]
208                   pandn      mm7,mm6
209                   por        mm5,mm7
210                   movq       [ebx+8],mm5
211 
212                   movq       mm6,[esi+16]
213                   pand       mm6,mm2
214                   movq       mm4,mm2
215                   movq       mm7,[ebx+16]
216                   pandn      mm4,mm7
217                   por        mm6,mm4
218                   movq       [ebx+16],mm6
219 
220                   add        esi,24            //inc by 24 bytes processed
221                   add        ebx,24
222                   sub        ecx,8             //dec by 8 pixels processed
223 
224                   ja         mainloop24
225 
226 mainloop24end:
227                   mov        ecx,diff
228                   cmp        ecx,0
229                   jz         end24
230 
231                   mov        edx,mask
232                   sal        edx,24            //make low byte the high byte
233 secondloop24:
234                   sal        edx,1             //move high bit to CF
235                   jnc        skip24            //if CF = 0
236                   mov        ax,[esi]
237                   mov        [ebx],ax
238                   xor        eax,eax
239                   mov        al,[esi+2]
240                   mov        [ebx+2],al
241 skip24:
242                   add        esi,3
243                   add        ebx,3
244 
245                   dec        ecx
246                   jnz        secondloop24
247 
248 end24:
249                   emms
250                }
251             }
252             else /* mmx not supported - use modified C routine */
253             {
254                register unsigned int incr1, initial_val, final_val;
255                png_size_t pixel_bytes;
256                png_uint_32 i;
257                register int disp = png_pass_inc[png_ptr->pass];
258                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
259 
260                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
261                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
262                   pixel_bytes;
263                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
264                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
265                final_val = png_ptr->width*pixel_bytes;
266                incr1 = (disp)*pixel_bytes;
267                for (i = initial_val; i < final_val; i += incr1)
268                {
269                   png_memcpy(dstptr, srcptr, pixel_bytes);
270                   srcptr += incr1;
271                   dstptr += incr1;
272                }
273             } /* end of else */
274 
275             break;
276          }       // end 24 bpp
277 
278          case 32:
279          {
280             png_bytep srcptr;
281             png_bytep dstptr;
282             png_uint_32 len;
283             int unmask, diff;
284 
285             __int64 mask3=0x0101010102020202,  //32bpp
286                     mask2=0x0404040408080808,
287                     mask1=0x1010101020202020,
288                     mask0=0x4040404080808080;
289 
290             srcptr = png_ptr->row_buf + 1;
291             dstptr = row;
292 
293             unmask = ~mask;
294             len     = (png_ptr->width)&~7;
295             diff = (png_ptr->width)&7;
296 
297 #if !defined(PNG_1_0_X)
298             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
299                 /* && mmx_supported */ )
300 #else
301             if (mmx_supported)
302 #endif
303             {
304                _asm
305                {
306                   movd       mm7, unmask       //load bit pattern
307                   psubb      mm6,mm6           //zero mm6
308                   punpcklbw  mm7,mm7
309                   punpcklwd  mm7,mm7
310                   punpckldq  mm7,mm7           //fill register with 8 masks
311 
312                   movq       mm0,mask0
313                   movq       mm1,mask1
314                   movq       mm2,mask2
315                   movq       mm3,mask3
316 
317                   pand       mm0,mm7
318                   pand       mm1,mm7
319                   pand       mm2,mm7
320                   pand       mm3,mm7
321 
322                   pcmpeqb    mm0,mm6
323                   pcmpeqb    mm1,mm6
324                   pcmpeqb    mm2,mm6
325                   pcmpeqb    mm3,mm6
326 
327                   mov        ecx,len           //load length of line
328                   mov        esi,srcptr        //load source
329                   mov        ebx,dstptr        //load dest
330 
331                   cmp        ecx,0             //lcr
332                   jz         mainloop32end
333 
334 mainloop32:
335                   movq       mm4,[esi]
336                   pand       mm4,mm0
337                   movq       mm6,mm0
338                   movq       mm7,[ebx]
339                   pandn      mm6,mm7
340                   por        mm4,mm6
341                   movq       [ebx],mm4
342 
343                   movq       mm5,[esi+8]
344                   pand       mm5,mm1
345                   movq       mm7,mm1
346                   movq       mm6,[ebx+8]
347                   pandn      mm7,mm6
348                   por        mm5,mm7
349                   movq       [ebx+8],mm5
350 
351                   movq       mm6,[esi+16]
352                   pand       mm6,mm2
353                   movq       mm4,mm2
354                   movq       mm7,[ebx+16]
355                   pandn      mm4,mm7
356                   por        mm6,mm4
357                   movq       [ebx+16],mm6
358 
359                   movq       mm7,[esi+24]
360                   pand       mm7,mm3
361                   movq       mm5,mm3
362                   movq       mm4,[ebx+24]
363                   pandn      mm5,mm4
364                   por        mm7,mm5
365                   movq       [ebx+24],mm7
366 
367                   add        esi,32            //inc by 32 bytes processed
368                   add        ebx,32
369                   sub        ecx,8             //dec by 8 pixels processed
370 
371                   ja         mainloop32
372 
373 mainloop32end:
374                   mov        ecx,diff
375                   cmp        ecx,0
376                   jz         end32
377 
378                   mov        edx,mask
379                   sal        edx,24            //make low byte the high byte
380 secondloop32:
381                   sal        edx,1             //move high bit to CF
382                   jnc        skip32            //if CF = 0
383                   mov        eax,[esi]
384                   mov        [ebx],eax
385 skip32:
386                   add        esi,4
387                   add        ebx,4
388 
389                   dec        ecx
390                   jnz        secondloop32
391 
392 end32:
393                   emms
394                }
395             }
396             else /* mmx _not supported - Use modified C routine */
397             {
398                register unsigned int incr1, initial_val, final_val;
399                png_size_t pixel_bytes;
400                png_uint_32 i;
401                register int disp = png_pass_inc[png_ptr->pass];
402                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
403 
404                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
405                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
406                   pixel_bytes;
407                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
408                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
409                final_val = png_ptr->width*pixel_bytes;
410                incr1 = (disp)*pixel_bytes;
411                for (i = initial_val; i < final_val; i += incr1)
412                {
413                   png_memcpy(dstptr, srcptr, pixel_bytes);
414                   srcptr += incr1;
415                   dstptr += incr1;
416                }
417             } /* end of else */
418 
419             break;
420          }       // end 32 bpp
421 
422          case 8:
423          {
424             png_bytep srcptr;
425             png_bytep dstptr;
426             png_uint_32 len;
427             int m;
428             int diff, unmask;
429 
430             __int64 mask0=0x0102040810204080;
431 
432 #if !defined(PNG_1_0_X)
433             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
434                 /* && mmx_supported */ )
435 #else
436             if (mmx_supported)
437 #endif
438             {
439                srcptr = png_ptr->row_buf + 1;
440                dstptr = row;
441                m = 0x80;
442                unmask = ~mask;
443                len  = png_ptr->width &~7;  //reduce to multiple of 8
444                diff = png_ptr->width & 7;  //amount lost
445 
446                _asm
447                {
448                   movd       mm7, unmask   //load bit pattern
449                   psubb      mm6,mm6       //zero mm6
450                   punpcklbw  mm7,mm7
451                   punpcklwd  mm7,mm7
452                   punpckldq  mm7,mm7       //fill register with 8 masks
453 
454                   movq       mm0,mask0
455 
456                   pand       mm0,mm7       //nonzero if keep byte
457                   pcmpeqb    mm0,mm6       //zeros->1s, v versa
458 
459                   mov        ecx,len       //load length of line (pixels)
460                   mov        esi,srcptr    //load source
461                   mov        ebx,dstptr    //load dest
462                   cmp        ecx,0         //lcr
463                   je         mainloop8end
464 
465 mainloop8:
466                   movq       mm4,[esi]
467                   pand       mm4,mm0
468                   movq       mm6,mm0
469                   pandn      mm6,[ebx]
470                   por        mm4,mm6
471                   movq       [ebx],mm4
472 
473                   add        esi,8         //inc by 8 bytes processed
474                   add        ebx,8
475                   sub        ecx,8         //dec by 8 pixels processed
476 
477                   ja         mainloop8
478 mainloop8end:
479 
480                   mov        ecx,diff
481                   cmp        ecx,0
482                   jz         end8
483 
484                   mov        edx,mask
485                   sal        edx,24        //make low byte the high byte
486 
487 secondloop8:
488                   sal        edx,1         //move high bit to CF
489                   jnc        skip8         //if CF = 0
490                   mov        al,[esi]
491                   mov        [ebx],al
492 skip8:
493                   inc        esi
494                   inc        ebx
495 
496                   dec        ecx
497                   jnz        secondloop8
498 end8:
499                   emms
500                }
501             }
502             else /* mmx not supported - use modified C routine */
503             {
504                register unsigned int incr1, initial_val, final_val;
505                png_size_t pixel_bytes;
506                png_uint_32 i;
507                register int disp = png_pass_inc[png_ptr->pass];
508                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
509 
510                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
511                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
512                   pixel_bytes;
513                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
514                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
515                final_val = png_ptr->width*pixel_bytes;
516                incr1 = (disp)*pixel_bytes;
517                for (i = initial_val; i < final_val; i += incr1)
518                {
519                   png_memcpy(dstptr, srcptr, pixel_bytes);
520                   srcptr += incr1;
521                   dstptr += incr1;
522                }
523             } /* end of else */
524 
525             break;
526          }       // end 8 bpp
527 
528          case 1:
529          {
530             png_bytep sp;
531             png_bytep dp;
532             int s_inc, s_start, s_end;
533             int m;
534             int shift;
535             png_uint_32 i;
536 
537             sp = png_ptr->row_buf + 1;
538             dp = row;
539             m = 0x80;
540 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
541             if (png_ptr->transformations & PNG_PACKSWAP)
542             {
543                 s_start = 0;
544                 s_end = 7;
545                 s_inc = 1;
546             }
547             else
548 #endif
549             {
550                 s_start = 7;
551                 s_end = 0;
552                 s_inc = -1;
553             }
554 
555             shift = s_start;
556 
557             for (i = 0; i < png_ptr->width; i++)
558             {
559                if (m & mask)
560                {
561                   int value;
562 
563                   value = (*sp >> shift) & 0x1;
564                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
565                   *dp |= (png_byte)(value << shift);
566                }
567 
568                if (shift == s_end)
569                {
570                   shift = s_start;
571                   sp++;
572                   dp++;
573                }
574                else
575                   shift += s_inc;
576 
577                if (m == 1)
578                   m = 0x80;
579                else
580                   m >>= 1;
581             }
582             break;
583          }
584 
585          case 2:
586          {
587             png_bytep sp;
588             png_bytep dp;
589             int s_start, s_end, s_inc;
590             int m;
591             int shift;
592             png_uint_32 i;
593             int value;
594 
595             sp = png_ptr->row_buf + 1;
596             dp = row;
597             m = 0x80;
598 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
599             if (png_ptr->transformations & PNG_PACKSWAP)
600             {
601                s_start = 0;
602                s_end = 6;
603                s_inc = 2;
604             }
605             else
606 #endif
607             {
608                s_start = 6;
609                s_end = 0;
610                s_inc = -2;
611             }
612 
613             shift = s_start;
614 
615             for (i = 0; i < png_ptr->width; i++)
616             {
617                if (m & mask)
618                {
619                   value = (*sp >> shift) & 0x3;
620                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
621                   *dp |= (png_byte)(value << shift);
622                }
623 
624                if (shift == s_end)
625                {
626                   shift = s_start;
627                   sp++;
628                   dp++;
629                }
630                else
631                   shift += s_inc;
632                if (m == 1)
633                   m = 0x80;
634                else
635                   m >>= 1;
636             }
637             break;
638          }
639 
640          case 4:
641          {
642             png_bytep sp;
643             png_bytep dp;
644             int s_start, s_end, s_inc;
645             int m;
646             int shift;
647             png_uint_32 i;
648             int value;
649 
650             sp = png_ptr->row_buf + 1;
651             dp = row;
652             m = 0x80;
653 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
654             if (png_ptr->transformations & PNG_PACKSWAP)
655             {
656                s_start = 0;
657                s_end = 4;
658                s_inc = 4;
659             }
660             else
661 #endif
662             {
663                s_start = 4;
664                s_end = 0;
665                s_inc = -4;
666             }
667             shift = s_start;
668 
669             for (i = 0; i < png_ptr->width; i++)
670             {
671                if (m & mask)
672                {
673                   value = (*sp >> shift) & 0xf;
674                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
675                   *dp |= (png_byte)(value << shift);
676                }
677 
678                if (shift == s_end)
679                {
680                   shift = s_start;
681                   sp++;
682                   dp++;
683                }
684                else
685                   shift += s_inc;
686                if (m == 1)
687                   m = 0x80;
688                else
689                   m >>= 1;
690             }
691             break;
692          }
693 
694          case 16:
695          {
696             png_bytep srcptr;
697             png_bytep dstptr;
698             png_uint_32 len;
699             int unmask, diff;
700             __int64 mask1=0x0101020204040808,
701                     mask0=0x1010202040408080;
702 
703 #if !defined(PNG_1_0_X)
704             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
705                 /* && mmx_supported */ )
706 #else
707             if (mmx_supported)
708 #endif
709             {
710                srcptr = png_ptr->row_buf + 1;
711                dstptr = row;
712 
713                unmask = ~mask;
714                len     = (png_ptr->width)&~7;
715                diff = (png_ptr->width)&7;
716                _asm
717                {
718                   movd       mm7, unmask       //load bit pattern
719                   psubb      mm6,mm6           //zero mm6
720                   punpcklbw  mm7,mm7
721                   punpcklwd  mm7,mm7
722                   punpckldq  mm7,mm7           //fill register with 8 masks
723 
724                   movq       mm0,mask0
725                   movq       mm1,mask1
726 
727                   pand       mm0,mm7
728                   pand       mm1,mm7
729 
730                   pcmpeqb    mm0,mm6
731                   pcmpeqb    mm1,mm6
732 
733                   mov        ecx,len           //load length of line
734                   mov        esi,srcptr        //load source
735                   mov        ebx,dstptr        //load dest
736                   cmp        ecx,0             //lcr
737                   jz         mainloop16end
738 
739 mainloop16:
740                   movq       mm4,[esi]
741                   pand       mm4,mm0
742                   movq       mm6,mm0
743                   movq       mm7,[ebx]
744                   pandn      mm6,mm7
745                   por        mm4,mm6
746                   movq       [ebx],mm4
747 
748                   movq       mm5,[esi+8]
749                   pand       mm5,mm1
750                   movq       mm7,mm1
751                   movq       mm6,[ebx+8]
752                   pandn      mm7,mm6
753                   por        mm5,mm7
754                   movq       [ebx+8],mm5
755 
756                   add        esi,16            //inc by 16 bytes processed
757                   add        ebx,16
758                   sub        ecx,8             //dec by 8 pixels processed
759 
760                   ja         mainloop16
761 
762 mainloop16end:
763                   mov        ecx,diff
764                   cmp        ecx,0
765                   jz         end16
766 
767                   mov        edx,mask
768                   sal        edx,24            //make low byte the high byte
769 secondloop16:
770                   sal        edx,1             //move high bit to CF
771                   jnc        skip16            //if CF = 0
772                   mov        ax,[esi]
773                   mov        [ebx],ax
774 skip16:
775                   add        esi,2
776                   add        ebx,2
777 
778                   dec        ecx
779                   jnz        secondloop16
780 end16:
781                   emms
782                }
783             }
784             else /* mmx not supported - use modified C routine */
785             {
786                register unsigned int incr1, initial_val, final_val;
787                png_size_t pixel_bytes;
788                png_uint_32 i;
789                register int disp = png_pass_inc[png_ptr->pass];
790                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
791 
792                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
793                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
794                   pixel_bytes;
795                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
796                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
797                final_val = png_ptr->width*pixel_bytes;
798                incr1 = (disp)*pixel_bytes;
799                for (i = initial_val; i < final_val; i += incr1)
800                {
801                   png_memcpy(dstptr, srcptr, pixel_bytes);
802                   srcptr += incr1;
803                   dstptr += incr1;
804                }
805             } /* end of else */
806 
807             break;
808          }       // end 16 bpp
809 
810          case 48:
811          {
812             png_bytep srcptr;
813             png_bytep dstptr;
814             png_uint_32 len;
815             int unmask, diff;
816 
817             __int64 mask5=0x0101010101010202,
818                     mask4=0x0202020204040404,
819                     mask3=0x0404080808080808,
820                     mask2=0x1010101010102020,
821                     mask1=0x2020202040404040,
822                     mask0=0x4040808080808080;
823 
824 #if !defined(PNG_1_0_X)
825             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
826                 /* && mmx_supported */ )
827 #else
828             if (mmx_supported)
829 #endif
830             {
831                srcptr = png_ptr->row_buf + 1;
832                dstptr = row;
833 
834                unmask = ~mask;
835                len     = (png_ptr->width)&~7;
836                diff = (png_ptr->width)&7;
837                _asm
838                {
839                   movd       mm7, unmask       //load bit pattern
840                   psubb      mm6,mm6           //zero mm6
841                   punpcklbw  mm7,mm7
842                   punpcklwd  mm7,mm7
843                   punpckldq  mm7,mm7           //fill register with 8 masks
844 
845                   movq       mm0,mask0
846                   movq       mm1,mask1
847                   movq       mm2,mask2
848                   movq       mm3,mask3
849                   movq       mm4,mask4
850                   movq       mm5,mask5
851 
852                   pand       mm0,mm7
853                   pand       mm1,mm7
854                   pand       mm2,mm7
855                   pand       mm3,mm7
856                   pand       mm4,mm7
857                   pand       mm5,mm7
858 
859                   pcmpeqb    mm0,mm6
860                   pcmpeqb    mm1,mm6
861                   pcmpeqb    mm2,mm6
862                   pcmpeqb    mm3,mm6
863                   pcmpeqb    mm4,mm6
864                   pcmpeqb    mm5,mm6
865 
866                   mov        ecx,len           //load length of line
867                   mov        esi,srcptr        //load source
868                   mov        ebx,dstptr        //load dest
869 
870                   cmp        ecx,0
871                   jz         mainloop48end
872 
873 mainloop48:
874                   movq       mm7,[esi]
875                   pand       mm7,mm0
876                   movq       mm6,mm0
877                   pandn      mm6,[ebx]
878                   por        mm7,mm6
879                   movq       [ebx],mm7
880 
881                   movq       mm6,[esi+8]
882                   pand       mm6,mm1
883                   movq       mm7,mm1
884                   pandn      mm7,[ebx+8]
885                   por        mm6,mm7
886                   movq       [ebx+8],mm6
887 
888                   movq       mm6,[esi+16]
889                   pand       mm6,mm2
890                   movq       mm7,mm2
891                   pandn      mm7,[ebx+16]
892                   por        mm6,mm7
893                   movq       [ebx+16],mm6
894 
895                   movq       mm7,[esi+24]
896                   pand       mm7,mm3
897                   movq       mm6,mm3
898                   pandn      mm6,[ebx+24]
899                   por        mm7,mm6
900                   movq       [ebx+24],mm7
901 
902                   movq       mm6,[esi+32]
903                   pand       mm6,mm4
904                   movq       mm7,mm4
905                   pandn      mm7,[ebx+32]
906                   por        mm6,mm7
907                   movq       [ebx+32],mm6
908 
909                   movq       mm7,[esi+40]
910                   pand       mm7,mm5
911                   movq       mm6,mm5
912                   pandn      mm6,[ebx+40]
913                   por        mm7,mm6
914                   movq       [ebx+40],mm7
915 
916                   add        esi,48            //inc by 48 bytes processed
917                   add        ebx,48
918                   sub        ecx,8             //dec by 8 pixels processed
919 
920                   ja         mainloop48
921 mainloop48end:
922 
923                   mov        ecx,diff
924                   cmp        ecx,0
925                   jz         end48
926 
927                   mov        edx,mask
928                   sal        edx,24            //make low byte the high byte
929 
930 secondloop48:
931                   sal        edx,1             //move high bit to CF
932                   jnc        skip48            //if CF = 0
933                   mov        eax,[esi]
934                   mov        [ebx],eax
935                   mov        ax,[esi+4]       // These 2 lines added 20070717
936                   mov        [ebx+4],ax       // Glenn R-P
937 skip48:
938                   add        esi,6            // Changed 4 to 6 on these 2
939                   add        ebx,6            // lines.  Glenn R-P 20070717
940 
941                   dec        ecx
942                   jnz        secondloop48
943 
944 end48:
945                   emms
946                }
947             }
948             else /* mmx _not supported - Use modified C routine */
949             {
950                register unsigned int incr1, initial_val, final_val;
951                png_size_t pixel_bytes;
952                png_uint_32 i;
953                register int disp = png_pass_inc[png_ptr->pass];
954                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
955 
956                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
957                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
958                   pixel_bytes;
959                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
960                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
961                final_val = png_ptr->width*pixel_bytes;
962                incr1 = (disp)*pixel_bytes;
963                for (i = initial_val; i < final_val; i += incr1)
964                {
965                   png_memcpy(dstptr, srcptr, pixel_bytes);
966                   srcptr += incr1;
967                   dstptr += incr1;
968                }
969             } /* end of else */
970 
971             break;
972          }       // end 48 bpp
973 
974          default:
975          {
976             png_bytep sptr;
977             png_bytep dp;
978             png_size_t pixel_bytes;
979             int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
980             unsigned int i;
981             register int disp = png_pass_inc[png_ptr->pass];  // get the offset
982             register unsigned int incr1, initial_val, final_val;
983 
984             pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
985             sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
986                pixel_bytes;
987             dp = row + offset_table[png_ptr->pass]*pixel_bytes;
988             initial_val = offset_table[png_ptr->pass]*pixel_bytes;
989             final_val = png_ptr->width*pixel_bytes;
990             incr1 = (disp)*pixel_bytes;
991             for (i = initial_val; i < final_val; i += incr1)
992             {
993                png_memcpy(dp, sptr, pixel_bytes);
994                sptr += incr1;
995                dp += incr1;
996             }
997             break;
998          }
999       } /* end switch (png_ptr->row_info.pixel_depth) */
1000    } /* end if (non-trivial mask) */
1001 
1002 } /* end png_combine_row() */
1003 
1004 
1005 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1006 
1007 void /* PRIVATE */
1008 png_do_read_interlace(png_structp png_ptr)
1009 {
1010    png_row_infop row_info = &(png_ptr->row_info);
1011    png_bytep row = png_ptr->row_buf + 1;
1012    int pass = png_ptr->pass;
1013    png_uint_32 transformations = png_ptr->transformations;
1014 #ifdef PNG_USE_LOCAL_ARRAYS
1015    PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1016 #endif
1017 
1018    png_debug(1,"in png_do_read_interlace\n");
1019 
1020    if (mmx_supported == 2) {
1021 #if !defined(PNG_1_0_X)
1022        /* this should have happened in png_init_mmx_flags() already */
1023        png_warning(png_ptr, "asm_flags may not have been initialized");
1024 #endif
1025        png_mmx_support();
1026    }
1027 
1028    if (row != NULL && row_info != NULL)
1029    {
1030       png_uint_32 final_width;
1031 
1032       final_width = row_info->width * png_pass_inc[pass];
1033 
1034       switch (row_info->pixel_depth)
1035       {
1036          case 1:
1037          {
1038             png_bytep sp, dp;
1039             int sshift, dshift;
1040             int s_start, s_end, s_inc;
1041             png_byte v;
1042             png_uint_32 i;
1043             int j;
1044 
1045             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1046             dp = row + (png_size_t)((final_width - 1) >> 3);
1047 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1048             if (transformations & PNG_PACKSWAP)
1049             {
1050                sshift = (int)((row_info->width + 7) & 7);
1051                dshift = (int)((final_width + 7) & 7);
1052                s_start = 7;
1053                s_end = 0;
1054                s_inc = -1;
1055             }
1056             else
1057 #endif
1058             {
1059                sshift = 7 - (int)((row_info->width + 7) & 7);
1060                dshift = 7 - (int)((final_width + 7) & 7);
1061                s_start = 0;
1062                s_end = 7;
1063                s_inc = 1;
1064             }
1065 
1066             for (i = row_info->width; i; i--)
1067             {
1068                v = (png_byte)((*sp >> sshift) & 0x1);
1069                for (j = 0; j < png_pass_inc[pass]; j++)
1070                {
1071                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1072                   *dp |= (png_byte)(v << dshift);
1073                   if (dshift == s_end)
1074                   {
1075                      dshift = s_start;
1076                      dp--;
1077                   }
1078                   else
1079                      dshift += s_inc;
1080                }
1081                if (sshift == s_end)
1082                {
1083                   sshift = s_start;
1084                   sp--;
1085                }
1086                else
1087                   sshift += s_inc;
1088             }
1089             break;
1090          }
1091 
1092          case 2:
1093          {
1094             png_bytep sp, dp;
1095             int sshift, dshift;
1096             int s_start, s_end, s_inc;
1097             png_uint_32 i;
1098 
1099             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1100             dp = row + (png_size_t)((final_width - 1) >> 2);
1101 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1102             if (transformations & PNG_PACKSWAP)
1103             {
1104                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1105                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1106                s_start = 6;
1107                s_end = 0;
1108                s_inc = -2;
1109             }
1110             else
1111 #endif
1112             {
1113                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1114                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1115                s_start = 0;
1116                s_end = 6;
1117                s_inc = 2;
1118             }
1119 
1120             for (i = row_info->width; i; i--)
1121             {
1122                png_byte v;
1123                int j;
1124 
1125                v = (png_byte)((*sp >> sshift) & 0x3);
1126                for (j = 0; j < png_pass_inc[pass]; j++)
1127                {
1128                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1129                   *dp |= (png_byte)(v << dshift);
1130                   if (dshift == s_end)
1131                   {
1132                      dshift = s_start;
1133                      dp--;
1134                   }
1135                   else
1136                      dshift += s_inc;
1137                }
1138                if (sshift == s_end)
1139                {
1140                   sshift = s_start;
1141                   sp--;
1142                }
1143                else
1144                   sshift += s_inc;
1145             }
1146             break;
1147          }
1148 
1149          case 4:
1150          {
1151             png_bytep sp, dp;
1152             int sshift, dshift;
1153             int s_start, s_end, s_inc;
1154             png_uint_32 i;
1155 
1156             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1157             dp = row + (png_size_t)((final_width - 1) >> 1);
1158 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1159             if (transformations & PNG_PACKSWAP)
1160             {
1161                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1162                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1163                s_start = 4;
1164                s_end = 0;
1165                s_inc = -4;
1166             }
1167             else
1168 #endif
1169             {
1170                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1171                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1172                s_start = 0;
1173                s_end = 4;
1174                s_inc = 4;
1175             }
1176 
1177             for (i = row_info->width; i; i--)
1178             {
1179                png_byte v;
1180                int j;
1181 
1182                v = (png_byte)((*sp >> sshift) & 0xf);
1183                for (j = 0; j < png_pass_inc[pass]; j++)
1184                {
1185                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1186                   *dp |= (png_byte)(v << dshift);
1187                   if (dshift == s_end)
1188                   {
1189                      dshift = s_start;
1190                      dp--;
1191                   }
1192                   else
1193                      dshift += s_inc;
1194                }
1195                if (sshift == s_end)
1196                {
1197                   sshift = s_start;
1198                   sp--;
1199                }
1200                else
1201                   sshift += s_inc;
1202             }
1203             break;
1204          }
1205 
1206          default:         // This is the place where the routine is modified
1207          {
1208             __int64 const4 = 0x0000000000FFFFFF;
1209             // __int64 const5 = 0x000000FFFFFF0000;  // unused...
1210             __int64 const6 = 0x00000000000000FF;
1211             png_bytep sptr, dp;
1212             png_uint_32 i;
1213             png_size_t pixel_bytes;
1214             int width = row_info->width;
1215 
1216             pixel_bytes = (row_info->pixel_depth >> 3);
1217 
1218             sptr = row + (width - 1) * pixel_bytes;
1219             dp = row + (final_width - 1) * pixel_bytes;
1220             // New code by Nirav Chhatrapati - Intel Corporation
1221             // sign fix by GRR
1222             // NOTE:  there is NO MMX code for 48-bit and 64-bit images
1223 
1224             // use MMX routine if machine supports it
1225 #if !defined(PNG_1_0_X)
1226             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1227                 /* && mmx_supported */ )
1228 #else
1229             if (mmx_supported)
1230 #endif
1231             {
1232                if (pixel_bytes == 3)
1233                {
1234                   if (((pass == 4) || (pass == 5)) && width)
1235                   {
1236                      int width_mmx = ((width >> 1) << 1) - 8;
1237                      if (width_mmx < 0)
1238                          width_mmx = 0;
1239                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1240                      if (width_mmx)
1241                      {
1242                         _asm
1243                         {
1244                            mov esi, sptr
1245                            mov edi, dp
1246                            mov ecx, width_mmx
1247                            sub esi, 3
1248                            sub edi, 9
1249 loop_pass4:
1250                            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
1251                            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
1252                            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
1253                            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
1254                            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
1255                            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
1256                            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
1257                            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
1258                            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
1259                            movq [edi], mm0     ; move quad to memory
1260                            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
1261                            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
1262                            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
1263                            movd [edi+8], mm6   ; move double to memory
1264                            sub esi, 6
1265                            sub edi, 12
1266                            sub ecx, 2
1267                            jnz loop_pass4
1268                            EMMS
1269                         }
1270                      }
1271 
1272                      sptr -= width_mmx*3;
1273                      dp -= width_mmx*6;
1274                      for (i = width; i; i--)
1275                      {
1276                         png_byte v[8];
1277                         int j;
1278 
1279                         png_memcpy(v, sptr, 3);
1280                         for (j = 0; j < png_pass_inc[pass]; j++)
1281                         {
1282                            png_memcpy(dp, v, 3);
1283                            dp -= 3;
1284                         }
1285                         sptr -= 3;
1286                      }
1287                   }
1288                   else if (((pass == 2) || (pass == 3)) && width)
1289                   {
1290                      _asm
1291                      {
1292                         mov esi, sptr
1293                         mov edi, dp
1294                         mov ecx, width
1295                         sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
1296 loop_pass2:
1297                         movd mm0, [esi]     ; X X X X X v2 v1 v0
1298                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1299                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1300                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1301                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1302                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1303                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1304                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1305                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1306                         movq [edi+4], mm0   ; move to memory
1307                         psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
1308                         movd [edi], mm0     ; move to memory
1309                         sub esi, 3
1310                         sub edi, 12
1311                         dec ecx
1312                         jnz loop_pass2
1313                         EMMS
1314                      }
1315                   }
1316                   else if (width) /* && ((pass == 0) || (pass == 1))) */
1317                   {
1318                      _asm
1319                      {
1320                         mov esi, sptr
1321                         mov edi, dp
1322                         mov ecx, width
1323                         sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
1324 loop_pass0:
1325                         movd mm0, [esi]     ; X X X X X v2 v1 v0
1326                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1327                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1328                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1329                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1330                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1331                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1332                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1333                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1334                         movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
1335                         psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
1336                         movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
1337                         punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
1338                         movq [edi+16] , mm4
1339                         psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
1340                         movq [edi+8] , mm3
1341                         punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
1342                         sub esi, 3
1343                         movq [edi], mm0
1344                         sub edi, 24
1345                         //sub esi, 3
1346                         dec ecx
1347                         jnz loop_pass0
1348                         EMMS
1349                      }
1350                   }
1351                } /* end of pixel_bytes == 3 */
1352 
1353                else if (pixel_bytes == 1)
1354                {
1355                   if (((pass == 4) || (pass == 5)) && width)
1356                   {
1357                      int width_mmx = ((width >> 3) << 3);
1358                      width -= width_mmx;
1359                      if (width_mmx)
1360                      {
1361                         _asm
1362                         {
1363                            mov esi, sptr
1364                            mov edi, dp
1365                            mov ecx, width_mmx
1366                            sub edi, 15
1367                            sub esi, 7
1368 loop1_pass4:
1369                            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
1370                            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
1371                            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
1372                            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
1373                            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
1374                            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
1375                            sub esi, 8
1376                            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
1377                            //sub esi, 4
1378                            sub edi, 16
1379                            sub ecx, 8
1380                            jnz loop1_pass4
1381                            EMMS
1382                         }
1383                      }
1384 
1385                      sptr -= width_mmx;
1386                      dp -= width_mmx*2;
1387                      for (i = width; i; i--)
1388                      {
1389                         int j;
1390 
1391                         for (j = 0; j < png_pass_inc[pass]; j++)
1392                         {
1393                            *dp-- = *sptr;
1394                         }
1395                         sptr --;
1396                      }
1397                   }
1398                   else if (((pass == 2) || (pass == 3)) && width)
1399                   {
1400                      int width_mmx = ((width >> 2) << 2);
1401                      width -= width_mmx;
1402                      if (width_mmx)
1403                      {
1404                         _asm
1405                         {
1406                            mov esi, sptr
1407                            mov edi, dp
1408                            mov ecx, width_mmx
1409                            sub edi, 15
1410                            sub esi, 3
1411 loop1_pass2:
1412                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1413                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1414                            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1415                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1416                            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
1417                            movq [edi], mm0     ; move to memory v2 and v3
1418                            sub esi, 4
1419                            movq [edi+8], mm1   ; move to memory v1     and v0
1420                            sub edi, 16
1421                            sub ecx, 4
1422                            jnz loop1_pass2
1423                            EMMS
1424                         }
1425                      }
1426 
1427                      sptr -= width_mmx;
1428                      dp -= width_mmx*4;
1429                      for (i = width; i; i--)
1430                      {
1431                         int j;
1432 
1433                         for (j = 0; j < png_pass_inc[pass]; j++)
1434                         {
1435                            *dp-- = *sptr;
1436                         }
1437                         sptr --;
1438                      }
1439                   }
1440                   else if (width) /* && ((pass == 0) || (pass == 1))) */
1441                   {
1442                      int width_mmx = ((width >> 2) << 2);
1443                      width -= width_mmx;
1444                      if (width_mmx)
1445                      {
1446                         _asm
1447                         {
1448                            mov esi, sptr
1449                            mov edi, dp
1450                            mov ecx, width_mmx
1451                            sub edi, 31
1452                            sub esi, 3
1453 loop1_pass0:
1454                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1455                            movq mm1, mm0       ; X X X X v0 v1 v2 v3
1456                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1457                            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1458                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1459                            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
1460                            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
1461                            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
1462                            movq [edi], mm0     ; move to memory v3
1463                            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
1464                            movq [edi+8], mm3   ; move to memory v2
1465                            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
1466                            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
1467                            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
1468                            movq [edi+16], mm2  ; move to memory v1
1469                            movq [edi+24], mm4  ; move to memory v0
1470                            sub esi, 4
1471                            sub edi, 32
1472                            sub ecx, 4
1473                            jnz loop1_pass0
1474                            EMMS
1475                         }
1476                      }
1477 
1478                      sptr -= width_mmx;
1479                      dp -= width_mmx*8;
1480                      for (i = width; i; i--)
1481                      {
1482                         int j;
1483 
1484                        /* I simplified this part in version 1.0.4e
1485                         * here and in several other instances where
1486                         * pixel_bytes == 1  -- GR-P
1487                         *
1488                         * Original code:
1489                         *
1490                         * png_byte v[8];
1491                         * png_memcpy(v, sptr, pixel_bytes);
1492                         * for (j = 0; j < png_pass_inc[pass]; j++)
1493                         * {
1494                         *    png_memcpy(dp, v, pixel_bytes);
1495                         *    dp -= pixel_bytes;
1496                         * }
1497                         * sptr -= pixel_bytes;
1498                         *
1499                         * Replacement code is in the next three lines:
1500                         */
1501 
1502                         for (j = 0; j < png_pass_inc[pass]; j++)
1503                            *dp-- = *sptr;
1504                         sptr--;
1505                      }
1506                   }
1507                } /* end of pixel_bytes == 1 */
1508 
1509                else if (pixel_bytes == 2)
1510                {
1511                   if (((pass == 4) || (pass == 5)) && width)
1512                   {
1513                      int width_mmx = ((width >> 1) << 1) ;
1514                      width -= width_mmx;
1515                      if (width_mmx)
1516                      {
1517                         _asm
1518                         {
1519                            mov esi, sptr
1520                            mov edi, dp
1521                            mov ecx, width_mmx
1522                            sub esi, 2
1523                            sub edi, 6
1524 loop2_pass4:
1525                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1526                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1527                            sub esi, 4
1528                            movq [edi], mm0
1529                            sub edi, 8
1530                            sub ecx, 2
1531                            jnz loop2_pass4
1532                            EMMS
1533                         }
1534                      }
1535 
1536                      sptr -= (width_mmx*2 - 2);            // sign fixed
1537                      dp -= (width_mmx*4 - 2);            // sign fixed
1538                      for (i = width; i; i--)
1539                      {
1540                         png_byte v[8];
1541                         int j;
1542                         sptr -= 2;
1543                         png_memcpy(v, sptr, 2);
1544                         for (j = 0; j < png_pass_inc[pass]; j++)
1545                         {
1546                            dp -= 2;
1547                            png_memcpy(dp, v, 2);
1548                         }
1549                      }
1550                   }
1551                   else if (((pass == 2) || (pass == 3)) && width)
1552                   {
1553                      int width_mmx = ((width >> 1) << 1) ;
1554                      width -= width_mmx;
1555                      if (width_mmx)
1556                      {
1557                         _asm
1558                         {
1559                            mov esi, sptr
1560                            mov edi, dp
1561                            mov ecx, width_mmx
1562                            sub esi, 2
1563                            sub edi, 14
1564 loop2_pass2:
1565                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1566                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1567                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1568                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1569                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1570                            movq [edi], mm0
1571                            sub esi, 4
1572                            movq [edi + 8], mm1
1573                            //sub esi, 4
1574                            sub edi, 16
1575                            sub ecx, 2
1576                            jnz loop2_pass2
1577                            EMMS
1578                         }
1579                      }
1580 
1581                      sptr -= (width_mmx*2 - 2);            // sign fixed
1582                      dp -= (width_mmx*8 - 2);            // sign fixed
1583                      for (i = width; i; i--)
1584                      {
1585                         png_byte v[8];
1586                         int j;
1587                         sptr -= 2;
1588                         png_memcpy(v, sptr, 2);
1589                         for (j = 0; j < png_pass_inc[pass]; j++)
1590                         {
1591                            dp -= 2;
1592                            png_memcpy(dp, v, 2);
1593                         }
1594                      }
1595                   }
1596                   else if (width) /* && ((pass == 0) || (pass == 1))) */
1597                   {
1598                      int width_mmx = ((width >> 1) << 1);
1599                      width -= width_mmx;
1600                      if (width_mmx)
1601                      {
1602                         _asm
1603                         {
1604                            mov esi, sptr
1605                            mov edi, dp
1606                            mov ecx, width_mmx
1607                            sub esi, 2
1608                            sub edi, 30
1609 loop2_pass0:
1610                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1611                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1612                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1613                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1614                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1615                            movq [edi], mm0
1616                            movq [edi + 8], mm0
1617                            movq [edi + 16], mm1
1618                            movq [edi + 24], mm1
1619                            sub esi, 4
1620                            sub edi, 32
1621                            sub ecx, 2
1622                            jnz loop2_pass0
1623                            EMMS
1624                         }
1625                      }
1626 
1627                      sptr -= (width_mmx*2 - 2);            // sign fixed
1628                      dp -= (width_mmx*16 - 2);            // sign fixed
1629                      for (i = width; i; i--)
1630                      {
1631                         png_byte v[8];
1632                         int j;
1633                         sptr -= 2;
1634                         png_memcpy(v, sptr, 2);
1635                         for (j = 0; j < png_pass_inc[pass]; j++)
1636                         {
1637                            dp -= 2;
1638                            png_memcpy(dp, v, 2);
1639                         }
1640                      }
1641                   }
1642                } /* end of pixel_bytes == 2 */
1643 
1644                else if (pixel_bytes == 4)
1645                {
1646                   if (((pass == 4) || (pass == 5)) && width)
1647                   {
1648                      int width_mmx = ((width >> 1) << 1) ;
1649                      width -= width_mmx;
1650                      if (width_mmx)
1651                      {
1652                         _asm
1653                         {
1654                            mov esi, sptr
1655                            mov edi, dp
1656                            mov ecx, width_mmx
1657                            sub esi, 4
1658                            sub edi, 12
1659 loop4_pass4:
1660                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1661                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1662                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1663                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1664                            movq [edi], mm0
1665                            sub esi, 8
1666                            movq [edi + 8], mm1
1667                            sub edi, 16
1668                            sub ecx, 2
1669                            jnz loop4_pass4
1670                            EMMS
1671                         }
1672                      }
1673 
1674                      sptr -= (width_mmx*4 - 4);          // sign fixed
1675                      dp -= (width_mmx*8 - 4);            // sign fixed
1676                      for (i = width; i; i--)
1677                      {
1678                         png_byte v[8];
1679                         int j;
1680                         sptr -= 4;
1681                         png_memcpy(v, sptr, 4);
1682                         for (j = 0; j < png_pass_inc[pass]; j++)
1683                         {
1684                            dp -= 4;
1685                            png_memcpy(dp, v, 4);
1686                         }
1687                      }
1688                   }
1689                   else if (((pass == 2) || (pass == 3)) && width)
1690                   {
1691                      int width_mmx = ((width >> 1) << 1) ;
1692                      width -= width_mmx;
1693                      if (width_mmx)
1694                      {
1695                         _asm
1696                         {
1697                            mov esi, sptr
1698                            mov edi, dp
1699                            mov ecx, width_mmx
1700                            sub esi, 4
1701                            sub edi, 28
1702 loop4_pass2:
1703                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1704                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1705                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1706                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1707                            movq [edi], mm0
1708                            movq [edi + 8], mm0
1709                            movq [edi+16], mm1
1710                            movq [edi + 24], mm1
1711                            sub esi, 8
1712                            sub edi, 32
1713                            sub ecx, 2
1714                            jnz loop4_pass2
1715                            EMMS
1716                         }
1717                      }
1718 
1719                      sptr -= (width_mmx*4 - 4);            // sign fixed
1720                      dp -= (width_mmx*16 - 4);            // sign fixed
1721                      for (i = width; i; i--)
1722                      {
1723                         png_byte v[8];
1724                         int j;
1725                         sptr -= 4;
1726                         png_memcpy(v, sptr, 4);
1727                         for (j = 0; j < png_pass_inc[pass]; j++)
1728                         {
1729                            dp -= 4;
1730                            png_memcpy(dp, v, 4);
1731                         }
1732                      }
1733                   }
1734                   else if (width) /* && ((pass == 0) || (pass == 1))) */
1735                   {
1736                      int width_mmx = ((width >> 1) << 1) ;
1737                      width -= width_mmx;
1738                      if (width_mmx)
1739                      {
1740                         _asm
1741                         {
1742                            mov esi, sptr
1743                            mov edi, dp
1744                            mov ecx, width_mmx
1745                            sub esi, 4
1746                            sub edi, 60
1747 loop4_pass0:
1748                            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
1749                            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
1750                            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
1751                            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
1752                            movq [edi], mm0
1753                            movq [edi + 8], mm0
1754                            movq [edi + 16], mm0
1755                            movq [edi + 24], mm0
1756                            movq [edi+32], mm1
1757                            movq [edi + 40], mm1
1758                            movq [edi+ 48], mm1
1759                            sub esi, 8
1760                            movq [edi + 56], mm1
1761                            sub edi, 64
1762                            sub ecx, 2
1763                            jnz loop4_pass0
1764                            EMMS
1765                         }
1766                      }
1767 
1768                      sptr -= (width_mmx*4 - 4);            // sign fixed
1769                      dp -= (width_mmx*32 - 4);            // sign fixed
1770                      for (i = width; i; i--)
1771                      {
1772                         png_byte v[8];
1773                         int j;
1774                         sptr -= 4;
1775                         png_memcpy(v, sptr, 4);
1776                         for (j = 0; j < png_pass_inc[pass]; j++)
1777                         {
1778                            dp -= 4;
1779                            png_memcpy(dp, v, 4);
1780                         }
1781                      }
1782                   }
1783 
1784                } /* end of pixel_bytes == 4 */
1785 
1786                else if (pixel_bytes == 6)
1787                {
1788                   for (i = width; i; i--)
1789                   {
1790                      png_byte v[8];
1791                      int j;
1792                      png_memcpy(v, sptr, 6);
1793                      for (j = 0; j < png_pass_inc[pass]; j++)
1794                      {
1795                         png_memcpy(dp, v, 6);
1796                         dp -= 6;
1797                      }
1798                      sptr -= 6;
1799                   }
1800                } /* end of pixel_bytes == 6 */
1801 
1802                else
1803                {
1804                   for (i = width; i; i--)
1805                   {
1806                      png_byte v[8];
1807                      int j;
1808                      png_memcpy(v, sptr, pixel_bytes);
1809                      for (j = 0; j < png_pass_inc[pass]; j++)
1810                      {
1811                         png_memcpy(dp, v, pixel_bytes);
1812                         dp -= pixel_bytes;
1813                      }
1814                      sptr-= pixel_bytes;
1815                   }
1816                }
1817             } /* end of mmx_supported */
1818 
1819             else /* MMX not supported:  use modified C code - takes advantage
1820                   * of inlining of memcpy for a constant */
1821             {
1822                if (pixel_bytes == 1)
1823                {
1824                   for (i = width; i; i--)
1825                   {
1826                      int j;
1827                      for (j = 0; j < png_pass_inc[pass]; j++)
1828                         *dp-- = *sptr;
1829                      sptr--;
1830                   }
1831                }
1832                else if (pixel_bytes == 3)
1833                {
1834                   for (i = width; i; i--)
1835                   {
1836                      png_byte v[8];
1837                      int j;
1838                      png_memcpy(v, sptr, pixel_bytes);
1839                      for (j = 0; j < png_pass_inc[pass]; j++)
1840                      {
1841                         png_memcpy(dp, v, pixel_bytes);
1842                         dp -= pixel_bytes;
1843                      }
1844                      sptr -= pixel_bytes;
1845                   }
1846                }
1847                else if (pixel_bytes == 2)
1848                {
1849                   for (i = width; i; i--)
1850                   {
1851                      png_byte v[8];
1852                      int j;
1853                      png_memcpy(v, sptr, pixel_bytes);
1854                      for (j = 0; j < png_pass_inc[pass]; j++)
1855                      {
1856                         png_memcpy(dp, v, pixel_bytes);
1857                         dp -= pixel_bytes;
1858                      }
1859                      sptr -= pixel_bytes;
1860                   }
1861                }
1862                else if (pixel_bytes == 4)
1863                {
1864                   for (i = width; i; i--)
1865                   {
1866                      png_byte v[8];
1867                      int j;
1868                      png_memcpy(v, sptr, pixel_bytes);
1869                      for (j = 0; j < png_pass_inc[pass]; j++)
1870                      {
1871                         png_memcpy(dp, v, pixel_bytes);
1872                         dp -= pixel_bytes;
1873                      }
1874                      sptr -= pixel_bytes;
1875                   }
1876                }
1877                else if (pixel_bytes == 6)
1878                {
1879                   for (i = width; i; i--)
1880                   {
1881                      png_byte v[8];
1882                      int j;
1883                      png_memcpy(v, sptr, pixel_bytes);
1884                      for (j = 0; j < png_pass_inc[pass]; j++)
1885                      {
1886                         png_memcpy(dp, v, pixel_bytes);
1887                         dp -= pixel_bytes;
1888                      }
1889                      sptr -= pixel_bytes;
1890                   }
1891                }
1892                else
1893                {
1894                   for (i = width; i; i--)
1895                   {
1896                      png_byte v[8];
1897                      int j;
1898                      png_memcpy(v, sptr, pixel_bytes);
1899                      for (j = 0; j < png_pass_inc[pass]; j++)
1900                      {
1901                         png_memcpy(dp, v, pixel_bytes);
1902                         dp -= pixel_bytes;
1903                      }
1904                      sptr -= pixel_bytes;
1905                   }
1906                }
1907 
1908             } /* end of MMX not supported */
1909             break;
1910          }
1911       } /* end switch (row_info->pixel_depth) */
1912 
1913       row_info->width = final_width;
1914 
1915       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1916    }
1917 
1918 }
1919 
1920 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1921 
1922 
1923 // File-scope 64-bit mask constants loaded by the MMX filter code below.
1924 // They are declared through a union to ensure alignment on 8-byte boundaries.
1925   union uAll {
1926      __int64 use;               // the 64-bit value itself; being the first
                                // member, it is the one a brace initializer sets
1927      double  double_align;      // not referenced in the visible code;
                                // presumably present only for alignment -- confirm
1928      long long long_long_align; // likewise, presumably alignment only -- confirm
1929   } ;
     // LBCarryMask: 0x01 in every byte.  Loaded into mm5 and ANDed with the
     // prior-row / raw bytes to pick out each byte's least-significant bit.
     // HBClearMask: 0x7f in every byte.  Loaded into mm4 and ANDed after a
     // 64-bit "psrlq ..., 1" to clear bit 7 of each byte.
1930   static PNG_CONST union uAll LBCarryMask = {0x0101010101010101},
1931                               HBClearMask = {0x7f7f7f7f7f7f7f7f};
1932 
1933 // Optimized code for PNG Average filter decoder
1934 void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info,png_bytep row,png_bytep prev_row)1935 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1936                             , png_bytep prev_row)
1937 {
1938   // These variables are declared
1939   // here to ensure alignment on 8-byte boundaries.
1940   union uAll ActiveMask, ShiftBpp, ShiftRem;
1941 
1942    int bpp;
1943    png_uint_32 FullLength;
1944    png_uint_32 MMXLength;
1945    //png_uint_32 len;
1946    int diff;
1947 
1948    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1949    FullLength  = row_info->rowbytes; // # of bytes to filter
1950    _asm {
1951          // Init address pointers and offset
1952          mov edi, row          // edi ==> Avg(x)
1953          xor ebx, ebx          // ebx ==> x
1954          mov edx, edi
1955          mov esi, prev_row           // esi ==> Prior(x)
1956          sub edx, bpp          // edx ==> Raw(x-bpp)
1957 
1958          xor eax, eax
1959          // Compute the Raw value for the first bpp bytes
1960          //    Raw(x) = Avg(x) + (Prior(x)/2)
1961 davgrlp:
1962          mov al, [esi + ebx]   // Load al with Prior(x)
1963          inc ebx
1964          shr al, 1             // divide by 2
1965          add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
1966          cmp ebx, bpp
1967          mov [edi+ebx-1], al    // Write back Raw(x);
1968                             // mov does not affect flags; -1 to offset inc ebx
1969          jb davgrlp
1970          // get # of bytes to alignment
1971          mov diff, edi         // take start of row
1972          add diff, ebx         // add bpp
1973          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
1974          and diff, 0xfffffff8  // mask to alignment boundary
1975          sub diff, edi         // subtract from start ==> value ebx at alignment
1976          jz davggo
1977          // fix alignment
1978          // Compute the Raw value for the bytes upto the alignment boundary
1979          //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1980          xor ecx, ecx
1981 davglp1:
1982          xor eax, eax
1983          mov cl, [esi + ebx]        // load cl with Prior(x)
1984          mov al, [edx + ebx]  // load al with Raw(x-bpp)
1985          add ax, cx
1986          inc ebx
1987          shr ax, 1            // divide by 2
1988          add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
1989          cmp ebx, diff              // Check if at alignment boundary
1990          mov [edi+ebx-1], al        // Write back Raw(x);
1991                             // mov does not affect flags; -1 to offset inc ebx
1992          jb davglp1               // Repeat until at alignment boundary
1993 davggo:
1994          mov eax, FullLength
1995          mov ecx, eax
1996          sub eax, ebx          // subtract alignment fix
1997          and eax, 0x00000007   // calc bytes over mult of 8
1998          sub ecx, eax          // drop over bytes from original length
1999          mov MMXLength, ecx
2000    } // end _asm block
2001    // Now do the math for the rest of the row
2002    switch ( bpp )
2003    {
2004       case 3:
2005       {
2006          ActiveMask.use  = 0x0000000000ffffff;
2007          ShiftBpp.use = 24;    // == 3 * 8
2008          ShiftRem.use = 40;    // == 64 - 24
2009          _asm {
2010             // Re-init address pointers and offset
2011             movq mm7, ActiveMask
2012             mov ebx, diff      // ebx ==> x = offset to alignment boundary
2013             movq mm5, LBCarryMask
2014             mov edi, row       // edi ==> Avg(x)
2015             movq mm4, HBClearMask
2016             mov esi, prev_row        // esi ==> Prior(x)
2017             // PRIME the pump (load the first Raw(x-bpp) data set
2018             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2019                                // (we correct position in loop below)
2020 davg3lp:
2021             movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
2022             // Add (Prev_row/2) to Average
2023             movq mm3, mm5
2024             psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
2025             movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
2026             movq mm6, mm7
2027             pand mm3, mm1      // get lsb for each prev_row byte
2028             psrlq mm1, 1       // divide prev_row bytes by 2
2029             pand  mm1, mm4     // clear invalid bit 7 of each byte
2030             paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
2031             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2032             movq mm1, mm3      // now use mm1 for getting LBCarrys
2033             pand mm1, mm2      // get LBCarrys for each byte where both
2034                                // lsb's were == 1 (Only valid for active group)
2035             psrlq mm2, 1       // divide raw bytes by 2
2036             pand  mm2, mm4     // clear invalid bit 7 of each byte
2037             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2038             pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
2039             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2040                                //  byte
2041             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2042             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
2043             movq mm2, mm0        // mov updated Raws to mm2
2044             psllq mm2, ShiftBpp  // shift data to position correctly
2045             movq mm1, mm3        // now use mm1 for getting LBCarrys
2046             pand mm1, mm2      // get LBCarrys for each byte where both
2047                                // lsb's were == 1 (Only valid for active group)
2048             psrlq mm2, 1       // divide raw bytes by 2
2049             pand  mm2, mm4     // clear invalid bit 7 of each byte
2050             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2051             pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
2052             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2053                                //  byte
2054 
2055             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2056             psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
2057                                  // bytes
2058             movq mm2, mm0        // mov updated Raws to mm2
2059             psllq mm2, ShiftBpp  // shift data to position correctly
2060                               // Data only needs to be shifted once here to
2061                               // get the correct x-bpp offset.
2062             movq mm1, mm3     // now use mm1 for getting LBCarrys
2063             pand mm1, mm2     // get LBCarrys for each byte where both
2064                               // lsb's were == 1 (Only valid for active group)
2065             psrlq mm2, 1      // divide raw bytes by 2
2066             pand  mm2, mm4    // clear invalid bit 7 of each byte
2067             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2068             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2069             add ebx, 8
2070             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2071                               // byte
2072 
2073             // Now ready to write back to memory
2074             movq [edi + ebx - 8], mm0
2075             // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2076             cmp ebx, MMXLength
2077             movq mm2, mm0     // mov updated Raw(x) to mm2
2078             jb davg3lp
2079          } // end _asm block
2080       }
2081       break;
2082 
2083       case 6:
2084       case 4:
2085       case 7:
2086       case 5:
2087       {
2088          ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
2089                                                 // appropriate inactive bytes
2090          ShiftBpp.use = bpp << 3;
2091          ShiftRem.use = 64 - ShiftBpp.use;
2092          _asm {
2093             movq mm4, HBClearMask
2094             // Re-init address pointers and offset
2095             mov ebx, diff       // ebx ==> x = offset to alignment boundary
2096             // Load ActiveMask and clear all bytes except for 1st active group
2097             movq mm7, ActiveMask
2098             mov edi, row         // edi ==> Avg(x)
2099             psrlq mm7, ShiftRem
2100             mov esi, prev_row    // esi ==> Prior(x)
2101             movq mm6, mm7
2102             movq mm5, LBCarryMask
2103             psllq mm6, ShiftBpp  // Create mask for 2nd active group
2104             // PRIME the pump (load the first Raw(x-bpp) data set
2105             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2106                                  // (we correct position in loop below)
2107 davg4lp:
2108             movq mm0, [edi + ebx]
2109             psrlq mm2, ShiftRem  // shift data to position correctly
2110             movq mm1, [esi + ebx]
2111             // Add (Prev_row/2) to Average
2112             movq mm3, mm5
2113             pand mm3, mm1     // get lsb for each prev_row byte
2114             psrlq mm1, 1      // divide prev_row bytes by 2
2115             pand  mm1, mm4    // clear invalid bit 7 of each byte
2116             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2117             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2118             movq mm1, mm3     // now use mm1 for getting LBCarrys
2119             pand mm1, mm2     // get LBCarrys for each byte where both
2120                               // lsb's were == 1 (Only valid for active group)
2121             psrlq mm2, 1      // divide raw bytes by 2
2122             pand  mm2, mm4    // clear invalid bit 7 of each byte
2123             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2124             pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
2125             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2126                               // byte
2127             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2128             movq mm2, mm0     // mov updated Raws to mm2
2129             psllq mm2, ShiftBpp // shift data to position correctly
2130             add ebx, 8
2131             movq mm1, mm3     // now use mm1 for getting LBCarrys
2132             pand mm1, mm2     // get LBCarrys for each byte where both
2133                               // lsb's were == 1 (Only valid for active group)
2134             psrlq mm2, 1      // divide raw bytes by 2
2135             pand  mm2, mm4    // clear invalid bit 7 of each byte
2136             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2137             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2138             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2139                               // byte
2140             cmp ebx, MMXLength
2141             // Now ready to write back to memory
2142             movq [edi + ebx - 8], mm0
2143             // Prep Raw(x-bpp) for next loop
2144             movq mm2, mm0     // mov updated Raws to mm2
2145             jb davg4lp
2146          } // end _asm block
2147       }
2148       break;
2149       case 2:
2150       {
2151          ActiveMask.use  = 0x000000000000ffff;
2152          ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
2153          ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
2154          _asm {
2155             // Load ActiveMask
2156             movq mm7, ActiveMask
2157             // Re-init address pointers and offset
2158             mov ebx, diff     // ebx ==> x = offset to alignment boundary
2159             movq mm5, LBCarryMask
2160             mov edi, row      // edi ==> Avg(x)
2161             movq mm4, HBClearMask
2162             mov esi, prev_row  // esi ==> Prior(x)
2163             // PRIME the pump (load the first Raw(x-bpp) data set
2164             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2165                               // (we correct position in loop below)
2166 davg2lp:
2167             movq mm0, [edi + ebx]
2168             psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
2169             movq mm1, [esi + ebx]
2170             // Add (Prev_row/2) to Average
2171             movq mm3, mm5
2172             pand mm3, mm1     // get lsb for each prev_row byte
2173             psrlq mm1, 1      // divide prev_row bytes by 2
2174             pand  mm1, mm4    // clear invalid bit 7 of each byte
2175             movq mm6, mm7
2176             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2177             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2178             movq mm1, mm3     // now use mm1 for getting LBCarrys
2179             pand mm1, mm2     // get LBCarrys for each byte where both
2180                               // lsb's were == 1 (Only valid for active group)
2181             psrlq mm2, 1      // divide raw bytes by 2
2182             pand  mm2, mm4    // clear invalid bit 7 of each byte
2183             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2184             pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
2185             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2186             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2187             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2188             movq mm2, mm0       // mov updated Raws to mm2
2189             psllq mm2, ShiftBpp // shift data to position correctly
2190             movq mm1, mm3       // now use mm1 for getting LBCarrys
2191             pand mm1, mm2       // get LBCarrys for each byte where both
2192                                 // lsb's were == 1 (Only valid for active group)
2193             psrlq mm2, 1        // divide raw bytes by 2
2194             pand  mm2, mm4      // clear invalid bit 7 of each byte
2195             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2196             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2197             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2198 
2199             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2200             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2201             movq mm2, mm0       // mov updated Raws to mm2
2202             psllq mm2, ShiftBpp // shift data to position correctly
2203                                 // Data only needs to be shifted once here to
2204                                 // get the correct x-bpp offset.
2205             movq mm1, mm3       // now use mm1 for getting LBCarrys
2206             pand mm1, mm2       // get LBCarrys for each byte where both
2207                                 // lsb's were == 1 (Only valid for active group)
2208             psrlq mm2, 1        // divide raw bytes by 2
2209             pand  mm2, mm4      // clear invalid bit 7 of each byte
2210             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2211             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2212             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2213 
2214             // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2215             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
2216             movq mm2, mm0        // mov updated Raws to mm2
2217             psllq mm2, ShiftBpp  // shift data to position correctly
2218                                  // Data only needs to be shifted once here to
2219                                  // get the correct x-bpp offset.
2220             add ebx, 8
2221             movq mm1, mm3    // now use mm1 for getting LBCarrys
2222             pand mm1, mm2    // get LBCarrys for each byte where both
2223                              // lsb's were == 1 (Only valid for active group)
2224             psrlq mm2, 1     // divide raw bytes by 2
2225             pand  mm2, mm4   // clear invalid bit 7 of each byte
2226             paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
2227             pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
2228             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2229 
2230             cmp ebx, MMXLength
2231             // Now ready to write back to memory
2232             movq [edi + ebx - 8], mm0
2233             // Prep Raw(x-bpp) for next loop
2234             movq mm2, mm0    // mov updated Raws to mm2
2235             jb davg2lp
2236         } // end _asm block
2237       }
2238       break;
2239 
2240       case 1:                 // bpp == 1
2241       {
2242          _asm {
2243             // Re-init address pointers and offset
2244             mov ebx, diff     // ebx ==> x = offset to alignment boundary
2245             mov edi, row      // edi ==> Avg(x)
2246             cmp ebx, FullLength  // Test if offset at end of array
2247             jnb davg1end
2248             // Do Avg decode for remaining bytes
2249             mov esi, prev_row    // esi ==> Prior(x)
2250             mov edx, edi
2251             xor ecx, ecx         // zero ecx before using cl & cx in loop below
2252             sub edx, bpp         // edx ==> Raw(x-bpp)
2253 davg1lp:
2254             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2255             xor eax, eax
2256             mov cl, [esi + ebx]  // load cl with Prior(x)
2257             mov al, [edx + ebx]  // load al with Raw(x-bpp)
2258             add ax, cx
2259             inc ebx
2260             shr ax, 1            // divide by 2
2261             add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
2262             cmp ebx, FullLength  // Check if at end of array
2263             mov [edi+ebx-1], al  // Write back Raw(x);
2264                          // mov does not affect flags; -1 to offset inc ebx
2265             jb davg1lp
2266 davg1end:
2267          } // end _asm block
2268       }
2269       return;
2270 
2271       case 8:             // bpp == 8
2272       {
2273          _asm {
2274             // Re-init address pointers and offset
2275             mov ebx, diff           // ebx ==> x = offset to alignment boundary
2276             movq mm5, LBCarryMask
2277             mov edi, row            // edi ==> Avg(x)
2278             movq mm4, HBClearMask
2279             mov esi, prev_row       // esi ==> Prior(x)
2280             // PRIME the pump (load the first Raw(x-bpp) data set
2281             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2282                                 // (NO NEED to correct position in loop below)
2283 davg8lp:
2284             movq mm0, [edi + ebx]
2285             movq mm3, mm5
2286             movq mm1, [esi + ebx]
2287             add ebx, 8
2288             pand mm3, mm1       // get lsb for each prev_row byte
2289             psrlq mm1, 1        // divide prev_row bytes by 2
2290             pand mm3, mm2       // get LBCarrys for each byte where both
2291                                 // lsb's were == 1
2292             psrlq mm2, 1        // divide raw bytes by 2
2293             pand  mm1, mm4      // clear invalid bit 7 of each byte
2294             paddb mm0, mm3      // add LBCarrys to Avg for each byte
2295             pand  mm2, mm4      // clear invalid bit 7 of each byte
2296             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2297             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2298             cmp ebx, MMXLength
2299             movq [edi + ebx - 8], mm0
2300             movq mm2, mm0       // reuse as Raw(x-bpp)
2301             jb davg8lp
2302         } // end _asm block
2303       }
2304       break;
2305       default:                  // bpp greater than 8
2306       {
2307         _asm {
2308             movq mm5, LBCarryMask
2309             // Re-init address pointers and offset
2310             mov ebx, diff       // ebx ==> x = offset to alignment boundary
2311             mov edi, row        // edi ==> Avg(x)
2312             movq mm4, HBClearMask
2313             mov edx, edi
2314             mov esi, prev_row   // esi ==> Prior(x)
2315             sub edx, bpp        // edx ==> Raw(x-bpp)
2316 davgAlp:
2317             movq mm0, [edi + ebx]
2318             movq mm3, mm5
2319             movq mm1, [esi + ebx]
2320             pand mm3, mm1       // get lsb for each prev_row byte
2321             movq mm2, [edx + ebx]
2322             psrlq mm1, 1        // divide prev_row bytes by 2
2323             pand mm3, mm2       // get LBCarrys for each byte where both
2324                                 // lsb's were == 1
2325             psrlq mm2, 1        // divide raw bytes by 2
2326             pand  mm1, mm4      // clear invalid bit 7 of each byte
2327             paddb mm0, mm3      // add LBCarrys to Avg for each byte
2328             pand  mm2, mm4      // clear invalid bit 7 of each byte
2329             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2330             add ebx, 8
2331             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2332             cmp ebx, MMXLength
2333             movq [edi + ebx - 8], mm0
2334             jb davgAlp
2335         } // end _asm block
2336       }
2337       break;
2338    }                         // end switch ( bpp )
2339 
2340    _asm {
2341          // MMX acceleration complete now do clean-up
2342          // Check if any remaining bytes left to decode
2343          mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
2344          mov edi, row          // edi ==> Avg(x)
2345          cmp ebx, FullLength   // Test if offset at end of array
2346          jnb davgend
2347          // Do Avg decode for remaining bytes
2348          mov esi, prev_row     // esi ==> Prior(x)
2349          mov edx, edi
2350          xor ecx, ecx          // zero ecx before using cl & cx in loop below
2351          sub edx, bpp          // edx ==> Raw(x-bpp)
2352 davglp2:
2353          // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2354          xor eax, eax
2355          mov cl, [esi + ebx]   // load cl with Prior(x)
2356          mov al, [edx + ebx]   // load al with Raw(x-bpp)
2357          add ax, cx
2358          inc ebx
2359          shr ax, 1              // divide by 2
2360          add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
2361          cmp ebx, FullLength    // Check if at end of array
2362          mov [edi+ebx-1], al    // Write back Raw(x);
2363                           // mov does not affect flags; -1 to offset inc ebx
2364          jb davglp2
2365 davgend:
2366          emms             // End MMX instructions; prep for possible FP instrs.
2367    } // end _asm block
2368 }
2369 
2370 // Optimized code for PNG Paeth filter decoder
2371 void /* PRIVATE */
2372 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2373                               png_bytep prev_row)
2374 {
2375   // These variables are declared
2376   // here to ensure alignment on 8-byte boundaries.
2377   union uAll  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
2378 
2379    png_uint_32 FullLength;
2380    png_uint_32 MMXLength;
2381    //png_uint_32 len;
2382    int bpp;
2383    int diff;
2384    //int ptemp;
2385    int patemp, pbtemp, pctemp;
2386 
2387    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2388    FullLength  = row_info->rowbytes; // # of bytes to filter
2389    _asm
2390    {
2391          xor ebx, ebx        // ebx ==> x offset
2392          mov edi, row
2393          xor edx, edx        // edx ==> x-bpp offset
2394          mov esi, prev_row
2395          xor eax, eax
2396 
2397          // Compute the Raw value for the first bpp bytes
2398          // Note: the formula works out to be always
2399          //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
2400 dpthrlp:
2401          mov al, [edi + ebx]
2402          add al, [esi + ebx]
2403          inc ebx
2404          cmp ebx, bpp
2405          mov [edi + ebx - 1], al
2406          jb dpthrlp
2407          // get # of bytes to alignment
2408          mov diff, edi         // take start of row
2409          add diff, ebx         // add bpp
2410          xor ecx, ecx
2411          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
2412          and diff, 0xfffffff8  // mask to alignment boundary
2413          sub diff, edi         // subtract from start ==> value ebx at alignment
2414          jz dpthgo
2415          // fix alignment
2416 dpthlp1:
2417          xor eax, eax
2418          // pav = p - a = (a + b - c) - a = b - c
2419          mov al, [esi + ebx]   // load Prior(x) into al
2420          mov cl, [esi + edx]   // load Prior(x-bpp) into cl
2421          sub eax, ecx          // subtract Prior(x-bpp)
2422          mov patemp, eax       // Save pav for later use
2423          xor eax, eax
2424          // pbv = p - b = (a + b - c) - b = a - c
2425          mov al, [edi + edx]   // load Raw(x-bpp) into al
2426          sub eax, ecx          // subtract Prior(x-bpp)
2427          mov ecx, eax
2428          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2429          add eax, patemp       // pcv = pav + pbv
2430          // pc = abs(pcv)
2431          test eax, 0x80000000
2432          jz dpthpca
2433          neg eax               // reverse sign of neg values
2434 dpthpca:
2435          mov pctemp, eax       // save pc for later use
2436          // pb = abs(pbv)
2437          test ecx, 0x80000000
2438          jz dpthpba
2439          neg ecx               // reverse sign of neg values
2440 dpthpba:
2441          mov pbtemp, ecx       // save pb for later use
2442          // pa = abs(pav)
2443          mov eax, patemp
2444          test eax, 0x80000000
2445          jz dpthpaa
2446          neg eax               // reverse sign of neg values
2447 dpthpaa:
2448          mov patemp, eax       // save pa for later use
2449          // test if pa <= pb
2450          cmp eax, ecx
2451          jna dpthabb
2452          // pa > pb; now test if pb <= pc
2453          cmp ecx, pctemp
2454          jna dpthbbc
2455          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2456          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2457          jmp dpthpaeth
2458 dpthbbc:
2459          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2460          mov cl, [esi + ebx]   // load Prior(x) into cl
2461          jmp dpthpaeth
2462 dpthabb:
2463          // pa <= pb; now test if pa <= pc
2464          cmp eax, pctemp
2465          jna dpthabc
2466          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2467          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2468          jmp dpthpaeth
2469 dpthabc:
2470          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2471          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
2472 dpthpaeth:
2473          inc ebx
2474          inc edx
2475          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2476          add [edi + ebx - 1], cl
2477          cmp ebx, diff
2478          jb dpthlp1
2479 dpthgo:
2480          mov ecx, FullLength
2481          mov eax, ecx
2482          sub eax, ebx          // subtract alignment fix
2483          and eax, 0x00000007   // calc bytes over mult of 8
2484          sub ecx, eax          // drop over bytes from original length
2485          mov MMXLength, ecx
2486    } // end _asm block
2487    // Now do the math for the rest of the row
2488    switch ( bpp )
2489    {
2490       case 3:
2491       {
2492          ActiveMask.use = 0x0000000000ffffff;
2493          ActiveMaskEnd.use = 0xffff000000000000;
2494          ShiftBpp.use = 24;    // == bpp(3) * 8
2495          ShiftRem.use = 40;    // == 64 - 24
2496          _asm
2497          {
2498             mov ebx, diff
2499             mov edi, row
2500             mov esi, prev_row
2501             pxor mm0, mm0
2502             // PRIME the pump (load the first Raw(x-bpp) data set
2503             movq mm1, [edi+ebx-8]
2504 dpth3lp:
2505             psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2506             movq mm2, [esi + ebx]   // load b=Prior(x)
2507             punpcklbw mm1, mm0      // Unpack High bytes of a
2508             movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
2509             punpcklbw mm2, mm0      // Unpack High bytes of b
2510             psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2511             // pav = p - a = (a + b - c) - a = b - c
2512             movq mm4, mm2
2513             punpcklbw mm3, mm0      // Unpack High bytes of c
2514             // pbv = p - b = (a + b - c) - b = a - c
2515             movq mm5, mm1
2516             psubw mm4, mm3
2517             pxor mm7, mm7
2518             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2519             movq mm6, mm4
2520             psubw mm5, mm3
2521 
2522             // pa = abs(p-a) = abs(pav)
2523             // pb = abs(p-b) = abs(pbv)
2524             // pc = abs(p-c) = abs(pcv)
2525             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2526             paddw mm6, mm5
2527             pand mm0, mm4       // Only pav bytes < 0 in mm0
2528             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2529             psubw mm4, mm0
2530             pand mm7, mm5       // Only pbv bytes < 0 in mm7
2531             psubw mm4, mm0
2532             psubw mm5, mm7
2533             pxor mm0, mm0
2534             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2535             pand mm0, mm6       // Only pcv bytes < 0 in mm0
2536             psubw mm5, mm7
2537             psubw mm6, mm0
2538             //  test pa <= pb
2539             movq mm7, mm4
2540             psubw mm6, mm0
2541             pcmpgtw mm7, mm5    // pa > pb?
2542             movq mm0, mm7
2543             // use mm7 mask to merge pa & pb
2544             pand mm5, mm7
2545             // use mm0 mask copy to merge a & b
2546             pand mm2, mm0
2547             pandn mm7, mm4
2548             pandn mm0, mm1
2549             paddw mm7, mm5
2550             paddw mm0, mm2
2551             //  test  ((pa <= pb)? pa:pb) <= pc
2552             pcmpgtw mm7, mm6       // pab > pc?
2553             pxor mm1, mm1
2554             pand mm3, mm7
2555             pandn mm7, mm0
2556             paddw mm7, mm3
2557             pxor mm0, mm0
2558             packuswb mm7, mm1
2559             movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2560             pand mm7, ActiveMask
2561             movq mm2, mm3           // load b=Prior(x) step 1
2562             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2563             punpcklbw mm3, mm0      // Unpack High bytes of c
2564             movq [edi + ebx], mm7   // write back updated value
2565             movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
2566             // Now do Paeth for 2nd set of bytes (3-5)
2567             psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
2568             punpcklbw mm1, mm0      // Unpack High bytes of a
2569             pxor mm7, mm7
2570             punpcklbw mm2, mm0      // Unpack High bytes of b
2571             // pbv = p - b = (a + b - c) - b = a - c
2572             movq mm5, mm1
2573             // pav = p - a = (a + b - c) - a = b - c
2574             movq mm4, mm2
2575             psubw mm5, mm3
2576             psubw mm4, mm3
2577             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2578             //       pav + pbv = pbv + pav
2579             movq mm6, mm5
2580             paddw mm6, mm4
2581 
2582             // pa = abs(p-a) = abs(pav)
2583             // pb = abs(p-b) = abs(pbv)
2584             // pc = abs(p-c) = abs(pcv)
2585             pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
2586             pcmpgtw mm7, mm4       // Create mask pav bytes < 0
2587             pand mm0, mm5          // Only pbv bytes < 0 in mm0
2588             pand mm7, mm4          // Only pav bytes < 0 in mm7
2589             psubw mm5, mm0
2590             psubw mm4, mm7
2591             psubw mm5, mm0
2592             psubw mm4, mm7
2593             pxor mm0, mm0
2594             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2595             pand mm0, mm6          // Only pcv bytes < 0 in mm0
2596             psubw mm6, mm0
2597             //  test pa <= pb
2598             movq mm7, mm4
2599             psubw mm6, mm0
2600             pcmpgtw mm7, mm5       // pa > pb?
2601             movq mm0, mm7
2602             // use mm7 mask to merge pa & pb
2603             pand mm5, mm7
2604             // use mm0 mask copy to merge a & b
2605             pand mm2, mm0
2606             pandn mm7, mm4
2607             pandn mm0, mm1
2608             paddw mm7, mm5
2609             paddw mm0, mm2
2610             //  test  ((pa <= pb)? pa:pb) <= pc
2611             pcmpgtw mm7, mm6       // pab > pc?
2612             movq mm2, [esi + ebx]  // load b=Prior(x)
2613             pand mm3, mm7
2614             pandn mm7, mm0
2615             pxor mm1, mm1
2616             paddw mm7, mm3
2617             pxor mm0, mm0
2618             packuswb mm7, mm1
2619             movq mm3, mm2           // load c=Prior(x-bpp) step 1
2620             pand mm7, ActiveMask
2621             punpckhbw mm2, mm0      // Unpack High bytes of b
2622             psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
2623              // pav = p - a = (a + b - c) - a = b - c
2624             movq mm4, mm2
2625             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2626             psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
2627             movq [edi + ebx], mm7   // write back updated value
2628             movq mm1, mm7
2629             punpckhbw mm3, mm0      // Unpack High bytes of c
2630             psllq mm1, ShiftBpp     // Shift bytes
2631                                     // Now mm1 will be used as Raw(x-bpp)
2632             // Now do Paeth for 3rd, and final, set of bytes (6-7)
2633             pxor mm7, mm7
2634             punpckhbw mm1, mm0      // Unpack High bytes of a
2635             psubw mm4, mm3
2636             // pbv = p - b = (a + b - c) - b = a - c
2637             movq mm5, mm1
2638             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2639             movq mm6, mm4
2640             psubw mm5, mm3
2641             pxor mm0, mm0
2642             paddw mm6, mm5
2643 
2644             // pa = abs(p-a) = abs(pav)
2645             // pb = abs(p-b) = abs(pbv)
2646             // pc = abs(p-c) = abs(pcv)
2647             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2648             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2649             pand mm0, mm4       // Only pav bytes < 0 in mm0
2650             pand mm7, mm5       // Only pbv bytes < 0 in mm7
2651             psubw mm4, mm0
2652             psubw mm5, mm7
2653             psubw mm4, mm0
2654             psubw mm5, mm7
2655             pxor mm0, mm0
2656             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2657             pand mm0, mm6       // Only pcv bytes < 0 in mm0
2658             psubw mm6, mm0
2659             //  test pa <= pb
2660             movq mm7, mm4
2661             psubw mm6, mm0
2662             pcmpgtw mm7, mm5    // pa > pb?
2663             movq mm0, mm7
2664             // use mm0 mask copy to merge a & b
2665             pand mm2, mm0
2666             // use mm7 mask to merge pa & pb
2667             pand mm5, mm7
2668             pandn mm0, mm1
2669             pandn mm7, mm4
2670             paddw mm0, mm2
2671             paddw mm7, mm5
2672             //  test  ((pa <= pb)? pa:pb) <= pc
2673             pcmpgtw mm7, mm6    // pab > pc?
2674             pand mm3, mm7
2675             pandn mm7, mm0
2676             paddw mm7, mm3
2677             pxor mm1, mm1
2678             packuswb mm1, mm7
2679             // Step ebx to next set of 8 bytes and repeat loop til done
2680             add ebx, 8
2681             pand mm1, ActiveMaskEnd
2682             paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2683 
2684             cmp ebx, MMXLength
2685             pxor mm0, mm0              // pxor does not affect flags
2686             movq [edi + ebx - 8], mm1  // write back updated value
2687                                  // mm1 will be used as Raw(x-bpp) next loop
2688                            // mm3 ready to be used as Prior(x-bpp) next loop
2689             jb dpth3lp
2690          } // end _asm block
2691       }
2692       break;
2693 
2694       case 6:
2695       case 7:
2696       case 5:
2697       {
2698          ActiveMask.use  = 0x00000000ffffffff;
2699          ActiveMask2.use = 0xffffffff00000000;
2700          ShiftBpp.use = bpp << 3;    // == bpp * 8
2701          ShiftRem.use = 64 - ShiftBpp.use;
2702          _asm
2703          {
2704             mov ebx, diff
2705             mov edi, row
2706             mov esi, prev_row
2707             // PRIME the pump (load the first Raw(x-bpp) data set
2708             movq mm1, [edi+ebx-8]
2709             pxor mm0, mm0
2710 dpth6lp:
2711             // Must shift to position Raw(x-bpp) data
2712             psrlq mm1, ShiftRem
2713             // Do first set of 4 bytes
2714             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2715             punpcklbw mm1, mm0      // Unpack Low bytes of a
2716             movq mm2, [esi + ebx]   // load b=Prior(x)
2717             punpcklbw mm2, mm0      // Unpack Low bytes of b
2718             // Must shift to position Prior(x-bpp) data
2719             psrlq mm3, ShiftRem
2720             // pav = p - a = (a + b - c) - a = b - c
2721             movq mm4, mm2
2722             punpcklbw mm3, mm0      // Unpack Low bytes of c
2723             // pbv = p - b = (a + b - c) - b = a - c
2724             movq mm5, mm1
2725             psubw mm4, mm3
2726             pxor mm7, mm7
2727             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2728             movq mm6, mm4
2729             psubw mm5, mm3
2730             // pa = abs(p-a) = abs(pav)
2731             // pb = abs(p-b) = abs(pbv)
2732             // pc = abs(p-c) = abs(pcv)
2733             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2734             paddw mm6, mm5
2735             pand mm0, mm4       // Only pav bytes < 0 in mm0
2736             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2737             psubw mm4, mm0
2738             pand mm7, mm5       // Only pbv bytes < 0 in mm7
2739             psubw mm4, mm0
2740             psubw mm5, mm7
2741             pxor mm0, mm0
2742             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2743             pand mm0, mm6       // Only pcv bytes < 0 in mm0
2744             psubw mm5, mm7
2745             psubw mm6, mm0
2746             //  test pa <= pb
2747             movq mm7, mm4
2748             psubw mm6, mm0
2749             pcmpgtw mm7, mm5    // pa > pb?
2750             movq mm0, mm7
2751             // use mm7 mask to merge pa & pb
2752             pand mm5, mm7
2753             // use mm0 mask copy to merge a & b
2754             pand mm2, mm0
2755             pandn mm7, mm4
2756             pandn mm0, mm1
2757             paddw mm7, mm5
2758             paddw mm0, mm2
2759             //  test  ((pa <= pb)? pa:pb) <= pc
2760             pcmpgtw mm7, mm6    // pab > pc?
2761             pxor mm1, mm1
2762             pand mm3, mm7
2763             pandn mm7, mm0
2764             paddw mm7, mm3
2765             pxor mm0, mm0
2766             packuswb mm7, mm1
2767             movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
2768             pand mm7, ActiveMask
2769             psrlq mm3, ShiftRem
2770             movq mm2, [esi + ebx]      // load b=Prior(x) step 1
2771             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2772             movq mm6, mm2
2773             movq [edi + ebx], mm7      // write back updated value
2774             movq mm1, [edi+ebx-8]
2775             psllq mm6, ShiftBpp
2776             movq mm5, mm7
2777             psrlq mm1, ShiftRem
2778             por mm3, mm6
2779             psllq mm5, ShiftBpp
2780             punpckhbw mm3, mm0         // Unpack High bytes of c
2781             por mm1, mm5
2782             // Do second set of 4 bytes
2783             punpckhbw mm2, mm0         // Unpack High bytes of b
2784             punpckhbw mm1, mm0         // Unpack High bytes of a
2785             // pav = p - a = (a + b - c) - a = b - c
2786             movq mm4, mm2
2787             // pbv = p - b = (a + b - c) - b = a - c
2788             movq mm5, mm1
2789             psubw mm4, mm3
2790             pxor mm7, mm7
2791             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2792             movq mm6, mm4
2793             psubw mm5, mm3
2794             // pa = abs(p-a) = abs(pav)
2795             // pb = abs(p-b) = abs(pbv)
2796             // pc = abs(p-c) = abs(pcv)
2797             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2798             paddw mm6, mm5
2799             pand mm0, mm4          // Only pav bytes < 0 in mm0
2800             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2801             psubw mm4, mm0
2802             pand mm7, mm5          // Only pbv bytes < 0 in mm7
2803             psubw mm4, mm0
2804             psubw mm5, mm7
2805             pxor mm0, mm0
2806             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2807             pand mm0, mm6          // Only pcv bytes < 0 in mm0
2808             psubw mm5, mm7
2809             psubw mm6, mm0
2810             //  test pa <= pb
2811             movq mm7, mm4
2812             psubw mm6, mm0
2813             pcmpgtw mm7, mm5       // pa > pb?
2814             movq mm0, mm7
2815             // use mm7 mask to merge pa & pb
2816             pand mm5, mm7
2817             // use mm0 mask copy to merge a & b
2818             pand mm2, mm0
2819             pandn mm7, mm4
2820             pandn mm0, mm1
2821             paddw mm7, mm5
2822             paddw mm0, mm2
2823             //  test  ((pa <= pb)? pa:pb) <= pc
2824             pcmpgtw mm7, mm6           // pab > pc?
2825             pxor mm1, mm1
2826             pand mm3, mm7
2827             pandn mm7, mm0
2828             pxor mm1, mm1
2829             paddw mm7, mm3
2830             pxor mm0, mm0
2831             // Step ebx to next set of 8 bytes and repeat loop til done
2832             add ebx, 8
2833             packuswb mm1, mm7
2834             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2835             cmp ebx, MMXLength
2836             movq [edi + ebx - 8], mm1      // write back updated value
2837                                 // mm1 will be used as Raw(x-bpp) next loop
2838             jb dpth6lp
2839          } // end _asm block
2840       }
2841       break;
2842 
2843       case 4:
2844       {
2845          ActiveMask.use  = 0x00000000ffffffff;
2846          _asm {
2847             mov ebx, diff
2848             mov edi, row
2849             mov esi, prev_row
2850             pxor mm0, mm0
2851             // PRIME the pump (load the first Raw(x-bpp) data set
2852             movq mm1, [edi+ebx-8]    // Only time should need to read
2853                                      //  a=Raw(x-bpp) bytes
2854 dpth4lp:
2855             // Do first set of 4 bytes
2856             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
2857             punpckhbw mm1, mm0       // Unpack Low bytes of a
2858             movq mm2, [esi + ebx]    // load b=Prior(x)
2859             punpcklbw mm2, mm0       // Unpack High bytes of b
2860             // pav = p - a = (a + b - c) - a = b - c
2861             movq mm4, mm2
2862             punpckhbw mm3, mm0       // Unpack High bytes of c
2863             // pbv = p - b = (a + b - c) - b = a - c
2864             movq mm5, mm1
2865             psubw mm4, mm3
2866             pxor mm7, mm7
2867             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2868             movq mm6, mm4
2869             psubw mm5, mm3
2870             // pa = abs(p-a) = abs(pav)
2871             // pb = abs(p-b) = abs(pbv)
2872             // pc = abs(p-c) = abs(pcv)
2873             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2874             paddw mm6, mm5
2875             pand mm0, mm4          // Only pav bytes < 0 in mm0
2876             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2877             psubw mm4, mm0
2878             pand mm7, mm5          // Only pbv bytes < 0 in mm7
2879             psubw mm4, mm0
2880             psubw mm5, mm7
2881             pxor mm0, mm0
2882             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2883             pand mm0, mm6          // Only pcv bytes < 0 in mm0
2884             psubw mm5, mm7
2885             psubw mm6, mm0
2886             //  test pa <= pb
2887             movq mm7, mm4
2888             psubw mm6, mm0
2889             pcmpgtw mm7, mm5       // pa > pb?
2890             movq mm0, mm7
2891             // use mm7 mask to merge pa & pb
2892             pand mm5, mm7
2893             // use mm0 mask copy to merge a & b
2894             pand mm2, mm0
2895             pandn mm7, mm4
2896             pandn mm0, mm1
2897             paddw mm7, mm5
2898             paddw mm0, mm2
2899             //  test  ((pa <= pb)? pa:pb) <= pc
2900             pcmpgtw mm7, mm6       // pab > pc?
2901             pxor mm1, mm1
2902             pand mm3, mm7
2903             pandn mm7, mm0
2904             paddw mm7, mm3
2905             pxor mm0, mm0
2906             packuswb mm7, mm1
2907             movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
2908             pand mm7, ActiveMask
2909             movq mm2, mm3              // load b=Prior(x) step 1
2910             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2911             punpcklbw mm3, mm0         // Unpack High bytes of c
2912             movq [edi + ebx], mm7      // write back updated value
2913             movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
2914             // Do second set of 4 bytes
2915             punpckhbw mm2, mm0         // Unpack Low bytes of b
2916             punpcklbw mm1, mm0         // Unpack Low bytes of a
2917             // pav = p - a = (a + b - c) - a = b - c
2918             movq mm4, mm2
2919             // pbv = p - b = (a + b - c) - b = a - c
2920             movq mm5, mm1
2921             psubw mm4, mm3
2922             pxor mm7, mm7
2923             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2924             movq mm6, mm4
2925             psubw mm5, mm3
2926             // pa = abs(p-a) = abs(pav)
2927             // pb = abs(p-b) = abs(pbv)
2928             // pc = abs(p-c) = abs(pcv)
2929             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2930             paddw mm6, mm5
2931             pand mm0, mm4          // Only pav bytes < 0 in mm0
2932             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2933             psubw mm4, mm0
2934             pand mm7, mm5          // Only pbv bytes < 0 in mm7
2935             psubw mm4, mm0
2936             psubw mm5, mm7
2937             pxor mm0, mm0
2938             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2939             pand mm0, mm6          // Only pcv bytes < 0 in mm0
2940             psubw mm5, mm7
2941             psubw mm6, mm0
2942             //  test pa <= pb
2943             movq mm7, mm4
2944             psubw mm6, mm0
2945             pcmpgtw mm7, mm5       // pa > pb?
2946             movq mm0, mm7
2947             // use mm7 mask to merge pa & pb
2948             pand mm5, mm7
2949             // use mm0 mask copy to merge a & b
2950             pand mm2, mm0
2951             pandn mm7, mm4
2952             pandn mm0, mm1
2953             paddw mm7, mm5
2954             paddw mm0, mm2
2955             //  test  ((pa <= pb)? pa:pb) <= pc
2956             pcmpgtw mm7, mm6       // pab > pc?
2957             pxor mm1, mm1
2958             pand mm3, mm7
2959             pandn mm7, mm0
2960             pxor mm1, mm1
2961             paddw mm7, mm3
2962             pxor mm0, mm0
2963             // Step ebx to next set of 8 bytes and repeat loop til done
2964             add ebx, 8
2965             packuswb mm1, mm7
2966             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2967             cmp ebx, MMXLength
2968             movq [edi + ebx - 8], mm1      // write back updated value
2969                                 // mm1 will be used as Raw(x-bpp) next loop
2970             jb dpth4lp
2971          } // end _asm block
2972       }
2973       break;
2974       case 8:                          // bpp == 8
2975       {
2976          ActiveMask.use  = 0x00000000ffffffff;
2977          _asm {
2978             mov ebx, diff
2979             mov edi, row
2980             mov esi, prev_row
2981             pxor mm0, mm0
2982             // PRIME the pump (load the first Raw(x-bpp) data set
2983             movq mm1, [edi+ebx-8]      // Only time should need to read
2984                                        //  a=Raw(x-bpp) bytes
2985 dpth8lp:
2986             // Do first set of 4 bytes
2987             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2988             punpcklbw mm1, mm0         // Unpack Low bytes of a
2989             movq mm2, [esi + ebx]      // load b=Prior(x)
2990             punpcklbw mm2, mm0         // Unpack Low bytes of b
2991             // pav = p - a = (a + b - c) - a = b - c
2992             movq mm4, mm2
2993             punpcklbw mm3, mm0         // Unpack Low bytes of c
2994             // pbv = p - b = (a + b - c) - b = a - c
2995             movq mm5, mm1
2996             psubw mm4, mm3
2997             pxor mm7, mm7
2998             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2999             movq mm6, mm4
3000             psubw mm5, mm3
3001             // pa = abs(p-a) = abs(pav)
3002             // pb = abs(p-b) = abs(pbv)
3003             // pc = abs(p-c) = abs(pcv)
3004             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
3005             paddw mm6, mm5
3006             pand mm0, mm4          // Only pav bytes < 0 in mm0
3007             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
3008             psubw mm4, mm0
3009             pand mm7, mm5          // Only pbv bytes < 0 in mm7
3010             psubw mm4, mm0
3011             psubw mm5, mm7
3012             pxor mm0, mm0
3013             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3014             pand mm0, mm6          // Only pcv bytes < 0 in mm0
3015             psubw mm5, mm7
3016             psubw mm6, mm0
3017             //  test pa <= pb
3018             movq mm7, mm4
3019             psubw mm6, mm0
3020             pcmpgtw mm7, mm5       // pa > pb?
3021             movq mm0, mm7
3022             // use mm7 mask to merge pa & pb
3023             pand mm5, mm7
3024             // use mm0 mask copy to merge a & b
3025             pand mm2, mm0
3026             pandn mm7, mm4
3027             pandn mm0, mm1
3028             paddw mm7, mm5
3029             paddw mm0, mm2
3030             //  test  ((pa <= pb)? pa:pb) <= pc
3031             pcmpgtw mm7, mm6       // pab > pc?
3032             pxor mm1, mm1
3033             pand mm3, mm7
3034             pandn mm7, mm0
3035             paddw mm7, mm3
3036             pxor mm0, mm0
3037             packuswb mm7, mm1
3038             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
3039             pand mm7, ActiveMask
3040             movq mm2, [esi + ebx]    // load b=Prior(x)
3041             paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
3042             punpckhbw mm3, mm0       // Unpack High bytes of c
3043             movq [edi + ebx], mm7    // write back updated value
3044             movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
3045 
3046             // Do second set of 4 bytes
3047             punpckhbw mm2, mm0       // Unpack High bytes of b
3048             punpckhbw mm1, mm0       // Unpack High bytes of a
3049             // pav = p - a = (a + b - c) - a = b - c
3050             movq mm4, mm2
3051             // pbv = p - b = (a + b - c) - b = a - c
3052             movq mm5, mm1
3053             psubw mm4, mm3
3054             pxor mm7, mm7
3055             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3056             movq mm6, mm4
3057             psubw mm5, mm3
3058             // pa = abs(p-a) = abs(pav)
3059             // pb = abs(p-b) = abs(pbv)
3060             // pc = abs(p-c) = abs(pcv)
3061             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
3062             paddw mm6, mm5
3063             pand mm0, mm4          // Only pav bytes < 0 in mm7
3064             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
3065             psubw mm4, mm0
3066             pand mm7, mm5          // Only pbv bytes < 0 in mm0
3067             psubw mm4, mm0
3068             psubw mm5, mm7
3069             pxor mm0, mm0
3070             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3071             pand mm0, mm6          // Only pav bytes < 0 in mm7
3072             psubw mm5, mm7
3073             psubw mm6, mm0
3074             //  test pa <= pb
3075             movq mm7, mm4
3076             psubw mm6, mm0
3077             pcmpgtw mm7, mm5       // pa > pb?
3078             movq mm0, mm7
3079             // use mm7 mask to merge pa & pb
3080             pand mm5, mm7
3081             // use mm0 mask copy to merge a & b
3082             pand mm2, mm0
3083             pandn mm7, mm4
3084             pandn mm0, mm1
3085             paddw mm7, mm5
3086             paddw mm0, mm2
3087             //  test  ((pa <= pb)? pa:pb) <= pc
3088             pcmpgtw mm7, mm6       // pab > pc?
3089             pxor mm1, mm1
3090             pand mm3, mm7
3091             pandn mm7, mm0
3092             pxor mm1, mm1
3093             paddw mm7, mm3
3094             pxor mm0, mm0
3095             // Step ex to next set of 8 bytes and repeat loop til done
3096             add ebx, 8
3097             packuswb mm1, mm7
3098             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
3099             cmp ebx, MMXLength
3100             movq [edi + ebx - 8], mm1      // write back updated value
3101                             // mm1 will be used as Raw(x-bpp) next loop
3102             jb dpth8lp
3103          } // end _asm block
3104       }
3105       break;
3106 
3107       case 1:                // bpp = 1
3108       case 2:                // bpp = 2
3109       default:               // bpp > 8
3110       {
3111          _asm {
3112             mov ebx, diff
3113             cmp ebx, FullLength
3114             jnb dpthdend
3115             mov edi, row
3116             mov esi, prev_row
3117             // Do Paeth decode for remaining bytes
3118             mov edx, ebx
3119             xor ecx, ecx        // zero ecx before using cl & cx in loop below
3120             sub edx, bpp        // Set edx = ebx - bpp
3121 dpthdlp:
3122             xor eax, eax
3123             // pav = p - a = (a + b - c) - a = b - c
3124             mov al, [esi + ebx]        // load Prior(x) into al
3125             mov cl, [esi + edx]        // load Prior(x-bpp) into cl
3126             sub eax, ecx                 // subtract Prior(x-bpp)
3127             mov patemp, eax                 // Save pav for later use
3128             xor eax, eax
3129             // pbv = p - b = (a + b - c) - b = a - c
3130             mov al, [edi + edx]        // load Raw(x-bpp) into al
3131             sub eax, ecx                 // subtract Prior(x-bpp)
3132             mov ecx, eax
3133             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3134             add eax, patemp                 // pcv = pav + pbv
3135             // pc = abs(pcv)
3136             test eax, 0x80000000
3137             jz dpthdpca
3138             neg eax                     // reverse sign of neg values
3139 dpthdpca:
3140             mov pctemp, eax             // save pc for later use
3141             // pb = abs(pbv)
3142             test ecx, 0x80000000
3143             jz dpthdpba
3144             neg ecx                     // reverse sign of neg values
3145 dpthdpba:
3146             mov pbtemp, ecx             // save pb for later use
3147             // pa = abs(pav)
3148             mov eax, patemp
3149             test eax, 0x80000000
3150             jz dpthdpaa
3151             neg eax                     // reverse sign of neg values
3152 dpthdpaa:
3153             mov patemp, eax             // save pa for later use
3154             // test if pa <= pb
3155             cmp eax, ecx
3156             jna dpthdabb
3157             // pa > pb; now test if pb <= pc
3158             cmp ecx, pctemp
3159             jna dpthdbbc
3160             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3161             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3162             jmp dpthdpaeth
3163 dpthdbbc:
3164             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3165             mov cl, [esi + ebx]        // load Prior(x) into cl
3166             jmp dpthdpaeth
3167 dpthdabb:
3168             // pa <= pb; now test if pa <= pc
3169             cmp eax, pctemp
3170             jna dpthdabc
3171             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3172             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3173             jmp dpthdpaeth
3174 dpthdabc:
3175             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3176             mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3177 dpthdpaeth:
3178             inc ebx
3179             inc edx
3180             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3181             add [edi + ebx - 1], cl
3182             cmp ebx, FullLength
3183             jb dpthdlp
3184 dpthdend:
3185          } // end _asm block
3186       }
3187       return;                   // No need to go further with this one
3188    }                         // end switch ( bpp )
3189    _asm
3190    {
3191          // MMX acceleration complete now do clean-up
3192          // Check if any remaining bytes left to decode
3193          mov ebx, MMXLength
3194          cmp ebx, FullLength
3195          jnb dpthend
3196          mov edi, row
3197          mov esi, prev_row
3198          // Do Paeth decode for remaining bytes
3199          mov edx, ebx
3200          xor ecx, ecx         // zero ecx before using cl & cx in loop below
3201          sub edx, bpp         // Set edx = ebx - bpp
3202 dpthlp2:
3203          xor eax, eax
3204          // pav = p - a = (a + b - c) - a = b - c
3205          mov al, [esi + ebx]  // load Prior(x) into al
3206          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3207          sub eax, ecx         // subtract Prior(x-bpp)
3208          mov patemp, eax      // Save pav for later use
3209          xor eax, eax
3210          // pbv = p - b = (a + b - c) - b = a - c
3211          mov al, [edi + edx]  // load Raw(x-bpp) into al
3212          sub eax, ecx         // subtract Prior(x-bpp)
3213          mov ecx, eax
3214          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3215          add eax, patemp      // pcv = pav + pbv
3216          // pc = abs(pcv)
3217          test eax, 0x80000000
3218          jz dpthpca2
3219          neg eax              // reverse sign of neg values
3220 dpthpca2:
3221          mov pctemp, eax      // save pc for later use
3222          // pb = abs(pbv)
3223          test ecx, 0x80000000
3224          jz dpthpba2
3225          neg ecx              // reverse sign of neg values
3226 dpthpba2:
3227          mov pbtemp, ecx      // save pb for later use
3228          // pa = abs(pav)
3229          mov eax, patemp
3230          test eax, 0x80000000
3231          jz dpthpaa2
3232          neg eax              // reverse sign of neg values
3233 dpthpaa2:
3234          mov patemp, eax      // save pa for later use
3235          // test if pa <= pb
3236          cmp eax, ecx
3237          jna dpthabb2
3238          // pa > pb; now test if pb <= pc
3239          cmp ecx, pctemp
3240          jna dpthbbc2
3241          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3242          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3243          jmp dpthpaeth2
3244 dpthbbc2:
3245          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3246          mov cl, [esi + ebx]        // load Prior(x) into cl
3247          jmp dpthpaeth2
3248 dpthabb2:
3249          // pa <= pb; now test if pa <= pc
3250          cmp eax, pctemp
3251          jna dpthabc2
3252          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3253          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3254          jmp dpthpaeth2
3255 dpthabc2:
3256          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3257          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3258 dpthpaeth2:
3259          inc ebx
3260          inc edx
3261          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3262          add [edi + ebx - 1], cl
3263          cmp ebx, FullLength
3264          jb dpthlp2
3265 dpthend:
3266          emms             // End MMX instructions; prep for possible FP instrs.
3267    } // end _asm block
3268 }
3269 
3270 // Optimized code for PNG Sub filter decoder
3271 void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info,png_bytep row)3272 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3273 {
3274   // These variables are declared
3275   // here to ensure alignment on 8-byte boundaries.
3276   union uAll ActiveMask, ShiftBpp, ShiftRem;
3277 
3278    //int test;
3279    int bpp;
3280    png_uint_32 FullLength;
3281    png_uint_32 MMXLength;
3282    int diff;
3283 
3284    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3285    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
3286    _asm {
3287         mov edi, row
3288         mov esi, edi               // lp = row
3289         add edi, bpp               // rp = row + bpp
3290         xor eax, eax
3291         // get # of bytes to alignment
3292         mov diff, edi               // take start of row
3293         add diff, 0xf               // add 7 + 8 to incr past
3294                                         // alignment boundary
3295         xor ebx, ebx
3296         and diff, 0xfffffff8        // mask to alignment boundary
3297         sub diff, edi               // subtract from start ==> value
3298                                         //  ebx at alignment
3299         jz dsubgo
3300         // fix alignment
3301 dsublp1:
3302         mov al, [esi+ebx]
3303         add [edi+ebx], al
3304         inc ebx
3305         cmp ebx, diff
3306         jb dsublp1
3307 dsubgo:
3308         mov ecx, FullLength
3309         mov edx, ecx
3310         sub edx, ebx                  // subtract alignment fix
3311         and edx, 0x00000007           // calc bytes over mult of 8
3312         sub ecx, edx                  // drop over bytes from length
3313         mov MMXLength, ecx
3314    } // end _asm block
3315 
3316    // Now do the math for the rest of the row
3317    switch ( bpp )
3318    {
3319         case 3:
3320         {
3321          ActiveMask.use  = 0x0000ffffff000000;
3322          ShiftBpp.use = 24;       // == 3 * 8
3323          ShiftRem.use  = 40;      // == 64 - 24
3324          _asm {
3325             mov edi, row
3326             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3327             mov esi, edi              // lp = row
3328             add edi, bpp          // rp = row + bpp
3329             movq mm6, mm7
3330             mov ebx, diff
3331             psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
3332                                   // byte group
3333             // PRIME the pump (load the first Raw(x-bpp) data set
3334             movq mm1, [edi+ebx-8]
3335 dsub3lp:
3336             psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
3337                           // no need for mask; shift clears inactive bytes
3338             // Add 1st active group
3339             movq mm0, [edi+ebx]
3340             paddb mm0, mm1
3341             // Add 2nd active group
3342             movq mm1, mm0         // mov updated Raws to mm1
3343             psllq mm1, ShiftBpp   // shift data to position correctly
3344             pand mm1, mm7         // mask to use only 2nd active group
3345             paddb mm0, mm1
3346             // Add 3rd active group
3347             movq mm1, mm0         // mov updated Raws to mm1
3348             psllq mm1, ShiftBpp   // shift data to position correctly
3349             pand mm1, mm6         // mask to use only 3rd active group
3350             add ebx, 8
3351             paddb mm0, mm1
3352             cmp ebx, MMXLength
3353             movq [edi+ebx-8], mm0     // Write updated Raws back to array
3354             // Prep for doing 1st add at top of loop
3355             movq mm1, mm0
3356             jb dsub3lp
3357          } // end _asm block
3358       }
3359       break;
3360 
3361       case 1:
3362       {
3363          // Placed here just in case this is a duplicate of the
3364          // non-MMX code for the SUB filter in png_read_filter_row below
3365          //
3366          //         png_bytep rp;
3367          //         png_bytep lp;
3368          //         png_uint_32 i;
3369          //         bpp = (row_info->pixel_depth + 7) >> 3;
3370          //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3371          //            i < row_info->rowbytes; i++, rp++, lp++)
3372          //      {
3373          //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3374          //      }
3375          _asm {
3376             mov ebx, diff
3377             mov edi, row
3378             cmp ebx, FullLength
3379             jnb dsub1end
3380             mov esi, edi          // lp = row
3381             xor eax, eax
3382             add edi, bpp      // rp = row + bpp
3383 dsub1lp:
3384             mov al, [esi+ebx]
3385             add [edi+ebx], al
3386             inc ebx
3387             cmp ebx, FullLength
3388             jb dsub1lp
3389 dsub1end:
3390          } // end _asm block
3391       }
3392       return;
3393 
3394       case 6:
3395       case 7:
3396       case 4:
3397       case 5:
3398       {
3399          ShiftBpp.use = bpp << 3;
3400          ShiftRem.use = 64 - ShiftBpp.use;
3401          _asm {
3402             mov edi, row
3403             mov ebx, diff
3404             mov esi, edi               // lp = row
3405             add edi, bpp           // rp = row + bpp
3406             // PRIME the pump (load the first Raw(x-bpp) data set
3407             movq mm1, [edi+ebx-8]
3408 dsub4lp:
3409             psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3410                           // no need for mask; shift clears inactive bytes
3411             movq mm0, [edi+ebx]
3412             paddb mm0, mm1
3413             // Add 2nd active group
3414             movq mm1, mm0          // mov updated Raws to mm1
3415             psllq mm1, ShiftBpp    // shift data to position correctly
3416                                    // there is no need for any mask
3417                                    // since shift clears inactive bits/bytes
3418             add ebx, 8
3419             paddb mm0, mm1
3420             cmp ebx, MMXLength
3421             movq [edi+ebx-8], mm0
3422             movq mm1, mm0          // Prep for doing 1st add at top of loop
3423             jb dsub4lp
3424          } // end _asm block
3425       }
3426       break;
3427 
3428       case 2:
3429       {
3430          ActiveMask.use  = 0x00000000ffff0000;
3431          ShiftBpp.use = 16;       // == 2 * 8
3432          ShiftRem.use = 48;       // == 64 - 16
3433          _asm {
3434             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3435             mov ebx, diff
3436             movq mm6, mm7
3437             mov edi, row
3438             psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
3439                                     //  byte group
3440             mov esi, edi            // lp = row
3441             movq mm5, mm6
3442             add edi, bpp            // rp = row + bpp
3443             psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
3444                                     //  byte group
3445             // PRIME the pump (load the first Raw(x-bpp) data set
3446             movq mm1, [edi+ebx-8]
3447 dsub2lp:
3448             // Add 1st active group
3449             psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
3450                                     // no need for mask; shift clears inactive
3451                                     //  bytes
3452             movq mm0, [edi+ebx]
3453             paddb mm0, mm1
3454             // Add 2nd active group
3455             movq mm1, mm0           // mov updated Raws to mm1
3456             psllq mm1, ShiftBpp     // shift data to position correctly
3457             pand mm1, mm7           // mask to use only 2nd active group
3458             paddb mm0, mm1
3459             // Add 3rd active group
3460             movq mm1, mm0           // mov updated Raws to mm1
3461             psllq mm1, ShiftBpp     // shift data to position correctly
3462             pand mm1, mm6           // mask to use only 3rd active group
3463             paddb mm0, mm1
3464             // Add 4th active group
3465             movq mm1, mm0           // mov updated Raws to mm1
3466             psllq mm1, ShiftBpp     // shift data to position correctly
3467             pand mm1, mm5           // mask to use only 4th active group
3468             add ebx, 8
3469             paddb mm0, mm1
3470             cmp ebx, MMXLength
3471             movq [edi+ebx-8], mm0   // Write updated Raws back to array
3472             movq mm1, mm0           // Prep for doing 1st add at top of loop
3473             jb dsub2lp
3474          } // end _asm block
3475       }
3476       break;
3477       case 8:
3478       {
3479          _asm {
3480             mov edi, row
3481             mov ebx, diff
3482             mov esi, edi            // lp = row
3483             add edi, bpp            // rp = row + bpp
3484             mov ecx, MMXLength
3485             movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
3486                                     // Raw(x-bpp) data set
3487             and ecx, 0x0000003f     // calc bytes over mult of 64
3488 dsub8lp:
3489             movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
3490             paddb mm0, mm7
3491             movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
3492             movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
3493                                    // Now mm0 will be used as Raw(x-bpp) for
3494                                    // the 2nd group of 8 bytes.  This will be
3495                                    // repeated for each group of 8 bytes with
3496                                    // the 8th group being used as the Raw(x-bpp)
3497                                    // for the 1st group of the next loop.
3498             paddb mm1, mm0
3499             movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
3500             movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
3501             paddb mm2, mm1
3502             movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
3503             movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
3504             paddb mm3, mm2
3505             movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
3506             movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
3507             paddb mm4, mm3
3508             movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
3509             movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
3510             paddb mm5, mm4
3511             movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
3512             movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
3513             paddb mm6, mm5
3514             movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
3515             movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
3516             add ebx, 64
3517             paddb mm7, mm6
3518             cmp ebx, ecx
3519             movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
3520             jb dsub8lp
3521             cmp ebx, MMXLength
3522             jnb dsub8lt8
3523 dsub8lpA:
3524             movq mm0, [edi+ebx]
3525             add ebx, 8
3526             paddb mm0, mm7
3527             cmp ebx, MMXLength
3528             movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
3529             movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
3530                                     // be the new Raw(x-bpp) for the next loop
3531             jb dsub8lpA
3532 dsub8lt8:
3533          } // end _asm block
3534       }
3535       break;
3536 
3537       default:                // bpp greater than 8 bytes
3538       {
3539          _asm {
3540             mov ebx, diff
3541             mov edi, row
3542             mov esi, edi           // lp = row
3543             add edi, bpp           // rp = row + bpp
3544 dsubAlp:
3545             movq mm0, [edi+ebx]
3546             movq mm1, [esi+ebx]
3547             add ebx, 8
3548             paddb mm0, mm1
3549             cmp ebx, MMXLength
3550             movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
3551                                    //  add ebx
3552             jb dsubAlp
3553          } // end _asm block
3554       }
3555       break;
3556 
3557    } // end switch ( bpp )
3558 
3559    _asm {
3560         mov ebx, MMXLength
3561         mov edi, row
3562         cmp ebx, FullLength
3563         jnb dsubend
3564         mov esi, edi               // lp = row
3565         xor eax, eax
3566         add edi, bpp               // rp = row + bpp
3567 dsublp2:
3568         mov al, [esi+ebx]
3569         add [edi+ebx], al
3570         inc ebx
3571         cmp ebx, FullLength
3572         jb dsublp2
3573 dsubend:
3574         emms             // End MMX instructions; prep for possible FP instrs.
3575    } // end _asm block
3576 }
3577 
3578 // Optimized code for PNG Up filter decoder
3579 void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info,png_bytep row,png_bytep prev_row)3580 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3581    png_bytep prev_row)
3582 {
3583    png_uint_32 len;
3584    len  = row_info->rowbytes;       // # of bytes to filter
3585    _asm {
3586       mov edi, row
3587       // get # of bytes to alignment
3588       mov ecx, edi
3589       xor ebx, ebx
3590       add ecx, 0x7
3591       xor eax, eax
3592       and ecx, 0xfffffff8
3593       mov esi, prev_row
3594       sub ecx, edi
3595       jz dupgo
3596       // fix alignment
3597 duplp1:
3598       mov al, [edi+ebx]
3599       add al, [esi+ebx]
3600       inc ebx
3601       cmp ebx, ecx
3602       mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
3603       jb duplp1
3604 dupgo:
3605       mov ecx, len
3606       mov edx, ecx
3607       sub edx, ebx                  // subtract alignment fix
3608       and edx, 0x0000003f           // calc bytes over mult of 64
3609       sub ecx, edx                  // drop over bytes from length
3610       // Unrolled loop - use all MMX registers and interleave to reduce
3611       // number of branch instructions (loops) and reduce partial stalls
3612 duploop:
3613       movq mm1, [esi+ebx]
3614       movq mm0, [edi+ebx]
3615       movq mm3, [esi+ebx+8]
3616       paddb mm0, mm1
3617       movq mm2, [edi+ebx+8]
3618       movq [edi+ebx], mm0
3619       paddb mm2, mm3
3620       movq mm5, [esi+ebx+16]
3621       movq [edi+ebx+8], mm2
3622       movq mm4, [edi+ebx+16]
3623       movq mm7, [esi+ebx+24]
3624       paddb mm4, mm5
3625       movq mm6, [edi+ebx+24]
3626       movq [edi+ebx+16], mm4
3627       paddb mm6, mm7
3628       movq mm1, [esi+ebx+32]
3629       movq [edi+ebx+24], mm6
3630       movq mm0, [edi+ebx+32]
3631       movq mm3, [esi+ebx+40]
3632       paddb mm0, mm1
3633       movq mm2, [edi+ebx+40]
3634       movq [edi+ebx+32], mm0
3635       paddb mm2, mm3
3636       movq mm5, [esi+ebx+48]
3637       movq [edi+ebx+40], mm2
3638       movq mm4, [edi+ebx+48]
3639       movq mm7, [esi+ebx+56]
3640       paddb mm4, mm5
3641       movq mm6, [edi+ebx+56]
3642       movq [edi+ebx+48], mm4
3643       add ebx, 64
3644       paddb mm6, mm7
3645       cmp ebx, ecx
3646       movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3647                                      // -8 to offset add ebx
3648       jb duploop
3649 
3650       cmp edx, 0                     // Test for bytes over mult of 64
3651       jz dupend
3652 
3653 
3654       // 2 lines added by lcreeve at netins.net
3655       // (mail 11 Jul 98 in png-implement list)
3656       cmp edx, 8 //test for less than 8 bytes
3657       jb duplt8
3658 
3659 
3660       add ecx, edx
3661       and edx, 0x00000007           // calc bytes over mult of 8
3662       sub ecx, edx                  // drop over bytes from length
3663       jz duplt8
3664       // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3665 duplpA:
3666       movq mm1, [esi+ebx]
3667       movq mm0, [edi+ebx]
3668       add ebx, 8
3669       paddb mm0, mm1
3670       cmp ebx, ecx
3671       movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3672       jb duplpA
3673       cmp edx, 0            // Test for bytes over mult of 8
3674       jz dupend
3675 duplt8:
3676       xor eax, eax
3677       add ecx, edx          // move over byte count into counter
3678       // Loop using x86 registers to update remaining bytes
3679 duplp2:
3680       mov al, [edi + ebx]
3681       add al, [esi + ebx]
3682       inc ebx
3683       cmp ebx, ecx
3684       mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3685       jb duplp2
3686 dupend:
3687       // Conversion of filtered row completed
3688       emms          // End MMX instructions; prep for possible FP instrs.
3689    } // end _asm block
3690 }
3691 
3692 
3693 // Optimized png_read_filter_row routines
3694 void /* PRIVATE */
png_read_filter_row(png_structp png_ptr,png_row_infop row_info,png_bytep row,png_bytep prev_row,int filter)3695 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3696    row, png_bytep prev_row, int filter)
3697 {
3698 #ifdef PNG_DEBUG
3699    char filnm[10];
3700 #endif
3701 
3702    if (mmx_supported == 2) {
3703 #if !defined(PNG_1_0_X)
3704        /* this should have happened in png_init_mmx_flags() already */
3705        png_warning(png_ptr, "asm_flags may not have been initialized");
3706 #endif
3707        png_mmx_support();
3708    }
3709 
3710 #ifdef PNG_DEBUG
3711    png_debug(1, "in png_read_filter_row\n");
3712    switch (filter)
3713    {
3714       case 0: png_snprintf(filnm, 10, "none");
3715          break;
3716 #if !defined(PNG_1_0_X)
3717       case 1: png_snprintf(filnm, 10, "sub-%s",
3718         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3719          break;
3720       case 2: png_snprintf(filnm, 10, "up-%s",
3721         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3722          break;
3723       case 3: png_snprintf(filnm, 10, "avg-%s",
3724         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3725          break;
3726       case 4: png_snprintf(filnm, 10, "Paeth-%s",
3727         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3728          break;
3729 #else
3730       case 1: png_snprintf(filnm, 10, "sub");
3731          break;
3732       case 2: png_snprintf(filnm, 10, "up");
3733          break;
3734       case 3: png_snprintf(filnm, 10, "avg");
3735          break;
3736       case 4: png_snprintf(filnm, 10, "Paeth");
3737          break;
3738 #endif
3739       default: png_snprintf(filnm, 10, "unknw");
3740          break;
3741    }
3742    png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3743    png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3744       (int)((row_info->pixel_depth + 7) >> 3));
3745    png_debug1(0,"len=%8d, ", row_info->rowbytes);
3746 #endif /* PNG_DEBUG */
3747 
3748    switch (filter)
3749    {
3750       case PNG_FILTER_VALUE_NONE:
3751          break;
3752 
3753       case PNG_FILTER_VALUE_SUB:
3754       {
3755 #if !defined(PNG_1_0_X)
3756          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3757              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3758              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3759 #else
3760          if (mmx_supported)
3761 #endif
3762          {
3763             png_read_filter_row_mmx_sub(row_info, row);
3764          }
3765          else
3766          {
3767             png_uint_32 i;
3768             png_uint_32 istop = row_info->rowbytes;
3769             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3770             png_bytep rp = row + bpp;
3771             png_bytep lp = row;
3772 
3773             for (i = bpp; i < istop; i++)
3774             {
3775                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3776                rp++;
3777             }
3778          }
3779          break;
3780       }
3781 
3782       case PNG_FILTER_VALUE_UP:
3783       {
3784 #if !defined(PNG_1_0_X)
3785          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3786              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3787              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3788 #else
3789          if (mmx_supported)
3790 #endif
3791          {
3792             png_read_filter_row_mmx_up(row_info, row, prev_row);
3793          }
3794          else
3795          {
3796             png_uint_32 i;
3797             png_uint_32 istop = row_info->rowbytes;
3798             png_bytep rp = row;
3799             png_bytep pp = prev_row;
3800 
3801             for (i = 0; i < istop; ++i)
3802             {
3803                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3804                rp++;
3805             }
3806          }
3807          break;
3808       }
3809 
3810       case PNG_FILTER_VALUE_AVG:
3811       {
3812 #if !defined(PNG_1_0_X)
3813          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3814              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3815              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3816 #else
3817          if (mmx_supported)
3818 #endif
3819          {
3820             png_read_filter_row_mmx_avg(row_info, row, prev_row);
3821          }
3822          else
3823          {
3824             png_uint_32 i;
3825             png_bytep rp = row;
3826             png_bytep pp = prev_row;
3827             png_bytep lp = row;
3828             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3829             png_uint_32 istop = row_info->rowbytes - bpp;
3830 
3831             for (i = 0; i < bpp; i++)
3832             {
3833                *rp = (png_byte)(((int)(*rp) +
3834                   ((int)(*pp++) >> 1)) & 0xff);
3835                rp++;
3836             }
3837 
3838             for (i = 0; i < istop; i++)
3839             {
3840                *rp = (png_byte)(((int)(*rp) +
3841                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3842                rp++;
3843             }
3844          }
3845          break;
3846       }
3847 
3848       case PNG_FILTER_VALUE_PAETH:
3849       {
3850 #if !defined(PNG_1_0_X)
3851          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3852              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3853              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3854 #else
3855          if (mmx_supported)
3856 #endif
3857          {
3858             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3859          }
3860          else
3861          {
3862             png_uint_32 i;
3863             png_bytep rp = row;
3864             png_bytep pp = prev_row;
3865             png_bytep lp = row;
3866             png_bytep cp = prev_row;
3867             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3868             png_uint_32 istop=row_info->rowbytes - bpp;
3869 
3870             for (i = 0; i < bpp; i++)
3871             {
3872                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3873                rp++;
3874             }
3875 
3876             for (i = 0; i < istop; i++)   // use leftover rp,pp
3877             {
3878                int a, b, c, pa, pb, pc, p;
3879 
3880                a = *lp++;
3881                b = *pp++;
3882                c = *cp++;
3883 
3884                p = b - c;
3885                pc = a - c;
3886 
3887 #ifdef PNG_USE_ABS
3888                pa = abs(p);
3889                pb = abs(pc);
3890                pc = abs(p + pc);
3891 #else
3892                pa = p < 0 ? -p : p;
3893                pb = pc < 0 ? -pc : pc;
3894                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3895 #endif
3896 
3897                /*
3898                   if (pa <= pb && pa <= pc)
3899                      p = a;
3900                   else if (pb <= pc)
3901                      p = b;
3902                   else
3903                      p = c;
3904                 */
3905 
3906                p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3907 
3908                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3909                rp++;
3910             }
3911          }
3912          break;
3913       }
3914 
3915       default:
3916          png_warning(png_ptr, "Ignoring bad row filter type");
3917          *row=0;
3918          break;
3919    }
3920 }
3921 
3922 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */
3923