1
2 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 *
4 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 *
6 * Last changed in libpng 1.2.19 August 18, 2007
7 * For conditions of distribution and use, see copyright notice in png.h
8 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
9 * Copyright (c) 1998, Intel Corporation
10 *
11 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
12 * Interface to libpng contributed by Gilles Vollant, 1999
13 *
14 *
15 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
16 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
17 * in bad pixels at the beginning of some rows of some images, and also
18 * (due to out-of-range memory reads and writes) caused heap corruption
19 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 *
21 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 *
23 * [runtime MMX configuration, GRR 20010102]
24 *
25 * [Copy 6 bytes per pixel, not 4, and use stride of 6, not 4, in the
26 * second loop of interlace processing of 48-bit pixels, GR-P 20070717]
27 *
28 * [move instances of uAll union into local, except for two constant
29 * instances, GR-P 20070805]
30 */
31
32 #define PNG_INTERNAL
33 #include "png.h"
34
35 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
36
37
/* File-scope MMX capability flag: 2 = "not probed yet", 1 = MMX present,
 * 0 = MMX absent.  Filled in lazily by png_mmx_support(). */
static int mmx_supported=2;

/* Runtime probe for MMX support.
 *
 * Returns 1 if the CPU implements the CPUID instruction and CPUID.1:EDX
 * reports the MMX feature, 0 otherwise.  The result is also cached in the
 * file-scope 'mmx_supported' flag.  MSVC inline assembly, 32-bit x86 only.
 *
 * The CPUID test works by toggling the ID bit (bit 21) of EFLAGS: on CPUs
 * without CPUID that bit cannot be changed. */
int PNGAPI
png_mmx_support(void)
{
    int mmx_supported_local = 0;
    _asm {
        push ebx          //CPUID clobbers ebx/ecx/edx; preserve them
        push ecx
        push edx

        pushfd            //Save EFLAGS to stack
        pop eax           //Get EFLAGS from stack into eax
        mov ecx, eax      //Make another copy of EFLAGS in ecx
        xor eax, 0x200000 //Toggle the ID bit in EFLAGS [i.e. bit 21]
        push eax          //Save modified EFLAGS back to stack

        popfd             //Restore modified value back to EFLAGS reg
        pushfd            //Save EFLAGS to stack
        pop eax           //Get EFLAGS from stack
        push ecx          //save original EFLAGS to stack
        popfd             //restore original EFLAGS
        xor eax, ecx      //Compare the new EFLAGS with the original EFLAGS
        jz NOT_SUPPORTED  //If identical, the ID bit could not be toggled,
                          //so the CPUID instruction is not supported; skip
                          //the following instructions and jump to the
                          //NOT_SUPPORTED label

        xor eax, eax      //Set eax to zero (CPUID leaf 0)

        _asm _emit 0x0f   //CPUID instruction (two-byte opcode; emitted
        _asm _emit 0xa2   //directly for old assemblers)

        cmp eax, 1        //eax now holds the max supported CPUID leaf;
        jl NOT_SUPPORTED  //leaf 1 (feature flags) must be available

        xor eax, eax      //set eax to zero
        inc eax           //Now increment eax to 1 (CPUID leaf 1).  This
                          //instruction is smaller/faster than "mov eax, 1"

        _asm _emit 0x0f   //CPUID instruction
        _asm _emit 0xa2

        and edx, 0x00800000 //mask out all bits but the MMX bit (EDX bit 23)
        cmp edx, 0          //zero means MMX is not supported
        jz NOT_SUPPORTED    //so leave the return value at 0

        mov mmx_supported_local, 1 //MMX present: set return value to 1

        NOT_SUPPORTED:
        mov eax, mmx_supported_local //move return value to eax
        pop edx           //restore the registers CPUID trashed
        pop ecx
        pop ebx
    }

    //mmx_supported_local=0; // test code for force don't support MMX
    //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

    mmx_supported = mmx_supported_local;
    return mmx_supported_local;
}
99
100 /* Combines the row recently read in with the previous row.
101 This routine takes care of alpha and transparency if requested.
102 This routine also handles the two methods of progressive display
103 of interlaced images, depending on the mask value.
104 The mask value describes which pixels are to be combined with
105 the row. The pattern always repeats every 8 pixels, so just 8
106 bits are needed. A one indicates the pixel is to be combined; a
107 zero indicates the pixel is to be skipped. This is in addition
108 to any alpha or transparency value associated with the pixel. If
109 you want all pixels to be combined, pass 0xff (255) in mask. */
110
111 /* Use this routine for x86 platform - uses faster MMX routine if machine
112 supports MMX */
113
/* Combine the row most recently read with the existing row in 'row',
 * honoring the repeating 8-pixel 'mask' pattern: a 1 bit means the pixel
 * from png_ptr->row_buf replaces the destination pixel, a 0 bit means the
 * destination pixel is kept (used for progressive display of interlaced
 * images).  mask == 0xff degenerates to a straight row copy.  For pixel
 * depths 8/16/24/32/48 an MMX masked-merge (8 pixels per iteration) is
 * used when runtime MMX support was detected; otherwise, and for the
 * sub-byte and 64-bit depths, portable C code is used.  The C fallback
 * relies on png_ptr->pass (sparkle-style: copies only the pixels that
 * belong to the current interlace pass). */
void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      /* Every pixel is selected: just copy the whole row. */
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
          png_ptr->width));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 24:       /* 3 bytes/pixel: 8 pixels span three quadwords */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            /* Per-quadword byte masks: each byte holds the mask bit that
             * governs the pixel that byte belongs to (24bpp layout). */
            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;   /* pixels handled by the MMX loop */
            diff = (png_ptr->width)&7;    /* leftover pixels (< 8) */

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7

                  /* mm0..mm2 become 0xFF where the destination byte is
                   * kept, 0x00 where the source byte replaces it */
                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6

                  mov        ecx,len      //load length of line
                  mov        esi,srcptr   //load source
                  mov        ebx,dstptr   //load dest
                  cmp        ecx,0
                  jz         mainloop24end

mainloop24:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4


                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  add        esi,24       //inc by 24 bytes processed
                  add        ebx,24
                  sub        ecx,8        //dec by 8 pixels processed

                  ja         mainloop24

mainloop24end:
                  /* Handle the trailing < 8 pixels one at a time,
                   * walking the mask bits out through CF. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end24

                  mov        edx,mask
                  sal        edx,24       //make low byte the high byte
secondloop24:
                  sal        edx,1        //move high bit to CF
                  jnc        skip24       //if CF = 0
                  mov        ax,[esi]     //copy 2 bytes + 1 byte = one
                  mov        [ebx],ax     //3-byte pixel
                  xor        eax,eax
                  mov        al,[esi+2]
                  mov        [ebx+2],al
skip24:
                  add        esi,3
                  add        ebx,3

                  dec        ecx
                  jnz        secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }       // end 24 bpp

         case 32:       /* 4 bytes/pixel: 8 pixels span four quadwords */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6

                  mov        ecx,len      //load length of line
                  mov        esi,srcptr   //load source
                  mov        ebx,dstptr   //load dest

                  cmp        ecx,0        //lcr
                  jz         mainloop32end

mainloop32:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm5,mm3
                  movq       mm4,[ebx+24]
                  pandn      mm5,mm4
                  por        mm7,mm5
                  movq       [ebx+24],mm7

                  add        esi,32       //inc by 32 bytes processed
                  add        ebx,32
                  sub        ecx,8        //dec by 8 pixels processed

                  ja         mainloop32

mainloop32end:
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end32

                  mov        edx,mask
                  sal        edx,24       //make low byte the high byte
secondloop32:
                  sal        edx,1        //move high bit to CF
                  jnc        skip32       //if CF = 0
                  mov        eax,[esi]    //one whole 4-byte pixel
                  mov        [ebx],eax
skip32:
                  add        esi,4
                  add        ebx,4

                  dec        ecx
                  jnz        secondloop32

end32:
                  emms
               }
            }
            else /* mmx _not supported - Use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }       // end 32 bpp

         case 8:        /* 1 byte/pixel: 8 pixels fit in one quadword */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;  //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost

               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0

                  pand       mm0,mm7       //nonzero if keep byte
                  pcmpeqb    mm0,mm6       //zeros->1s, v versa

                  mov        ecx,len       //load length of line (pixels)
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0         //lcr
                  je         mainloop8end

mainloop8:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm4,mm6
                  movq       [ebx],mm4

                  add        esi,8         //inc by 8 bytes processed
                  add        ebx,8
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop8
mainloop8end:

                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end8

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte

secondloop8:
                  sal        edx,1         //move high bit to CF
                  jnc        skip8         //if CF = 0
                  mov        al,[esi]
                  mov        [ebx],al
skip8:
                  inc        esi
                  inc        ebx

                  dec        ecx
                  jnz        secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }       // end 8 bpp

         case 1:        /* sub-byte depths: C only, bit-shift merge */
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;        /* walking mask bit for the current pixel */
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  /* clear the destination bit, then OR in the new bit */
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 16:       /* 2 bytes/pixel: 8 pixels span two quadwords */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1

                  pand       mm0,mm7
                  pand       mm1,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6

                  mov        ecx,len      //load length of line
                  mov        esi,srcptr   //load source
                  mov        ebx,dstptr   //load dest
                  cmp        ecx,0        //lcr
                  jz         mainloop16end

mainloop16:
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  add        esi,16       //inc by 16 bytes processed
                  add        ebx,16
                  sub        ecx,8        //dec by 8 pixels processed

                  ja         mainloop16

mainloop16end:
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end16

                  mov        edx,mask
                  sal        edx,24       //make low byte the high byte
secondloop16:
                  sal        edx,1        //move high bit to CF
                  jnc        skip16       //if CF = 0
                  mov        ax,[esi]     //one whole 2-byte pixel
                  mov        [ebx],ax
skip16:
                  add        esi,2
                  add        ebx,2

                  dec        ecx
                  jnz        secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }       // end 16 bpp

         case 48:       /* 6 bytes/pixel: 8 pixels span six quadwords */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3
                  movq       mm4,mask4
                  movq       mm5,mask5

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7
                  pand       mm4,mm7
                  pand       mm5,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6
                  pcmpeqb    mm4,mm6
                  pcmpeqb    mm5,mm6

                  mov        ecx,len      //load length of line
                  mov        esi,srcptr   //load source
                  mov        ebx,dstptr   //load dest

                  cmp        ecx,0
                  jz         mainloop48end

mainloop48:
                  movq       mm7,[esi]
                  pand       mm7,mm0
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm7,mm6
                  movq       [ebx],mm7

                  movq       mm6,[esi+8]
                  pand       mm6,mm1
                  movq       mm7,mm1
                  pandn      mm7,[ebx+8]
                  por        mm6,mm7
                  movq       [ebx+8],mm6

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm7,mm2
                  pandn      mm7,[ebx+16]
                  por        mm6,mm7
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm6,mm3
                  pandn      mm6,[ebx+24]
                  por        mm7,mm6
                  movq       [ebx+24],mm7

                  movq       mm6,[esi+32]
                  pand       mm6,mm4
                  movq       mm7,mm4
                  pandn      mm7,[ebx+32]
                  por        mm6,mm7
                  movq       [ebx+32],mm6

                  movq       mm7,[esi+40]
                  pand       mm7,mm5
                  movq       mm6,mm5
                  pandn      mm6,[ebx+40]
                  por        mm7,mm6
                  movq       [ebx+40],mm7

                  add        esi,48       //inc by 48 bytes processed
                  add        ebx,48
                  sub        ecx,8        //dec by 8 pixels processed

                  ja         mainloop48
mainloop48end:

                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end48

                  mov        edx,mask
                  sal        edx,24       //make low byte the high byte

secondloop48:
                  sal        edx,1        //move high bit to CF
                  jnc        skip48       //if CF = 0
                  mov        eax,[esi]    //copy 4 bytes + 2 bytes = one
                  mov        [ebx],eax    //6-byte pixel
                  mov        ax,[esi+4]   // These 2 lines added 20070717
                  mov        [ebx+4],ax   // Glenn R-P
skip48:
                  add        esi,6        // Changed 4 to 6 on these 2
                  add        ebx,6        // lines.  Glenn R-P 20070717

                  dec        ecx
                  jnz        secondloop48

end48:
                  emms
               }
            }
            else /* mmx _not supported - Use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }       // end 48 bpp

         default:       /* e.g. 64 bpp: no MMX version, C fallback only */
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */
1003
1004
1005 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1006
1007 void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)1008 png_do_read_interlace(png_structp png_ptr)
1009 {
1010 png_row_infop row_info = &(png_ptr->row_info);
1011 png_bytep row = png_ptr->row_buf + 1;
1012 int pass = png_ptr->pass;
1013 png_uint_32 transformations = png_ptr->transformations;
1014 #ifdef PNG_USE_LOCAL_ARRAYS
1015 PNG_CONST int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1016 #endif
1017
1018 png_debug(1,"in png_do_read_interlace\n");
1019
1020 if (mmx_supported == 2) {
1021 #if !defined(PNG_1_0_X)
1022 /* this should have happened in png_init_mmx_flags() already */
1023 png_warning(png_ptr, "asm_flags may not have been initialized");
1024 #endif
1025 png_mmx_support();
1026 }
1027
1028 if (row != NULL && row_info != NULL)
1029 {
1030 png_uint_32 final_width;
1031
1032 final_width = row_info->width * png_pass_inc[pass];
1033
1034 switch (row_info->pixel_depth)
1035 {
1036 case 1:
1037 {
1038 png_bytep sp, dp;
1039 int sshift, dshift;
1040 int s_start, s_end, s_inc;
1041 png_byte v;
1042 png_uint_32 i;
1043 int j;
1044
1045 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1046 dp = row + (png_size_t)((final_width - 1) >> 3);
1047 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1048 if (transformations & PNG_PACKSWAP)
1049 {
1050 sshift = (int)((row_info->width + 7) & 7);
1051 dshift = (int)((final_width + 7) & 7);
1052 s_start = 7;
1053 s_end = 0;
1054 s_inc = -1;
1055 }
1056 else
1057 #endif
1058 {
1059 sshift = 7 - (int)((row_info->width + 7) & 7);
1060 dshift = 7 - (int)((final_width + 7) & 7);
1061 s_start = 0;
1062 s_end = 7;
1063 s_inc = 1;
1064 }
1065
1066 for (i = row_info->width; i; i--)
1067 {
1068 v = (png_byte)((*sp >> sshift) & 0x1);
1069 for (j = 0; j < png_pass_inc[pass]; j++)
1070 {
1071 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1072 *dp |= (png_byte)(v << dshift);
1073 if (dshift == s_end)
1074 {
1075 dshift = s_start;
1076 dp--;
1077 }
1078 else
1079 dshift += s_inc;
1080 }
1081 if (sshift == s_end)
1082 {
1083 sshift = s_start;
1084 sp--;
1085 }
1086 else
1087 sshift += s_inc;
1088 }
1089 break;
1090 }
1091
1092 case 2:
1093 {
1094 png_bytep sp, dp;
1095 int sshift, dshift;
1096 int s_start, s_end, s_inc;
1097 png_uint_32 i;
1098
1099 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1100 dp = row + (png_size_t)((final_width - 1) >> 2);
1101 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1102 if (transformations & PNG_PACKSWAP)
1103 {
1104 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1105 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1106 s_start = 6;
1107 s_end = 0;
1108 s_inc = -2;
1109 }
1110 else
1111 #endif
1112 {
1113 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1114 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1115 s_start = 0;
1116 s_end = 6;
1117 s_inc = 2;
1118 }
1119
1120 for (i = row_info->width; i; i--)
1121 {
1122 png_byte v;
1123 int j;
1124
1125 v = (png_byte)((*sp >> sshift) & 0x3);
1126 for (j = 0; j < png_pass_inc[pass]; j++)
1127 {
1128 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1129 *dp |= (png_byte)(v << dshift);
1130 if (dshift == s_end)
1131 {
1132 dshift = s_start;
1133 dp--;
1134 }
1135 else
1136 dshift += s_inc;
1137 }
1138 if (sshift == s_end)
1139 {
1140 sshift = s_start;
1141 sp--;
1142 }
1143 else
1144 sshift += s_inc;
1145 }
1146 break;
1147 }
1148
1149 case 4:
1150 {
1151 png_bytep sp, dp;
1152 int sshift, dshift;
1153 int s_start, s_end, s_inc;
1154 png_uint_32 i;
1155
1156 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1157 dp = row + (png_size_t)((final_width - 1) >> 1);
1158 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1159 if (transformations & PNG_PACKSWAP)
1160 {
1161 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1162 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1163 s_start = 4;
1164 s_end = 0;
1165 s_inc = -4;
1166 }
1167 else
1168 #endif
1169 {
1170 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1171 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1172 s_start = 0;
1173 s_end = 4;
1174 s_inc = 4;
1175 }
1176
1177 for (i = row_info->width; i; i--)
1178 {
1179 png_byte v;
1180 int j;
1181
1182 v = (png_byte)((*sp >> sshift) & 0xf);
1183 for (j = 0; j < png_pass_inc[pass]; j++)
1184 {
1185 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1186 *dp |= (png_byte)(v << dshift);
1187 if (dshift == s_end)
1188 {
1189 dshift = s_start;
1190 dp--;
1191 }
1192 else
1193 dshift += s_inc;
1194 }
1195 if (sshift == s_end)
1196 {
1197 sshift = s_start;
1198 sp--;
1199 }
1200 else
1201 sshift += s_inc;
1202 }
1203 break;
1204 }
1205
1206 default: // This is the place where the routine is modified
1207 {
1208 __int64 const4 = 0x0000000000FFFFFF;
1209 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1210 __int64 const6 = 0x00000000000000FF;
1211 png_bytep sptr, dp;
1212 png_uint_32 i;
1213 png_size_t pixel_bytes;
1214 int width = row_info->width;
1215
1216 pixel_bytes = (row_info->pixel_depth >> 3);
1217
1218 sptr = row + (width - 1) * pixel_bytes;
1219 dp = row + (final_width - 1) * pixel_bytes;
1220 // New code by Nirav Chhatrapati - Intel Corporation
1221 // sign fix by GRR
1222 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1223
1224 // use MMX routine if machine supports it
1225 #if !defined(PNG_1_0_X)
1226 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1227 /* && mmx_supported */ )
1228 #else
1229 if (mmx_supported)
1230 #endif
1231 {
1232 if (pixel_bytes == 3)
1233 {
1234 if (((pass == 4) || (pass == 5)) && width)
1235 {
1236 int width_mmx = ((width >> 1) << 1) - 8;
1237 if (width_mmx < 0)
1238 width_mmx = 0;
1239 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1240 if (width_mmx)
1241 {
1242 _asm
1243 {
1244 mov esi, sptr
1245 mov edi, dp
1246 mov ecx, width_mmx
1247 sub esi, 3
1248 sub edi, 9
1249 loop_pass4:
1250 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1251 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1252 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1253 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1254 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1255 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1256 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1257 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1258 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1259 movq [edi], mm0 ; move quad to memory
1260 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1261 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1262 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1263 movd [edi+8], mm6 ; move double to memory
1264 sub esi, 6
1265 sub edi, 12
1266 sub ecx, 2
1267 jnz loop_pass4
1268 EMMS
1269 }
1270 }
1271
1272 sptr -= width_mmx*3;
1273 dp -= width_mmx*6;
1274 for (i = width; i; i--)
1275 {
1276 png_byte v[8];
1277 int j;
1278
1279 png_memcpy(v, sptr, 3);
1280 for (j = 0; j < png_pass_inc[pass]; j++)
1281 {
1282 png_memcpy(dp, v, 3);
1283 dp -= 3;
1284 }
1285 sptr -= 3;
1286 }
1287 }
1288 else if (((pass == 2) || (pass == 3)) && width)
1289 {
1290 _asm
1291 {
1292 mov esi, sptr
1293 mov edi, dp
1294 mov ecx, width
1295 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1296 loop_pass2:
1297 movd mm0, [esi] ; X X X X X v2 v1 v0
1298 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1299 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1300 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1301 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1302 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1303 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1304 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1305 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1306 movq [edi+4], mm0 ; move to memory
1307 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1308 movd [edi], mm0 ; move to memory
1309 sub esi, 3
1310 sub edi, 12
1311 dec ecx
1312 jnz loop_pass2
1313 EMMS
1314 }
1315 }
1316 else if (width) /* && ((pass == 0) || (pass == 1))) */
1317 {
1318 _asm
1319 {
1320 mov esi, sptr
1321 mov edi, dp
1322 mov ecx, width
1323 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1324 loop_pass0:
1325 movd mm0, [esi] ; X X X X X v2 v1 v0
1326 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1327 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1328 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1329 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1330 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1331 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1332 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1333 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1334 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1335 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1336 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1337 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1338 movq [edi+16] , mm4
1339 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1340 movq [edi+8] , mm3
1341 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1342 sub esi, 3
1343 movq [edi], mm0
1344 sub edi, 24
1345 //sub esi, 3
1346 dec ecx
1347 jnz loop_pass0
1348 EMMS
1349 }
1350 }
1351 } /* end of pixel_bytes == 3 */
1352
1353 else if (pixel_bytes == 1)
1354 {
1355 if (((pass == 4) || (pass == 5)) && width)
1356 {
1357 int width_mmx = ((width >> 3) << 3);
1358 width -= width_mmx;
1359 if (width_mmx)
1360 {
1361 _asm
1362 {
1363 mov esi, sptr
1364 mov edi, dp
1365 mov ecx, width_mmx
1366 sub edi, 15
1367 sub esi, 7
1368 loop1_pass4:
1369 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1370 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1371 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1372 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1373 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1374 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1375 sub esi, 8
1376 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1377 //sub esi, 4
1378 sub edi, 16
1379 sub ecx, 8
1380 jnz loop1_pass4
1381 EMMS
1382 }
1383 }
1384
1385 sptr -= width_mmx;
1386 dp -= width_mmx*2;
1387 for (i = width; i; i--)
1388 {
1389 int j;
1390
1391 for (j = 0; j < png_pass_inc[pass]; j++)
1392 {
1393 *dp-- = *sptr;
1394 }
1395 sptr --;
1396 }
1397 }
1398 else if (((pass == 2) || (pass == 3)) && width)
1399 {
1400 int width_mmx = ((width >> 2) << 2);
1401 width -= width_mmx;
1402 if (width_mmx)
1403 {
1404 _asm
1405 {
1406 mov esi, sptr
1407 mov edi, dp
1408 mov ecx, width_mmx
1409 sub edi, 15
1410 sub esi, 3
1411 loop1_pass2:
1412 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1413 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1414 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1415 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1416 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1417 movq [edi], mm0 ; move to memory v2 and v3
1418 sub esi, 4
1419 movq [edi+8], mm1 ; move to memory v1 and v0
1420 sub edi, 16
1421 sub ecx, 4
1422 jnz loop1_pass2
1423 EMMS
1424 }
1425 }
1426
1427 sptr -= width_mmx;
1428 dp -= width_mmx*4;
1429 for (i = width; i; i--)
1430 {
1431 int j;
1432
1433 for (j = 0; j < png_pass_inc[pass]; j++)
1434 {
1435 *dp-- = *sptr;
1436 }
1437 sptr --;
1438 }
1439 }
1440 else if (width) /* && ((pass == 0) || (pass == 1))) */
1441 {
1442 int width_mmx = ((width >> 2) << 2);
1443 width -= width_mmx;
1444 if (width_mmx)
1445 {
1446 _asm
1447 {
1448 mov esi, sptr
1449 mov edi, dp
1450 mov ecx, width_mmx
1451 sub edi, 31
1452 sub esi, 3
1453 loop1_pass0:
1454 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1455 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1456 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1457 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1458 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1459 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1460 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1461 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1462 movq [edi], mm0 ; move to memory v3
1463 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1464 movq [edi+8], mm3 ; move to memory v2
1465 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1466 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1467 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1468 movq [edi+16], mm2 ; move to memory v1
1469 movq [edi+24], mm4 ; move to memory v0
1470 sub esi, 4
1471 sub edi, 32
1472 sub ecx, 4
1473 jnz loop1_pass0
1474 EMMS
1475 }
1476 }
1477
1478 sptr -= width_mmx;
1479 dp -= width_mmx*8;
1480 for (i = width; i; i--)
1481 {
1482 int j;
1483
1484 /* I simplified this part in version 1.0.4e
1485 * here and in several other instances where
1486 * pixel_bytes == 1 -- GR-P
1487 *
1488 * Original code:
1489 *
1490 * png_byte v[8];
1491 * png_memcpy(v, sptr, pixel_bytes);
1492 * for (j = 0; j < png_pass_inc[pass]; j++)
1493 * {
1494 * png_memcpy(dp, v, pixel_bytes);
1495 * dp -= pixel_bytes;
1496 * }
1497 * sptr -= pixel_bytes;
1498 *
1499 * Replacement code is in the next three lines:
1500 */
1501
1502 for (j = 0; j < png_pass_inc[pass]; j++)
1503 *dp-- = *sptr;
1504 sptr--;
1505 }
1506 }
1507 } /* end of pixel_bytes == 1 */
1508
1509 else if (pixel_bytes == 2)
1510 {
1511 if (((pass == 4) || (pass == 5)) && width)
1512 {
1513 int width_mmx = ((width >> 1) << 1) ;
1514 width -= width_mmx;
1515 if (width_mmx)
1516 {
1517 _asm
1518 {
1519 mov esi, sptr
1520 mov edi, dp
1521 mov ecx, width_mmx
1522 sub esi, 2
1523 sub edi, 6
1524 loop2_pass4:
1525 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1526 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1527 sub esi, 4
1528 movq [edi], mm0
1529 sub edi, 8
1530 sub ecx, 2
1531 jnz loop2_pass4
1532 EMMS
1533 }
1534 }
1535
1536 sptr -= (width_mmx*2 - 2); // sign fixed
1537 dp -= (width_mmx*4 - 2); // sign fixed
1538 for (i = width; i; i--)
1539 {
1540 png_byte v[8];
1541 int j;
1542 sptr -= 2;
1543 png_memcpy(v, sptr, 2);
1544 for (j = 0; j < png_pass_inc[pass]; j++)
1545 {
1546 dp -= 2;
1547 png_memcpy(dp, v, 2);
1548 }
1549 }
1550 }
1551 else if (((pass == 2) || (pass == 3)) && width)
1552 {
1553 int width_mmx = ((width >> 1) << 1) ;
1554 width -= width_mmx;
1555 if (width_mmx)
1556 {
1557 _asm
1558 {
1559 mov esi, sptr
1560 mov edi, dp
1561 mov ecx, width_mmx
1562 sub esi, 2
1563 sub edi, 14
1564 loop2_pass2:
1565 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1566 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1567 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1568 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1569 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1570 movq [edi], mm0
1571 sub esi, 4
1572 movq [edi + 8], mm1
1573 //sub esi, 4
1574 sub edi, 16
1575 sub ecx, 2
1576 jnz loop2_pass2
1577 EMMS
1578 }
1579 }
1580
1581 sptr -= (width_mmx*2 - 2); // sign fixed
1582 dp -= (width_mmx*8 - 2); // sign fixed
1583 for (i = width; i; i--)
1584 {
1585 png_byte v[8];
1586 int j;
1587 sptr -= 2;
1588 png_memcpy(v, sptr, 2);
1589 for (j = 0; j < png_pass_inc[pass]; j++)
1590 {
1591 dp -= 2;
1592 png_memcpy(dp, v, 2);
1593 }
1594 }
1595 }
1596 else if (width) /* && ((pass == 0) || (pass == 1))) */
1597 {
1598 int width_mmx = ((width >> 1) << 1);
1599 width -= width_mmx;
1600 if (width_mmx)
1601 {
1602 _asm
1603 {
1604 mov esi, sptr
1605 mov edi, dp
1606 mov ecx, width_mmx
1607 sub esi, 2
1608 sub edi, 30
1609 loop2_pass0:
1610 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1611 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1612 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1613 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1614 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1615 movq [edi], mm0
1616 movq [edi + 8], mm0
1617 movq [edi + 16], mm1
1618 movq [edi + 24], mm1
1619 sub esi, 4
1620 sub edi, 32
1621 sub ecx, 2
1622 jnz loop2_pass0
1623 EMMS
1624 }
1625 }
1626
1627 sptr -= (width_mmx*2 - 2); // sign fixed
1628 dp -= (width_mmx*16 - 2); // sign fixed
1629 for (i = width; i; i--)
1630 {
1631 png_byte v[8];
1632 int j;
1633 sptr -= 2;
1634 png_memcpy(v, sptr, 2);
1635 for (j = 0; j < png_pass_inc[pass]; j++)
1636 {
1637 dp -= 2;
1638 png_memcpy(dp, v, 2);
1639 }
1640 }
1641 }
1642 } /* end of pixel_bytes == 2 */
1643
1644 else if (pixel_bytes == 4)
1645 {
1646 if (((pass == 4) || (pass == 5)) && width)
1647 {
1648 int width_mmx = ((width >> 1) << 1) ;
1649 width -= width_mmx;
1650 if (width_mmx)
1651 {
1652 _asm
1653 {
1654 mov esi, sptr
1655 mov edi, dp
1656 mov ecx, width_mmx
1657 sub esi, 4
1658 sub edi, 12
1659 loop4_pass4:
1660 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1661 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1662 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1663 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1664 movq [edi], mm0
1665 sub esi, 8
1666 movq [edi + 8], mm1
1667 sub edi, 16
1668 sub ecx, 2
1669 jnz loop4_pass4
1670 EMMS
1671 }
1672 }
1673
1674 sptr -= (width_mmx*4 - 4); // sign fixed
1675 dp -= (width_mmx*8 - 4); // sign fixed
1676 for (i = width; i; i--)
1677 {
1678 png_byte v[8];
1679 int j;
1680 sptr -= 4;
1681 png_memcpy(v, sptr, 4);
1682 for (j = 0; j < png_pass_inc[pass]; j++)
1683 {
1684 dp -= 4;
1685 png_memcpy(dp, v, 4);
1686 }
1687 }
1688 }
1689 else if (((pass == 2) || (pass == 3)) && width)
1690 {
1691 int width_mmx = ((width >> 1) << 1) ;
1692 width -= width_mmx;
1693 if (width_mmx)
1694 {
1695 _asm
1696 {
1697 mov esi, sptr
1698 mov edi, dp
1699 mov ecx, width_mmx
1700 sub esi, 4
1701 sub edi, 28
1702 loop4_pass2:
1703 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1704 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1705 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1706 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1707 movq [edi], mm0
1708 movq [edi + 8], mm0
1709 movq [edi+16], mm1
1710 movq [edi + 24], mm1
1711 sub esi, 8
1712 sub edi, 32
1713 sub ecx, 2
1714 jnz loop4_pass2
1715 EMMS
1716 }
1717 }
1718
1719 sptr -= (width_mmx*4 - 4); // sign fixed
1720 dp -= (width_mmx*16 - 4); // sign fixed
1721 for (i = width; i; i--)
1722 {
1723 png_byte v[8];
1724 int j;
1725 sptr -= 4;
1726 png_memcpy(v, sptr, 4);
1727 for (j = 0; j < png_pass_inc[pass]; j++)
1728 {
1729 dp -= 4;
1730 png_memcpy(dp, v, 4);
1731 }
1732 }
1733 }
1734 else if (width) /* && ((pass == 0) || (pass == 1))) */
1735 {
1736 int width_mmx = ((width >> 1) << 1) ;
1737 width -= width_mmx;
1738 if (width_mmx)
1739 {
1740 _asm
1741 {
1742 mov esi, sptr
1743 mov edi, dp
1744 mov ecx, width_mmx
1745 sub esi, 4
1746 sub edi, 60
1747 loop4_pass0:
1748 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1749 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1750 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1751 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1752 movq [edi], mm0
1753 movq [edi + 8], mm0
1754 movq [edi + 16], mm0
1755 movq [edi + 24], mm0
1756 movq [edi+32], mm1
1757 movq [edi + 40], mm1
1758 movq [edi+ 48], mm1
1759 sub esi, 8
1760 movq [edi + 56], mm1
1761 sub edi, 64
1762 sub ecx, 2
1763 jnz loop4_pass0
1764 EMMS
1765 }
1766 }
1767
1768 sptr -= (width_mmx*4 - 4); // sign fixed
1769 dp -= (width_mmx*32 - 4); // sign fixed
1770 for (i = width; i; i--)
1771 {
1772 png_byte v[8];
1773 int j;
1774 sptr -= 4;
1775 png_memcpy(v, sptr, 4);
1776 for (j = 0; j < png_pass_inc[pass]; j++)
1777 {
1778 dp -= 4;
1779 png_memcpy(dp, v, 4);
1780 }
1781 }
1782 }
1783
1784 } /* end of pixel_bytes == 4 */
1785
1786 else if (pixel_bytes == 6)
1787 {
1788 for (i = width; i; i--)
1789 {
1790 png_byte v[8];
1791 int j;
1792 png_memcpy(v, sptr, 6);
1793 for (j = 0; j < png_pass_inc[pass]; j++)
1794 {
1795 png_memcpy(dp, v, 6);
1796 dp -= 6;
1797 }
1798 sptr -= 6;
1799 }
1800 } /* end of pixel_bytes == 6 */
1801
1802 else
1803 {
1804 for (i = width; i; i--)
1805 {
1806 png_byte v[8];
1807 int j;
1808 png_memcpy(v, sptr, pixel_bytes);
1809 for (j = 0; j < png_pass_inc[pass]; j++)
1810 {
1811 png_memcpy(dp, v, pixel_bytes);
1812 dp -= pixel_bytes;
1813 }
1814 sptr-= pixel_bytes;
1815 }
1816 }
1817 } /* end of mmx_supported */
1818
1819 else /* MMX not supported: use modified C code - takes advantage
1820 * of inlining of memcpy for a constant */
1821 {
1822 if (pixel_bytes == 1)
1823 {
1824 for (i = width; i; i--)
1825 {
1826 int j;
1827 for (j = 0; j < png_pass_inc[pass]; j++)
1828 *dp-- = *sptr;
1829 sptr--;
1830 }
1831 }
1832 else if (pixel_bytes == 3)
1833 {
1834 for (i = width; i; i--)
1835 {
1836 png_byte v[8];
1837 int j;
1838 png_memcpy(v, sptr, pixel_bytes);
1839 for (j = 0; j < png_pass_inc[pass]; j++)
1840 {
1841 png_memcpy(dp, v, pixel_bytes);
1842 dp -= pixel_bytes;
1843 }
1844 sptr -= pixel_bytes;
1845 }
1846 }
1847 else if (pixel_bytes == 2)
1848 {
1849 for (i = width; i; i--)
1850 {
1851 png_byte v[8];
1852 int j;
1853 png_memcpy(v, sptr, pixel_bytes);
1854 for (j = 0; j < png_pass_inc[pass]; j++)
1855 {
1856 png_memcpy(dp, v, pixel_bytes);
1857 dp -= pixel_bytes;
1858 }
1859 sptr -= pixel_bytes;
1860 }
1861 }
1862 else if (pixel_bytes == 4)
1863 {
1864 for (i = width; i; i--)
1865 {
1866 png_byte v[8];
1867 int j;
1868 png_memcpy(v, sptr, pixel_bytes);
1869 for (j = 0; j < png_pass_inc[pass]; j++)
1870 {
1871 png_memcpy(dp, v, pixel_bytes);
1872 dp -= pixel_bytes;
1873 }
1874 sptr -= pixel_bytes;
1875 }
1876 }
1877 else if (pixel_bytes == 6)
1878 {
1879 for (i = width; i; i--)
1880 {
1881 png_byte v[8];
1882 int j;
1883 png_memcpy(v, sptr, pixel_bytes);
1884 for (j = 0; j < png_pass_inc[pass]; j++)
1885 {
1886 png_memcpy(dp, v, pixel_bytes);
1887 dp -= pixel_bytes;
1888 }
1889 sptr -= pixel_bytes;
1890 }
1891 }
1892 else
1893 {
1894 for (i = width; i; i--)
1895 {
1896 png_byte v[8];
1897 int j;
1898 png_memcpy(v, sptr, pixel_bytes);
1899 for (j = 0; j < png_pass_inc[pass]; j++)
1900 {
1901 png_memcpy(dp, v, pixel_bytes);
1902 dp -= pixel_bytes;
1903 }
1904 sptr -= pixel_bytes;
1905 }
1906 }
1907
1908 } /* end of MMX not supported */
1909 break;
1910 }
1911 } /* end switch (row_info->pixel_depth) */
1912
1913 row_info->width = final_width;
1914
1915 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1916 }
1917
1918 }
1919
1920 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1921
1922
1923 // These global constants are declared
1924 // here to ensure alignment on 8-byte boundaries.
// union uAll: a 64-bit datum accessed through the .use member.  The
// double / long long members are never read or written; they exist only
// to force the union to 8-byte alignment so that the MMX "movq" loads
// of these values (and of the local mask/shift variables declared with
// this type in the filter functions below) are aligned accesses.
1925 union uAll {
1926 __int64 use; // the value the code actually reads/writes
1927 double double_align; // unused: present only to force 8-byte alignment
1928 long long long_long_align; // unused: present only to force 8-byte alignment
1929 } ;
// LBCarryMask: 0x01 in every byte.  ANDing with it extracts the low
// ("carry") bit of each byte; used to restore the carry lost when two
// bytes are each halved with a shift before being added (Avg filter).
// HBClearMask: 0x7f in every byte.  After a 64-bit psrlq by 1, bit 7 of
// each byte may hold a bit leaked from the neighboring byte; ANDing
// with this mask clears it, turning the 64-bit shift into 8 per-byte
// shifts.
1930 static PNG_CONST union uAll LBCarryMask = {0x0101010101010101},
1931 HBClearMask = {0x7f7f7f7f7f7f7f7f};
1932
1933 // Optimized code for PNG Average filter decoder
1934 void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info,png_bytep row,png_bytep prev_row)1935 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1936 , png_bytep prev_row)
1937 {
/*
 * Undo the PNG "Average" filter (filter type 3) in place on one row:
 *
 *     Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
 *
 * row_info - row description; pixel_depth and rowbytes are read here.
 * row      - the filtered row; overwritten with the reconstructed bytes.
 * prev_row - the already-reconstructed previous row (Prior).
 *
 * Structure: a scalar asm loop handles the first bpp bytes (where
 * Raw(x-bpp) does not exist, so Raw(x) = Avg(x) + Prior(x)/2), a second
 * scalar loop advances to an 8-byte alignment boundary, then a
 * bpp-specific MMX loop processes the aligned middle of the row, and a
 * final scalar loop finishes the (FullLength - MMXLength) tail bytes.
 * The MMX loops average bytes without widening by halving each addend
 * (psrlq 1, then HBClearMask to stop inter-byte leakage) and re-adding
 * the carry of the two low bits via LBCarryMask ("LBCarry").
 *
 * NOTE(review): relies on MSVC inline asm semantics -- the compiler
 * preserves EBX/ESI/EDI around _asm blocks, and diff/MMXLength carry
 * state between the separate _asm blocks.  Assumes rowbytes >= bpp --
 * TODO confirm the caller guarantees this.
 */
1938 // These variables are declared
1939 // here to ensure alignment on 8-byte boundaries.
1940 union uAll ActiveMask, ShiftBpp, ShiftRem;
1941
1942 int bpp;
1943 png_uint_32 FullLength;
1944 png_uint_32 MMXLength;
1945 //png_uint_32 len;
1946 int diff;
1947
1948 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1949 FullLength = row_info->rowbytes; // # of bytes to filter
1950 _asm {
1951 // Init address pointers and offset
1952 mov edi, row // edi ==> Avg(x)
1953 xor ebx, ebx // ebx ==> x
1954 mov edx, edi
1955 mov esi, prev_row // esi ==> Prior(x)
1956 sub edx, bpp // edx ==> Raw(x-bpp)
1957
1958 xor eax, eax
1959 // Compute the Raw value for the first bpp bytes
1960 // Raw(x) = Avg(x) + (Prior(x)/2)
1961 davgrlp:
1962 mov al, [esi + ebx] // Load al with Prior(x)
1963 inc ebx
1964 shr al, 1 // divide by 2
1965 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1966 cmp ebx, bpp
1967 mov [edi+ebx-1], al // Write back Raw(x);
1968 // mov does not affect flags; -1 to offset inc ebx
1969 jb davgrlp
1970 // get # of bytes to alignment
1971 mov diff, edi // take start of row
1972 add diff, ebx // add bpp
1973 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1974 and diff, 0xfffffff8 // mask to alignment boundary
1975 sub diff, edi // subtract from start ==> value ebx at alignment
1976 jz davggo
1977 // fix alignment
1978 // Compute the Raw value for the bytes upto the alignment boundary
1979 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1980 xor ecx, ecx
1981 davglp1:
1982 xor eax, eax
1983 mov cl, [esi + ebx] // load cl with Prior(x)
1984 mov al, [edx + ebx] // load al with Raw(x-bpp)
1985 add ax, cx
1986 inc ebx
1987 shr ax, 1 // divide by 2
1988 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1989 cmp ebx, diff // Check if at alignment boundary
1990 mov [edi+ebx-1], al // Write back Raw(x);
1991 // mov does not affect flags; -1 to offset inc ebx
1992 jb davglp1 // Repeat until at alignment boundary
1993 davggo:
1994 mov eax, FullLength
1995 mov ecx, eax
1996 sub eax, ebx // subtract alignment fix
1997 and eax, 0x00000007 // calc bytes over mult of 8
1998 sub ecx, eax // drop over bytes from original length
1999 mov MMXLength, ecx
2000 } // end _asm block
2001 // Now do the math for the rest of the row
2002 switch ( bpp )
2003 {
2004 case 3:
2005 {
// bpp == 3: each 8-byte chunk holds 2 whole pixels plus 2/3 of a
// third; the chunk is processed as three "active groups" of 3, 3,
// and 2 bytes, each depending on the group just computed.
2006 ActiveMask.use = 0x0000000000ffffff;
2007 ShiftBpp.use = 24; // == 3 * 8
2008 ShiftRem.use = 40; // == 64 - 24
2009 _asm {
2010 // Re-init address pointers and offset
2011 movq mm7, ActiveMask
2012 mov ebx, diff // ebx ==> x = offset to alignment boundary
2013 movq mm5, LBCarryMask
2014 mov edi, row // edi ==> Avg(x)
2015 movq mm4, HBClearMask
2016 mov esi, prev_row // esi ==> Prior(x)
2017 // PRIME the pump (load the first Raw(x-bpp) data set
2018 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2019 // (we correct position in loop below)
2020 davg3lp:
2021 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
2022 // Add (Prev_row/2) to Average
2023 movq mm3, mm5
2024 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
2025 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
2026 movq mm6, mm7
2027 pand mm3, mm1 // get lsb for each prev_row byte
2028 psrlq mm1, 1 // divide prev_row bytes by 2
2029 pand mm1, mm4 // clear invalid bit 7 of each byte
2030 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2031 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2032 movq mm1, mm3 // now use mm1 for getting LBCarrys
2033 pand mm1, mm2 // get LBCarrys for each byte where both
2034 // lsb's were == 1 (Only valid for active group)
2035 psrlq mm2, 1 // divide raw bytes by 2
2036 pand mm2, mm4 // clear invalid bit 7 of each byte
2037 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2038 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2039 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2040 // byte
2041 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2042 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2043 movq mm2, mm0 // mov updated Raws to mm2
2044 psllq mm2, ShiftBpp // shift data to position correctly
2045 movq mm1, mm3 // now use mm1 for getting LBCarrys
2046 pand mm1, mm2 // get LBCarrys for each byte where both
2047 // lsb's were == 1 (Only valid for active group)
2048 psrlq mm2, 1 // divide raw bytes by 2
2049 pand mm2, mm4 // clear invalid bit 7 of each byte
2050 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2051 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2052 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2053 // byte
2054
2055 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2056 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2057 // bytes
2058 movq mm2, mm0 // mov updated Raws to mm2
2059 psllq mm2, ShiftBpp // shift data to position correctly
2060 // Data only needs to be shifted once here to
2061 // get the correct x-bpp offset.
2062 movq mm1, mm3 // now use mm1 for getting LBCarrys
2063 pand mm1, mm2 // get LBCarrys for each byte where both
2064 // lsb's were == 1 (Only valid for active group)
2065 psrlq mm2, 1 // divide raw bytes by 2
2066 pand mm2, mm4 // clear invalid bit 7 of each byte
2067 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2068 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2069 add ebx, 8
2070 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2071 // byte
2072
2073 // Now ready to write back to memory
2074 movq [edi + ebx - 8], mm0
2075 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2076 cmp ebx, MMXLength
2077 movq mm2, mm0 // mov updated Raw(x) to mm2
2078 jb davg3lp
2079 } // end _asm block
2080 }
2081 break;
2082
2083 case 6:
2084 case 4:
2085 case 7:
2086 case 5:
2087 {
// bpp in 4..7: a chunk never spans more than two pixels' worth of
// dependent bytes, so only two active groups per 8-byte chunk.
2088 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2089 // appropriate inactive bytes
2090 ShiftBpp.use = bpp << 3;
2091 ShiftRem.use = 64 - ShiftBpp.use;
2092 _asm {
2093 movq mm4, HBClearMask
2094 // Re-init address pointers and offset
2095 mov ebx, diff // ebx ==> x = offset to alignment boundary
2096 // Load ActiveMask and clear all bytes except for 1st active group
2097 movq mm7, ActiveMask
2098 mov edi, row // edi ==> Avg(x)
2099 psrlq mm7, ShiftRem
2100 mov esi, prev_row // esi ==> Prior(x)
2101 movq mm6, mm7
2102 movq mm5, LBCarryMask
2103 psllq mm6, ShiftBpp // Create mask for 2nd active group
2104 // PRIME the pump (load the first Raw(x-bpp) data set
2105 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2106 // (we correct position in loop below)
2107 davg4lp:
2108 movq mm0, [edi + ebx]
2109 psrlq mm2, ShiftRem // shift data to position correctly
2110 movq mm1, [esi + ebx]
2111 // Add (Prev_row/2) to Average
2112 movq mm3, mm5
2113 pand mm3, mm1 // get lsb for each prev_row byte
2114 psrlq mm1, 1 // divide prev_row bytes by 2
2115 pand mm1, mm4 // clear invalid bit 7 of each byte
2116 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2117 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2118 movq mm1, mm3 // now use mm1 for getting LBCarrys
2119 pand mm1, mm2 // get LBCarrys for each byte where both
2120 // lsb's were == 1 (Only valid for active group)
2121 psrlq mm2, 1 // divide raw bytes by 2
2122 pand mm2, mm4 // clear invalid bit 7 of each byte
2123 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2124 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2125 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2126 // byte
2127 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2128 movq mm2, mm0 // mov updated Raws to mm2
2129 psllq mm2, ShiftBpp // shift data to position correctly
2130 add ebx, 8
2131 movq mm1, mm3 // now use mm1 for getting LBCarrys
2132 pand mm1, mm2 // get LBCarrys for each byte where both
2133 // lsb's were == 1 (Only valid for active group)
2134 psrlq mm2, 1 // divide raw bytes by 2
2135 pand mm2, mm4 // clear invalid bit 7 of each byte
2136 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2137 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2138 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2139 // byte
2140 cmp ebx, MMXLength
2141 // Now ready to write back to memory
2142 movq [edi + ebx - 8], mm0
2143 // Prep Raw(x-bpp) for next loop
2144 movq mm2, mm0 // mov updated Raws to mm2
2145 jb davg4lp
2146 } // end _asm block
2147 }
2148 break;
2149 case 2:
2150 {
// bpp == 2: four 2-byte active groups per 8-byte chunk.
// The 16/48 shift values below are the GRR 20000916 bugfix noted in
// the file header.
2151 ActiveMask.use = 0x000000000000ffff;
2152 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2153 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2154 _asm {
2155 // Load ActiveMask
2156 movq mm7, ActiveMask
2157 // Re-init address pointers and offset
2158 mov ebx, diff // ebx ==> x = offset to alignment boundary
2159 movq mm5, LBCarryMask
2160 mov edi, row // edi ==> Avg(x)
2161 movq mm4, HBClearMask
2162 mov esi, prev_row // esi ==> Prior(x)
2163 // PRIME the pump (load the first Raw(x-bpp) data set
2164 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2165 // (we correct position in loop below)
2166 davg2lp:
2167 movq mm0, [edi + ebx]
2168 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2169 movq mm1, [esi + ebx]
2170 // Add (Prev_row/2) to Average
2171 movq mm3, mm5
2172 pand mm3, mm1 // get lsb for each prev_row byte
2173 psrlq mm1, 1 // divide prev_row bytes by 2
2174 pand mm1, mm4 // clear invalid bit 7 of each byte
2175 movq mm6, mm7
2176 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2177 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2178 movq mm1, mm3 // now use mm1 for getting LBCarrys
2179 pand mm1, mm2 // get LBCarrys for each byte where both
2180 // lsb's were == 1 (Only valid for active group)
2181 psrlq mm2, 1 // divide raw bytes by 2
2182 pand mm2, mm4 // clear invalid bit 7 of each byte
2183 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2184 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2185 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2186 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2187 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2188 movq mm2, mm0 // mov updated Raws to mm2
2189 psllq mm2, ShiftBpp // shift data to position correctly
2190 movq mm1, mm3 // now use mm1 for getting LBCarrys
2191 pand mm1, mm2 // get LBCarrys for each byte where both
2192 // lsb's were == 1 (Only valid for active group)
2193 psrlq mm2, 1 // divide raw bytes by 2
2194 pand mm2, mm4 // clear invalid bit 7 of each byte
2195 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2196 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2197 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2198
2199 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2200 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2201 movq mm2, mm0 // mov updated Raws to mm2
2202 psllq mm2, ShiftBpp // shift data to position correctly
2203 // Data only needs to be shifted once here to
2204 // get the correct x-bpp offset.
2205 movq mm1, mm3 // now use mm1 for getting LBCarrys
2206 pand mm1, mm2 // get LBCarrys for each byte where both
2207 // lsb's were == 1 (Only valid for active group)
2208 psrlq mm2, 1 // divide raw bytes by 2
2209 pand mm2, mm4 // clear invalid bit 7 of each byte
2210 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2211 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2212 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2213
2214 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2215 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2216 movq mm2, mm0 // mov updated Raws to mm2
2217 psllq mm2, ShiftBpp // shift data to position correctly
2218 // Data only needs to be shifted once here to
2219 // get the correct x-bpp offset.
2220 add ebx, 8
2221 movq mm1, mm3 // now use mm1 for getting LBCarrys
2222 pand mm1, mm2 // get LBCarrys for each byte where both
2223 // lsb's were == 1 (Only valid for active group)
2224 psrlq mm2, 1 // divide raw bytes by 2
2225 pand mm2, mm4 // clear invalid bit 7 of each byte
2226 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2227 pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2228 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2229
2230 cmp ebx, MMXLength
2231 // Now ready to write back to memory
2232 movq [edi + ebx - 8], mm0
2233 // Prep Raw(x-bpp) for next loop
2234 movq mm2, mm0 // mov updated Raws to mm2
2235 jb davg2lp
2236 } // end _asm block
2237 }
2238 break;
2239
2240 case 1: // bpp == 1
2241 {
// bpp == 1: every byte depends on the byte just computed, so there
// is no MMX parallelism to exploit; decode the whole remainder with
// the scalar loop and return (skipping the shared MMX cleanup below,
// which this path does not need).
2242 _asm {
2243 // Re-init address pointers and offset
2244 mov ebx, diff // ebx ==> x = offset to alignment boundary
2245 mov edi, row // edi ==> Avg(x)
2246 cmp ebx, FullLength // Test if offset at end of array
2247 jnb davg1end
2248 // Do Average decode for remaining bytes
2249 mov esi, prev_row // esi ==> Prior(x)
2250 mov edx, edi
2251 xor ecx, ecx // zero ecx before using cl & cx in loop below
2252 sub edx, bpp // edx ==> Raw(x-bpp)
2253 davg1lp:
2254 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2255 xor eax, eax
2256 mov cl, [esi + ebx] // load cl with Prior(x)
2257 mov al, [edx + ebx] // load al with Raw(x-bpp)
2258 add ax, cx
2259 inc ebx
2260 shr ax, 1 // divide by 2
2261 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2262 cmp ebx, FullLength // Check if at end of array
2263 mov [edi+ebx-1], al // Write back Raw(x);
2264 // mov does not affect flags; -1 to offset inc ebx
2265 jb davg1lp
2266 davg1end:
2267 } // end _asm block
2268 }
2269 return;
2270
2271 case 8: // bpp == 8
2272 {
// bpp == 8: Raw(x-bpp) is exactly the previous 8-byte chunk, so no
// repositioning shift and no active-group masking are needed.
2273 _asm {
2274 // Re-init address pointers and offset
2275 mov ebx, diff // ebx ==> x = offset to alignment boundary
2276 movq mm5, LBCarryMask
2277 mov edi, row // edi ==> Avg(x)
2278 movq mm4, HBClearMask
2279 mov esi, prev_row // esi ==> Prior(x)
2280 // PRIME the pump (load the first Raw(x-bpp) data set
2281 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2282 // (NO NEED to correct position in loop below)
2283 davg8lp:
2284 movq mm0, [edi + ebx]
2285 movq mm3, mm5
2286 movq mm1, [esi + ebx]
2287 add ebx, 8
2288 pand mm3, mm1 // get lsb for each prev_row byte
2289 psrlq mm1, 1 // divide prev_row bytes by 2
2290 pand mm3, mm2 // get LBCarrys for each byte where both
2291 // lsb's were == 1
2292 psrlq mm2, 1 // divide raw bytes by 2
2293 pand mm1, mm4 // clear invalid bit 7 of each byte
2294 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2295 pand mm2, mm4 // clear invalid bit 7 of each byte
2296 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2297 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2298 cmp ebx, MMXLength
2299 movq [edi + ebx - 8], mm0
2300 movq mm2, mm0 // reuse as Raw(x-bpp)
2301 jb davg8lp
2302 } // end _asm block
2303 }
2304 break;
2305 default: // bpp greater than 8
2306 {
// bpp > 8: Raw(x-bpp) lies entirely before the current 8-byte chunk,
// so it can be loaded directly from memory (edx = edi - bpp) with no
// intra-chunk dependency handling.
2307 _asm {
2308 movq mm5, LBCarryMask
2309 // Re-init address pointers and offset
2310 mov ebx, diff // ebx ==> x = offset to alignment boundary
2311 mov edi, row // edi ==> Avg(x)
2312 movq mm4, HBClearMask
2313 mov edx, edi
2314 mov esi, prev_row // esi ==> Prior(x)
2315 sub edx, bpp // edx ==> Raw(x-bpp)
2316 davgAlp:
2317 movq mm0, [edi + ebx]
2318 movq mm3, mm5
2319 movq mm1, [esi + ebx]
2320 pand mm3, mm1 // get lsb for each prev_row byte
2321 movq mm2, [edx + ebx]
2322 psrlq mm1, 1 // divide prev_row bytes by 2
2323 pand mm3, mm2 // get LBCarrys for each byte where both
2324 // lsb's were == 1
2325 psrlq mm2, 1 // divide raw bytes by 2
2326 pand mm1, mm4 // clear invalid bit 7 of each byte
2327 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2328 pand mm2, mm4 // clear invalid bit 7 of each byte
2329 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2330 add ebx, 8
2331 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2332 cmp ebx, MMXLength
2333 movq [edi + ebx - 8], mm0
2334 jb davgAlp
2335 } // end _asm block
2336 }
2337 break;
2338 } // end switch ( bpp )
2339
// Scalar cleanup: decode the tail bytes (MMXLength .. FullLength-1)
// that did not fill a whole 8-byte MMX chunk, then issue emms so the
// FPU stack is usable again.
2340 _asm {
2341 // MMX acceleration complete now do clean-up
2342 // Check if any remaining bytes left to decode
2343 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2344 mov edi, row // edi ==> Avg(x)
2345 cmp ebx, FullLength // Test if offset at end of array
2346 jnb davgend
2347 // Do Average decode for remaining bytes
2348 mov esi, prev_row // esi ==> Prior(x)
2349 mov edx, edi
2350 xor ecx, ecx // zero ecx before using cl & cx in loop below
2351 sub edx, bpp // edx ==> Raw(x-bpp)
2352 davglp2:
2353 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2354 xor eax, eax
2355 mov cl, [esi + ebx] // load cl with Prior(x)
2356 mov al, [edx + ebx] // load al with Raw(x-bpp)
2357 add ax, cx
2358 inc ebx
2359 shr ax, 1 // divide by 2
2360 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2361 cmp ebx, FullLength // Check if at end of array
2362 mov [edi+ebx-1], al // Write back Raw(x);
2363 // mov does not affect flags; -1 to offset inc ebx
2364 jb davglp2
2365 davgend:
2366 emms // End MMX instructions; prep for possible FP instrs.
2367 } // end _asm block
2368 }
2369
2370 // Optimized code for PNG Paeth filter decoder
2371 void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info,png_bytep row,png_bytep prev_row)2372 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2373 png_bytep prev_row)
2374 {
2375 // These variables are declared
2376 // here to ensure alignment on 8-byte boundaries.
2377 union uAll ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
2378
2379 png_uint_32 FullLength;
2380 png_uint_32 MMXLength;
2381 //png_uint_32 len;
2382 int bpp;
2383 int diff;
2384 //int ptemp;
2385 int patemp, pbtemp, pctemp;
2386
2387 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2388 FullLength = row_info->rowbytes; // # of bytes to filter
2389 _asm
2390 {
2391 xor ebx, ebx // ebx ==> x offset
2392 mov edi, row
2393 xor edx, edx // edx ==> x-bpp offset
2394 mov esi, prev_row
2395 xor eax, eax
2396
2397 // Compute the Raw value for the first bpp bytes
2398 // Note: the formula works out to be always
2399 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2400 dpthrlp:
2401 mov al, [edi + ebx]
2402 add al, [esi + ebx]
2403 inc ebx
2404 cmp ebx, bpp
2405 mov [edi + ebx - 1], al
2406 jb dpthrlp
2407 // get # of bytes to alignment
2408 mov diff, edi // take start of row
2409 add diff, ebx // add bpp
2410 xor ecx, ecx
2411 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2412 and diff, 0xfffffff8 // mask to alignment boundary
2413 sub diff, edi // subtract from start ==> value ebx at alignment
2414 jz dpthgo
2415 // fix alignment
2416 dpthlp1:
2417 xor eax, eax
2418 // pav = p - a = (a + b - c) - a = b - c
2419 mov al, [esi + ebx] // load Prior(x) into al
2420 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2421 sub eax, ecx // subtract Prior(x-bpp)
2422 mov patemp, eax // Save pav for later use
2423 xor eax, eax
2424 // pbv = p - b = (a + b - c) - b = a - c
2425 mov al, [edi + edx] // load Raw(x-bpp) into al
2426 sub eax, ecx // subtract Prior(x-bpp)
2427 mov ecx, eax
2428 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2429 add eax, patemp // pcv = pav + pbv
2430 // pc = abs(pcv)
2431 test eax, 0x80000000
2432 jz dpthpca
2433 neg eax // reverse sign of neg values
2434 dpthpca:
2435 mov pctemp, eax // save pc for later use
2436 // pb = abs(pbv)
2437 test ecx, 0x80000000
2438 jz dpthpba
2439 neg ecx // reverse sign of neg values
2440 dpthpba:
2441 mov pbtemp, ecx // save pb for later use
2442 // pa = abs(pav)
2443 mov eax, patemp
2444 test eax, 0x80000000
2445 jz dpthpaa
2446 neg eax // reverse sign of neg values
2447 dpthpaa:
2448 mov patemp, eax // save pa for later use
2449 // test if pa <= pb
2450 cmp eax, ecx
2451 jna dpthabb
2452 // pa > pb; now test if pb <= pc
2453 cmp ecx, pctemp
2454 jna dpthbbc
2455 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2456 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2457 jmp dpthpaeth
2458 dpthbbc:
2459 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2460 mov cl, [esi + ebx] // load Prior(x) into cl
2461 jmp dpthpaeth
2462 dpthabb:
2463 // pa <= pb; now test if pa <= pc
2464 cmp eax, pctemp
2465 jna dpthabc
2466 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2467 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2468 jmp dpthpaeth
2469 dpthabc:
2470 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2471 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2472 dpthpaeth:
2473 inc ebx
2474 inc edx
2475 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2476 add [edi + ebx - 1], cl
2477 cmp ebx, diff
2478 jb dpthlp1
2479 dpthgo:
2480 mov ecx, FullLength
2481 mov eax, ecx
2482 sub eax, ebx // subtract alignment fix
2483 and eax, 0x00000007 // calc bytes over mult of 8
2484 sub ecx, eax // drop over bytes from original length
2485 mov MMXLength, ecx
2486 } // end _asm block
2487 // Now do the math for the rest of the row
2488 switch ( bpp )
2489 {
2490 case 3:
2491 {
2492 ActiveMask.use = 0x0000000000ffffff;
2493 ActiveMaskEnd.use = 0xffff000000000000;
2494 ShiftBpp.use = 24; // == bpp(3) * 8
2495 ShiftRem.use = 40; // == 64 - 24
2496 _asm
2497 {
2498 mov ebx, diff
2499 mov edi, row
2500 mov esi, prev_row
2501 pxor mm0, mm0
2502 // PRIME the pump (load the first Raw(x-bpp) data set
2503 movq mm1, [edi+ebx-8]
2504 dpth3lp:
2505 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2506 movq mm2, [esi + ebx] // load b=Prior(x)
2507             punpcklbw mm1, mm0       // Unpack Low bytes of a
2508 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2509             punpcklbw mm2, mm0       // Unpack Low bytes of b
2510 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2511 // pav = p - a = (a + b - c) - a = b - c
2512 movq mm4, mm2
2513             punpcklbw mm3, mm0       // Unpack Low bytes of c
2514 // pbv = p - b = (a + b - c) - b = a - c
2515 movq mm5, mm1
2516 psubw mm4, mm3
2517 pxor mm7, mm7
2518 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2519 movq mm6, mm4
2520 psubw mm5, mm3
2521
2522 // pa = abs(p-a) = abs(pav)
2523 // pb = abs(p-b) = abs(pbv)
2524 // pc = abs(p-c) = abs(pcv)
2525 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2526 paddw mm6, mm5
2527 pand mm0, mm4 // Only pav bytes < 0 in mm7
2528 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2529 psubw mm4, mm0
2530 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2531 psubw mm4, mm0
2532 psubw mm5, mm7
2533 pxor mm0, mm0
2534 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2535 pand mm0, mm6 // Only pav bytes < 0 in mm7
2536 psubw mm5, mm7
2537 psubw mm6, mm0
2538 // test pa <= pb
2539 movq mm7, mm4
2540 psubw mm6, mm0
2541 pcmpgtw mm7, mm5 // pa > pb?
2542 movq mm0, mm7
2543 // use mm7 mask to merge pa & pb
2544 pand mm5, mm7
2545 // use mm0 mask copy to merge a & b
2546 pand mm2, mm0
2547 pandn mm7, mm4
2548 pandn mm0, mm1
2549 paddw mm7, mm5
2550 paddw mm0, mm2
2551 // test ((pa <= pb)? pa:pb) <= pc
2552 pcmpgtw mm7, mm6 // pab > pc?
2553 pxor mm1, mm1
2554 pand mm3, mm7
2555 pandn mm7, mm0
2556 paddw mm7, mm3
2557 pxor mm0, mm0
2558 packuswb mm7, mm1
2559 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2560 pand mm7, ActiveMask
2561 movq mm2, mm3 // load b=Prior(x) step 1
2562 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2563 punpcklbw mm3, mm0 // Unpack High bytes of c
2564 movq [edi + ebx], mm7 // write back updated value
2565 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2566 // Now do Paeth for 2nd set of bytes (3-5)
2567 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2568 punpcklbw mm1, mm0 // Unpack High bytes of a
2569 pxor mm7, mm7
2570 punpcklbw mm2, mm0 // Unpack High bytes of b
2571 // pbv = p - b = (a + b - c) - b = a - c
2572 movq mm5, mm1
2573 // pav = p - a = (a + b - c) - a = b - c
2574 movq mm4, mm2
2575 psubw mm5, mm3
2576 psubw mm4, mm3
2577 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2578 // pav + pbv = pbv + pav
2579 movq mm6, mm5
2580 paddw mm6, mm4
2581
2582 // pa = abs(p-a) = abs(pav)
2583 // pb = abs(p-b) = abs(pbv)
2584 // pc = abs(p-c) = abs(pcv)
2585 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2586 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2587 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2588 pand mm7, mm4 // Only pav bytes < 0 in mm7
2589 psubw mm5, mm0
2590 psubw mm4, mm7
2591 psubw mm5, mm0
2592 psubw mm4, mm7
2593 pxor mm0, mm0
2594 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2595 pand mm0, mm6 // Only pav bytes < 0 in mm7
2596 psubw mm6, mm0
2597 // test pa <= pb
2598 movq mm7, mm4
2599 psubw mm6, mm0
2600 pcmpgtw mm7, mm5 // pa > pb?
2601 movq mm0, mm7
2602 // use mm7 mask to merge pa & pb
2603 pand mm5, mm7
2604 // use mm0 mask copy to merge a & b
2605 pand mm2, mm0
2606 pandn mm7, mm4
2607 pandn mm0, mm1
2608 paddw mm7, mm5
2609 paddw mm0, mm2
2610 // test ((pa <= pb)? pa:pb) <= pc
2611 pcmpgtw mm7, mm6 // pab > pc?
2612 movq mm2, [esi + ebx] // load b=Prior(x)
2613 pand mm3, mm7
2614 pandn mm7, mm0
2615 pxor mm1, mm1
2616 paddw mm7, mm3
2617 pxor mm0, mm0
2618 packuswb mm7, mm1
2619 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2620 pand mm7, ActiveMask
2621 punpckhbw mm2, mm0 // Unpack High bytes of b
2622 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2623 // pav = p - a = (a + b - c) - a = b - c
2624 movq mm4, mm2
2625 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2626 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2627 movq [edi + ebx], mm7 // write back updated value
2628 movq mm1, mm7
2629 punpckhbw mm3, mm0 // Unpack High bytes of c
2630 psllq mm1, ShiftBpp // Shift bytes
2631 // Now mm1 will be used as Raw(x-bpp)
2632 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2633 pxor mm7, mm7
2634 punpckhbw mm1, mm0 // Unpack High bytes of a
2635 psubw mm4, mm3
2636 // pbv = p - b = (a + b - c) - b = a - c
2637 movq mm5, mm1
2638 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2639 movq mm6, mm4
2640 psubw mm5, mm3
2641 pxor mm0, mm0
2642 paddw mm6, mm5
2643
2644 // pa = abs(p-a) = abs(pav)
2645 // pb = abs(p-b) = abs(pbv)
2646 // pc = abs(p-c) = abs(pcv)
2647 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2648 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2649 pand mm0, mm4 // Only pav bytes < 0 in mm7
2650 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2651 psubw mm4, mm0
2652 psubw mm5, mm7
2653 psubw mm4, mm0
2654 psubw mm5, mm7
2655 pxor mm0, mm0
2656 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2657 pand mm0, mm6 // Only pav bytes < 0 in mm7
2658 psubw mm6, mm0
2659 // test pa <= pb
2660 movq mm7, mm4
2661 psubw mm6, mm0
2662 pcmpgtw mm7, mm5 // pa > pb?
2663 movq mm0, mm7
2664 // use mm0 mask copy to merge a & b
2665 pand mm2, mm0
2666 // use mm7 mask to merge pa & pb
2667 pand mm5, mm7
2668 pandn mm0, mm1
2669 pandn mm7, mm4
2670 paddw mm0, mm2
2671 paddw mm7, mm5
2672 // test ((pa <= pb)? pa:pb) <= pc
2673 pcmpgtw mm7, mm6 // pab > pc?
2674 pand mm3, mm7
2675 pandn mm7, mm0
2676 paddw mm7, mm3
2677 pxor mm1, mm1
2678 packuswb mm1, mm7
2679 // Step ebx to next set of 8 bytes and repeat loop til done
2680 add ebx, 8
2681 pand mm1, ActiveMaskEnd
2682 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2683
2684 cmp ebx, MMXLength
2685 pxor mm0, mm0 // pxor does not affect flags
2686 movq [edi + ebx - 8], mm1 // write back updated value
2687 // mm1 will be used as Raw(x-bpp) next loop
2688 // mm3 ready to be used as Prior(x-bpp) next loop
2689 jb dpth3lp
2690 } // end _asm block
2691 }
2692 break;
2693
2694 case 6:
2695 case 7:
2696 case 5:
2697 {
2698 ActiveMask.use = 0x00000000ffffffff;
2699 ActiveMask2.use = 0xffffffff00000000;
2700 ShiftBpp.use = bpp << 3; // == bpp * 8
2701 ShiftRem.use = 64 - ShiftBpp.use;
2702 _asm
2703 {
2704 mov ebx, diff
2705 mov edi, row
2706 mov esi, prev_row
2707 // PRIME the pump (load the first Raw(x-bpp) data set
2708 movq mm1, [edi+ebx-8]
2709 pxor mm0, mm0
2710 dpth6lp:
2711 // Must shift to position Raw(x-bpp) data
2712 psrlq mm1, ShiftRem
2713 // Do first set of 4 bytes
2714 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2715 punpcklbw mm1, mm0 // Unpack Low bytes of a
2716 movq mm2, [esi + ebx] // load b=Prior(x)
2717 punpcklbw mm2, mm0 // Unpack Low bytes of b
2718 // Must shift to position Prior(x-bpp) data
2719 psrlq mm3, ShiftRem
2720 // pav = p - a = (a + b - c) - a = b - c
2721 movq mm4, mm2
2722 punpcklbw mm3, mm0 // Unpack Low bytes of c
2723 // pbv = p - b = (a + b - c) - b = a - c
2724 movq mm5, mm1
2725 psubw mm4, mm3
2726 pxor mm7, mm7
2727 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2728 movq mm6, mm4
2729 psubw mm5, mm3
2730 // pa = abs(p-a) = abs(pav)
2731 // pb = abs(p-b) = abs(pbv)
2732 // pc = abs(p-c) = abs(pcv)
2733 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2734 paddw mm6, mm5
2735 pand mm0, mm4 // Only pav bytes < 0 in mm7
2736 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2737 psubw mm4, mm0
2738 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2739 psubw mm4, mm0
2740 psubw mm5, mm7
2741 pxor mm0, mm0
2742 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2743 pand mm0, mm6 // Only pav bytes < 0 in mm7
2744 psubw mm5, mm7
2745 psubw mm6, mm0
2746 // test pa <= pb
2747 movq mm7, mm4
2748 psubw mm6, mm0
2749 pcmpgtw mm7, mm5 // pa > pb?
2750 movq mm0, mm7
2751 // use mm7 mask to merge pa & pb
2752 pand mm5, mm7
2753 // use mm0 mask copy to merge a & b
2754 pand mm2, mm0
2755 pandn mm7, mm4
2756 pandn mm0, mm1
2757 paddw mm7, mm5
2758 paddw mm0, mm2
2759 // test ((pa <= pb)? pa:pb) <= pc
2760 pcmpgtw mm7, mm6 // pab > pc?
2761 pxor mm1, mm1
2762 pand mm3, mm7
2763 pandn mm7, mm0
2764 paddw mm7, mm3
2765 pxor mm0, mm0
2766 packuswb mm7, mm1
2767 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2768 pand mm7, ActiveMask
2769 psrlq mm3, ShiftRem
2770 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2771 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2772 movq mm6, mm2
2773 movq [edi + ebx], mm7 // write back updated value
2774 movq mm1, [edi+ebx-8]
2775 psllq mm6, ShiftBpp
2776 movq mm5, mm7
2777 psrlq mm1, ShiftRem
2778 por mm3, mm6
2779 psllq mm5, ShiftBpp
2780 punpckhbw mm3, mm0 // Unpack High bytes of c
2781 por mm1, mm5
2782 // Do second set of 4 bytes
2783 punpckhbw mm2, mm0 // Unpack High bytes of b
2784 punpckhbw mm1, mm0 // Unpack High bytes of a
2785 // pav = p - a = (a + b - c) - a = b - c
2786 movq mm4, mm2
2787 // pbv = p - b = (a + b - c) - b = a - c
2788 movq mm5, mm1
2789 psubw mm4, mm3
2790 pxor mm7, mm7
2791 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2792 movq mm6, mm4
2793 psubw mm5, mm3
2794 // pa = abs(p-a) = abs(pav)
2795 // pb = abs(p-b) = abs(pbv)
2796 // pc = abs(p-c) = abs(pcv)
2797 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2798 paddw mm6, mm5
2799 pand mm0, mm4 // Only pav bytes < 0 in mm7
2800 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2801 psubw mm4, mm0
2802 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2803 psubw mm4, mm0
2804 psubw mm5, mm7
2805 pxor mm0, mm0
2806 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2807 pand mm0, mm6 // Only pav bytes < 0 in mm7
2808 psubw mm5, mm7
2809 psubw mm6, mm0
2810 // test pa <= pb
2811 movq mm7, mm4
2812 psubw mm6, mm0
2813 pcmpgtw mm7, mm5 // pa > pb?
2814 movq mm0, mm7
2815 // use mm7 mask to merge pa & pb
2816 pand mm5, mm7
2817 // use mm0 mask copy to merge a & b
2818 pand mm2, mm0
2819 pandn mm7, mm4
2820 pandn mm0, mm1
2821 paddw mm7, mm5
2822 paddw mm0, mm2
2823 // test ((pa <= pb)? pa:pb) <= pc
2824 pcmpgtw mm7, mm6 // pab > pc?
2825 pxor mm1, mm1
2826 pand mm3, mm7
2827 pandn mm7, mm0
2828 pxor mm1, mm1
2829 paddw mm7, mm3
2830 pxor mm0, mm0
2831             // Step ebx to next set of 8 bytes and repeat loop til done
2832 add ebx, 8
2833 packuswb mm1, mm7
2834 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2835 cmp ebx, MMXLength
2836 movq [edi + ebx - 8], mm1 // write back updated value
2837 // mm1 will be used as Raw(x-bpp) next loop
2838 jb dpth6lp
2839 } // end _asm block
2840 }
2841 break;
2842
2843 case 4:
2844 {
2845 ActiveMask.use = 0x00000000ffffffff;
2846 _asm {
2847 mov ebx, diff
2848 mov edi, row
2849 mov esi, prev_row
2850 pxor mm0, mm0
2851 // PRIME the pump (load the first Raw(x-bpp) data set
2852 movq mm1, [edi+ebx-8] // Only time should need to read
2853 // a=Raw(x-bpp) bytes
2854 dpth4lp:
2855 // Do first set of 4 bytes
2856 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2857             punpckhbw mm1, mm0       // Unpack High bytes of a
2858 movq mm2, [esi + ebx] // load b=Prior(x)
2859             punpcklbw mm2, mm0       // Unpack Low bytes of b
2860 // pav = p - a = (a + b - c) - a = b - c
2861 movq mm4, mm2
2862 punpckhbw mm3, mm0 // Unpack High bytes of c
2863 // pbv = p - b = (a + b - c) - b = a - c
2864 movq mm5, mm1
2865 psubw mm4, mm3
2866 pxor mm7, mm7
2867 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2868 movq mm6, mm4
2869 psubw mm5, mm3
2870 // pa = abs(p-a) = abs(pav)
2871 // pb = abs(p-b) = abs(pbv)
2872 // pc = abs(p-c) = abs(pcv)
2873 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2874 paddw mm6, mm5
2875 pand mm0, mm4 // Only pav bytes < 0 in mm7
2876 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2877 psubw mm4, mm0
2878 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2879 psubw mm4, mm0
2880 psubw mm5, mm7
2881 pxor mm0, mm0
2882 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2883 pand mm0, mm6 // Only pav bytes < 0 in mm7
2884 psubw mm5, mm7
2885 psubw mm6, mm0
2886 // test pa <= pb
2887 movq mm7, mm4
2888 psubw mm6, mm0
2889 pcmpgtw mm7, mm5 // pa > pb?
2890 movq mm0, mm7
2891 // use mm7 mask to merge pa & pb
2892 pand mm5, mm7
2893 // use mm0 mask copy to merge a & b
2894 pand mm2, mm0
2895 pandn mm7, mm4
2896 pandn mm0, mm1
2897 paddw mm7, mm5
2898 paddw mm0, mm2
2899 // test ((pa <= pb)? pa:pb) <= pc
2900 pcmpgtw mm7, mm6 // pab > pc?
2901 pxor mm1, mm1
2902 pand mm3, mm7
2903 pandn mm7, mm0
2904 paddw mm7, mm3
2905 pxor mm0, mm0
2906 packuswb mm7, mm1
2907 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2908 pand mm7, ActiveMask
2909 movq mm2, mm3 // load b=Prior(x) step 1
2910 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2911             punpcklbw mm3, mm0       // Unpack Low bytes of c
2912 movq [edi + ebx], mm7 // write back updated value
2913 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2914 // Do second set of 4 bytes
2915             punpckhbw mm2, mm0       // Unpack High bytes of b
2916 punpcklbw mm1, mm0 // Unpack Low bytes of a
2917 // pav = p - a = (a + b - c) - a = b - c
2918 movq mm4, mm2
2919 // pbv = p - b = (a + b - c) - b = a - c
2920 movq mm5, mm1
2921 psubw mm4, mm3
2922 pxor mm7, mm7
2923 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2924 movq mm6, mm4
2925 psubw mm5, mm3
2926 // pa = abs(p-a) = abs(pav)
2927 // pb = abs(p-b) = abs(pbv)
2928 // pc = abs(p-c) = abs(pcv)
2929 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2930 paddw mm6, mm5
2931 pand mm0, mm4 // Only pav bytes < 0 in mm7
2932 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2933 psubw mm4, mm0
2934 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2935 psubw mm4, mm0
2936 psubw mm5, mm7
2937 pxor mm0, mm0
2938 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2939 pand mm0, mm6 // Only pav bytes < 0 in mm7
2940 psubw mm5, mm7
2941 psubw mm6, mm0
2942 // test pa <= pb
2943 movq mm7, mm4
2944 psubw mm6, mm0
2945 pcmpgtw mm7, mm5 // pa > pb?
2946 movq mm0, mm7
2947 // use mm7 mask to merge pa & pb
2948 pand mm5, mm7
2949 // use mm0 mask copy to merge a & b
2950 pand mm2, mm0
2951 pandn mm7, mm4
2952 pandn mm0, mm1
2953 paddw mm7, mm5
2954 paddw mm0, mm2
2955 // test ((pa <= pb)? pa:pb) <= pc
2956 pcmpgtw mm7, mm6 // pab > pc?
2957 pxor mm1, mm1
2958 pand mm3, mm7
2959 pandn mm7, mm0
2960 pxor mm1, mm1
2961 paddw mm7, mm3
2962 pxor mm0, mm0
2963             // Step ebx to next set of 8 bytes and repeat loop til done
2964 add ebx, 8
2965 packuswb mm1, mm7
2966 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2967 cmp ebx, MMXLength
2968 movq [edi + ebx - 8], mm1 // write back updated value
2969 // mm1 will be used as Raw(x-bpp) next loop
2970 jb dpth4lp
2971 } // end _asm block
2972 }
2973 break;
2974 case 8: // bpp == 8
2975 {
2976 ActiveMask.use = 0x00000000ffffffff;
2977 _asm {
2978 mov ebx, diff
2979 mov edi, row
2980 mov esi, prev_row
2981 pxor mm0, mm0
2982 // PRIME the pump (load the first Raw(x-bpp) data set
2983 movq mm1, [edi+ebx-8] // Only time should need to read
2984 // a=Raw(x-bpp) bytes
2985 dpth8lp:
2986 // Do first set of 4 bytes
2987 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2988 punpcklbw mm1, mm0 // Unpack Low bytes of a
2989 movq mm2, [esi + ebx] // load b=Prior(x)
2990 punpcklbw mm2, mm0 // Unpack Low bytes of b
2991 // pav = p - a = (a + b - c) - a = b - c
2992 movq mm4, mm2
2993 punpcklbw mm3, mm0 // Unpack Low bytes of c
2994 // pbv = p - b = (a + b - c) - b = a - c
2995 movq mm5, mm1
2996 psubw mm4, mm3
2997 pxor mm7, mm7
2998 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2999 movq mm6, mm4
3000 psubw mm5, mm3
3001 // pa = abs(p-a) = abs(pav)
3002 // pb = abs(p-b) = abs(pbv)
3003 // pc = abs(p-c) = abs(pcv)
3004 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3005 paddw mm6, mm5
3006 pand mm0, mm4 // Only pav bytes < 0 in mm7
3007 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3008 psubw mm4, mm0
3009 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3010 psubw mm4, mm0
3011 psubw mm5, mm7
3012 pxor mm0, mm0
3013 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3014 pand mm0, mm6 // Only pav bytes < 0 in mm7
3015 psubw mm5, mm7
3016 psubw mm6, mm0
3017 // test pa <= pb
3018 movq mm7, mm4
3019 psubw mm6, mm0
3020 pcmpgtw mm7, mm5 // pa > pb?
3021 movq mm0, mm7
3022 // use mm7 mask to merge pa & pb
3023 pand mm5, mm7
3024 // use mm0 mask copy to merge a & b
3025 pand mm2, mm0
3026 pandn mm7, mm4
3027 pandn mm0, mm1
3028 paddw mm7, mm5
3029 paddw mm0, mm2
3030 // test ((pa <= pb)? pa:pb) <= pc
3031 pcmpgtw mm7, mm6 // pab > pc?
3032 pxor mm1, mm1
3033 pand mm3, mm7
3034 pandn mm7, mm0
3035 paddw mm7, mm3
3036 pxor mm0, mm0
3037 packuswb mm7, mm1
3038 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3039 pand mm7, ActiveMask
3040 movq mm2, [esi + ebx] // load b=Prior(x)
3041 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3042 punpckhbw mm3, mm0 // Unpack High bytes of c
3043 movq [edi + ebx], mm7 // write back updated value
3044 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3045
3046 // Do second set of 4 bytes
3047 punpckhbw mm2, mm0 // Unpack High bytes of b
3048 punpckhbw mm1, mm0 // Unpack High bytes of a
3049 // pav = p - a = (a + b - c) - a = b - c
3050 movq mm4, mm2
3051 // pbv = p - b = (a + b - c) - b = a - c
3052 movq mm5, mm1
3053 psubw mm4, mm3
3054 pxor mm7, mm7
3055 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3056 movq mm6, mm4
3057 psubw mm5, mm3
3058 // pa = abs(p-a) = abs(pav)
3059 // pb = abs(p-b) = abs(pbv)
3060 // pc = abs(p-c) = abs(pcv)
3061 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3062 paddw mm6, mm5
3063 pand mm0, mm4 // Only pav bytes < 0 in mm7
3064 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3065 psubw mm4, mm0
3066 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3067 psubw mm4, mm0
3068 psubw mm5, mm7
3069 pxor mm0, mm0
3070 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3071 pand mm0, mm6 // Only pav bytes < 0 in mm7
3072 psubw mm5, mm7
3073 psubw mm6, mm0
3074 // test pa <= pb
3075 movq mm7, mm4
3076 psubw mm6, mm0
3077 pcmpgtw mm7, mm5 // pa > pb?
3078 movq mm0, mm7
3079 // use mm7 mask to merge pa & pb
3080 pand mm5, mm7
3081 // use mm0 mask copy to merge a & b
3082 pand mm2, mm0
3083 pandn mm7, mm4
3084 pandn mm0, mm1
3085 paddw mm7, mm5
3086 paddw mm0, mm2
3087 // test ((pa <= pb)? pa:pb) <= pc
3088 pcmpgtw mm7, mm6 // pab > pc?
3089 pxor mm1, mm1
3090 pand mm3, mm7
3091 pandn mm7, mm0
3092 pxor mm1, mm1
3093 paddw mm7, mm3
3094 pxor mm0, mm0
3095             // Step ebx to next set of 8 bytes and repeat loop til done
3096 add ebx, 8
3097 packuswb mm1, mm7
3098 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3099 cmp ebx, MMXLength
3100 movq [edi + ebx - 8], mm1 // write back updated value
3101 // mm1 will be used as Raw(x-bpp) next loop
3102 jb dpth8lp
3103 } // end _asm block
3104 }
3105 break;
3106
3107 case 1: // bpp = 1
3108 case 2: // bpp = 2
3109 default: // bpp > 8
3110 {
3111 _asm {
3112 mov ebx, diff
3113 cmp ebx, FullLength
3114 jnb dpthdend
3115 mov edi, row
3116 mov esi, prev_row
3117 // Do Paeth decode for remaining bytes
3118 mov edx, ebx
3119 xor ecx, ecx // zero ecx before using cl & cx in loop below
3120 sub edx, bpp // Set edx = ebx - bpp
3121 dpthdlp:
3122 xor eax, eax
3123 // pav = p - a = (a + b - c) - a = b - c
3124 mov al, [esi + ebx] // load Prior(x) into al
3125 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3126 sub eax, ecx // subtract Prior(x-bpp)
3127 mov patemp, eax // Save pav for later use
3128 xor eax, eax
3129 // pbv = p - b = (a + b - c) - b = a - c
3130 mov al, [edi + edx] // load Raw(x-bpp) into al
3131 sub eax, ecx // subtract Prior(x-bpp)
3132 mov ecx, eax
3133 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3134 add eax, patemp // pcv = pav + pbv
3135 // pc = abs(pcv)
3136 test eax, 0x80000000
3137 jz dpthdpca
3138 neg eax // reverse sign of neg values
3139 dpthdpca:
3140 mov pctemp, eax // save pc for later use
3141 // pb = abs(pbv)
3142 test ecx, 0x80000000
3143 jz dpthdpba
3144 neg ecx // reverse sign of neg values
3145 dpthdpba:
3146 mov pbtemp, ecx // save pb for later use
3147 // pa = abs(pav)
3148 mov eax, patemp
3149 test eax, 0x80000000
3150 jz dpthdpaa
3151 neg eax // reverse sign of neg values
3152 dpthdpaa:
3153 mov patemp, eax // save pa for later use
3154 // test if pa <= pb
3155 cmp eax, ecx
3156 jna dpthdabb
3157 // pa > pb; now test if pb <= pc
3158 cmp ecx, pctemp
3159 jna dpthdbbc
3160 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3161 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3162 jmp dpthdpaeth
3163 dpthdbbc:
3164 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3165 mov cl, [esi + ebx] // load Prior(x) into cl
3166 jmp dpthdpaeth
3167 dpthdabb:
3168 // pa <= pb; now test if pa <= pc
3169 cmp eax, pctemp
3170 jna dpthdabc
3171 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3172 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3173 jmp dpthdpaeth
3174 dpthdabc:
3175 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3176 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3177 dpthdpaeth:
3178 inc ebx
3179 inc edx
3180 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3181 add [edi + ebx - 1], cl
3182 cmp ebx, FullLength
3183 jb dpthdlp
3184 dpthdend:
3185 } // end _asm block
3186 }
3187 return; // No need to go further with this one
3188 } // end switch ( bpp )
3189 _asm
3190 {
3191 // MMX acceleration complete now do clean-up
3192 // Check if any remaining bytes left to decode
3193 mov ebx, MMXLength
3194 cmp ebx, FullLength
3195 jnb dpthend
3196 mov edi, row
3197 mov esi, prev_row
3198 // Do Paeth decode for remaining bytes
3199 mov edx, ebx
3200 xor ecx, ecx // zero ecx before using cl & cx in loop below
3201 sub edx, bpp // Set edx = ebx - bpp
3202 dpthlp2:
3203 xor eax, eax
3204 // pav = p - a = (a + b - c) - a = b - c
3205 mov al, [esi + ebx] // load Prior(x) into al
3206 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3207 sub eax, ecx // subtract Prior(x-bpp)
3208 mov patemp, eax // Save pav for later use
3209 xor eax, eax
3210 // pbv = p - b = (a + b - c) - b = a - c
3211 mov al, [edi + edx] // load Raw(x-bpp) into al
3212 sub eax, ecx // subtract Prior(x-bpp)
3213 mov ecx, eax
3214 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3215 add eax, patemp // pcv = pav + pbv
3216 // pc = abs(pcv)
3217 test eax, 0x80000000
3218 jz dpthpca2
3219 neg eax // reverse sign of neg values
3220 dpthpca2:
3221 mov pctemp, eax // save pc for later use
3222 // pb = abs(pbv)
3223 test ecx, 0x80000000
3224 jz dpthpba2
3225 neg ecx // reverse sign of neg values
3226 dpthpba2:
3227 mov pbtemp, ecx // save pb for later use
3228 // pa = abs(pav)
3229 mov eax, patemp
3230 test eax, 0x80000000
3231 jz dpthpaa2
3232 neg eax // reverse sign of neg values
3233 dpthpaa2:
3234 mov patemp, eax // save pa for later use
3235 // test if pa <= pb
3236 cmp eax, ecx
3237 jna dpthabb2
3238 // pa > pb; now test if pb <= pc
3239 cmp ecx, pctemp
3240 jna dpthbbc2
3241 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3242 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3243 jmp dpthpaeth2
3244 dpthbbc2:
3245 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3246 mov cl, [esi + ebx] // load Prior(x) into cl
3247 jmp dpthpaeth2
3248 dpthabb2:
3249 // pa <= pb; now test if pa <= pc
3250 cmp eax, pctemp
3251 jna dpthabc2
3252 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3253 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3254 jmp dpthpaeth2
3255 dpthabc2:
3256 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3257 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3258 dpthpaeth2:
3259 inc ebx
3260 inc edx
3261 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3262 add [edi + ebx - 1], cl
3263 cmp ebx, FullLength
3264 jb dpthlp2
3265 dpthend:
3266 emms // End MMX instructions; prep for possible FP instrs.
3267 } // end _asm block
3268 }
3269
3270 // Optimized code for PNG Sub filter decoder
3271 void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info,png_bytep row)3272 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3273 {
3274 // These variables are declared
3275 // here to ensure alignment on 8-byte boundaries.
3276 union uAll ActiveMask, ShiftBpp, ShiftRem;
3277
3278 //int test;
3279 int bpp;
3280 png_uint_32 FullLength;
3281 png_uint_32 MMXLength;
3282 int diff;
3283
3284 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3285 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3286 _asm {
3287 mov edi, row
3288 mov esi, edi // lp = row
3289 add edi, bpp // rp = row + bpp
3290 xor eax, eax
3291 // get # of bytes to alignment
3292 mov diff, edi // take start of row
3293 add diff, 0xf // add 7 + 8 to incr past
3294 // alignment boundary
3295 xor ebx, ebx
3296 and diff, 0xfffffff8 // mask to alignment boundary
3297 sub diff, edi // subtract from start ==> value
3298 // ebx at alignment
3299 jz dsubgo
3300 // fix alignment
3301 dsublp1:
3302 mov al, [esi+ebx]
3303 add [edi+ebx], al
3304 inc ebx
3305 cmp ebx, diff
3306 jb dsublp1
3307 dsubgo:
3308 mov ecx, FullLength
3309 mov edx, ecx
3310 sub edx, ebx // subtract alignment fix
3311 and edx, 0x00000007 // calc bytes over mult of 8
3312 sub ecx, edx // drop over bytes from length
3313 mov MMXLength, ecx
3314 } // end _asm block
3315
3316 // Now do the math for the rest of the row
3317 switch ( bpp )
3318 {
3319 case 3:
3320 {
3321 ActiveMask.use = 0x0000ffffff000000;
3322 ShiftBpp.use = 24; // == 3 * 8
3323 ShiftRem.use = 40; // == 64 - 24
3324 _asm {
3325 mov edi, row
3326 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3327 mov esi, edi // lp = row
3328 add edi, bpp // rp = row + bpp
3329 movq mm6, mm7
3330 mov ebx, diff
3331 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3332 // byte group
3333 // PRIME the pump (load the first Raw(x-bpp) data set
3334 movq mm1, [edi+ebx-8]
3335 dsub3lp:
3336 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3337 // no need for mask; shift clears inactive bytes
3338 // Add 1st active group
3339 movq mm0, [edi+ebx]
3340 paddb mm0, mm1
3341 // Add 2nd active group
3342 movq mm1, mm0 // mov updated Raws to mm1
3343 psllq mm1, ShiftBpp // shift data to position correctly
3344 pand mm1, mm7 // mask to use only 2nd active group
3345 paddb mm0, mm1
3346 // Add 3rd active group
3347 movq mm1, mm0 // mov updated Raws to mm1
3348 psllq mm1, ShiftBpp // shift data to position correctly
3349 pand mm1, mm6 // mask to use only 3rd active group
3350 add ebx, 8
3351 paddb mm0, mm1
3352 cmp ebx, MMXLength
3353 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3354 // Prep for doing 1st add at top of loop
3355 movq mm1, mm0
3356 jb dsub3lp
3357 } // end _asm block
3358 }
3359 break;
3360
3361 case 1:
3362 {
3363 // Placed here just in case this is a duplicate of the
3364 // non-MMX code for the SUB filter in png_read_filter_row below
3365 //
3366 // png_bytep rp;
3367 // png_bytep lp;
3368 // png_uint_32 i;
3369 // bpp = (row_info->pixel_depth + 7) >> 3;
3370 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3371 // i < row_info->rowbytes; i++, rp++, lp++)
3372 // {
3373 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3374 // }
3375 _asm {
3376 mov ebx, diff
3377 mov edi, row
3378 cmp ebx, FullLength
3379 jnb dsub1end
3380 mov esi, edi // lp = row
3381 xor eax, eax
3382 add edi, bpp // rp = row + bpp
3383 dsub1lp:
3384 mov al, [esi+ebx]
3385 add [edi+ebx], al
3386 inc ebx
3387 cmp ebx, FullLength
3388 jb dsub1lp
3389 dsub1end:
3390 } // end _asm block
3391 }
3392 return;
3393
3394 case 6:
3395 case 7:
3396 case 4:
3397 case 5:
3398 {
3399 ShiftBpp.use = bpp << 3;
3400 ShiftRem.use = 64 - ShiftBpp.use;
3401 _asm {
3402 mov edi, row
3403 mov ebx, diff
3404 mov esi, edi // lp = row
3405 add edi, bpp // rp = row + bpp
3406 // PRIME the pump (load the first Raw(x-bpp) data set
3407 movq mm1, [edi+ebx-8]
3408 dsub4lp:
3409 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3410 // no need for mask; shift clears inactive bytes
3411 movq mm0, [edi+ebx]
3412 paddb mm0, mm1
3413 // Add 2nd active group
3414 movq mm1, mm0 // mov updated Raws to mm1
3415 psllq mm1, ShiftBpp // shift data to position correctly
3416 // there is no need for any mask
3417 // since shift clears inactive bits/bytes
3418 add ebx, 8
3419 paddb mm0, mm1
3420 cmp ebx, MMXLength
3421 movq [edi+ebx-8], mm0
3422 movq mm1, mm0 // Prep for doing 1st add at top of loop
3423 jb dsub4lp
3424 } // end _asm block
3425 }
3426 break;
3427
3428 case 2:
3429 {
3430 ActiveMask.use = 0x00000000ffff0000;
3431 ShiftBpp.use = 16; // == 2 * 8
3432 ShiftRem.use = 48; // == 64 - 16
3433 _asm {
3434 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3435 mov ebx, diff
3436 movq mm6, mm7
3437 mov edi, row
3438 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3439 // byte group
3440 mov esi, edi // lp = row
3441 movq mm5, mm6
3442 add edi, bpp // rp = row + bpp
3443 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3444 // byte group
3445 // PRIME the pump (load the first Raw(x-bpp) data set
3446 movq mm1, [edi+ebx-8]
3447 dsub2lp:
3448 // Add 1st active group
3449 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3450 // no need for mask; shift clears inactive
3451 // bytes
3452 movq mm0, [edi+ebx]
3453 paddb mm0, mm1
3454 // Add 2nd active group
3455 movq mm1, mm0 // mov updated Raws to mm1
3456 psllq mm1, ShiftBpp // shift data to position correctly
3457 pand mm1, mm7 // mask to use only 2nd active group
3458 paddb mm0, mm1
3459 // Add 3rd active group
3460 movq mm1, mm0 // mov updated Raws to mm1
3461 psllq mm1, ShiftBpp // shift data to position correctly
3462 pand mm1, mm6 // mask to use only 3rd active group
3463 paddb mm0, mm1
3464 // Add 4th active group
3465 movq mm1, mm0 // mov updated Raws to mm1
3466 psllq mm1, ShiftBpp // shift data to position correctly
3467 pand mm1, mm5 // mask to use only 4th active group
3468 add ebx, 8
3469 paddb mm0, mm1
3470 cmp ebx, MMXLength
3471 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3472 movq mm1, mm0 // Prep for doing 1st add at top of loop
3473 jb dsub2lp
3474 } // end _asm block
3475 }
3476 break;
3477 case 8:
3478 {
3479 _asm {
3480 mov edi, row
3481 mov ebx, diff
3482 mov esi, edi // lp = row
3483 add edi, bpp // rp = row + bpp
3484 mov ecx, MMXLength
3485 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3486 // Raw(x-bpp) data set
3487 and ecx, 0x0000003f // calc bytes over mult of 64
3488 dsub8lp:
3489 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3490 paddb mm0, mm7
3491 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3492 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3493 // Now mm0 will be used as Raw(x-bpp) for
3494 // the 2nd group of 8 bytes. This will be
3495 // repeated for each group of 8 bytes with
3496 // the 8th group being used as the Raw(x-bpp)
3497 // for the 1st group of the next loop.
3498 paddb mm1, mm0
3499 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3500 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3501 paddb mm2, mm1
3502 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3503 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3504 paddb mm3, mm2
3505 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3506 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3507 paddb mm4, mm3
3508 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3509 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3510 paddb mm5, mm4
3511 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3512 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3513 paddb mm6, mm5
3514 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3515 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3516 add ebx, 64
3517 paddb mm7, mm6
3518 cmp ebx, ecx
3519 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3520 jb dsub8lp
3521 cmp ebx, MMXLength
3522 jnb dsub8lt8
3523 dsub8lpA:
3524 movq mm0, [edi+ebx]
3525 add ebx, 8
3526 paddb mm0, mm7
3527 cmp ebx, MMXLength
3528 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3529 movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
3530 // be the new Raw(x-bpp) for the next loop
3531 jb dsub8lpA
3532 dsub8lt8:
3533 } // end _asm block
3534 }
3535 break;
3536
3537 default: // bpp greater than 8 bytes
3538 {
3539 _asm {
3540 mov ebx, diff
3541 mov edi, row
3542 mov esi, edi // lp = row
3543 add edi, bpp // rp = row + bpp
3544 dsubAlp:
3545 movq mm0, [edi+ebx]
3546 movq mm1, [esi+ebx]
3547 add ebx, 8
3548 paddb mm0, mm1
3549 cmp ebx, MMXLength
3550 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3551 // add ebx
3552 jb dsubAlp
3553 } // end _asm block
3554 }
3555 break;
3556
3557 } // end switch ( bpp )
3558
3559 _asm {
3560 mov ebx, MMXLength
3561 mov edi, row
3562 cmp ebx, FullLength
3563 jnb dsubend
3564 mov esi, edi // lp = row
3565 xor eax, eax
3566 add edi, bpp // rp = row + bpp
3567 dsublp2:
3568 mov al, [esi+ebx]
3569 add [edi+ebx], al
3570 inc ebx
3571 cmp ebx, FullLength
3572 jb dsublp2
3573 dsubend:
3574 emms // End MMX instructions; prep for possible FP instrs.
3575 } // end _asm block
3576 }
3577
3578 // Optimized code for PNG Up filter decoder
/* Optimized code for PNG Up filter decoder:
 *    Raw(x) = Filt(x) + Prior(x)   (implicitly mod 256 via byte adds)
 * Adds each byte of the previously reconstructed row (prev_row) to the
 * corresponding byte of the current row, in place.  Strategy: a scalar
 * byte loop first advances to an 8-byte-aligned offset within `row`,
 * then an unrolled MMX loop processes 64 bytes per iteration, then an
 * 8-byte MMX loop, then a final scalar loop for the remainder.
 */
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;
   len = row_info->rowbytes;     // # of bytes to filter
   _asm {
      mov edi, row               // edi -> current row (read/write)
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx               // ebx = running byte offset, starts at 0
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8        // ecx = row address rounded up to mult of 8
      mov esi, prev_row          // esi -> previous (already decoded) row
      sub ecx, edi               // ecx = # of bytes until edi is 8-aligned
      jz dupgo                   // already aligned; skip scalar prologue
      // fix alignment
   duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]          // al = row[ebx] + prev_row[ebx] (mod 256)
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
   dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x0000003f        // calc bytes over mult of 64
      sub ecx, edx               // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   duploop:
      movq mm1, [esi+ebx]        // eight prev_row bytes
      movq mm0, [edi+ebx]        // eight row bytes
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1             // per-byte add, wraps mod 256 (no saturation)
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6      // (+56)movq does not affect flags;
                                 // -8 to offset add ebx
      jb duploop

      cmp edx, 0                 // Test for bytes over mult of 64
      jz dupend


      // 2 lines added by lcreeve at netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8                 //test for less than 8 bytes
      jb duplt8


      add ecx, edx
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
   duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0      // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0                 // Test for bytes over mult of 8
      jz dupend
   duplt8:
      xor eax, eax
      add ecx, edx               // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
   duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
   dupend:
      // Conversion of filtered row completed
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
3691
3692
3693 // Optimized png_read_filter_row routines
/* Optimized png_read_filter_row routines.
 *
 * Undo the PNG filter of one row, in place in `row`.  Dispatches on
 * `filter` (PNG filter-type byte 0..4): for each type it uses the MMX
 * implementation when the runtime asm flags and size thresholds permit,
 * otherwise the portable C fallback in the matching `else` branch.
 *
 * png_ptr  - libpng read struct (asm_flags/thresholds; warning sink)
 * row_info - pixel depth and rowbytes for this row
 * row      - filtered bytes on entry; reconstructed raw bytes on exit
 * prev_row - reconstructed bytes of the previous row (all zero for the
 *            first row -- NOTE(review): guaranteed by caller, not here)
 * filter   - PNG filter type for this row
 */
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();   /* sets the file-static mmx_supported flag */
   }

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row\n");
   /* Build a short label ("sub-MMX", "up-x86", ...) for the debug trace;
    * filnm[10] exactly fits the longest label, "Paeth-x86" + NUL. */
   switch (filter)
   {
      case 0: png_snprintf(filnm, 10, "none");
         break;
#if !defined(PNG_1_0_X)
      case 1: png_snprintf(filnm, 10, "sub-%s",
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
         break;
      case 2: png_snprintf(filnm, 10, "up-%s",
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
         break;
      case 3: png_snprintf(filnm, 10, "avg-%s",
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
         break;
      case 4: png_snprintf(filnm, 10, "Paeth-%s",
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
         break;
#else
      case 1: png_snprintf(filnm, 10, "sub");
         break;
      case 2: png_snprintf(filnm, 10, "up");
         break;
      case 3: png_snprintf(filnm, 10, "avg");
         break;
      case 4: png_snprintf(filnm, 10, "Paeth");
         break;
#endif
      default: png_snprintf(filnm, 10, "unknw");
         break;
   }
   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0,"len=%8d, ", row_info->rowbytes);
#endif /* PNG_DEBUG */

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         /* Raw(x) = Filt(x): nothing to undo. */
         break;

      case PNG_FILTER_VALUE_SUB:
      {
         /* Sub filter: Raw(x) = Filt(x) + Raw(x-bpp), mod 256. */
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;   /* current byte */
            png_bytep lp = row;         /* byte bpp positions to the left */

            /* First bpp bytes have no left neighbor and stay unchanged. */
            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_UP:
      {
         /* Up filter: Raw(x) = Filt(x) + Prior(x), mod 256. */
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_AVG:
      {
         /* Average filter:
          *   Raw(x) = Filt(x) + floor((Raw(x-bpp) + Prior(x)) / 2). */
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            /* First bpp bytes: Raw(x-bpp) is 0, so add Prior(x)/2 only. */
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            /* Remaining bytes: average the left and above neighbors. */
            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_PAETH:
      {
         /* Paeth filter: Raw(x) = Filt(x) + PaethPredictor(a, b, c)
          * where a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp). */
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;         /* current byte */
            png_bytep pp = prev_row;    /* above:      b */
            png_bytep lp = row;         /* left:       a */
            png_bytep cp = prev_row;    /* upper-left: c */
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop=row_info->rowbytes - bpp;

            /* First bpp bytes: a and c are 0, so the predictor is b. */
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++) // use leftover rp,pp
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);       /* |p - a| = |b - c| */
               pb = abs(pc);      /* |p - b| = |a - c| */
               pc = abs(p + pc);  /* |p - c| = |a + b - 2c| */
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /* Pick the neighbor closest to p = a + b - c; ties prefer
                * a, then b, then c (required by the PNG spec):
               if (pa <= pb && pa <= pc)
                  p = a;
               else if (pb <= pc)
                  p = b;
               else
                  p = c;
                */

               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }
         break;
      }

      default:
         png_warning(png_ptr, "Ignoring bad row filter type");
         *row=0;   /* defensive: blank the first byte of the bad row */
         break;
   }
}
3921
3922 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_USE_PNGVCRD */
3923